direct-io.hg

view xen/arch/x86/traps.c @ 14363:40a6e2280d7b

xen/x86: Tweak #PF handler. Simplify gdbstub copy to/from guest.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Mar 13 14:04:31 2007 +0000 (2007-03-13)
parents 43e9952b07ea
children 070cf119a7ec
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <asm/paging.h>
50 #include <asm/system.h>
51 #include <asm/io.h>
52 #include <asm/atomic.h>
53 #include <asm/desc.h>
54 #include <asm/debugreg.h>
55 #include <asm/smp.h>
56 #include <asm/flushtlb.h>
57 #include <asm/uaccess.h>
58 #include <asm/i387.h>
59 #include <asm/debugger.h>
60 #include <asm/msr.h>
61 #include <asm/shared.h>
62 #include <asm/x86_emulate.h>
63 #include <asm/hvm/vpt.h>
65 /*
66 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
67 * fatal: Xen prints diagnostic message and then hangs.
68 * dom0: The NMI is virtualised to DOM0.
69 * ignore: The NMI error is cleared and ignored.
70 */
71 #ifdef NDEBUG
72 char opt_nmi[10] = "dom0";
73 #else
74 char opt_nmi[10] = "fatal";
75 #endif
76 string_param("nmi", opt_nmi);
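/*
 * Example: the policy is selected with the "nmi=" boot parameter, e.g.
 * appending "nmi=ignore" or "nmi=dom0" to the Xen command line. The NMI
 * handlers later in this file switch on opt_nmi[0], so only the first
 * character of the string matters.
 */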
78 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
79 idt_entry_t idt_table[IDT_ENTRIES];
81 #define DECLARE_TRAP_HANDLER(_name) \
82 asmlinkage void _name(void); \
83 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
85 asmlinkage void nmi(void);
86 DECLARE_TRAP_HANDLER(divide_error);
87 DECLARE_TRAP_HANDLER(debug);
88 DECLARE_TRAP_HANDLER(int3);
89 DECLARE_TRAP_HANDLER(overflow);
90 DECLARE_TRAP_HANDLER(bounds);
91 DECLARE_TRAP_HANDLER(invalid_op);
92 DECLARE_TRAP_HANDLER(device_not_available);
93 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
94 DECLARE_TRAP_HANDLER(invalid_TSS);
95 DECLARE_TRAP_HANDLER(segment_not_present);
96 DECLARE_TRAP_HANDLER(stack_segment);
97 DECLARE_TRAP_HANDLER(general_protection);
98 DECLARE_TRAP_HANDLER(page_fault);
99 DECLARE_TRAP_HANDLER(coprocessor_error);
100 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
101 DECLARE_TRAP_HANDLER(alignment_check);
102 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
103 DECLARE_TRAP_HANDLER(machine_check);
105 long do_set_debugreg(int reg, unsigned long value);
106 unsigned long do_get_debugreg(int reg);
108 static int debug_stack_lines = 20;
109 integer_param("debug_stack_lines", debug_stack_lines);
111 #ifdef CONFIG_X86_32
112 #define stack_words_per_line 8
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
114 #else
115 #define stack_words_per_line 4
116 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
117 #endif
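/*
 * On x86-32 a same-privilege trap pushes no %esp/%ss, so the pre-exception
 * stack pointer is the address of the esp slot itself; on x86-64 %rsp is
 * always pushed, so the saved rsp value is used directly.
 */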
119 static void show_guest_stack(struct cpu_user_regs *regs)
120 {
121 int i;
122 unsigned long *stack, addr;
124 if ( is_hvm_vcpu(current) )
125 return;
127 if ( IS_COMPAT(container_of(regs, struct cpu_info, guest_cpu_user_regs)->current_vcpu->domain) )
128 {
129 compat_show_guest_stack(regs, debug_stack_lines);
130 return;
131 }
133 if ( vm86_mode(regs) )
134 {
135 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
136 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
137 regs->ss, (uint16_t)(regs->esp & 0xffff));
138 }
139 else
140 {
141 stack = (unsigned long *)regs->esp;
142 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
143 }
145 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
146 {
147 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
148 break;
149 if ( get_user(addr, stack) )
150 {
151 if ( i != 0 )
152 printk("\n ");
153 printk("Fault while accessing guest memory.");
154 i = 1;
155 break;
156 }
157 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
158 printk("\n ");
159 printk(" %p", _p(addr));
160 stack++;
161 }
162 if ( i == 0 )
163 printk("Stack empty.");
164 printk("\n");
165 }
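/*
 * Two builds of show_trace() follow: non-debug (NDEBUG) builds lack
 * reliable frame pointers, so the stack is simply scanned for values that
 * look like Xen text addresses; debug builds walk the frame-pointer chain,
 * treating an inverted frame pointer as marking an exception frame.
 */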
167 #ifdef NDEBUG
169 static void show_trace(struct cpu_user_regs *regs)
170 {
171 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
173 printk("Xen call trace:\n ");
175 printk("[<%p>]", _p(regs->eip));
176 print_symbol(" %s\n ", regs->eip);
178 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
179 {
180 addr = *stack++;
181 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
182 {
183 printk("[<%p>]", _p(addr));
184 print_symbol(" %s\n ", addr);
185 }
186 }
188 printk("\n");
189 }
191 #else
193 static void show_trace(struct cpu_user_regs *regs)
194 {
195 unsigned long *frame, next, addr, low, high;
197 printk("Xen call trace:\n ");
199 printk("[<%p>]", _p(regs->eip));
200 print_symbol(" %s\n ", regs->eip);
202 /* Bounds for range of valid frame pointer. */
203 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
204 high = (low & ~(STACK_SIZE - 1)) +
205 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
207 /* The initial frame pointer. */
208 next = regs->ebp;
210 for ( ; ; )
211 {
212 /* Valid frame pointer? */
213 if ( (next < low) || (next >= high) )
214 {
215 /*
216 * Exception stack frames have a different layout, denoted by an
217 * inverted frame pointer.
218 */
219 next = ~next;
220 if ( (next < low) || (next >= high) )
221 break;
222 frame = (unsigned long *)next;
223 next = frame[0];
224 addr = frame[(offsetof(struct cpu_user_regs, eip) -
225 offsetof(struct cpu_user_regs, ebp))
226 / BYTES_PER_LONG];
227 }
228 else
229 {
230 /* Ordinary stack frame. */
231 frame = (unsigned long *)next;
232 next = frame[0];
233 addr = frame[1];
234 }
236 printk("[<%p>]", _p(addr));
237 print_symbol(" %s\n ", addr);
239 low = (unsigned long)&frame[2];
240 }
242 printk("\n");
243 }
245 #endif
247 void show_stack(struct cpu_user_regs *regs)
248 {
249 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
250 int i;
252 if ( guest_mode(regs) )
253 return show_guest_stack(regs);
255 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
257 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
258 {
259 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
260 break;
261 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
262 printk("\n ");
263 addr = *stack++;
264 printk(" %p", _p(addr));
265 }
266 if ( i == 0 )
267 printk("Stack empty.");
268 printk("\n");
270 show_trace(regs);
271 }
273 void show_xen_trace()
274 {
275 struct cpu_user_regs regs;
276 #ifdef __x86_64
277 __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
278 __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
279 __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
280 #else
281 __asm__("movl %%esp,%0" : "=m" (regs.esp));
282 __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
283 __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
284 #endif
285 show_trace(&regs);
286 }
288 void show_stack_overflow(unsigned long esp)
289 {
290 #ifdef MEMORY_GUARD
291 unsigned long esp_top;
292 unsigned long *stack, addr;
294 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
296 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
297 if ( ((unsigned long)(esp - esp_top) > 512) &&
298 ((unsigned long)(esp_top - esp) > 512) )
299 return;
301 if ( esp < esp_top )
302 esp = esp_top;
304 printk("Xen stack overflow:\n ");
306 stack = (unsigned long *)esp;
307 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
308 {
309 addr = *stack++;
310 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
311 {
312 printk("%p: [<%p>]", stack, _p(addr));
313 print_symbol(" %s\n ", addr);
314 }
315 }
317 printk("\n");
318 #endif
319 }
321 void show_execution_state(struct cpu_user_regs *regs)
322 {
323 show_registers(regs);
324 show_stack(regs);
325 }
327 char *trapstr(int trapnr)
328 {
329 static char *strings[] = {
330 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
331 "invalid opcode", "device not available", "double fault",
332 "coprocessor segment", "invalid tss", "segment not found",
333 "stack error", "general protection fault", "page fault",
334 "spurious interrupt", "coprocessor error", "alignment check",
335 "machine check", "simd error"
336 };
338 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
339 return "???";
341 return strings[trapnr];
342 }
344 /*
345 * This is called for faults at very unexpected times (e.g., when interrupts
346 * are disabled). In such situations we can't do much that is safe. We try to
347 * print out some tracing and then we just spin.
348 */
349 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
350 {
351 watchdog_disable();
352 console_start_sync();
354 show_execution_state(regs);
356 if ( trapnr == TRAP_page_fault )
357 {
358 unsigned long cr2 = read_cr2();
359 printk("Faulting linear address: %p\n", _p(cr2));
360 show_page_walk(cr2);
361 }
363 panic("FATAL TRAP: vector = %d (%s)\n"
364 "[error_code=%04x] %s\n",
365 trapnr, trapstr(trapnr), regs->error_code,
366 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
367 }
369 static int do_guest_trap(
370 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
371 {
372 struct vcpu *v = current;
373 struct trap_bounce *tb;
374 const struct trap_info *ti;
376 tb = &v->arch.trap_bounce;
377 ti = &v->arch.guest_context.trap_ctxt[trapnr];
379 tb->flags = TBF_EXCEPTION;
380 tb->cs = ti->cs;
381 tb->eip = ti->address;
383 if ( use_error_code )
384 {
385 tb->flags |= TBF_EXCEPTION_ERRCODE;
386 tb->error_code = regs->error_code;
387 }
389 if ( TI_GET_IF(ti) )
390 tb->flags |= TBF_INTERRUPT;
392 if ( unlikely(null_trap_bounce(v, tb)) )
393 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
394 "domain %d on VCPU %d [ec=%04x]\n",
395 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
396 regs->error_code);
398 return 0;
399 }
401 static inline int do_trap(
402 int trapnr, struct cpu_user_regs *regs, int use_error_code)
403 {
404 unsigned long fixup;
406 DEBUGGER_trap_entry(trapnr, regs);
408 if ( guest_mode(regs) )
409 return do_guest_trap(trapnr, regs, use_error_code);
411 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
412 {
413 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
414 trapnr, _p(regs->eip), _p(fixup));
415 regs->eip = fixup;
416 return 0;
417 }
419 DEBUGGER_trap_fatal(trapnr, regs);
421 show_execution_state(regs);
422 panic("FATAL TRAP: vector = %d (%s)\n"
423 "[error_code=%04x]\n",
424 trapnr, trapstr(trapnr), regs->error_code);
425 return 0;
426 }
428 #define DO_ERROR_NOCODE(trapnr, name) \
429 asmlinkage int do_##name(struct cpu_user_regs *regs) \
430 { \
431 return do_trap(trapnr, regs, 0); \
432 }
434 #define DO_ERROR(trapnr, name) \
435 asmlinkage int do_##name(struct cpu_user_regs *regs) \
436 { \
437 return do_trap(trapnr, regs, 1); \
438 }
440 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
441 DO_ERROR_NOCODE(TRAP_overflow, overflow)
442 DO_ERROR_NOCODE(TRAP_bounds, bounds)
443 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
444 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
445 DO_ERROR( TRAP_no_segment, segment_not_present)
446 DO_ERROR( TRAP_stack_error, stack_segment)
447 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
448 DO_ERROR( TRAP_alignment_check, alignment_check)
449 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
451 int rdmsr_hypervisor_regs(
452 uint32_t idx, uint32_t *eax, uint32_t *edx)
453 {
454 idx -= 0x40000000;
455 if ( idx > 0 )
456 return 0;
458 *eax = *edx = 0;
459 return 1;
460 }
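/*
 * MSR 0x40000000 is the hypercall-page MSR: the value written encodes the
 * guest frame to initialise, with bits 63:12 giving the GMFN and bits 11:0
 * a page index (only index 0, i.e. a single hypercall page, is accepted).
 */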
462 int wrmsr_hypervisor_regs(
463 uint32_t idx, uint32_t eax, uint32_t edx)
464 {
465 struct domain *d = current->domain;
467 idx -= 0x40000000;
468 if ( idx > 0 )
469 return 0;
471 switch ( idx )
472 {
473 case 0:
474 {
475 void *hypercall_page;
476 unsigned long mfn;
477 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
478 unsigned int idx = eax & 0xfff;
480 if ( idx > 0 )
481 {
482 gdprintk(XENLOG_WARNING,
483 "Dom%d: Out of range index %u to MSR %08x\n",
484 d->domain_id, idx, 0x40000000);
485 return 0;
486 }
488 mfn = gmfn_to_mfn(d, gmfn);
490 if ( !mfn_valid(mfn) ||
491 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
492 {
493 gdprintk(XENLOG_WARNING,
494 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
495 d->domain_id, gmfn, mfn, 0x40000000);
496 return 0;
497 }
499 hypercall_page = map_domain_page(mfn);
500 hypercall_page_initialise(d, hypercall_page);
501 unmap_domain_page(hypercall_page);
503 put_page_and_type(mfn_to_page(mfn));
504 break;
505 }
507 default:
508 BUG();
509 }
511 return 1;
512 }
514 int cpuid_hypervisor_leaves(
515 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
516 {
517 idx -= 0x40000000;
518 if ( idx > 2 )
519 return 0;
521 switch ( idx )
522 {
523 case 0:
524 *eax = 0x40000002; /* Largest leaf */
525 *ebx = 0x566e6558; /* Signature 1: "XenV" */
526 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
527 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
528 break;
530 case 1:
531 *eax = (xen_major_version() << 16) | xen_minor_version();
532 *ebx = 0; /* Reserved */
533 *ecx = 0; /* Reserved */
534 *edx = 0; /* Reserved */
535 break;
537 case 2:
538 *eax = 1; /* Number of hypercall-transfer pages */
539 *ebx = 0x40000000; /* MSR base address */
540 *ecx = 0; /* Features 1 */
541 *edx = 0; /* Features 2 */
542 break;
544 default:
545 BUG();
546 }
548 return 1;
549 }
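/*
 * Forced emulation: a PV guest asks Xen to emulate its next CPUID by
 * prefixing it with 'ud2 ; .ascii "xen"', i.e. the byte sequence
 * 0f 0b 78 65 6e followed by 0f a2 (cpuid). The emulation below strips
 * feature bits a PV guest must not use (VME/DE/PSE/PGE, plus SEP, MTRR and
 * RDTSCP as appropriate) and serves the 0x4000000x leaves defined above.
 */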
551 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
552 {
553 char sig[5], instr[2];
554 uint32_t a, b, c, d;
555 unsigned long eip, rc;
557 a = regs->eax;
558 b = regs->ebx;
559 c = regs->ecx;
560 d = regs->edx;
561 eip = regs->eip;
563 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
564 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
565 {
566 propagate_page_fault(eip + sizeof(sig) - rc, 0);
567 return EXCRET_fault_fixed;
568 }
569 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
570 return 0;
571 eip += sizeof(sig);
573 /* We only emulate CPUID. */
574 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
575 {
576 propagate_page_fault(eip + sizeof(instr) - rc, 0);
577 return EXCRET_fault_fixed;
578 }
579 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
580 return 0;
581 eip += sizeof(instr);
583 __asm__ (
584 "cpuid"
585 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
586 : "0" (a), "1" (b), "2" (c), "3" (d) );
588 if ( regs->eax == 1 )
589 {
590 /* Modify Feature Information. */
591 clear_bit(X86_FEATURE_VME, &d);
592 clear_bit(X86_FEATURE_DE, &d);
593 clear_bit(X86_FEATURE_PSE, &d);
594 clear_bit(X86_FEATURE_PGE, &d);
595 if ( !supervisor_mode_kernel )
596 clear_bit(X86_FEATURE_SEP, &d);
597 if ( !IS_PRIV(current->domain) )
598 clear_bit(X86_FEATURE_MTRR, &d);
599 }
600 else if ( regs->eax == 0x80000001 )
601 {
602 /* Modify Feature Information. */
603 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
604 }
605 else
606 {
607 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
608 }
610 regs->eax = a;
611 regs->ebx = b;
612 regs->ecx = c;
613 regs->edx = d;
614 regs->eip = eip;
616 return EXCRET_fault_fixed;
617 }
619 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
620 {
621 struct bug_frame bug;
622 struct bug_frame_str bug_str;
623 char *filename, *predicate, *eip = (char *)regs->eip;
624 int rc, id, lineno;
626 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
628 if ( likely(guest_mode(regs)) )
629 {
630 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
631 return rc;
632 return do_guest_trap(TRAP_invalid_op, regs, 0);
633 }
635 if ( !is_kernel(eip) ||
636 __copy_from_user(&bug, eip, sizeof(bug)) ||
637 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
638 (bug.ret != 0xc2) )
639 goto die;
641 id = bug.id & 3;
642 if ( id == BUGFRAME_rsvd )
643 goto die;
645 if ( id == BUGFRAME_dump )
646 {
647 show_execution_state(regs);
648 regs->eip += sizeof(bug);
649 return EXCRET_fault_fixed;
650 }
652 /* BUG() or ASSERT(): decode the filename pointer and line number. */
653 ASSERT((id == BUGFRAME_bug) || (id == BUGFRAME_assert));
654 eip += sizeof(bug);
655 if ( !is_kernel(eip) ||
656 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
657 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
658 goto die;
660 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
661 lineno = bug.id >> 2;
663 if ( id == BUGFRAME_bug )
664 {
665 printk("Xen BUG at %.50s:%d\n", filename, lineno);
666 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
667 show_execution_state(regs);
668 panic("Xen BUG at %.50s:%d\n", filename, lineno);
669 }
671 /* ASSERT(): decode the predicate string pointer. */
672 ASSERT(id == BUGFRAME_assert);
673 eip += sizeof(bug_str);
674 if ( !is_kernel(eip) ||
675 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
676 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
677 goto die;
679 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
680 printk("Assertion '%s' failed at %.50s:%d\n",
681 predicate, filename, lineno);
682 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
683 show_execution_state(regs);
684 panic("Assertion '%s' failed at %.50s:%d\n",
685 predicate, filename, lineno);
687 die:
688 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
689 show_execution_state(regs);
690 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
691 return 0;
692 }
694 asmlinkage int do_int3(struct cpu_user_regs *regs)
695 {
696 DEBUGGER_trap_entry(TRAP_int3, regs);
698 if ( !guest_mode(regs) )
699 {
700 DEBUGGER_trap_fatal(TRAP_int3, regs);
701 show_execution_state(regs);
702 panic("FATAL TRAP: vector = 3 (Int3)\n");
703 }
705 return do_guest_trap(TRAP_int3, regs, 0);
706 }
708 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
709 {
710 fatal_trap(TRAP_machine_check, regs);
711 return 0;
712 }
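/*
 * propagate_page_fault(): queue a #PF for delivery to the current guest.
 * The faulting address becomes visible via the virtual %cr2, the error
 * code's user-mode bit is recomputed for the guest's view of kernel vs
 * user context, and a trap bounce is armed to the guest's registered
 * page-fault handler.
 */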
714 void propagate_page_fault(unsigned long addr, u16 error_code)
715 {
716 struct trap_info *ti;
717 struct vcpu *v = current;
718 struct trap_bounce *tb = &v->arch.trap_bounce;
720 v->arch.guest_context.ctrlreg[2] = addr;
721 arch_set_cr2(v, addr);
723 /* Re-set error_code.user flag appropriately for the guest. */
724 error_code &= ~PFEC_user_mode;
725 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
726 error_code |= PFEC_user_mode;
728 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
729 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
730 tb->error_code = error_code;
731 tb->cs = ti->cs;
732 tb->eip = ti->address;
733 if ( TI_GET_IF(ti) )
734 tb->flags |= TBF_INTERRUPT;
735 if ( unlikely(null_trap_bounce(v, tb)) )
736 {
737 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
738 v->domain->domain_id, v->vcpu_id, error_code);
739 show_page_walk(addr);
740 }
741 }
743 static int handle_gdt_ldt_mapping_fault(
744 unsigned long offset, struct cpu_user_regs *regs)
745 {
746 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
747 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
748 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
750 /* Should never fault in another vcpu's area. */
751 BUG_ON(vcpu_area != current->vcpu_id);
753 /* Byte offset within the gdt/ldt sub-area. */
754 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
756 if ( likely(is_ldt_area) )
757 {
758 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
759 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
760 {
761 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
762 if ( !guest_mode(regs) )
763 return 0;
764 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
765 propagate_page_fault(
766 current->arch.guest_context.ldt_base + offset,
767 regs->error_code);
768 }
769 }
770 else
771 {
772 /* GDT fault: handle the fault as #GP(selector). */
773 regs->error_code = (u16)offset & ~7;
774 (void)do_general_protection(regs);
775 }
777 return EXCRET_fault_fixed;
778 }
780 #ifdef HYPERVISOR_VIRT_END
781 #define IN_HYPERVISOR_RANGE(va) \
782 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
783 #else
784 #define IN_HYPERVISOR_RANGE(va) \
785 (((va) >= HYPERVISOR_VIRT_START))
786 #endif
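/*
 * __spurious_page_fault(): with interrupts disabled, re-walk the current
 * page tables for the faulting address. If every level already grants the
 * access implied by the error code, the fault was caused by a stale TLB
 * entry and is reported as spurious (returns 1); otherwise returns 0.
 */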
788 static int __spurious_page_fault(
789 unsigned long addr, struct cpu_user_regs *regs)
790 {
791 unsigned long mfn, cr3 = read_cr3();
792 #if CONFIG_PAGING_LEVELS >= 4
793 l4_pgentry_t l4e, *l4t;
794 #endif
795 #if CONFIG_PAGING_LEVELS >= 3
796 l3_pgentry_t l3e, *l3t;
797 #endif
798 l2_pgentry_t l2e, *l2t;
799 l1_pgentry_t l1e, *l1t;
800 unsigned int required_flags, disallowed_flags;
802 /* Reserved bit violations are never spurious faults. */
803 if ( regs->error_code & PFEC_reserved_bit )
804 return 0;
806 required_flags = _PAGE_PRESENT;
807 if ( regs->error_code & PFEC_write_access )
808 required_flags |= _PAGE_RW;
809 if ( regs->error_code & PFEC_user_mode )
810 required_flags |= _PAGE_USER;
812 disallowed_flags = 0;
813 if ( regs->error_code & PFEC_insn_fetch )
814 disallowed_flags |= _PAGE_NX;
816 mfn = cr3 >> PAGE_SHIFT;
818 #if CONFIG_PAGING_LEVELS >= 4
819 l4t = map_domain_page(mfn);
820 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
821 mfn = l4e_get_pfn(l4e);
822 unmap_domain_page(l4t);
823 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
824 (l4e_get_flags(l4e) & disallowed_flags) )
825 return 0;
826 #endif
828 #if CONFIG_PAGING_LEVELS >= 3
829 l3t = map_domain_page(mfn);
830 #ifdef CONFIG_X86_PAE
831 l3t += (cr3 & 0xFE0UL) >> 3;
832 #endif
833 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
834 mfn = l3e_get_pfn(l3e);
835 unmap_domain_page(l3t);
836 #ifdef CONFIG_X86_PAE
837 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
838 return 0;
839 #else
840 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
841 (l3e_get_flags(l3e) & disallowed_flags) )
842 return 0;
843 #endif
844 #endif
846 l2t = map_domain_page(mfn);
847 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
848 mfn = l2e_get_pfn(l2e);
849 unmap_domain_page(l2t);
850 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
851 (l2e_get_flags(l2e) & disallowed_flags) )
852 return 0;
853 if ( l2e_get_flags(l2e) & _PAGE_PSE )
854 {
855 l1e = l1e_empty(); /* define before use in debug tracing */
856 goto spurious;
857 }
859 l1t = map_domain_page(mfn);
860 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
861 mfn = l1e_get_pfn(l1e);
862 unmap_domain_page(l1t);
863 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
864 (l1e_get_flags(l1e) & disallowed_flags) )
865 return 0;
867 spurious:
868 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
869 "at addr %lx, e/c %04x\n",
870 current->domain->domain_id, current->vcpu_id,
871 addr, regs->error_code);
872 #if CONFIG_PAGING_LEVELS >= 4
873 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
874 #endif
875 #if CONFIG_PAGING_LEVELS >= 3
876 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
877 #endif
878 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
879 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
880 #ifndef NDEBUG
881 show_registers(regs);
882 #endif
883 return 1;
884 }
886 static int spurious_page_fault(
887 unsigned long addr, struct cpu_user_regs *regs)
888 {
889 unsigned long flags;
890 int is_spurious;
892 /*
893 * Disabling interrupts prevents TLB flushing, and hence prevents
894 * page tables from becoming invalid under our feet during the walk.
895 */
896 local_irq_save(flags);
897 is_spurious = __spurious_page_fault(addr, regs);
898 local_irq_restore(flags);
900 return is_spurious;
901 }
903 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
904 {
905 struct vcpu *v = current;
906 struct domain *d = v->domain;
908 /* No fixups in interrupt context or when interrupts are disabled. */
909 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
910 return 0;
912 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
913 {
914 if ( paging_mode_external(d) && guest_mode(regs) )
915 return paging_fault(addr, regs);
916 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
917 return handle_gdt_ldt_mapping_fault(
918 addr - GDT_LDT_VIRT_START, regs);
919 return 0;
920 }
922 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
923 guest_kernel_mode(v, regs) &&
924 /* Do not check if access-protection fault since the page may
925 legitimately be not present in shadow page tables */
926 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
927 ptwr_do_page_fault(v, addr, regs) )
928 return EXCRET_fault_fixed;
930 if ( paging_mode_enabled(d) )
931 return paging_fault(addr, regs);
933 return 0;
934 }
936 /*
937 * #PF error code:
938 * Bit 0: Protection violation (=1) ; Page not present (=0)
939 * Bit 1: Write access
940 * Bit 2: User mode (=1) ; Supervisor mode (=0)
941 * Bit 3: Reserved bit violation
942 * Bit 4: Instruction fetch
943 */
944 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
945 {
946 unsigned long addr, fixup;
947 int rc;
949 addr = read_cr2();
951 DEBUGGER_trap_entry(TRAP_page_fault, regs);
953 perfc_incrc(page_faults);
955 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
956 return rc;
958 if ( unlikely(!guest_mode(regs)) )
959 {
960 if ( spurious_page_fault(addr, regs) )
961 return EXCRET_not_a_fault;
963 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
964 {
965 perfc_incrc(copy_user_faults);
966 regs->eip = fixup;
967 return 0;
968 }
970 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
972 show_execution_state(regs);
973 show_page_walk(addr);
974 panic("FATAL PAGE FAULT\n"
975 "[error_code=%04x]\n"
976 "Faulting linear address: %p\n",
977 regs->error_code, _p(addr));
978 }
980 propagate_page_fault(addr, regs->error_code);
981 return 0;
982 }
984 /*
985 * Early handler to deal with spurious page faults. For example, consider a
986 * routine that uses a mapping immediately after installing it (making it
987 * present). The CPU may speculatively execute the memory access before
988 * executing the PTE write. The instruction will then be marked to cause a
989 * page fault when it is retired, despite the fact that the PTE is present and
990 * correct at that point in time.
991 */
992 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
993 {
994 static int stuck;
995 static unsigned long prev_eip, prev_cr2;
996 unsigned long cr2 = read_cr2();
998 BUG_ON(smp_processor_id() != 0);
1000 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1001 {
1002 prev_eip = regs->eip;
1003 prev_cr2 = cr2;
1004 stuck = 0;
1005 return EXCRET_not_a_fault;
1006 }
1008 if ( stuck++ == 1000 )
1009 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1010 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1012 return EXCRET_not_a_fault;
1013 }
1015 long do_fpu_taskswitch(int set)
1016 {
1017 struct vcpu *v = current;
1019 if ( set )
1020 {
1021 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1022 stts();
1023 }
1024 else
1025 {
1026 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1027 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
1028 clts();
1029 }
1031 return 0;
1032 }
1034 static int read_descriptor(unsigned int sel,
1035 const struct vcpu *v,
1036 const struct cpu_user_regs * regs,
1037 unsigned long *base,
1038 unsigned long *limit,
1039 unsigned int *ar,
1040 unsigned int vm86attr)
1042 struct desc_struct desc;
1044 if ( !vm86_mode(regs) )
1046 if ( sel < 4)
1047 desc.b = desc.a = 0;
1048 else if ( __get_user(desc,
1049 (const struct desc_struct *)(!(sel & 4)
1050 ? GDT_VIRT_START(v)
1051 : LDT_VIRT_START(v))
1052 + (sel >> 3)) )
1053 return 0;
1054 if ( !(vm86attr & _SEGMENT_CODE) )
1055 desc.b &= ~_SEGMENT_L;
1057 else
1059 desc.a = (sel << 20) | 0xffff;
1060 desc.b = vm86attr | (sel >> 12);
1063 *ar = desc.b & 0x00f0ff00;
1064 if ( !(desc.b & _SEGMENT_L) )
1066 *base = (desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000);
1067 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1068 if ( desc.b & _SEGMENT_G )
1069 *limit = ((*limit + 1) << 12) - 1;
1070 #ifndef NDEBUG
1071 if ( !vm86_mode(regs) && sel > 3 )
1073 unsigned int a, l;
1074 unsigned char valid;
1076 __asm__("larl %2, %0\n\tsetz %1" : "=r" (a), "=rm" (valid) : "rm" (sel));
1077 BUG_ON(valid && (a & 0x00f0ff00) != *ar);
1078 __asm__("lsll %2, %0\n\tsetz %1" : "=r" (l), "=rm" (valid) : "rm" (sel));
1079 BUG_ON(valid && l != *limit);
1081 #endif
1083 else
1085 *base = 0UL;
1086 *limit = ~0UL;
1089 return 1;
1092 /* Has the guest requested sufficient permission for this I/O access? */
1093 static inline int guest_io_okay(
1094 unsigned int port, unsigned int bytes,
1095 struct vcpu *v, struct cpu_user_regs *regs)
1097 #if defined(__x86_64__)
1098 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1099 int user_mode = !(v->arch.flags & TF_kernel_mode);
1100 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1101 #elif defined(__i386__)
1102 #define TOGGLE_MODE() ((void)0)
1103 #endif
1105 if ( !vm86_mode(regs) &&
1106 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1107 return 1;
1109 if ( v->arch.iobmp_limit > (port + bytes) )
1111 union { uint8_t bytes[2]; uint16_t mask; } x;
1113 /*
1114 * Grab permission bytes from guest space. Inaccessible bytes are
1115 * read as 0xff (no access allowed).
1116 */
1117 TOGGLE_MODE();
1118 switch ( __copy_from_guest_offset(&x.bytes[0], v->arch.iobmp,
1119 port>>3, 2) )
1121 default: x.bytes[0] = ~0;
1122 case 1: x.bytes[1] = ~0;
1123 case 0: break;
1125 TOGGLE_MODE();
1127 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1128 return 1;
1131 return 0;
1134 /* Has the administrator granted sufficient permission for this I/O access? */
1135 static inline int admin_io_okay(
1136 unsigned int port, unsigned int bytes,
1137 struct vcpu *v, struct cpu_user_regs *regs)
1139 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1142 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1143 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1144 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1145 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1146 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1147 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
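/*
 * guest_io_okay() checks the guest's own permissions (virtual IOPL or the
 * per-VCPU I/O bitmap), while admin_io_okay() checks whether the domain
 * was granted direct access to the port range. In this tree the
 * guest_in*/guest_out*_okay macros alias the admin check, so granted ports
 * are touched directly and anything else is emulated or reads back as ~0.
 */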
1149 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1150 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1151 __attribute__((__regparm__(1)));
1152 unsigned long guest_to_host_gpr_switch(unsigned long)
1153 __attribute__((__regparm__(1)));
1155 /* Instruction fetch with error handling. */
1156 #define insn_fetch(type, base, eip, limit) \
1157 ({ unsigned long _rc, _ptr = (base) + (eip); \
1158 type _x; \
1159 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1160 goto fail; \
1161 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1162 { \
1163 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1164 return EXCRET_fault_fixed; \
1165 } \
1166 (eip) += sizeof(_x); _x; })
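/*
 * insn_fetch() pulls the next opcode byte(s) from the guest code segment,
 * enforcing the segment limit (branching to the local 'fail' label on
 * overflow) and reflecting any page fault back to the guest.
 */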
1168 #if defined(CONFIG_X86_32)
1169 # define read_sreg(regs, sr) ((regs)->sr)
1170 #elif defined(CONFIG_X86_64)
1171 # define read_sreg(regs, sr) read_segment_register(sr)
1172 #endif
1174 static int emulate_privileged_op(struct cpu_user_regs *regs)
1176 struct vcpu *v = current;
1177 unsigned long *reg, eip = regs->eip, res;
1178 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1179 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1180 unsigned int port, i, data_sel, ar, data, rc;
1181 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1182 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1183 ? regs->reg \
1184 : ad_bytes == 4 \
1185 ? (u32)regs->reg \
1186 : (u16)regs->reg)
1187 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1188 ? regs->reg = (val) \
1189 : ad_bytes == 4 \
1190 ? (*(u32 *)&regs->reg = (val)) \
1191 : (*(u16 *)&regs->reg = (val)))
1192 unsigned long code_base, code_limit;
1193 char io_emul_stub[16];
1194 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1195 u32 l, h;
1197 if ( !read_descriptor(regs->cs, v, regs,
1198 &code_base, &code_limit, &ar,
1199 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1200 goto fail;
1201 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1202 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1203 if ( !(ar & _SEGMENT_S) ||
1204 !(ar & _SEGMENT_P) ||
1205 !(ar & _SEGMENT_CODE) )
1206 goto fail;
1208 /* emulating only opcodes not allowing SS to be default */
1209 data_sel = read_sreg(regs, ds);
1211 /* Legacy prefixes. */
1212 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1214 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1216 case 0x66: /* operand-size override */
1217 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1218 continue;
1219 case 0x67: /* address-size override */
1220 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1221 continue;
1222 case 0x2e: /* CS override */
1223 data_sel = regs->cs;
1224 continue;
1225 case 0x3e: /* DS override */
1226 data_sel = read_sreg(regs, ds);
1227 continue;
1228 case 0x26: /* ES override */
1229 data_sel = read_sreg(regs, es);
1230 continue;
1231 case 0x64: /* FS override */
1232 data_sel = read_sreg(regs, fs);
1233 lm_ovr = lm_seg_fs;
1234 continue;
1235 case 0x65: /* GS override */
1236 data_sel = read_sreg(regs, gs);
1237 lm_ovr = lm_seg_gs;
1238 continue;
1239 case 0x36: /* SS override */
1240 data_sel = regs->ss;
1241 continue;
1242 case 0xf0: /* LOCK */
1243 lock = 1;
1244 continue;
1245 case 0xf2: /* REPNE/REPNZ */
1246 case 0xf3: /* REP/REPE/REPZ */
1247 rep_prefix = 1;
1248 continue;
1249 default:
1250 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1252 rex = opcode;
1253 continue;
1255 break;
1257 break;
1260 /* REX prefix. */
1261 if ( rex & 8 ) /* REX.W */
1262 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1263 modrm_reg = (rex & 4) << 1; /* REX.R */
1264 /* REX.X does not need to be decoded. */
1265 modrm_rm = (rex & 1) << 3; /* REX.B */
1267 if ( opcode == 0x0f )
1268 goto twobyte_opcode;
1270 if ( lock )
1271 goto fail;
1273 /* Input/Output String instructions. */
1274 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1276 unsigned long data_base, data_limit;
1278 if ( rep_prefix && (rd_ad(ecx) == 0) )
1279 goto done;
1281 if ( !(opcode & 2) )
1283 data_sel = read_sreg(regs, es);
1284 lm_ovr = lm_seg_none;
1287 if ( !(ar & _SEGMENT_L) )
1289 if ( !read_descriptor(data_sel, v, regs,
1290 &data_base, &data_limit, &ar,
1291 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1292 goto fail;
1293 if ( !(ar & _SEGMENT_S) ||
1294 !(ar & _SEGMENT_P) ||
1295 (opcode & 2 ?
1296 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1297 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1298 goto fail;
1300 #ifdef CONFIG_X86_64
1301 else
1303 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1305 switch ( lm_ovr )
1307 case lm_seg_none:
1308 data_base = 0UL;
1309 break;
1310 case lm_seg_fs:
1311 data_base = v->arch.guest_context.fs_base;
1312 break;
1313 case lm_seg_gs:
1314 if ( guest_kernel_mode(v, regs) )
1315 data_base = v->arch.guest_context.gs_base_kernel;
1316 else
1317 data_base = v->arch.guest_context.gs_base_user;
1318 break;
1321 else
1322 read_descriptor(data_sel, v, regs,
1323 &data_base, &data_limit, &ar,
1324 0);
1325 data_limit = ~0UL;
1326 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1328 #endif
1330 continue_io_string:
1331 switch ( opcode )
1333 case 0x6c: /* INSB */
1334 op_bytes = 1;
1335 case 0x6d: /* INSW/INSL */
1336 if ( data_limit < op_bytes - 1 ||
1337 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1338 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1339 goto fail;
1340 port = (u16)regs->edx;
1341 switch ( op_bytes )
1343 case 1:
1344 /* emulate PIT counter 2 */
1345 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1346 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1347 pv_pit_handler(port, 0, 0) : ~0));
1348 break;
1349 case 2:
1350 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1351 break;
1352 case 4:
1353 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1354 break;
1356 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1358 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1359 PFEC_write_access);
1360 return EXCRET_fault_fixed;
1362 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1363 break;
1365 case 0x6e: /* OUTSB */
1366 op_bytes = 1;
1367 case 0x6f: /* OUTSW/OUTSL */
1368 if ( data_limit < op_bytes - 1 ||
1369 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1370 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1371 goto fail;
1372 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1373 if ( rc != 0 )
1375 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1376 return EXCRET_fault_fixed;
1378 port = (u16)regs->edx;
1379 switch ( op_bytes )
1381 case 1:
1382 if ( guest_outb_okay(port, v, regs) )
1383 outb((u8)data, port);
1384 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1385 pv_pit_handler(port, data, 1);
1386 break;
1387 case 2:
1388 if ( guest_outw_okay(port, v, regs) )
1389 outw((u16)data, port);
1390 break;
1391 case 4:
1392 if ( guest_outl_okay(port, v, regs) )
1393 outl((u32)data, port);
1394 break;
1396 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1397 break;
1400 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1402 if ( !hypercall_preempt_check() )
1403 goto continue_io_string;
1404 eip = regs->eip;
1407 goto done;
1410 /*
1411 * Very likely to be an I/O instruction (IN/OUT).
1412 * Build an on-stack stub to execute the instruction with full guest
1413 * GPR context. This is needed for some systems which (ab)use IN/OUT
1414 * to communicate with BIOS code in system-management mode.
1415 */
1416 /* call host_to_guest_gpr_switch */
1417 io_emul_stub[0] = 0xe8;
1418 *(s32 *)&io_emul_stub[1] =
1419 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1420 /* data16 or nop */
1421 io_emul_stub[5] = (op_bytes != 2) ? 0x90 : 0x66;
1422 /* <io-access opcode> */
1423 io_emul_stub[6] = opcode;
1424 /* imm8 or nop */
1425 io_emul_stub[7] = 0x90;
1426 /* jmp guest_to_host_gpr_switch */
1427 io_emul_stub[8] = 0xe9;
1428 *(s32 *)&io_emul_stub[9] =
1429 (char *)guest_to_host_gpr_switch - &io_emul_stub[13];
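/*
 * The assembled stub is, byte for byte:
 *   e8 xx xx xx xx   call host_to_guest_gpr_switch
 *   66 or 90         data16 prefix (2-byte operand size) or nop
 *   <opcode>         the IN/OUT instruction being emulated
 *   imm8 or 90       its port immediate, or nop
 *   e9 xx xx xx xx   jmp guest_to_host_gpr_switch
 */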
1431 /* Handy function-typed pointer to the stub. */
1432 io_emul = (void *)io_emul_stub;
1434 /* I/O Port and Interrupt Flag instructions. */
1435 switch ( opcode )
1437 case 0xe4: /* IN imm8,%al */
1438 op_bytes = 1;
1439 case 0xe5: /* IN imm8,%eax */
1440 port = insn_fetch(u8, code_base, eip, code_limit);
1441 io_emul_stub[7] = port; /* imm8 */
1442 exec_in:
1443 if ( !guest_io_okay(port, op_bytes, v, regs) )
1444 goto fail;
1445 switch ( op_bytes )
1447 case 1:
1448 if ( guest_inb_okay(port, v, regs) )
1449 io_emul(regs);
1450 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1452 regs->eax &= ~0xffUL;
1453 regs->eax |= pv_pit_handler(port, 0, 0);
1455 else
1456 regs->eax |= (u8)~0;
1457 break;
1458 case 2:
1459 if ( guest_inw_okay(port, v, regs) )
1460 io_emul(regs);
1461 else
1462 regs->eax |= (u16)~0;
1463 break;
1464 case 4:
1465 if ( guest_inl_okay(port, v, regs) )
1466 io_emul(regs);
1467 else
1468 regs->eax = (u32)~0;
1469 break;
1471 goto done;
1473 case 0xec: /* IN %dx,%al */
1474 op_bytes = 1;
1475 case 0xed: /* IN %dx,%eax */
1476 port = (u16)regs->edx;
1477 goto exec_in;
1479 case 0xe6: /* OUT %al,imm8 */
1480 op_bytes = 1;
1481 case 0xe7: /* OUT %eax,imm8 */
1482 port = insn_fetch(u8, code_base, eip, code_limit);
1483 io_emul_stub[7] = port; /* imm8 */
1484 exec_out:
1485 if ( !guest_io_okay(port, op_bytes, v, regs) )
1486 goto fail;
1487 switch ( op_bytes )
1489 case 1:
1490 if ( guest_outb_okay(port, v, regs) )
1491 io_emul(regs);
1492 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1493 pv_pit_handler(port, regs->eax, 1);
1494 break;
1495 case 2:
1496 if ( guest_outw_okay(port, v, regs) )
1497 io_emul(regs);
1498 break;
1499 case 4:
1500 if ( guest_outl_okay(port, v, regs) )
1501 io_emul(regs);
1502 break;
1504 goto done;
1506 case 0xee: /* OUT %al,%dx */
1507 op_bytes = 1;
1508 case 0xef: /* OUT %eax,%dx */
1509 port = (u16)regs->edx;
1510 goto exec_out;
1512 case 0xfa: /* CLI */
1513 case 0xfb: /* STI */
1514 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1515 goto fail;
1516 /*
1517 * This is just too dangerous to allow, in my opinion. Consider if the
1518 * caller then tries to reenable interrupts using POPF: we can't trap
1519 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1520 * do for us. :-)
1521 */
1522 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1523 goto done;
1526 /* No decode of this single-byte opcode. */
1527 goto fail;
1529 twobyte_opcode:
1530 /* Two-byte opcodes only emulated from guest kernel. */
1531 if ( !guest_kernel_mode(v, regs) )
1532 goto fail;
1534 /* Privileged (ring 0) instructions. */
1535 opcode = insn_fetch(u8, code_base, eip, code_limit);
1536 if ( lock && (opcode & ~3) != 0x20 )
1537 goto fail;
1538 switch ( opcode )
1540 case 0x06: /* CLTS */
1541 (void)do_fpu_taskswitch(0);
1542 break;
1544 case 0x09: /* WBINVD */
1545 /* Ignore the instruction if unprivileged. */
1546 if ( !cache_flush_permitted(v->domain) )
1547 /* Non-physdev domain attempted WBINVD; ignore for now since
1548 newer linux uses this in some start-of-day timing loops */
1550 else
1551 wbinvd();
1552 break;
1554 case 0x20: /* MOV CR?,<reg> */
1555 opcode = insn_fetch(u8, code_base, eip, code_limit);
1556 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1557 modrm_rm |= (opcode >> 0) & 7;
1558 reg = decode_register(modrm_rm, regs, 0);
1559 switch ( modrm_reg )
1561 case 0: /* Read CR0 */
1562 *reg = (read_cr0() & ~X86_CR0_TS) |
1563 v->arch.guest_context.ctrlreg[0];
1564 break;
1566 case 2: /* Read CR2 */
1567 *reg = v->arch.guest_context.ctrlreg[2];
1568 break;
1570 case 3: /* Read CR3 */
1571 if ( !IS_COMPAT(v->domain) )
1572 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1573 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1574 #ifdef CONFIG_COMPAT
1575 else
1576 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1577 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1578 #endif
1579 break;
1581 case 4: /* Read CR4 */
1582 /*
1583 * Guests can read CR4 to see what features Xen has enabled. We
1584 * therefore lie about PGE & PSE as they are unavailable to guests.
1585 */
1586 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1587 break;
1589 default:
1590 goto fail;
1592 break;
1594 case 0x21: /* MOV DR?,<reg> */
1595 opcode = insn_fetch(u8, code_base, eip, code_limit);
1596 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1597 modrm_rm |= (opcode >> 0) & 7;
1598 reg = decode_register(modrm_rm, regs, 0);
1599 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1600 goto fail;
1601 *reg = res;
1602 break;
1604 case 0x22: /* MOV <reg>,CR? */
1605 opcode = insn_fetch(u8, code_base, eip, code_limit);
1606 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1607 modrm_rm |= (opcode >> 0) & 7;
1608 reg = decode_register(modrm_rm, regs, 0);
1609 switch ( modrm_reg )
1611 case 0: /* Write CR0 */
1612 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1614 gdprintk(XENLOG_WARNING,
1615 "Attempt to change unmodifiable CR0 flags.\n");
1616 goto fail;
1618 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1619 break;
1621 case 2: /* Write CR2 */
1622 v->arch.guest_context.ctrlreg[2] = *reg;
1623 arch_set_cr2(v, *reg);
1624 break;
1626 case 3: /* Write CR3 */
1627 LOCK_BIGLOCK(v->domain);
1628 if ( !IS_COMPAT(v->domain) )
1629 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1630 #ifdef CONFIG_COMPAT
1631 else
1632 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1633 #endif
1634 UNLOCK_BIGLOCK(v->domain);
1635 if ( rc == 0 ) /* not okay */
1636 goto fail;
1637 break;
1639 case 4:
1640 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1642 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1643 goto fail;
1645 break;
1647 default:
1648 goto fail;
1650 break;
1652 case 0x23: /* MOV <reg>,DR? */
1653 opcode = insn_fetch(u8, code_base, eip, code_limit);
1654 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1655 modrm_rm |= (opcode >> 0) & 7;
1656 reg = decode_register(modrm_rm, regs, 0);
1657 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1658 goto fail;
1659 break;
1661 case 0x30: /* WRMSR */
1662 switch ( regs->ecx )
1664 #ifdef CONFIG_X86_64
1665 case MSR_FS_BASE:
1666 if ( IS_COMPAT(v->domain) )
1667 goto fail;
1668 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1669 goto fail;
1670 v->arch.guest_context.fs_base =
1671 ((u64)regs->edx << 32) | regs->eax;
1672 break;
1673 case MSR_GS_BASE:
1674 if ( IS_COMPAT(v->domain) )
1675 goto fail;
1676 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1677 goto fail;
1678 v->arch.guest_context.gs_base_kernel =
1679 ((u64)regs->edx << 32) | regs->eax;
1680 break;
1681 case MSR_SHADOW_GS_BASE:
1682 if ( IS_COMPAT(v->domain) )
1683 goto fail;
1684 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1685 goto fail;
1686 v->arch.guest_context.gs_base_user =
1687 ((u64)regs->edx << 32) | regs->eax;
1688 break;
1689 #endif
1690 default:
1691 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1692 break;
1694 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1695 (regs->eax != l) || (regs->edx != h) )
1696 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1697 "%08x:%08x to %08lx:%08lx.\n",
1698 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1699 break;
1701 break;
1703 case 0x32: /* RDMSR */
1704 switch ( regs->ecx )
1706 #ifdef CONFIG_X86_64
1707 case MSR_FS_BASE:
1708 if ( IS_COMPAT(v->domain) )
1709 goto fail;
1710 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1711 regs->edx = v->arch.guest_context.fs_base >> 32;
1712 break;
1713 case MSR_GS_BASE:
1714 if ( IS_COMPAT(v->domain) )
1715 goto fail;
1716 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1717 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1718 break;
1719 case MSR_SHADOW_GS_BASE:
1720 if ( IS_COMPAT(v->domain) )
1721 goto fail;
1722 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1723 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1724 break;
1725 #endif
1726 case MSR_EFER:
1727 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1728 goto fail;
1729 break;
1730 default:
1731 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1733 regs->eax = l;
1734 regs->edx = h;
1735 break;
1737 /* Everyone can read the MSR space. */
1738 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1739 _p(regs->ecx));*/
1740 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1741 goto fail;
1742 break;
1744 break;
1746 default:
1747 goto fail;
1750 #undef wr_ad
1751 #undef rd_ad
1753 done:
1754 regs->eip = eip;
1755 return EXCRET_fault_fixed;
1757 fail:
1758 return 0;
1761 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1762 {
1763 struct vcpu *v = current;
1764 unsigned long fixup;
1766 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1768 if ( regs->error_code & 1 )
1769 goto hardware_gp;
1771 if ( !guest_mode(regs) )
1772 goto gp_in_kernel;
1774 /*
1775 * Cunning trick to allow arbitrary "INT n" handling.
1777 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1778 * instruction from trapping to the appropriate vector, when that might not
1779 * be expected by Xen or the guest OS. For example, that entry might be for
1780 * a fault handler (unlike traps, faults don't increment EIP), or might
1781 * expect an error code on the stack (which a software trap never
1782 * provides), or might be a hardware interrupt handler that doesn't like
1783 * being called spuriously.
1785 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1786 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1787 * clear to indicate that it's a software fault, not hardware.
1789 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1790 * okay because they can only be triggered by an explicit DPL-checked
1791 * instruction. The DPL specified by the guest OS for these vectors is NOT
1792 * CHECKED!!
1793 */
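/*
 * Example: since Xen installs the IDT vectors with DPL 0 (vectors 3 and 4
 * aside), a guest "int $0x80" arrives here with error code
 * (0x80 << 3) | 2 = 0x402; the vector is recovered below as
 * error_code >> 3 and the two-byte instruction is skipped before bouncing
 * the trap to the guest.
 */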
1794 if ( (regs->error_code & 3) == 2 )
1795 {
1796 /* This fault must be due to <INT n> instruction. */
1797 const struct trap_info *ti;
1798 unsigned char vector = regs->error_code >> 3;
1799 ti = &v->arch.guest_context.trap_ctxt[vector];
1800 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1801 {
1802 regs->eip += 2;
1803 return do_guest_trap(vector, regs, 0);
1804 }
1805 }
1807 /* Emulate some simple privileged and I/O instructions. */
1808 if ( (regs->error_code == 0) &&
1809 emulate_privileged_op(regs) )
1810 return 0;
1812 #if defined(__i386__)
1813 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1814 (regs->error_code == 0) &&
1815 gpf_emulate_4gb(regs) )
1816 return 0;
1817 #endif
1819 /* Pass on GPF as is. */
1820 return do_guest_trap(TRAP_gp_fault, regs, 1);
1822 gp_in_kernel:
1824 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1825 {
1826 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1827 regs->error_code, _p(regs->eip), _p(fixup));
1828 regs->eip = fixup;
1829 return 0;
1830 }
1832 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1834 hardware_gp:
1835 show_execution_state(regs);
1836 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1837 return 0;
1838 }
1840 static void nmi_softirq(void)
1841 {
1842 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1843 vcpu_kick(dom0->vcpu[0]);
1844 }
1846 static void nmi_dom0_report(unsigned int reason_idx)
1847 {
1848 struct domain *d;
1849 struct vcpu *v;
1851 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1852 return;
1854 set_bit(reason_idx, nmi_reason(d));
1856 if ( !test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1857 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1858 }
1860 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1861 {
1862 switch ( opt_nmi[0] )
1863 {
1864 case 'd': /* 'dom0' */
1865 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1866 case 'i': /* 'ignore' */
1867 break;
1868 default: /* 'fatal' */
1869 console_force_unlock();
1870 printk("\n\nNMI - MEMORY ERROR\n");
1871 fatal_trap(TRAP_nmi, regs);
1872 }
1874 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1875 mdelay(1);
1876 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1877 }
1879 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1880 {
1881 switch ( opt_nmi[0] )
1882 {
1883 case 'd': /* 'dom0' */
1884 nmi_dom0_report(_XEN_NMIREASON_io_error);
1885 case 'i': /* 'ignore' */
1886 break;
1887 default: /* 'fatal' */
1888 console_force_unlock();
1889 printk("\n\nNMI - I/O ERROR\n");
1890 fatal_trap(TRAP_nmi, regs);
1891 }
1893 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1894 mdelay(1);
1895 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1896 }
1898 static void unknown_nmi_error(unsigned char reason)
1899 {
1900 switch ( opt_nmi[0] )
1901 {
1902 case 'd': /* 'dom0' */
1903 nmi_dom0_report(_XEN_NMIREASON_unknown);
1904 case 'i': /* 'ignore' */
1905 break;
1906 default: /* 'fatal' */
1907 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1908 printk("Dazed and confused, but trying to continue\n");
1909 printk("Do you have a strange power saving mode enabled?\n");
1910 kexec_crash();
1911 }
1912 }
1914 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1915 {
1916 return 0;
1917 }
1919 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1921 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1922 {
1923 unsigned int cpu = smp_processor_id();
1924 unsigned char reason;
1926 ++nmi_count(cpu);
1928 if ( nmi_callback(regs, cpu) )
1929 return;
1931 if ( nmi_watchdog )
1932 nmi_watchdog_tick(regs);
1934 /* Only the BSP gets external NMIs from the system. */
1935 if ( cpu == 0 )
1936 {
1937 reason = inb(0x61);
1938 if ( reason & 0x80 )
1939 mem_parity_error(regs);
1940 else if ( reason & 0x40 )
1941 io_check_error(regs);
1942 else if ( !nmi_watchdog )
1943 unknown_nmi_error((unsigned char)(reason&0xff));
1944 }
1945 }
1947 void set_nmi_callback(nmi_callback_t callback)
1948 {
1949 nmi_callback = callback;
1950 }
1952 void unset_nmi_callback(void)
1953 {
1954 nmi_callback = dummy_nmi_callback;
1955 }
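/*
 * set_nmi_callback() lets another subsystem intercept NMIs ahead of the
 * default decode in do_nmi(): a callback returning non-zero swallows the
 * NMI entirely, skipping the watchdog tick and the port 0x61 checks.
 */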
1957 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1958 {
1959 BUG_ON(!guest_mode(regs));
1961 setup_fpu(current);
1963 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1964 {
1965 do_guest_trap(TRAP_no_device, regs, 0);
1966 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1967 }
1969 return EXCRET_fault_fixed;
1970 }
1972 asmlinkage int do_debug(struct cpu_user_regs *regs)
1973 {
1974 unsigned long condition;
1975 struct vcpu *v = current;
1977 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1979 /* Mask out spurious debug traps due to lazy DR7 setting */
1980 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1981 (v->arch.guest_context.debugreg[7] == 0) )
1982 {
1983 __asm__("mov %0,%%db7" : : "r" (0UL));
1984 goto out;
1985 }
1987 DEBUGGER_trap_entry(TRAP_debug, regs);
1989 if ( !guest_mode(regs) )
1990 {
1991 /* Clear TF just for absolute sanity. */
1992 regs->eflags &= ~EF_TF;
1993 /*
1994 * We ignore watchpoints when they trigger within Xen. This may happen
1995 * when a buffer is passed to us which previously had a watchpoint set
1996 * on it. No need to bump EIP; the only faulting trap is an instruction
1997 * breakpoint, which can't happen to us.
1998 */
1999 goto out;
2000 }
2002 /* Save debug status register where guest OS can peek at it */
2003 v->arch.guest_context.debugreg[6] = condition;
2005 return do_guest_trap(TRAP_debug, regs, 0);
2007 out:
2008 return EXCRET_not_a_fault;
2009 }
2011 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2012 {
2013 return EXCRET_not_a_fault;
2014 }
2016 void set_intr_gate(unsigned int n, void *addr)
2017 {
2018 #ifdef __i386__
2019 int i;
2020 /* Keep secondary tables in sync with IRQ updates. */
2021 for ( i = 1; i < NR_CPUS; i++ )
2022 if ( idt_tables[i] != NULL )
2023 _set_gate(&idt_tables[i][n], 14, 0, addr);
2024 #endif
2025 _set_gate(&idt_table[n], 14, 0, addr);
2026 }
2028 void set_system_gate(unsigned int n, void *addr)
2029 {
2030 _set_gate(idt_table+n,14,3,addr);
2031 }
2033 void set_task_gate(unsigned int n, unsigned int sel)
2034 {
2035 idt_table[n].a = sel << 16;
2036 idt_table[n].b = 0x8500;
2037 }
2039 void set_tss_desc(unsigned int n, void *addr)
2040 {
2041 _set_tssldt_desc(
2042 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2043 (unsigned long)addr,
2044 offsetof(struct tss_struct, __cacheline_filler) - 1,
2045 9);
2046 #ifdef CONFIG_COMPAT
2047 _set_tssldt_desc(
2048 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2049 (unsigned long)addr,
2050 offsetof(struct tss_struct, __cacheline_filler) - 1,
2051 11);
2052 #endif
2053 }
2055 void __init trap_init(void)
2056 {
2057 extern void percpu_traps_init(void);
2059 /*
2060 * Note that interrupt gates are always used, rather than trap gates. We
2061 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2062 * first activation must have the "bad" value(s) for these registers and
2063 * we may lose them if another activation is installed before they are
2064 * saved. The page-fault handler also needs interrupts disabled until %cr2
2065 * has been read and saved on the stack.
2066 */
2067 set_intr_gate(TRAP_divide_error,&divide_error);
2068 set_intr_gate(TRAP_debug,&debug);
2069 set_intr_gate(TRAP_nmi,&nmi);
2070 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2071 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2072 set_intr_gate(TRAP_bounds,&bounds);
2073 set_intr_gate(TRAP_invalid_op,&invalid_op);
2074 set_intr_gate(TRAP_no_device,&device_not_available);
2075 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2076 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2077 set_intr_gate(TRAP_no_segment,&segment_not_present);
2078 set_intr_gate(TRAP_stack_error,&stack_segment);
2079 set_intr_gate(TRAP_gp_fault,&general_protection);
2080 set_intr_gate(TRAP_page_fault,&page_fault);
2081 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2082 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2083 set_intr_gate(TRAP_alignment_check,&alignment_check);
2084 set_intr_gate(TRAP_machine_check,&machine_check);
2085 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2087 percpu_traps_init();
2089 cpu_init();
2091 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2092 }
2095 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2096 {
2097 struct trap_info cur;
2098 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2099 long rc = 0;
2101 /* If no table is presented then clear the entire virtual IDT. */
2102 if ( guest_handle_is_null(traps) )
2103 {
2104 memset(dst, 0, 256 * sizeof(*dst));
2105 init_int80_direct_trap(current);
2106 return 0;
2107 }
2109 for ( ; ; )
2110 {
2111 if ( hypercall_preempt_check() )
2112 {
2113 rc = hypercall_create_continuation(
2114 __HYPERVISOR_set_trap_table, "h", traps);
2115 break;
2116 }
2118 if ( copy_from_guest(&cur, traps, 1) )
2119 {
2120 rc = -EFAULT;
2121 break;
2122 }
2124 if ( cur.address == 0 )
2125 break;
2127 fixup_guest_code_selector(current->domain, cur.cs);
2129 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2131 if ( cur.vector == 0x80 )
2132 init_int80_direct_trap(current);
2134 guest_handle_add_offset(traps, 1);
2135 }
2137 return rc;
2138 }
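/*
 * A PV guest typically calls HYPERVISOR_set_trap_table once at start of
 * day with an array of trap_info entries terminated by an entry whose
 * address is zero; registering vector 0x80 additionally arms the fast
 * int80 direct-trap path.
 */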
2141 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2143 int i;
2145 switch ( reg )
2147 case 0:
2148 if ( !access_ok(value, sizeof(long)) )
2149 return -EPERM;
2150 if ( p == current )
2151 __asm__ ( "mov %0, %%db0" : : "r" (value) );
2152 break;
2153 case 1:
2154 if ( !access_ok(value, sizeof(long)) )
2155 return -EPERM;
2156 if ( p == current )
2157 __asm__ ( "mov %0, %%db1" : : "r" (value) );
2158 break;
2159 case 2:
2160 if ( !access_ok(value, sizeof(long)) )
2161 return -EPERM;
2162 if ( p == current )
2163 __asm__ ( "mov %0, %%db2" : : "r" (value) );
2164 break;
2165 case 3:
2166 if ( !access_ok(value, sizeof(long)) )
2167 return -EPERM;
2168 if ( p == current )
2169 __asm__ ( "mov %0, %%db3" : : "r" (value) );
2170 break;
2171 case 6:
2172 /*
2173 * DR6: Bits 4-11,16-31 reserved (set to 1).
2174 * Bit 12 reserved (set to 0).
2175 */
2176 value &= 0xffffefff; /* reserved bits => 0 */
2177 value |= 0xffff0ff0; /* reserved bits => 1 */
2178 if ( p == current )
2179 __asm__ ( "mov %0, %%db6" : : "r" (value) );
2180 break;
2181 case 7:
2182 /*
2183 * DR7: Bit 10 reserved (set to 1).
2184 * Bits 11-12,14-15 reserved (set to 0).
2185 * Privileged bits:
2186 * GD (bit 13): must be 0.
2187 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2188 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2189 */
2190 /* DR7 == 0 => debugging disabled for this domain. */
2191 if ( value != 0 )
2193 value &= 0xffff27ff; /* reserved bits => 0 */
2194 value |= 0x00000400; /* reserved bits => 1 */
2195 if ( (value & (1<<13)) != 0 ) return -EPERM;
2196 for ( i = 0; i < 16; i += 2 )
2197 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2199 if ( p == current )
2200 __asm__ ( "mov %0, %%db7" : : "r" (value) );
2201 break;
2202 default:
2203 return -EINVAL;
2204 }
2206 p->arch.guest_context.debugreg[reg] = value;
2207 return 0;
2208 }
2210 long do_set_debugreg(int reg, unsigned long value)
2211 {
2212 return set_debugreg(current, reg, value);
2213 }
2215 unsigned long do_get_debugreg(int reg)
2216 {
2217 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2218 return current->arch.guest_context.debugreg[reg];
2219 }
2221 /*
2222 * Local variables:
2223 * mode: C
2224 * c-set-style: "BSD"
2225 * c-basic-offset: 4
2226 * tab-width: 4
2227 * indent-tabs-mode: nil
2228 * End:
2229 */