direct-io.hg

view xen/arch/x86/traps.c @ 15399:45a44a9cbe8d

Enhance guest memory accessor macros so that source operands can be
pointers to const or arrays.

Only build-tested on ia64, and untested on powerpc (which is almost
identical to ia64 here, except for an apparent bug in the original
version of __copy_field_{from,to}_guest: the field offset was being
multiplied by the field size).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Wed Jun 20 15:29:53 2007 +0100 (2007-06-20)
parents 342c85cfd00b
children 899a44cb6ef6
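The macros the description refers to live in the per-architecture guest_access.h headers, not in this file. As a schematic illustration only (not the verbatim Xen definitions, and with copy_to_guest_raw()/copy_from_guest_raw() as stand-in names for the low-level per-port copy primitives), the two points of the change look roughly like this:

/* Schematic sketch, not the actual guest_access.h macros. */

/* Source operands may now be pointers to const (or arrays): binding the
 * source through a const-qualified local lets such callers compile cleanly. */
#define copy_to_guest_offset(hnd, off, ptr, nr) ({                    \
    const typeof(*(ptr)) *_s = (ptr);                                 \
    copy_to_guest_raw((hnd).p + (off), _s, (nr) * sizeof(*_s));       \
})

/* The field accessors must add the field's byte offset to the handle; the
 * ia64/powerpc bug noted above effectively used offsetof(...) * sizeof(field),
 * mis-addressing every field after the first. */
#define copy_field_from_guest(ptr, hnd, field) ({                     \
    const void *_src = (const char *)(hnd).p +                        \
                       offsetof(typeof(*(hnd).p), field);             \
    copy_from_guest_raw(&(ptr)->field, _src, sizeof((ptr)->field));   \
})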
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <asm/paging.h>
50 #include <asm/system.h>
51 #include <asm/io.h>
52 #include <asm/atomic.h>
53 #include <asm/desc.h>
54 #include <asm/debugreg.h>
55 #include <asm/smp.h>
56 #include <asm/flushtlb.h>
57 #include <asm/uaccess.h>
58 #include <asm/i387.h>
59 #include <asm/debugger.h>
60 #include <asm/msr.h>
61 #include <asm/shared.h>
62 #include <asm/x86_emulate.h>
63 #include <asm/hvm/vpt.h>
65 /*
66 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
67 * fatal: Xen prints diagnostic message and then hangs.
68 * dom0: The NMI is virtualised to DOM0.
69 * ignore: The NMI error is cleared and ignored.
70 */
71 #ifdef NDEBUG
72 char opt_nmi[10] = "dom0";
73 #else
74 char opt_nmi[10] = "fatal";
75 #endif
76 string_param("nmi", opt_nmi);
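/* (Editorial note, not part of the original file:) string_param() above
 * registers "nmi" as a Xen boot command-line option, so booting with e.g.
 * "nmi=ignore" selects the last of the three behaviours described in the
 * comment. */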
78 /* Master table, used by CPU0. */
79 idt_entry_t idt_table[IDT_ENTRIES];
81 /* Pointer to the IDT of every CPU. */
82 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
84 #define DECLARE_TRAP_HANDLER(_name) \
85 asmlinkage void _name(void); \
86 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
88 asmlinkage void nmi(void);
89 DECLARE_TRAP_HANDLER(divide_error);
90 DECLARE_TRAP_HANDLER(debug);
91 DECLARE_TRAP_HANDLER(int3);
92 DECLARE_TRAP_HANDLER(overflow);
93 DECLARE_TRAP_HANDLER(bounds);
94 DECLARE_TRAP_HANDLER(invalid_op);
95 DECLARE_TRAP_HANDLER(device_not_available);
96 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
97 DECLARE_TRAP_HANDLER(invalid_TSS);
98 DECLARE_TRAP_HANDLER(segment_not_present);
99 DECLARE_TRAP_HANDLER(stack_segment);
100 DECLARE_TRAP_HANDLER(general_protection);
101 DECLARE_TRAP_HANDLER(page_fault);
102 DECLARE_TRAP_HANDLER(coprocessor_error);
103 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
104 DECLARE_TRAP_HANDLER(alignment_check);
105 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
106 DECLARE_TRAP_HANDLER(machine_check);
108 long do_set_debugreg(int reg, unsigned long value);
109 unsigned long do_get_debugreg(int reg);
111 static int debug_stack_lines = 20;
112 integer_param("debug_stack_lines", debug_stack_lines);
114 #ifdef CONFIG_X86_32
115 #define stack_words_per_line 8
116 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
117 #else
118 #define stack_words_per_line 4
119 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
120 #endif
122 static void show_guest_stack(struct cpu_user_regs *regs)
123 {
124 int i;
125 unsigned long *stack, addr;
127 if ( is_hvm_vcpu(current) )
128 return;
130 if ( is_pv_32on64_vcpu(current) )
131 {
132 compat_show_guest_stack(regs, debug_stack_lines);
133 return;
134 }
136 if ( vm86_mode(regs) )
137 {
138 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
139 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
140 regs->ss, (uint16_t)(regs->esp & 0xffff));
141 }
142 else
143 {
144 stack = (unsigned long *)regs->esp;
145 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
146 }
148 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
149 {
150 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
151 break;
152 if ( get_user(addr, stack) )
153 {
154 if ( i != 0 )
155 printk("\n ");
156 printk("Fault while accessing guest memory.");
157 i = 1;
158 break;
159 }
160 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
161 printk("\n ");
162 printk(" %p", _p(addr));
163 stack++;
164 }
165 if ( i == 0 )
166 printk("Stack empty.");
167 printk("\n");
168 }
170 #if !defined(CONFIG_FRAME_POINTER)
172 static void show_trace(struct cpu_user_regs *regs)
173 {
174 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
176 printk("Xen call trace:\n ");
178 printk("[<%p>]", _p(regs->eip));
179 print_symbol(" %s\n ", regs->eip);
181 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
182 {
183 addr = *stack++;
184 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
185 {
186 printk("[<%p>]", _p(addr));
187 print_symbol(" %s\n ", addr);
188 }
189 }
191 printk("\n");
192 }
194 #else
196 static void show_trace(struct cpu_user_regs *regs)
197 {
198 unsigned long *frame, next, addr, low, high;
200 printk("Xen call trace:\n ");
202 printk("[<%p>]", _p(regs->eip));
203 print_symbol(" %s\n ", regs->eip);
205 /* Bounds for range of valid frame pointer. */
206 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
207 high = (low & ~(STACK_SIZE - 1)) +
208 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
210 /* The initial frame pointer. */
211 next = regs->ebp;
213 for ( ; ; )
214 {
215 /* Valid frame pointer? */
216 if ( (next < low) || (next >= high) )
217 {
218 /*
219 * Exception stack frames have a different layout, denoted by an
220 * inverted frame pointer.
221 */
222 next = ~next;
223 if ( (next < low) || (next >= high) )
224 break;
225 frame = (unsigned long *)next;
226 next = frame[0];
227 addr = frame[(offsetof(struct cpu_user_regs, eip) -
228 offsetof(struct cpu_user_regs, ebp))
229 / BYTES_PER_LONG];
230 }
231 else
232 {
233 /* Ordinary stack frame. */
234 frame = (unsigned long *)next;
235 next = frame[0];
236 addr = frame[1];
237 }
239 printk("[<%p>]", _p(addr));
240 print_symbol(" %s\n ", addr);
242 low = (unsigned long)&frame[2];
243 }
245 printk("\n");
246 }
248 #endif
250 void show_stack(struct cpu_user_regs *regs)
251 {
252 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
253 int i;
255 if ( guest_mode(regs) )
256 return show_guest_stack(regs);
258 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
260 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
261 {
262 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
263 break;
264 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
265 printk("\n ");
266 addr = *stack++;
267 printk(" %p", _p(addr));
268 }
269 if ( i == 0 )
270 printk("Stack empty.");
271 printk("\n");
273 show_trace(regs);
274 }
276 void show_stack_overflow(unsigned int cpu, unsigned long esp)
277 {
278 #ifdef MEMORY_GUARD
279 unsigned long esp_top, esp_bottom;
280 unsigned long *stack, addr;
282 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
283 esp_top = esp_bottom - DEBUG_STACK_SIZE;
285 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
286 (void *)esp_top, (void *)esp_bottom, (void *)esp,
287 (void *)init_tss[cpu].esp0);
289 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
290 if ( ((unsigned long)(esp - esp_top) > 512) &&
291 ((unsigned long)(esp_top - esp) > 512) )
292 {
293 printk("No stack overflow detected. Skipping stack trace.\n");
294 return;
295 }
297 if ( esp < esp_top )
298 esp = esp_top;
300 printk("Xen stack overflow (dumping trace %p-%p):\n ",
301 (void *)esp, (void *)esp_bottom);
303 stack = (unsigned long *)esp;
304 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
305 {
306 addr = *stack++;
307 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
308 {
309 printk("%p: [<%p>]", stack, _p(addr));
310 print_symbol(" %s\n ", addr);
311 }
312 }
314 printk("\n");
315 #endif
316 }
318 void show_execution_state(struct cpu_user_regs *regs)
319 {
320 show_registers(regs);
321 show_stack(regs);
322 }
324 char *trapstr(int trapnr)
325 {
326 static char *strings[] = {
327 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
328 "invalid opcode", "device not available", "double fault",
329 "coprocessor segment", "invalid tss", "segment not found",
330 "stack error", "general protection fault", "page fault",
331 "spurious interrupt", "coprocessor error", "alignment check",
332 "machine check", "simd error"
333 };
335 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
336 return "???";
338 return strings[trapnr];
339 }
341 /*
342 * This is called for faults at very unexpected times (e.g., when interrupts
343 * are disabled). In such situations we can't do much that is safe. We try to
344 * print out some tracing and then we just spin.
345 */
346 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
347 {
348 static DEFINE_PER_CPU(char, depth);
350 /*
351 * In some cases, we can end up in a vicious cycle of fatal_trap()s
352 * within fatal_trap()s. We give the problem a couple of iterations to
353 * bottom out, and then we just panic.
354 */
355 if ( ++this_cpu(depth) < 3 )
356 {
357 watchdog_disable();
358 console_start_sync();
360 show_execution_state(regs);
362 if ( trapnr == TRAP_page_fault )
363 {
364 unsigned long cr2 = read_cr2();
365 printk("Faulting linear address: %p\n", _p(cr2));
366 show_page_walk(cr2);
367 }
368 }
370 panic("FATAL TRAP: vector = %d (%s)\n"
371 "[error_code=%04x] %s\n",
372 trapnr, trapstr(trapnr), regs->error_code,
373 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
374 }
376 static int do_guest_trap(
377 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
378 {
379 struct vcpu *v = current;
380 struct trap_bounce *tb;
381 const struct trap_info *ti;
383 tb = &v->arch.trap_bounce;
384 ti = &v->arch.guest_context.trap_ctxt[trapnr];
386 tb->flags = TBF_EXCEPTION;
387 tb->cs = ti->cs;
388 tb->eip = ti->address;
390 if ( use_error_code )
391 {
392 tb->flags |= TBF_EXCEPTION_ERRCODE;
393 tb->error_code = regs->error_code;
394 }
396 if ( TI_GET_IF(ti) )
397 tb->flags |= TBF_INTERRUPT;
399 if ( unlikely(null_trap_bounce(v, tb)) )
400 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
401 "domain %d on VCPU %d [ec=%04x]\n",
402 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
403 regs->error_code);
405 return 0;
406 }
408 static inline int do_trap(
409 int trapnr, struct cpu_user_regs *regs, int use_error_code)
410 {
411 unsigned long fixup;
413 DEBUGGER_trap_entry(trapnr, regs);
415 if ( guest_mode(regs) )
416 return do_guest_trap(trapnr, regs, use_error_code);
418 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
419 {
420 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
421 trapnr, _p(regs->eip), _p(fixup));
422 regs->eip = fixup;
423 return 0;
424 }
426 DEBUGGER_trap_fatal(trapnr, regs);
428 show_execution_state(regs);
429 panic("FATAL TRAP: vector = %d (%s)\n"
430 "[error_code=%04x]\n",
431 trapnr, trapstr(trapnr), regs->error_code);
432 return 0;
433 }
435 #define DO_ERROR_NOCODE(trapnr, name) \
436 asmlinkage int do_##name(struct cpu_user_regs *regs) \
437 { \
438 return do_trap(trapnr, regs, 0); \
439 }
441 #define DO_ERROR(trapnr, name) \
442 asmlinkage int do_##name(struct cpu_user_regs *regs) \
443 { \
444 return do_trap(trapnr, regs, 1); \
445 }
447 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
448 DO_ERROR_NOCODE(TRAP_overflow, overflow)
449 DO_ERROR_NOCODE(TRAP_bounds, bounds)
450 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
451 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
452 DO_ERROR( TRAP_no_segment, segment_not_present)
453 DO_ERROR( TRAP_stack_error, stack_segment)
454 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
455 DO_ERROR( TRAP_alignment_check, alignment_check)
456 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
458 int rdmsr_hypervisor_regs(
459 uint32_t idx, uint32_t *eax, uint32_t *edx)
460 {
461 idx -= 0x40000000;
462 if ( idx > 0 )
463 return 0;
465 switch ( idx )
466 {
467 case 0:
468 {
469 *eax = *edx = 0;
470 break;
471 }
472 default:
473 BUG();
474 }
476 return 1;
477 }
479 int wrmsr_hypervisor_regs(
480 uint32_t idx, uint32_t eax, uint32_t edx)
481 {
482 struct domain *d = current->domain;
484 idx -= 0x40000000;
485 if ( idx > 0 )
486 return 0;
488 switch ( idx )
489 {
490 case 0:
491 {
492 void *hypercall_page;
493 unsigned long mfn;
494 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
495 unsigned int idx = eax & 0xfff;
497 if ( idx > 0 )
498 {
499 gdprintk(XENLOG_WARNING,
500 "Dom%d: Out of range index %u to MSR %08x\n",
501 d->domain_id, idx, 0x40000000);
502 return 0;
503 }
505 mfn = gmfn_to_mfn(d, gmfn);
507 if ( !mfn_valid(mfn) ||
508 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
509 {
510 gdprintk(XENLOG_WARNING,
511 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
512 d->domain_id, gmfn, mfn, 0x40000000);
513 return 0;
514 }
516 hypercall_page = map_domain_page(mfn);
517 hypercall_page_initialise(d, hypercall_page);
518 unmap_domain_page(hypercall_page);
520 put_page_and_type(mfn_to_page(mfn));
521 break;
522 }
524 default:
525 BUG();
526 }
528 return 1;
529 }
531 int cpuid_hypervisor_leaves(
532 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
533 {
534 idx -= 0x40000000;
535 if ( idx > 2 )
536 return 0;
538 switch ( idx )
539 {
540 case 0:
541 *eax = 0x40000002; /* Largest leaf */
542 *ebx = 0x566e6558; /* Signature 1: "XenV" */
543 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
544 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
545 break;
547 case 1:
548 *eax = (xen_major_version() << 16) | xen_minor_version();
549 *ebx = 0; /* Reserved */
550 *ecx = 0; /* Reserved */
551 *edx = 0; /* Reserved */
552 break;
554 case 2:
555 *eax = 1; /* Number of hypercall-transfer pages */
556 *ebx = 0x40000000; /* MSR base address */
557 *ecx = 0; /* Features 1 */
558 *edx = 0; /* Features 2 */
559 break;
561 default:
562 BUG();
563 }
565 return 1;
566 }
568 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
569 {
570 char sig[5], instr[2];
571 uint32_t a, b, c, d;
572 unsigned long eip, rc;
574 a = regs->eax;
575 b = regs->ebx;
576 c = regs->ecx;
577 d = regs->edx;
578 eip = regs->eip;
580 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
581 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
582 {
583 propagate_page_fault(eip + sizeof(sig) - rc, 0);
584 return EXCRET_fault_fixed;
585 }
586 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
587 return 0;
588 eip += sizeof(sig);
590 /* We only emulate CPUID. */
591 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
592 {
593 propagate_page_fault(eip + sizeof(instr) - rc, 0);
594 return EXCRET_fault_fixed;
595 }
596 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
597 return 0;
598 eip += sizeof(instr);
600 __asm__ (
601 "cpuid"
602 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
603 : "0" (a), "1" (b), "2" (c), "3" (d) );
605 if ( regs->eax == 1 )
606 {
607 /* Modify Feature Information. */
608 clear_bit(X86_FEATURE_VME, &d);
609 clear_bit(X86_FEATURE_DE, &d);
610 clear_bit(X86_FEATURE_PSE, &d);
611 clear_bit(X86_FEATURE_PGE, &d);
612 if ( !supervisor_mode_kernel )
613 clear_bit(X86_FEATURE_SEP, &d);
614 if ( !IS_PRIV(current->domain) )
615 clear_bit(X86_FEATURE_MTRR, &d);
616 }
617 else if ( regs->eax == 0x80000001 )
618 {
619 /* Modify Feature Information. */
620 if ( is_pv_32bit_vcpu(current) )
621 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
622 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
623 }
624 else
625 {
626 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
627 }
629 regs->eax = a;
630 regs->ebx = b;
631 regs->ecx = c;
632 regs->edx = d;
633 regs->eip = eip;
635 return EXCRET_fault_fixed;
636 }
638 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
639 {
640 struct bug_frame bug;
641 struct bug_frame_str bug_str;
642 char *filename, *predicate, *eip = (char *)regs->eip;
643 int rc, id, lineno;
645 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
647 if ( likely(guest_mode(regs)) )
648 {
649 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
650 return rc;
651 return do_guest_trap(TRAP_invalid_op, regs, 0);
652 }
654 if ( !is_kernel(eip) ||
655 __copy_from_user(&bug, eip, sizeof(bug)) ||
656 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
657 (bug.ret != 0xc2) )
658 goto die;
659 eip += sizeof(bug);
661 id = bug.id & 3;
663 if ( id == BUGFRAME_dump )
664 {
665 show_execution_state(regs);
666 regs->eip = (unsigned long)eip;
667 return EXCRET_fault_fixed;
668 }
670 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
671 if ( !is_kernel(eip) ||
672 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
673 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
674 goto die;
675 eip += sizeof(bug_str);
677 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
678 lineno = bug.id >> 2;
680 if ( id == BUGFRAME_warn )
681 {
682 printk("Xen WARN at %.50s:%d\n", filename, lineno);
683 show_execution_state(regs);
684 regs->eip = (unsigned long)eip;
685 return EXCRET_fault_fixed;
686 }
688 if ( id == BUGFRAME_bug )
689 {
690 printk("Xen BUG at %.50s:%d\n", filename, lineno);
691 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
692 show_execution_state(regs);
693 panic("Xen BUG at %.50s:%d\n", filename, lineno);
694 }
696 /* ASSERT: decode the predicate string pointer. */
697 ASSERT(id == BUGFRAME_assert);
698 if ( !is_kernel(eip) ||
699 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
700 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
701 goto die;
702 eip += sizeof(bug_str);
704 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
705 printk("Assertion '%s' failed at %.50s:%d\n",
706 predicate, filename, lineno);
707 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
708 show_execution_state(regs);
709 panic("Assertion '%s' failed at %.50s:%d\n",
710 predicate, filename, lineno);
712 die:
713 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
714 show_execution_state(regs);
715 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
716 return 0;
717 }
719 asmlinkage int do_int3(struct cpu_user_regs *regs)
720 {
721 DEBUGGER_trap_entry(TRAP_int3, regs);
723 if ( !guest_mode(regs) )
724 {
725 DEBUGGER_trap_fatal(TRAP_int3, regs);
726 show_execution_state(regs);
727 panic("FATAL TRAP: vector = 3 (Int3)\n");
728 }
730 return do_guest_trap(TRAP_int3, regs, 0);
731 }
733 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
734 {
735 fatal_trap(TRAP_machine_check, regs);
736 return 0;
737 }
739 void propagate_page_fault(unsigned long addr, u16 error_code)
740 {
741 struct trap_info *ti;
742 struct vcpu *v = current;
743 struct trap_bounce *tb = &v->arch.trap_bounce;
745 v->arch.guest_context.ctrlreg[2] = addr;
746 arch_set_cr2(v, addr);
748 /* Re-set error_code.user flag appropriately for the guest. */
749 error_code &= ~PFEC_user_mode;
750 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
751 error_code |= PFEC_user_mode;
753 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
754 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
755 tb->error_code = error_code;
756 tb->cs = ti->cs;
757 tb->eip = ti->address;
758 if ( TI_GET_IF(ti) )
759 tb->flags |= TBF_INTERRUPT;
760 if ( unlikely(null_trap_bounce(v, tb)) )
761 {
762 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
763 v->domain->domain_id, v->vcpu_id, error_code);
764 show_page_walk(addr);
765 }
766 }
768 static int handle_gdt_ldt_mapping_fault(
769 unsigned long offset, struct cpu_user_regs *regs)
770 {
771 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
772 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
773 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
775 /* Should never fault in another vcpu's area. */
776 BUG_ON(vcpu_area != current->vcpu_id);
778 /* Byte offset within the gdt/ldt sub-area. */
779 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
781 if ( likely(is_ldt_area) )
782 {
783 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
784 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
785 {
786 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
787 if ( !guest_mode(regs) )
788 return 0;
789 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
790 propagate_page_fault(
791 current->arch.guest_context.ldt_base + offset,
792 regs->error_code);
793 }
794 }
795 else
796 {
797 /* GDT fault: handle the fault as #GP(selector). */
798 regs->error_code = (u16)offset & ~7;
799 (void)do_general_protection(regs);
800 }
802 return EXCRET_fault_fixed;
803 }
805 #ifdef HYPERVISOR_VIRT_END
806 #define IN_HYPERVISOR_RANGE(va) \
807 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
808 #else
809 #define IN_HYPERVISOR_RANGE(va) \
810 (((va) >= HYPERVISOR_VIRT_START))
811 #endif
813 static int __spurious_page_fault(
814 unsigned long addr, struct cpu_user_regs *regs)
815 {
816 unsigned long mfn, cr3 = read_cr3();
817 #if CONFIG_PAGING_LEVELS >= 4
818 l4_pgentry_t l4e, *l4t;
819 #endif
820 #if CONFIG_PAGING_LEVELS >= 3
821 l3_pgentry_t l3e, *l3t;
822 #endif
823 l2_pgentry_t l2e, *l2t;
824 l1_pgentry_t l1e, *l1t;
825 unsigned int required_flags, disallowed_flags;
827 /* Reserved bit violations are never spurious faults. */
828 if ( regs->error_code & PFEC_reserved_bit )
829 return 0;
831 required_flags = _PAGE_PRESENT;
832 if ( regs->error_code & PFEC_write_access )
833 required_flags |= _PAGE_RW;
834 if ( regs->error_code & PFEC_user_mode )
835 required_flags |= _PAGE_USER;
837 disallowed_flags = 0;
838 if ( regs->error_code & PFEC_insn_fetch )
839 disallowed_flags |= _PAGE_NX;
841 mfn = cr3 >> PAGE_SHIFT;
843 #if CONFIG_PAGING_LEVELS >= 4
844 l4t = map_domain_page(mfn);
845 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
846 mfn = l4e_get_pfn(l4e);
847 unmap_domain_page(l4t);
848 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
849 (l4e_get_flags(l4e) & disallowed_flags) )
850 return 0;
851 #endif
853 #if CONFIG_PAGING_LEVELS >= 3
854 l3t = map_domain_page(mfn);
855 #ifdef CONFIG_X86_PAE
856 l3t += (cr3 & 0xFE0UL) >> 3;
857 #endif
858 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
859 mfn = l3e_get_pfn(l3e);
860 unmap_domain_page(l3t);
861 #ifdef CONFIG_X86_PAE
862 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
863 return 0;
864 #else
865 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
866 (l3e_get_flags(l3e) & disallowed_flags) )
867 return 0;
868 #endif
869 #endif
871 l2t = map_domain_page(mfn);
872 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
873 mfn = l2e_get_pfn(l2e);
874 unmap_domain_page(l2t);
875 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
876 (l2e_get_flags(l2e) & disallowed_flags) )
877 return 0;
878 if ( l2e_get_flags(l2e) & _PAGE_PSE )
879 {
880 l1e = l1e_empty(); /* define before use in debug tracing */
881 goto spurious;
882 }
884 l1t = map_domain_page(mfn);
885 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
886 mfn = l1e_get_pfn(l1e);
887 unmap_domain_page(l1t);
888 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
889 (l1e_get_flags(l1e) & disallowed_flags) )
890 return 0;
892 spurious:
893 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
894 "at addr %lx, e/c %04x\n",
895 current->domain->domain_id, current->vcpu_id,
896 addr, regs->error_code);
897 #if CONFIG_PAGING_LEVELS >= 4
898 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
899 #endif
900 #if CONFIG_PAGING_LEVELS >= 3
901 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
902 #endif
903 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
904 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
905 #ifndef NDEBUG
906 show_registers(regs);
907 #endif
908 return 1;
909 }
911 static int spurious_page_fault(
912 unsigned long addr, struct cpu_user_regs *regs)
913 {
914 unsigned long flags;
915 int is_spurious;
917 /*
918 * Disabling interrupts prevents TLB flushing, and hence prevents
919 * page tables from becoming invalid under our feet during the walk.
920 */
921 local_irq_save(flags);
922 is_spurious = __spurious_page_fault(addr, regs);
923 local_irq_restore(flags);
925 return is_spurious;
926 }
928 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
929 {
930 struct vcpu *v = current;
931 struct domain *d = v->domain;
933 /* No fixups in interrupt context or when interrupts are disabled. */
934 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
935 return 0;
937 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
938 {
939 if ( paging_mode_external(d) && guest_mode(regs) )
940 return paging_fault(addr, regs);
941 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
942 return handle_gdt_ldt_mapping_fault(
943 addr - GDT_LDT_VIRT_START, regs);
944 return 0;
945 }
947 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
948 guest_kernel_mode(v, regs) &&
949 /* Do not check if access-protection fault since the page may
950 legitimately be not present in shadow page tables */
951 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
952 ptwr_do_page_fault(v, addr, regs) )
953 return EXCRET_fault_fixed;
955 if ( paging_mode_enabled(d) )
956 return paging_fault(addr, regs);
958 return 0;
959 }
961 /*
962 * #PF error code:
963 * Bit 0: Protection violation (=1) ; Page not present (=0)
964 * Bit 1: Write access
965 * Bit 2: User mode (=1) ; Supervisor mode (=0)
966 * Bit 3: Reserved bit violation
967 * Bit 4: Instruction fetch
968 */
969 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
970 {
971 unsigned long addr, fixup;
972 int rc;
974 addr = read_cr2();
976 DEBUGGER_trap_entry(TRAP_page_fault, regs);
978 perfc_incr(page_faults);
980 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
981 return rc;
983 if ( unlikely(!guest_mode(regs)) )
984 {
985 if ( spurious_page_fault(addr, regs) )
986 return EXCRET_not_a_fault;
988 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
989 {
990 perfc_incr(copy_user_faults);
991 regs->eip = fixup;
992 return 0;
993 }
995 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
997 show_execution_state(regs);
998 show_page_walk(addr);
999 panic("FATAL PAGE FAULT\n"
1000 "[error_code=%04x]\n"
1001 "Faulting linear address: %p\n",
1002 regs->error_code, _p(addr));
1003 }
1005 propagate_page_fault(addr, regs->error_code);
1006 return 0;
1007 }
1009 /*
1010 * Early handler to deal with spurious page faults. For example, consider a
1011 * routine that uses a mapping immediately after installing it (making it
1012 * present). The CPU may speculatively execute the memory access before
1013 * executing the PTE write. The instruction will then be marked to cause a
1014 * page fault when it is retired, despite the fact that the PTE is present and
1015 * correct at that point in time.
1016 */
1017 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1018 {
1019 static int stuck;
1020 static unsigned long prev_eip, prev_cr2;
1021 unsigned long cr2 = read_cr2();
1023 BUG_ON(smp_processor_id() != 0);
1025 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1026 {
1027 prev_eip = regs->eip;
1028 prev_cr2 = cr2;
1029 stuck = 0;
1030 return EXCRET_not_a_fault;
1031 }
1033 if ( stuck++ == 1000 )
1034 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1035 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1037 return EXCRET_not_a_fault;
1038 }
1040 long do_fpu_taskswitch(int set)
1041 {
1042 struct vcpu *v = current;
1044 if ( set )
1045 {
1046 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1047 stts();
1048 }
1049 else
1050 {
1051 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1052 if ( v->fpu_dirtied )
1053 clts();
1054 }
1056 return 0;
1057 }
1059 static int read_descriptor(unsigned int sel,
1060 const struct vcpu *v,
1061 const struct cpu_user_regs * regs,
1062 unsigned long *base,
1063 unsigned long *limit,
1064 unsigned int *ar,
1065 unsigned int vm86attr)
1067 struct desc_struct desc;
1069 if ( !vm86_mode(regs) )
1071 if ( sel < 4)
1072 desc.b = desc.a = 0;
1073 else if ( __get_user(desc,
1074 (const struct desc_struct *)(!(sel & 4)
1075 ? GDT_VIRT_START(v)
1076 : LDT_VIRT_START(v))
1077 + (sel >> 3)) )
1078 return 0;
1079 if ( !(vm86attr & _SEGMENT_CODE) )
1080 desc.b &= ~_SEGMENT_L;
1082 else
1084 desc.a = (sel << 20) | 0xffff;
1085 desc.b = vm86attr | (sel >> 12);
1088 *ar = desc.b & 0x00f0ff00;
1089 if ( !(desc.b & _SEGMENT_L) )
1091 *base = (desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000);
1092 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1093 if ( desc.b & _SEGMENT_G )
1094 *limit = ((*limit + 1) << 12) - 1;
1095 #ifndef NDEBUG
1096 if ( !vm86_mode(regs) && sel > 3 )
1098 unsigned int a, l;
1099 unsigned char valid;
1101 __asm__("larl %2, %0\n\tsetz %1" : "=r" (a), "=rm" (valid) : "rm" (sel));
1102 BUG_ON(valid && (a & 0x00f0ff00) != *ar);
1103 __asm__("lsll %2, %0\n\tsetz %1" : "=r" (l), "=rm" (valid) : "rm" (sel));
1104 BUG_ON(valid && l != *limit);
1106 #endif
1108 else
1110 *base = 0UL;
1111 *limit = ~0UL;
1114 return 1;
1117 /* Has the guest requested sufficient permission for this I/O access? */
1118 static inline int guest_io_okay(
1119 unsigned int port, unsigned int bytes,
1120 struct vcpu *v, struct cpu_user_regs *regs)
1121 {
1122 #if defined(__x86_64__)
1123 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1124 int user_mode = !(v->arch.flags & TF_kernel_mode);
1125 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1126 #elif defined(__i386__)
1127 #define TOGGLE_MODE() ((void)0)
1128 #endif
1130 if ( !vm86_mode(regs) &&
1131 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1132 return 1;
1134 if ( v->arch.iobmp_limit > (port + bytes) )
1135 {
1136 union { uint8_t bytes[2]; uint16_t mask; } x;
1138 /*
1139 * Grab permission bytes from guest space. Inaccessible bytes are
1140 * read as 0xff (no access allowed).
1141 */
1142 TOGGLE_MODE();
1143 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1144 port>>3, 2) )
1145 {
1146 default: x.bytes[0] = ~0;
1147 case 1: x.bytes[1] = ~0;
1148 case 0: break;
1149 }
1150 TOGGLE_MODE();
1152 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1153 return 1;
1154 }
1156 return 0;
1157 }
1159 /* Has the administrator granted sufficient permission for this I/O access? */
1160 static inline int admin_io_okay(
1161 unsigned int port, unsigned int bytes,
1162 struct vcpu *v, struct cpu_user_regs *regs)
1163 {
1164 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1165 }
1167 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1168 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1169 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1170 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1171 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1172 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1174 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1175 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1176 __attribute__((__regparm__(1)));
1177 unsigned long guest_to_host_gpr_switch(unsigned long)
1178 __attribute__((__regparm__(1)));
1180 /* Instruction fetch with error handling. */
1181 #define insn_fetch(type, base, eip, limit) \
1182 ({ unsigned long _rc, _ptr = (base) + (eip); \
1183 type _x; \
1184 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1185 goto fail; \
1186 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1187 { \
1188 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1189 return EXCRET_fault_fixed; \
1190 } \
1191 (eip) += sizeof(_x); _x; })
1193 #if defined(CONFIG_X86_32)
1194 # define read_sreg(regs, sr) ((regs)->sr)
1195 #elif defined(CONFIG_X86_64)
1196 # define read_sreg(regs, sr) read_segment_register(sr)
1197 #endif
1199 static int emulate_privileged_op(struct cpu_user_regs *regs)
1201 struct vcpu *v = current;
1202 unsigned long *reg, eip = regs->eip, res;
1203 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1204 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1205 unsigned int port, i, data_sel, ar, data, rc;
1206 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1207 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1208 ? regs->reg \
1209 : ad_bytes == 4 \
1210 ? (u32)regs->reg \
1211 : (u16)regs->reg)
1212 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1213 ? regs->reg = (val) \
1214 : ad_bytes == 4 \
1215 ? (*(u32 *)&regs->reg = (val)) \
1216 : (*(u16 *)&regs->reg = (val)))
1217 unsigned long code_base, code_limit;
1218 char io_emul_stub[16];
1219 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1220 u32 l, h;
1222 if ( !read_descriptor(regs->cs, v, regs,
1223 &code_base, &code_limit, &ar,
1224 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1225 goto fail;
1226 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1227 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1228 if ( !(ar & _SEGMENT_S) ||
1229 !(ar & _SEGMENT_P) ||
1230 !(ar & _SEGMENT_CODE) )
1231 goto fail;
1233 /* emulating only opcodes not allowing SS to be default */
1234 data_sel = read_sreg(regs, ds);
1236 /* Legacy prefixes. */
1237 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1239 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1241 case 0x66: /* operand-size override */
1242 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1243 continue;
1244 case 0x67: /* address-size override */
1245 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1246 continue;
1247 case 0x2e: /* CS override */
1248 data_sel = regs->cs;
1249 continue;
1250 case 0x3e: /* DS override */
1251 data_sel = read_sreg(regs, ds);
1252 continue;
1253 case 0x26: /* ES override */
1254 data_sel = read_sreg(regs, es);
1255 continue;
1256 case 0x64: /* FS override */
1257 data_sel = read_sreg(regs, fs);
1258 lm_ovr = lm_seg_fs;
1259 continue;
1260 case 0x65: /* GS override */
1261 data_sel = read_sreg(regs, gs);
1262 lm_ovr = lm_seg_gs;
1263 continue;
1264 case 0x36: /* SS override */
1265 data_sel = regs->ss;
1266 continue;
1267 case 0xf0: /* LOCK */
1268 lock = 1;
1269 continue;
1270 case 0xf2: /* REPNE/REPNZ */
1271 case 0xf3: /* REP/REPE/REPZ */
1272 rep_prefix = 1;
1273 continue;
1274 default:
1275 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1277 rex = opcode;
1278 continue;
1280 break;
1282 break;
1285 /* REX prefix. */
1286 if ( rex & 8 ) /* REX.W */
1287 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1288 modrm_reg = (rex & 4) << 1; /* REX.R */
1289 /* REX.X does not need to be decoded. */
1290 modrm_rm = (rex & 1) << 3; /* REX.B */
1292 if ( opcode == 0x0f )
1293 goto twobyte_opcode;
1295 if ( lock )
1296 goto fail;
1298 /* Input/Output String instructions. */
1299 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1301 unsigned long data_base, data_limit;
1303 if ( rep_prefix && (rd_ad(ecx) == 0) )
1304 goto done;
1306 if ( !(opcode & 2) )
1308 data_sel = read_sreg(regs, es);
1309 lm_ovr = lm_seg_none;
1312 if ( !(ar & _SEGMENT_L) )
1314 if ( !read_descriptor(data_sel, v, regs,
1315 &data_base, &data_limit, &ar,
1316 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1317 goto fail;
1318 if ( !(ar & _SEGMENT_S) ||
1319 !(ar & _SEGMENT_P) ||
1320 (opcode & 2 ?
1321 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1322 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1323 goto fail;
1325 #ifdef CONFIG_X86_64
1326 else
1328 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1330 switch ( lm_ovr )
1332 case lm_seg_none:
1333 data_base = 0UL;
1334 break;
1335 case lm_seg_fs:
1336 data_base = v->arch.guest_context.fs_base;
1337 break;
1338 case lm_seg_gs:
1339 if ( guest_kernel_mode(v, regs) )
1340 data_base = v->arch.guest_context.gs_base_kernel;
1341 else
1342 data_base = v->arch.guest_context.gs_base_user;
1343 break;
1346 else
1347 read_descriptor(data_sel, v, regs,
1348 &data_base, &data_limit, &ar,
1349 0);
1350 data_limit = ~0UL;
1351 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1353 #endif
1355 continue_io_string:
1356 switch ( opcode )
1358 case 0x6c: /* INSB */
1359 op_bytes = 1;
1360 case 0x6d: /* INSW/INSL */
1361 if ( data_limit < op_bytes - 1 ||
1362 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1363 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1364 goto fail;
1365 port = (u16)regs->edx;
1366 switch ( op_bytes )
1368 case 1:
1369 /* emulate PIT counter 2 */
1370 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1371 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1372 pv_pit_handler(port, 0, 0) : ~0));
1373 break;
1374 case 2:
1375 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1376 break;
1377 case 4:
1378 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1379 break;
1381 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1383 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1384 PFEC_write_access);
1385 return EXCRET_fault_fixed;
1387 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1388 break;
1390 case 0x6e: /* OUTSB */
1391 op_bytes = 1;
1392 case 0x6f: /* OUTSW/OUTSL */
1393 if ( data_limit < op_bytes - 1 ||
1394 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1395 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1396 goto fail;
1397 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1398 if ( rc != 0 )
1400 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1401 return EXCRET_fault_fixed;
1403 port = (u16)regs->edx;
1404 switch ( op_bytes )
1406 case 1:
1407 if ( guest_outb_okay(port, v, regs) )
1408 outb((u8)data, port);
1409 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1410 pv_pit_handler(port, data, 1);
1411 break;
1412 case 2:
1413 if ( guest_outw_okay(port, v, regs) )
1414 outw((u16)data, port);
1415 break;
1416 case 4:
1417 if ( guest_outl_okay(port, v, regs) )
1418 outl((u32)data, port);
1419 break;
1421 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1422 break;
1425 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1427 if ( !hypercall_preempt_check() )
1428 goto continue_io_string;
1429 eip = regs->eip;
1432 goto done;
1435 /*
1436 * Very likely to be an I/O instruction (IN/OUT).
1437 * Build an on-stack stub to execute the instruction with full guest
1438 * GPR context. This is needed for some systems which (ab)use IN/OUT
1439 * to communicate with BIOS code in system-management mode.
1440 */
1441 #ifdef __x86_64__
1442 /* movq $host_to_guest_gpr_switch,%rcx */
1443 io_emul_stub[0] = 0x48;
1444 io_emul_stub[1] = 0xb9;
1445 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1446 /* callq *%rcx */
1447 io_emul_stub[10] = 0xff;
1448 io_emul_stub[11] = 0xd1;
1449 #else
1450 /* call host_to_guest_gpr_switch */
1451 io_emul_stub[0] = 0xe8;
1452 *(s32 *)&io_emul_stub[1] =
1453 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1454 /* 7 x nop */
1455 memset(&io_emul_stub[5], 0x90, 7);
1456 #endif
1457 /* data16 or nop */
1458 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1459 /* <io-access opcode> */
1460 io_emul_stub[13] = opcode;
1461 /* imm8 or nop */
1462 io_emul_stub[14] = 0x90;
1463 /* ret (jumps to guest_to_host_gpr_switch) */
1464 io_emul_stub[15] = 0xc3;
1466 /* Handy function-typed pointer to the stub. */
1467 io_emul = (void *)io_emul_stub;
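/* (Editorial note, not part of the original file:) schematically, the
 * finished 16-byte stub assembled above is, on x86-64:
 *   48 b9 <8-byte imm>   movq $host_to_guest_gpr_switch, %rcx
 *   ff d1                callq *%rcx
 *   66 or 90             data16 prefix for 16-bit operands, else nop
 *   <opcode>             the IN/OUT opcode being emulated
 *   <imm8> or 90         port immediate for the imm8 forms, else nop
 *   c3                   ret (returns via guest_to_host_gpr_switch)
 */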
1469 /* I/O Port and Interrupt Flag instructions. */
1470 switch ( opcode )
1472 case 0xe4: /* IN imm8,%al */
1473 op_bytes = 1;
1474 case 0xe5: /* IN imm8,%eax */
1475 port = insn_fetch(u8, code_base, eip, code_limit);
1476 io_emul_stub[14] = port; /* imm8 */
1477 exec_in:
1478 if ( !guest_io_okay(port, op_bytes, v, regs) )
1479 goto fail;
1480 switch ( op_bytes )
1482 case 1:
1483 if ( guest_inb_okay(port, v, regs) )
1484 io_emul(regs);
1485 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1487 regs->eax &= ~0xffUL;
1488 regs->eax |= pv_pit_handler(port, 0, 0);
1490 else
1491 regs->eax |= (u8)~0;
1492 break;
1493 case 2:
1494 if ( guest_inw_okay(port, v, regs) )
1495 io_emul(regs);
1496 else
1497 regs->eax |= (u16)~0;
1498 break;
1499 case 4:
1500 if ( guest_inl_okay(port, v, regs) )
1501 io_emul(regs);
1502 else
1503 regs->eax = (u32)~0;
1504 break;
1506 goto done;
1508 case 0xec: /* IN %dx,%al */
1509 op_bytes = 1;
1510 case 0xed: /* IN %dx,%eax */
1511 port = (u16)regs->edx;
1512 goto exec_in;
1514 case 0xe6: /* OUT %al,imm8 */
1515 op_bytes = 1;
1516 case 0xe7: /* OUT %eax,imm8 */
1517 port = insn_fetch(u8, code_base, eip, code_limit);
1518 io_emul_stub[14] = port; /* imm8 */
1519 exec_out:
1520 if ( !guest_io_okay(port, op_bytes, v, regs) )
1521 goto fail;
1522 switch ( op_bytes )
1524 case 1:
1525 if ( guest_outb_okay(port, v, regs) )
1526 io_emul(regs);
1527 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1528 pv_pit_handler(port, regs->eax, 1);
1529 break;
1530 case 2:
1531 if ( guest_outw_okay(port, v, regs) )
1532 io_emul(regs);
1533 break;
1534 case 4:
1535 if ( guest_outl_okay(port, v, regs) )
1536 io_emul(regs);
1537 break;
1539 goto done;
1541 case 0xee: /* OUT %al,%dx */
1542 op_bytes = 1;
1543 case 0xef: /* OUT %eax,%dx */
1544 port = (u16)regs->edx;
1545 goto exec_out;
1547 case 0xfa: /* CLI */
1548 case 0xfb: /* STI */
1549 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1550 goto fail;
1551 /*
1552 * This is just too dangerous to allow, in my opinion. Consider if the
1553 * caller then tries to reenable interrupts using POPF: we can't trap
1554 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1555 * do for us. :-)
1556 */
1557 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1558 goto done;
1561 /* No decode of this single-byte opcode. */
1562 goto fail;
1564 twobyte_opcode:
1565 /* Two-byte opcodes only emulated from guest kernel. */
1566 if ( !guest_kernel_mode(v, regs) )
1567 goto fail;
1569 /* Privileged (ring 0) instructions. */
1570 opcode = insn_fetch(u8, code_base, eip, code_limit);
1571 if ( lock && (opcode & ~3) != 0x20 )
1572 goto fail;
1573 switch ( opcode )
1575 case 0x06: /* CLTS */
1576 (void)do_fpu_taskswitch(0);
1577 break;
1579 case 0x09: /* WBINVD */
1580 /* Ignore the instruction if unprivileged. */
1581 if ( !cache_flush_permitted(v->domain) )
1582 /* Non-physdev domain attempted WBINVD; ignore for now since
1583 newer linux uses this in some start-of-day timing loops */
1585 else
1586 wbinvd();
1587 break;
1589 case 0x20: /* MOV CR?,<reg> */
1590 opcode = insn_fetch(u8, code_base, eip, code_limit);
1591 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1592 modrm_rm |= (opcode >> 0) & 7;
1593 reg = decode_register(modrm_rm, regs, 0);
1594 switch ( modrm_reg )
1596 case 0: /* Read CR0 */
1597 *reg = (read_cr0() & ~X86_CR0_TS) |
1598 v->arch.guest_context.ctrlreg[0];
1599 break;
1601 case 2: /* Read CR2 */
1602 *reg = v->arch.guest_context.ctrlreg[2];
1603 break;
1605 case 3: /* Read CR3 */
1606 if ( !is_pv_32on64_vcpu(v) )
1607 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1608 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1609 #ifdef CONFIG_COMPAT
1610 else
1611 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1612 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1613 #endif
1614 break;
1616 case 4: /* Read CR4 */
1617 /*
1618 * Guests can read CR4 to see what features Xen has enabled. We
1619 * therefore lie about PGE & PSE as they are unavailable to guests.
1620 */
1621 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1622 break;
1624 default:
1625 goto fail;
1627 break;
1629 case 0x21: /* MOV DR?,<reg> */
1630 opcode = insn_fetch(u8, code_base, eip, code_limit);
1631 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1632 modrm_rm |= (opcode >> 0) & 7;
1633 reg = decode_register(modrm_rm, regs, 0);
1634 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1635 goto fail;
1636 *reg = res;
1637 break;
1639 case 0x22: /* MOV <reg>,CR? */
1640 opcode = insn_fetch(u8, code_base, eip, code_limit);
1641 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1642 modrm_rm |= (opcode >> 0) & 7;
1643 reg = decode_register(modrm_rm, regs, 0);
1644 switch ( modrm_reg )
1646 case 0: /* Write CR0 */
1647 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1649 gdprintk(XENLOG_WARNING,
1650 "Attempt to change unmodifiable CR0 flags.\n");
1651 goto fail;
1653 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1654 break;
1656 case 2: /* Write CR2 */
1657 v->arch.guest_context.ctrlreg[2] = *reg;
1658 arch_set_cr2(v, *reg);
1659 break;
1661 case 3: /* Write CR3 */
1662 LOCK_BIGLOCK(v->domain);
1663 if ( !is_pv_32on64_vcpu(v) )
1664 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1665 #ifdef CONFIG_COMPAT
1666 else
1667 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1668 #endif
1669 UNLOCK_BIGLOCK(v->domain);
1670 if ( rc == 0 ) /* not okay */
1671 goto fail;
1672 break;
1674 case 4:
1675 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1677 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1678 goto fail;
1680 break;
1682 default:
1683 goto fail;
1685 break;
1687 case 0x23: /* MOV <reg>,DR? */
1688 opcode = insn_fetch(u8, code_base, eip, code_limit);
1689 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1690 modrm_rm |= (opcode >> 0) & 7;
1691 reg = decode_register(modrm_rm, regs, 0);
1692 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1693 goto fail;
1694 break;
1696 case 0x30: /* WRMSR */
1697 switch ( regs->ecx )
1699 #ifdef CONFIG_X86_64
1700 case MSR_FS_BASE:
1701 if ( is_pv_32on64_vcpu(v) )
1702 goto fail;
1703 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1704 goto fail;
1705 v->arch.guest_context.fs_base =
1706 ((u64)regs->edx << 32) | regs->eax;
1707 break;
1708 case MSR_GS_BASE:
1709 if ( is_pv_32on64_vcpu(v) )
1710 goto fail;
1711 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1712 goto fail;
1713 v->arch.guest_context.gs_base_kernel =
1714 ((u64)regs->edx << 32) | regs->eax;
1715 break;
1716 case MSR_SHADOW_GS_BASE:
1717 if ( is_pv_32on64_vcpu(v) )
1718 goto fail;
1719 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1720 goto fail;
1721 v->arch.guest_context.gs_base_user =
1722 ((u64)regs->edx << 32) | regs->eax;
1723 break;
1724 #endif
1725 default:
1726 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1727 break;
1729 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1730 (regs->eax != l) || (regs->edx != h) )
1731 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1732 "%08x:%08x to %08lx:%08lx.\n",
1733 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1734 break;
1736 break;
1738 case 0x32: /* RDMSR */
1739 switch ( regs->ecx )
1741 #ifdef CONFIG_X86_64
1742 case MSR_FS_BASE:
1743 if ( is_pv_32on64_vcpu(v) )
1744 goto fail;
1745 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1746 regs->edx = v->arch.guest_context.fs_base >> 32;
1747 break;
1748 case MSR_GS_BASE:
1749 if ( is_pv_32on64_vcpu(v) )
1750 goto fail;
1751 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1752 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1753 break;
1754 case MSR_SHADOW_GS_BASE:
1755 if ( is_pv_32on64_vcpu(v) )
1756 goto fail;
1757 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1758 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1759 break;
1760 #endif
1761 case MSR_EFER:
1762 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1763 goto fail;
1764 break;
1765 default:
1766 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1768 regs->eax = l;
1769 regs->edx = h;
1770 break;
1772 /* Everyone can read the MSR space. */
1773 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1774 _p(regs->ecx));*/
1775 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1776 goto fail;
1777 break;
1779 break;
1781 default:
1782 goto fail;
1785 #undef wr_ad
1786 #undef rd_ad
1788 done:
1789 regs->eip = eip;
1790 return EXCRET_fault_fixed;
1792 fail:
1793 return 0;
1796 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1797 {
1798 struct vcpu *v = current;
1799 unsigned long fixup;
1801 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1803 if ( regs->error_code & 1 )
1804 goto hardware_gp;
1806 if ( !guest_mode(regs) )
1807 goto gp_in_kernel;
1809 /*
1810 * Cunning trick to allow arbitrary "INT n" handling.
1812 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1813 * instruction from trapping to the appropriate vector, when that might not
1814 * be expected by Xen or the guest OS. For example, that entry might be for
1815 * a fault handler (unlike traps, faults don't increment EIP), or might
1816 * expect an error code on the stack (which a software trap never
1817 * provides), or might be a hardware interrupt handler that doesn't like
1818 * being called spuriously.
1820 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1821 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1822 * clear to indicate that it's a software fault, not hardware.
1824 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1825 * okay because they can only be triggered by an explicit DPL-checked
1826 * instruction. The DPL specified by the guest OS for these vectors is NOT
1827 * CHECKED!!
1828 */
1829 if ( (regs->error_code & 3) == 2 )
1830 {
1831 /* This fault must be due to <INT n> instruction. */
1832 const struct trap_info *ti;
1833 unsigned char vector = regs->error_code >> 3;
1834 ti = &v->arch.guest_context.trap_ctxt[vector];
1835 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1836 {
1837 regs->eip += 2;
1838 return do_guest_trap(vector, regs, 0);
1839 }
1840 }
1842 /* Emulate some simple privileged and I/O instructions. */
1843 if ( (regs->error_code == 0) &&
1844 emulate_privileged_op(regs) )
1845 return 0;
1847 #if defined(__i386__)
1848 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1849 (regs->error_code == 0) &&
1850 gpf_emulate_4gb(regs) )
1851 return 0;
1852 #endif
1854 /* Pass on GPF as is. */
1855 return do_guest_trap(TRAP_gp_fault, regs, 1);
1857 gp_in_kernel:
1859 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1860 {
1861 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1862 regs->error_code, _p(regs->eip), _p(fixup));
1863 regs->eip = fixup;
1864 return 0;
1865 }
1867 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1869 hardware_gp:
1870 show_execution_state(regs);
1871 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1872 return 0;
1873 }
1875 static void nmi_softirq(void)
1876 {
1877 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1878 vcpu_kick(dom0->vcpu[0]);
1879 }
1881 static void nmi_dom0_report(unsigned int reason_idx)
1882 {
1883 struct domain *d;
1884 struct vcpu *v;
1886 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1887 return;
1889 set_bit(reason_idx, nmi_reason(d));
1891 if ( !xchg(&v->nmi_pending, 1) )
1892 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1893 }
1895 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1896 {
1897 switch ( opt_nmi[0] )
1898 {
1899 case 'd': /* 'dom0' */
1900 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1901 case 'i': /* 'ignore' */
1902 break;
1903 default: /* 'fatal' */
1904 console_force_unlock();
1905 printk("\n\nNMI - MEMORY ERROR\n");
1906 fatal_trap(TRAP_nmi, regs);
1907 }
1909 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1910 mdelay(1);
1911 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1912 }
1914 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1915 {
1916 switch ( opt_nmi[0] )
1917 {
1918 case 'd': /* 'dom0' */
1919 nmi_dom0_report(_XEN_NMIREASON_io_error);
1920 case 'i': /* 'ignore' */
1921 break;
1922 default: /* 'fatal' */
1923 console_force_unlock();
1924 printk("\n\nNMI - I/O ERROR\n");
1925 fatal_trap(TRAP_nmi, regs);
1926 }
1928 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1929 mdelay(1);
1930 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1931 }
1933 static void unknown_nmi_error(unsigned char reason)
1934 {
1935 switch ( opt_nmi[0] )
1936 {
1937 case 'd': /* 'dom0' */
1938 nmi_dom0_report(_XEN_NMIREASON_unknown);
1939 case 'i': /* 'ignore' */
1940 break;
1941 default: /* 'fatal' */
1942 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1943 printk("Dazed and confused, but trying to continue\n");
1944 printk("Do you have a strange power saving mode enabled?\n");
1945 kexec_crash();
1946 }
1947 }
1949 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1950 {
1951 return 0;
1952 }
1954 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1956 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1957 {
1958 unsigned int cpu = smp_processor_id();
1959 unsigned char reason;
1961 ++nmi_count(cpu);
1963 if ( nmi_callback(regs, cpu) )
1964 return;
1966 if ( nmi_watchdog )
1967 nmi_watchdog_tick(regs);
1969 /* Only the BSP gets external NMIs from the system. */
1970 if ( cpu == 0 )
1971 {
1972 reason = inb(0x61);
1973 if ( reason & 0x80 )
1974 mem_parity_error(regs);
1975 else if ( reason & 0x40 )
1976 io_check_error(regs);
1977 else if ( !nmi_watchdog )
1978 unknown_nmi_error((unsigned char)(reason&0xff));
1979 }
1980 }
1982 void set_nmi_callback(nmi_callback_t callback)
1983 {
1984 nmi_callback = callback;
1985 }
1987 void unset_nmi_callback(void)
1988 {
1989 nmi_callback = dummy_nmi_callback;
1990 }
1992 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1993 {
1994 BUG_ON(!guest_mode(regs));
1996 setup_fpu(current);
1998 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1999 {
2000 do_guest_trap(TRAP_no_device, regs, 0);
2001 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2002 }
2004 return EXCRET_fault_fixed;
2005 }
2007 asmlinkage int do_debug(struct cpu_user_regs *regs)
2008 {
2009 unsigned long condition;
2010 struct vcpu *v = current;
2012 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
2014 /* Mask out spurious debug traps due to lazy DR7 setting */
2015 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
2016 (v->arch.guest_context.debugreg[7] == 0) )
2017 {
2018 __asm__("mov %0,%%db7" : : "r" (0UL));
2019 goto out;
2020 }
2022 DEBUGGER_trap_entry(TRAP_debug, regs);
2024 if ( !guest_mode(regs) )
2025 {
2026 /* Clear TF just for absolute sanity. */
2027 regs->eflags &= ~EF_TF;
2028 /*
2029 * We ignore watchpoints when they trigger within Xen. This may happen
2030 * when a buffer is passed to us which previously had a watchpoint set
2031 * on it. No need to bump EIP; the only faulting trap is an instruction
2032 * breakpoint, which can't happen to us.
2033 */
2034 goto out;
2035 }
2037 /* Save debug status register where guest OS can peek at it */
2038 v->arch.guest_context.debugreg[6] = condition;
2040 return do_guest_trap(TRAP_debug, regs, 0);
2042 out:
2043 return EXCRET_not_a_fault;
2044 }
2046 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2047 {
2048 return EXCRET_not_a_fault;
2049 }
2051 void set_intr_gate(unsigned int n, void *addr)
2052 {
2053 int i;
2054 /* Keep secondary tables in sync with IRQ updates. */
2055 for ( i = 1; i < NR_CPUS; i++ )
2056 if ( idt_tables[i] != NULL )
2057 _set_gate(&idt_tables[i][n], 14, 0, addr);
2058 _set_gate(&idt_table[n], 14, 0, addr);
2059 }
2061 void set_system_gate(unsigned int n, void *addr)
2062 {
2063 _set_gate(idt_table+n,14,3,addr);
2064 }
2066 void set_task_gate(unsigned int n, unsigned int sel)
2067 {
2068 idt_table[n].a = sel << 16;
2069 idt_table[n].b = 0x8500;
2070 }
2072 void set_tss_desc(unsigned int n, void *addr)
2073 {
2074 _set_tssldt_desc(
2075 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2076 (unsigned long)addr,
2077 offsetof(struct tss_struct, __cacheline_filler) - 1,
2078 9);
2079 #ifdef CONFIG_COMPAT
2080 _set_tssldt_desc(
2081 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2082 (unsigned long)addr,
2083 offsetof(struct tss_struct, __cacheline_filler) - 1,
2084 11);
2085 #endif
2086 }
2088 void __init trap_init(void)
2089 {
2090 extern void percpu_traps_init(void);
2092 /*
2093 * Note that interrupt gates are always used, rather than trap gates. We
2094 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2095 * first activation must have the "bad" value(s) for these registers and
2096 * we may lose them if another activation is installed before they are
2097 * saved. The page-fault handler also needs interrupts disabled until %cr2
2098 * has been read and saved on the stack.
2099 */
2100 set_intr_gate(TRAP_divide_error,&divide_error);
2101 set_intr_gate(TRAP_debug,&debug);
2102 set_intr_gate(TRAP_nmi,&nmi);
2103 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2104 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2105 set_intr_gate(TRAP_bounds,&bounds);
2106 set_intr_gate(TRAP_invalid_op,&invalid_op);
2107 set_intr_gate(TRAP_no_device,&device_not_available);
2108 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2109 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2110 set_intr_gate(TRAP_no_segment,&segment_not_present);
2111 set_intr_gate(TRAP_stack_error,&stack_segment);
2112 set_intr_gate(TRAP_gp_fault,&general_protection);
2113 set_intr_gate(TRAP_page_fault,&page_fault);
2114 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2115 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2116 set_intr_gate(TRAP_alignment_check,&alignment_check);
2117 set_intr_gate(TRAP_machine_check,&machine_check);
2118 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2120 /* CPU0 uses the master IDT. */
2121 idt_tables[0] = idt_table;
2123 percpu_traps_init();
2125 cpu_init();
2127 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2128 }
2131 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2132 {
2133 struct trap_info cur;
2134 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2135 long rc = 0;
2137 /* If no table is presented then clear the entire virtual IDT. */
2138 if ( guest_handle_is_null(traps) )
2139 {
2140 memset(dst, 0, 256 * sizeof(*dst));
2141 init_int80_direct_trap(current);
2142 return 0;
2143 }
2145 for ( ; ; )
2146 {
2147 if ( hypercall_preempt_check() )
2148 {
2149 rc = hypercall_create_continuation(
2150 __HYPERVISOR_set_trap_table, "h", traps);
2151 break;
2152 }
2154 if ( copy_from_guest(&cur, traps, 1) )
2155 {
2156 rc = -EFAULT;
2157 break;
2158 }
2160 if ( cur.address == 0 )
2161 break;
2163 fixup_guest_code_selector(current->domain, cur.cs);
2165 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2167 if ( cur.vector == 0x80 )
2168 init_int80_direct_trap(current);
2170 guest_handle_add_offset(traps, 1);
2171 }
2173 return rc;
2174 }
2177 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2178 {
2179 int i;
2181 switch ( reg )
2182 {
2183 case 0:
2184 if ( !access_ok(value, sizeof(long)) )
2185 return -EPERM;
2186 if ( p == current )
2187 __asm__ ( "mov %0, %%db0" : : "r" (value) );
2188 break;
2189 case 1:
2190 if ( !access_ok(value, sizeof(long)) )
2191 return -EPERM;
2192 if ( p == current )
2193 __asm__ ( "mov %0, %%db1" : : "r" (value) );
2194 break;
2195 case 2:
2196 if ( !access_ok(value, sizeof(long)) )
2197 return -EPERM;
2198 if ( p == current )
2199 __asm__ ( "mov %0, %%db2" : : "r" (value) );
2200 break;
2201 case 3:
2202 if ( !access_ok(value, sizeof(long)) )
2203 return -EPERM;
2204 if ( p == current )
2205 __asm__ ( "mov %0, %%db3" : : "r" (value) );
2206 break;
2207 case 6:
2208 /*
2209 * DR6: Bits 4-11,16-31 reserved (set to 1).
2210 * Bit 12 reserved (set to 0).
2211 */
2212 value &= 0xffffefff; /* reserved bits => 0 */
2213 value |= 0xffff0ff0; /* reserved bits => 1 */
2214 if ( p == current )
2215 __asm__ ( "mov %0, %%db6" : : "r" (value) );
2216 break;
2217 case 7:
2218 /*
2219 * DR7: Bit 10 reserved (set to 1).
2220 * Bits 11-12,14-15 reserved (set to 0).
2221 * Privileged bits:
2222 * GD (bit 13): must be 0.
2223 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2224 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2225 */
2226 /* DR7 == 0 => debugging disabled for this domain. */
2227 if ( value != 0 )
2228 {
2229 value &= 0xffff27ff; /* reserved bits => 0 */
2230 value |= 0x00000400; /* reserved bits => 1 */
2231 if ( (value & (1<<13)) != 0 ) return -EPERM;
2232 for ( i = 0; i < 16; i += 2 )
2233 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2234 }
2235 if ( p == current )
2236 __asm__ ( "mov %0, %%db7" : : "r" (value) );
2237 break;
2238 default:
2239 return -EINVAL;
2240 }
2242 p->arch.guest_context.debugreg[reg] = value;
2243 return 0;
2244 }
2246 long do_set_debugreg(int reg, unsigned long value)
2247 {
2248 return set_debugreg(current, reg, value);
2249 }
2251 unsigned long do_get_debugreg(int reg)
2252 {
2253 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2254 return current->arch.guest_context.debugreg[reg];
2255 }
2257 /*
2258 * Local variables:
2259 * mode: C
2260 * c-set-style: "BSD"
2261 * c-basic-offset: 4
2262 * tab-width: 4
2263 * indent-tabs-mode: nil
2264 * End:
2265 */