direct-io.hg

view xen/arch/x86/traps.c @ 11135:88e6bd5e2b54

Whitespace clean-ups.

Signed-off-by: Steven Hand <steven@xensource.com>
author shand@kneesaa.uk.xensource.com
date Wed Aug 16 11:36:13 2006 +0100 (2006-08-16)
parents f328519053f5
children 0f917d63e960
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <asm/shadow.h>
49 #include <asm/system.h>
50 #include <asm/io.h>
51 #include <asm/atomic.h>
52 #include <asm/desc.h>
53 #include <asm/debugreg.h>
54 #include <asm/smp.h>
55 #include <asm/flushtlb.h>
56 #include <asm/uaccess.h>
57 #include <asm/i387.h>
58 #include <asm/debugger.h>
59 #include <asm/msr.h>
60 #include <asm/x86_emulate.h>
62 /*
63 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
64 * fatal: Xen prints diagnostic message and then hangs.
65 * dom0: The NMI is virtualised to DOM0.
66 * ignore: The NMI error is cleared and ignored.
67 */
68 #ifdef NDEBUG
69 char opt_nmi[10] = "dom0";
70 #else
71 char opt_nmi[10] = "fatal";
72 #endif
73 string_param("nmi", opt_nmi);
75 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
76 idt_entry_t idt_table[IDT_ENTRIES];
78 #define DECLARE_TRAP_HANDLER(_name) \
79 asmlinkage void _name(void); \
80 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
82 asmlinkage void nmi(void);
83 DECLARE_TRAP_HANDLER(divide_error);
84 DECLARE_TRAP_HANDLER(debug);
85 DECLARE_TRAP_HANDLER(int3);
86 DECLARE_TRAP_HANDLER(overflow);
87 DECLARE_TRAP_HANDLER(bounds);
88 DECLARE_TRAP_HANDLER(invalid_op);
89 DECLARE_TRAP_HANDLER(device_not_available);
90 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
91 DECLARE_TRAP_HANDLER(invalid_TSS);
92 DECLARE_TRAP_HANDLER(segment_not_present);
93 DECLARE_TRAP_HANDLER(stack_segment);
94 DECLARE_TRAP_HANDLER(general_protection);
95 DECLARE_TRAP_HANDLER(page_fault);
96 DECLARE_TRAP_HANDLER(coprocessor_error);
97 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
98 DECLARE_TRAP_HANDLER(alignment_check);
99 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
100 DECLARE_TRAP_HANDLER(machine_check);
102 long do_set_debugreg(int reg, unsigned long value);
103 unsigned long do_get_debugreg(int reg);
105 static int debug_stack_lines = 20;
106 integer_param("debug_stack_lines", debug_stack_lines);
108 #ifdef CONFIG_X86_32
109 #define stack_words_per_line 8
110 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
111 #else
112 #define stack_words_per_line 4
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
114 #endif
116 int is_kernel_text(unsigned long addr)
117 {
118 extern char _stext, _etext;
119 if (addr >= (unsigned long) &_stext &&
120 addr <= (unsigned long) &_etext)
121 return 1;
122 return 0;
124 }
126 unsigned long kernel_text_end(void)
127 {
128 extern char _etext;
129 return (unsigned long) &_etext;
130 }
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
137 if ( hvm_guest(current) )
138 return;
140 if ( vm86_mode(regs) )
141 {
142 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
143 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
144 regs->ss, (uint16_t)(regs->esp & 0xffff));
145 }
146 else
147 {
148 stack = (unsigned long *)regs->esp;
149 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
150 }
152 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
153 {
154 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
155 break;
156 if ( get_user(addr, stack) )
157 {
158 if ( i != 0 )
159 printk("\n ");
160 printk("Fault while accessing guest memory.");
161 i = 1;
162 break;
163 }
164 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
165 printk("\n ");
166 printk(" %p", _p(addr));
167 stack++;
168 }
169 if ( i == 0 )
170 printk("Stack empty.");
171 printk("\n");
172 }
174 #ifdef NDEBUG
176 static void show_trace(struct cpu_user_regs *regs)
177 {
178 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
180 printk("Xen call trace:\n ");
182 printk("[<%p>]", _p(regs->eip));
183 print_symbol(" %s\n ", regs->eip);
185 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
186 {
187 addr = *stack++;
188 if ( is_kernel_text(addr) )
189 {
190 printk("[<%p>]", _p(addr));
191 print_symbol(" %s\n ", addr);
192 }
193 }
195 printk("\n");
196 }
198 #else
200 static void show_trace(struct cpu_user_regs *regs)
201 {
202 unsigned long *frame, next, addr, low, high;
204 printk("Xen call trace:\n ");
206 printk("[<%p>]", _p(regs->eip));
207 print_symbol(" %s\n ", regs->eip);
209 /* Bounds for range of valid frame pointer. */
210 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
211 high = (low & ~(STACK_SIZE - 1)) +
212 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
214 /* The initial frame pointer. */
215 next = regs->ebp;
217 for ( ; ; )
218 {
219 /* Valid frame pointer? */
220 if ( (next < low) || (next >= high) )
221 {
222 /*
223 * Exception stack frames have a different layout, denoted by an
224 * inverted frame pointer.
225 */
226 next = ~next;
227 if ( (next < low) || (next >= high) )
228 break;
229 frame = (unsigned long *)next;
230 next = frame[0];
231 addr = frame[(offsetof(struct cpu_user_regs, eip) -
232 offsetof(struct cpu_user_regs, ebp))
233 / BYTES_PER_LONG];
234 }
235 else
236 {
237 /* Ordinary stack frame. */
238 frame = (unsigned long *)next;
239 next = frame[0];
240 addr = frame[1];
241 }
243 printk("[<%p>]", _p(addr));
244 print_symbol(" %s\n ", addr);
246 low = (unsigned long)&frame[2];
247 }
249 printk("\n");
250 }
252 #endif
254 void show_stack(struct cpu_user_regs *regs)
255 {
256 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
257 int i;
259 if ( guest_mode(regs) )
260 return show_guest_stack(regs);
262 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
264 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
265 {
266 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
267 break;
268 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
269 printk("\n ");
270 addr = *stack++;
271 printk(" %p", _p(addr));
272 }
273 if ( i == 0 )
274 printk("Stack empty.");
275 printk("\n");
277 show_trace(regs);
278 }
280 void show_stack_overflow(unsigned long esp)
281 {
282 #ifdef MEMORY_GUARD
283 unsigned long esp_top;
284 unsigned long *stack, addr;
286 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
288 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
289 if ( ((unsigned long)(esp - esp_top) > 512) &&
290 ((unsigned long)(esp_top - esp) > 512) )
291 return;
293 if ( esp < esp_top )
294 esp = esp_top;
296 printk("Xen stack overflow:\n ");
298 stack = (unsigned long *)esp;
299 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
300 {
301 addr = *stack++;
302 if ( is_kernel_text(addr) )
303 {
304 printk("%p: [<%p>]", stack, _p(addr));
305 print_symbol(" %s\n ", addr);
306 }
307 }
309 printk("\n");
310 #endif
311 }
313 void show_execution_state(struct cpu_user_regs *regs)
314 {
315 show_registers(regs);
316 show_stack(regs);
317 }
319 /*
320 * This is called for faults at very unexpected times (e.g., when interrupts
321 * are disabled). In such situations we can't do much that is safe. We try to
322 * print out some tracing and then we just spin.
323 */
324 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
325 {
326 int cpu = smp_processor_id();
327 unsigned long cr2;
328 static char *trapstr[] = {
329 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
330 "invalid opcode", "device not available", "double fault",
331 "coprocessor segment", "invalid tss", "segment not found",
332 "stack error", "general protection fault", "page fault",
333 "spurious interrupt", "coprocessor error", "alignment check",
334 "machine check", "simd error"
335 };
337 watchdog_disable();
338 console_start_sync();
340 show_execution_state(regs);
342 if ( trapnr == TRAP_page_fault )
343 {
344 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
345 printk("Faulting linear address: %p\n", _p(cr2));
346 show_page_walk(cr2);
347 }
349 printk("************************************\n");
350 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
351 cpu, trapnr, trapstr[trapnr], regs->error_code,
352 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
353 printk("System shutting down -- need manual reset.\n");
354 printk("************************************\n");
356 (void)debugger_trap_fatal(trapnr, regs);
358 /* Lock up the console to prevent spurious output from other CPUs. */
359 console_force_lock();
361 /* Wait for manual reset. */
362 machine_halt();
363 }
365 static inline int do_trap(int trapnr, char *str,
366 struct cpu_user_regs *regs,
367 int use_error_code)
368 {
369 struct vcpu *v = current;
370 struct trap_bounce *tb = &v->arch.trap_bounce;
371 struct trap_info *ti;
372 unsigned long fixup;
374 DEBUGGER_trap_entry(trapnr, regs);
376 if ( !guest_mode(regs) )
377 goto xen_fault;
379 ti = &current->arch.guest_context.trap_ctxt[trapnr];
380 tb->flags = TBF_EXCEPTION;
381 tb->cs = ti->cs;
382 tb->eip = ti->address;
383 if ( use_error_code )
384 {
385 tb->flags |= TBF_EXCEPTION_ERRCODE;
386 tb->error_code = regs->error_code;
387 }
388 if ( TI_GET_IF(ti) )
389 tb->flags |= TBF_INTERRUPT;
390 return 0;
392 xen_fault:
394 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
395 {
396 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
397 regs->eip = fixup;
398 return 0;
399 }
401 DEBUGGER_trap_fatal(trapnr, regs);
403 show_execution_state(regs);
404 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
405 "[error_code=%04x]\n",
406 smp_processor_id(), trapnr, str, regs->error_code);
407 return 0;
408 }
410 #define DO_ERROR_NOCODE(trapnr, str, name) \
411 asmlinkage int do_##name(struct cpu_user_regs *regs) \
412 { \
413 return do_trap(trapnr, str, regs, 0); \
414 }
416 #define DO_ERROR(trapnr, str, name) \
417 asmlinkage int do_##name(struct cpu_user_regs *regs) \
418 { \
419 return do_trap(trapnr, str, regs, 1); \
420 }
422 DO_ERROR_NOCODE( 0, "divide error", divide_error)
423 DO_ERROR_NOCODE( 4, "overflow", overflow)
424 DO_ERROR_NOCODE( 5, "bounds", bounds)
425 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
426 DO_ERROR(10, "invalid TSS", invalid_TSS)
427 DO_ERROR(11, "segment not present", segment_not_present)
428 DO_ERROR(12, "stack segment", stack_segment)
429 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
430 DO_ERROR(17, "alignment check", alignment_check)
431 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
433 int rdmsr_hypervisor_regs(
434 uint32_t idx, uint32_t *eax, uint32_t *edx)
435 {
436 idx -= 0x40000000;
437 if ( idx > 0 )
438 return 0;
440 *eax = *edx = 0;
441 return 1;
442 }
444 int wrmsr_hypervisor_regs(
445 uint32_t idx, uint32_t eax, uint32_t edx)
446 {
447 struct domain *d = current->domain;
449 idx -= 0x40000000;
450 if ( idx > 0 )
451 return 0;
453 switch ( idx )
454 {
455 case 0:
456 {
457 void *hypercall_page;
458 unsigned long mfn;
459 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
460 unsigned int idx = eax & 0xfff;
462 if ( idx > 0 )
463 {
464 DPRINTK("Dom%d: Out of range index %u to MSR %08x\n",
465 d->domain_id, idx, 0x40000000);
466 return 0;
467 }
469 mfn = gmfn_to_mfn(d, gmfn);
471 if ( !mfn_valid(mfn) ||
472 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
473 {
474 DPRINTK("Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
475 d->domain_id, gmfn, mfn, 0x40000000);
476 return 0;
477 }
479 hypercall_page = map_domain_page(mfn);
480 hypercall_page_initialise(d, hypercall_page);
481 unmap_domain_page(hypercall_page);
483 put_page_and_type(mfn_to_page(mfn));
484 break;
485 }
487 default:
488 BUG();
489 }
491 return 1;
492 }
494 int cpuid_hypervisor_leaves(
495 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
496 {
497 idx -= 0x40000000;
498 if ( idx > 2 )
499 return 0;
501 switch ( idx )
502 {
503 case 0:
504 *eax = 0x40000002; /* Largest leaf */
505 *ebx = 0x566e6558; /* Signature 1: "XenV" */
506 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
507 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
508 break;
510 case 1:
511 *eax = (xen_major_version() << 16) | xen_minor_version();
512 *ebx = 0; /* Reserved */
513 *ecx = 0; /* Reserved */
514 *edx = 0; /* Reserved */
515 break;
517 case 2:
518 *eax = 1; /* Number of hypercall-transfer pages */
519 *ebx = 0x40000000; /* MSR base address */
520 *ecx = 0; /* Features 1 */
521 *edx = 0; /* Features 2 */
522 break;
524 default:
525 BUG();
526 }
528 return 1;
529 }
531 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
532 {
533 char sig[5], instr[2];
534 uint32_t a, b, c, d;
535 unsigned long eip, rc;
537 a = regs->eax;
538 b = regs->ebx;
539 c = regs->ecx;
540 d = regs->edx;
541 eip = regs->eip;
543 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
544 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
545 {
546 propagate_page_fault(eip + sizeof(sig) - rc, 0);
547 return EXCRET_fault_fixed;
548 }
549 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
550 return 0;
551 eip += sizeof(sig);
553 /* We only emulate CPUID. */
554 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
555 {
556 propagate_page_fault(eip + sizeof(instr) - rc, 0);
557 return EXCRET_fault_fixed;
558 }
559 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
560 return 0;
561 eip += sizeof(instr);
563 __asm__ (
564 "cpuid"
565 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
566 : "0" (a), "1" (b), "2" (c), "3" (d) );
568 if ( regs->eax == 1 )
569 {
570 /* Modify Feature Information. */
571 clear_bit(X86_FEATURE_VME, &d);
572 clear_bit(X86_FEATURE_DE, &d);
573 clear_bit(X86_FEATURE_PSE, &d);
574 clear_bit(X86_FEATURE_PGE, &d);
575 if ( !supervisor_mode_kernel )
576 clear_bit(X86_FEATURE_SEP, &d);
577 if ( !IS_PRIV(current->domain) )
578 clear_bit(X86_FEATURE_MTRR, &d);
579 }
580 else
581 {
582 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
583 }
585 regs->eax = a;
586 regs->ebx = b;
587 regs->ecx = c;
588 regs->edx = d;
589 regs->eip = eip;
591 return EXCRET_fault_fixed;
592 }
594 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
595 {
596 struct vcpu *v = current;
597 struct trap_bounce *tb = &v->arch.trap_bounce;
598 struct trap_info *ti;
599 int rc;
601 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
603 if ( unlikely(!guest_mode(regs)) )
604 {
605 char sig[5];
606 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
607 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
608 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
609 {
610 show_execution_state(regs);
611 regs->eip += sizeof(sig);
612 return EXCRET_fault_fixed;
613 }
614 printk("%02x %02x %02x %02x %02x\n",
615 (unsigned char)sig[0],
616 (unsigned char)sig[1],
617 (unsigned char)sig[2],
618 (unsigned char)sig[3],
619 (unsigned char)sig[4]);
620 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
621 show_execution_state(regs);
622 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
623 smp_processor_id(), TRAP_invalid_op);
624 }
626 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
627 return rc;
629 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
630 tb->flags = TBF_EXCEPTION;
631 tb->cs = ti->cs;
632 tb->eip = ti->address;
633 if ( TI_GET_IF(ti) )
634 tb->flags |= TBF_INTERRUPT;
636 return 0;
637 }
639 asmlinkage int do_int3(struct cpu_user_regs *regs)
640 {
641 struct vcpu *v = current;
642 struct trap_bounce *tb = &v->arch.trap_bounce;
643 struct trap_info *ti;
645 DEBUGGER_trap_entry(TRAP_int3, regs);
647 if ( !guest_mode(regs) )
648 {
649 DEBUGGER_trap_fatal(TRAP_int3, regs);
650 show_execution_state(regs);
651 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
652 }
654 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
655 tb->flags = TBF_EXCEPTION;
656 tb->cs = ti->cs;
657 tb->eip = ti->address;
658 if ( TI_GET_IF(ti) )
659 tb->flags |= TBF_INTERRUPT;
661 return 0;
662 }
664 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
665 {
666 fatal_trap(TRAP_machine_check, regs);
667 return 0;
668 }
670 void propagate_page_fault(unsigned long addr, u16 error_code)
671 {
672 struct trap_info *ti;
673 struct vcpu *v = current;
674 struct trap_bounce *tb = &v->arch.trap_bounce;
676 v->arch.guest_context.ctrlreg[2] = addr;
677 v->vcpu_info->arch.cr2 = addr;
679 /* Re-set error_code.user flag appropriately for the guest. */
680 error_code &= ~PGERR_user_mode;
681 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
682 error_code |= PGERR_user_mode;
684 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
685 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
686 tb->error_code = error_code;
687 tb->cs = ti->cs;
688 tb->eip = ti->address;
689 if ( TI_GET_IF(ti) )
690 tb->flags |= TBF_INTERRUPT;
691 }
693 static int handle_gdt_ldt_mapping_fault(
694 unsigned long offset, struct cpu_user_regs *regs)
695 {
696 extern int map_ldt_shadow_page(unsigned int);
698 struct vcpu *v = current;
699 struct domain *d = v->domain;
700 int ret;
702 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
703 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
704 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
706 /* Should never fault in another vcpu's area. */
707 BUG_ON(vcpu_area != current->vcpu_id);
709 /* Byte offset within the gdt/ldt sub-area. */
710 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
712 if ( likely(is_ldt_area) )
713 {
714 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
715 LOCK_BIGLOCK(d);
716 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
717 UNLOCK_BIGLOCK(d);
719 if ( unlikely(ret == 0) )
720 {
721 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
722 if ( !guest_mode(regs) )
723 return 0;
724 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
725 propagate_page_fault(
726 v->arch.guest_context.ldt_base + offset, regs->error_code);
727 }
728 }
729 else
730 {
731 /* GDT fault: handle the fault as #GP(selector). */
732 regs->error_code = (u16)offset & ~7;
733 (void)do_general_protection(regs);
734 }
736 return EXCRET_fault_fixed;
737 }
739 #ifdef HYPERVISOR_VIRT_END
740 #define IN_HYPERVISOR_RANGE(va) \
741 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
742 #else
743 #define IN_HYPERVISOR_RANGE(va) \
744 (((va) >= HYPERVISOR_VIRT_START))
745 #endif
747 static int __spurious_page_fault(
748 unsigned long addr, struct cpu_user_regs *regs)
749 {
750 unsigned long mfn, cr3 = read_cr3();
751 #if CONFIG_PAGING_LEVELS >= 4
752 l4_pgentry_t l4e, *l4t;
753 #endif
754 #if CONFIG_PAGING_LEVELS >= 3
755 l3_pgentry_t l3e, *l3t;
756 #endif
757 l2_pgentry_t l2e, *l2t;
758 l1_pgentry_t l1e, *l1t;
759 unsigned int required_flags, disallowed_flags;
761 /* Reserved bit violations are never spurious faults. */
762 if ( regs->error_code & PGERR_reserved_bit )
763 return 0;
765 required_flags = _PAGE_PRESENT;
766 if ( regs->error_code & PGERR_write_access )
767 required_flags |= _PAGE_RW;
768 if ( regs->error_code & PGERR_user_mode )
769 required_flags |= _PAGE_USER;
771 disallowed_flags = 0;
772 if ( regs->error_code & PGERR_instr_fetch )
773 disallowed_flags |= _PAGE_NX;
775 mfn = cr3 >> PAGE_SHIFT;
777 #if CONFIG_PAGING_LEVELS >= 4
778 l4t = map_domain_page(mfn);
779 l4e = l4t[l4_table_offset(addr)];
780 mfn = l4e_get_pfn(l4e);
781 unmap_domain_page(l4t);
782 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
783 (l4e_get_flags(l4e) & disallowed_flags) )
784 return 0;
785 #endif
787 #if CONFIG_PAGING_LEVELS >= 3
788 l3t = map_domain_page(mfn);
789 #ifdef CONFIG_X86_PAE
790 l3t += (cr3 & 0xFE0UL) >> 3;
791 #endif
792 l3e = l3t[l3_table_offset(addr)];
793 mfn = l3e_get_pfn(l3e);
794 unmap_domain_page(l3t);
795 #ifdef CONFIG_X86_PAE
796 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
797 return 0;
798 #else
799 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
800 (l3e_get_flags(l3e) & disallowed_flags) )
801 return 0;
802 #endif
803 #endif
805 l2t = map_domain_page(mfn);
806 l2e = l2t[l2_table_offset(addr)];
807 mfn = l2e_get_pfn(l2e);
808 unmap_domain_page(l2t);
809 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
810 (l2e_get_flags(l2e) & disallowed_flags) )
811 return 0;
812 if ( l2e_get_flags(l2e) & _PAGE_PSE )
813 {
814 l1e = l1e_empty(); /* define before use in debug tracing */
815 goto spurious;
816 }
818 l1t = map_domain_page(mfn);
819 l1e = l1t[l1_table_offset(addr)];
820 mfn = l1e_get_pfn(l1e);
821 unmap_domain_page(l1t);
822 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
823 (l1e_get_flags(l1e) & disallowed_flags) )
824 return 0;
826 spurious:
827 DPRINTK("Spurious fault in domain %u:%u at addr %lx, e/c %04x\n",
828 current->domain->domain_id, current->vcpu_id,
829 addr, regs->error_code);
830 #if CONFIG_PAGING_LEVELS >= 4
831 DPRINTK(" l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
832 #endif
833 #if CONFIG_PAGING_LEVELS >= 3
834 DPRINTK(" l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
835 #endif
836 DPRINTK(" l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
837 DPRINTK(" l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
838 #ifndef NDEBUG
839 show_registers(regs);
840 #endif
841 return 1;
842 }
844 static int spurious_page_fault(
845 unsigned long addr, struct cpu_user_regs *regs)
846 {
847 struct domain *d = current->domain;
848 int is_spurious;
850 LOCK_BIGLOCK(d);
851 is_spurious = __spurious_page_fault(addr, regs);
852 UNLOCK_BIGLOCK(d);
854 return is_spurious;
855 }
857 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
858 {
859 struct vcpu *v = current;
860 struct domain *d = v->domain;
862 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
863 {
864 if ( shadow_mode_external(d) && guest_mode(regs) )
865 return shadow_fault(addr, regs);
866 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
867 return handle_gdt_ldt_mapping_fault(
868 addr - GDT_LDT_VIRT_START, regs);
869 /*
870 * Do not propagate spurious faults in the hypervisor area to the
871 * guest. It cannot fix them up.
872 */
873 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
874 }
876 if ( unlikely(shadow_mode_enabled(d)) )
877 return shadow_fault(addr, regs);
879 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
880 guest_kernel_mode(v, regs) &&
881 ((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
882 (PGERR_write_access|PGERR_page_present)) )
883 return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
885 return 0;
886 }
888 /*
889 * #PF error code:
890 * Bit 0: Protection violation (=1) ; Page not present (=0)
891 * Bit 1: Write access
892 * Bit 2: User mode (=1) ; Supervisor mode (=0)
893 * Bit 3: Reserved bit violation
894 * Bit 4: Instruction fetch
895 */
896 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
897 {
898 unsigned long addr, fixup;
899 int rc;
901 ASSERT(!in_irq());
903 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
905 DEBUGGER_trap_entry(TRAP_page_fault, regs);
907 perfc_incrc(page_faults);
909 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
910 return rc;
912 if ( unlikely(!guest_mode(regs)) )
913 {
914 if ( spurious_page_fault(addr, regs) )
915 return EXCRET_not_a_fault;
917 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
918 {
919 perfc_incrc(copy_user_faults);
920 regs->eip = fixup;
921 return 0;
922 }
924 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
926 show_execution_state(regs);
927 show_page_walk(addr);
928 panic("CPU%d FATAL PAGE FAULT\n"
929 "[error_code=%04x]\n"
930 "Faulting linear address: %p\n",
931 smp_processor_id(), regs->error_code, _p(addr));
932 }
934 propagate_page_fault(addr, regs->error_code);
935 return 0;
936 }
938 long do_fpu_taskswitch(int set)
939 {
940 struct vcpu *v = current;
942 if ( set )
943 {
944 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
945 stts();
946 }
947 else
948 {
949 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
950 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
951 clts();
952 }
954 return 0;
955 }
957 /* Has the guest requested sufficient permission for this I/O access? */
958 static inline int guest_io_okay(
959 unsigned int port, unsigned int bytes,
960 struct vcpu *v, struct cpu_user_regs *regs)
961 {
962 u16 x;
963 #if defined(__x86_64__)
964 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
965 int user_mode = !(v->arch.flags & TF_kernel_mode);
966 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
967 #elif defined(__i386__)
968 #define TOGGLE_MODE() ((void)0)
969 #endif
971 if ( !vm86_mode(regs) &&
972 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
973 return 1;
975 if ( v->arch.iobmp_limit > (port + bytes) )
976 {
977 TOGGLE_MODE();
978 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
979 TOGGLE_MODE();
980 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
981 return 1;
982 }
984 return 0;
985 }
987 /* Has the administrator granted sufficient permission for this I/O access? */
988 static inline int admin_io_okay(
989 unsigned int port, unsigned int bytes,
990 struct vcpu *v, struct cpu_user_regs *regs)
991 {
992 return ioports_access_permitted(v->domain, port, port + bytes - 1);
993 }
995 /* Check admin limits. Silently fail the access if it is disallowed. */
996 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
997 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
998 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
999 #define outb_user(_v, _p, _d, _r) \
1000 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
1001 #define outw_user(_v, _p, _d, _r) \
1002 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
1003 #define outl_user(_v, _p, _d, _r) \
1004 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
1006 /* Instruction fetch with error handling. */
1007 #define insn_fetch(_type, _size, _ptr) \
1008 ({ unsigned long _rc, _x; \
1009 if ( (_rc = copy_from_user(&_x, (_type *)eip, sizeof(_type))) != 0 ) \
1010 { \
1011 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
1012 return EXCRET_fault_fixed; \
1013 } \
1014 eip += _size; (_type)_x; })
1016 static int emulate_privileged_op(struct cpu_user_regs *regs)
1017 {
1018 struct vcpu *v = current;
1019 unsigned long *reg, eip = regs->eip, res;
1020 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
1021 unsigned int port, i, op_bytes = 4, data, rc;
1022 u32 l, h;
1024 /* Legacy prefixes. */
1025 for ( i = 0; i < 8; i++ )
1026 {
1027 switch ( opcode = insn_fetch(u8, 1, eip) )
1028 {
1029 case 0x66: /* operand-size override */
1030 op_bytes ^= 6; /* switch between 2/4 bytes */
1031 break;
1032 case 0x67: /* address-size override */
1033 case 0x2e: /* CS override */
1034 case 0x3e: /* DS override */
1035 case 0x26: /* ES override */
1036 case 0x64: /* FS override */
1037 case 0x65: /* GS override */
1038 case 0x36: /* SS override */
1039 case 0xf0: /* LOCK */
1040 case 0xf2: /* REPNE/REPNZ */
1041 break;
1042 case 0xf3: /* REP/REPE/REPZ */
1043 rep_prefix = 1;
1044 break;
1045 default:
1046 goto done_prefixes;
1047 }
1048 }
1049 done_prefixes:
1051 #ifdef __x86_64__
1052 /* REX prefix. */
1053 if ( (opcode & 0xf0) == 0x40 )
1054 {
1055 modrm_reg = (opcode & 4) << 1; /* REX.R */
1056 modrm_rm = (opcode & 1) << 3; /* REX.B */
1058 /* REX.W and REX.X do not need to be decoded. */
1059 opcode = insn_fetch(u8, 1, eip);
1060 }
1061 #endif
1063 /* Input/Output String instructions. */
1064 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1065 {
1066 if ( rep_prefix && (regs->ecx == 0) )
1067 goto done;
1069 continue_io_string:
1070 switch ( opcode )
1071 {
1072 case 0x6c: /* INSB */
1073 op_bytes = 1;
1074 case 0x6d: /* INSW/INSL */
1075 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1076 goto fail;
1077 switch ( op_bytes )
1078 {
1079 case 1:
1080 data = (u8)inb_user((u16)regs->edx, v, regs);
1081 break;
1082 case 2:
1083 data = (u16)inw_user((u16)regs->edx, v, regs);
1084 break;
1085 case 4:
1086 data = (u32)inl_user((u16)regs->edx, v, regs);
1087 break;
1088 }
1089 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1090 {
1091 propagate_page_fault(regs->edi + op_bytes - rc,
1092 PGERR_write_access);
1093 return EXCRET_fault_fixed;
1094 }
1095 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1096 break;
1098 case 0x6e: /* OUTSB */
1099 op_bytes = 1;
1100 case 0x6f: /* OUTSW/OUTSL */
1101 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1102 goto fail;
1103 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1104 if ( rc != 0 )
1105 {
1106 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1107 return EXCRET_fault_fixed;
1108 }
1109 switch ( op_bytes )
1110 {
1111 case 1:
1112 outb_user((u8)data, (u16)regs->edx, v, regs);
1113 break;
1114 case 2:
1115 outw_user((u16)data, (u16)regs->edx, v, regs);
1116 break;
1117 case 4:
1118 outl_user((u32)data, (u16)regs->edx, v, regs);
1119 break;
1120 }
1121 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1122 break;
1123 }
1125 if ( rep_prefix && (--regs->ecx != 0) )
1126 {
1127 if ( !hypercall_preempt_check() )
1128 goto continue_io_string;
1129 eip = regs->eip;
1130 }
1132 goto done;
1133 }
1135 /* I/O Port and Interrupt Flag instructions. */
1136 switch ( opcode )
1137 {
1138 case 0xe4: /* IN imm8,%al */
1139 op_bytes = 1;
1140 case 0xe5: /* IN imm8,%eax */
1141 port = insn_fetch(u8, 1, eip);
1142 exec_in:
1143 if ( !guest_io_okay(port, op_bytes, v, regs) )
1144 goto fail;
1145 switch ( op_bytes )
1146 {
1147 case 1:
1148 regs->eax &= ~0xffUL;
1149 regs->eax |= (u8)inb_user(port, v, regs);
1150 break;
1151 case 2:
1152 regs->eax &= ~0xffffUL;
1153 regs->eax |= (u16)inw_user(port, v, regs);
1154 break;
1155 case 4:
1156 regs->eax = (u32)inl_user(port, v, regs);
1157 break;
1158 }
1159 goto done;
1161 case 0xec: /* IN %dx,%al */
1162 op_bytes = 1;
1163 case 0xed: /* IN %dx,%eax */
1164 port = (u16)regs->edx;
1165 goto exec_in;
1167 case 0xe6: /* OUT %al,imm8 */
1168 op_bytes = 1;
1169 case 0xe7: /* OUT %eax,imm8 */
1170 port = insn_fetch(u8, 1, eip);
1171 exec_out:
1172 if ( !guest_io_okay(port, op_bytes, v, regs) )
1173 goto fail;
1174 switch ( op_bytes )
1175 {
1176 case 1:
1177 outb_user((u8)regs->eax, port, v, regs);
1178 break;
1179 case 2:
1180 outw_user((u16)regs->eax, port, v, regs);
1181 break;
1182 case 4:
1183 outl_user((u32)regs->eax, port, v, regs);
1184 break;
1185 }
1186 goto done;
1188 case 0xee: /* OUT %al,%dx */
1189 op_bytes = 1;
1190 case 0xef: /* OUT %eax,%dx */
1191 port = (u16)regs->edx;
1192 goto exec_out;
1194 case 0xfa: /* CLI */
1195 case 0xfb: /* STI */
1196 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1197 goto fail;
1198 /*
1199 * This is just too dangerous to allow, in my opinion. Consider if the
1200 * caller then tries to reenable interrupts using POPF: we can't trap
1201 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1202 * do for us. :-)
1203 */
1204 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1205 goto done;
1207 case 0x0f: /* Two-byte opcode */
1208 break;
1210 default:
1211 goto fail;
1212 }
1214 /* Remaining instructions only emulated from guest kernel. */
1215 if ( !guest_kernel_mode(v, regs) )
1216 goto fail;
1218 /* Privileged (ring 0) instructions. */
1219 opcode = insn_fetch(u8, 1, eip);
1220 switch ( opcode )
1221 {
1222 case 0x06: /* CLTS */
1223 (void)do_fpu_taskswitch(0);
1224 break;
1226 case 0x09: /* WBINVD */
1227 /* Ignore the instruction if unprivileged. */
1228 if ( !cache_flush_permitted(v->domain) )
1229 /* Non-physdev domain attempted WBINVD; ignore for now since
1230 newer linux uses this in some start-of-day timing loops */
1231 ;
1232 else
1233 wbinvd();
1234 break;
1236 case 0x20: /* MOV CR?,<reg> */
1237 opcode = insn_fetch(u8, 1, eip);
1238 modrm_reg |= (opcode >> 3) & 7;
1239 modrm_rm |= (opcode >> 0) & 7;
1240 reg = decode_register(modrm_rm, regs, 0);
1241 switch ( modrm_reg )
1242 {
1243 case 0: /* Read CR0 */
1244 *reg = (read_cr0() & ~X86_CR0_TS) |
1245 v->arch.guest_context.ctrlreg[0];
1246 break;
1248 case 2: /* Read CR2 */
1249 *reg = v->arch.guest_context.ctrlreg[2];
1250 break;
1252 case 3: /* Read CR3 */
1253 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1254 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1255 break;
1257 case 4: /* Read CR4 */
1258 /*
1259 * Guests can read CR4 to see what features Xen has enabled. We
1260 * therefore lie about PGE & PSE as they are unavailable to guests.
1261 */
1262 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1263 break;
1265 default:
1266 goto fail;
1267 }
1268 break;
1270 case 0x21: /* MOV DR?,<reg> */
1271 opcode = insn_fetch(u8, 1, eip);
1272 modrm_reg |= (opcode >> 3) & 7;
1273 modrm_rm |= (opcode >> 0) & 7;
1274 reg = decode_register(modrm_rm, regs, 0);
1275 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1276 goto fail;
1277 *reg = res;
1278 break;
1280 case 0x22: /* MOV <reg>,CR? */
1281 opcode = insn_fetch(u8, 1, eip);
1282 modrm_reg |= (opcode >> 3) & 7;
1283 modrm_rm |= (opcode >> 0) & 7;
1284 reg = decode_register(modrm_rm, regs, 0);
1285 switch ( modrm_reg )
1286 {
1287 case 0: /* Write CR0 */
1288 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1289 {
1290 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1291 goto fail;
1292 }
1293 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1294 break;
1296 case 2: /* Write CR2 */
1297 v->arch.guest_context.ctrlreg[2] = *reg;
1298 v->vcpu_info->arch.cr2 = *reg;
1299 break;
1301 case 3: /* Write CR3 */
1302 LOCK_BIGLOCK(v->domain);
1303 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1304 UNLOCK_BIGLOCK(v->domain);
1305 break;
1307 case 4:
1308 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1309 {
1310 DPRINTK("Attempt to change CR4 flags.\n");
1311 goto fail;
1312 }
1313 break;
1315 default:
1316 goto fail;
1317 }
1318 break;
1320 case 0x23: /* MOV <reg>,DR? */
1321 opcode = insn_fetch(u8, 1, eip);
1322 modrm_reg |= (opcode >> 3) & 7;
1323 modrm_rm |= (opcode >> 0) & 7;
1324 reg = decode_register(modrm_rm, regs, 0);
1325 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1326 goto fail;
1327 break;
1329 case 0x30: /* WRMSR */
1330 switch ( regs->ecx )
1331 {
1332 #ifdef CONFIG_X86_64
1333 case MSR_FS_BASE:
1334 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1335 goto fail;
1336 v->arch.guest_context.fs_base =
1337 ((u64)regs->edx << 32) | regs->eax;
1338 break;
1339 case MSR_GS_BASE:
1340 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1341 goto fail;
1342 v->arch.guest_context.gs_base_kernel =
1343 ((u64)regs->edx << 32) | regs->eax;
1344 break;
1345 case MSR_SHADOW_GS_BASE:
1346 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1347 goto fail;
1348 v->arch.guest_context.gs_base_user =
1349 ((u64)regs->edx << 32) | regs->eax;
1350 break;
1351 #endif
1352 default:
1353 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1354 break;
1356 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1357 (regs->eax != l) || (regs->edx != h) )
1358 DPRINTK("Domain attempted WRMSR %p from "
1359 "%08x:%08x to %08lx:%08lx.\n",
1360 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1361 break;
1362 }
1363 break;
1365 case 0x32: /* RDMSR */
1366 switch ( regs->ecx )
1367 {
1368 #ifdef CONFIG_X86_64
1369 case MSR_FS_BASE:
1370 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1371 regs->edx = v->arch.guest_context.fs_base >> 32;
1372 break;
1373 case MSR_GS_BASE:
1374 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1375 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1376 break;
1377 case MSR_SHADOW_GS_BASE:
1378 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1379 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1380 break;
1381 #endif
1382 case MSR_EFER:
1383 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1384 goto fail;
1385 break;
1386 default:
1387 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1388 {
1389 regs->eax = l;
1390 regs->edx = h;
1391 break;
1392 }
1393 /* Everyone can read the MSR space. */
1394 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1395 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1396 goto fail;
1397 break;
1398 }
1399 break;
1401 default:
1402 goto fail;
1403 }
1405 done:
1406 regs->eip = eip;
1407 return EXCRET_fault_fixed;
1409 fail:
1410 return 0;
1411 }
1413 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1414 {
1415 struct vcpu *v = current;
1416 struct trap_bounce *tb = &v->arch.trap_bounce;
1417 struct trap_info *ti;
1418 unsigned long fixup;
1420 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1422 if ( regs->error_code & 1 )
1423 goto hardware_gp;
1425 if ( !guest_mode(regs) )
1426 goto gp_in_kernel;
1428 /*
1429 * Cunning trick to allow arbitrary "INT n" handling.
1431 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1432 * instruction from trapping to the appropriate vector, when that might not
1433 * be expected by Xen or the guest OS. For example, that entry might be for
1434 * a fault handler (unlike traps, faults don't increment EIP), or might
1435 * expect an error code on the stack (which a software trap never
1436 * provides), or might be a hardware interrupt handler that doesn't like
1437 * being called spuriously.
1439 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1440 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1441 * clear to indicate that it's a software fault, not hardware.
1443 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1444 * okay because they can only be triggered by an explicit DPL-checked
1445 * instruction. The DPL specified by the guest OS for these vectors is NOT
1446 * CHECKED!!
1447 */
1448 if ( (regs->error_code & 3) == 2 )
1449 {
1450 /* This fault must be due to <INT n> instruction. */
1451 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1452 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1453 {
1454 tb->flags = TBF_EXCEPTION;
1455 regs->eip += 2;
1456 goto finish_propagation;
1457 }
1458 }
1460 /* Emulate some simple privileged and I/O instructions. */
1461 if ( (regs->error_code == 0) &&
1462 emulate_privileged_op(regs) )
1463 return 0;
1465 #if defined(__i386__)
1466 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1467 (regs->error_code == 0) &&
1468 gpf_emulate_4gb(regs) )
1469 return 0;
1470 #endif
1472 /* Pass on GPF as is. */
1473 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1474 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1475 tb->error_code = regs->error_code;
1476 finish_propagation:
1477 tb->cs = ti->cs;
1478 tb->eip = ti->address;
1479 if ( TI_GET_IF(ti) )
1480 tb->flags |= TBF_INTERRUPT;
1481 return 0;
1483 gp_in_kernel:
1485 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1486 {
1487 DPRINTK("GPF (%04x): %p -> %p\n",
1488 regs->error_code, _p(regs->eip), _p(fixup));
1489 regs->eip = fixup;
1490 return 0;
1491 }
1493 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1495 hardware_gp:
1496 show_execution_state(regs);
1497 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1498 smp_processor_id(), regs->error_code);
1499 return 0;
1500 }
1502 static void nmi_softirq(void)
1503 {
1504 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1505 vcpu_kick(dom0->vcpu[0]);
1506 }
1508 static void nmi_dom0_report(unsigned int reason_idx)
1509 {
1510 struct domain *d;
1511 struct vcpu *v;
1513 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1514 return;
1516 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1518 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1519 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1520 }
1522 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1523 {
1524 switch ( opt_nmi[0] )
1525 {
1526 case 'd': /* 'dom0' */
1527 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1528 case 'i': /* 'ignore' */
1529 break;
1530 default: /* 'fatal' */
1531 console_force_unlock();
1532 printk("\n\nNMI - MEMORY ERROR\n");
1533 fatal_trap(TRAP_nmi, regs);
1534 }
1536 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1537 mdelay(1);
1538 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1539 }
1541 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1542 {
1543 switch ( opt_nmi[0] )
1544 {
1545 case 'd': /* 'dom0' */
1546 nmi_dom0_report(_XEN_NMIREASON_io_error);
1547 case 'i': /* 'ignore' */
1548 break;
1549 default: /* 'fatal' */
1550 console_force_unlock();
1551 printk("\n\nNMI - I/O ERROR\n");
1552 fatal_trap(TRAP_nmi, regs);
1553 }
1555 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1556 mdelay(1);
1557 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1558 }
1560 static void unknown_nmi_error(unsigned char reason)
1561 {
1562 switch ( opt_nmi[0] )
1563 {
1564 case 'd': /* 'dom0' */
1565 nmi_dom0_report(_XEN_NMIREASON_unknown);
1566 case 'i': /* 'ignore' */
1567 break;
1568 default: /* 'fatal' */
1569 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1570 printk("Dazed and confused, but trying to continue\n");
1571 printk("Do you have a strange power saving mode enabled?\n");
1572 }
1573 }
1575 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1576 {
1577 return 0;
1578 }
1580 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1582 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1583 {
1584 unsigned int cpu = smp_processor_id();
1585 unsigned char reason;
1587 ++nmi_count(cpu);
1589 if ( nmi_callback(regs, cpu) )
1590 return;
1592 if ( nmi_watchdog )
1593 nmi_watchdog_tick(regs);
1595 /* Only the BSP gets external NMIs from the system. */
1596 if ( cpu == 0 )
1597 {
1598 reason = inb(0x61);
1599 if ( reason & 0x80 )
1600 mem_parity_error(regs);
1601 else if ( reason & 0x40 )
1602 io_check_error(regs);
1603 else if ( !nmi_watchdog )
1604 unknown_nmi_error((unsigned char)(reason&0xff));
1605 }
1606 }
1608 void set_nmi_callback(nmi_callback_t callback)
1609 {
1610 nmi_callback = callback;
1611 }
1613 void unset_nmi_callback(void)
1614 {
1615 nmi_callback = dummy_nmi_callback;
1616 }
1618 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1619 {
1620 struct trap_bounce *tb;
1621 struct trap_info *ti;
1623 setup_fpu(current);
1625 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1626 {
1627 tb = &current->arch.trap_bounce;
1628 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1630 tb->flags = TBF_EXCEPTION;
1631 tb->cs = ti->cs;
1632 tb->eip = ti->address;
1633 if ( TI_GET_IF(ti) )
1634 tb->flags |= TBF_INTERRUPT;
1636 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1637 }
1639 return EXCRET_fault_fixed;
1640 }
1642 asmlinkage int do_debug(struct cpu_user_regs *regs)
1643 {
1644 unsigned long condition;
1645 struct vcpu *v = current;
1646 struct trap_bounce *tb = &v->arch.trap_bounce;
1647 struct trap_info *ti;
1649 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1651 /* Mask out spurious debug traps due to lazy DR7 setting */
1652 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1653 (v->arch.guest_context.debugreg[7] == 0) )
1654 {
1655 __asm__("mov %0,%%db7" : : "r" (0UL));
1656 goto out;
1657 }
1659 DEBUGGER_trap_entry(TRAP_debug, regs);
1661 if ( !guest_mode(regs) )
1662 {
1663 /* Clear TF just for absolute sanity. */
1664 regs->eflags &= ~EF_TF;
1665 /*
1666 * We ignore watchpoints when they trigger within Xen. This may happen
1667 * when a buffer is passed to us which previously had a watchpoint set
1668 * on it. No need to bump EIP; the only faulting trap is an instruction
1669 * breakpoint, which can't happen to us.
1670 */
1671 goto out;
1672 }
1674 /* Save debug status register where guest OS can peek at it */
1675 v->arch.guest_context.debugreg[6] = condition;
1677 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1678 tb->flags = TBF_EXCEPTION;
1679 tb->cs = ti->cs;
1680 tb->eip = ti->address;
1681 if ( TI_GET_IF(ti) )
1682 tb->flags |= TBF_INTERRUPT;
1684 out:
1685 return EXCRET_not_a_fault;
1686 }
1688 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1689 {
1690 return EXCRET_not_a_fault;
1691 }
1693 void set_intr_gate(unsigned int n, void *addr)
1694 {
1695 #ifdef __i386__
1696 int i;
1697 /* Keep secondary tables in sync with IRQ updates. */
1698 for ( i = 1; i < NR_CPUS; i++ )
1699 if ( idt_tables[i] != NULL )
1700 _set_gate(&idt_tables[i][n], 14, 0, addr);
1701 #endif
1702 _set_gate(&idt_table[n], 14, 0, addr);
1703 }
1705 void set_system_gate(unsigned int n, void *addr)
1706 {
1707 _set_gate(idt_table+n,14,3,addr);
1708 }
1710 void set_task_gate(unsigned int n, unsigned int sel)
1711 {
1712 idt_table[n].a = sel << 16;
1713 idt_table[n].b = 0x8500;
1714 }
1716 void set_tss_desc(unsigned int n, void *addr)
1717 {
1718 _set_tssldt_desc(
1719 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1720 (unsigned long)addr,
1721 offsetof(struct tss_struct, __cacheline_filler) - 1,
1722 9);
1723 }
1725 void __init trap_init(void)
1726 {
1727 extern void percpu_traps_init(void);
1729 /*
1730 * Note that interrupt gates are always used, rather than trap gates. We
1731 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1732 * first activation must have the "bad" value(s) for these registers and
1733 * we may lose them if another activation is installed before they are
1734 * saved. The page-fault handler also needs interrupts disabled until %cr2
1735 * has been read and saved on the stack.
1736 */
1737 set_intr_gate(TRAP_divide_error,&divide_error);
1738 set_intr_gate(TRAP_debug,&debug);
1739 set_intr_gate(TRAP_nmi,&nmi);
1740 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1741 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1742 set_intr_gate(TRAP_bounds,&bounds);
1743 set_intr_gate(TRAP_invalid_op,&invalid_op);
1744 set_intr_gate(TRAP_no_device,&device_not_available);
1745 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1746 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1747 set_intr_gate(TRAP_no_segment,&segment_not_present);
1748 set_intr_gate(TRAP_stack_error,&stack_segment);
1749 set_intr_gate(TRAP_gp_fault,&general_protection);
1750 set_intr_gate(TRAP_page_fault,&page_fault);
1751 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1752 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1753 set_intr_gate(TRAP_alignment_check,&alignment_check);
1754 set_intr_gate(TRAP_machine_check,&machine_check);
1755 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1757 percpu_traps_init();
1759 cpu_init();
1761 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1762 }
1765 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1766 {
1767 struct trap_info cur;
1768 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1769 long rc = 0;
1771 /* If no table is presented then clear the entire virtual IDT. */
1772 if ( guest_handle_is_null(traps) )
1773 {
1774 memset(dst, 0, 256 * sizeof(*dst));
1775 init_int80_direct_trap(current);
1776 return 0;
1777 }
1779 for ( ; ; )
1780 {
1781 if ( hypercall_preempt_check() )
1782 {
1783 rc = hypercall_create_continuation(
1784 __HYPERVISOR_set_trap_table, "h", traps);
1785 break;
1786 }
1788 if ( copy_from_guest(&cur, traps, 1) )
1789 {
1790 rc = -EFAULT;
1791 break;
1792 }
1794 if ( cur.address == 0 )
1795 break;
1797 fixup_guest_code_selector(cur.cs);
1799 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1801 if ( cur.vector == 0x80 )
1802 init_int80_direct_trap(current);
1804 guest_handle_add_offset(traps, 1);
1805 }
1807 return rc;
1808 }
1811 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1812 {
1813 int i;
1815 switch ( reg )
1816 {
1817 case 0:
1818 if ( !access_ok(value, sizeof(long)) )
1819 return -EPERM;
1820 if ( p == current )
1821 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1822 break;
1823 case 1:
1824 if ( !access_ok(value, sizeof(long)) )
1825 return -EPERM;
1826 if ( p == current )
1827 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1828 break;
1829 case 2:
1830 if ( !access_ok(value, sizeof(long)) )
1831 return -EPERM;
1832 if ( p == current )
1833 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1834 break;
1835 case 3:
1836 if ( !access_ok(value, sizeof(long)) )
1837 return -EPERM;
1838 if ( p == current )
1839 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1840 break;
1841 case 6:
1842 /*
1843 * DR6: Bits 4-11,16-31 reserved (set to 1).
1844 * Bit 12 reserved (set to 0).
1845 */
1846 value &= 0xffffefff; /* reserved bits => 0 */
1847 value |= 0xffff0ff0; /* reserved bits => 1 */
1848 if ( p == current )
1849 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1850 break;
1851 case 7:
1852 /*
1853 * DR7: Bit 10 reserved (set to 1).
1854 * Bits 11-12,14-15 reserved (set to 0).
1855 * Privileged bits:
1856 * GD (bit 13): must be 0.
1857 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1858 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1859 */
1860 /* DR7 == 0 => debugging disabled for this domain. */
1861 if ( value != 0 )
1862 {
1863 value &= 0xffff27ff; /* reserved bits => 0 */
1864 value |= 0x00000400; /* reserved bits => 1 */
1865 if ( (value & (1<<13)) != 0 ) return -EPERM;
1866 for ( i = 0; i < 16; i += 2 )
1867 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1868 }
1869 if ( p == current )
1870 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1871 break;
1872 default:
1873 return -EINVAL;
1874 }
1876 p->arch.guest_context.debugreg[reg] = value;
1877 return 0;
1878 }
1880 long do_set_debugreg(int reg, unsigned long value)
1881 {
1882 return set_debugreg(current, reg, value);
1883 }
1885 unsigned long do_get_debugreg(int reg)
1886 {
1887 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1888 return current->arch.guest_context.debugreg[reg];
1889 }
1891 /*
1892 * Local variables:
1893 * mode: C
1894 * c-set-style: "BSD"
1895 * c-basic-offset: 4
1896 * tab-width: 4
1897 * indent-tabs-mode: nil
1898 * End:
1899 */