ia64/xen-unstable: xen/arch/x86/traps.c @ 11233:58ed222274f2

[XEN] Remove unnecessary printk from invalid opcode debug output.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kfraser@localhost.localdomain
date     Mon Aug 21 09:01:53 2006 +0100 (2006-08-21)
parents  45a84091144e
children 51a98a6c2c05
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <asm/shadow.h>
49 #include <asm/system.h>
50 #include <asm/io.h>
51 #include <asm/atomic.h>
52 #include <asm/desc.h>
53 #include <asm/debugreg.h>
54 #include <asm/smp.h>
55 #include <asm/flushtlb.h>
56 #include <asm/uaccess.h>
57 #include <asm/i387.h>
58 #include <asm/debugger.h>
59 #include <asm/msr.h>
60 #include <asm/x86_emulate.h>
62 /*
63 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
64 * fatal: Xen prints diagnostic message and then hangs.
65 * dom0: The NMI is virtualised to DOM0.
66 * ignore: The NMI error is cleared and ignored.
67 */
68 #ifdef NDEBUG
69 char opt_nmi[10] = "dom0";
70 #else
71 char opt_nmi[10] = "fatal";
72 #endif
73 string_param("nmi", opt_nmi);
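/*
 * Usage note: string_param() registers "nmi" as a Xen boot command line
 * option, so the build-time default chosen above can be overridden at boot,
 * e.g. with "nmi=ignore" or "nmi=dom0".
 */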
75 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
76 idt_entry_t idt_table[IDT_ENTRIES];
78 #define DECLARE_TRAP_HANDLER(_name) \
79 asmlinkage void _name(void); \
80 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
82 asmlinkage void nmi(void);
83 DECLARE_TRAP_HANDLER(divide_error);
84 DECLARE_TRAP_HANDLER(debug);
85 DECLARE_TRAP_HANDLER(int3);
86 DECLARE_TRAP_HANDLER(overflow);
87 DECLARE_TRAP_HANDLER(bounds);
88 DECLARE_TRAP_HANDLER(invalid_op);
89 DECLARE_TRAP_HANDLER(device_not_available);
90 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
91 DECLARE_TRAP_HANDLER(invalid_TSS);
92 DECLARE_TRAP_HANDLER(segment_not_present);
93 DECLARE_TRAP_HANDLER(stack_segment);
94 DECLARE_TRAP_HANDLER(general_protection);
95 DECLARE_TRAP_HANDLER(page_fault);
96 DECLARE_TRAP_HANDLER(coprocessor_error);
97 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
98 DECLARE_TRAP_HANDLER(alignment_check);
99 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
100 DECLARE_TRAP_HANDLER(machine_check);
102 long do_set_debugreg(int reg, unsigned long value);
103 unsigned long do_get_debugreg(int reg);
105 static int debug_stack_lines = 20;
106 integer_param("debug_stack_lines", debug_stack_lines);
108 #ifdef CONFIG_X86_32
109 #define stack_words_per_line 8
110 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
111 #else
112 #define stack_words_per_line 4
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
114 #endif
116 int is_kernel_text(unsigned long addr)
117 {
118 extern char _stext, _etext;
119 if (addr >= (unsigned long) &_stext &&
120 addr <= (unsigned long) &_etext)
121 return 1;
122 return 0;
124 }
126 unsigned long kernel_text_end(void)
127 {
128 extern char _etext;
129 return (unsigned long) &_etext;
130 }
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
137 if ( hvm_guest(current) )
138 return;
140 if ( vm86_mode(regs) )
141 {
142 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
143 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
144 regs->ss, (uint16_t)(regs->esp & 0xffff));
145 }
146 else
147 {
148 stack = (unsigned long *)regs->esp;
149 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
150 }
152 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
153 {
154 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
155 break;
156 if ( get_user(addr, stack) )
157 {
158 if ( i != 0 )
159 printk("\n ");
160 printk("Fault while accessing guest memory.");
161 i = 1;
162 break;
163 }
164 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
165 printk("\n ");
166 printk(" %p", _p(addr));
167 stack++;
168 }
169 if ( i == 0 )
170 printk("Stack empty.");
171 printk("\n");
172 }
174 #ifdef NDEBUG
176 static void show_trace(struct cpu_user_regs *regs)
177 {
178 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
180 printk("Xen call trace:\n ");
182 printk("[<%p>]", _p(regs->eip));
183 print_symbol(" %s\n ", regs->eip);
185 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
186 {
187 addr = *stack++;
188 if ( is_kernel_text(addr) )
189 {
190 printk("[<%p>]", _p(addr));
191 print_symbol(" %s\n ", addr);
192 }
193 }
195 printk("\n");
196 }
198 #else
200 static void show_trace(struct cpu_user_regs *regs)
201 {
202 unsigned long *frame, next, addr, low, high;
204 printk("Xen call trace:\n ");
206 printk("[<%p>]", _p(regs->eip));
207 print_symbol(" %s\n ", regs->eip);
209 /* Bounds for range of valid frame pointer. */
210 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
211 high = (low & ~(STACK_SIZE - 1)) +
212 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
214 /* The initial frame pointer. */
215 next = regs->ebp;
217 for ( ; ; )
218 {
219 /* Valid frame pointer? */
220 if ( (next < low) || (next >= high) )
221 {
222 /*
223 * Exception stack frames have a different layout, denoted by an
224 * inverted frame pointer.
225 */
226 next = ~next;
227 if ( (next < low) || (next >= high) )
228 break;
229 frame = (unsigned long *)next;
230 next = frame[0];
231 addr = frame[(offsetof(struct cpu_user_regs, eip) -
232 offsetof(struct cpu_user_regs, ebp))
233 / BYTES_PER_LONG];
234 }
235 else
236 {
237 /* Ordinary stack frame. */
238 frame = (unsigned long *)next;
239 next = frame[0];
240 addr = frame[1];
241 }
243 printk("[<%p>]", _p(addr));
244 print_symbol(" %s\n ", addr);
246 low = (unsigned long)&frame[2];
247 }
249 printk("\n");
250 }
252 #endif
254 void show_stack(struct cpu_user_regs *regs)
255 {
256 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
257 int i;
259 if ( guest_mode(regs) )
260 return show_guest_stack(regs);
262 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
264 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
265 {
266 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
267 break;
268 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
269 printk("\n ");
270 addr = *stack++;
271 printk(" %p", _p(addr));
272 }
273 if ( i == 0 )
274 printk("Stack empty.");
275 printk("\n");
277 show_trace(regs);
278 }
280 void show_xen_trace()
281 {
282 struct cpu_user_regs regs;
283 #ifdef __x86_64__
284 __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
285 __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
286 __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
287 #else
288 __asm__("movl %%esp,%0" : "=m" (regs.esp));
289 __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
290 __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
291 #endif
292 show_trace(&regs);
293 }
295 void show_stack_overflow(unsigned long esp)
296 {
297 #ifdef MEMORY_GUARD
298 unsigned long esp_top;
299 unsigned long *stack, addr;
301 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
303 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
304 if ( ((unsigned long)(esp - esp_top) > 512) &&
305 ((unsigned long)(esp_top - esp) > 512) )
306 return;
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow:\n ");
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 /*
335 * This is called for faults at very unexpected times (e.g., when interrupts
336 * are disabled). In such situations we can't do much that is safe. We try to
337 * print out some tracing and then we just spin.
338 */
339 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
340 {
341 int cpu = smp_processor_id();
342 unsigned long cr2;
343 static char *trapstr[] = {
344 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
345 "invalid opcode", "device not available", "double fault",
346 "coprocessor segment", "invalid tss", "segment not found",
347 "stack error", "general protection fault", "page fault",
348 "spurious interrupt", "coprocessor error", "alignment check",
349 "machine check", "simd error"
350 };
352 watchdog_disable();
353 console_start_sync();
355 show_execution_state(regs);
357 if ( trapnr == TRAP_page_fault )
358 {
359 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
360 printk("Faulting linear address: %p\n", _p(cr2));
361 show_page_walk(cr2);
362 }
364 printk("************************************\n");
365 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
366 cpu, trapnr, trapstr[trapnr], regs->error_code,
367 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
368 printk("System shutting down -- need manual reset.\n");
369 printk("************************************\n");
371 (void)debugger_trap_fatal(trapnr, regs);
373 /* Lock up the console to prevent spurious output from other CPUs. */
374 console_force_lock();
376 /* Wait for manual reset. */
377 machine_halt();
378 }
380 static inline int do_trap(int trapnr, char *str,
381 struct cpu_user_regs *regs,
382 int use_error_code)
383 {
384 struct vcpu *v = current;
385 struct trap_bounce *tb = &v->arch.trap_bounce;
386 struct trap_info *ti;
387 unsigned long fixup;
389 DEBUGGER_trap_entry(trapnr, regs);
391 if ( !guest_mode(regs) )
392 goto xen_fault;
394 ti = &current->arch.guest_context.trap_ctxt[trapnr];
395 tb->flags = TBF_EXCEPTION;
396 tb->cs = ti->cs;
397 tb->eip = ti->address;
398 if ( use_error_code )
399 {
400 tb->flags |= TBF_EXCEPTION_ERRCODE;
401 tb->error_code = regs->error_code;
402 }
403 if ( TI_GET_IF(ti) )
404 tb->flags |= TBF_INTERRUPT;
405 return 0;
407 xen_fault:
409 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
410 {
411 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
412 regs->eip = fixup;
413 return 0;
414 }
416 DEBUGGER_trap_fatal(trapnr, regs);
418 show_execution_state(regs);
419 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
420 "[error_code=%04x]\n",
421 smp_processor_id(), trapnr, str, regs->error_code);
422 return 0;
423 }
425 #define DO_ERROR_NOCODE(trapnr, str, name) \
426 asmlinkage int do_##name(struct cpu_user_regs *regs) \
427 { \
428 return do_trap(trapnr, str, regs, 0); \
429 }
431 #define DO_ERROR(trapnr, str, name) \
432 asmlinkage int do_##name(struct cpu_user_regs *regs) \
433 { \
434 return do_trap(trapnr, str, regs, 1); \
435 }
437 DO_ERROR_NOCODE( 0, "divide error", divide_error)
438 DO_ERROR_NOCODE( 4, "overflow", overflow)
439 DO_ERROR_NOCODE( 5, "bounds", bounds)
440 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
441 DO_ERROR(10, "invalid TSS", invalid_TSS)
442 DO_ERROR(11, "segment not present", segment_not_present)
443 DO_ERROR(12, "stack segment", stack_segment)
444 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
445 DO_ERROR(17, "alignment check", alignment_check)
446 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
448 int rdmsr_hypervisor_regs(
449 uint32_t idx, uint32_t *eax, uint32_t *edx)
450 {
451 idx -= 0x40000000;
452 if ( idx > 0 )
453 return 0;
455 *eax = *edx = 0;
456 return 1;
457 }
459 int wrmsr_hypervisor_regs(
460 uint32_t idx, uint32_t eax, uint32_t edx)
461 {
462 struct domain *d = current->domain;
464 idx -= 0x40000000;
465 if ( idx > 0 )
466 return 0;
468 switch ( idx )
469 {
470 case 0:
471 {
472 void *hypercall_page;
473 unsigned long mfn;
474 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
475 unsigned int idx = eax & 0xfff;
477 if ( idx > 0 )
478 {
479 DPRINTK("Dom%d: Out of range index %u to MSR %08x\n",
480 d->domain_id, idx, 0x40000000);
481 return 0;
482 }
484 mfn = gmfn_to_mfn(d, gmfn);
486 if ( !mfn_valid(mfn) ||
487 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
488 {
489 DPRINTK("Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
490 d->domain_id, gmfn, mfn, 0x40000000);
491 return 0;
492 }
494 hypercall_page = map_domain_page(mfn);
495 hypercall_page_initialise(d, hypercall_page);
496 unmap_domain_page(hypercall_page);
498 put_page_and_type(mfn_to_page(mfn));
499 break;
500 }
502 default:
503 BUG();
504 }
506 return 1;
507 }
509 int cpuid_hypervisor_leaves(
510 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
511 {
512 idx -= 0x40000000;
513 if ( idx > 2 )
514 return 0;
516 switch ( idx )
517 {
518 case 0:
519 *eax = 0x40000002; /* Largest leaf */
520 *ebx = 0x566e6558; /* Signature 1: "XenV" */
521 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
522 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
523 break;
525 case 1:
526 *eax = (xen_major_version() << 16) | xen_minor_version();
527 *ebx = 0; /* Reserved */
528 *ecx = 0; /* Reserved */
529 *edx = 0; /* Reserved */
530 break;
532 case 2:
533 *eax = 1; /* Number of hypercall-transfer pages */
534 *ebx = 0x40000000; /* MSR base address */
535 *ecx = 0; /* Features 1 */
536 *edx = 0; /* Features 2 */
537 break;
539 default:
540 BUG();
541 }
543 return 1;
544 }
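/*
 * Illustrative guest-side sketch (not part of this file; the cpuid() and
 * wrmsr() helpers and the hypercall_page_gmfn value are assumed): a guest
 * can discover Xen and install the hypercall page using just the two
 * interfaces implemented above -- the 0x4000000x CPUID leaves and the
 * 0x40000000 MSR handled by wrmsr_hypervisor_regs():
 *
 *     uint32_t eax, ebx, ecx, edx;
 *     cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
 *     // ebx,ecx,edx spell "XenVMMXenVMM"; eax = 0x40000002 (largest leaf)
 *     cpuid(0x40000002, &eax, &ebx, &ecx, &edx);
 *     // eax = number of hypercall pages (1), ebx = MSR base (0x40000000)
 *     wrmsr(ebx, ((uint64_t)hypercall_page_gmfn << 12) | 0);
 *     // index 0 selects the single hypercall page; Xen maps the GMFN and
 *     // fills in the hypercall trampolines (hypercall_page_initialise()).
 */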
546 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
547 {
548 char sig[5], instr[2];
549 uint32_t a, b, c, d;
550 unsigned long eip, rc;
552 a = regs->eax;
553 b = regs->ebx;
554 c = regs->ecx;
555 d = regs->edx;
556 eip = regs->eip;
558 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
559 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
560 {
561 propagate_page_fault(eip + sizeof(sig) - rc, 0);
562 return EXCRET_fault_fixed;
563 }
564 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
565 return 0;
566 eip += sizeof(sig);
568 /* We only emulate CPUID. */
569 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
570 {
571 propagate_page_fault(eip + sizeof(instr) - rc, 0);
572 return EXCRET_fault_fixed;
573 }
574 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
575 return 0;
576 eip += sizeof(instr);
578 __asm__ (
579 "cpuid"
580 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
581 : "0" (a), "1" (b), "2" (c), "3" (d) );
583 if ( regs->eax == 1 )
584 {
585 /* Modify Feature Information. */
586 clear_bit(X86_FEATURE_VME, &d);
587 clear_bit(X86_FEATURE_DE, &d);
588 clear_bit(X86_FEATURE_PSE, &d);
589 clear_bit(X86_FEATURE_PGE, &d);
590 if ( !supervisor_mode_kernel )
591 clear_bit(X86_FEATURE_SEP, &d);
592 if ( !IS_PRIV(current->domain) )
593 clear_bit(X86_FEATURE_MTRR, &d);
594 }
595 else
596 {
597 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
598 }
600 regs->eax = a;
601 regs->ebx = b;
602 regs->ecx = c;
603 regs->edx = d;
604 regs->eip = eip;
606 return EXCRET_fault_fixed;
607 }
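/*
 * Illustrative guest-side sketch (not part of this file): the forced
 * emulation path above is how a PV guest asks Xen for the filtered CPUID
 * view. The guest prefixes CPUID with the 5-byte signature checked above
 * (ud2 == 0x0f 0x0b, then "xen"), e.g. in GCC inline assembly:
 *
 *     asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
 *                    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 *                    : "0" (leaf) );
 *
 * The resulting #UD lands in do_invalid_op() below, and eip is advanced
 * past the whole seven-byte sequence (5-byte signature + 2-byte cpuid).
 */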
609 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
610 {
611 struct vcpu *v = current;
612 struct trap_bounce *tb = &v->arch.trap_bounce;
613 struct trap_info *ti;
614 int rc;
616 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
618 if ( unlikely(!guest_mode(regs)) )
619 {
620 char sig[5];
621 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
622 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
623 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
624 {
625 show_execution_state(regs);
626 regs->eip += sizeof(sig);
627 return EXCRET_fault_fixed;
628 }
629 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
630 show_execution_state(regs);
631 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
632 smp_processor_id(), TRAP_invalid_op);
633 }
635 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
636 return rc;
638 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
639 tb->flags = TBF_EXCEPTION;
640 tb->cs = ti->cs;
641 tb->eip = ti->address;
642 if ( TI_GET_IF(ti) )
643 tb->flags |= TBF_INTERRUPT;
645 return 0;
646 }
648 asmlinkage int do_int3(struct cpu_user_regs *regs)
649 {
650 struct vcpu *v = current;
651 struct trap_bounce *tb = &v->arch.trap_bounce;
652 struct trap_info *ti;
654 DEBUGGER_trap_entry(TRAP_int3, regs);
656 if ( !guest_mode(regs) )
657 {
658 DEBUGGER_trap_fatal(TRAP_int3, regs);
659 show_execution_state(regs);
660 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
661 }
663 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
664 tb->flags = TBF_EXCEPTION;
665 tb->cs = ti->cs;
666 tb->eip = ti->address;
667 if ( TI_GET_IF(ti) )
668 tb->flags |= TBF_INTERRUPT;
670 return 0;
671 }
673 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
674 {
675 fatal_trap(TRAP_machine_check, regs);
676 return 0;
677 }
679 void propagate_page_fault(unsigned long addr, u16 error_code)
680 {
681 struct trap_info *ti;
682 struct vcpu *v = current;
683 struct trap_bounce *tb = &v->arch.trap_bounce;
685 v->arch.guest_context.ctrlreg[2] = addr;
686 v->vcpu_info->arch.cr2 = addr;
688 /* Re-set error_code.user flag appropriately for the guest. */
689 error_code &= ~PGERR_user_mode;
690 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
691 error_code |= PGERR_user_mode;
693 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
694 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
695 tb->error_code = error_code;
696 tb->cs = ti->cs;
697 tb->eip = ti->address;
698 if ( TI_GET_IF(ti) )
699 tb->flags |= TBF_INTERRUPT;
700 }
702 static int handle_gdt_ldt_mapping_fault(
703 unsigned long offset, struct cpu_user_regs *regs)
704 {
705 extern int map_ldt_shadow_page(unsigned int);
707 struct vcpu *v = current;
708 struct domain *d = v->domain;
709 int ret;
711 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
712 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
713 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
715 /* Should never fault in another vcpu's area. */
716 BUG_ON(vcpu_area != current->vcpu_id);
718 /* Byte offset within the gdt/ldt sub-area. */
719 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
721 if ( likely(is_ldt_area) )
722 {
723 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
724 LOCK_BIGLOCK(d);
725 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
726 UNLOCK_BIGLOCK(d);
728 if ( unlikely(ret == 0) )
729 {
730 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
731 if ( !guest_mode(regs) )
732 return 0;
733 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
734 propagate_page_fault(
735 v->arch.guest_context.ldt_base + offset, regs->error_code);
736 }
737 }
738 else
739 {
740 /* GDT fault: handle the fault as #GP(selector). */
741 regs->error_code = (u16)offset & ~7;
742 (void)do_general_protection(regs);
743 }
745 return EXCRET_fault_fixed;
746 }
748 #ifdef HYPERVISOR_VIRT_END
749 #define IN_HYPERVISOR_RANGE(va) \
750 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
751 #else
752 #define IN_HYPERVISOR_RANGE(va) \
753 (((va) >= HYPERVISOR_VIRT_START))
754 #endif
756 static int __spurious_page_fault(
757 unsigned long addr, struct cpu_user_regs *regs)
758 {
759 unsigned long mfn, cr3 = read_cr3();
760 #if CONFIG_PAGING_LEVELS >= 4
761 l4_pgentry_t l4e, *l4t;
762 #endif
763 #if CONFIG_PAGING_LEVELS >= 3
764 l3_pgentry_t l3e, *l3t;
765 #endif
766 l2_pgentry_t l2e, *l2t;
767 l1_pgentry_t l1e, *l1t;
768 unsigned int required_flags, disallowed_flags;
770 /* Reserved bit violations are never spurious faults. */
771 if ( regs->error_code & PGERR_reserved_bit )
772 return 0;
774 required_flags = _PAGE_PRESENT;
775 if ( regs->error_code & PGERR_write_access )
776 required_flags |= _PAGE_RW;
777 if ( regs->error_code & PGERR_user_mode )
778 required_flags |= _PAGE_USER;
780 disallowed_flags = 0;
781 if ( regs->error_code & PGERR_instr_fetch )
782 disallowed_flags |= _PAGE_NX;
784 mfn = cr3 >> PAGE_SHIFT;
786 #if CONFIG_PAGING_LEVELS >= 4
787 l4t = map_domain_page(mfn);
788 l4e = l4t[l4_table_offset(addr)];
789 mfn = l4e_get_pfn(l4e);
790 unmap_domain_page(l4t);
791 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
792 (l4e_get_flags(l4e) & disallowed_flags) )
793 return 0;
794 #endif
796 #if CONFIG_PAGING_LEVELS >= 3
797 l3t = map_domain_page(mfn);
798 #ifdef CONFIG_X86_PAE
799 l3t += (cr3 & 0xFE0UL) >> 3;
800 #endif
801 l3e = l3t[l3_table_offset(addr)];
802 mfn = l3e_get_pfn(l3e);
803 unmap_domain_page(l3t);
804 #ifdef CONFIG_X86_PAE
805 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
806 return 0;
807 #else
808 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
809 (l3e_get_flags(l3e) & disallowed_flags) )
810 return 0;
811 #endif
812 #endif
814 l2t = map_domain_page(mfn);
815 l2e = l2t[l2_table_offset(addr)];
816 mfn = l2e_get_pfn(l2e);
817 unmap_domain_page(l2t);
818 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
819 (l2e_get_flags(l2e) & disallowed_flags) )
820 return 0;
821 if ( l2e_get_flags(l2e) & _PAGE_PSE )
822 {
823 l1e = l1e_empty(); /* define before use in debug tracing */
824 goto spurious;
825 }
827 l1t = map_domain_page(mfn);
828 l1e = l1t[l1_table_offset(addr)];
829 mfn = l1e_get_pfn(l1e);
830 unmap_domain_page(l1t);
831 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
832 (l1e_get_flags(l1e) & disallowed_flags) )
833 return 0;
835 spurious:
836 DPRINTK("Spurious fault in domain %u:%u at addr %lx, e/c %04x\n",
837 current->domain->domain_id, current->vcpu_id,
838 addr, regs->error_code);
839 #if CONFIG_PAGING_LEVELS >= 4
840 DPRINTK(" l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
841 #endif
842 #if CONFIG_PAGING_LEVELS >= 3
843 DPRINTK(" l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
844 #endif
845 DPRINTK(" l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
846 DPRINTK(" l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
847 #ifndef NDEBUG
848 show_registers(regs);
849 #endif
850 return 1;
851 }
853 static int spurious_page_fault(
854 unsigned long addr, struct cpu_user_regs *regs)
855 {
856 struct domain *d = current->domain;
857 int is_spurious;
859 LOCK_BIGLOCK(d);
860 is_spurious = __spurious_page_fault(addr, regs);
861 UNLOCK_BIGLOCK(d);
863 return is_spurious;
864 }
866 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
867 {
868 struct vcpu *v = current;
869 struct domain *d = v->domain;
871 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
872 {
873 if ( shadow2_mode_external(d) && guest_mode(regs) )
874 return shadow2_fault(addr, regs);
875 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
876 return handle_gdt_ldt_mapping_fault(
877 addr - GDT_LDT_VIRT_START, regs);
878 /*
879 * Do not propagate spurious faults in the hypervisor area to the
880 * guest. It cannot fix them up.
881 */
882 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
883 }
885 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
886 guest_kernel_mode(v, regs) &&
887 /* Do not check if access-protection fault since the page may
888 legitimately be not present in shadow page tables */
889 ((regs->error_code & PGERR_write_access) == PGERR_write_access) &&
890 ptwr_do_page_fault(d, addr, regs) )
891 return EXCRET_fault_fixed;
893 if ( shadow2_mode_enabled(d) )
894 return shadow2_fault(addr, regs);
896 return 0;
897 }
899 /*
900 * #PF error code:
901 * Bit 0: Protection violation (=1) ; Page not present (=0)
902 * Bit 1: Write access
903 * Bit 2: User mode (=1) ; Supervisor mode (=0)
904 * Bit 3: Reserved bit violation
905 * Bit 4: Instruction fetch
906 */
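/*
 * Worked examples: a guest user-mode write to a present but read-only page
 * arrives with error_code 0x007 (protection violation + write + user); a
 * supervisor-mode read of a not-present page arrives with 0x000.
 */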
907 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
908 {
909 unsigned long addr, fixup;
910 int rc;
912 ASSERT(!in_irq());
914 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
916 DEBUGGER_trap_entry(TRAP_page_fault, regs);
918 perfc_incrc(page_faults);
920 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
921 return rc;
923 if ( unlikely(!guest_mode(regs)) )
924 {
925 if ( spurious_page_fault(addr, regs) )
926 return EXCRET_not_a_fault;
928 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
929 {
930 perfc_incrc(copy_user_faults);
931 regs->eip = fixup;
932 return 0;
933 }
935 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
937 show_execution_state(regs);
938 show_page_walk(addr);
939 panic("CPU%d FATAL PAGE FAULT\n"
940 "[error_code=%04x]\n"
941 "Faulting linear address: %p\n",
942 smp_processor_id(), regs->error_code, _p(addr));
943 }
945 propagate_page_fault(addr, regs->error_code);
946 return 0;
947 }
949 long do_fpu_taskswitch(int set)
950 {
951 struct vcpu *v = current;
953 if ( set )
954 {
955 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
956 stts();
957 }
958 else
959 {
960 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
961 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
962 clts();
963 }
965 return 0;
966 }
968 /* Has the guest requested sufficient permission for this I/O access? */
969 static inline int guest_io_okay(
970 unsigned int port, unsigned int bytes,
971 struct vcpu *v, struct cpu_user_regs *regs)
972 {
973 u16 x;
974 #if defined(__x86_64__)
975 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
976 int user_mode = !(v->arch.flags & TF_kernel_mode);
977 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
978 #elif defined(__i386__)
979 #define TOGGLE_MODE() ((void)0)
980 #endif
982 if ( !vm86_mode(regs) &&
983 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
984 return 1;
986 if ( v->arch.iobmp_limit > (port + bytes) )
987 {
988 TOGGLE_MODE();
989 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
990 TOGGLE_MODE();
991 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
992 return 1;
993 }
995 return 0;
996 }
998 /* Has the administrator granted sufficient permission for this I/O access? */
999 static inline int admin_io_okay(
1000 unsigned int port, unsigned int bytes,
1001 struct vcpu *v, struct cpu_user_regs *regs)
1002 {
1003 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1004 }
1006 /* Check admin limits. Silently fail the access if it is disallowed. */
1007 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
1008 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
1009 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
1010 #define outb_user(_v, _p, _d, _r) \
1011 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
1012 #define outw_user(_v, _p, _d, _r) \
1013 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
1014 #define outl_user(_v, _p, _d, _r) \
1015 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
1017 /* Instruction fetch with error handling. */
1018 #define insn_fetch(_type, _size, _ptr) \
1019 ({ unsigned long _rc, _x; \
1020 if ( (_rc = copy_from_user(&_x, (_type *)eip, sizeof(_type))) != 0 ) \
1021 { \
1022 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
1023 return EXCRET_fault_fixed; \
1024 } \
1025 eip += _size; (_type)_x; })
1027 static int emulate_privileged_op(struct cpu_user_regs *regs)
1029 struct vcpu *v = current;
1030 unsigned long *reg, eip = regs->eip, res;
1031 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
1032 unsigned int port, i, op_bytes = 4, data, rc;
1033 u32 l, h;
1035 /* Legacy prefixes. */
1036 for ( i = 0; i < 8; i++ )
1038 switch ( opcode = insn_fetch(u8, 1, eip) )
1040 case 0x66: /* operand-size override */
1041 op_bytes ^= 6; /* switch between 2/4 bytes */
1042 break;
1043 case 0x67: /* address-size override */
1044 case 0x2e: /* CS override */
1045 case 0x3e: /* DS override */
1046 case 0x26: /* ES override */
1047 case 0x64: /* FS override */
1048 case 0x65: /* GS override */
1049 case 0x36: /* SS override */
1050 case 0xf0: /* LOCK */
1051 case 0xf2: /* REPNE/REPNZ */
1052 break;
1053 case 0xf3: /* REP/REPE/REPZ */
1054 rep_prefix = 1;
1055 break;
1056 default:
1057 goto done_prefixes;
1060 done_prefixes:
1062 #ifdef __x86_64__
1063 /* REX prefix. */
1064 if ( (opcode & 0xf0) == 0x40 )
1066 modrm_reg = (opcode & 4) << 1; /* REX.R */
1067 modrm_rm = (opcode & 1) << 3; /* REX.B */
1069 /* REX.W and REX.X do not need to be decoded. */
1070 opcode = insn_fetch(u8, 1, eip);
1072 #endif
1074 /* Input/Output String instructions. */
1075 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1077 if ( rep_prefix && (regs->ecx == 0) )
1078 goto done;
1080 continue_io_string:
1081 switch ( opcode )
1083 case 0x6c: /* INSB */
1084 op_bytes = 1;
1085 case 0x6d: /* INSW/INSL */
1086 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1087 goto fail;
1088 switch ( op_bytes )
1090 case 1:
1091 data = (u8)inb_user((u16)regs->edx, v, regs);
1092 break;
1093 case 2:
1094 data = (u16)inw_user((u16)regs->edx, v, regs);
1095 break;
1096 case 4:
1097 data = (u32)inl_user((u16)regs->edx, v, regs);
1098 break;
1100 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1102 propagate_page_fault(regs->edi + op_bytes - rc,
1103 PGERR_write_access);
1104 return EXCRET_fault_fixed;
1106 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1107 break;
1109 case 0x6e: /* OUTSB */
1110 op_bytes = 1;
1111 case 0x6f: /* OUTSW/OUTSL */
1112 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1113 goto fail;
1114 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1115 if ( rc != 0 )
1117 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1118 return EXCRET_fault_fixed;
1120 switch ( op_bytes )
1122 case 1:
1123 outb_user((u8)data, (u16)regs->edx, v, regs);
1124 break;
1125 case 2:
1126 outw_user((u16)data, (u16)regs->edx, v, regs);
1127 break;
1128 case 4:
1129 outl_user((u32)data, (u16)regs->edx, v, regs);
1130 break;
1132 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1133 break;
1136 if ( rep_prefix && (--regs->ecx != 0) )
1138 if ( !hypercall_preempt_check() )
1139 goto continue_io_string;
1140 eip = regs->eip;
1143 goto done;
1146 /* I/O Port and Interrupt Flag instructions. */
1147 switch ( opcode )
1149 case 0xe4: /* IN imm8,%al */
1150 op_bytes = 1;
1151 case 0xe5: /* IN imm8,%eax */
1152 port = insn_fetch(u8, 1, eip);
1153 exec_in:
1154 if ( !guest_io_okay(port, op_bytes, v, regs) )
1155 goto fail;
1156 switch ( op_bytes )
1158 case 1:
1159 regs->eax &= ~0xffUL;
1160 regs->eax |= (u8)inb_user(port, v, regs);
1161 break;
1162 case 2:
1163 regs->eax &= ~0xffffUL;
1164 regs->eax |= (u16)inw_user(port, v, regs);
1165 break;
1166 case 4:
1167 regs->eax = (u32)inl_user(port, v, regs);
1168 break;
1170 goto done;
1172 case 0xec: /* IN %dx,%al */
1173 op_bytes = 1;
1174 case 0xed: /* IN %dx,%eax */
1175 port = (u16)regs->edx;
1176 goto exec_in;
1178 case 0xe6: /* OUT %al,imm8 */
1179 op_bytes = 1;
1180 case 0xe7: /* OUT %eax,imm8 */
1181 port = insn_fetch(u8, 1, eip);
1182 exec_out:
1183 if ( !guest_io_okay(port, op_bytes, v, regs) )
1184 goto fail;
1185 switch ( op_bytes )
1187 case 1:
1188 outb_user((u8)regs->eax, port, v, regs);
1189 break;
1190 case 2:
1191 outw_user((u16)regs->eax, port, v, regs);
1192 break;
1193 case 4:
1194 outl_user((u32)regs->eax, port, v, regs);
1195 break;
1197 goto done;
1199 case 0xee: /* OUT %al,%dx */
1200 op_bytes = 1;
1201 case 0xef: /* OUT %eax,%dx */
1202 port = (u16)regs->edx;
1203 goto exec_out;
1205 case 0xfa: /* CLI */
1206 case 0xfb: /* STI */
1207 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1208 goto fail;
1209 /*
1210 * This is just too dangerous to allow, in my opinion. Consider if the
1211 * caller then tries to reenable interrupts using POPF: we can't trap
1212 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1213 * do for us. :-)
1214 */
1215 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1216 goto done;
1218 case 0x0f: /* Two-byte opcode */
1219 break;
1221 default:
1222 goto fail;
1225 /* Remaining instructions only emulated from guest kernel. */
1226 if ( !guest_kernel_mode(v, regs) )
1227 goto fail;
1229 /* Privileged (ring 0) instructions. */
1230 opcode = insn_fetch(u8, 1, eip);
1231 switch ( opcode )
1233 case 0x06: /* CLTS */
1234 (void)do_fpu_taskswitch(0);
1235 break;
1237 case 0x09: /* WBINVD */
1238 /* Ignore the instruction if unprivileged. */
1239 if ( !cache_flush_permitted(v->domain) )
1240 /* Non-physdev domain attempted WBINVD; ignore for now since
1241 newer linux uses this in some start-of-day timing loops */
1243 else
1244 wbinvd();
1245 break;
1247 case 0x20: /* MOV CR?,<reg> */
1248 opcode = insn_fetch(u8, 1, eip);
1249 modrm_reg |= (opcode >> 3) & 7;
1250 modrm_rm |= (opcode >> 0) & 7;
1251 reg = decode_register(modrm_rm, regs, 0);
1252 switch ( modrm_reg )
1254 case 0: /* Read CR0 */
1255 *reg = (read_cr0() & ~X86_CR0_TS) |
1256 v->arch.guest_context.ctrlreg[0];
1257 break;
1259 case 2: /* Read CR2 */
1260 *reg = v->arch.guest_context.ctrlreg[2];
1261 break;
1263 case 3: /* Read CR3 */
1264 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1265 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1266 break;
1268 case 4: /* Read CR4 */
1269 /*
1270 * Guests can read CR4 to see what features Xen has enabled. We
1271 * therefore lie about PGE & PSE as they are unavailable to guests.
1272 */
1273 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1274 break;
1276 default:
1277 goto fail;
1279 break;
1281 case 0x21: /* MOV DR?,<reg> */
1282 opcode = insn_fetch(u8, 1, eip);
1283 modrm_reg |= (opcode >> 3) & 7;
1284 modrm_rm |= (opcode >> 0) & 7;
1285 reg = decode_register(modrm_rm, regs, 0);
1286 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1287 goto fail;
1288 *reg = res;
1289 break;
1291 case 0x22: /* MOV <reg>,CR? */
1292 opcode = insn_fetch(u8, 1, eip);
1293 modrm_reg |= (opcode >> 3) & 7;
1294 modrm_rm |= (opcode >> 0) & 7;
1295 reg = decode_register(modrm_rm, regs, 0);
1296 switch ( modrm_reg )
1298 case 0: /* Write CR0 */
1299 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1301 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1302 goto fail;
1304 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1305 break;
1307 case 2: /* Write CR2 */
1308 v->arch.guest_context.ctrlreg[2] = *reg;
1309 v->vcpu_info->arch.cr2 = *reg;
1310 break;
1312 case 3: /* Write CR3 */
1313 LOCK_BIGLOCK(v->domain);
1314 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1315 UNLOCK_BIGLOCK(v->domain);
1316 break;
1318 case 4:
1319 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1321 DPRINTK("Attempt to change CR4 flags.\n");
1322 goto fail;
1324 break;
1326 default:
1327 goto fail;
1329 break;
1331 case 0x23: /* MOV <reg>,DR? */
1332 opcode = insn_fetch(u8, 1, eip);
1333 modrm_reg |= (opcode >> 3) & 7;
1334 modrm_rm |= (opcode >> 0) & 7;
1335 reg = decode_register(modrm_rm, regs, 0);
1336 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1337 goto fail;
1338 break;
1340 case 0x30: /* WRMSR */
1341 switch ( regs->ecx )
1343 #ifdef CONFIG_X86_64
1344 case MSR_FS_BASE:
1345 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1346 goto fail;
1347 v->arch.guest_context.fs_base =
1348 ((u64)regs->edx << 32) | regs->eax;
1349 break;
1350 case MSR_GS_BASE:
1351 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1352 goto fail;
1353 v->arch.guest_context.gs_base_kernel =
1354 ((u64)regs->edx << 32) | regs->eax;
1355 break;
1356 case MSR_SHADOW_GS_BASE:
1357 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1358 goto fail;
1359 v->arch.guest_context.gs_base_user =
1360 ((u64)regs->edx << 32) | regs->eax;
1361 break;
1362 #endif
1363 default:
1364 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1365 break;
1367 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1368 (regs->eax != l) || (regs->edx != h) )
1369 DPRINTK("Domain attempted WRMSR %p from "
1370 "%08x:%08x to %08lx:%08lx.\n",
1371 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1372 break;
1374 break;
1376 case 0x32: /* RDMSR */
1377 switch ( regs->ecx )
1379 #ifdef CONFIG_X86_64
1380 case MSR_FS_BASE:
1381 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1382 regs->edx = v->arch.guest_context.fs_base >> 32;
1383 break;
1384 case MSR_GS_BASE:
1385 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1386 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1387 break;
1388 case MSR_SHADOW_GS_BASE:
1389 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1390 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1391 break;
1392 #endif
1393 case MSR_EFER:
1394 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1395 goto fail;
1396 break;
1397 default:
1398 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1400 regs->eax = l;
1401 regs->edx = h;
1402 break;
1404 /* Everyone can read the MSR space. */
1405 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1406 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1407 goto fail;
1408 break;
1410 break;
1412 default:
1413 goto fail;
1416 done:
1417 regs->eip = eip;
1418 return EXCRET_fault_fixed;
1420 fail:
1421 return 0;
1424 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1425 {
1426 struct vcpu *v = current;
1427 struct trap_bounce *tb = &v->arch.trap_bounce;
1428 struct trap_info *ti;
1429 unsigned long fixup;
1431 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1433 if ( regs->error_code & 1 )
1434 goto hardware_gp;
1436 if ( !guest_mode(regs) )
1437 goto gp_in_kernel;
1439 /*
1440 * Cunning trick to allow arbitrary "INT n" handling.
1441 *
1442 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1443 * instruction from trapping to the appropriate vector, when that might not
1444 * be expected by Xen or the guest OS. For example, that entry might be for
1445 * a fault handler (unlike traps, faults don't increment EIP), or might
1446 * expect an error code on the stack (which a software trap never
1447 * provides), or might be a hardware interrupt handler that doesn't like
1448 * being called spuriously.
1449 *
1450 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1451 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1452 * clear to indicate that it's a software fault, not hardware.
1453 *
1454 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1455 * okay because they can only be triggered by an explicit DPL-checked
1456 * instruction. The DPL specified by the guest OS for these vectors is NOT
1457 * CHECKED!!
1458 */
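/*
 * For example, a guest "int $0x30" against a vector whose IDT DPL is 0
 * arrives here with error_code = (0x30 << 3) | 2 = 0x182: bit 1 set (IDT
 * entry), bit 0 clear (software, not external). The check below then
 * bounces it to the guest's registered handler for vector 0x30 if
 * permit_softint() allows it.
 */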
1459 if ( (regs->error_code & 3) == 2 )
1460 {
1461 /* This fault must be due to <INT n> instruction. */
1462 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1463 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1464 {
1465 tb->flags = TBF_EXCEPTION;
1466 regs->eip += 2;
1467 goto finish_propagation;
1468 }
1469 }
1471 /* Emulate some simple privileged and I/O instructions. */
1472 if ( (regs->error_code == 0) &&
1473 emulate_privileged_op(regs) )
1474 return 0;
1476 #if defined(__i386__)
1477 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1478 (regs->error_code == 0) &&
1479 gpf_emulate_4gb(regs) )
1480 return 0;
1481 #endif
1483 /* Pass on GPF as is. */
1484 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1485 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1486 tb->error_code = regs->error_code;
1487 finish_propagation:
1488 tb->cs = ti->cs;
1489 tb->eip = ti->address;
1490 if ( TI_GET_IF(ti) )
1491 tb->flags |= TBF_INTERRUPT;
1492 return 0;
1494 gp_in_kernel:
1496 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1497 {
1498 DPRINTK("GPF (%04x): %p -> %p\n",
1499 regs->error_code, _p(regs->eip), _p(fixup));
1500 regs->eip = fixup;
1501 return 0;
1502 }
1504 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1506 hardware_gp:
1507 show_execution_state(regs);
1508 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1509 smp_processor_id(), regs->error_code);
1510 return 0;
1511 }
1513 static void nmi_softirq(void)
1514 {
1515 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1516 vcpu_kick(dom0->vcpu[0]);
1517 }
1519 static void nmi_dom0_report(unsigned int reason_idx)
1520 {
1521 struct domain *d;
1522 struct vcpu *v;
1524 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1525 return;
1527 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1529 if ( !test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1530 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1531 }
1533 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1534 {
1535 switch ( opt_nmi[0] )
1536 {
1537 case 'd': /* 'dom0' */
1538 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1539 case 'i': /* 'ignore' */
1540 break;
1541 default: /* 'fatal' */
1542 console_force_unlock();
1543 printk("\n\nNMI - MEMORY ERROR\n");
1544 fatal_trap(TRAP_nmi, regs);
1545 }
1547 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1548 mdelay(1);
1549 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1550 }
1552 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1553 {
1554 switch ( opt_nmi[0] )
1555 {
1556 case 'd': /* 'dom0' */
1557 nmi_dom0_report(_XEN_NMIREASON_io_error);
1558 case 'i': /* 'ignore' */
1559 break;
1560 default: /* 'fatal' */
1561 console_force_unlock();
1562 printk("\n\nNMI - I/O ERROR\n");
1563 fatal_trap(TRAP_nmi, regs);
1564 }
1566 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1567 mdelay(1);
1568 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1569 }
1571 static void unknown_nmi_error(unsigned char reason)
1572 {
1573 switch ( opt_nmi[0] )
1574 {
1575 case 'd': /* 'dom0' */
1576 nmi_dom0_report(_XEN_NMIREASON_unknown);
1577 case 'i': /* 'ignore' */
1578 break;
1579 default: /* 'fatal' */
1580 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1581 printk("Dazed and confused, but trying to continue\n");
1582 printk("Do you have a strange power saving mode enabled?\n");
1583 }
1584 }
1586 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1587 {
1588 return 0;
1589 }
1591 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1593 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1594 {
1595 unsigned int cpu = smp_processor_id();
1596 unsigned char reason;
1598 ++nmi_count(cpu);
1600 if ( nmi_callback(regs, cpu) )
1601 return;
1603 if ( nmi_watchdog )
1604 nmi_watchdog_tick(regs);
1606 /* Only the BSP gets external NMIs from the system. */
1607 if ( cpu == 0 )
1608 {
1609 reason = inb(0x61);
1610 if ( reason & 0x80 )
1611 mem_parity_error(regs);
1612 else if ( reason & 0x40 )
1613 io_check_error(regs);
1614 else if ( !nmi_watchdog )
1615 unknown_nmi_error((unsigned char)(reason&0xff));
1616 }
1617 }
1619 void set_nmi_callback(nmi_callback_t callback)
1620 {
1621 nmi_callback = callback;
1622 }
1624 void unset_nmi_callback(void)
1625 {
1626 nmi_callback = dummy_nmi_callback;
1627 }
1629 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1630 {
1631 struct trap_bounce *tb;
1632 struct trap_info *ti;
1634 setup_fpu(current);
1636 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1637 {
1638 tb = &current->arch.trap_bounce;
1639 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1641 tb->flags = TBF_EXCEPTION;
1642 tb->cs = ti->cs;
1643 tb->eip = ti->address;
1644 if ( TI_GET_IF(ti) )
1645 tb->flags |= TBF_INTERRUPT;
1647 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1648 }
1650 return EXCRET_fault_fixed;
1651 }
1653 asmlinkage int do_debug(struct cpu_user_regs *regs)
1654 {
1655 unsigned long condition;
1656 struct vcpu *v = current;
1657 struct trap_bounce *tb = &v->arch.trap_bounce;
1658 struct trap_info *ti;
1660 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1662 /* Mask out spurious debug traps due to lazy DR7 setting */
1663 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1664 (v->arch.guest_context.debugreg[7] == 0) )
1665 {
1666 __asm__("mov %0,%%db7" : : "r" (0UL));
1667 goto out;
1668 }
1670 DEBUGGER_trap_entry(TRAP_debug, regs);
1672 if ( !guest_mode(regs) )
1673 {
1674 /* Clear TF just for absolute sanity. */
1675 regs->eflags &= ~EF_TF;
1676 /*
1677 * We ignore watchpoints when they trigger within Xen. This may happen
1678 * when a buffer is passed to us which previously had a watchpoint set
1679 * on it. No need to bump EIP; the only faulting trap is an instruction
1680 * breakpoint, which can't happen to us.
1681 */
1682 goto out;
1683 }
1685 /* Save debug status register where guest OS can peek at it */
1686 v->arch.guest_context.debugreg[6] = condition;
1688 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1689 tb->flags = TBF_EXCEPTION;
1690 tb->cs = ti->cs;
1691 tb->eip = ti->address;
1692 if ( TI_GET_IF(ti) )
1693 tb->flags |= TBF_INTERRUPT;
1695 out:
1696 return EXCRET_not_a_fault;
1697 }
1699 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1700 {
1701 return EXCRET_not_a_fault;
1702 }
1704 void set_intr_gate(unsigned int n, void *addr)
1705 {
1706 #ifdef __i386__
1707 int i;
1708 /* Keep secondary tables in sync with IRQ updates. */
1709 for ( i = 1; i < NR_CPUS; i++ )
1710 if ( idt_tables[i] != NULL )
1711 _set_gate(&idt_tables[i][n], 14, 0, addr);
1712 #endif
1713 _set_gate(&idt_table[n], 14, 0, addr);
1714 }
1716 void set_system_gate(unsigned int n, void *addr)
1717 {
1718 _set_gate(idt_table+n,14,3,addr);
1719 }
1721 void set_task_gate(unsigned int n, unsigned int sel)
1722 {
1723 idt_table[n].a = sel << 16;
1724 idt_table[n].b = 0x8500;
1725 }
1727 void set_tss_desc(unsigned int n, void *addr)
1728 {
1729 _set_tssldt_desc(
1730 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1731 (unsigned long)addr,
1732 offsetof(struct tss_struct, __cacheline_filler) - 1,
1733 9);
1734 }
1736 void __init trap_init(void)
1737 {
1738 extern void percpu_traps_init(void);
1740 /*
1741 * Note that interrupt gates are always used, rather than trap gates. We
1742 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1743 * first activation must have the "bad" value(s) for these registers and
1744 * we may lose them if another activation is installed before they are
1745 * saved. The page-fault handler also needs interrupts disabled until %cr2
1746 * has been read and saved on the stack.
1747 */
1748 set_intr_gate(TRAP_divide_error,&divide_error);
1749 set_intr_gate(TRAP_debug,&debug);
1750 set_intr_gate(TRAP_nmi,&nmi);
1751 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1752 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1753 set_intr_gate(TRAP_bounds,&bounds);
1754 set_intr_gate(TRAP_invalid_op,&invalid_op);
1755 set_intr_gate(TRAP_no_device,&device_not_available);
1756 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1757 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1758 set_intr_gate(TRAP_no_segment,&segment_not_present);
1759 set_intr_gate(TRAP_stack_error,&stack_segment);
1760 set_intr_gate(TRAP_gp_fault,&general_protection);
1761 set_intr_gate(TRAP_page_fault,&page_fault);
1762 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1763 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1764 set_intr_gate(TRAP_alignment_check,&alignment_check);
1765 set_intr_gate(TRAP_machine_check,&machine_check);
1766 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1768 percpu_traps_init();
1770 cpu_init();
1772 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1773 }
1776 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1777 {
1778 struct trap_info cur;
1779 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1780 long rc = 0;
1782 /* If no table is presented then clear the entire virtual IDT. */
1783 if ( guest_handle_is_null(traps) )
1784 {
1785 memset(dst, 0, 256 * sizeof(*dst));
1786 init_int80_direct_trap(current);
1787 return 0;
1788 }
1790 for ( ; ; )
1791 {
1792 if ( hypercall_preempt_check() )
1793 {
1794 rc = hypercall_create_continuation(
1795 __HYPERVISOR_set_trap_table, "h", traps);
1796 break;
1797 }
1799 if ( copy_from_guest(&cur, traps, 1) )
1800 {
1801 rc = -EFAULT;
1802 break;
1803 }
1805 if ( cur.address == 0 )
1806 break;
1808 fixup_guest_code_selector(cur.cs);
1810 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1812 if ( cur.vector == 0x80 )
1813 init_int80_direct_trap(current);
1815 guest_handle_add_offset(traps, 1);
1816 }
1818 return rc;
1819 }
1822 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1823 {
1824 int i;
1826 switch ( reg )
1827 {
1828 case 0:
1829 if ( !access_ok(value, sizeof(long)) )
1830 return -EPERM;
1831 if ( p == current )
1832 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1833 break;
1834 case 1:
1835 if ( !access_ok(value, sizeof(long)) )
1836 return -EPERM;
1837 if ( p == current )
1838 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1839 break;
1840 case 2:
1841 if ( !access_ok(value, sizeof(long)) )
1842 return -EPERM;
1843 if ( p == current )
1844 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1845 break;
1846 case 3:
1847 if ( !access_ok(value, sizeof(long)) )
1848 return -EPERM;
1849 if ( p == current )
1850 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1851 break;
1852 case 6:
1853 /*
1854 * DR6: Bits 4-11,16-31 reserved (set to 1).
1855 * Bit 12 reserved (set to 0).
1856 */
1857 value &= 0xffffefff; /* reserved bits => 0 */
1858 value |= 0xffff0ff0; /* reserved bits => 1 */
1859 if ( p == current )
1860 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1861 break;
1862 case 7:
1863 /*
1864 * DR7: Bit 10 reserved (set to 1).
1865 * Bits 11-12,14-15 reserved (set to 0).
1866 * Privileged bits:
1867 * GD (bit 13): must be 0.
1868 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1869 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1870 */
1871 /* DR7 == 0 => debugging disabled for this domain. */
1872 if ( value != 0 )
1873 {
1874 value &= 0xffff27ff; /* reserved bits => 0 */
1875 value |= 0x00000400; /* reserved bits => 1 */
1876 if ( (value & (1<<13)) != 0 ) return -EPERM;
1877 for ( i = 0; i < 16; i += 2 )
1878 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1879 }
1880 if ( p == current )
1881 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1882 break;
1883 default:
1884 return -EINVAL;
1885 }
1887 p->arch.guest_context.debugreg[reg] = value;
1888 return 0;
1889 }
1891 long do_set_debugreg(int reg, unsigned long value)
1892 {
1893 return set_debugreg(current, reg, value);
1894 }
1896 unsigned long do_get_debugreg(int reg)
1897 {
1898 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1899 return current->arch.guest_context.debugreg[reg];
1900 }
1902 /*
1903 * Local variables:
1904 * mode: C
1905 * c-set-style: "BSD"
1906 * c-basic-offset: 4
1907 * tab-width: 4
1908 * indent-tabs-mode: nil
1909 * End:
1910 */