direct-io.hg

view xen/arch/x86/traps.c @ 12529:3127a43786d8

[XEN] Small ioemul cleanup.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Thu Nov 23 17:37:23 2006 +0000 (2006-11-23)
parents 2d8784764b52
children 2b43fb3afb3e
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <asm/shadow.h>
49 #include <asm/system.h>
50 #include <asm/io.h>
51 #include <asm/atomic.h>
52 #include <asm/desc.h>
53 #include <asm/debugreg.h>
54 #include <asm/smp.h>
55 #include <asm/flushtlb.h>
56 #include <asm/uaccess.h>
57 #include <asm/i387.h>
58 #include <asm/debugger.h>
59 #include <asm/msr.h>
60 #include <asm/x86_emulate.h>
62 /*
63 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
64 * fatal: Xen prints diagnostic message and then hangs.
65 * dom0: The NMI is virtualised to DOM0.
66 * ignore: The NMI error is cleared and ignored.
67 */
68 #ifdef NDEBUG
69 char opt_nmi[10] = "dom0";
70 #else
71 char opt_nmi[10] = "fatal";
72 #endif
73 string_param("nmi", opt_nmi);
75 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
76 idt_entry_t idt_table[IDT_ENTRIES];
78 #define DECLARE_TRAP_HANDLER(_name) \
79 asmlinkage void _name(void); \
80 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
82 asmlinkage void nmi(void);
83 DECLARE_TRAP_HANDLER(divide_error);
84 DECLARE_TRAP_HANDLER(debug);
85 DECLARE_TRAP_HANDLER(int3);
86 DECLARE_TRAP_HANDLER(overflow);
87 DECLARE_TRAP_HANDLER(bounds);
88 DECLARE_TRAP_HANDLER(invalid_op);
89 DECLARE_TRAP_HANDLER(device_not_available);
90 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
91 DECLARE_TRAP_HANDLER(invalid_TSS);
92 DECLARE_TRAP_HANDLER(segment_not_present);
93 DECLARE_TRAP_HANDLER(stack_segment);
94 DECLARE_TRAP_HANDLER(general_protection);
95 DECLARE_TRAP_HANDLER(page_fault);
96 DECLARE_TRAP_HANDLER(coprocessor_error);
97 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
98 DECLARE_TRAP_HANDLER(alignment_check);
99 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
100 DECLARE_TRAP_HANDLER(machine_check);
102 long do_set_debugreg(int reg, unsigned long value);
103 unsigned long do_get_debugreg(int reg);
105 static int debug_stack_lines = 20;
106 integer_param("debug_stack_lines", debug_stack_lines);
108 #ifdef CONFIG_X86_32
109 #define stack_words_per_line 8
110 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
111 #else
112 #define stack_words_per_line 4
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
114 #endif
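/*
 * NB: on x86-32 a same-ring exception does not push SS/ESP, so the
 * pre-exception stack continues at the address of the saved esp field
 * itself; on x86-64 RSP is always pushed and can be used directly.
 */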
116 int is_kernel_text(unsigned long addr)
117 {
118 extern char _stext, _etext;
119 if (addr >= (unsigned long) &_stext &&
120 addr <= (unsigned long) &_etext)
121 return 1;
122 return 0;
124 }
126 unsigned long kernel_text_end(void)
127 {
128 extern char _etext;
129 return (unsigned long) &_etext;
130 }
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
137 if ( is_hvm_vcpu(current) )
138 return;
140 if ( vm86_mode(regs) )
141 {
142 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
143 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
144 regs->ss, (uint16_t)(regs->esp & 0xffff));
145 }
146 else
147 {
148 stack = (unsigned long *)regs->esp;
149 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
150 }
152 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
153 {
154 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
155 break;
156 if ( get_user(addr, stack) )
157 {
158 if ( i != 0 )
159 printk("\n ");
160 printk("Fault while accessing guest memory.");
161 i = 1;
162 break;
163 }
164 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
165 printk("\n ");
166 printk(" %p", _p(addr));
167 stack++;
168 }
169 if ( i == 0 )
170 printk("Stack empty.");
171 printk("\n");
172 }
174 #ifdef NDEBUG
176 static void show_trace(struct cpu_user_regs *regs)
177 {
178 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
180 printk("Xen call trace:\n ");
182 printk("[<%p>]", _p(regs->eip));
183 print_symbol(" %s\n ", regs->eip);
185 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
186 {
187 addr = *stack++;
188 if ( is_kernel_text(addr) )
189 {
190 printk("[<%p>]", _p(addr));
191 print_symbol(" %s\n ", addr);
192 }
193 }
195 printk("\n");
196 }
198 #else
200 static void show_trace(struct cpu_user_regs *regs)
201 {
202 unsigned long *frame, next, addr, low, high;
204 printk("Xen call trace:\n ");
206 printk("[<%p>]", _p(regs->eip));
207 print_symbol(" %s\n ", regs->eip);
209 /* Bounds for range of valid frame pointer. */
210 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
211 high = (low & ~(STACK_SIZE - 1)) +
212 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
214 /* The initial frame pointer. */
215 next = regs->ebp;
217 for ( ; ; )
218 {
219 /* Valid frame pointer? */
220 if ( (next < low) || (next >= high) )
221 {
222 /*
223 * Exception stack frames have a different layout, denoted by an
224 * inverted frame pointer.
225 */
226 next = ~next;
227 if ( (next < low) || (next >= high) )
228 break;
229 frame = (unsigned long *)next;
230 next = frame[0];
231 addr = frame[(offsetof(struct cpu_user_regs, eip) -
232 offsetof(struct cpu_user_regs, ebp))
233 / BYTES_PER_LONG];
234 }
235 else
236 {
237 /* Ordinary stack frame. */
238 frame = (unsigned long *)next;
239 next = frame[0];
240 addr = frame[1];
241 }
243 printk("[<%p>]", _p(addr));
244 print_symbol(" %s\n ", addr);
246 low = (unsigned long)&frame[2];
247 }
249 printk("\n");
250 }
252 #endif
254 void show_stack(struct cpu_user_regs *regs)
255 {
256 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
257 int i;
259 if ( guest_mode(regs) )
260 return show_guest_stack(regs);
262 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
264 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
265 {
266 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
267 break;
268 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
269 printk("\n ");
270 addr = *stack++;
271 printk(" %p", _p(addr));
272 }
273 if ( i == 0 )
274 printk("Stack empty.");
275 printk("\n");
277 show_trace(regs);
278 }
280 void show_xen_trace()
281 {
282 struct cpu_user_regs regs;
283 #ifdef __x86_64
284 __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
285 __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
286 __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
287 #else
288 __asm__("movl %%esp,%0" : "=m" (regs.esp));
289 __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
290 __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
291 #endif
292 show_trace(&regs);
293 }
295 void show_stack_overflow(unsigned long esp)
296 {
297 #ifdef MEMORY_GUARD
298 unsigned long esp_top;
299 unsigned long *stack, addr;
301 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
303 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
304 if ( ((unsigned long)(esp - esp_top) > 512) &&
305 ((unsigned long)(esp_top - esp) > 512) )
306 return;
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow:\n ");
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 char *trapstr(int trapnr)
335 {
336 static char *strings[] = {
337 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
338 "invalid opcode", "device not available", "double fault",
339 "coprocessor segment", "invalid tss", "segment not found",
340 "stack error", "general protection fault", "page fault",
341 "spurious interrupt", "coprocessor error", "alignment check",
342 "machine check", "simd error"
343 };
345 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
346 return "???";
348 return strings[trapnr];
349 }
351 /*
352 * This is called for faults at very unexpected times (e.g., when interrupts
353 * are disabled). In such situations we can't do much that is safe. We try to
354 * print out some tracing and then we just spin.
355 */
356 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
357 {
358 watchdog_disable();
359 console_start_sync();
361 show_execution_state(regs);
363 if ( trapnr == TRAP_page_fault )
364 {
365 unsigned long cr2 = read_cr2();
366 printk("Faulting linear address: %p\n", _p(cr2));
367 show_page_walk(cr2);
368 }
370 panic("FATAL TRAP: vector = %d (%s)\n"
371 "[error_code=%04x] %s\n",
372 trapnr, trapstr(trapnr), regs->error_code,
373 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
374 }
376 static int do_guest_trap(
377 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
378 {
379 struct vcpu *v = current;
380 struct trap_bounce *tb;
381 const struct trap_info *ti;
383 tb = &v->arch.trap_bounce;
384 ti = &v->arch.guest_context.trap_ctxt[trapnr];
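/*
 * NB: trap_bounce is consumed on the return-to-guest path, where an
 * exception frame is built on the guest kernel stack and control is
 * transferred to the cs:eip registered in the guest's virtual IDT.
 */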
386 tb->flags = TBF_EXCEPTION;
387 tb->cs = ti->cs;
388 tb->eip = ti->address;
390 if ( use_error_code )
391 {
392 tb->flags |= TBF_EXCEPTION_ERRCODE;
393 tb->error_code = regs->error_code;
394 }
396 if ( TI_GET_IF(ti) )
397 tb->flags |= TBF_INTERRUPT;
399 if ( unlikely(null_trap_bounce(tb)) )
400 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
401 "domain %d on VCPU %d [ec=%04x]\n",
402 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
403 regs->error_code);
405 return 0;
406 }
408 static inline int do_trap(
409 int trapnr, struct cpu_user_regs *regs, int use_error_code)
410 {
411 unsigned long fixup;
413 DEBUGGER_trap_entry(trapnr, regs);
415 if ( guest_mode(regs) )
416 return do_guest_trap(trapnr, regs, use_error_code);
418 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
419 {
420 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
421 trapnr, _p(regs->eip), _p(fixup));
422 regs->eip = fixup;
423 return 0;
424 }
426 DEBUGGER_trap_fatal(trapnr, regs);
428 show_execution_state(regs);
429 panic("FATAL TRAP: vector = %d (%s)\n"
430 "[error_code=%04x]\n",
431 trapnr, trapstr(trapnr), regs->error_code);
432 return 0;
433 }
435 #define DO_ERROR_NOCODE(trapnr, name) \
436 asmlinkage int do_##name(struct cpu_user_regs *regs) \
437 { \
438 return do_trap(trapnr, regs, 0); \
439 }
441 #define DO_ERROR(trapnr, name) \
442 asmlinkage int do_##name(struct cpu_user_regs *regs) \
443 { \
444 return do_trap(trapnr, regs, 1); \
445 }
447 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
448 DO_ERROR_NOCODE(TRAP_overflow, overflow)
449 DO_ERROR_NOCODE(TRAP_bounds, bounds)
450 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
451 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
452 DO_ERROR( TRAP_no_segment, segment_not_present)
453 DO_ERROR( TRAP_stack_error, stack_segment)
454 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
455 DO_ERROR( TRAP_alignment_check, alignment_check)
456 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
458 int rdmsr_hypervisor_regs(
459 uint32_t idx, uint32_t *eax, uint32_t *edx)
460 {
461 idx -= 0x40000000;
462 if ( idx > 0 )
463 return 0;
465 *eax = *edx = 0;
466 return 1;
467 }
469 int wrmsr_hypervisor_regs(
470 uint32_t idx, uint32_t eax, uint32_t edx)
471 {
472 struct domain *d = current->domain;
474 idx -= 0x40000000;
475 if ( idx > 0 )
476 return 0;
478 switch ( idx )
479 {
480 case 0:
481 {
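/*
 * The 64-bit value written to MSR 0x40000000 encodes the target frame
 * and page index: bits 63:12 are the GMFN, bits 11:0 the index of the
 * hypercall page to initialise (only index 0 is defined here).
 */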
482 void *hypercall_page;
483 unsigned long mfn;
484 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
485 unsigned int idx = eax & 0xfff;
487 if ( idx > 0 )
488 {
489 gdprintk(XENLOG_WARNING,
490 "Dom%d: Out of range index %u to MSR %08x\n",
491 d->domain_id, idx, 0x40000000);
492 return 0;
493 }
495 mfn = gmfn_to_mfn(d, gmfn);
497 if ( !mfn_valid(mfn) ||
498 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
499 {
500 gdprintk(XENLOG_WARNING,
501 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
502 d->domain_id, gmfn, mfn, 0x40000000);
503 return 0;
504 }
506 hypercall_page = map_domain_page(mfn);
507 hypercall_page_initialise(d, hypercall_page);
508 unmap_domain_page(hypercall_page);
510 put_page_and_type(mfn_to_page(mfn));
511 break;
512 }
514 default:
515 BUG();
516 }
518 return 1;
519 }
521 int cpuid_hypervisor_leaves(
522 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
523 {
524 idx -= 0x40000000;
525 if ( idx > 2 )
526 return 0;
528 switch ( idx )
529 {
530 case 0:
531 *eax = 0x40000002; /* Largest leaf */
532 *ebx = 0x566e6558; /* Signature 1: "XenV" */
533 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
534 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
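/* Read together, ebx:ecx:edx spell the ASCII signature "XenVMMXenVMM". */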
535 break;
537 case 1:
538 *eax = (xen_major_version() << 16) | xen_minor_version();
539 *ebx = 0; /* Reserved */
540 *ecx = 0; /* Reserved */
541 *edx = 0; /* Reserved */
542 break;
544 case 2:
545 *eax = 1; /* Number of hypercall-transfer pages */
546 *ebx = 0x40000000; /* MSR base address */
547 *ecx = 0; /* Features 1 */
548 *edx = 0; /* Features 2 */
549 break;
551 default:
552 BUG();
553 }
555 return 1;
556 }
558 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
559 {
560 char sig[5], instr[2];
561 uint32_t a, b, c, d;
562 unsigned long eip, rc;
564 a = regs->eax;
565 b = regs->ebx;
566 c = regs->ecx;
567 d = regs->edx;
568 eip = regs->eip;
570 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
571 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
572 {
573 propagate_page_fault(eip + sizeof(sig) - rc, 0);
574 return EXCRET_fault_fixed;
575 }
576 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
577 return 0;
578 eip += sizeof(sig);
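/*
 * "\xf\xb" is the two-byte UD2 opcode (0f 0b); with "xen" the marker is
 * five bytes long. The next two bytes must be 0f a2 (CPUID), checked below.
 */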
580 /* We only emulate CPUID. */
581 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
582 {
583 propagate_page_fault(eip + sizeof(instr) - rc, 0);
584 return EXCRET_fault_fixed;
585 }
586 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
587 return 0;
588 eip += sizeof(instr);
590 __asm__ (
591 "cpuid"
592 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
593 : "0" (a), "1" (b), "2" (c), "3" (d) );
595 if ( regs->eax == 1 )
596 {
597 /* Modify Feature Information. */
598 clear_bit(X86_FEATURE_VME, &d);
599 clear_bit(X86_FEATURE_DE, &d);
600 clear_bit(X86_FEATURE_PSE, &d);
601 clear_bit(X86_FEATURE_PGE, &d);
602 if ( !supervisor_mode_kernel )
603 clear_bit(X86_FEATURE_SEP, &d);
604 if ( !IS_PRIV(current->domain) )
605 clear_bit(X86_FEATURE_MTRR, &d);
606 }
607 else
608 {
609 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
610 }
612 regs->eax = a;
613 regs->ebx = b;
614 regs->ecx = c;
615 regs->edx = d;
616 regs->eip = eip;
618 return EXCRET_fault_fixed;
619 }
621 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
622 {
623 int rc;
625 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
627 if ( unlikely(!guest_mode(regs)) )
628 {
629 char sig[5];
630 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
631 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
632 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
633 {
634 show_execution_state(regs);
635 regs->eip += sizeof(sig);
636 return EXCRET_fault_fixed;
637 }
638 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
639 show_execution_state(regs);
640 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
641 }
643 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
644 return rc;
646 return do_guest_trap(TRAP_invalid_op, regs, 0);
647 }
649 asmlinkage int do_int3(struct cpu_user_regs *regs)
650 {
651 DEBUGGER_trap_entry(TRAP_int3, regs);
653 if ( !guest_mode(regs) )
654 {
655 DEBUGGER_trap_fatal(TRAP_int3, regs);
656 show_execution_state(regs);
657 panic("FATAL TRAP: vector = 3 (Int3)\n");
658 }
660 return do_guest_trap(TRAP_int3, regs, 0);
661 }
663 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
664 {
665 fatal_trap(TRAP_machine_check, regs);
666 return 0;
667 }
669 void propagate_page_fault(unsigned long addr, u16 error_code)
670 {
671 struct trap_info *ti;
672 struct vcpu *v = current;
673 struct trap_bounce *tb = &v->arch.trap_bounce;
675 v->arch.guest_context.ctrlreg[2] = addr;
676 v->vcpu_info->arch.cr2 = addr;
678 /* Re-set error_code.user flag appropriately for the guest. */
679 error_code &= ~PFEC_user_mode;
680 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
681 error_code |= PFEC_user_mode;
683 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
684 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
685 tb->error_code = error_code;
686 tb->cs = ti->cs;
687 tb->eip = ti->address;
688 if ( TI_GET_IF(ti) )
689 tb->flags |= TBF_INTERRUPT;
690 if ( unlikely(null_trap_bounce(tb)) )
691 {
692 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
693 v->domain->domain_id, v->vcpu_id, error_code);
694 show_page_walk(addr);
695 }
696 }
698 static int handle_gdt_ldt_mapping_fault(
699 unsigned long offset, struct cpu_user_regs *regs)
700 {
701 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
702 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
703 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
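/*
 * Each vcpu owns a 2^GDT_LDT_VCPU_VA_SHIFT-byte slot in this region:
 * the lower half maps its GDT, the upper half its shadow LDT.
 */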
705 /* Should never fault in another vcpu's area. */
706 BUG_ON(vcpu_area != current->vcpu_id);
708 /* Byte offset within the gdt/ldt sub-area. */
709 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
711 if ( likely(is_ldt_area) )
712 {
713 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
714 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
715 {
716 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
717 if ( !guest_mode(regs) )
718 return 0;
719 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
720 propagate_page_fault(
721 current->arch.guest_context.ldt_base + offset,
722 regs->error_code);
723 }
724 }
725 else
726 {
727 /* GDT fault: handle the fault as #GP(selector). */
728 regs->error_code = (u16)offset & ~7;
729 (void)do_general_protection(regs);
730 }
732 return EXCRET_fault_fixed;
733 }
735 #ifdef HYPERVISOR_VIRT_END
736 #define IN_HYPERVISOR_RANGE(va) \
737 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
738 #else
739 #define IN_HYPERVISOR_RANGE(va) \
740 (((va) >= HYPERVISOR_VIRT_START))
741 #endif
743 static int __spurious_page_fault(
744 unsigned long addr, struct cpu_user_regs *regs)
745 {
746 unsigned long mfn, cr3 = read_cr3();
747 #if CONFIG_PAGING_LEVELS >= 4
748 l4_pgentry_t l4e, *l4t;
749 #endif
750 #if CONFIG_PAGING_LEVELS >= 3
751 l3_pgentry_t l3e, *l3t;
752 #endif
753 l2_pgentry_t l2e, *l2t;
754 l1_pgentry_t l1e, *l1t;
755 unsigned int required_flags, disallowed_flags;
757 /* Reserved bit violations are never spurious faults. */
758 if ( regs->error_code & PFEC_reserved_bit )
759 return 0;
761 required_flags = _PAGE_PRESENT;
762 if ( regs->error_code & PFEC_write_access )
763 required_flags |= _PAGE_RW;
764 if ( regs->error_code & PFEC_user_mode )
765 required_flags |= _PAGE_USER;
767 disallowed_flags = 0;
768 if ( regs->error_code & PFEC_insn_fetch )
769 disallowed_flags |= _PAGE_NX;
771 mfn = cr3 >> PAGE_SHIFT;
773 #if CONFIG_PAGING_LEVELS >= 4
774 l4t = map_domain_page(mfn);
775 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
776 mfn = l4e_get_pfn(l4e);
777 unmap_domain_page(l4t);
778 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
779 (l4e_get_flags(l4e) & disallowed_flags) )
780 return 0;
781 #endif
783 #if CONFIG_PAGING_LEVELS >= 3
784 l3t = map_domain_page(mfn);
785 #ifdef CONFIG_X86_PAE
786 l3t += (cr3 & 0xFE0UL) >> 3;
787 #endif
788 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
789 mfn = l3e_get_pfn(l3e);
790 unmap_domain_page(l3t);
791 #ifdef CONFIG_X86_PAE
792 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
793 return 0;
794 #else
795 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
796 (l3e_get_flags(l3e) & disallowed_flags) )
797 return 0;
798 #endif
799 #endif
801 l2t = map_domain_page(mfn);
802 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
803 mfn = l2e_get_pfn(l2e);
804 unmap_domain_page(l2t);
805 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
806 (l2e_get_flags(l2e) & disallowed_flags) )
807 return 0;
808 if ( l2e_get_flags(l2e) & _PAGE_PSE )
809 {
810 l1e = l1e_empty(); /* define before use in debug tracing */
811 goto spurious;
812 }
814 l1t = map_domain_page(mfn);
815 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
816 mfn = l1e_get_pfn(l1e);
817 unmap_domain_page(l1t);
818 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
819 (l1e_get_flags(l1e) & disallowed_flags) )
820 return 0;
822 spurious:
823 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
824 "at addr %lx, e/c %04x\n",
825 current->domain->domain_id, current->vcpu_id,
826 addr, regs->error_code);
827 #if CONFIG_PAGING_LEVELS >= 4
828 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
829 #endif
830 #if CONFIG_PAGING_LEVELS >= 3
831 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
832 #endif
833 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
834 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
835 #ifndef NDEBUG
836 show_registers(regs);
837 #endif
838 return 1;
839 }
841 static int spurious_page_fault(
842 unsigned long addr, struct cpu_user_regs *regs)
843 {
844 unsigned long flags;
845 int is_spurious;
847 /*
848 * Disabling interrupts prevents TLB flushing, and hence prevents
849 * page tables from becoming invalid under our feet during the walk.
850 */
851 local_irq_save(flags);
852 is_spurious = __spurious_page_fault(addr, regs);
853 local_irq_restore(flags);
855 return is_spurious;
856 }
858 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
859 {
860 struct vcpu *v = current;
861 struct domain *d = v->domain;
863 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
864 {
865 if ( shadow_mode_external(d) && guest_mode(regs) )
866 return shadow_fault(addr, regs);
867 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
868 return handle_gdt_ldt_mapping_fault(
869 addr - GDT_LDT_VIRT_START, regs);
870 return 0;
871 }
873 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
874 guest_kernel_mode(v, regs) &&
875 /* Do not check if access-protection fault since the page may
876 legitimately be not present in shadow page tables */
877 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
878 ptwr_do_page_fault(v, addr, regs) )
879 return EXCRET_fault_fixed;
881 if ( shadow_mode_enabled(d) )
882 return shadow_fault(addr, regs);
884 return 0;
885 }
887 /*
888 * #PF error code:
889 * Bit 0: Protection violation (=1) ; Page not present (=0)
890 * Bit 1: Write access
891 * Bit 2: User mode (=1) ; Supervisor mode (=0)
892 * Bit 3: Reserved bit violation
893 * Bit 4: Instruction fetch
894 */
895 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
896 {
897 unsigned long addr, fixup;
898 int rc;
900 ASSERT(!in_irq());
902 addr = read_cr2();
904 DEBUGGER_trap_entry(TRAP_page_fault, regs);
906 perfc_incrc(page_faults);
908 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
909 return rc;
911 if ( unlikely(!guest_mode(regs)) )
912 {
913 if ( spurious_page_fault(addr, regs) )
914 return EXCRET_not_a_fault;
916 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
917 {
918 perfc_incrc(copy_user_faults);
919 regs->eip = fixup;
920 return 0;
921 }
923 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
925 show_execution_state(regs);
926 show_page_walk(addr);
927 panic("FATAL PAGE FAULT\n"
928 "[error_code=%04x]\n"
929 "Faulting linear address: %p\n",
930 regs->error_code, _p(addr));
931 }
933 propagate_page_fault(addr, regs->error_code);
934 return 0;
935 }
937 long do_fpu_taskswitch(int set)
938 {
939 struct vcpu *v = current;
941 if ( set )
942 {
943 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
944 stts();
945 }
946 else
947 {
948 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
949 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
950 clts();
951 }
953 return 0;
954 }
956 /* Has the guest requested sufficient permission for this I/O access? */
957 static inline int guest_io_okay(
958 unsigned int port, unsigned int bytes,
959 struct vcpu *v, struct cpu_user_regs *regs)
960 {
961 #if defined(__x86_64__)
962 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
963 int user_mode = !(v->arch.flags & TF_kernel_mode);
964 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
965 #elif defined(__i386__)
966 #define TOGGLE_MODE() ((void)0)
967 #endif
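/*
 * NB: x86-64 PV guests run user and kernel context on separate page
 * tables, and the I/O bitmap pointer is a guest-kernel virtual address,
 * hence the temporary switch to kernel mode around the copy below.
 */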
969 if ( !vm86_mode(regs) &&
970 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
971 return 1;
973 if ( v->arch.iobmp_limit > (port + bytes) )
974 {
975 union { uint8_t bytes[2]; uint16_t mask; } x;
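/* Two bitmap bytes cover any 1-4 byte port range, even one straddling a byte boundary. */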
977 /*
978 * Grab permission bytes from guest space. Inaccessible bytes are
979 * read as 0xff (no access allowed).
980 */
981 TOGGLE_MODE();
982 switch ( __copy_from_guest_offset(&x.bytes[0], v->arch.iobmp,
983 port>>3, 2) )
984 {
985 default: x.bytes[0] = ~0;
986 case 1: x.bytes[1] = ~0;
987 case 0: break;
988 }
989 TOGGLE_MODE();
991 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
992 return 1;
993 }
995 return 0;
996 }
998 /* Has the administrator granted sufficient permission for this I/O access? */
999 static inline int admin_io_okay(
1000 unsigned int port, unsigned int bytes,
1001 struct vcpu *v, struct cpu_user_regs *regs)
1002 {
1003 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1004 }
1006 static inline int guest_inb_okay(
1007 unsigned int port, struct vcpu *v, struct cpu_user_regs *regs)
1008 {
1009 /*
1010 * Allow read access to port 0x61. Bit 4 oscillates with period 30us, and
1011 * so it is often used for timing loops in BIOS code. This hack can go
1012 * away when we have separate read/write permission rangesets.
1013 * Note that we could emulate bit 4 instead of directly reading port 0x61,
1014 * but there's not really a good reason to do so.
1015 */
1016 return (admin_io_okay(port, 1, v, regs) || (port == 0x61));
1017 }
1018 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1019 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1020 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1021 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1022 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1024 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1025 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1026 __attribute__((__regparm__(1)));
1027 unsigned long guest_to_host_gpr_switch(unsigned long)
1028 __attribute__((__regparm__(1)));
1030 /* Instruction fetch with error handling. */
1031 #define insn_fetch(_type, _size, cs, eip) \
1032 ({ unsigned long _rc, _x, _ptr = eip; \
1033 if ( vm86_mode(regs) ) \
1034 _ptr += cs << 4; \
1035 if ( (_rc = copy_from_user(&_x, (_type *)_ptr, sizeof(_type))) != 0 ) \
1036 { \
1037 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
1038 return EXCRET_fault_fixed; \
1039 } \
1040 eip += _size; (_type)_x; })
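/*
 * A faulting fetch propagates #PF to the guest at the exact faulting
 * byte and aborts the emulation with EXCRET_fault_fixed.
 */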
1042 static int emulate_privileged_op(struct cpu_user_regs *regs)
1044 struct vcpu *v = current;
1045 unsigned long *reg, eip = regs->eip, cs = regs->cs, res;
1046 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
1047 unsigned int port, i, op_bytes = 4, data, rc;
1048 char io_emul_stub[16];
1049 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1050 u32 l, h;
1052 /* Legacy prefixes. */
1053 for ( i = 0; i < 8; i++ )
1055 switch ( opcode = insn_fetch(u8, 1, cs, eip) )
1057 case 0x66: /* operand-size override */
1058 op_bytes ^= 6; /* switch between 2/4 bytes */
1059 break;
1060 case 0x67: /* address-size override */
1061 case 0x2e: /* CS override */
1062 case 0x3e: /* DS override */
1063 case 0x26: /* ES override */
1064 case 0x64: /* FS override */
1065 case 0x65: /* GS override */
1066 case 0x36: /* SS override */
1067 case 0xf0: /* LOCK */
1068 case 0xf2: /* REPNE/REPNZ */
1069 break;
1070 case 0xf3: /* REP/REPE/REPZ */
1071 rep_prefix = 1;
1072 break;
1073 default:
1074 goto done_prefixes;
1077 done_prefixes:
1079 #ifdef __x86_64__
1080 /* REX prefix. */
1081 if ( (opcode & 0xf0) == 0x40 )
1083 modrm_reg = (opcode & 4) << 1; /* REX.R */
1084 modrm_rm = (opcode & 1) << 3; /* REX.B */
1086 /* REX.W and REX.X do not need to be decoded. */
1087 opcode = insn_fetch(u8, 1, cs, eip);
1089 #endif
1091 if ( opcode == 0x0f )
1092 goto twobyte_opcode;
1094 /* Input/Output String instructions. */
1095 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1097 if ( rep_prefix && (regs->ecx == 0) )
1098 goto done;
1100 continue_io_string:
1101 switch ( opcode )
1103 case 0x6c: /* INSB */
1104 op_bytes = 1;
1105 case 0x6d: /* INSW/INSL */
1106 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1107 goto fail;
1108 port = (u16)regs->edx;
1109 switch ( op_bytes )
1111 case 1:
1112 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) : ~0);
1113 break;
1114 case 2:
1115 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1116 break;
1117 case 4:
1118 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1119 break;
1121 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1123 propagate_page_fault(regs->edi + op_bytes - rc,
1124 PFEC_write_access);
1125 return EXCRET_fault_fixed;
1127 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1128 break;
1130 case 0x6e: /* OUTSB */
1131 op_bytes = 1;
1132 case 0x6f: /* OUTSW/OUTSL */
1133 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1134 goto fail;
1135 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1136 if ( rc != 0 )
1138 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1139 return EXCRET_fault_fixed;
1141 port = (u16)regs->edx;
1142 switch ( op_bytes )
1144 case 1:
1145 if ( guest_outb_okay(port, v, regs) )
1146 outb((u8)data, port);
1147 break;
1148 case 2:
1149 if ( guest_outw_okay(port, v, regs) )
1150 outw((u16)data, port);
1151 break;
1152 case 4:
1153 if ( guest_outl_okay(port, v, regs) )
1154 outl((u32)data, port);
1155 break;
1157 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1158 break;
1161 if ( rep_prefix && (--regs->ecx != 0) )
1163 if ( !hypercall_preempt_check() )
1164 goto continue_io_string;
1165 eip = regs->eip;
1168 goto done;
1171 /*
1172 * Very likely to be an I/O instruction (IN/OUT).
1173 * Build an on-stack stub to execute the instruction with full guest
1174 * GPR context. This is needed for some systems which (ab)use IN/OUT
1175 * to communicate with BIOS code in system-management mode.
1176 */
1177 /* call host_to_guest_gpr_switch */
1178 io_emul_stub[0] = 0xe8;
1179 *(s32 *)&io_emul_stub[1] =
1180 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1181 /* data16 or nop */
1182 io_emul_stub[5] = (op_bytes != 2) ? 0x90 : 0x66;
1183 /* <io-access opcode> */
1184 io_emul_stub[6] = opcode;
1185 /* imm8 or nop */
1186 io_emul_stub[7] = 0x90;
1187 /* jmp guest_to_host_gpr_switch */
1188 io_emul_stub[8] = 0xe9;
1189 *(s32 *)&io_emul_stub[9] =
1190 (char *)guest_to_host_gpr_switch - &io_emul_stub[13];
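/*
 * Resulting stub: call host_to_guest_gpr_switch; [data16] <in/out opcode>
 * [imm8]; jmp guest_to_host_gpr_switch. It runs with the guest's GPRs
 * loaded so the port access sees the guest register state.
 */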
1192 /* Handy function-typed pointer to the stub. */
1193 io_emul = (void *)io_emul_stub;
1195 /* I/O Port and Interrupt Flag instructions. */
1196 switch ( opcode )
1198 case 0xe4: /* IN imm8,%al */
1199 op_bytes = 1;
1200 case 0xe5: /* IN imm8,%eax */
1201 port = insn_fetch(u8, 1, cs, eip);
1202 io_emul_stub[7] = port; /* imm8 */
1203 exec_in:
1204 if ( !guest_io_okay(port, op_bytes, v, regs) )
1205 goto fail;
1206 switch ( op_bytes )
1208 case 1:
1209 if ( guest_inb_okay(port, v, regs) )
1210 io_emul(regs);
1211 else
1212 regs->eax |= (u8)~0;
1213 break;
1214 case 2:
1215 if ( guest_inw_okay(port, v, regs) )
1216 io_emul(regs);
1217 else
1218 regs->eax |= (u16)~0;
1219 break;
1220 case 4:
1221 if ( guest_inl_okay(port, v, regs) )
1222 io_emul(regs);
1223 else
1224 regs->eax = (u32)~0;
1225 break;
1227 goto done;
1229 case 0xec: /* IN %dx,%al */
1230 op_bytes = 1;
1231 case 0xed: /* IN %dx,%eax */
1232 port = (u16)regs->edx;
1233 goto exec_in;
1235 case 0xe6: /* OUT %al,imm8 */
1236 op_bytes = 1;
1237 case 0xe7: /* OUT %eax,imm8 */
1238 port = insn_fetch(u8, 1, cs, eip);
1239 io_emul_stub[7] = port; /* imm8 */
1240 exec_out:
1241 if ( !guest_io_okay(port, op_bytes, v, regs) )
1242 goto fail;
1243 switch ( op_bytes )
1245 case 1:
1246 if ( guest_outb_okay(port, v, regs) )
1247 io_emul(regs);
1248 break;
1249 case 2:
1250 if ( guest_outw_okay(port, v, regs) )
1251 io_emul(regs);
1252 break;
1253 case 4:
1254 if ( guest_outl_okay(port, v, regs) )
1255 io_emul(regs);
1256 break;
1258 goto done;
1260 case 0xee: /* OUT %al,%dx */
1261 op_bytes = 1;
1262 case 0xef: /* OUT %eax,%dx */
1263 port = (u16)regs->edx;
1264 goto exec_out;
1266 case 0xfa: /* CLI */
1267 case 0xfb: /* STI */
1268 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1269 goto fail;
1270 /*
1271 * This is just too dangerous to allow, in my opinion. Consider if the
1272 * caller then tries to reenable interrupts using POPF: we can't trap
1273 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1274 * do for us. :-)
1275 */
1276 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1277 goto done;
1280 /* No decode of this single-byte opcode. */
1281 goto fail;
1283 twobyte_opcode:
1284 /* Two-byte opcodes only emulated from guest kernel. */
1285 if ( !guest_kernel_mode(v, regs) )
1286 goto fail;
1288 /* Privileged (ring 0) instructions. */
1289 opcode = insn_fetch(u8, 1, cs, eip);
1290 switch ( opcode )
1292 case 0x06: /* CLTS */
1293 (void)do_fpu_taskswitch(0);
1294 break;
1296 case 0x09: /* WBINVD */
1297 /* Ignore the instruction if unprivileged. */
1298 if ( !cache_flush_permitted(v->domain) )
1299 /* Non-physdev domain attempted WBINVD; ignore for now since
1300 newer linux uses this in some start-of-day timing loops */
1301 ;
1302 else
1303 wbinvd();
1304 break;
1306 case 0x20: /* MOV CR?,<reg> */
1307 opcode = insn_fetch(u8, 1, cs, eip);
1308 modrm_reg |= (opcode >> 3) & 7;
1309 modrm_rm |= (opcode >> 0) & 7;
1310 reg = decode_register(modrm_rm, regs, 0);
1311 switch ( modrm_reg )
1313 case 0: /* Read CR0 */
1314 *reg = (read_cr0() & ~X86_CR0_TS) |
1315 v->arch.guest_context.ctrlreg[0];
1316 break;
1318 case 2: /* Read CR2 */
1319 *reg = v->arch.guest_context.ctrlreg[2];
1320 break;
1322 case 3: /* Read CR3 */
1323 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1324 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1325 break;
1327 case 4: /* Read CR4 */
1328 /*
1329 * Guests can read CR4 to see what features Xen has enabled. We
1330 * therefore lie about PGE & PSE as they are unavailable to guests.
1331 */
1332 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1333 break;
1335 default:
1336 goto fail;
1338 break;
1340 case 0x21: /* MOV DR?,<reg> */
1341 opcode = insn_fetch(u8, 1, cs, eip);
1342 modrm_reg |= (opcode >> 3) & 7;
1343 modrm_rm |= (opcode >> 0) & 7;
1344 reg = decode_register(modrm_rm, regs, 0);
1345 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1346 goto fail;
1347 *reg = res;
1348 break;
1350 case 0x22: /* MOV <reg>,CR? */
1351 opcode = insn_fetch(u8, 1, cs, eip);
1352 modrm_reg |= (opcode >> 3) & 7;
1353 modrm_rm |= (opcode >> 0) & 7;
1354 reg = decode_register(modrm_rm, regs, 0);
1355 switch ( modrm_reg )
1357 case 0: /* Write CR0 */
1358 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1360 gdprintk(XENLOG_WARNING,
1361 "Attempt to change unmodifiable CR0 flags.\n");
1362 goto fail;
1364 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1365 break;
1367 case 2: /* Write CR2 */
1368 v->arch.guest_context.ctrlreg[2] = *reg;
1369 v->vcpu_info->arch.cr2 = *reg;
1370 break;
1372 case 3: /* Write CR3 */
1373 LOCK_BIGLOCK(v->domain);
1374 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1375 UNLOCK_BIGLOCK(v->domain);
1376 if ( rc == 0 ) /* not okay */
1377 goto fail;
1378 break;
1380 case 4:
1381 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1383 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1384 goto fail;
1386 break;
1388 default:
1389 goto fail;
1391 break;
1393 case 0x23: /* MOV <reg>,DR? */
1394 opcode = insn_fetch(u8, 1, cs, eip);
1395 modrm_reg |= (opcode >> 3) & 7;
1396 modrm_rm |= (opcode >> 0) & 7;
1397 reg = decode_register(modrm_rm, regs, 0);
1398 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1399 goto fail;
1400 break;
1402 case 0x30: /* WRMSR */
1403 switch ( regs->ecx )
1405 #ifdef CONFIG_X86_64
1406 case MSR_FS_BASE:
1407 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1408 goto fail;
1409 v->arch.guest_context.fs_base =
1410 ((u64)regs->edx << 32) | regs->eax;
1411 break;
1412 case MSR_GS_BASE:
1413 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1414 goto fail;
1415 v->arch.guest_context.gs_base_kernel =
1416 ((u64)regs->edx << 32) | regs->eax;
1417 break;
1418 case MSR_SHADOW_GS_BASE:
1419 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1420 goto fail;
1421 v->arch.guest_context.gs_base_user =
1422 ((u64)regs->edx << 32) | regs->eax;
1423 break;
1424 #endif
1425 default:
1426 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1427 break;
1429 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1430 (regs->eax != l) || (regs->edx != h) )
1431 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1432 "%08x:%08x to %08lx:%08lx.\n",
1433 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1434 break;
1436 break;
1438 case 0x32: /* RDMSR */
1439 switch ( regs->ecx )
1441 #ifdef CONFIG_X86_64
1442 case MSR_FS_BASE:
1443 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1444 regs->edx = v->arch.guest_context.fs_base >> 32;
1445 break;
1446 case MSR_GS_BASE:
1447 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1448 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1449 break;
1450 case MSR_SHADOW_GS_BASE:
1451 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1452 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1453 break;
1454 #endif
1455 case MSR_EFER:
1456 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1457 goto fail;
1458 break;
1459 default:
1460 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1462 regs->eax = l;
1463 regs->edx = h;
1464 break;
1466 /* Everyone can read the MSR space. */
1467 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1468 _p(regs->ecx));*/
1469 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1470 goto fail;
1471 break;
1473 break;
1475 default:
1476 goto fail;
1479 done:
1480 regs->eip = eip;
1481 return EXCRET_fault_fixed;
1483 fail:
1484 return 0;
1487 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1489 struct vcpu *v = current;
1490 unsigned long fixup;
1492 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1494 if ( regs->error_code & 1 )
1495 goto hardware_gp;
1497 if ( !guest_mode(regs) )
1498 goto gp_in_kernel;
1500 /*
1501 * Cunning trick to allow arbitrary "INT n" handling.
1503 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1504 * instruction from trapping to the appropriate vector, when that might not
1505 * be expected by Xen or the guest OS. For example, that entry might be for
1506 * a fault handler (unlike traps, faults don't increment EIP), or might
1507 * expect an error code on the stack (which a software trap never
1508 * provides), or might be a hardware interrupt handler that doesn't like
1509 * being called spuriously.
1511 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1512 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1513 * clear to indicate that it's a software fault, not hardware.
1515 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1516 * okay because they can only be triggered by an explicit DPL-checked
1517 * instruction. The DPL specified by the guest OS for these vectors is NOT
1518 * CHECKED!!
1519 */
1520 if ( (regs->error_code & 3) == 2 )
1522 /* This fault must be due to <INT n> instruction. */
1523 const struct trap_info *ti;
1524 unsigned char vector = regs->error_code >> 3;
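/*
 * For a DPL-blocked software INT the error code is (vector << 3) | 2
 * (IDT bit set, EXT clear), so shifting right by 3 recovers the vector.
 */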
1525 ti = &v->arch.guest_context.trap_ctxt[vector];
1526 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1528 regs->eip += 2;
1529 return do_guest_trap(vector, regs, 0);
1533 /* Emulate some simple privileged and I/O instructions. */
1534 if ( (regs->error_code == 0) &&
1535 emulate_privileged_op(regs) )
1536 return 0;
1538 #if defined(__i386__)
1539 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1540 (regs->error_code == 0) &&
1541 gpf_emulate_4gb(regs) )
1542 return 0;
1543 #endif
1545 /* Pass on GPF as is. */
1546 return do_guest_trap(TRAP_gp_fault, regs, 1);
1548 gp_in_kernel:
1550 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1552 dprintk(XENLOG_WARNING, "GPF (%04x): %p -> %p\n",
1553 regs->error_code, _p(regs->eip), _p(fixup));
1554 regs->eip = fixup;
1555 return 0;
1558 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1560 hardware_gp:
1561 show_execution_state(regs);
1562 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1563 return 0;
1566 static void nmi_softirq(void)
1568 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1569 vcpu_kick(dom0->vcpu[0]);
1572 static void nmi_dom0_report(unsigned int reason_idx)
1574 struct domain *d;
1575 struct vcpu *v;
1577 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1578 return;
1580 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1582 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1583 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1586 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1588 switch ( opt_nmi[0] )
1590 case 'd': /* 'dom0' */
1591 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1592 case 'i': /* 'ignore' */
1593 break;
1594 default: /* 'fatal' */
1595 console_force_unlock();
1596 printk("\n\nNMI - MEMORY ERROR\n");
1597 fatal_trap(TRAP_nmi, regs);
1600 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1601 mdelay(1);
1602 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1605 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1607 switch ( opt_nmi[0] )
1609 case 'd': /* 'dom0' */
1610 nmi_dom0_report(_XEN_NMIREASON_io_error);
1611 case 'i': /* 'ignore' */
1612 break;
1613 default: /* 'fatal' */
1614 console_force_unlock();
1615 printk("\n\nNMI - I/O ERROR\n");
1616 fatal_trap(TRAP_nmi, regs);
1619 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1620 mdelay(1);
1621 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1624 static void unknown_nmi_error(unsigned char reason)
1626 switch ( opt_nmi[0] )
1628 case 'd': /* 'dom0' */
1629 nmi_dom0_report(_XEN_NMIREASON_unknown);
1630 case 'i': /* 'ignore' */
1631 break;
1632 default: /* 'fatal' */
1633 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1634 printk("Dazed and confused, but trying to continue\n");
1635 printk("Do you have a strange power saving mode enabled?\n");
1639 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1641 return 0;
1644 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1646 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1648 unsigned int cpu = smp_processor_id();
1649 unsigned char reason;
1651 ++nmi_count(cpu);
1653 if ( nmi_callback(regs, cpu) )
1654 return;
1656 if ( nmi_watchdog )
1657 nmi_watchdog_tick(regs);
1659 /* Only the BSP gets external NMIs from the system. */
1660 if ( cpu == 0 )
1662 reason = inb(0x61);
1663 if ( reason & 0x80 )
1664 mem_parity_error(regs);
1665 else if ( reason & 0x40 )
1666 io_check_error(regs);
1667 else if ( !nmi_watchdog )
1668 unknown_nmi_error((unsigned char)(reason&0xff));
1672 void set_nmi_callback(nmi_callback_t callback)
1674 nmi_callback = callback;
1677 void unset_nmi_callback(void)
1679 nmi_callback = dummy_nmi_callback;
1682 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1683 {
1684 setup_fpu(current);
1686 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1687 {
1688 do_guest_trap(TRAP_no_device, regs, 0);
1689 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1690 }
1692 return EXCRET_fault_fixed;
1693 }
1695 asmlinkage int do_debug(struct cpu_user_regs *regs)
1697 unsigned long condition;
1698 struct vcpu *v = current;
1700 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1702 /* Mask out spurious debug traps due to lazy DR7 setting */
1703 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1704 (v->arch.guest_context.debugreg[7] == 0) )
1706 __asm__("mov %0,%%db7" : : "r" (0UL));
1707 goto out;
1710 DEBUGGER_trap_entry(TRAP_debug, regs);
1712 if ( !guest_mode(regs) )
1714 /* Clear TF just for absolute sanity. */
1715 regs->eflags &= ~EF_TF;
1716 /*
1717 * We ignore watchpoints when they trigger within Xen. This may happen
1718 * when a buffer is passed to us which previously had a watchpoint set
1719 * on it. No need to bump EIP; the only faulting trap is an instruction
1720 * breakpoint, which can't happen to us.
1721 */
1722 goto out;
1725 /* Save debug status register where guest OS can peek at it */
1726 v->arch.guest_context.debugreg[6] = condition;
1728 return do_guest_trap(TRAP_debug, regs, 0);
1730 out:
1731 return EXCRET_not_a_fault;
1734 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1736 return EXCRET_not_a_fault;
1739 void set_intr_gate(unsigned int n, void *addr)
1741 #ifdef __i386__
1742 int i;
1743 /* Keep secondary tables in sync with IRQ updates. */
1744 for ( i = 1; i < NR_CPUS; i++ )
1745 if ( idt_tables[i] != NULL )
1746 _set_gate(&idt_tables[i][n], 14, 0, addr);
1747 #endif
1748 _set_gate(&idt_table[n], 14, 0, addr);
1751 void set_system_gate(unsigned int n, void *addr)
1753 _set_gate(idt_table+n,14,3,addr);
1756 void set_task_gate(unsigned int n, unsigned int sel)
1758 idt_table[n].a = sel << 16;
1759 idt_table[n].b = 0x8500;
1762 void set_tss_desc(unsigned int n, void *addr)
1764 _set_tssldt_desc(
1765 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1766 (unsigned long)addr,
1767 offsetof(struct tss_struct, __cacheline_filler) - 1,
1768 9);
1771 void __init trap_init(void)
1773 extern void percpu_traps_init(void);
1775 /*
1776 * Note that interrupt gates are always used, rather than trap gates. We
1777 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1778 * first activation must have the "bad" value(s) for these registers and
1779 * we may lose them if another activation is installed before they are
1780 * saved. The page-fault handler also needs interrupts disabled until %cr2
1781 * has been read and saved on the stack.
1782 */
1783 set_intr_gate(TRAP_divide_error,&divide_error);
1784 set_intr_gate(TRAP_debug,&debug);
1785 set_intr_gate(TRAP_nmi,&nmi);
1786 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1787 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1788 set_intr_gate(TRAP_bounds,&bounds);
1789 set_intr_gate(TRAP_invalid_op,&invalid_op);
1790 set_intr_gate(TRAP_no_device,&device_not_available);
1791 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1792 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1793 set_intr_gate(TRAP_no_segment,&segment_not_present);
1794 set_intr_gate(TRAP_stack_error,&stack_segment);
1795 set_intr_gate(TRAP_gp_fault,&general_protection);
1796 set_intr_gate(TRAP_page_fault,&page_fault);
1797 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1798 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1799 set_intr_gate(TRAP_alignment_check,&alignment_check);
1800 set_intr_gate(TRAP_machine_check,&machine_check);
1801 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1803 percpu_traps_init();
1805 cpu_init();
1807 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1811 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1813 struct trap_info cur;
1814 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1815 long rc = 0;
1817 /* If no table is presented then clear the entire virtual IDT. */
1818 if ( guest_handle_is_null(traps) )
1820 memset(dst, 0, 256 * sizeof(*dst));
1821 init_int80_direct_trap(current);
1822 return 0;
1825 for ( ; ; )
1827 if ( hypercall_preempt_check() )
1829 rc = hypercall_create_continuation(
1830 __HYPERVISOR_set_trap_table, "h", traps);
1831 break;
1834 if ( copy_from_guest(&cur, traps, 1) )
1836 rc = -EFAULT;
1837 break;
1840 if ( cur.address == 0 )
1841 break;
1843 fixup_guest_code_selector(cur.cs);
1845 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1847 if ( cur.vector == 0x80 )
1848 init_int80_direct_trap(current);
1850 guest_handle_add_offset(traps, 1);
1853 return rc;
1857 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1859 int i;
1861 switch ( reg )
1863 case 0:
1864 if ( !access_ok(value, sizeof(long)) )
1865 return -EPERM;
1866 if ( p == current )
1867 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1868 break;
1869 case 1:
1870 if ( !access_ok(value, sizeof(long)) )
1871 return -EPERM;
1872 if ( p == current )
1873 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1874 break;
1875 case 2:
1876 if ( !access_ok(value, sizeof(long)) )
1877 return -EPERM;
1878 if ( p == current )
1879 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1880 break;
1881 case 3:
1882 if ( !access_ok(value, sizeof(long)) )
1883 return -EPERM;
1884 if ( p == current )
1885 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1886 break;
1887 case 6:
1888 /*
1889 * DR6: Bits 4-11,16-31 reserved (set to 1).
1890 * Bit 12 reserved (set to 0).
1891 */
1892 value &= 0xffffefff; /* reserved bits => 0 */
1893 value |= 0xffff0ff0; /* reserved bits => 1 */
1894 if ( p == current )
1895 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1896 break;
1897 case 7:
1898 /*
1899 * DR7: Bit 10 reserved (set to 1).
1900 * Bits 11-12,14-15 reserved (set to 0).
1901 * Privileged bits:
1902 * GD (bit 13): must be 0.
1903 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1904 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1905 */
1906 /* DR7 == 0 => debugging disabled for this domain. */
1907 if ( value != 0 )
1909 value &= 0xffff27ff; /* reserved bits => 0 */
1910 value |= 0x00000400; /* reserved bits => 1 */
1911 if ( (value & (1<<13)) != 0 ) return -EPERM;
1912 for ( i = 0; i < 16; i += 2 )
1913 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1915 if ( p == current )
1916 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1917 break;
1918 default:
1919 return -EINVAL;
1922 p->arch.guest_context.debugreg[reg] = value;
1923 return 0;
1926 long do_set_debugreg(int reg, unsigned long value)
1928 return set_debugreg(current, reg, value);
1931 unsigned long do_get_debugreg(int reg)
1933 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1934 return current->arch.guest_context.debugreg[reg];
1937 /*
1938 * Local variables:
1939 * mode: C
1940 * c-set-style: "BSD"
1941 * c-basic-offset: 4
1942 * tab-width: 4
1943 * indent-tabs-mode: nil
1944 * End:
1945 */