direct-io.hg

view xen/arch/x86/traps.c @ 11465:8c0cf4d3c412

[XEN] Allow direct reads of port 0x61 by any guest.
Often used by BIOS code which may be executed, e.g., when starting
an X server.
From: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Sep 13 14:46:54 2006 +0100 (2006-09-13)
parents 37a42856e8d9
children 38765166ad7a
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <asm/shadow.h>
49 #include <asm/system.h>
50 #include <asm/io.h>
51 #include <asm/atomic.h>
52 #include <asm/desc.h>
53 #include <asm/debugreg.h>
54 #include <asm/smp.h>
55 #include <asm/flushtlb.h>
56 #include <asm/uaccess.h>
57 #include <asm/i387.h>
58 #include <asm/debugger.h>
59 #include <asm/msr.h>
60 #include <asm/x86_emulate.h>
62 /*
63 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
64 * fatal: Xen prints diagnostic message and then hangs.
65 * dom0: The NMI is virtualised to DOM0.
66 * ignore: The NMI error is cleared and ignored.
67 */
68 #ifdef NDEBUG
69 char opt_nmi[10] = "dom0";
70 #else
71 char opt_nmi[10] = "fatal";
72 #endif
73 string_param("nmi", opt_nmi);
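/*
 * string_param() registers "nmi" as a Xen boot-command-line parameter, so
 * the compile-time default above can be overridden at boot, e.g. by
 * appending "nmi=ignore" or "nmi=dom0" to the hypervisor command line.
 */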
75 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
76 idt_entry_t idt_table[IDT_ENTRIES];
78 #define DECLARE_TRAP_HANDLER(_name) \
79 asmlinkage void _name(void); \
80 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
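/*
 * Each use declares both the assembly entry stub (provided by the per-arch
 * entry.S) and the C handler it calls. For example,
 * DECLARE_TRAP_HANDLER(page_fault) expands to:
 *
 *   asmlinkage void page_fault(void);
 *   asmlinkage int do_page_fault(struct cpu_user_regs *regs);
 */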
82 asmlinkage void nmi(void);
83 DECLARE_TRAP_HANDLER(divide_error);
84 DECLARE_TRAP_HANDLER(debug);
85 DECLARE_TRAP_HANDLER(int3);
86 DECLARE_TRAP_HANDLER(overflow);
87 DECLARE_TRAP_HANDLER(bounds);
88 DECLARE_TRAP_HANDLER(invalid_op);
89 DECLARE_TRAP_HANDLER(device_not_available);
90 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
91 DECLARE_TRAP_HANDLER(invalid_TSS);
92 DECLARE_TRAP_HANDLER(segment_not_present);
93 DECLARE_TRAP_HANDLER(stack_segment);
94 DECLARE_TRAP_HANDLER(general_protection);
95 DECLARE_TRAP_HANDLER(page_fault);
96 DECLARE_TRAP_HANDLER(coprocessor_error);
97 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
98 DECLARE_TRAP_HANDLER(alignment_check);
99 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
100 DECLARE_TRAP_HANDLER(machine_check);
102 long do_set_debugreg(int reg, unsigned long value);
103 unsigned long do_get_debugreg(int reg);
105 static int debug_stack_lines = 20;
106 integer_param("debug_stack_lines", debug_stack_lines);
108 #ifdef CONFIG_X86_32
109 #define stack_words_per_line 8
110 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
111 #else
112 #define stack_words_per_line 4
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
114 #endif
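/*
 * ESP_BEFORE_EXCEPTION yields the stack pointer at the time of the fault:
 * on x86/64 the hardware always pushes %rsp, so regs->rsp holds it
 * directly; on x86/32 a same-privilege fault pushes no %esp, so the
 * pre-exception stack simply begins at the address of the esp slot itself.
 */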
116 int is_kernel_text(unsigned long addr)
117 {
118 extern char _stext, _etext;
119 if (addr >= (unsigned long) &_stext &&
120 addr <= (unsigned long) &_etext)
121 return 1;
122 return 0;
124 }
126 unsigned long kernel_text_end(void)
127 {
128 extern char _etext;
129 return (unsigned long) &_etext;
130 }
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
137 if ( hvm_guest(current) )
138 return;
140 if ( vm86_mode(regs) )
141 {
142 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
143 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
144 regs->ss, (uint16_t)(regs->esp & 0xffff));
145 }
146 else
147 {
148 stack = (unsigned long *)regs->esp;
149 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
150 }
152 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
153 {
154 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
155 break;
156 if ( get_user(addr, stack) )
157 {
158 if ( i != 0 )
159 printk("\n ");
160 printk("Fault while accessing guest memory.");
161 i = 1;
162 break;
163 }
164 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
165 printk("\n ");
166 printk(" %p", _p(addr));
167 stack++;
168 }
169 if ( i == 0 )
170 printk("Stack empty.");
171 printk("\n");
172 }
174 #ifdef NDEBUG
176 static void show_trace(struct cpu_user_regs *regs)
177 {
178 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
180 printk("Xen call trace:\n ");
182 printk("[<%p>]", _p(regs->eip));
183 print_symbol(" %s\n ", regs->eip);
185 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
186 {
187 addr = *stack++;
188 if ( is_kernel_text(addr) )
189 {
190 printk("[<%p>]", _p(addr));
191 print_symbol(" %s\n ", addr);
192 }
193 }
195 printk("\n");
196 }
198 #else
200 static void show_trace(struct cpu_user_regs *regs)
201 {
202 unsigned long *frame, next, addr, low, high;
204 printk("Xen call trace:\n ");
206 printk("[<%p>]", _p(regs->eip));
207 print_symbol(" %s\n ", regs->eip);
209 /* Bounds for range of valid frame pointer. */
210 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
211 high = (low & ~(STACK_SIZE - 1)) +
212 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
214 /* The initial frame pointer. */
215 next = regs->ebp;
217 for ( ; ; )
218 {
219 /* Valid frame pointer? */
220 if ( (next < low) || (next >= high) )
221 {
222 /*
223 * Exception stack frames have a different layout, denoted by an
224 * inverted frame pointer.
225 */
226 next = ~next;
227 if ( (next < low) || (next >= high) )
228 break;
229 frame = (unsigned long *)next;
230 next = frame[0];
231 addr = frame[(offsetof(struct cpu_user_regs, eip) -
232 offsetof(struct cpu_user_regs, ebp))
233 / BYTES_PER_LONG];
234 }
235 else
236 {
237 /* Ordinary stack frame. */
238 frame = (unsigned long *)next;
239 next = frame[0];
240 addr = frame[1];
241 }
243 printk("[<%p>]", _p(addr));
244 print_symbol(" %s\n ", addr);
246 low = (unsigned long)&frame[2];
247 }
249 printk("\n");
250 }
252 #endif
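/*
 * Two variants of show_trace(): release (NDEBUG) builds cannot rely on
 * frame pointers, so the stack is scanned for values that land in Xen's
 * text section; debug builds walk the %ebp/%rbp frame-pointer chain,
 * treating a bitwise-inverted frame pointer as the marker for an exception
 * frame, whose saved EIP sits at a different offset from the frame base.
 */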
254 void show_stack(struct cpu_user_regs *regs)
255 {
256 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
257 int i;
259 if ( guest_mode(regs) )
260 return show_guest_stack(regs);
262 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
264 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
265 {
266 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
267 break;
268 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
269 printk("\n ");
270 addr = *stack++;
271 printk(" %p", _p(addr));
272 }
273 if ( i == 0 )
274 printk("Stack empty.");
275 printk("\n");
277 show_trace(regs);
278 }
280 void show_xen_trace()
281 {
282 struct cpu_user_regs regs;
283 #ifdef __x86_64
284 __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
285 __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
286 __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
287 #else
288 __asm__("movl %%esp,%0" : "=m" (regs.esp));
289 __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
290 __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
291 #endif
292 show_trace(&regs);
293 }
295 void show_stack_overflow(unsigned long esp)
296 {
297 #ifdef MEMORY_GUARD
298 unsigned long esp_top;
299 unsigned long *stack, addr;
301 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
303 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
304 if ( ((unsigned long)(esp - esp_top) > 512) &&
305 ((unsigned long)(esp_top - esp) > 512) )
306 return;
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow:\n ");
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 /*
335 * This is called for faults at very unexpected times (e.g., when interrupts
336 * are disabled). In such situations we can't do much that is safe. We try to
337 * print out some tracing and then we just spin.
338 */
339 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
340 {
341 int cpu = smp_processor_id();
342 static char *trapstr[] = {
343 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
344 "invalid opcode", "device not available", "double fault",
345 "coprocessor segment", "invalid tss", "segment not found",
346 "stack error", "general protection fault", "page fault",
347 "spurious interrupt", "coprocessor error", "alignment check",
348 "machine check", "simd error"
349 };
351 watchdog_disable();
352 console_start_sync();
354 show_execution_state(regs);
356 if ( trapnr == TRAP_page_fault )
357 {
358 unsigned long cr2 = read_cr2();
359 printk("Faulting linear address: %p\n", _p(cr2));
360 show_page_walk(cr2);
361 }
363 printk("************************************\n");
364 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
365 cpu, trapnr, trapstr[trapnr], regs->error_code,
366 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
367 printk("System shutting down -- need manual reset.\n");
368 printk("************************************\n");
370 (void)debugger_trap_fatal(trapnr, regs);
372 /* Lock up the console to prevent spurious output from other CPUs. */
373 console_force_lock();
375 /* Wait for manual reset. */
376 machine_halt();
377 }
379 static inline int do_trap(int trapnr, char *str,
380 struct cpu_user_regs *regs,
381 int use_error_code)
382 {
383 struct vcpu *v = current;
384 struct trap_bounce *tb = &v->arch.trap_bounce;
385 struct trap_info *ti;
386 unsigned long fixup;
388 DEBUGGER_trap_entry(trapnr, regs);
390 if ( !guest_mode(regs) )
391 goto xen_fault;
393 ti = &current->arch.guest_context.trap_ctxt[trapnr];
394 tb->flags = TBF_EXCEPTION;
395 tb->cs = ti->cs;
396 tb->eip = ti->address;
397 if ( use_error_code )
398 {
399 tb->flags |= TBF_EXCEPTION_ERRCODE;
400 tb->error_code = regs->error_code;
401 }
402 if ( TI_GET_IF(ti) )
403 tb->flags |= TBF_INTERRUPT;
404 return 0;
406 xen_fault:
408 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
409 {
410 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
411 regs->eip = fixup;
412 return 0;
413 }
415 DEBUGGER_trap_fatal(trapnr, regs);
417 show_execution_state(regs);
418 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
419 "[error_code=%04x]\n",
420 smp_processor_id(), trapnr, str, regs->error_code);
421 return 0;
422 }
424 #define DO_ERROR_NOCODE(trapnr, str, name) \
425 asmlinkage int do_##name(struct cpu_user_regs *regs) \
426 { \
427 return do_trap(trapnr, str, regs, 0); \
428 }
430 #define DO_ERROR(trapnr, str, name) \
431 asmlinkage int do_##name(struct cpu_user_regs *regs) \
432 { \
433 return do_trap(trapnr, str, regs, 1); \
434 }
436 DO_ERROR_NOCODE( 0, "divide error", divide_error)
437 DO_ERROR_NOCODE( 4, "overflow", overflow)
438 DO_ERROR_NOCODE( 5, "bounds", bounds)
439 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
440 DO_ERROR(10, "invalid TSS", invalid_TSS)
441 DO_ERROR(11, "segment not present", segment_not_present)
442 DO_ERROR(12, "stack segment", stack_segment)
443 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
444 DO_ERROR(17, "alignment check", alignment_check)
445 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
447 int rdmsr_hypervisor_regs(
448 uint32_t idx, uint32_t *eax, uint32_t *edx)
449 {
450 idx -= 0x40000000;
451 if ( idx > 0 )
452 return 0;
454 *eax = *edx = 0;
455 return 1;
456 }
458 int wrmsr_hypervisor_regs(
459 uint32_t idx, uint32_t eax, uint32_t edx)
460 {
461 struct domain *d = current->domain;
463 idx -= 0x40000000;
464 if ( idx > 0 )
465 return 0;
467 switch ( idx )
468 {
469 case 0:
470 {
471 void *hypercall_page;
472 unsigned long mfn;
473 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
474 unsigned int idx = eax & 0xfff;
476 if ( idx > 0 )
477 {
478 DPRINTK("Dom%d: Out of range index %u to MSR %08x\n",
479 d->domain_id, idx, 0x40000000);
480 return 0;
481 }
483 mfn = gmfn_to_mfn(d, gmfn);
485 if ( !mfn_valid(mfn) ||
486 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
487 {
488 DPRINTK("Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
489 d->domain_id, gmfn, mfn, 0x40000000);
490 return 0;
491 }
493 hypercall_page = map_domain_page(mfn);
494 hypercall_page_initialise(d, hypercall_page);
495 unmap_domain_page(hypercall_page);
497 put_page_and_type(mfn_to_page(mfn));
498 break;
499 }
501 default:
502 BUG();
503 }
505 return 1;
506 }
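/*
 * MSR 0x40000000 is the hypercall-page MSR advertised by the CPUID leaves
 * below: the guest writes the guest-physical address of a page it owns
 * (edx:eax, with the low 12 bits selecting which hypercall page; only
 * index 0 exists here), and Xen maps that frame and fills it with the
 * per-hypercall entry stubs via hypercall_page_initialise().
 */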
508 int cpuid_hypervisor_leaves(
509 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
510 {
511 idx -= 0x40000000;
512 if ( idx > 2 )
513 return 0;
515 switch ( idx )
516 {
517 case 0:
518 *eax = 0x40000002; /* Largest leaf */
519 *ebx = 0x566e6558; /* Signature 1: "XenV" */
520 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
521 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
522 break;
524 case 1:
525 *eax = (xen_major_version() << 16) | xen_minor_version();
526 *ebx = 0; /* Reserved */
527 *ecx = 0; /* Reserved */
528 *edx = 0; /* Reserved */
529 break;
531 case 2:
532 *eax = 1; /* Number of hypercall-transfer pages */
533 *ebx = 0x40000000; /* MSR base address */
534 *ecx = 0; /* Features 1 */
535 *edx = 0; /* Features 2 */
536 break;
538 default:
539 BUG();
540 }
542 return 1;
543 }
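/*
 * A guest detects Xen by issuing CPUID with EAX = 0x40000000: EBX/ECX/EDX
 * then spell out the "XenVMMXenVMM" signature (the three little-endian
 * word constants above), EAX reports the largest supported leaf, leaf
 * 0x40000001 returns the Xen version, and leaf 0x40000002 returns the
 * number of hypercall pages plus the MSR base (0x40000000) used to
 * install them.
 */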
545 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
546 {
547 char sig[5], instr[2];
548 uint32_t a, b, c, d;
549 unsigned long eip, rc;
551 a = regs->eax;
552 b = regs->ebx;
553 c = regs->ecx;
554 d = regs->edx;
555 eip = regs->eip;
557 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
558 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
559 {
560 propagate_page_fault(eip + sizeof(sig) - rc, 0);
561 return EXCRET_fault_fixed;
562 }
563 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
564 return 0;
565 eip += sizeof(sig);
567 /* We only emulate CPUID. */
568 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
569 {
570 propagate_page_fault(eip + sizeof(instr) - rc, 0);
571 return EXCRET_fault_fixed;
572 }
573 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
574 return 0;
575 eip += sizeof(instr);
577 __asm__ (
578 "cpuid"
579 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
580 : "0" (a), "1" (b), "2" (c), "3" (d) );
582 if ( regs->eax == 1 )
583 {
584 /* Modify Feature Information. */
585 clear_bit(X86_FEATURE_VME, &d);
586 clear_bit(X86_FEATURE_DE, &d);
587 clear_bit(X86_FEATURE_PSE, &d);
588 clear_bit(X86_FEATURE_PGE, &d);
589 if ( !supervisor_mode_kernel )
590 clear_bit(X86_FEATURE_SEP, &d);
591 if ( !IS_PRIV(current->domain) )
592 clear_bit(X86_FEATURE_MTRR, &d);
593 }
594 else
595 {
596 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
597 }
599 regs->eax = a;
600 regs->ebx = b;
601 regs->ecx = c;
602 regs->edx = d;
603 regs->eip = eip;
605 return EXCRET_fault_fixed;
606 }
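/*
 * This implements "forced emulation" of CPUID for PV guests: CPUID itself
 * is unprivileged and cannot be trapped, so a guest wanting the
 * Xen-filtered view prefixes it with UD2 plus the ASCII tag "xen", e.g.
 *
 *   ud2 ; .ascii "xen" ; cpuid
 *
 * The resulting #UD lands here, the signature is recognised, the real
 * CPUID is executed and filtered (VME/DE/PSE/PGE and friends masked), and
 * EIP is advanced past the whole sequence.
 */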
608 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
609 {
610 struct vcpu *v = current;
611 struct trap_bounce *tb = &v->arch.trap_bounce;
612 struct trap_info *ti;
613 int rc;
615 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
617 if ( unlikely(!guest_mode(regs)) )
618 {
619 char sig[5];
620 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
621 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
622 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
623 {
624 show_execution_state(regs);
625 regs->eip += sizeof(sig);
626 return EXCRET_fault_fixed;
627 }
628 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
629 show_execution_state(regs);
630 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
631 smp_processor_id(), TRAP_invalid_op);
632 }
634 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
635 return rc;
637 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
638 tb->flags = TBF_EXCEPTION;
639 tb->cs = ti->cs;
640 tb->eip = ti->address;
641 if ( TI_GET_IF(ti) )
642 tb->flags |= TBF_INTERRUPT;
644 return 0;
645 }
647 asmlinkage int do_int3(struct cpu_user_regs *regs)
648 {
649 struct vcpu *v = current;
650 struct trap_bounce *tb = &v->arch.trap_bounce;
651 struct trap_info *ti;
653 DEBUGGER_trap_entry(TRAP_int3, regs);
655 if ( !guest_mode(regs) )
656 {
657 DEBUGGER_trap_fatal(TRAP_int3, regs);
658 show_execution_state(regs);
659 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
660 }
662 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
663 tb->flags = TBF_EXCEPTION;
664 tb->cs = ti->cs;
665 tb->eip = ti->address;
666 if ( TI_GET_IF(ti) )
667 tb->flags |= TBF_INTERRUPT;
669 return 0;
670 }
672 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
673 {
674 fatal_trap(TRAP_machine_check, regs);
675 return 0;
676 }
678 void propagate_page_fault(unsigned long addr, u16 error_code)
679 {
680 struct trap_info *ti;
681 struct vcpu *v = current;
682 struct trap_bounce *tb = &v->arch.trap_bounce;
684 v->arch.guest_context.ctrlreg[2] = addr;
685 v->vcpu_info->arch.cr2 = addr;
687 /* Re-set error_code.user flag appropriately for the guest. */
688 error_code &= ~PFEC_user_mode;
689 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
690 error_code |= PFEC_user_mode;
692 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
693 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
694 tb->error_code = error_code;
695 tb->cs = ti->cs;
696 tb->eip = ti->address;
697 if ( TI_GET_IF(ti) )
698 tb->flags |= TBF_INTERRUPT;
699 }
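/*
 * The trap_bounce structure filled in here (and by the other do_* handlers)
 * is consumed on the exit path back to the guest, where an exception frame
 * is built on the guest kernel stack so the fault appears to the guest
 * exactly as a native #PF would, including the virtualised %cr2 value
 * stored above.
 */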
701 static int handle_gdt_ldt_mapping_fault(
702 unsigned long offset, struct cpu_user_regs *regs)
703 {
704 extern int map_ldt_shadow_page(unsigned int);
706 struct vcpu *v = current;
707 struct domain *d = v->domain;
708 int ret;
710 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
711 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
712 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
714 /* Should never fault in another vcpu's area. */
715 BUG_ON(vcpu_area != current->vcpu_id);
717 /* Byte offset within the gdt/ldt sub-area. */
718 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
720 if ( likely(is_ldt_area) )
721 {
722 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
723 LOCK_BIGLOCK(d);
724 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
725 UNLOCK_BIGLOCK(d);
727 if ( unlikely(ret == 0) )
728 {
729 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
730 if ( !guest_mode(regs) )
731 return 0;
732 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
733 propagate_page_fault(
734 v->arch.guest_context.ldt_base + offset, regs->error_code);
735 }
736 }
737 else
738 {
739 /* GDT fault: handle the fault as #GP(selector). */
740 regs->error_code = (u16)offset & ~7;
741 (void)do_general_protection(regs);
742 }
744 return EXCRET_fault_fixed;
745 }
747 #ifdef HYPERVISOR_VIRT_END
748 #define IN_HYPERVISOR_RANGE(va) \
749 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
750 #else
751 #define IN_HYPERVISOR_RANGE(va) \
752 (((va) >= HYPERVISOR_VIRT_START))
753 #endif
755 static int __spurious_page_fault(
756 unsigned long addr, struct cpu_user_regs *regs)
757 {
758 unsigned long mfn, cr3 = read_cr3();
759 #if CONFIG_PAGING_LEVELS >= 4
760 l4_pgentry_t l4e, *l4t;
761 #endif
762 #if CONFIG_PAGING_LEVELS >= 3
763 l3_pgentry_t l3e, *l3t;
764 #endif
765 l2_pgentry_t l2e, *l2t;
766 l1_pgentry_t l1e, *l1t;
767 unsigned int required_flags, disallowed_flags;
769 /* Reserved bit violations are never spurious faults. */
770 if ( regs->error_code & PFEC_reserved_bit )
771 return 0;
773 required_flags = _PAGE_PRESENT;
774 if ( regs->error_code & PFEC_write_access )
775 required_flags |= _PAGE_RW;
776 if ( regs->error_code & PFEC_user_mode )
777 required_flags |= _PAGE_USER;
779 disallowed_flags = 0;
780 if ( regs->error_code & PFEC_insn_fetch )
781 disallowed_flags |= _PAGE_NX;
783 mfn = cr3 >> PAGE_SHIFT;
785 #if CONFIG_PAGING_LEVELS >= 4
786 l4t = map_domain_page(mfn);
787 l4e = l4t[l4_table_offset(addr)];
788 mfn = l4e_get_pfn(l4e);
789 unmap_domain_page(l4t);
790 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
791 (l4e_get_flags(l4e) & disallowed_flags) )
792 return 0;
793 #endif
795 #if CONFIG_PAGING_LEVELS >= 3
796 l3t = map_domain_page(mfn);
797 #ifdef CONFIG_X86_PAE
798 l3t += (cr3 & 0xFE0UL) >> 3;
799 #endif
800 l3e = l3t[l3_table_offset(addr)];
801 mfn = l3e_get_pfn(l3e);
802 unmap_domain_page(l3t);
803 #ifdef CONFIG_X86_PAE
804 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
805 return 0;
806 #else
807 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
808 (l3e_get_flags(l3e) & disallowed_flags) )
809 return 0;
810 #endif
811 #endif
813 l2t = map_domain_page(mfn);
814 l2e = l2t[l2_table_offset(addr)];
815 mfn = l2e_get_pfn(l2e);
816 unmap_domain_page(l2t);
817 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
818 (l2e_get_flags(l2e) & disallowed_flags) )
819 return 0;
820 if ( l2e_get_flags(l2e) & _PAGE_PSE )
821 {
822 l1e = l1e_empty(); /* define before use in debug tracing */
823 goto spurious;
824 }
826 l1t = map_domain_page(mfn);
827 l1e = l1t[l1_table_offset(addr)];
828 mfn = l1e_get_pfn(l1e);
829 unmap_domain_page(l1t);
830 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
831 (l1e_get_flags(l1e) & disallowed_flags) )
832 return 0;
834 spurious:
835 DPRINTK("Spurious fault in domain %u:%u at addr %lx, e/c %04x\n",
836 current->domain->domain_id, current->vcpu_id,
837 addr, regs->error_code);
838 #if CONFIG_PAGING_LEVELS >= 4
839 DPRINTK(" l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
840 #endif
841 #if CONFIG_PAGING_LEVELS >= 3
842 DPRINTK(" l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
843 #endif
844 DPRINTK(" l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
845 DPRINTK(" l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
846 #ifndef NDEBUG
847 show_registers(regs);
848 #endif
849 return 1;
850 }
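/*
 * A fault is "spurious" when a fresh walk of the page tables already grants
 * the access that faulted, i.e. the mapping was brought up to date (for
 * example by another vCPU or by Xen's own lazy pagetable handling) between
 * the hardware raising #PF and this handler running; such faults are
 * absorbed rather than forwarded to the guest.
 */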
852 static int spurious_page_fault(
853 unsigned long addr, struct cpu_user_regs *regs)
854 {
855 struct domain *d = current->domain;
856 int is_spurious;
858 LOCK_BIGLOCK(d);
859 is_spurious = __spurious_page_fault(addr, regs);
860 UNLOCK_BIGLOCK(d);
862 return is_spurious;
863 }
865 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
866 {
867 struct vcpu *v = current;
868 struct domain *d = v->domain;
870 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
871 {
872 if ( shadow_mode_external(d) && guest_mode(regs) )
873 return shadow_fault(addr, regs);
874 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
875 return handle_gdt_ldt_mapping_fault(
876 addr - GDT_LDT_VIRT_START, regs);
877 /*
878 * Do not propagate spurious faults in the hypervisor area to the
879 * guest. It cannot fix them up.
880 */
881 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
882 }
884 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
885 guest_kernel_mode(v, regs) &&
886 /* Do not check if access-protection fault since the page may
887 legitimately be not present in shadow page tables */
888 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
889 ptwr_do_page_fault(d, addr, regs) )
890 return EXCRET_fault_fixed;
892 if ( shadow_mode_enabled(d) )
893 return shadow_fault(addr, regs);
895 return 0;
896 }
898 /*
899 * #PF error code:
900 * Bit 0: Protection violation (=1) ; Page not present (=0)
901 * Bit 1: Write access
902 * Bit 2: User mode (=1) ; Supervisor mode (=0)
903 * Bit 3: Reserved bit violation
904 * Bit 4: Instruction fetch
905 */
906 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
907 {
908 unsigned long addr, fixup;
909 int rc;
911 ASSERT(!in_irq());
913 addr = read_cr2();
915 DEBUGGER_trap_entry(TRAP_page_fault, regs);
917 perfc_incrc(page_faults);
919 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
920 return rc;
922 if ( unlikely(!guest_mode(regs)) )
923 {
924 if ( spurious_page_fault(addr, regs) )
925 return EXCRET_not_a_fault;
927 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
928 {
929 perfc_incrc(copy_user_faults);
930 regs->eip = fixup;
931 return 0;
932 }
934 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
936 show_execution_state(regs);
937 show_page_walk(addr);
938 panic("CPU%d FATAL PAGE FAULT\n"
939 "[error_code=%04x]\n"
940 "Faulting linear address: %p\n",
941 smp_processor_id(), regs->error_code, _p(addr));
942 }
944 propagate_page_fault(addr, regs->error_code);
945 return 0;
946 }
948 long do_fpu_taskswitch(int set)
949 {
950 struct vcpu *v = current;
952 if ( set )
953 {
954 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
955 stts();
956 }
957 else
958 {
959 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
960 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
961 clts();
962 }
964 return 0;
965 }
967 /* Has the guest requested sufficient permission for this I/O access? */
968 static inline int guest_io_okay(
969 unsigned int port, unsigned int bytes,
970 struct vcpu *v, struct cpu_user_regs *regs)
971 {
972 u16 x;
973 #if defined(__x86_64__)
974 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
975 int user_mode = !(v->arch.flags & TF_kernel_mode);
976 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
977 #elif defined(__i386__)
978 #define TOGGLE_MODE() ((void)0)
979 #endif
981 if ( !vm86_mode(regs) &&
982 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
983 return 1;
985 if ( v->arch.iobmp_limit > (port + bytes) )
986 {
987 TOGGLE_MODE();
988 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
989 TOGGLE_MODE();
990 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
991 return 1;
992 }
994 return 0;
995 }
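/*
 * This mirrors the hardware TSS I/O permission bitmap: IOPL is compared
 * against the virtual privilege level first, and otherwise v->arch.iobmp is
 * consulted with one bit per port; the access is allowed only if every bit
 * covering ports [port, port+bytes) is clear. On x86/64 the bitmap lives in
 * guest-kernel address space, hence the temporary TOGGLE_MODE() switch when
 * the guest is currently in user mode.
 */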
997 /* Has the administrator granted sufficient permission for this I/O access? */
998 static inline int admin_io_okay(
999     unsigned int port, unsigned int bytes,
1000     struct vcpu *v, struct cpu_user_regs *regs)
1001 {
1002     return ioports_access_permitted(v->domain, port, port + bytes - 1);
1003 }
1005 /* Check admin limits. Silently fail the access if it is disallowed. */
1006 static inline unsigned char inb_user(
1007     unsigned int port, struct vcpu *v, struct cpu_user_regs *regs)
1008 {
1009     /*
1010      * Allow read access to port 0x61. Bit 4 oscillates with period 30us, and
1011      * so it is often used for timing loops in BIOS code. This hack can go
1012      * away when we have separate read/write permission rangesets.
1013      * Note that we could emulate bit 4 instead of directly reading port 0x61,
1014      * but there's not really a good reason to do so.
1015      */
1016     if ( admin_io_okay(port, 1, v, regs) || (port == 0x61) )
1017         return inb(port);
1018     return ~0;
1019 }
1020 //#define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
1021 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
1022 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
1023 #define outb_user(_v, _p, _d, _r) \
1024 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
1025 #define outw_user(_v, _p, _d, _r) \
1026 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
1027 #define outl_user(_v, _p, _d, _r) \
1028 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
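/*
 * Note the asymmetry introduced for port 0x61 (system control port B / NMI
 * status): single-byte reads are allowed for every guest via inb_user()
 * above, while all other widths, all writes, and all other ports still
 * require the administrator to have granted the range via the domain's
 * ioport capability (admin_io_okay).
 */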
1030 /* Instruction fetch with error handling. */
1031 #define insn_fetch(_type, _size, _ptr) \
1032 ({ unsigned long _rc, _x; \
1033 if ( (_rc = copy_from_user(&_x, (_type *)eip, sizeof(_type))) != 0 ) \
1034 { \
1035 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
1036 return EXCRET_fault_fixed; \
1037 } \
1038 eip += _size; (_type)_x; })
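/*
 * insn_fetch() reads instruction bytes from the faulting guest EIP; if the
 * copy faults part-way through, copy_from_user() returns the number of
 * bytes left uncopied, and a #PF is bounced to the guest at the exact
 * address that could not be read before emulation is abandoned.
 */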
1040 static int emulate_privileged_op(struct cpu_user_regs *regs)
1042 struct vcpu *v = current;
1043 unsigned long *reg, eip = regs->eip, res;
1044 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
1045 unsigned int port, i, op_bytes = 4, data, rc;
1046 u32 l, h;
1048 /* Legacy prefixes. */
1049 for ( i = 0; i < 8; i++ )
1051 switch ( opcode = insn_fetch(u8, 1, eip) )
1053 case 0x66: /* operand-size override */
1054 op_bytes ^= 6; /* switch between 2/4 bytes */
1055 break;
1056 case 0x67: /* address-size override */
1057 case 0x2e: /* CS override */
1058 case 0x3e: /* DS override */
1059 case 0x26: /* ES override */
1060 case 0x64: /* FS override */
1061 case 0x65: /* GS override */
1062 case 0x36: /* SS override */
1063 case 0xf0: /* LOCK */
1064 case 0xf2: /* REPNE/REPNZ */
1065 break;
1066 case 0xf3: /* REP/REPE/REPZ */
1067 rep_prefix = 1;
1068 break;
1069 default:
1070 goto done_prefixes;
1073 done_prefixes:
1075 #ifdef __x86_64__
1076 /* REX prefix. */
1077 if ( (opcode & 0xf0) == 0x40 )
1079 modrm_reg = (opcode & 4) << 1; /* REX.R */
1080 modrm_rm = (opcode & 1) << 3; /* REX.B */
1082 /* REX.W and REX.X do not need to be decoded. */
1083 opcode = insn_fetch(u8, 1, eip);
1085 #endif
1087 /* Input/Output String instructions. */
1088 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1090 if ( rep_prefix && (regs->ecx == 0) )
1091 goto done;
1093 continue_io_string:
1094 switch ( opcode )
1096 case 0x6c: /* INSB */
1097 op_bytes = 1;
1098 case 0x6d: /* INSW/INSL */
1099 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1100 goto fail;
1101 switch ( op_bytes )
1103 case 1:
1104 data = (u8)inb_user((u16)regs->edx, v, regs);
1105 break;
1106 case 2:
1107 data = (u16)inw_user((u16)regs->edx, v, regs);
1108 break;
1109 case 4:
1110 data = (u32)inl_user((u16)regs->edx, v, regs);
1111 break;
1113 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1115 propagate_page_fault(regs->edi + op_bytes - rc,
1116 PFEC_write_access);
1117 return EXCRET_fault_fixed;
1119 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1120 break;
1122 case 0x6e: /* OUTSB */
1123 op_bytes = 1;
1124 case 0x6f: /* OUTSW/OUTSL */
1125 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1126 goto fail;
1127 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1128 if ( rc != 0 )
1130 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1131 return EXCRET_fault_fixed;
1133 switch ( op_bytes )
1135 case 1:
1136 outb_user((u8)data, (u16)regs->edx, v, regs);
1137 break;
1138 case 2:
1139 outw_user((u16)data, (u16)regs->edx, v, regs);
1140 break;
1141 case 4:
1142 outl_user((u32)data, (u16)regs->edx, v, regs);
1143 break;
1145 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1146 break;
1149 if ( rep_prefix && (--regs->ecx != 0) )
1151 if ( !hypercall_preempt_check() )
1152 goto continue_io_string;
1153 eip = regs->eip;
1156 goto done;
1159 /* I/O Port and Interrupt Flag instructions. */
1160 switch ( opcode )
1162 case 0xe4: /* IN imm8,%al */
1163 op_bytes = 1;
1164 case 0xe5: /* IN imm8,%eax */
1165 port = insn_fetch(u8, 1, eip);
1166 exec_in:
1167 if ( !guest_io_okay(port, op_bytes, v, regs) )
1168 goto fail;
1169 switch ( op_bytes )
1171 case 1:
1172 regs->eax &= ~0xffUL;
1173 regs->eax |= (u8)inb_user(port, v, regs);
1174 break;
1175 case 2:
1176 regs->eax &= ~0xffffUL;
1177 regs->eax |= (u16)inw_user(port, v, regs);
1178 break;
1179 case 4:
1180 regs->eax = (u32)inl_user(port, v, regs);
1181 break;
1183 goto done;
1185 case 0xec: /* IN %dx,%al */
1186 op_bytes = 1;
1187 case 0xed: /* IN %dx,%eax */
1188 port = (u16)regs->edx;
1189 goto exec_in;
1191 case 0xe6: /* OUT %al,imm8 */
1192 op_bytes = 1;
1193 case 0xe7: /* OUT %eax,imm8 */
1194 port = insn_fetch(u8, 1, eip);
1195 exec_out:
1196 if ( !guest_io_okay(port, op_bytes, v, regs) )
1197 goto fail;
1198 switch ( op_bytes )
1200 case 1:
1201 outb_user((u8)regs->eax, port, v, regs);
1202 break;
1203 case 2:
1204 outw_user((u16)regs->eax, port, v, regs);
1205 break;
1206 case 4:
1207 outl_user((u32)regs->eax, port, v, regs);
1208 break;
1210 goto done;
1212 case 0xee: /* OUT %al,%dx */
1213 op_bytes = 1;
1214 case 0xef: /* OUT %eax,%dx */
1215 port = (u16)regs->edx;
1216 goto exec_out;
1218 case 0xfa: /* CLI */
1219 case 0xfb: /* STI */
1220 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1221 goto fail;
1222 /*
1223 * This is just too dangerous to allow, in my opinion. Consider if the
1224 * caller then tries to reenable interrupts using POPF: we can't trap
1225 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1226 * do for us. :-)
1227 */
1228 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1229 goto done;
1231 case 0x0f: /* Two-byte opcode */
1232 break;
1234 default:
1235 goto fail;
1238 /* Remaining instructions only emulated from guest kernel. */
1239 if ( !guest_kernel_mode(v, regs) )
1240 goto fail;
1242 /* Privileged (ring 0) instructions. */
1243 opcode = insn_fetch(u8, 1, eip);
1244 switch ( opcode )
1246 case 0x06: /* CLTS */
1247 (void)do_fpu_taskswitch(0);
1248 break;
1250 case 0x09: /* WBINVD */
1251 /* Ignore the instruction if unprivileged. */
1252 if ( !cache_flush_permitted(v->domain) )
1253 /* Non-physdev domain attempted WBINVD; ignore for now since
1254 newer linux uses this in some start-of-day timing loops */
1256 else
1257 wbinvd();
1258 break;
1260 case 0x20: /* MOV CR?,<reg> */
1261 opcode = insn_fetch(u8, 1, eip);
1262 modrm_reg |= (opcode >> 3) & 7;
1263 modrm_rm |= (opcode >> 0) & 7;
1264 reg = decode_register(modrm_rm, regs, 0);
1265 switch ( modrm_reg )
1267 case 0: /* Read CR0 */
1268 *reg = (read_cr0() & ~X86_CR0_TS) |
1269 v->arch.guest_context.ctrlreg[0];
1270 break;
1272 case 2: /* Read CR2 */
1273 *reg = v->arch.guest_context.ctrlreg[2];
1274 break;
1276 case 3: /* Read CR3 */
1277 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1278 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1279 break;
1281 case 4: /* Read CR4 */
1282 /*
1283 * Guests can read CR4 to see what features Xen has enabled. We
1284 * therefore lie about PGE & PSE as they are unavailable to guests.
1285 */
1286 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1287 break;
1289 default:
1290 goto fail;
1292 break;
1294 case 0x21: /* MOV DR?,<reg> */
1295 opcode = insn_fetch(u8, 1, eip);
1296 modrm_reg |= (opcode >> 3) & 7;
1297 modrm_rm |= (opcode >> 0) & 7;
1298 reg = decode_register(modrm_rm, regs, 0);
1299 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1300 goto fail;
1301 *reg = res;
1302 break;
1304 case 0x22: /* MOV <reg>,CR? */
1305 opcode = insn_fetch(u8, 1, eip);
1306 modrm_reg |= (opcode >> 3) & 7;
1307 modrm_rm |= (opcode >> 0) & 7;
1308 reg = decode_register(modrm_rm, regs, 0);
1309 switch ( modrm_reg )
1311 case 0: /* Write CR0 */
1312 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1314 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1315 goto fail;
1317 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1318 break;
1320 case 2: /* Write CR2 */
1321 v->arch.guest_context.ctrlreg[2] = *reg;
1322 v->vcpu_info->arch.cr2 = *reg;
1323 break;
1325 case 3: /* Write CR3 */
1326 LOCK_BIGLOCK(v->domain);
1327 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1328 UNLOCK_BIGLOCK(v->domain);
1329 break;
1331 case 4:
1332 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1334 DPRINTK("Attempt to change CR4 flags.\n");
1335 goto fail;
1337 break;
1339 default:
1340 goto fail;
1342 break;
1344 case 0x23: /* MOV <reg>,DR? */
1345 opcode = insn_fetch(u8, 1, eip);
1346 modrm_reg |= (opcode >> 3) & 7;
1347 modrm_rm |= (opcode >> 0) & 7;
1348 reg = decode_register(modrm_rm, regs, 0);
1349 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1350 goto fail;
1351 break;
1353 case 0x30: /* WRMSR */
1354 switch ( regs->ecx )
1356 #ifdef CONFIG_X86_64
1357 case MSR_FS_BASE:
1358 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1359 goto fail;
1360 v->arch.guest_context.fs_base =
1361 ((u64)regs->edx << 32) | regs->eax;
1362 break;
1363 case MSR_GS_BASE:
1364 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1365 goto fail;
1366 v->arch.guest_context.gs_base_kernel =
1367 ((u64)regs->edx << 32) | regs->eax;
1368 break;
1369 case MSR_SHADOW_GS_BASE:
1370 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1371 goto fail;
1372 v->arch.guest_context.gs_base_user =
1373 ((u64)regs->edx << 32) | regs->eax;
1374 break;
1375 #endif
1376 default:
1377 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1378 break;
1380 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1381 (regs->eax != l) || (regs->edx != h) )
1382 DPRINTK("Domain attempted WRMSR %p from "
1383 "%08x:%08x to %08lx:%08lx.\n",
1384 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1385 break;
1387 break;
1389 case 0x32: /* RDMSR */
1390 switch ( regs->ecx )
1392 #ifdef CONFIG_X86_64
1393 case MSR_FS_BASE:
1394 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1395 regs->edx = v->arch.guest_context.fs_base >> 32;
1396 break;
1397 case MSR_GS_BASE:
1398 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1399 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1400 break;
1401 case MSR_SHADOW_GS_BASE:
1402 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1403 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1404 break;
1405 #endif
1406 case MSR_EFER:
1407 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1408 goto fail;
1409 break;
1410 default:
1411 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1413 regs->eax = l;
1414 regs->edx = h;
1415 break;
1417 /* Everyone can read the MSR space. */
1418 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1419 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1420 goto fail;
1421 break;
1423 break;
1425 default:
1426 goto fail;
1429 done:
1430 regs->eip = eip;
1431 return EXCRET_fault_fixed;
1433 fail:
1434 return 0;
1437 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1439 struct vcpu *v = current;
1440 struct trap_bounce *tb = &v->arch.trap_bounce;
1441 struct trap_info *ti;
1442 unsigned long fixup;
1444 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1446 if ( regs->error_code & 1 )
1447 goto hardware_gp;
1449 if ( !guest_mode(regs) )
1450 goto gp_in_kernel;
1452 /*
1453 * Cunning trick to allow arbitrary "INT n" handling.
1455 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1456 * instruction from trapping to the appropriate vector, when that might not
1457 * be expected by Xen or the guest OS. For example, that entry might be for
1458 * a fault handler (unlike traps, faults don't increment EIP), or might
1459 * expect an error code on the stack (which a software trap never
1460 * provides), or might be a hardware interrupt handler that doesn't like
1461 * being called spuriously.
1463 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1464 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1465 * clear to indicate that it's a software fault, not hardware.
1467 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1468 * okay because they can only be triggered by an explicit DPL-checked
1469 * instruction. The DPL specified by the guest OS for these vectors is NOT
1470 * CHECKED!!
1471 */
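/*
 * Concretely: a guest "int $0x80" hitting a DPL-0 IDT entry raises #GP with
 * error code (0x80 << 3) | 2 = 0x402. The code below recovers the vector as
 * error_code >> 3, checks the DPL the guest registered for it via
 * set_trap_table, advances EIP over the two-byte instruction, and bounces
 * the event to the guest handler as if the software interrupt had been
 * delivered natively.
 */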
1472 if ( (regs->error_code & 3) == 2 )
1474 /* This fault must be due to <INT n> instruction. */
1475 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1476 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1478 tb->flags = TBF_EXCEPTION;
1479 regs->eip += 2;
1480 goto finish_propagation;
1484 /* Emulate some simple privileged and I/O instructions. */
1485 if ( (regs->error_code == 0) &&
1486 emulate_privileged_op(regs) )
1487 return 0;
1489 #if defined(__i386__)
1490 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1491 (regs->error_code == 0) &&
1492 gpf_emulate_4gb(regs) )
1493 return 0;
1494 #endif
1496 /* Pass on GPF as is. */
1497 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1498 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1499 tb->error_code = regs->error_code;
1500 finish_propagation:
1501 tb->cs = ti->cs;
1502 tb->eip = ti->address;
1503 if ( TI_GET_IF(ti) )
1504 tb->flags |= TBF_INTERRUPT;
1505 return 0;
1507 gp_in_kernel:
1509 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1511 DPRINTK("GPF (%04x): %p -> %p\n",
1512 regs->error_code, _p(regs->eip), _p(fixup));
1513 regs->eip = fixup;
1514 return 0;
1517 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1519 hardware_gp:
1520 show_execution_state(regs);
1521 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1522 smp_processor_id(), regs->error_code);
1523 return 0;
1526 static void nmi_softirq(void)
1528 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1529 vcpu_kick(dom0->vcpu[0]);
1532 static void nmi_dom0_report(unsigned int reason_idx)
1534 struct domain *d;
1535 struct vcpu *v;
1537 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1538 return;
1540 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1542 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1543 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1546 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1548 switch ( opt_nmi[0] )
1550 case 'd': /* 'dom0' */
1551 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1552 case 'i': /* 'ignore' */
1553 break;
1554 default: /* 'fatal' */
1555 console_force_unlock();
1556 printk("\n\nNMI - MEMORY ERROR\n");
1557 fatal_trap(TRAP_nmi, regs);
1560 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1561 mdelay(1);
1562 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1565 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1567 switch ( opt_nmi[0] )
1569 case 'd': /* 'dom0' */
1570 nmi_dom0_report(_XEN_NMIREASON_io_error);
1571 case 'i': /* 'ignore' */
1572 break;
1573 default: /* 'fatal' */
1574 console_force_unlock();
1575 printk("\n\nNMI - I/O ERROR\n");
1576 fatal_trap(TRAP_nmi, regs);
1579 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1580 mdelay(1);
1581 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1584 static void unknown_nmi_error(unsigned char reason)
1586 switch ( opt_nmi[0] )
1588 case 'd': /* 'dom0' */
1589 nmi_dom0_report(_XEN_NMIREASON_unknown);
1590 case 'i': /* 'ignore' */
1591 break;
1592 default: /* 'fatal' */
1593 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1594 printk("Dazed and confused, but trying to continue\n");
1595 printk("Do you have a strange power saving mode enabled?\n");
1599 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1601 return 0;
1604 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1606 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1608 unsigned int cpu = smp_processor_id();
1609 unsigned char reason;
1611 ++nmi_count(cpu);
1613 if ( nmi_callback(regs, cpu) )
1614 return;
1616 if ( nmi_watchdog )
1617 nmi_watchdog_tick(regs);
1619 /* Only the BSP gets external NMIs from the system. */
1620 if ( cpu == 0 )
1622 reason = inb(0x61);
1623 if ( reason & 0x80 )
1624 mem_parity_error(regs);
1625 else if ( reason & 0x40 )
1626 io_check_error(regs);
1627 else if ( !nmi_watchdog )
1628 unknown_nmi_error((unsigned char)(reason&0xff));
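/*
 * The "reason" byte above is again system control port B (0x61): bit 7
 * reports a memory parity/SERR error and bit 6 an I/O channel check, which
 * is why mem_parity_error() and io_check_error() write back to the same
 * port to acknowledge and temporarily mask the corresponding source before
 * re-enabling it.
 */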
1632 void set_nmi_callback(nmi_callback_t callback)
1634 nmi_callback = callback;
1637 void unset_nmi_callback(void)
1639 nmi_callback = dummy_nmi_callback;
1642 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1644 struct trap_bounce *tb;
1645 struct trap_info *ti;
1647 setup_fpu(current);
1649 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1651 tb = &current->arch.trap_bounce;
1652 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1654 tb->flags = TBF_EXCEPTION;
1655 tb->cs = ti->cs;
1656 tb->eip = ti->address;
1657 if ( TI_GET_IF(ti) )
1658 tb->flags |= TBF_INTERRUPT;
1660 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1663 return EXCRET_fault_fixed;
1666 asmlinkage int do_debug(struct cpu_user_regs *regs)
1668 unsigned long condition;
1669 struct vcpu *v = current;
1670 struct trap_bounce *tb = &v->arch.trap_bounce;
1671 struct trap_info *ti;
1673 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1675 /* Mask out spurious debug traps due to lazy DR7 setting */
1676 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1677 (v->arch.guest_context.debugreg[7] == 0) )
1679 __asm__("mov %0,%%db7" : : "r" (0UL));
1680 goto out;
1683 DEBUGGER_trap_entry(TRAP_debug, regs);
1685 if ( !guest_mode(regs) )
1687 /* Clear TF just for absolute sanity. */
1688 regs->eflags &= ~EF_TF;
1689 /*
1690 * We ignore watchpoints when they trigger within Xen. This may happen
1691 * when a buffer is passed to us which previously had a watchpoint set
1692 * on it. No need to bump EIP; the only faulting trap is an instruction
1693 * breakpoint, which can't happen to us.
1694 */
1695 goto out;
1698 /* Save debug status register where guest OS can peek at it */
1699 v->arch.guest_context.debugreg[6] = condition;
1701 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1702 tb->flags = TBF_EXCEPTION;
1703 tb->cs = ti->cs;
1704 tb->eip = ti->address;
1705 if ( TI_GET_IF(ti) )
1706 tb->flags |= TBF_INTERRUPT;
1708 out:
1709 return EXCRET_not_a_fault;
1712 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1714 return EXCRET_not_a_fault;
1717 void set_intr_gate(unsigned int n, void *addr)
1719 #ifdef __i386__
1720 int i;
1721 /* Keep secondary tables in sync with IRQ updates. */
1722 for ( i = 1; i < NR_CPUS; i++ )
1723 if ( idt_tables[i] != NULL )
1724 _set_gate(&idt_tables[i][n], 14, 0, addr);
1725 #endif
1726 _set_gate(&idt_table[n], 14, 0, addr);
1729 void set_system_gate(unsigned int n, void *addr)
1731 _set_gate(idt_table+n,14,3,addr);
1734 void set_task_gate(unsigned int n, unsigned int sel)
1736 idt_table[n].a = sel << 16;
1737 idt_table[n].b = 0x8500;
1740 void set_tss_desc(unsigned int n, void *addr)
1742 _set_tssldt_desc(
1743 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1744 (unsigned long)addr,
1745 offsetof(struct tss_struct, __cacheline_filler) - 1,
1746 9);
1749 void __init trap_init(void)
1751 extern void percpu_traps_init(void);
1753 /*
1754 * Note that interrupt gates are always used, rather than trap gates. We
1755 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1756 * first activation must have the "bad" value(s) for these registers and
1757 * we may lose them if another activation is installed before they are
1758 * saved. The page-fault handler also needs interrupts disabled until %cr2
1759 * has been read and saved on the stack.
1760 */
1761 set_intr_gate(TRAP_divide_error,&divide_error);
1762 set_intr_gate(TRAP_debug,&debug);
1763 set_intr_gate(TRAP_nmi,&nmi);
1764 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1765 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1766 set_intr_gate(TRAP_bounds,&bounds);
1767 set_intr_gate(TRAP_invalid_op,&invalid_op);
1768 set_intr_gate(TRAP_no_device,&device_not_available);
1769 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1770 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1771 set_intr_gate(TRAP_no_segment,&segment_not_present);
1772 set_intr_gate(TRAP_stack_error,&stack_segment);
1773 set_intr_gate(TRAP_gp_fault,&general_protection);
1774 set_intr_gate(TRAP_page_fault,&page_fault);
1775 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1776 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1777 set_intr_gate(TRAP_alignment_check,&alignment_check);
1778 set_intr_gate(TRAP_machine_check,&machine_check);
1779 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1781 percpu_traps_init();
1783 cpu_init();
1785 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1789 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1791 struct trap_info cur;
1792 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1793 long rc = 0;
1795 /* If no table is presented then clear the entire virtual IDT. */
1796 if ( guest_handle_is_null(traps) )
1798 memset(dst, 0, 256 * sizeof(*dst));
1799 init_int80_direct_trap(current);
1800 return 0;
1803 for ( ; ; )
1805 if ( hypercall_preempt_check() )
1807 rc = hypercall_create_continuation(
1808 __HYPERVISOR_set_trap_table, "h", traps);
1809 break;
1812 if ( copy_from_guest(&cur, traps, 1) )
1814 rc = -EFAULT;
1815 break;
1818 if ( cur.address == 0 )
1819 break;
1821 fixup_guest_code_selector(cur.cs);
1823 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1825 if ( cur.vector == 0x80 )
1826 init_int80_direct_trap(current);
1828 guest_handle_add_offset(traps, 1);
1831 return rc;
1835 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1837 int i;
1839 switch ( reg )
1841 case 0:
1842 if ( !access_ok(value, sizeof(long)) )
1843 return -EPERM;
1844 if ( p == current )
1845 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1846 break;
1847 case 1:
1848 if ( !access_ok(value, sizeof(long)) )
1849 return -EPERM;
1850 if ( p == current )
1851 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1852 break;
1853 case 2:
1854 if ( !access_ok(value, sizeof(long)) )
1855 return -EPERM;
1856 if ( p == current )
1857 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1858 break;
1859 case 3:
1860 if ( !access_ok(value, sizeof(long)) )
1861 return -EPERM;
1862 if ( p == current )
1863 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1864 break;
1865 case 6:
1866 /*
1867 * DR6: Bits 4-11,16-31 reserved (set to 1).
1868 * Bit 12 reserved (set to 0).
1869 */
1870 value &= 0xffffefff; /* reserved bits => 0 */
1871 value |= 0xffff0ff0; /* reserved bits => 1 */
1872 if ( p == current )
1873 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1874 break;
1875 case 7:
1876 /*
1877 * DR7: Bit 10 reserved (set to 1).
1878 * Bits 11-12,14-15 reserved (set to 0).
1879 * Privileged bits:
1880 * GD (bit 13): must be 0.
1881 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1882 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1883 */
1884 /* DR7 == 0 => debugging disabled for this domain. */
1885 if ( value != 0 )
1887 value &= 0xffff27ff; /* reserved bits => 0 */
1888 value |= 0x00000400; /* reserved bits => 1 */
1889 if ( (value & (1<<13)) != 0 ) return -EPERM;
1890 for ( i = 0; i < 16; i += 2 )
1891 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1893 if ( p == current )
1894 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1895 break;
1896 default:
1897 return -EINVAL;
1900 p->arch.guest_context.debugreg[reg] = value;
1901 return 0;
1904 long do_set_debugreg(int reg, unsigned long value)
1906 return set_debugreg(current, reg, value);
1909 unsigned long do_get_debugreg(int reg)
1911 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1912 return current->arch.guest_context.debugreg[reg];
1915 /*
1916 * Local variables:
1917 * mode: C
1918 * c-set-style: "BSD"
1919 * c-basic-offset: 4
1920 * tab-width: 4
1921 * indent-tabs-mode: nil
1922 * End:
1923 */