direct-io.hg: view xen/arch/x86/traps.c @ 13800:01ec7dba9ff8

Hide RDTSCP feature flag from PV guests.

Linux 2.6.19 (x86-64) makes use of this feature if available, but Xen
(validly) fails the attempt to write the respective MSR. Hence the
feature must be hidden from PV guests.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Fri Feb 02 16:07:13 2007 +0000 (2007-02-02)
parents 2a9b6b1f848f
children 6daa91dc9247
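The fix lands in emulate_forced_invalid_op() in the listing below: when a PV guest's forced-emulation CPUID requests extended leaf 0x80000001, Xen clears the RDTSCP bit from the %edx result before returning it to the guest. A minimal sketch of that masking step, assuming the architectural bit position (bit 27, i.e. what X86_FEATURE_RDTSCP % 32 evaluates to); the names below are illustrative, not Xen APIs:

/* Mask RDTSCP out of the CPUID.0x80000001:%edx value a PV guest may see. */
#define RDTSCP_FEATURE_BIT 27    /* assumed architectural bit position */

static inline unsigned int pv_filter_ext_feature_edx(unsigned int edx)
{
    return edx & ~(1u << RDTSCP_FEATURE_BIT);
}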
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <asm/shadow.h>
50 #include <asm/system.h>
51 #include <asm/io.h>
52 #include <asm/atomic.h>
53 #include <asm/desc.h>
54 #include <asm/debugreg.h>
55 #include <asm/smp.h>
56 #include <asm/flushtlb.h>
57 #include <asm/uaccess.h>
58 #include <asm/i387.h>
59 #include <asm/debugger.h>
60 #include <asm/msr.h>
61 #include <asm/shared.h>
62 #include <asm/x86_emulate.h>
63 #include <asm/hvm/vpt.h>
65 /*
66 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
67 * fatal: Xen prints diagnostic message and then hangs.
68 * dom0: The NMI is virtualised to DOM0.
69 * ignore: The NMI error is cleared and ignored.
70 */
71 #ifdef NDEBUG
72 char opt_nmi[10] = "dom0";
73 #else
74 char opt_nmi[10] = "fatal";
75 #endif
76 string_param("nmi", opt_nmi);
78 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
79 idt_entry_t idt_table[IDT_ENTRIES];
81 #define DECLARE_TRAP_HANDLER(_name) \
82 asmlinkage void _name(void); \
83 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
85 asmlinkage void nmi(void);
86 DECLARE_TRAP_HANDLER(divide_error);
87 DECLARE_TRAP_HANDLER(debug);
88 DECLARE_TRAP_HANDLER(int3);
89 DECLARE_TRAP_HANDLER(overflow);
90 DECLARE_TRAP_HANDLER(bounds);
91 DECLARE_TRAP_HANDLER(invalid_op);
92 DECLARE_TRAP_HANDLER(device_not_available);
93 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
94 DECLARE_TRAP_HANDLER(invalid_TSS);
95 DECLARE_TRAP_HANDLER(segment_not_present);
96 DECLARE_TRAP_HANDLER(stack_segment);
97 DECLARE_TRAP_HANDLER(general_protection);
98 DECLARE_TRAP_HANDLER(page_fault);
99 DECLARE_TRAP_HANDLER(coprocessor_error);
100 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
101 DECLARE_TRAP_HANDLER(alignment_check);
102 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
103 DECLARE_TRAP_HANDLER(machine_check);
105 long do_set_debugreg(int reg, unsigned long value);
106 unsigned long do_get_debugreg(int reg);
108 static int debug_stack_lines = 20;
109 integer_param("debug_stack_lines", debug_stack_lines);
111 #ifdef CONFIG_X86_32
112 #define stack_words_per_line 8
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
114 #else
115 #define stack_words_per_line 4
116 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
117 #endif
119 static void show_guest_stack(struct cpu_user_regs *regs)
120 {
121 int i;
122 unsigned long *stack, addr;
124 if ( is_hvm_vcpu(current) )
125 return;
127 if ( IS_COMPAT(container_of(regs, struct cpu_info, guest_cpu_user_regs)->current_vcpu->domain) )
128 {
129 compat_show_guest_stack(regs, debug_stack_lines);
130 return;
131 }
133 if ( vm86_mode(regs) )
134 {
135 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
136 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
137 regs->ss, (uint16_t)(regs->esp & 0xffff));
138 }
139 else
140 {
141 stack = (unsigned long *)regs->esp;
142 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
143 }
145 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
146 {
147 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
148 break;
149 if ( get_user(addr, stack) )
150 {
151 if ( i != 0 )
152 printk("\n ");
153 printk("Fault while accessing guest memory.");
154 i = 1;
155 break;
156 }
157 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
158 printk("\n ");
159 printk(" %p", _p(addr));
160 stack++;
161 }
162 if ( i == 0 )
163 printk("Stack empty.");
164 printk("\n");
165 }
167 #ifdef NDEBUG
169 static void show_trace(struct cpu_user_regs *regs)
170 {
171 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
173 printk("Xen call trace:\n ");
175 printk("[<%p>]", _p(regs->eip));
176 print_symbol(" %s\n ", regs->eip);
178 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
179 {
180 addr = *stack++;
181 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
182 {
183 printk("[<%p>]", _p(addr));
184 print_symbol(" %s\n ", addr);
185 }
186 }
188 printk("\n");
189 }
191 #else
193 static void show_trace(struct cpu_user_regs *regs)
194 {
195 unsigned long *frame, next, addr, low, high;
197 printk("Xen call trace:\n ");
199 printk("[<%p>]", _p(regs->eip));
200 print_symbol(" %s\n ", regs->eip);
202 /* Bounds for range of valid frame pointer. */
203 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
204 high = (low & ~(STACK_SIZE - 1)) +
205 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
207 /* The initial frame pointer. */
208 next = regs->ebp;
210 for ( ; ; )
211 {
212 /* Valid frame pointer? */
213 if ( (next < low) || (next >= high) )
214 {
215 /*
216 * Exception stack frames have a different layout, denoted by an
217 * inverted frame pointer.
218 */
219 next = ~next;
220 if ( (next < low) || (next >= high) )
221 break;
222 frame = (unsigned long *)next;
223 next = frame[0];
224 addr = frame[(offsetof(struct cpu_user_regs, eip) -
225 offsetof(struct cpu_user_regs, ebp))
226 / BYTES_PER_LONG];
227 }
228 else
229 {
230 /* Ordinary stack frame. */
231 frame = (unsigned long *)next;
232 next = frame[0];
233 addr = frame[1];
234 }
236 printk("[<%p>]", _p(addr));
237 print_symbol(" %s\n ", addr);
239 low = (unsigned long)&frame[2];
240 }
242 printk("\n");
243 }
245 #endif
247 void show_stack(struct cpu_user_regs *regs)
248 {
249 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
250 int i;
252 if ( guest_mode(regs) )
253 return show_guest_stack(regs);
255 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
257 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
258 {
259 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
260 break;
261 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
262 printk("\n ");
263 addr = *stack++;
264 printk(" %p", _p(addr));
265 }
266 if ( i == 0 )
267 printk("Stack empty.");
268 printk("\n");
270 show_trace(regs);
271 }
273 void show_xen_trace()
274 {
275 struct cpu_user_regs regs;
276 #ifdef __x86_64
277 __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
278 __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
279 __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
280 #else
281 __asm__("movl %%esp,%0" : "=m" (regs.esp));
282 __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
283 __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
284 #endif
285 show_trace(&regs);
286 }
288 void show_stack_overflow(unsigned long esp)
289 {
290 #ifdef MEMORY_GUARD
291 unsigned long esp_top;
292 unsigned long *stack, addr;
294 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
296 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
297 if ( ((unsigned long)(esp - esp_top) > 512) &&
298 ((unsigned long)(esp_top - esp) > 512) )
299 return;
301 if ( esp < esp_top )
302 esp = esp_top;
304 printk("Xen stack overflow:\n ");
306 stack = (unsigned long *)esp;
307 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
308 {
309 addr = *stack++;
310 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
311 {
312 printk("%p: [<%p>]", stack, _p(addr));
313 print_symbol(" %s\n ", addr);
314 }
315 }
317 printk("\n");
318 #endif
319 }
321 void show_execution_state(struct cpu_user_regs *regs)
322 {
323 show_registers(regs);
324 show_stack(regs);
325 }
327 char *trapstr(int trapnr)
328 {
329 static char *strings[] = {
330 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
331 "invalid opcode", "device not available", "double fault",
332 "coprocessor segment", "invalid tss", "segment not found",
333 "stack error", "general protection fault", "page fault",
334 "spurious interrupt", "coprocessor error", "alignment check",
335 "machine check", "simd error"
336 };
338 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
339 return "???";
341 return strings[trapnr];
342 }
344 /*
345 * This is called for faults at very unexpected times (e.g., when interrupts
346 * are disabled). In such situations we can't do much that is safe. We try to
347 * print out some tracing and then we just spin.
348 */
349 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
350 {
351 watchdog_disable();
352 console_start_sync();
354 show_execution_state(regs);
356 if ( trapnr == TRAP_page_fault )
357 {
358 unsigned long cr2 = read_cr2();
359 printk("Faulting linear address: %p\n", _p(cr2));
360 show_page_walk(cr2);
361 }
363 panic("FATAL TRAP: vector = %d (%s)\n"
364 "[error_code=%04x] %s\n",
365 trapnr, trapstr(trapnr), regs->error_code,
366 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
367 }
369 static int do_guest_trap(
370 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
371 {
372 struct vcpu *v = current;
373 struct trap_bounce *tb;
374 const struct trap_info *ti;
376 tb = &v->arch.trap_bounce;
377 ti = &v->arch.guest_context.trap_ctxt[trapnr];
379 tb->flags = TBF_EXCEPTION;
380 tb->cs = ti->cs;
381 tb->eip = ti->address;
383 if ( use_error_code )
384 {
385 tb->flags |= TBF_EXCEPTION_ERRCODE;
386 tb->error_code = regs->error_code;
387 }
389 if ( TI_GET_IF(ti) )
390 tb->flags |= TBF_INTERRUPT;
392 if ( unlikely(null_trap_bounce(v, tb)) )
393 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
394 "domain %d on VCPU %d [ec=%04x]\n",
395 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
396 regs->error_code);
398 return 0;
399 }
401 static inline int do_trap(
402 int trapnr, struct cpu_user_regs *regs, int use_error_code)
403 {
404 unsigned long fixup;
406 DEBUGGER_trap_entry(trapnr, regs);
408 if ( guest_mode(regs) )
409 return do_guest_trap(trapnr, regs, use_error_code);
411 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
412 {
413 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
414 trapnr, _p(regs->eip), _p(fixup));
415 regs->eip = fixup;
416 return 0;
417 }
419 DEBUGGER_trap_fatal(trapnr, regs);
421 show_execution_state(regs);
422 panic("FATAL TRAP: vector = %d (%s)\n"
423 "[error_code=%04x]\n",
424 trapnr, trapstr(trapnr), regs->error_code);
425 return 0;
426 }
428 #define DO_ERROR_NOCODE(trapnr, name) \
429 asmlinkage int do_##name(struct cpu_user_regs *regs) \
430 { \
431 return do_trap(trapnr, regs, 0); \
432 }
434 #define DO_ERROR(trapnr, name) \
435 asmlinkage int do_##name(struct cpu_user_regs *regs) \
436 { \
437 return do_trap(trapnr, regs, 1); \
438 }
440 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
441 DO_ERROR_NOCODE(TRAP_overflow, overflow)
442 DO_ERROR_NOCODE(TRAP_bounds, bounds)
443 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
444 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
445 DO_ERROR( TRAP_no_segment, segment_not_present)
446 DO_ERROR( TRAP_stack_error, stack_segment)
447 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
448 DO_ERROR( TRAP_alignment_check, alignment_check)
449 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
451 int rdmsr_hypervisor_regs(
452 uint32_t idx, uint32_t *eax, uint32_t *edx)
453 {
454 idx -= 0x40000000;
455 if ( idx > 0 )
456 return 0;
458 *eax = *edx = 0;
459 return 1;
460 }
462 int wrmsr_hypervisor_regs(
463 uint32_t idx, uint32_t eax, uint32_t edx)
464 {
465 struct domain *d = current->domain;
467 idx -= 0x40000000;
468 if ( idx > 0 )
469 return 0;
471 switch ( idx )
472 {
473 case 0:
474 {
475 void *hypercall_page;
476 unsigned long mfn;
477 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
478 unsigned int idx = eax & 0xfff;
480 if ( idx > 0 )
481 {
482 gdprintk(XENLOG_WARNING,
483 "Dom%d: Out of range index %u to MSR %08x\n",
484 d->domain_id, idx, 0x40000000);
485 return 0;
486 }
488 mfn = gmfn_to_mfn(d, gmfn);
490 if ( !mfn_valid(mfn) ||
491 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
492 {
493 gdprintk(XENLOG_WARNING,
494 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
495 d->domain_id, gmfn, mfn, 0x40000000);
496 return 0;
497 }
499 hypercall_page = map_domain_page(mfn);
500 hypercall_page_initialise(d, hypercall_page);
501 unmap_domain_page(hypercall_page);
503 put_page_and_type(mfn_to_page(mfn));
504 break;
505 }
507 default:
508 BUG();
509 }
511 return 1;
512 }
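/*
 * Illustrative sketch (not part of this file): how a guest drives the MSR
 * decoded by wrmsr_hypervisor_regs() above.  CPUID leaf 0x40000002 (see
 * cpuid_hypervisor_leaves() below) reports one hypercall page and MSR base
 * 0x40000000.  The guest writes the frame number of a page it owns in the
 * upper bits and the page index in the low 12 bits; Xen then fills that
 * frame via hypercall_page_initialise().  The function name is made up.
 */
static inline void guest_load_hypercall_page(unsigned long long gpfn)
{
    unsigned long long val = (gpfn << 12) | 0;   /* index 0: only one page */
    unsigned int eax = (unsigned int)val;
    unsigned int edx = (unsigned int)(val >> 32);
    asm volatile ( "wrmsr" : : "c" (0x40000000), "a" (eax), "d" (edx) );
}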
514 int cpuid_hypervisor_leaves(
515 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
516 {
517 idx -= 0x40000000;
518 if ( idx > 2 )
519 return 0;
521 switch ( idx )
522 {
523 case 0:
524 *eax = 0x40000002; /* Largest leaf */
525 *ebx = 0x566e6558; /* Signature 1: "XenV" */
526 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
527 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
528 break;
530 case 1:
531 *eax = (xen_major_version() << 16) | xen_minor_version();
532 *ebx = 0; /* Reserved */
533 *ecx = 0; /* Reserved */
534 *edx = 0; /* Reserved */
535 break;
537 case 2:
538 *eax = 1; /* Number of hypercall-transfer pages */
539 *ebx = 0x40000000; /* MSR base address */
540 *ecx = 0; /* Features 1 */
541 *edx = 0; /* Features 2 */
542 break;
544 default:
545 BUG();
546 }
548 return 1;
549 }
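/*
 * Illustrative sketch (not part of this file): a guest-side probe of the
 * leaves implemented by cpuid_hypervisor_leaves() above.  Leaf 0x40000000
 * returns the signature "XenVMMXenVMM" in ebx:ecx:edx and the largest leaf
 * in eax; leaf 0x40000001 returns the Xen version as major.minor.  A PV
 * guest reaches the same data through the forced-emulation path handled by
 * emulate_forced_invalid_op() below.
 */
static int probe_xen_cpuid(unsigned int *major, unsigned int *minor)
{
    unsigned int eax, ebx, ecx, edx;
    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "0" (0x40000000), "2" (0) );
    if ( (ebx != 0x566e6558) || (ecx != 0x65584d4d) || (edx != 0x4d4d566e) )
        return 0;                              /* signature mismatch */
    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "0" (0x40000001), "2" (0) );
    *major = eax >> 16;
    *minor = eax & 0xffff;
    return 1;
}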
551 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
552 {
553 char sig[5], instr[2];
554 uint32_t a, b, c, d;
555 unsigned long eip, rc;
557 a = regs->eax;
558 b = regs->ebx;
559 c = regs->ecx;
560 d = regs->edx;
561 eip = regs->eip;
563 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
564 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
565 {
566 propagate_page_fault(eip + sizeof(sig) - rc, 0);
567 return EXCRET_fault_fixed;
568 }
569 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
570 return 0;
571 eip += sizeof(sig);
573 /* We only emulate CPUID. */
574 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
575 {
576 propagate_page_fault(eip + sizeof(instr) - rc, 0);
577 return EXCRET_fault_fixed;
578 }
579 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
580 return 0;
581 eip += sizeof(instr);
583 __asm__ (
584 "cpuid"
585 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
586 : "0" (a), "1" (b), "2" (c), "3" (d) );
588 if ( regs->eax == 1 )
589 {
590 /* Modify Feature Information. */
591 clear_bit(X86_FEATURE_VME, &d);
592 clear_bit(X86_FEATURE_DE, &d);
593 clear_bit(X86_FEATURE_PSE, &d);
594 clear_bit(X86_FEATURE_PGE, &d);
595 if ( !supervisor_mode_kernel )
596 clear_bit(X86_FEATURE_SEP, &d);
597 if ( !IS_PRIV(current->domain) )
598 clear_bit(X86_FEATURE_MTRR, &d);
599 }
600 else if ( regs->eax == 0x80000001 )
601 {
602 /* Modify Feature Information. */
603 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
604 }
605 else
606 {
607 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
608 }
610 regs->eax = a;
611 regs->ebx = b;
612 regs->ecx = c;
613 regs->edx = d;
614 regs->eip = eip;
616 return EXCRET_fault_fixed;
617 }
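/*
 * Illustrative sketch (not part of this file): the instruction sequence a PV
 * guest emits so that the #UD path above serves its CPUID.  The bytes are
 * ud2 (0f 0b), the ASCII string "xen", then a real cpuid opcode (0f a2);
 * emulate_forced_invalid_op() recognises the signature, executes CPUID and
 * filters the feature bits (including RDTSCP) as shown above.  Macro and
 * function names are made up for the example.
 */
#define FORCED_EMUL_PREFIX ".byte 0x0f, 0x0b, 0x78, 0x65, 0x6e ; "

static inline void forced_emul_cpuid(unsigned int leaf, unsigned int *a,
                                     unsigned int *b, unsigned int *c,
                                     unsigned int *d)
{
    asm volatile ( FORCED_EMUL_PREFIX "cpuid"
                   : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                   : "0" (leaf), "2" (0) );
}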
619 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
620 {
621 int rc;
623 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
625 if ( unlikely(!guest_mode(regs)) )
626 {
627 char sig[5];
628 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
629 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
630 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
631 {
632 show_execution_state(regs);
633 regs->eip += sizeof(sig);
634 return EXCRET_fault_fixed;
635 }
636 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
637 show_execution_state(regs);
638 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
639 }
641 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
642 return rc;
644 return do_guest_trap(TRAP_invalid_op, regs, 0);
645 }
647 asmlinkage int do_int3(struct cpu_user_regs *regs)
648 {
649 DEBUGGER_trap_entry(TRAP_int3, regs);
651 if ( !guest_mode(regs) )
652 {
653 DEBUGGER_trap_fatal(TRAP_int3, regs);
654 show_execution_state(regs);
655 panic("FATAL TRAP: vector = 3 (Int3)\n");
656 }
658 return do_guest_trap(TRAP_int3, regs, 0);
659 }
661 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
662 {
663 fatal_trap(TRAP_machine_check, regs);
664 return 0;
665 }
667 void propagate_page_fault(unsigned long addr, u16 error_code)
668 {
669 struct trap_info *ti;
670 struct vcpu *v = current;
671 struct trap_bounce *tb = &v->arch.trap_bounce;
673 v->arch.guest_context.ctrlreg[2] = addr;
674 arch_set_cr2(v, addr);
676 /* Re-set error_code.user flag appropriately for the guest. */
677 error_code &= ~PFEC_user_mode;
678 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
679 error_code |= PFEC_user_mode;
681 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
682 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
683 tb->error_code = error_code;
684 tb->cs = ti->cs;
685 tb->eip = ti->address;
686 if ( TI_GET_IF(ti) )
687 tb->flags |= TBF_INTERRUPT;
688 if ( unlikely(null_trap_bounce(v, tb)) )
689 {
690 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
691 v->domain->domain_id, v->vcpu_id, error_code);
692 show_page_walk(addr);
693 }
694 }
696 static int handle_gdt_ldt_mapping_fault(
697 unsigned long offset, struct cpu_user_regs *regs)
698 {
699 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
700 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
701 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
703 /* Should never fault in another vcpu's area. */
704 BUG_ON(vcpu_area != current->vcpu_id);
706 /* Byte offset within the gdt/ldt sub-area. */
707 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
709 if ( likely(is_ldt_area) )
710 {
711 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
712 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
713 {
714 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
715 if ( !guest_mode(regs) )
716 return 0;
717 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
718 propagate_page_fault(
719 current->arch.guest_context.ldt_base + offset,
720 regs->error_code);
721 }
722 }
723 else
724 {
725 /* GDT fault: handle the fault as #GP(selector). */
726 regs->error_code = (u16)offset & ~7;
727 (void)do_general_protection(regs);
728 }
730 return EXCRET_fault_fixed;
731 }
733 #ifdef HYPERVISOR_VIRT_END
734 #define IN_HYPERVISOR_RANGE(va) \
735 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
736 #else
737 #define IN_HYPERVISOR_RANGE(va) \
738 (((va) >= HYPERVISOR_VIRT_START))
739 #endif
741 static int __spurious_page_fault(
742 unsigned long addr, struct cpu_user_regs *regs)
743 {
744 unsigned long mfn, cr3 = read_cr3();
745 #if CONFIG_PAGING_LEVELS >= 4
746 l4_pgentry_t l4e, *l4t;
747 #endif
748 #if CONFIG_PAGING_LEVELS >= 3
749 l3_pgentry_t l3e, *l3t;
750 #endif
751 l2_pgentry_t l2e, *l2t;
752 l1_pgentry_t l1e, *l1t;
753 unsigned int required_flags, disallowed_flags;
755 /* Reserved bit violations are never spurious faults. */
756 if ( regs->error_code & PFEC_reserved_bit )
757 return 0;
759 required_flags = _PAGE_PRESENT;
760 if ( regs->error_code & PFEC_write_access )
761 required_flags |= _PAGE_RW;
762 if ( regs->error_code & PFEC_user_mode )
763 required_flags |= _PAGE_USER;
765 disallowed_flags = 0;
766 if ( regs->error_code & PFEC_insn_fetch )
767 disallowed_flags |= _PAGE_NX;
769 mfn = cr3 >> PAGE_SHIFT;
771 #if CONFIG_PAGING_LEVELS >= 4
772 l4t = map_domain_page(mfn);
773 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
774 mfn = l4e_get_pfn(l4e);
775 unmap_domain_page(l4t);
776 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
777 (l4e_get_flags(l4e) & disallowed_flags) )
778 return 0;
779 #endif
781 #if CONFIG_PAGING_LEVELS >= 3
782 l3t = map_domain_page(mfn);
783 #ifdef CONFIG_X86_PAE
784 l3t += (cr3 & 0xFE0UL) >> 3;
785 #endif
786 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
787 mfn = l3e_get_pfn(l3e);
788 unmap_domain_page(l3t);
789 #ifdef CONFIG_X86_PAE
790 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
791 return 0;
792 #else
793 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
794 (l3e_get_flags(l3e) & disallowed_flags) )
795 return 0;
796 #endif
797 #endif
799 l2t = map_domain_page(mfn);
800 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
801 mfn = l2e_get_pfn(l2e);
802 unmap_domain_page(l2t);
803 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
804 (l2e_get_flags(l2e) & disallowed_flags) )
805 return 0;
806 if ( l2e_get_flags(l2e) & _PAGE_PSE )
807 {
808 l1e = l1e_empty(); /* define before use in debug tracing */
809 goto spurious;
810 }
812 l1t = map_domain_page(mfn);
813 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
814 mfn = l1e_get_pfn(l1e);
815 unmap_domain_page(l1t);
816 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
817 (l1e_get_flags(l1e) & disallowed_flags) )
818 return 0;
820 spurious:
821 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
822 "at addr %lx, e/c %04x\n",
823 current->domain->domain_id, current->vcpu_id,
824 addr, regs->error_code);
825 #if CONFIG_PAGING_LEVELS >= 4
826 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
827 #endif
828 #if CONFIG_PAGING_LEVELS >= 3
829 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
830 #endif
831 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
832 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
833 #ifndef NDEBUG
834 show_registers(regs);
835 #endif
836 return 1;
837 }
839 static int spurious_page_fault(
840 unsigned long addr, struct cpu_user_regs *regs)
841 {
842 unsigned long flags;
843 int is_spurious;
845 /*
846 * Disabling interrupts prevents TLB flushing, and hence prevents
847 * page tables from becoming invalid under our feet during the walk.
848 */
849 local_irq_save(flags);
850 is_spurious = __spurious_page_fault(addr, regs);
851 local_irq_restore(flags);
853 return is_spurious;
854 }
856 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
857 {
858 struct vcpu *v = current;
859 struct domain *d = v->domain;
861 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
862 {
863 if ( shadow_mode_external(d) && guest_mode(regs) )
864 return shadow_fault(addr, regs);
865 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
866 return handle_gdt_ldt_mapping_fault(
867 addr - GDT_LDT_VIRT_START, regs);
868 return 0;
869 }
871 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
872 guest_kernel_mode(v, regs) &&
873 /* Do not check if access-protection fault since the page may
874 legitimately be not present in shadow page tables */
875 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
876 ptwr_do_page_fault(v, addr, regs) )
877 return EXCRET_fault_fixed;
879 if ( shadow_mode_enabled(d) )
880 return shadow_fault(addr, regs);
882 return 0;
883 }
885 /*
886 * #PF error code:
887 * Bit 0: Protection violation (=1) ; Page not present (=0)
888 * Bit 1: Write access
889 * Bit 2: User mode (=1) ; Supervisor mode (=0)
890 * Bit 3: Reserved bit violation
891 * Bit 4: Instruction fetch
892 */
893 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
894 {
895 unsigned long addr, fixup;
896 int rc;
898 ASSERT(!in_irq());
900 addr = read_cr2();
902 DEBUGGER_trap_entry(TRAP_page_fault, regs);
904 perfc_incrc(page_faults);
906 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
907 return rc;
909 if ( unlikely(!guest_mode(regs)) )
910 {
911 if ( spurious_page_fault(addr, regs) )
912 return EXCRET_not_a_fault;
914 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
915 {
916 perfc_incrc(copy_user_faults);
917 regs->eip = fixup;
918 return 0;
919 }
921 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
923 show_execution_state(regs);
924 show_page_walk(addr);
925 panic("FATAL PAGE FAULT\n"
926 "[error_code=%04x]\n"
927 "Faulting linear address: %p\n",
928 regs->error_code, _p(addr));
929 }
931 propagate_page_fault(addr, regs->error_code);
932 return 0;
933 }
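/*
 * Illustrative sketch (not part of this file): decoding the #PF error code
 * bits documented in the comment above, using the architectural layout the
 * PFEC_* flags in this file follow (assumed values: present=1, write=2,
 * user=4, reserved-bit=8, instruction-fetch=16).
 */
static void decode_pfec(unsigned int ec)
{
    printk("#PF ec=%04x: %s %s access, %s mode%s%s\n", ec,
           (ec & 1) ? "protection" : "not-present",   /* PFEC_page_present */
           (ec & 2) ? "write" : "read",               /* PFEC_write_access */
           (ec & 4) ? "user" : "supervisor",          /* PFEC_user_mode */
           (ec & 8) ? ", reserved bit set" : "",      /* PFEC_reserved_bit */
           (ec & 16) ? ", instruction fetch" : "");   /* PFEC_insn_fetch */
}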
935 /*
936 * Early handler to deal with spurious page faults. For example, consider a
937 * routine that uses a mapping immediately after installing it (making it
938 * present). The CPU may speculatively execute the memory access before
939 * executing the PTE write. The instruction will then be marked to cause a
940 * page fault when it is retired, despite the fact that the PTE is present and
941 * correct at that point in time.
942 */
943 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
944 {
945 static int stuck;
946 static unsigned long prev_eip, prev_cr2;
947 unsigned long cr2 = read_cr2();
949 BUG_ON(smp_processor_id() != 0);
951 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
952 {
953 prev_eip = regs->eip;
954 prev_cr2 = cr2;
955 stuck = 0;
956 return EXCRET_not_a_fault;
957 }
959 if ( stuck++ == 1000 )
960 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
961 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
963 return EXCRET_not_a_fault;
964 }
966 long do_fpu_taskswitch(int set)
967 {
968 struct vcpu *v = current;
970 if ( set )
971 {
972 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
973 stts();
974 }
975 else
976 {
977 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
978 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
979 clts();
980 }
982 return 0;
983 }
985 static int read_descriptor(unsigned int sel,
986 const struct vcpu *v,
987 const struct cpu_user_regs * regs,
988 unsigned long *base,
989 unsigned long *limit,
990 unsigned int *ar,
991 unsigned int vm86attr)
992 {
993 struct desc_struct desc;
995 if ( !vm86_mode(regs) )
996 {
997 if ( sel < 4)
998 desc.b = desc.a = 0;
999 else if ( __get_user(desc,
1000 (const struct desc_struct *)(!(sel & 4)
1001 ? GDT_VIRT_START(v)
1002 : LDT_VIRT_START(v))
1003 + (sel >> 3)) )
1004 return 0;
1005 if ( !(vm86attr & _SEGMENT_CODE) )
1006 desc.b &= ~_SEGMENT_L;
1008 else
1010 desc.a = (sel << 20) | 0xffff;
1011 desc.b = vm86attr | (sel >> 12);
1014 *ar = desc.b & 0x00f0ff00;
1015 if ( !(desc.b & _SEGMENT_L) )
1017 *base = (desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000);
1018 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1019 if ( desc.b & _SEGMENT_G )
1020 *limit = ((*limit + 1) << 12) - 1;
1021 #ifndef NDEBUG
1022 if ( !vm86_mode(regs) && sel > 3 )
1024 unsigned int a, l;
1025 unsigned char valid;
1027 __asm__("larl %2, %0\n\tsetz %1" : "=r" (a), "=rm" (valid) : "rm" (sel));
1028 BUG_ON(valid && (a & 0x00f0ff00) != *ar);
1029 __asm__("lsll %2, %0\n\tsetz %1" : "=r" (l), "=rm" (valid) : "rm" (sel));
1030 BUG_ON(valid && l != *limit);
1032 #endif
1034 else
1036 *base = 0UL;
1037 *limit = ~0UL;
1040 return 1;
1043 /* Has the guest requested sufficient permission for this I/O access? */
1044 static inline int guest_io_okay(
1045 unsigned int port, unsigned int bytes,
1046 struct vcpu *v, struct cpu_user_regs *regs)
1048 #if defined(__x86_64__)
1049 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1050 int user_mode = !(v->arch.flags & TF_kernel_mode);
1051 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1052 #elif defined(__i386__)
1053 #define TOGGLE_MODE() ((void)0)
1054 #endif
1056 if ( !vm86_mode(regs) &&
1057 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1058 return 1;
1060 if ( v->arch.iobmp_limit > (port + bytes) )
1062 union { uint8_t bytes[2]; uint16_t mask; } x;
1064 /*
1065 * Grab permission bytes from guest space. Inaccessible bytes are
1066 * read as 0xff (no access allowed).
1067 */
1068 TOGGLE_MODE();
1069 switch ( __copy_from_guest_offset(&x.bytes[0], v->arch.iobmp,
1070 port>>3, 2) )
1072 default: x.bytes[0] = ~0;
1073 case 1: x.bytes[1] = ~0;
1074 case 0: break;
1076 TOGGLE_MODE();
1078 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1079 return 1;
1082 return 0;
1085 /* Has the administrator granted sufficient permission for this I/O access? */
1086 static inline int admin_io_okay(
1087 unsigned int port, unsigned int bytes,
1088 struct vcpu *v, struct cpu_user_regs *regs)
1090 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1093 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1094 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1095 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1096 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1097 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1098 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1100 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1101 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1102 __attribute__((__regparm__(1)));
1103 unsigned long guest_to_host_gpr_switch(unsigned long)
1104 __attribute__((__regparm__(1)));
1106 /* Instruction fetch with error handling. */
1107 #define insn_fetch(type, base, eip, limit) \
1108 ({ unsigned long _rc, _ptr = (base) + (eip); \
1109 type _x; \
1110 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1111 goto fail; \
1112 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1113 { \
1114 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1115 return EXCRET_fault_fixed; \
1116 } \
1117 (eip) += sizeof(_x); _x; })
1119 #if defined(CONFIG_X86_32)
1120 # define read_sreg(regs, sr) ((regs)->sr)
1121 #elif defined(CONFIG_X86_64)
1122 # define read_sreg(regs, sr) read_segment_register(sr)
1123 #endif
1125 static int emulate_privileged_op(struct cpu_user_regs *regs)
1127 struct vcpu *v = current;
1128 unsigned long *reg, eip = regs->eip, res;
1129 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1130 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1131 unsigned int port, i, data_sel, ar, data, rc;
1132 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1133 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1134 ? regs->reg \
1135 : ad_bytes == 4 \
1136 ? (u32)regs->reg \
1137 : (u16)regs->reg)
1138 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1139 ? regs->reg = (val) \
1140 : ad_bytes == 4 \
1141 ? (*(u32 *)&regs->reg = (val)) \
1142 : (*(u16 *)&regs->reg = (val)))
1143 unsigned long code_base, code_limit;
1144 char io_emul_stub[16];
1145 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1146 u32 l, h;
1148 if ( !read_descriptor(regs->cs, v, regs,
1149 &code_base, &code_limit, &ar,
1150 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1151 goto fail;
1152 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1153 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1154 if ( !(ar & _SEGMENT_S) ||
1155 !(ar & _SEGMENT_P) ||
1156 !(ar & _SEGMENT_CODE) )
1157 goto fail;
1159 /* emulating only opcodes not allowing SS to be default */
1160 data_sel = read_sreg(regs, ds);
1162 /* Legacy prefixes. */
1163 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1165 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1167 case 0x66: /* operand-size override */
1168 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1169 continue;
1170 case 0x67: /* address-size override */
1171 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1172 continue;
1173 case 0x2e: /* CS override */
1174 data_sel = regs->cs;
1175 continue;
1176 case 0x3e: /* DS override */
1177 data_sel = read_sreg(regs, ds);
1178 continue;
1179 case 0x26: /* ES override */
1180 data_sel = read_sreg(regs, es);
1181 continue;
1182 case 0x64: /* FS override */
1183 data_sel = read_sreg(regs, fs);
1184 lm_ovr = lm_seg_fs;
1185 continue;
1186 case 0x65: /* GS override */
1187 data_sel = read_sreg(regs, gs);
1188 lm_ovr = lm_seg_gs;
1189 continue;
1190 case 0x36: /* SS override */
1191 data_sel = regs->ss;
1192 continue;
1193 case 0xf0: /* LOCK */
1194 lock = 1;
1195 continue;
1196 case 0xf2: /* REPNE/REPNZ */
1197 case 0xf3: /* REP/REPE/REPZ */
1198 rep_prefix = 1;
1199 continue;
1200 default:
1201 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1203 rex = opcode;
1204 continue;
1206 break;
1208 break;
1211 /* REX prefix. */
1212 if ( rex & 8 ) /* REX.W */
1213 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1214 modrm_reg = (rex & 4) << 1; /* REX.R */
1215 /* REX.X does not need to be decoded. */
1216 modrm_rm = (rex & 1) << 3; /* REX.B */
1218 if ( opcode == 0x0f )
1219 goto twobyte_opcode;
1221 if ( lock )
1222 goto fail;
1224 /* Input/Output String instructions. */
1225 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1227 unsigned long data_base, data_limit;
1229 if ( rep_prefix && (rd_ad(ecx) == 0) )
1230 goto done;
1232 if ( !(opcode & 2) )
1234 data_sel = read_sreg(regs, es);
1235 lm_ovr = lm_seg_none;
1238 if ( !(ar & _SEGMENT_L) )
1240 if ( !read_descriptor(data_sel, v, regs,
1241 &data_base, &data_limit, &ar,
1242 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1243 goto fail;
1244 if ( !(ar & _SEGMENT_S) ||
1245 !(ar & _SEGMENT_P) ||
1246 (opcode & 2 ?
1247 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1248 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1249 goto fail;
1251 #ifdef CONFIG_X86_64
1252 else
1254 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1256 switch ( lm_ovr )
1258 case lm_seg_none:
1259 data_base = 0UL;
1260 break;
1261 case lm_seg_fs:
1262 data_base = v->arch.guest_context.fs_base;
1263 break;
1264 case lm_seg_gs:
1265 if ( guest_kernel_mode(v, regs) )
1266 data_base = v->arch.guest_context.gs_base_kernel;
1267 else
1268 data_base = v->arch.guest_context.gs_base_user;
1269 break;
1272 else
1273 read_descriptor(data_sel, v, regs,
1274 &data_base, &data_limit, &ar,
1275 0);
1276 data_limit = ~0UL;
1277 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1279 #endif
1281 continue_io_string:
1282 switch ( opcode )
1284 case 0x6c: /* INSB */
1285 op_bytes = 1;
1286 case 0x6d: /* INSW/INSL */
1287 if ( data_limit < op_bytes - 1 ||
1288 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1289 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1290 goto fail;
1291 port = (u16)regs->edx;
1292 switch ( op_bytes )
1294 case 1:
1295 /* emulate PIT counter 2 */
1296 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1297 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1298 pv_pit_handler(port, 0, 0) : ~0));
1299 break;
1300 case 2:
1301 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1302 break;
1303 case 4:
1304 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1305 break;
1307 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1309 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1310 PFEC_write_access);
1311 return EXCRET_fault_fixed;
1313 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1314 break;
1316 case 0x6e: /* OUTSB */
1317 op_bytes = 1;
1318 case 0x6f: /* OUTSW/OUTSL */
1319 if ( data_limit < op_bytes - 1 ||
1320 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1321 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1322 goto fail;
1323 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1324 if ( rc != 0 )
1326 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1327 return EXCRET_fault_fixed;
1329 port = (u16)regs->edx;
1330 switch ( op_bytes )
1332 case 1:
1333 if ( guest_outb_okay(port, v, regs) )
1334 outb((u8)data, port);
1335 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1336 pv_pit_handler(port, data, 1);
1337 break;
1338 case 2:
1339 if ( guest_outw_okay(port, v, regs) )
1340 outw((u16)data, port);
1341 break;
1342 case 4:
1343 if ( guest_outl_okay(port, v, regs) )
1344 outl((u32)data, port);
1345 break;
1347 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1348 break;
1351 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1353 if ( !hypercall_preempt_check() )
1354 goto continue_io_string;
1355 eip = regs->eip;
1358 goto done;
1361 /*
1362 * Very likely to be an I/O instruction (IN/OUT).
1363 * Build an on-stack stub to execute the instruction with full guest
1364 * GPR context. This is needed for some systems which (ab)use IN/OUT
1365 * to communicate with BIOS code in system-management mode.
1366 */
1367 /* call host_to_guest_gpr_switch */
1368 io_emul_stub[0] = 0xe8;
1369 *(s32 *)&io_emul_stub[1] =
1370 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1371 /* data16 or nop */
1372 io_emul_stub[5] = (op_bytes != 2) ? 0x90 : 0x66;
1373 /* <io-access opcode> */
1374 io_emul_stub[6] = opcode;
1375 /* imm8 or nop */
1376 io_emul_stub[7] = 0x90;
1377 /* jmp guest_to_host_gpr_switch */
1378 io_emul_stub[8] = 0xe9;
1379 *(s32 *)&io_emul_stub[9] =
1380 (char *)guest_to_host_gpr_switch - &io_emul_stub[13];
1382 /* Handy function-typed pointer to the stub. */
1383 io_emul = (void *)io_emul_stub;
1385 /* I/O Port and Interrupt Flag instructions. */
1386 switch ( opcode )
1388 case 0xe4: /* IN imm8,%al */
1389 op_bytes = 1;
1390 case 0xe5: /* IN imm8,%eax */
1391 port = insn_fetch(u8, code_base, eip, code_limit);
1392 io_emul_stub[7] = port; /* imm8 */
1393 exec_in:
1394 if ( !guest_io_okay(port, op_bytes, v, regs) )
1395 goto fail;
1396 switch ( op_bytes )
1398 case 1:
1399 if ( guest_inb_okay(port, v, regs) )
1400 io_emul(regs);
1401 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1403 regs->eax &= ~0xffUL;
1404 regs->eax |= pv_pit_handler(port, 0, 0);
1406 else
1407 regs->eax |= (u8)~0;
1408 break;
1409 case 2:
1410 if ( guest_inw_okay(port, v, regs) )
1411 io_emul(regs);
1412 else
1413 regs->eax |= (u16)~0;
1414 break;
1415 case 4:
1416 if ( guest_inl_okay(port, v, regs) )
1417 io_emul(regs);
1418 else
1419 regs->eax = (u32)~0;
1420 break;
1422 goto done;
1424 case 0xec: /* IN %dx,%al */
1425 op_bytes = 1;
1426 case 0xed: /* IN %dx,%eax */
1427 port = (u16)regs->edx;
1428 goto exec_in;
1430 case 0xe6: /* OUT %al,imm8 */
1431 op_bytes = 1;
1432 case 0xe7: /* OUT %eax,imm8 */
1433 port = insn_fetch(u8, code_base, eip, code_limit);
1434 io_emul_stub[7] = port; /* imm8 */
1435 exec_out:
1436 if ( !guest_io_okay(port, op_bytes, v, regs) )
1437 goto fail;
1438 switch ( op_bytes )
1440 case 1:
1441 if ( guest_outb_okay(port, v, regs) )
1442 io_emul(regs);
1443 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1444 pv_pit_handler(port, regs->eax, 1);
1445 break;
1446 case 2:
1447 if ( guest_outw_okay(port, v, regs) )
1448 io_emul(regs);
1449 break;
1450 case 4:
1451 if ( guest_outl_okay(port, v, regs) )
1452 io_emul(regs);
1453 break;
1455 goto done;
1457 case 0xee: /* OUT %al,%dx */
1458 op_bytes = 1;
1459 case 0xef: /* OUT %eax,%dx */
1460 port = (u16)regs->edx;
1461 goto exec_out;
1463 case 0xfa: /* CLI */
1464 case 0xfb: /* STI */
1465 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1466 goto fail;
1467 /*
1468 * This is just too dangerous to allow, in my opinion. Consider if the
1469 * caller then tries to reenable interrupts using POPF: we can't trap
1470 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1471 * do for us. :-)
1472 */
1473 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1474 goto done;
1477 /* No decode of this single-byte opcode. */
1478 goto fail;
1480 twobyte_opcode:
1481 /* Two-byte opcodes only emulated from guest kernel. */
1482 if ( !guest_kernel_mode(v, regs) )
1483 goto fail;
1485 /* Privileged (ring 0) instructions. */
1486 opcode = insn_fetch(u8, code_base, eip, code_limit);
1487 if ( lock && (opcode & ~3) != 0x20 )
1488 goto fail;
1489 switch ( opcode )
1491 case 0x06: /* CLTS */
1492 (void)do_fpu_taskswitch(0);
1493 break;
1495 case 0x09: /* WBINVD */
1496 /* Ignore the instruction if unprivileged. */
1497 if ( !cache_flush_permitted(v->domain) )
1498 /* Non-physdev domain attempted WBINVD; ignore for now since
1499 newer linux uses this in some start-of-day timing loops */
1501 else
1502 wbinvd();
1503 break;
1505 case 0x20: /* MOV CR?,<reg> */
1506 opcode = insn_fetch(u8, code_base, eip, code_limit);
1507 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1508 modrm_rm |= (opcode >> 0) & 7;
1509 reg = decode_register(modrm_rm, regs, 0);
1510 switch ( modrm_reg )
1512 case 0: /* Read CR0 */
1513 *reg = (read_cr0() & ~X86_CR0_TS) |
1514 v->arch.guest_context.ctrlreg[0];
1515 break;
1517 case 2: /* Read CR2 */
1518 *reg = v->arch.guest_context.ctrlreg[2];
1519 break;
1521 case 3: /* Read CR3 */
1522 if ( !IS_COMPAT(v->domain) )
1523 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1524 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1525 #ifdef CONFIG_COMPAT
1526 else
1527 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1528 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1529 #endif
1530 break;
1532 case 4: /* Read CR4 */
1533 /*
1534 * Guests can read CR4 to see what features Xen has enabled. We
1535 * therefore lie about PGE & PSE as they are unavailable to guests.
1536 */
1537 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1538 break;
1540 default:
1541 goto fail;
1543 break;
1545 case 0x21: /* MOV DR?,<reg> */
1546 opcode = insn_fetch(u8, code_base, eip, code_limit);
1547 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1548 modrm_rm |= (opcode >> 0) & 7;
1549 reg = decode_register(modrm_rm, regs, 0);
1550 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1551 goto fail;
1552 *reg = res;
1553 break;
1555 case 0x22: /* MOV <reg>,CR? */
1556 opcode = insn_fetch(u8, code_base, eip, code_limit);
1557 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1558 modrm_rm |= (opcode >> 0) & 7;
1559 reg = decode_register(modrm_rm, regs, 0);
1560 switch ( modrm_reg )
1562 case 0: /* Write CR0 */
1563 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1565 gdprintk(XENLOG_WARNING,
1566 "Attempt to change unmodifiable CR0 flags.\n");
1567 goto fail;
1569 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1570 break;
1572 case 2: /* Write CR2 */
1573 v->arch.guest_context.ctrlreg[2] = *reg;
1574 arch_set_cr2(v, *reg);
1575 break;
1577 case 3: /* Write CR3 */
1578 LOCK_BIGLOCK(v->domain);
1579 if ( !IS_COMPAT(v->domain) )
1580 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1581 #ifdef CONFIG_COMPAT
1582 else
1583 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1584 #endif
1585 UNLOCK_BIGLOCK(v->domain);
1586 if ( rc == 0 ) /* not okay */
1587 goto fail;
1588 break;
1590 case 4:
1591 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1593 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1594 goto fail;
1596 break;
1598 default:
1599 goto fail;
1601 break;
1603 case 0x23: /* MOV <reg>,DR? */
1604 opcode = insn_fetch(u8, code_base, eip, code_limit);
1605 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1606 modrm_rm |= (opcode >> 0) & 7;
1607 reg = decode_register(modrm_rm, regs, 0);
1608 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1609 goto fail;
1610 break;
1612 case 0x30: /* WRMSR */
1613 switch ( regs->ecx )
1615 #ifdef CONFIG_X86_64
1616 case MSR_FS_BASE:
1617 if ( IS_COMPAT(v->domain) )
1618 goto fail;
1619 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1620 goto fail;
1621 v->arch.guest_context.fs_base =
1622 ((u64)regs->edx << 32) | regs->eax;
1623 break;
1624 case MSR_GS_BASE:
1625 if ( IS_COMPAT(v->domain) )
1626 goto fail;
1627 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1628 goto fail;
1629 v->arch.guest_context.gs_base_kernel =
1630 ((u64)regs->edx << 32) | regs->eax;
1631 break;
1632 case MSR_SHADOW_GS_BASE:
1633 if ( IS_COMPAT(v->domain) )
1634 goto fail;
1635 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1636 goto fail;
1637 v->arch.guest_context.gs_base_user =
1638 ((u64)regs->edx << 32) | regs->eax;
1639 break;
1640 #endif
1641 default:
1642 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1643 break;
1645 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1646 (regs->eax != l) || (regs->edx != h) )
1647 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1648 "%08x:%08x to %08lx:%08lx.\n",
1649 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1650 break;
1652 break;
1654 case 0x32: /* RDMSR */
1655 switch ( regs->ecx )
1657 #ifdef CONFIG_X86_64
1658 case MSR_FS_BASE:
1659 if ( IS_COMPAT(v->domain) )
1660 goto fail;
1661 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1662 regs->edx = v->arch.guest_context.fs_base >> 32;
1663 break;
1664 case MSR_GS_BASE:
1665 if ( IS_COMPAT(v->domain) )
1666 goto fail;
1667 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1668 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1669 break;
1670 case MSR_SHADOW_GS_BASE:
1671 if ( IS_COMPAT(v->domain) )
1672 goto fail;
1673 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1674 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1675 break;
1676 #endif
1677 case MSR_EFER:
1678 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1679 goto fail;
1680 break;
1681 default:
1682 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1684 regs->eax = l;
1685 regs->edx = h;
1686 break;
1688 /* Everyone can read the MSR space. */
1689 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1690 _p(regs->ecx));*/
1691 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1692 goto fail;
1693 break;
1695 break;
1697 default:
1698 goto fail;
1701 #undef wr_ad
1702 #undef rd_ad
1704 done:
1705 regs->eip = eip;
1706 return EXCRET_fault_fixed;
1708 fail:
1709 return 0;
1712 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1714 struct vcpu *v = current;
1715 unsigned long fixup;
1717 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1719 if ( regs->error_code & 1 )
1720 goto hardware_gp;
1722 if ( !guest_mode(regs) )
1723 goto gp_in_kernel;
1725 /*
1726 * Cunning trick to allow arbitrary "INT n" handling.
1728 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1729 * instruction from trapping to the appropriate vector, when that might not
1730 * be expected by Xen or the guest OS. For example, that entry might be for
1731 * a fault handler (unlike traps, faults don't increment EIP), or might
1732 * expect an error code on the stack (which a software trap never
1733 * provides), or might be a hardware interrupt handler that doesn't like
1734 * being called spuriously.
1736 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1737 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1738 * clear to indicate that it's a software fault, not hardware.
1740 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1741 * okay because they can only be triggered by an explicit DPL-checked
1742 * instruction. The DPL specified by the guest OS for these vectors is NOT
1743 * CHECKED!!
1744 */
1745 if ( (regs->error_code & 3) == 2 )
1747 /* This fault must be due to <INT n> instruction. */
1748 const struct trap_info *ti;
1749 unsigned char vector = regs->error_code >> 3;
1750 ti = &v->arch.guest_context.trap_ctxt[vector];
1751 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1753 regs->eip += 2;
1754 return do_guest_trap(vector, regs, 0);
1758 /* Emulate some simple privileged and I/O instructions. */
1759 if ( (regs->error_code == 0) &&
1760 emulate_privileged_op(regs) )
1761 return 0;
1763 #if defined(__i386__)
1764 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1765 (regs->error_code == 0) &&
1766 gpf_emulate_4gb(regs) )
1767 return 0;
1768 #endif
1770 /* Pass on GPF as is. */
1771 return do_guest_trap(TRAP_gp_fault, regs, 1);
1773 gp_in_kernel:
1775 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1777 dprintk(XENLOG_WARNING, "GPF (%04x): %p -> %p\n",
1778 regs->error_code, _p(regs->eip), _p(fixup));
1779 regs->eip = fixup;
1780 return 0;
1783 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1785 hardware_gp:
1786 show_execution_state(regs);
1787 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1788 return 0;
1791 static void nmi_softirq(void)
1792 {
1793 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1794 vcpu_kick(dom0->vcpu[0]);
1795 }
1797 static void nmi_dom0_report(unsigned int reason_idx)
1799 struct domain *d;
1800 struct vcpu *v;
1802 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1803 return;
1805 set_bit(reason_idx, nmi_reason(d));
1807 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1808 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1811 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1813 switch ( opt_nmi[0] )
1815 case 'd': /* 'dom0' */
1816 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1817 case 'i': /* 'ignore' */
1818 break;
1819 default: /* 'fatal' */
1820 console_force_unlock();
1821 printk("\n\nNMI - MEMORY ERROR\n");
1822 fatal_trap(TRAP_nmi, regs);
1825 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1826 mdelay(1);
1827 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1830 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1832 switch ( opt_nmi[0] )
1834 case 'd': /* 'dom0' */
1835 nmi_dom0_report(_XEN_NMIREASON_io_error);
1836 case 'i': /* 'ignore' */
1837 break;
1838 default: /* 'fatal' */
1839 console_force_unlock();
1840 printk("\n\nNMI - I/O ERROR\n");
1841 fatal_trap(TRAP_nmi, regs);
1844 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1845 mdelay(1);
1846 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1849 static void unknown_nmi_error(unsigned char reason)
1851 switch ( opt_nmi[0] )
1853 case 'd': /* 'dom0' */
1854 nmi_dom0_report(_XEN_NMIREASON_unknown);
1855 case 'i': /* 'ignore' */
1856 break;
1857 default: /* 'fatal' */
1858 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1859 printk("Dazed and confused, but trying to continue\n");
1860 printk("Do you have a strange power saving mode enabled?\n");
1861 kexec_crash();
1865 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1866 {
1867 return 0;
1868 }
1870 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1872 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1874 unsigned int cpu = smp_processor_id();
1875 unsigned char reason;
1877 ++nmi_count(cpu);
1879 if ( nmi_callback(regs, cpu) )
1880 return;
1882 if ( nmi_watchdog )
1883 nmi_watchdog_tick(regs);
1885 /* Only the BSP gets external NMIs from the system. */
1886 if ( cpu == 0 )
1888 reason = inb(0x61);
1889 if ( reason & 0x80 )
1890 mem_parity_error(regs);
1891 else if ( reason & 0x40 )
1892 io_check_error(regs);
1893 else if ( !nmi_watchdog )
1894 unknown_nmi_error((unsigned char)(reason&0xff));
1898 void set_nmi_callback(nmi_callback_t callback)
1899 {
1900 nmi_callback = callback;
1901 }
1903 void unset_nmi_callback(void)
1904 {
1905 nmi_callback = dummy_nmi_callback;
1906 }
1908 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1910 setup_fpu(current);
1912 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1914 do_guest_trap(TRAP_no_device, regs, 0);
1915 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1918 return EXCRET_fault_fixed;
1921 asmlinkage int do_debug(struct cpu_user_regs *regs)
1923 unsigned long condition;
1924 struct vcpu *v = current;
1926 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1928 /* Mask out spurious debug traps due to lazy DR7 setting */
1929 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1930 (v->arch.guest_context.debugreg[7] == 0) )
1932 __asm__("mov %0,%%db7" : : "r" (0UL));
1933 goto out;
1936 DEBUGGER_trap_entry(TRAP_debug, regs);
1938 if ( !guest_mode(regs) )
1940 /* Clear TF just for absolute sanity. */
1941 regs->eflags &= ~EF_TF;
1942 /*
1943 * We ignore watchpoints when they trigger within Xen. This may happen
1944 * when a buffer is passed to us which previously had a watchpoint set
1945 * on it. No need to bump EIP; the only faulting trap is an instruction
1946 * breakpoint, which can't happen to us.
1947 */
1948 goto out;
1951 /* Save debug status register where guest OS can peek at it */
1952 v->arch.guest_context.debugreg[6] = condition;
1954 return do_guest_trap(TRAP_debug, regs, 0);
1956 out:
1957 return EXCRET_not_a_fault;
1960 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1961 {
1962 return EXCRET_not_a_fault;
1963 }
1965 void set_intr_gate(unsigned int n, void *addr)
1966 {
1967 #ifdef __i386__
1968 int i;
1969 /* Keep secondary tables in sync with IRQ updates. */
1970 for ( i = 1; i < NR_CPUS; i++ )
1971 if ( idt_tables[i] != NULL )
1972 _set_gate(&idt_tables[i][n], 14, 0, addr);
1973 #endif
1974 _set_gate(&idt_table[n], 14, 0, addr);
1975 }
1977 void set_system_gate(unsigned int n, void *addr)
1978 {
1979 _set_gate(idt_table+n,14,3,addr);
1980 }
1982 void set_task_gate(unsigned int n, unsigned int sel)
1983 {
1984 idt_table[n].a = sel << 16;
1985 idt_table[n].b = 0x8500;
1986 }
1988 void set_tss_desc(unsigned int n, void *addr)
1989 {
1990 _set_tssldt_desc(
1991 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1992 (unsigned long)addr,
1993 offsetof(struct tss_struct, __cacheline_filler) - 1,
1994 9);
1995 #ifdef CONFIG_COMPAT
1996 _set_tssldt_desc(
1997 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1998 (unsigned long)addr,
1999 offsetof(struct tss_struct, __cacheline_filler) - 1,
2000 11);
2001 #endif
2002 }
2004 void __init trap_init(void)
2006 extern void percpu_traps_init(void);
2008 /*
2009 * Note that interrupt gates are always used, rather than trap gates. We
2010 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2011 * first activation must have the "bad" value(s) for these registers and
2012 * we may lose them if another activation is installed before they are
2013 * saved. The page-fault handler also needs interrupts disabled until %cr2
2014 * has been read and saved on the stack.
2015 */
2016 set_intr_gate(TRAP_divide_error,&divide_error);
2017 set_intr_gate(TRAP_debug,&debug);
2018 set_intr_gate(TRAP_nmi,&nmi);
2019 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2020 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2021 set_intr_gate(TRAP_bounds,&bounds);
2022 set_intr_gate(TRAP_invalid_op,&invalid_op);
2023 set_intr_gate(TRAP_no_device,&device_not_available);
2024 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2025 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2026 set_intr_gate(TRAP_no_segment,&segment_not_present);
2027 set_intr_gate(TRAP_stack_error,&stack_segment);
2028 set_intr_gate(TRAP_gp_fault,&general_protection);
2029 set_intr_gate(TRAP_page_fault,&page_fault);
2030 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2031 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2032 set_intr_gate(TRAP_alignment_check,&alignment_check);
2033 set_intr_gate(TRAP_machine_check,&machine_check);
2034 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2036 percpu_traps_init();
2038 cpu_init();
2040 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2044 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2046 struct trap_info cur;
2047 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2048 long rc = 0;
2050 /* If no table is presented then clear the entire virtual IDT. */
2051 if ( guest_handle_is_null(traps) )
2053 memset(dst, 0, 256 * sizeof(*dst));
2054 init_int80_direct_trap(current);
2055 return 0;
2058 for ( ; ; )
2060 if ( hypercall_preempt_check() )
2062 rc = hypercall_create_continuation(
2063 __HYPERVISOR_set_trap_table, "h", traps);
2064 break;
2067 if ( copy_from_guest(&cur, traps, 1) )
2069 rc = -EFAULT;
2070 break;
2073 if ( cur.address == 0 )
2074 break;
2076 fixup_guest_code_selector(current->domain, cur.cs);
2078 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2080 if ( cur.vector == 0x80 )
2081 init_int80_direct_trap(current);
2083 guest_handle_add_offset(traps, 1);
2086 return rc;
2090 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2092 int i;
2094 switch ( reg )
2096 case 0:
2097 if ( !access_ok(value, sizeof(long)) )
2098 return -EPERM;
2099 if ( p == current )
2100 __asm__ ( "mov %0, %%db0" : : "r" (value) );
2101 break;
2102 case 1:
2103 if ( !access_ok(value, sizeof(long)) )
2104 return -EPERM;
2105 if ( p == current )
2106 __asm__ ( "mov %0, %%db1" : : "r" (value) );
2107 break;
2108 case 2:
2109 if ( !access_ok(value, sizeof(long)) )
2110 return -EPERM;
2111 if ( p == current )
2112 __asm__ ( "mov %0, %%db2" : : "r" (value) );
2113 break;
2114 case 3:
2115 if ( !access_ok(value, sizeof(long)) )
2116 return -EPERM;
2117 if ( p == current )
2118 __asm__ ( "mov %0, %%db3" : : "r" (value) );
2119 break;
2120 case 6:
2121 /*
2122 * DR6: Bits 4-11,16-31 reserved (set to 1).
2123 * Bit 12 reserved (set to 0).
2124 */
2125 value &= 0xffffefff; /* reserved bits => 0 */
2126 value |= 0xffff0ff0; /* reserved bits => 1 */
2127 if ( p == current )
2128 __asm__ ( "mov %0, %%db6" : : "r" (value) );
2129 break;
2130 case 7:
2131 /*
2132 * DR7: Bit 10 reserved (set to 1).
2133 * Bits 11-12,14-15 reserved (set to 0).
2134 * Privileged bits:
2135 * GD (bit 13): must be 0.
2136 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2137 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2138 */
2139 /* DR7 == 0 => debugging disabled for this domain. */
2140 if ( value != 0 )
2142 value &= 0xffff27ff; /* reserved bits => 0 */
2143 value |= 0x00000400; /* reserved bits => 1 */
2144 if ( (value & (1<<13)) != 0 ) return -EPERM;
2145 for ( i = 0; i < 16; i += 2 )
2146 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2148 if ( p == current )
2149 __asm__ ( "mov %0, %%db7" : : "r" (value) );
2150 break;
2151 default:
2152 return -EINVAL;
2155 p->arch.guest_context.debugreg[reg] = value;
2156 return 0;
2159 long do_set_debugreg(int reg, unsigned long value)
2160 {
2161 return set_debugreg(current, reg, value);
2162 }
2164 unsigned long do_get_debugreg(int reg)
2165 {
2166 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2167 return current->arch.guest_context.debugreg[reg];
2168 }
2170 /*
2171 * Local variables:
2172 * mode: C
2173 * c-set-style: "BSD"
2174 * c-basic-offset: 4
2175 * tab-width: 4
2176 * indent-tabs-mode: nil
2177 * End:
2178 */