ia64/xen-unstable: xen/arch/x86/traps.c @ 16643:35ab2bb25e09

changeset:  vmx: Do not set bit 1 of FEATURE_CONTROL MSR if SMX is not supported
            by the CPU. Also generally beef up robustness of VMXON instruction.
            Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author:     Keir Fraser <keir.fraser@citrix.com>
date:       Wed Dec 19 15:51:01 2007 +0000 (2007-12-19)
parents:    136f80d21958
children:   9bf8b152df9f
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
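/*
 * Illustrative usage (not part of the original file): the option is set on
 * the Xen command line, e.g. "nmi=ignore" or "nmi=fatal"; the default is
 * "dom0" for release (NDEBUG) builds and "fatal" for debug builds, as
 * initialised above.
 */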
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
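/*
 * Each DECLARE_TRAP_HANDLER(name) below declares two symbols: the assembly
 * entry stub name() and the C-level handler do_name(), which receives a
 * pointer to the register frame saved by the stub.
 */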
91 DECLARE_TRAP_HANDLER(divide_error);
92 DECLARE_TRAP_HANDLER(debug);
93 DECLARE_TRAP_HANDLER(nmi);
94 DECLARE_TRAP_HANDLER(int3);
95 DECLARE_TRAP_HANDLER(overflow);
96 DECLARE_TRAP_HANDLER(bounds);
97 DECLARE_TRAP_HANDLER(invalid_op);
98 DECLARE_TRAP_HANDLER(device_not_available);
99 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
100 DECLARE_TRAP_HANDLER(invalid_TSS);
101 DECLARE_TRAP_HANDLER(segment_not_present);
102 DECLARE_TRAP_HANDLER(stack_segment);
103 DECLARE_TRAP_HANDLER(general_protection);
104 DECLARE_TRAP_HANDLER(page_fault);
105 DECLARE_TRAP_HANDLER(coprocessor_error);
106 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
107 DECLARE_TRAP_HANDLER(machine_check);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
113 void (*ioemul_handle_quirk)(
114 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
116 static int debug_stack_lines = 20;
117 integer_param("debug_stack_lines", debug_stack_lines);
119 static int opt_ler;
120 boolean_param("ler", opt_ler);
122 #ifdef CONFIG_X86_32
123 #define stack_words_per_line 8
124 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
125 #else
126 #define stack_words_per_line 4
127 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
128 #endif
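/*
 * Rationale for the two definitions above: on x86-32 a fault taken in ring 0
 * does not push SS:ESP, so the stack pointer at the time of the exception is
 * the address of the esp slot in the frame itself; on x86-64 RSP is always
 * pushed and can be read directly from the frame.
 */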
130 static void show_guest_stack(struct cpu_user_regs *regs)
131 {
132 int i;
133 struct vcpu *curr = current;
134 unsigned long *stack, addr;
136 if ( is_hvm_vcpu(curr) )
137 return;
139 if ( is_pv_32on64_vcpu(curr) )
140 {
141 compat_show_guest_stack(regs, debug_stack_lines);
142 return;
143 }
145 if ( vm86_mode(regs) )
146 {
147 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
148 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
149 regs->ss, (uint16_t)(regs->esp & 0xffff));
150 }
151 else
152 {
153 stack = (unsigned long *)regs->esp;
154 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
155 }
157 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
158 {
159 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
160 break;
161 if ( get_user(addr, stack) )
162 {
163 if ( i != 0 )
164 printk("\n ");
165 printk("Fault while accessing guest memory.");
166 i = 1;
167 break;
168 }
169 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
170 printk("\n ");
171 printk(" %p", _p(addr));
172 stack++;
173 }
174 if ( i == 0 )
175 printk("Stack empty.");
176 printk("\n");
177 }
179 #if !defined(CONFIG_FRAME_POINTER)
181 static void show_trace(struct cpu_user_regs *regs)
182 {
183 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
185 printk("Xen call trace:\n ");
187 printk("[<%p>]", _p(regs->eip));
188 print_symbol(" %s\n ", regs->eip);
190 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
191 {
192 addr = *stack++;
193 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
194 {
195 printk("[<%p>]", _p(addr));
196 print_symbol(" %s\n ", addr);
197 }
198 }
200 printk("\n");
201 }
203 #else
205 static void show_trace(struct cpu_user_regs *regs)
206 {
207 unsigned long *frame, next, addr, low, high;
209 printk("Xen call trace:\n ");
211 printk("[<%p>]", _p(regs->eip));
212 print_symbol(" %s\n ", regs->eip);
214 /* Bounds for range of valid frame pointer. */
215 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
216 high = (low & ~(STACK_SIZE - 1)) +
217 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
219 /* The initial frame pointer. */
220 next = regs->ebp;
222 for ( ; ; )
223 {
224 /* Valid frame pointer? */
225 if ( (next < low) || (next >= high) )
226 {
227 /*
228 * Exception stack frames have a different layout, denoted by an
229 * inverted frame pointer.
230 */
231 next = ~next;
232 if ( (next < low) || (next >= high) )
233 break;
234 frame = (unsigned long *)next;
235 next = frame[0];
236 addr = frame[(offsetof(struct cpu_user_regs, eip) -
237 offsetof(struct cpu_user_regs, ebp))
238 / BYTES_PER_LONG];
239 }
240 else
241 {
242 /* Ordinary stack frame. */
243 frame = (unsigned long *)next;
244 next = frame[0];
245 addr = frame[1];
246 }
248 printk("[<%p>]", _p(addr));
249 print_symbol(" %s\n ", addr);
251 low = (unsigned long)&frame[2];
252 }
254 printk("\n");
255 }
257 #endif
259 void show_stack(struct cpu_user_regs *regs)
260 {
261 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
262 int i;
264 if ( guest_mode(regs) )
265 return show_guest_stack(regs);
267 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
269 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
270 {
271 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
272 break;
273 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
274 printk("\n ");
275 addr = *stack++;
276 printk(" %p", _p(addr));
277 }
278 if ( i == 0 )
279 printk("Stack empty.");
280 printk("\n");
282 show_trace(regs);
283 }
285 void show_stack_overflow(unsigned int cpu, unsigned long esp)
286 {
287 #ifdef MEMORY_GUARD
288 unsigned long esp_top, esp_bottom;
289 unsigned long *stack, addr;
291 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
292 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
294 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
295 (void *)esp_top, (void *)esp_bottom, (void *)esp,
296 (void *)init_tss[cpu].esp0);
298 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
299 if ( ((unsigned long)(esp - esp_top) > 512) &&
300 ((unsigned long)(esp_top - esp) > 512) )
301 {
302 printk("No stack overflow detected. Skipping stack trace.\n");
303 return;
304 }
306 if ( esp < esp_top )
307 esp = esp_top;
309 printk("Xen stack overflow (dumping trace %p-%p):\n ",
310 (void *)esp, (void *)esp_bottom);
312 stack = (unsigned long *)esp;
313 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
314 {
315 addr = *stack++;
316 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
317 {
318 printk("%p: [<%p>]", stack, _p(addr));
319 print_symbol(" %s\n ", addr);
320 }
321 }
323 printk("\n");
324 #endif
325 }
327 void show_execution_state(struct cpu_user_regs *regs)
328 {
329 show_registers(regs);
330 show_stack(regs);
331 }
333 char *trapstr(int trapnr)
334 {
335 static char *strings[] = {
336 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
337 "invalid opcode", "device not available", "double fault",
338 "coprocessor segment", "invalid tss", "segment not found",
339 "stack error", "general protection fault", "page fault",
340 "spurious interrupt", "coprocessor error", "alignment check",
341 "machine check", "simd error"
342 };
344 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
345 return "???";
347 return strings[trapnr];
348 }
350 /*
351 * This is called for faults at very unexpected times (e.g., when interrupts
352 * are disabled). In such situations we can't do much that is safe. We try to
353 * print out some tracing and then we just spin.
354 */
355 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
356 {
357 static DEFINE_PER_CPU(char, depth);
359 /*
360 * In some cases, we can end up in a vicious cycle of fatal_trap()s
361 * within fatal_trap()s. We give the problem a couple of iterations to
362 * bottom out, and then we just panic.
363 */
364 if ( ++this_cpu(depth) < 3 )
365 {
366 watchdog_disable();
367 console_start_sync();
369 show_execution_state(regs);
371 if ( trapnr == TRAP_page_fault )
372 {
373 unsigned long cr2 = read_cr2();
374 printk("Faulting linear address: %p\n", _p(cr2));
375 show_page_walk(cr2);
376 }
377 }
379 panic("FATAL TRAP: vector = %d (%s)\n"
380 "[error_code=%04x] %s\n",
381 trapnr, trapstr(trapnr), regs->error_code,
382 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
383 }
385 static void do_guest_trap(
386 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
387 {
388 struct vcpu *v = current;
389 struct trap_bounce *tb;
390 const struct trap_info *ti;
392 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
394 tb = &v->arch.trap_bounce;
395 ti = &v->arch.guest_context.trap_ctxt[trapnr];
397 tb->flags = TBF_EXCEPTION;
398 tb->cs = ti->cs;
399 tb->eip = ti->address;
401 if ( use_error_code )
402 {
403 tb->flags |= TBF_EXCEPTION_ERRCODE;
404 tb->error_code = regs->error_code;
405 }
407 if ( TI_GET_IF(ti) )
408 tb->flags |= TBF_INTERRUPT;
410 if ( unlikely(null_trap_bounce(v, tb)) )
411 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
412 "on VCPU %d [ec=%04x]\n",
413 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
414 }
416 static void instruction_done(
417 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
418 {
419 regs->eip = eip;
420 regs->eflags &= ~X86_EFLAGS_RF;
421 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
422 {
423 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
424 if ( regs->eflags & X86_EFLAGS_TF )
425 current->arch.guest_context.debugreg[6] |= 0x4000;
426 do_guest_trap(TRAP_debug, regs, 0);
427 }
428 }
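/*
 * check_guest_io_breakpoint() returns a DR6-style mask (bits 0-3) of the
 * guest's enabled debug breakpoints that overlap the given I/O port range.
 * Callers pass the result to instruction_done(), which merges it into the
 * guest's virtual DR6.  I/O breakpoints are only honoured when the guest has
 * enabled them and has set CR4.DE, as checked below.
 */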
430 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
431 unsigned int port, unsigned int len)
432 {
433 unsigned int width, i, match = 0;
434 unsigned long start;
436 if ( !(v->arch.guest_context.debugreg[5]) ||
437 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
438 return 0;
440 for ( i = 0; i < 4; i++ )
441 {
442 if ( !(v->arch.guest_context.debugreg[5] &
443 (3 << (i * DR_ENABLE_SIZE))) )
444 continue;
446 start = v->arch.guest_context.debugreg[i];
447 width = 0;
449 switch ( (v->arch.guest_context.debugreg[7] >>
450 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
451 {
452 case DR_LEN_1: width = 1; break;
453 case DR_LEN_2: width = 2; break;
454 case DR_LEN_4: width = 4; break;
455 case DR_LEN_8: width = 8; break;
456 }
458 if ( (start < (port + len)) && ((start + width) > port) )
459 match |= 1 << i;
460 }
462 return match;
463 }
465 /*
466 * Called from asm to set up the NMI trapbounce info.
467 * Returns 0 if no callback is set up, else 1.
468 */
469 asmlinkage int set_guest_nmi_trapbounce(void)
470 {
471 struct vcpu *v = current;
472 struct trap_bounce *tb = &v->arch.trap_bounce;
473 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
474 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
475 return !null_trap_bounce(v, tb);
476 }
478 static inline void do_trap(
479 int trapnr, struct cpu_user_regs *regs, int use_error_code)
480 {
481 unsigned long fixup;
483 DEBUGGER_trap_entry(trapnr, regs);
485 if ( guest_mode(regs) )
486 {
487 do_guest_trap(trapnr, regs, use_error_code);
488 return;
489 }
491 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
492 {
493 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
494 trapnr, _p(regs->eip), _p(fixup));
495 regs->eip = fixup;
496 return;
497 }
499 DEBUGGER_trap_fatal(trapnr, regs);
501 show_execution_state(regs);
502 panic("FATAL TRAP: vector = %d (%s)\n"
503 "[error_code=%04x]\n",
504 trapnr, trapstr(trapnr), regs->error_code);
505 }
507 #define DO_ERROR_NOCODE(trapnr, name) \
508 asmlinkage void do_##name(struct cpu_user_regs *regs) \
509 { \
510 do_trap(trapnr, regs, 0); \
511 }
513 #define DO_ERROR(trapnr, name) \
514 asmlinkage void do_##name(struct cpu_user_regs *regs) \
515 { \
516 do_trap(trapnr, regs, 1); \
517 }
519 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
520 DO_ERROR_NOCODE(TRAP_overflow, overflow)
521 DO_ERROR_NOCODE(TRAP_bounds, bounds)
522 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
523 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
524 DO_ERROR( TRAP_no_segment, segment_not_present)
525 DO_ERROR( TRAP_stack_error, stack_segment)
526 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
527 DO_ERROR( TRAP_alignment_check, alignment_check)
528 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
530 int rdmsr_hypervisor_regs(
531 uint32_t idx, uint32_t *eax, uint32_t *edx)
532 {
533 idx -= 0x40000000;
534 if ( idx > 0 )
535 return 0;
537 switch ( idx )
538 {
539 case 0:
540 {
541 *eax = *edx = 0;
542 break;
543 }
544 default:
545 BUG();
546 }
548 return 1;
549 }
551 int wrmsr_hypervisor_regs(
552 uint32_t idx, uint32_t eax, uint32_t edx)
553 {
554 struct domain *d = current->domain;
556 idx -= 0x40000000;
557 if ( idx > 0 )
558 return 0;
560 switch ( idx )
561 {
562 case 0:
563 {
564 void *hypercall_page;
565 unsigned long mfn;
566 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
567 unsigned int idx = eax & 0xfff;
569 if ( idx > 0 )
570 {
571 gdprintk(XENLOG_WARNING,
572 "Out of range index %u to MSR %08x\n",
573 idx, 0x40000000);
574 return 0;
575 }
577 mfn = gmfn_to_mfn(d, gmfn);
579 if ( !mfn_valid(mfn) ||
580 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
581 {
582 gdprintk(XENLOG_WARNING,
583 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
584 gmfn, mfn, 0x40000000);
585 return 0;
586 }
588 hypercall_page = map_domain_page(mfn);
589 hypercall_page_initialise(d, hypercall_page);
590 unmap_domain_page(hypercall_page);
592 put_page_and_type(mfn_to_page(mfn));
593 break;
594 }
596 default:
597 BUG();
598 }
600 return 1;
601 }
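/*
 * Illustrative guest-side sequence (not part of this file): a guest installs
 * its hypercall page by writing the page's guest-physical address to MSR
 * 0x40000000, e.g. wrmsr(0x40000000, gpa).  The low 12 bits of the value
 * select the page index (only index 0 exists) and the remaining bits give
 * the guest frame number, matching the decoding of eax/edx above.
 */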
603 int cpuid_hypervisor_leaves(
604 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
605 {
606 idx -= 0x40000000;
607 if ( idx > 2 )
608 return 0;
610 switch ( idx )
611 {
612 case 0:
613 *eax = 0x40000002; /* Largest leaf */
614 *ebx = 0x566e6558; /* Signature 1: "XenV" */
615 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
616 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
617 break;
619 case 1:
620 *eax = (xen_major_version() << 16) | xen_minor_version();
621 *ebx = 0; /* Reserved */
622 *ecx = 0; /* Reserved */
623 *edx = 0; /* Reserved */
624 break;
626 case 2:
627 *eax = 1; /* Number of hypercall-transfer pages */
628 *ebx = 0x40000000; /* MSR base address */
629 *ecx = 0; /* Features 1 */
630 *edx = 0; /* Features 2 */
631 break;
633 default:
634 BUG();
635 }
637 return 1;
638 }
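/*
 * Illustrative guest-side detection (not part of this file): CPUID with
 * eax=0x40000000 returns the signature "XenVMMXenVMM" in ebx:ecx:edx and the
 * largest supported leaf (0x40000002) in eax; leaf 0x40000001 then gives the
 * Xen version, and leaf 0x40000002 the number of hypercall pages and the
 * hypercall MSR base address.
 */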
640 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
641 {
642 char sig[5], instr[2];
643 uint32_t a, b, c, d;
644 unsigned long eip, rc;
646 a = regs->eax;
647 b = regs->ebx;
648 c = regs->ecx;
649 d = regs->edx;
650 eip = regs->eip;
652 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
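/* That is, the guest executes the bytes 0f 0b ("ud2"), then the ASCII
 * characters 'x','e','n', followed by the instruction to emulate; currently
 * only 0f a2 (CPUID) is recognised below. */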
653 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
654 {
655 propagate_page_fault(eip + sizeof(sig) - rc, 0);
656 return EXCRET_fault_fixed;
657 }
658 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
659 return 0;
660 eip += sizeof(sig);
662 /* We only emulate CPUID. */
663 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
664 {
665 propagate_page_fault(eip + sizeof(instr) - rc, 0);
666 return EXCRET_fault_fixed;
667 }
668 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
669 return 0;
670 eip += sizeof(instr);
672 asm (
673 "cpuid"
674 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
675 : "0" (a), "1" (b), "2" (c), "3" (d) );
677 if ( regs->eax == 1 )
678 {
679 /* Modify Feature Information. */
680 __clear_bit(X86_FEATURE_VME, &d);
681 __clear_bit(X86_FEATURE_PSE, &d);
682 __clear_bit(X86_FEATURE_PGE, &d);
683 if ( !cpu_has_sep )
684 __clear_bit(X86_FEATURE_SEP, &d);
685 #ifdef __i386__
686 if ( !supervisor_mode_kernel )
687 __clear_bit(X86_FEATURE_SEP, &d);
688 #endif
689 if ( !IS_PRIV(current->domain) )
690 __clear_bit(X86_FEATURE_MTRR, &d);
691 }
692 else if ( regs->eax == 0x80000001 )
693 {
694 /* Modify Feature Information. */
695 #ifdef __i386__
696 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
697 #endif
698 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
699 }
700 else
701 {
702 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
703 }
705 regs->eax = a;
706 regs->ebx = b;
707 regs->ecx = c;
708 regs->edx = d;
710 instruction_done(regs, eip, 0);
712 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
714 return EXCRET_fault_fixed;
715 }
717 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
718 {
719 struct bug_frame bug;
720 struct bug_frame_str bug_str;
721 char *filename, *predicate, *eip = (char *)regs->eip;
722 unsigned long fixup;
723 int id, lineno;
725 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
727 if ( likely(guest_mode(regs)) )
728 {
729 if ( !emulate_forced_invalid_op(regs) )
730 do_guest_trap(TRAP_invalid_op, regs, 0);
731 return;
732 }
734 if ( !is_kernel(eip) ||
735 __copy_from_user(&bug, eip, sizeof(bug)) ||
736 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
737 (bug.ret != 0xc2) )
738 goto die;
739 eip += sizeof(bug);
741 id = bug.id & 3;
743 if ( id == BUGFRAME_dump )
744 {
745 show_execution_state(regs);
746 regs->eip = (unsigned long)eip;
747 return;
748 }
750 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
751 if ( !is_kernel(eip) ||
752 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
753 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
754 goto die;
755 eip += sizeof(bug_str);
757 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
758 lineno = bug.id >> 2;
760 if ( id == BUGFRAME_warn )
761 {
762 printk("Xen WARN at %.50s:%d\n", filename, lineno);
763 show_execution_state(regs);
764 regs->eip = (unsigned long)eip;
765 return;
766 }
768 if ( id == BUGFRAME_bug )
769 {
770 printk("Xen BUG at %.50s:%d\n", filename, lineno);
771 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
772 show_execution_state(regs);
773 panic("Xen BUG at %.50s:%d\n", filename, lineno);
774 }
776 /* ASSERT: decode the predicate string pointer. */
777 ASSERT(id == BUGFRAME_assert);
778 if ( !is_kernel(eip) ||
779 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
780 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
781 goto die;
782 eip += sizeof(bug_str);
784 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
785 printk("Assertion '%s' failed at %.50s:%d\n",
786 predicate, filename, lineno);
787 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
788 show_execution_state(regs);
789 panic("Assertion '%s' failed at %.50s:%d\n",
790 predicate, filename, lineno);
792 die:
793 if ( (fixup = search_exception_table(regs->eip)) != 0 )
794 {
795 regs->eip = fixup;
796 return;
797 }
798 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
799 show_execution_state(regs);
800 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
801 }
803 asmlinkage void do_int3(struct cpu_user_regs *regs)
804 {
805 DEBUGGER_trap_entry(TRAP_int3, regs);
807 if ( !guest_mode(regs) )
808 {
809 DEBUGGER_trap_fatal(TRAP_int3, regs);
810 show_execution_state(regs);
811 panic("FATAL TRAP: vector = 3 (Int3)\n");
812 }
814 do_guest_trap(TRAP_int3, regs, 0);
815 }
817 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
818 {
819 extern fastcall void (*machine_check_vector)(
820 struct cpu_user_regs *, long error_code);
821 machine_check_vector(regs, regs->error_code);
822 }
824 void propagate_page_fault(unsigned long addr, u16 error_code)
825 {
826 struct trap_info *ti;
827 struct vcpu *v = current;
828 struct trap_bounce *tb = &v->arch.trap_bounce;
830 v->arch.guest_context.ctrlreg[2] = addr;
831 arch_set_cr2(v, addr);
833 /* Re-set error_code.user flag appropriately for the guest. */
834 error_code &= ~PFEC_user_mode;
835 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
836 error_code |= PFEC_user_mode;
838 trace_pv_page_fault(addr, error_code);
840 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
841 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
842 tb->error_code = error_code;
843 tb->cs = ti->cs;
844 tb->eip = ti->address;
845 if ( TI_GET_IF(ti) )
846 tb->flags |= TBF_INTERRUPT;
847 if ( unlikely(null_trap_bounce(v, tb)) )
848 {
849 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
850 v->domain->domain_id, v->vcpu_id, error_code);
851 show_page_walk(addr);
852 }
853 }
855 static int handle_gdt_ldt_mapping_fault(
856 unsigned long offset, struct cpu_user_regs *regs)
857 {
858 struct vcpu *curr = current;
859 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
860 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
861 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
863 /* Should never fault in another vcpu's area. */
864 BUG_ON(vcpu_area != curr->vcpu_id);
866 /* Byte offset within the gdt/ldt sub-area. */
867 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
869 if ( likely(is_ldt_area) )
870 {
871 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
872 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
873 {
874 if ( guest_mode(regs) )
875 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
876 regs->eip, offset);
877 }
878 else
879 {
880 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
881 if ( !guest_mode(regs) )
882 return 0;
883 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
884 propagate_page_fault(
885 curr->arch.guest_context.ldt_base + offset,
886 regs->error_code);
887 }
888 }
889 else
890 {
891 /* GDT fault: handle the fault as #GP(selector). */
892 regs->error_code = (u16)offset & ~7;
893 (void)do_general_protection(regs);
894 }
896 return EXCRET_fault_fixed;
897 }
899 #ifdef HYPERVISOR_VIRT_END
900 #define IN_HYPERVISOR_RANGE(va) \
901 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
902 #else
903 #define IN_HYPERVISOR_RANGE(va) \
904 (((va) >= HYPERVISOR_VIRT_START))
905 #endif
907 static int __spurious_page_fault(
908 unsigned long addr, struct cpu_user_regs *regs)
909 {
910 unsigned long mfn, cr3 = read_cr3();
911 #if CONFIG_PAGING_LEVELS >= 4
912 l4_pgentry_t l4e, *l4t;
913 #endif
914 #if CONFIG_PAGING_LEVELS >= 3
915 l3_pgentry_t l3e, *l3t;
916 #endif
917 l2_pgentry_t l2e, *l2t;
918 l1_pgentry_t l1e, *l1t;
919 unsigned int required_flags, disallowed_flags;
921 /*
922 * We do not take spurious page faults in IRQ handlers as we do not
923 * modify page tables in IRQ context. We therefore bail here because
924 * map_domain_page() is not IRQ-safe.
925 */
926 if ( in_irq() )
927 return 0;
929 /* Reserved bit violations are never spurious faults. */
930 if ( regs->error_code & PFEC_reserved_bit )
931 return 0;
933 required_flags = _PAGE_PRESENT;
934 if ( regs->error_code & PFEC_write_access )
935 required_flags |= _PAGE_RW;
936 if ( regs->error_code & PFEC_user_mode )
937 required_flags |= _PAGE_USER;
939 disallowed_flags = 0;
940 if ( regs->error_code & PFEC_insn_fetch )
941 disallowed_flags |= _PAGE_NX;
943 mfn = cr3 >> PAGE_SHIFT;
945 #if CONFIG_PAGING_LEVELS >= 4
946 l4t = map_domain_page(mfn);
947 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
948 mfn = l4e_get_pfn(l4e);
949 unmap_domain_page(l4t);
950 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
951 (l4e_get_flags(l4e) & disallowed_flags) )
952 return 0;
953 #endif
955 #if CONFIG_PAGING_LEVELS >= 3
956 l3t = map_domain_page(mfn);
957 #ifdef CONFIG_X86_PAE
958 l3t += (cr3 & 0xFE0UL) >> 3;
959 #endif
960 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
961 mfn = l3e_get_pfn(l3e);
962 unmap_domain_page(l3t);
963 #ifdef CONFIG_X86_PAE
964 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
965 return 0;
966 #else
967 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
968 (l3e_get_flags(l3e) & disallowed_flags) )
969 return 0;
970 #endif
971 #endif
973 l2t = map_domain_page(mfn);
974 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
975 mfn = l2e_get_pfn(l2e);
976 unmap_domain_page(l2t);
977 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
978 (l2e_get_flags(l2e) & disallowed_flags) )
979 return 0;
980 if ( l2e_get_flags(l2e) & _PAGE_PSE )
981 {
982 l1e = l1e_empty(); /* define before use in debug tracing */
983 goto spurious;
984 }
984 }
986 l1t = map_domain_page(mfn);
987 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
988 mfn = l1e_get_pfn(l1e);
989 unmap_domain_page(l1t);
990 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
991 (l1e_get_flags(l1e) & disallowed_flags) )
992 return 0;
994 spurious:
995 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
996 "at addr %lx, e/c %04x\n",
997 current->domain->domain_id, current->vcpu_id,
998 addr, regs->error_code);
999 #if CONFIG_PAGING_LEVELS >= 4
1000 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1001 #endif
1002 #if CONFIG_PAGING_LEVELS >= 3
1003 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1004 #endif
1005 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1006 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1007 #ifndef NDEBUG
1008 show_registers(regs);
1009 #endif
1010 return 1;
1011 }
1013 static int spurious_page_fault(
1014 unsigned long addr, struct cpu_user_regs *regs)
1015 {
1016 unsigned long flags;
1017 int is_spurious;
1019 /*
1020 * Disabling interrupts prevents TLB flushing, and hence prevents
1021 * page tables from becoming invalid under our feet during the walk.
1022 */
1023 local_irq_save(flags);
1024 is_spurious = __spurious_page_fault(addr, regs);
1025 local_irq_restore(flags);
1027 return is_spurious;
1028 }
1030 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1031 {
1032 struct vcpu *v = current;
1033 struct domain *d = v->domain;
1035 /* No fixups in interrupt context or when interrupts are disabled. */
1036 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1037 return 0;
1039 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1040 {
1041 if ( paging_mode_external(d) && guest_mode(regs) )
1042 {
1043 int ret = paging_fault(addr, regs);
1044 if ( ret == EXCRET_fault_fixed )
1045 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1046 return ret;
1047 }
1048 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1049 return handle_gdt_ldt_mapping_fault(
1050 addr - GDT_LDT_VIRT_START, regs);
1051 return 0;
1052 }
1054 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1055 guest_kernel_mode(v, regs) &&
1056 /* Do not check if access-protection fault since the page may
1057 legitimately be not present in shadow page tables */
1058 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
1059 ptwr_do_page_fault(v, addr, regs) )
1060 return EXCRET_fault_fixed;
1062 if ( paging_mode_enabled(d) )
1063 {
1064 int ret = paging_fault(addr, regs);
1065 if ( ret == EXCRET_fault_fixed )
1066 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1067 return ret;
1068 }
1070 return 0;
1071 }
1073 /*
1074 * #PF error code:
1075 * Bit 0: Protection violation (=1) ; Page not present (=0)
1076 * Bit 1: Write access
1077 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1078 * Bit 3: Reserved bit violation
1079 * Bit 4: Instruction fetch
1080 */
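/*
 * Bits 1-4 above correspond to the PFEC_write_access, PFEC_user_mode,
 * PFEC_reserved_bit and PFEC_insn_fetch flags tested by the handlers in
 * this file.
 */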
1081 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1082 {
1083 unsigned long addr, fixup;
1085 addr = read_cr2();
1087 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1089 perfc_incr(page_faults);
1091 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1092 return;
1094 if ( unlikely(!guest_mode(regs)) )
1095 {
1096 if ( spurious_page_fault(addr, regs) )
1097 return;
1099 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1100 {
1101 perfc_incr(copy_user_faults);
1102 regs->eip = fixup;
1103 return;
1104 }
1106 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1108 show_execution_state(regs);
1109 show_page_walk(addr);
1110 panic("FATAL PAGE FAULT\n"
1111 "[error_code=%04x]\n"
1112 "Faulting linear address: %p\n",
1113 regs->error_code, _p(addr));
1114 }
1116 propagate_page_fault(addr, regs->error_code);
1117 }
1119 /*
1120 * Early handler to deal with spurious page faults. For example, consider a
1121 * routine that uses a mapping immediately after installing it (making it
1122 * present). The CPU may speculatively execute the memory access before
1123 * executing the PTE write. The instruction will then be marked to cause a
1124 * page fault when it is retired, despite the fact that the PTE is present and
1125 * correct at that point in time.
1126 */
1127 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1128 {
1129 static int stuck;
1130 static unsigned long prev_eip, prev_cr2;
1131 unsigned long cr2 = read_cr2();
1133 BUG_ON(smp_processor_id() != 0);
1135 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1136 {
1137 prev_eip = regs->eip;
1138 prev_cr2 = cr2;
1139 stuck = 0;
1140 return;
1141 }
1143 if ( stuck++ == 1000 )
1144 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1145 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1146 }
1148 long do_fpu_taskswitch(int set)
1149 {
1150 struct vcpu *v = current;
1152 if ( set )
1153 {
1154 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1155 stts();
1156 }
1157 else
1158 {
1159 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1160 if ( v->fpu_dirtied )
1161 clts();
1162 }
1164 return 0;
1165 }
1167 static int read_descriptor(unsigned int sel,
1168 const struct vcpu *v,
1169 const struct cpu_user_regs * regs,
1170 unsigned long *base,
1171 unsigned long *limit,
1172 unsigned int *ar,
1173 unsigned int vm86attr)
1175 struct desc_struct desc;
1177 if ( !vm86_mode(regs) )
1179 if ( sel < 4)
1180 desc.b = desc.a = 0;
1181 else if ( __get_user(desc,
1182 (const struct desc_struct *)(!(sel & 4)
1183 ? GDT_VIRT_START(v)
1184 : LDT_VIRT_START(v))
1185 + (sel >> 3)) )
1186 return 0;
1187 if ( !(vm86attr & _SEGMENT_CODE) )
1188 desc.b &= ~_SEGMENT_L;
1190 else
1192 desc.a = (sel << 20) | 0xffff;
1193 desc.b = vm86attr | (sel >> 12);
1196 *ar = desc.b & 0x00f0ff00;
1197 if ( !(desc.b & _SEGMENT_L) )
1199 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1200 (desc.b & 0xff000000));
1201 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1202 if ( desc.b & _SEGMENT_G )
1203 *limit = ((*limit + 1) << 12) - 1;
1204 #ifndef NDEBUG
1205 if ( !vm86_mode(regs) && (sel > 3) )
1207 unsigned int a, l;
1208 unsigned char valid;
1210 asm volatile (
1211 "larl %2,%0 ; setz %1"
1212 : "=r" (a), "=rm" (valid) : "rm" (sel));
1213 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1214 asm volatile (
1215 "lsll %2,%0 ; setz %1"
1216 : "=r" (l), "=rm" (valid) : "rm" (sel));
1217 BUG_ON(valid && (l != *limit));
1219 #endif
1221 else
1223 *base = 0UL;
1224 *limit = ~0UL;
1227 return 1;
1230 #ifdef __x86_64__
1231 static int read_gate_descriptor(unsigned int gate_sel,
1232 const struct vcpu *v,
1233 unsigned int *sel,
1234 unsigned long *off,
1235 unsigned int *ar)
1237 struct desc_struct desc;
1238 const struct desc_struct *pdesc;
1241 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1242 GDT_VIRT_START(v) :
1243 LDT_VIRT_START(v))
1244 + (gate_sel >> 3);
1245 if ( gate_sel < 4 ||
1246 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1247 __get_user(desc, pdesc) )
1248 return 0;
1250 *sel = (desc.a >> 16) & 0x0000fffc;
1251 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1252 *ar = desc.b & 0x0000ffff;
1253 /*
1254 * check_descriptor() clears the DPL field and stores the
1255 * guest requested DPL in the selector's RPL field.
1256 */
1257 ASSERT(!(*ar & _SEGMENT_DPL));
1258 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1260 if ( !is_pv_32bit_vcpu(v) )
1262 if ( (*ar & 0x1f00) != 0x0c00 ||
1263 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1264 __get_user(desc, pdesc + 1) ||
1265 (desc.b & 0x1f00) )
1266 return 0;
1268 *off |= (unsigned long)desc.a << 32;
1269 return 1;
1272 switch ( *ar & 0x1f00 )
1274 case 0x0400:
1275 *off &= 0xffff;
1276 break;
1277 case 0x0c00:
1278 break;
1279 default:
1280 return 0;
1283 return 1;
1285 #endif
1287 /* Has the guest requested sufficient permission for this I/O access? */
1288 static inline int guest_io_okay(
1289 unsigned int port, unsigned int bytes,
1290 struct vcpu *v, struct cpu_user_regs *regs)
1292 #if defined(__x86_64__)
1293 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1294 int user_mode = !(v->arch.flags & TF_kernel_mode);
1295 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1296 #elif defined(__i386__)
1297 #define TOGGLE_MODE() ((void)0)
1298 #endif
1300 if ( !vm86_mode(regs) &&
1301 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1302 return 1;
1304 if ( v->arch.iobmp_limit > (port + bytes) )
1306 union { uint8_t bytes[2]; uint16_t mask; } x;
1308 /*
1309 * Grab permission bytes from guest space. Inaccessible bytes are
1310 * read as 0xff (no access allowed).
1311 */
1312 TOGGLE_MODE();
1313 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1314 port>>3, 2) )
1316 default: x.bytes[0] = ~0;
1317 case 1: x.bytes[1] = ~0;
1318 case 0: break;
1320 TOGGLE_MODE();
1322 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1323 return 1;
1326 return 0;
1329 /* Has the administrator granted sufficient permission for this I/O access? */
1330 static inline int admin_io_okay(
1331 unsigned int port, unsigned int bytes,
1332 struct vcpu *v, struct cpu_user_regs *regs)
1334 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1337 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1338 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1339 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1340 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1341 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1342 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
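/*
 * Note: the guest_in*_okay/guest_out*_okay macros gate *direct* hardware
 * access on ports the administrator has granted to the domain
 * (admin_io_okay), whereas guest_io_okay() above checks the guest's own
 * IOPL and I/O bitmap before any port access is emulated at all.
 */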
1344 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1345 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1346 __attribute__((__regparm__(1)));
1347 unsigned long guest_to_host_gpr_switch(unsigned long)
1348 __attribute__((__regparm__(1)));
1350 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1352 /* Instruction fetch with error handling. */
1353 #define insn_fetch(type, base, eip, limit) \
1354 ({ unsigned long _rc, _ptr = (base) + (eip); \
1355 type _x; \
1356 if ( ad_default < 8 ) \
1357 _ptr = (unsigned int)_ptr; \
1358 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1359 goto fail; \
1360 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1361 { \
1362 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1363 goto skip; \
1364 } \
1365 (eip) += sizeof(_x); _x; })
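/*
 * insn_fetch() either evaluates to the next value of the given type and
 * advances eip, or transfers control: "goto fail" if the access would exceed
 * the code segment limit, "goto skip" after propagating a page fault to the
 * guest.  Every user of the macro must therefore provide both labels.
 */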
1367 #if defined(CONFIG_X86_32)
1368 # define read_sreg(regs, sr) ((regs)->sr)
1369 #elif defined(CONFIG_X86_64)
1370 # define read_sreg(regs, sr) read_segment_register(sr)
1371 #endif
1373 static int emulate_privileged_op(struct cpu_user_regs *regs)
1375 struct vcpu *v = current;
1376 unsigned long *reg, eip = regs->eip, res;
1377 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1378 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1379 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1380 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1381 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1382 ? regs->reg \
1383 : ad_bytes == 4 \
1384 ? (u32)regs->reg \
1385 : (u16)regs->reg)
1386 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1387 ? regs->reg = (val) \
1388 : ad_bytes == 4 \
1389 ? (*(u32 *)&regs->reg = (val)) \
1390 : (*(u16 *)&regs->reg = (val)))
1391 unsigned long code_base, code_limit;
1392 char io_emul_stub[32];
1393 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1394 u32 l, h, eax, edx;
1396 if ( !read_descriptor(regs->cs, v, regs,
1397 &code_base, &code_limit, &ar,
1398 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1399 goto fail;
1400 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1401 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1402 if ( !(ar & _SEGMENT_S) ||
1403 !(ar & _SEGMENT_P) ||
1404 !(ar & _SEGMENT_CODE) )
1405 goto fail;
1407 /* emulating only opcodes not allowing SS to be default */
1408 data_sel = read_sreg(regs, ds);
1410 /* Legacy prefixes. */
1411 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1413 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1415 case 0x66: /* operand-size override */
1416 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1417 continue;
1418 case 0x67: /* address-size override */
1419 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1420 continue;
1421 case 0x2e: /* CS override */
1422 data_sel = regs->cs;
1423 continue;
1424 case 0x3e: /* DS override */
1425 data_sel = read_sreg(regs, ds);
1426 continue;
1427 case 0x26: /* ES override */
1428 data_sel = read_sreg(regs, es);
1429 continue;
1430 case 0x64: /* FS override */
1431 data_sel = read_sreg(regs, fs);
1432 lm_ovr = lm_seg_fs;
1433 continue;
1434 case 0x65: /* GS override */
1435 data_sel = read_sreg(regs, gs);
1436 lm_ovr = lm_seg_gs;
1437 continue;
1438 case 0x36: /* SS override */
1439 data_sel = regs->ss;
1440 continue;
1441 case 0xf0: /* LOCK */
1442 lock = 1;
1443 continue;
1444 case 0xf2: /* REPNE/REPNZ */
1445 case 0xf3: /* REP/REPE/REPZ */
1446 rep_prefix = 1;
1447 continue;
1448 default:
1449 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1451 rex = opcode;
1452 continue;
1454 break;
1456 break;
1459 /* REX prefix. */
1460 if ( rex & 8 ) /* REX.W */
1461 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1462 modrm_reg = (rex & 4) << 1; /* REX.R */
1463 /* REX.X does not need to be decoded. */
1464 modrm_rm = (rex & 1) << 3; /* REX.B */
1466 if ( opcode == 0x0f )
1467 goto twobyte_opcode;
1469 if ( lock )
1470 goto fail;
1472 /* Input/Output String instructions. */
1473 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1475 unsigned long data_base, data_limit;
1477 if ( rep_prefix && (rd_ad(ecx) == 0) )
1478 goto done;
1480 if ( !(opcode & 2) )
1482 data_sel = read_sreg(regs, es);
1483 lm_ovr = lm_seg_none;
1486 if ( !(ar & _SEGMENT_L) )
1488 if ( !read_descriptor(data_sel, v, regs,
1489 &data_base, &data_limit, &ar,
1490 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1491 goto fail;
1492 if ( !(ar & _SEGMENT_S) ||
1493 !(ar & _SEGMENT_P) ||
1494 (opcode & 2 ?
1495 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1496 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1497 goto fail;
1499 #ifdef CONFIG_X86_64
1500 else
1502 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1504 switch ( lm_ovr )
1506 case lm_seg_none:
1507 data_base = 0UL;
1508 break;
1509 case lm_seg_fs:
1510 data_base = v->arch.guest_context.fs_base;
1511 break;
1512 case lm_seg_gs:
1513 if ( guest_kernel_mode(v, regs) )
1514 data_base = v->arch.guest_context.gs_base_kernel;
1515 else
1516 data_base = v->arch.guest_context.gs_base_user;
1517 break;
1520 else
1521 read_descriptor(data_sel, v, regs,
1522 &data_base, &data_limit, &ar,
1523 0);
1524 data_limit = ~0UL;
1525 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1527 #endif
1529 port = (u16)regs->edx;
1531 continue_io_string:
1532 switch ( opcode )
1534 case 0x6c: /* INSB */
1535 op_bytes = 1;
1536 case 0x6d: /* INSW/INSL */
1537 if ( data_limit < op_bytes - 1 ||
1538 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1539 !guest_io_okay(port, op_bytes, v, regs) )
1540 goto fail;
1541 switch ( op_bytes )
1543 case 1:
1544 /* emulate PIT counter 2 */
1545 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1546 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1547 pv_pit_handler(port, 0, 0) : ~0));
1548 break;
1549 case 2:
1550 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1551 break;
1552 case 4:
1553 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1554 break;
1556 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1558 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1559 PFEC_write_access);
1560 return EXCRET_fault_fixed;
1562 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1563 break;
1565 case 0x6e: /* OUTSB */
1566 op_bytes = 1;
1567 case 0x6f: /* OUTSW/OUTSL */
1568 if ( data_limit < op_bytes - 1 ||
1569 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1570 !guest_io_okay(port, op_bytes, v, regs) )
1571 goto fail;
1572 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1573 if ( rc != 0 )
1575 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1576 return EXCRET_fault_fixed;
1578 switch ( op_bytes )
1580 case 1:
1581 if ( guest_outb_okay(port, v, regs) )
1583 outb((u8)data, port);
1584 if ( pv_post_outb_hook )
1585 pv_post_outb_hook(port, data);
1587 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1588 pv_pit_handler(port, data, 1);
1589 break;
1590 case 2:
1591 if ( guest_outw_okay(port, v, regs) )
1592 outw((u16)data, port);
1593 break;
1594 case 4:
1595 if ( guest_outl_okay(port, v, regs) )
1596 outl((u32)data, port);
1597 break;
1599 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1600 break;
1603 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1605 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1607 if ( !bpmatch && !hypercall_preempt_check() )
1608 goto continue_io_string;
1609 eip = regs->eip;
1612 goto done;
1615 /*
1616 * Very likely to be an I/O instruction (IN/OUT).
1617 * Build an on-stack stub to execute the instruction with full guest
1618 * GPR context. This is needed for some systems which (ab)use IN/OUT
1619 * to communicate with BIOS code in system-management mode.
1620 */
1621 #ifdef __x86_64__
1622 /* movq $host_to_guest_gpr_switch,%rcx */
1623 io_emul_stub[0] = 0x48;
1624 io_emul_stub[1] = 0xb9;
1625 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1626 /* callq *%rcx */
1627 io_emul_stub[10] = 0xff;
1628 io_emul_stub[11] = 0xd1;
1629 #else
1630 /* call host_to_guest_gpr_switch */
1631 io_emul_stub[0] = 0xe8;
1632 *(s32 *)&io_emul_stub[1] =
1633 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1634 /* 7 x nop */
1635 memset(&io_emul_stub[5], 0x90, 7);
1636 #endif
1637 /* data16 or nop */
1638 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1639 /* <io-access opcode> */
1640 io_emul_stub[13] = opcode;
1641 /* imm8 or nop */
1642 io_emul_stub[14] = 0x90;
1643 /* ret (jumps to guest_to_host_gpr_switch) */
1644 io_emul_stub[15] = 0xc3;
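/*
 * Illustrative example (hypothetical port number): on x86-64, for the
 * instruction "inb $0x71,%al" the stub assembled above is
 *   movq $host_to_guest_gpr_switch,%rcx ; callq *%rcx ;
 *   nop ; inb $0x71,%al ; ret
 * and the final ret transfers to guest_to_host_gpr_switch as noted above.
 */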
1646 /* Handy function-typed pointer to the stub. */
1647 io_emul = (void *)io_emul_stub;
1649 if ( ioemul_handle_quirk )
1650 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1652 /* I/O Port and Interrupt Flag instructions. */
1653 switch ( opcode )
1655 case 0xe4: /* IN imm8,%al */
1656 op_bytes = 1;
1657 case 0xe5: /* IN imm8,%eax */
1658 port = insn_fetch(u8, code_base, eip, code_limit);
1659 io_emul_stub[14] = port; /* imm8 */
1660 exec_in:
1661 if ( !guest_io_okay(port, op_bytes, v, regs) )
1662 goto fail;
1663 switch ( op_bytes )
1665 case 1:
1666 if ( guest_inb_okay(port, v, regs) )
1667 io_emul(regs);
1668 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1670 regs->eax &= ~0xffUL;
1671 regs->eax |= pv_pit_handler(port, 0, 0);
1673 else
1674 regs->eax |= (u8)~0;
1675 break;
1676 case 2:
1677 if ( guest_inw_okay(port, v, regs) )
1678 io_emul(regs);
1679 else
1680 regs->eax |= (u16)~0;
1681 break;
1682 case 4:
1683 if ( guest_inl_okay(port, v, regs) )
1684 io_emul(regs);
1685 else
1686 regs->eax = (u32)~0;
1687 break;
1689 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1690 goto done;
1692 case 0xec: /* IN %dx,%al */
1693 op_bytes = 1;
1694 case 0xed: /* IN %dx,%eax */
1695 port = (u16)regs->edx;
1696 goto exec_in;
1698 case 0xe6: /* OUT %al,imm8 */
1699 op_bytes = 1;
1700 case 0xe7: /* OUT %eax,imm8 */
1701 port = insn_fetch(u8, code_base, eip, code_limit);
1702 io_emul_stub[14] = port; /* imm8 */
1703 exec_out:
1704 if ( !guest_io_okay(port, op_bytes, v, regs) )
1705 goto fail;
1706 switch ( op_bytes )
1708 case 1:
1709 if ( guest_outb_okay(port, v, regs) )
1711 io_emul(regs);
1712 if ( pv_post_outb_hook )
1713 pv_post_outb_hook(port, regs->eax);
1715 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1716 pv_pit_handler(port, regs->eax, 1);
1717 break;
1718 case 2:
1719 if ( guest_outw_okay(port, v, regs) )
1720 io_emul(regs);
1721 break;
1722 case 4:
1723 if ( guest_outl_okay(port, v, regs) )
1724 io_emul(regs);
1725 break;
1727 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1728 goto done;
1730 case 0xee: /* OUT %al,%dx */
1731 op_bytes = 1;
1732 case 0xef: /* OUT %eax,%dx */
1733 port = (u16)regs->edx;
1734 goto exec_out;
1736 case 0xfa: /* CLI */
1737 case 0xfb: /* STI */
1738 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1739 goto fail;
1740 /*
1741 * This is just too dangerous to allow, in my opinion. Consider if the
1742 * caller then tries to reenable interrupts using POPF: we can't trap
1743 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1744 * do for us. :-)
1745 */
1746 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1747 goto done;
1750 /* No decode of this single-byte opcode. */
1751 goto fail;
1753 twobyte_opcode:
1754 /* Two-byte opcodes only emulated from guest kernel. */
1755 if ( !guest_kernel_mode(v, regs) )
1756 goto fail;
1758 /* Privileged (ring 0) instructions. */
1759 opcode = insn_fetch(u8, code_base, eip, code_limit);
1760 if ( lock && (opcode & ~3) != 0x20 )
1761 goto fail;
1762 switch ( opcode )
1764 case 0x06: /* CLTS */
1765 (void)do_fpu_taskswitch(0);
1766 break;
1768 case 0x09: /* WBINVD */
1769 /* Ignore the instruction if unprivileged. */
1770 if ( !cache_flush_permitted(v->domain) )
1771 /* Non-physdev domain attempted WBINVD; ignore for now since
1772 newer linux uses this in some start-of-day timing loops */
1774 else
1775 wbinvd();
1776 break;
1778 case 0x20: /* MOV CR?,<reg> */
1779 opcode = insn_fetch(u8, code_base, eip, code_limit);
1780 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1781 modrm_rm |= (opcode >> 0) & 7;
1782 reg = decode_register(modrm_rm, regs, 0);
1783 switch ( modrm_reg )
1785 case 0: /* Read CR0 */
1786 *reg = (read_cr0() & ~X86_CR0_TS) |
1787 v->arch.guest_context.ctrlreg[0];
1788 break;
1790 case 2: /* Read CR2 */
1791 *reg = v->arch.guest_context.ctrlreg[2];
1792 break;
1794 case 3: /* Read CR3 */
1795 if ( !is_pv_32on64_vcpu(v) )
1796 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1797 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1798 #ifdef CONFIG_COMPAT
1799 else
1800 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1801 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1802 #endif
1803 break;
1805 case 4: /* Read CR4 */
1806 /*
1807 * Guests can read CR4 to see what features Xen has enabled. We
1808 * therefore lie about PGE & PSE as they are unavailable to guests.
1809 */
1810 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1811 break;
1813 default:
1814 goto fail;
1816 break;
1818 case 0x21: /* MOV DR?,<reg> */
1819 opcode = insn_fetch(u8, code_base, eip, code_limit);
1820 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1821 modrm_rm |= (opcode >> 0) & 7;
1822 reg = decode_register(modrm_rm, regs, 0);
1823 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1824 goto fail;
1825 *reg = res;
1826 break;
1828 case 0x22: /* MOV <reg>,CR? */
1829 opcode = insn_fetch(u8, code_base, eip, code_limit);
1830 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1831 modrm_rm |= (opcode >> 0) & 7;
1832 reg = decode_register(modrm_rm, regs, 0);
1833 switch ( modrm_reg )
1835 case 0: /* Write CR0 */
1836 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1838 gdprintk(XENLOG_WARNING,
1839 "Attempt to change unmodifiable CR0 flags.\n");
1840 goto fail;
1842 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1843 break;
1845 case 2: /* Write CR2 */
1846 v->arch.guest_context.ctrlreg[2] = *reg;
1847 arch_set_cr2(v, *reg);
1848 break;
1850 case 3: /* Write CR3 */
1851 LOCK_BIGLOCK(v->domain);
1852 if ( !is_pv_32on64_vcpu(v) )
1853 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1854 #ifdef CONFIG_COMPAT
1855 else
1856 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1857 #endif
1858 UNLOCK_BIGLOCK(v->domain);
1859 if ( rc == 0 ) /* not okay */
1860 goto fail;
1861 break;
1863 case 4: /* Write CR4 */
1864 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1865 write_cr4(pv_guest_cr4_to_real_cr4(
1866 v->arch.guest_context.ctrlreg[4]));
1867 break;
1869 default:
1870 goto fail;
1872 break;
1874 case 0x23: /* MOV <reg>,DR? */
1875 opcode = insn_fetch(u8, code_base, eip, code_limit);
1876 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1877 modrm_rm |= (opcode >> 0) & 7;
1878 reg = decode_register(modrm_rm, regs, 0);
1879 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1880 goto fail;
1881 break;
1883 case 0x30: /* WRMSR */
1884 eax = regs->eax;
1885 edx = regs->edx;
1886 res = ((u64)edx << 32) | eax;
1887 switch ( (u32)regs->ecx )
1889 #ifdef CONFIG_X86_64
1890 case MSR_FS_BASE:
1891 if ( is_pv_32on64_vcpu(v) )
1892 goto fail;
1893 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1894 goto fail;
1895 v->arch.guest_context.fs_base = res;
1896 break;
1897 case MSR_GS_BASE:
1898 if ( is_pv_32on64_vcpu(v) )
1899 goto fail;
1900 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1901 goto fail;
1902 v->arch.guest_context.gs_base_kernel = res;
1903 break;
1904 case MSR_SHADOW_GS_BASE:
1905 if ( is_pv_32on64_vcpu(v) )
1906 goto fail;
1907 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1908 goto fail;
1909 v->arch.guest_context.gs_base_user = res;
1910 break;
1911 #endif
1912 case MSR_K7_FID_VID_STATUS:
1913 case MSR_K7_FID_VID_CTL:
1914 case MSR_K8_PSTATE_LIMIT:
1915 case MSR_K8_PSTATE_CTRL:
1916 case MSR_K8_PSTATE_STATUS:
1917 case MSR_K8_PSTATE0:
1918 case MSR_K8_PSTATE1:
1919 case MSR_K8_PSTATE2:
1920 case MSR_K8_PSTATE3:
1921 case MSR_K8_PSTATE4:
1922 case MSR_K8_PSTATE5:
1923 case MSR_K8_PSTATE6:
1924 case MSR_K8_PSTATE7:
1925 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1926 goto fail;
1927 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1928 break;
1929 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1930 goto fail;
1931 break;
1932 case MSR_IA32_PERF_CTL:
1933 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1934 goto fail;
1935 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1936 break;
1937 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1938 goto fail;
1939 break;
1940 default:
1941 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1942 break;
1943 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1944 (eax != l) || (edx != h) )
1945 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1946 "%08x:%08x to %08x:%08x.\n",
1947 _p(regs->ecx), h, l, edx, eax);
1948 break;
1950 break;
1952 case 0x31: /* RDTSC */
1953 rdtsc(regs->eax, regs->edx);
1954 break;
1956 case 0x32: /* RDMSR */
1957 switch ( (u32)regs->ecx )
1959 #ifdef CONFIG_X86_64
1960 case MSR_FS_BASE:
1961 if ( is_pv_32on64_vcpu(v) )
1962 goto fail;
1963 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1964 regs->edx = v->arch.guest_context.fs_base >> 32;
1965 break;
1966 case MSR_GS_BASE:
1967 if ( is_pv_32on64_vcpu(v) )
1968 goto fail;
1969 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1970 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1971 break;
1972 case MSR_SHADOW_GS_BASE:
1973 if ( is_pv_32on64_vcpu(v) )
1974 goto fail;
1975 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1976 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1977 break;
1978 #endif
1979 case MSR_K7_FID_VID_CTL:
1980 case MSR_K7_FID_VID_STATUS:
1981 case MSR_K8_PSTATE_LIMIT:
1982 case MSR_K8_PSTATE_CTRL:
1983 case MSR_K8_PSTATE_STATUS:
1984 case MSR_K8_PSTATE0:
1985 case MSR_K8_PSTATE1:
1986 case MSR_K8_PSTATE2:
1987 case MSR_K8_PSTATE3:
1988 case MSR_K8_PSTATE4:
1989 case MSR_K8_PSTATE5:
1990 case MSR_K8_PSTATE6:
1991 case MSR_K8_PSTATE7:
1992 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1993 goto fail;
1994 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1996 regs->eax = regs->edx = 0;
1997 break;
1999 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2000 goto fail;
2001 break;
2002 case MSR_EFER:
2003 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2004 goto fail;
2005 break;
2006 default:
2007 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2009 regs->eax = l;
2010 regs->edx = h;
2011 break;
2013 /* Everyone can read the MSR space. */
2014 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2015 _p(regs->ecx));*/
2016 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2017 goto fail;
2018 break;
2020 break;
2022 default:
2023 goto fail;
2026 #undef wr_ad
2027 #undef rd_ad
2029 done:
2030 instruction_done(regs, eip, bpmatch);
2031 skip:
2032 return EXCRET_fault_fixed;
2034 fail:
2035 return 0;
2038 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2039 unsigned int esp, unsigned int decr)
2041 return (((esp - decr) < (esp - 1)) &&
2042 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2045 static void emulate_gate_op(struct cpu_user_regs *regs)
2047 #ifdef __x86_64__
2048 struct vcpu *v = current;
2049 unsigned int sel, ar, dpl, nparm, opnd_sel;
2050 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2051 unsigned long off, eip, opnd_off, base, limit;
2052 int jump;
2054 /* Check whether this fault is due to the use of a call gate. */
2055 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2056 ((ar >> 13) & 3) < (regs->cs & 3) ||
2057 (ar & _SEGMENT_TYPE) != 0xc00 )
2059 do_guest_trap(TRAP_gp_fault, regs, 1);
2060 return;
2062 if ( !(ar & _SEGMENT_P) )
2064 do_guest_trap(TRAP_no_segment, regs, 1);
2065 return;
2067 dpl = (ar >> 13) & 3;
2068 nparm = ar & 0x1f;
2070 /*
2071 * Decode instruction (and perhaps operand) to determine RPL,
2072 * whether this is a jump or a call, and the call return offset.
2073 */
2074 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2075 !(ar & _SEGMENT_S) ||
2076 !(ar & _SEGMENT_P) ||
2077 !(ar & _SEGMENT_CODE) )
2079 do_guest_trap(TRAP_gp_fault, regs, 1);
2080 return;
2083 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2084 ad_default = ad_bytes = op_default;
2085 opnd_sel = opnd_off = 0;
2086 jump = -1;
2087 for ( eip = regs->eip; eip - regs->_eip < 10; )
2089 switch ( insn_fetch(u8, base, eip, limit) )
2091 case 0x66: /* operand-size override */
2092 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2093 continue;
2094 case 0x67: /* address-size override */
2095 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2096 continue;
2097 case 0x2e: /* CS override */
2098 opnd_sel = regs->cs;
2099 ASSERT(opnd_sel);
2100 continue;
2101 case 0x3e: /* DS override */
2102 opnd_sel = read_sreg(regs, ds);
2103 if ( !opnd_sel )
2104 opnd_sel = dpl;
2105 continue;
2106 case 0x26: /* ES override */
2107 opnd_sel = read_sreg(regs, es);
2108 if ( !opnd_sel )
2109 opnd_sel = dpl;
2110 continue;
2111 case 0x64: /* FS override */
2112 opnd_sel = read_sreg(regs, fs);
2113 if ( !opnd_sel )
2114 opnd_sel = dpl;
2115 continue;
2116 case 0x65: /* GS override */
2117 opnd_sel = read_sreg(regs, gs);
2118 if ( !opnd_sel )
2119 opnd_sel = dpl;
2120 continue;
2121 case 0x36: /* SS override */
2122 opnd_sel = regs->ss;
2123 if ( !opnd_sel )
2124 opnd_sel = dpl;
2125 continue;
2126 case 0xea:
2127 ++jump;
2128 /* FALLTHROUGH */
2129 case 0x9a:
2130 ++jump;
2131 opnd_sel = regs->cs;
2132 opnd_off = eip;
2133 ad_bytes = ad_default;
2134 eip += op_bytes + 2;
2135 break;
2136 case 0xff:
2138 unsigned int modrm;
2140 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2142 case 0x28: case 0x68: case 0xa8:
2143 ++jump;
2144 /* FALLTHROUGH */
2145 case 0x18: case 0x58: case 0x98:
2146 ++jump;
2147 if ( ad_bytes != 2 )
2149 if ( (modrm & 7) == 4 )
2151 unsigned int sib = insn_fetch(u8, base, eip, limit);
2153 modrm = (modrm & ~7) | (sib & 7);
2154 if ( (sib >>= 3) != 4 )
2155 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2156 opnd_off <<= sib >> 3;
2158 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2159 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2160 else
2161 modrm |= 0x87;
2162 if ( !opnd_sel )
2164 switch ( modrm & 7 )
2166 default:
2167 opnd_sel = read_sreg(regs, ds);
2168 break;
2169 case 4: case 5:
2170 opnd_sel = regs->ss;
2171 break;
2175 else
2177 switch ( modrm & 7 )
2179 case 0: case 1: case 7:
2180 opnd_off = regs->ebx;
2181 break;
2182 case 6:
2183 if ( !(modrm & 0xc0) )
2184 modrm |= 0x80;
2185 else
2186 case 2: case 3:
2188 opnd_off = regs->ebp;
2189 if ( !opnd_sel )
2190 opnd_sel = regs->ss;
2192 break;
2194 if ( !opnd_sel )
2195 opnd_sel = read_sreg(regs, ds);
2196 switch ( modrm & 7 )
2198 case 0: case 2: case 4:
2199 opnd_off += regs->esi;
2200 break;
2201 case 1: case 3: case 5:
2202 opnd_off += regs->edi;
2203 break;
2206 switch ( modrm & 0xc0 )
2208 case 0x40:
2209 opnd_off += insn_fetch(s8, base, eip, limit);
2210 break;
2211 case 0x80:
2212 opnd_off += insn_fetch(s32, base, eip, limit);
2213 break;
2215 if ( ad_bytes == 4 )
2216 opnd_off = (unsigned int)opnd_off;
2217 else if ( ad_bytes == 2 )
2218 opnd_off = (unsigned short)opnd_off;
2219 break;
2222 break;
2224 break;
2227 if ( jump < 0 )
2229 fail:
2230 do_guest_trap(TRAP_gp_fault, regs, 1);
2231 skip:
2232 return;
2235 if ( (opnd_sel != regs->cs &&
2236 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2237 !(ar & _SEGMENT_S) ||
2238 !(ar & _SEGMENT_P) ||
2239 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2241 do_guest_trap(TRAP_gp_fault, regs, 1);
2242 return;
2245 opnd_off += op_bytes;
2246 #define ad_default ad_bytes
2247 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2248 #undef ad_default
2249 ASSERT((opnd_sel & ~3) == regs->error_code);
2250 if ( dpl < (opnd_sel & 3) )
2252 do_guest_trap(TRAP_gp_fault, regs, 1);
2253 return;
2256 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2257 !(ar & _SEGMENT_S) ||
2258 !(ar & _SEGMENT_CODE) ||
2259 (!jump || (ar & _SEGMENT_EC) ?
2260 ((ar >> 13) & 3) > (regs->cs & 3) :
2261 ((ar >> 13) & 3) != (regs->cs & 3)) )
2263 regs->error_code = sel;
2264 do_guest_trap(TRAP_gp_fault, regs, 1);
2265 return;
2267 if ( !(ar & _SEGMENT_P) )
2269 regs->error_code = sel;
2270 do_guest_trap(TRAP_no_segment, regs, 1);
2271 return;
2273 if ( off > limit )
2275 regs->error_code = 0;
2276 do_guest_trap(TRAP_gp_fault, regs, 1);
2277 return;
2280 if ( !jump )
2282 unsigned int ss, esp, *stkp;
2283 int rc;
2284 #define push(item) do \
2285 { \
2286 --stkp; \
2287 esp -= 4; \
2288 rc = __put_user(item, stkp); \
2289 if ( rc ) \
2290 { \
2291 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2292 PFEC_write_access); \
2293 return; \
2294 } \
2295 } while ( 0 )
2297 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2299 sel |= (ar >> 13) & 3;
2300 /* Inner stack known only for kernel ring. */
2301 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2303 do_guest_trap(TRAP_gp_fault, regs, 1);
2304 return;
2306 esp = v->arch.guest_context.kernel_sp;
2307 ss = v->arch.guest_context.kernel_ss;
2308 if ( (ss & 3) != (sel & 3) ||
2309 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2310 ((ar >> 13) & 3) != (sel & 3) ||
2311 !(ar & _SEGMENT_S) ||
2312 (ar & _SEGMENT_CODE) ||
2313 !(ar & _SEGMENT_WR) )
2315 regs->error_code = ss & ~3;
2316 do_guest_trap(TRAP_invalid_tss, regs, 1);
2317 return;
2319 if ( !(ar & _SEGMENT_P) ||
2320 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2322 regs->error_code = ss & ~3;
2323 do_guest_trap(TRAP_stack_error, regs, 1);
2324 return;
2326 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2327 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2329 do_guest_trap(TRAP_gp_fault, regs, 1);
2330 return;
2332 push(regs->ss);
2333 push(regs->esp);
2334 if ( nparm )
2336 const unsigned int *ustkp;
2338 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2339 ((ar >> 13) & 3) != (regs->cs & 3) ||
2340 !(ar & _SEGMENT_S) ||
2341 (ar & _SEGMENT_CODE) ||
2342 !(ar & _SEGMENT_WR) ||
2343 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2344 return do_guest_trap(TRAP_gp_fault, regs, 1);
2345 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2346 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2348 do_guest_trap(TRAP_gp_fault, regs, 1);
2349 return;
2351 do
2353 unsigned int parm;
2355 --ustkp;
2356 rc = __get_user(parm, ustkp);
2357 if ( rc )
2359 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2360 return;
2362 push(parm);
2363 } while ( --nparm );
2366 else
2368 sel |= (regs->cs & 3);
2369 esp = regs->esp;
2370 ss = regs->ss;
2371 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2372 ((ar >> 13) & 3) != (sel & 3) )
2374 do_guest_trap(TRAP_gp_fault, regs, 1);
2375 return;
2377 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2379 regs->error_code = 0;
2380 do_guest_trap(TRAP_stack_error, regs, 1);
2381 return;
2383 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2384 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2386 do_guest_trap(TRAP_gp_fault, regs, 1);
2387 return;
2390 push(regs->cs);
2391 push(eip);
2392 #undef push
2393 regs->esp = esp;
2394 regs->ss = ss;
2396 else
2397 sel |= (regs->cs & 3);
2399 regs->cs = sel;
2400 instruction_done(regs, off, 0);
2401 #endif
2404 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2406 struct vcpu *v = current;
2407 unsigned long fixup;
2409 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2411 if ( regs->error_code & 1 )
2412 goto hardware_gp;
2414 if ( !guest_mode(regs) )
2415 goto gp_in_kernel;
2417 /*
2418 * Cunning trick to allow arbitrary "INT n" handling.
2420 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2421 * instruction from trapping to the appropriate vector, when that might not
2422 * be expected by Xen or the guest OS. For example, that entry might be for
2423 * a fault handler (unlike traps, faults don't increment EIP), or might
2424 * expect an error code on the stack (which a software trap never
2425 * provides), or might be a hardware interrupt handler that doesn't like
2426 * being called spuriously.
2428 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2429 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2430 * clear to indicate that it's a software fault, not hardware.
2432 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2433 * okay because they can only be triggered by an explicit DPL-checked
2434 * instruction. The DPL specified by the guest OS for these vectors is NOT
2435 * CHECKED!!
2436 */
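/*
 * Concrete example (illustrative): a guest executing "int $0x30" against a
 * DPL-0 IDT entry arrives here with error code (0x30 << 3) | 2 = 0x182,
 * i.e. the IDT bit (bit 1) set and the EXT bit (bit 0) clear. That is the
 * (error_code & 3) == 2 pattern tested below, and the vector is recovered
 * as error_code >> 3.
 */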
2437 if ( (regs->error_code & 3) == 2 )
2439 /* This fault must be due to <INT n> instruction. */
2440 const struct trap_info *ti;
2441 unsigned char vector = regs->error_code >> 3;
2442 ti = &v->arch.guest_context.trap_ctxt[vector];
2443 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2445 regs->eip += 2;
2446 do_guest_trap(vector, regs, 0);
2447 return;
2450 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2452 emulate_gate_op(regs);
2453 return;
2456 /* Emulate some simple privileged and I/O instructions. */
2457 if ( (regs->error_code == 0) &&
2458 emulate_privileged_op(regs) )
2460 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2461 return;
2464 #if defined(__i386__)
2465 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2466 (regs->error_code == 0) &&
2467 gpf_emulate_4gb(regs) )
2469 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2470 return;
2472 #endif
2474 /* Pass on GPF as is. */
2475 do_guest_trap(TRAP_gp_fault, regs, 1);
2476 return;
2478 gp_in_kernel:
2480 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2482 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2483 regs->error_code, _p(regs->eip), _p(fixup));
2484 regs->eip = fixup;
2485 return;
2488 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2490 hardware_gp:
2491 show_execution_state(regs);
2492 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2495 static void nmi_softirq(void)
2497 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2498 vcpu_kick(dom0->vcpu[0]);
2501 static void nmi_dom0_report(unsigned int reason_idx)
2503 struct domain *d;
2504 struct vcpu *v;
2506 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2507 return;
2509 set_bit(reason_idx, nmi_reason(d));
2511 if ( !test_and_set_bool(v->nmi_pending) )
2512 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2515 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2517 switch ( opt_nmi[0] )
2519 case 'd': /* 'dom0' */
2520 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2521 case 'i': /* 'ignore' */
2522 break;
2523 default: /* 'fatal' */
2524 console_force_unlock();
2525 printk("\n\nNMI - MEMORY ERROR\n");
2526 fatal_trap(TRAP_nmi, regs);
2529 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2530 mdelay(1);
2531 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2534 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2536 switch ( opt_nmi[0] )
2538 case 'd': /* 'dom0' */
2539 nmi_dom0_report(_XEN_NMIREASON_io_error);
2540 case 'i': /* 'ignore' */
2541 break;
2542 default: /* 'fatal' */
2543 console_force_unlock();
2544 printk("\n\nNMI - I/O ERROR\n");
2545 fatal_trap(TRAP_nmi, regs);
2548 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2549 mdelay(1);
2550 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2553 static void unknown_nmi_error(unsigned char reason)
2555 switch ( opt_nmi[0] )
2557 case 'd': /* 'dom0' */
2558 nmi_dom0_report(_XEN_NMIREASON_unknown);
2559 case 'i': /* 'ignore' */
2560 break;
2561 default: /* 'fatal' */
2562 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2563 printk("Dazed and confused, but trying to continue\n");
2564 printk("Do you have a strange power saving mode enabled?\n");
2565 kexec_crash();
2569 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2571 return 0;
2574 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2576 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2578 unsigned int cpu = smp_processor_id();
2579 unsigned char reason;
2581 ++nmi_count(cpu);
2583 if ( nmi_callback(regs, cpu) )
2584 return;
2586 if ( nmi_watchdog )
2587 nmi_watchdog_tick(regs);
2589 /* Only the BSP gets external NMIs from the system. */
2590 if ( cpu == 0 )
2592 reason = inb(0x61);
2593 if ( reason & 0x80 )
2594 mem_parity_error(regs);
2595 else if ( reason & 0x40 )
2596 io_check_error(regs);
2597 else if ( !nmi_watchdog )
2598 unknown_nmi_error((unsigned char)(reason&0xff));
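/*
 * Background, summarised for reference (standard PC behaviour): port 0x61
 * is System Control Port B. On read, bit 7 reports a memory parity / SERR#
 * NMI and bit 6 an I/O channel check (IOCHK#) NMI, which is what the
 * dispatch above keys on; the handlers earlier in this file briefly set
 * bits 2/3 on write to clear and re-enable the corresponding source.
 */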
2602 void set_nmi_callback(nmi_callback_t callback)
2604 nmi_callback = callback;
2607 void unset_nmi_callback(void)
2609 nmi_callback = dummy_nmi_callback;
2612 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2614 struct vcpu *curr = current;
2616 BUG_ON(!guest_mode(regs));
2618 setup_fpu(curr);
2620 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2622 do_guest_trap(TRAP_no_device, regs, 0);
2623 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2625 else
2626 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2628 return;
2631 asmlinkage void do_debug(struct cpu_user_regs *regs)
2633 struct vcpu *v = current;
2635 DEBUGGER_trap_entry(TRAP_debug, regs);
2637 if ( !guest_mode(regs) )
2639 if ( regs->eflags & EF_TF )
2641 #ifdef __x86_64__
2642 void sysenter_entry(void);
2643 void sysenter_eflags_saved(void);
2644 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2645 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2646 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2647 goto out;
2648 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2649 #else
2650 WARN_ON(1);
2651 #endif
2652 regs->eflags &= ~EF_TF;
2654 else
2656 /*
2657 * We ignore watchpoints when they trigger within Xen. This may
2658 * happen when a buffer is passed to us which previously had a
2659 * watchpoint set on it. No need to bump EIP; the only faulting
2660 * trap is an instruction breakpoint, which can't happen to us.
2661 */
2662 WARN_ON(!search_exception_table(regs->eip));
2664 goto out;
2667 /* Save debug status register where guest OS can peek at it */
2668 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2670 ler_enable();
2671 do_guest_trap(TRAP_debug, regs, 0);
2672 return;
2674 out:
2675 ler_enable();
2676 return;
2679 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2683 void set_intr_gate(unsigned int n, void *addr)
2685 int i;
2686 /* Keep secondary tables in sync with IRQ updates. */
2687 for ( i = 1; i < NR_CPUS; i++ )
2688 if ( idt_tables[i] != NULL )
2689 _set_gate(&idt_tables[i][n], 14, 0, addr);
2690 _set_gate(&idt_table[n], 14, 0, addr);
2693 void set_system_gate(unsigned int n, void *addr)
2695 _set_gate(idt_table+n,14,3,addr);
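/*
 * Note: gate type 14 is an interrupt gate (interrupts disabled on entry,
 * as the comment in trap_init() below explains). The difference between
 * the two helpers is the DPL argument: 0 for set_intr_gate() versus 3 for
 * set_system_gate(), so that int3/into issued from unprivileged guest code
 * reach their vectors directly instead of faulting with #GP.
 */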
2698 void set_tss_desc(unsigned int n, void *addr)
2700 _set_tssldt_desc(
2701 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2702 (unsigned long)addr,
2703 offsetof(struct tss_struct, __cacheline_filler) - 1,
2704 9);
2705 #ifdef CONFIG_COMPAT
2706 _set_tssldt_desc(
2707 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2708 (unsigned long)addr,
2709 offsetof(struct tss_struct, __cacheline_filler) - 1,
2710 11);
2711 #endif
2714 void __devinit percpu_traps_init(void)
2716 subarch_percpu_traps_init();
2718 if ( !opt_ler )
2719 return;
2721 switch ( boot_cpu_data.x86_vendor )
2723 case X86_VENDOR_INTEL:
2724 switch ( boot_cpu_data.x86 )
2726 case 6:
2727 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2728 break;
2729 case 15:
2730 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2731 break;
2733 break;
2734 case X86_VENDOR_AMD:
2735 switch ( boot_cpu_data.x86 )
2737 case 6:
2738 case 15:
2739 case 16:
2740 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2741 break;
2743 break;
2746 ler_enable();
2749 void __init trap_init(void)
2751 /*
2752 * Note that interrupt gates are always used, rather than trap gates. We
2753 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2754 * first activation must have the "bad" value(s) for these registers and
2755 * we may lose them if another activation is installed before they are
2756 * saved. The page-fault handler also needs interrupts disabled until %cr2
2757 * has been read and saved on the stack.
2758 */
2759 set_intr_gate(TRAP_divide_error,&divide_error);
2760 set_intr_gate(TRAP_debug,&debug);
2761 set_intr_gate(TRAP_nmi,&nmi);
2762 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2763 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2764 set_intr_gate(TRAP_bounds,&bounds);
2765 set_intr_gate(TRAP_invalid_op,&invalid_op);
2766 set_intr_gate(TRAP_no_device,&device_not_available);
2767 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2768 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2769 set_intr_gate(TRAP_no_segment,&segment_not_present);
2770 set_intr_gate(TRAP_stack_error,&stack_segment);
2771 set_intr_gate(TRAP_gp_fault,&general_protection);
2772 set_intr_gate(TRAP_page_fault,&page_fault);
2773 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2774 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2775 set_intr_gate(TRAP_alignment_check,&alignment_check);
2776 set_intr_gate(TRAP_machine_check,&machine_check);
2777 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2779 /* CPU0 uses the master IDT. */
2780 idt_tables[0] = idt_table;
2782 percpu_traps_init();
2784 cpu_init();
2786 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2789 long register_guest_nmi_callback(unsigned long address)
2791 struct vcpu *v = current;
2792 struct domain *d = v->domain;
2793 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2795 t->vector = TRAP_nmi;
2796 t->flags = 0;
2797 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2798 t->address = address;
2799 TI_SET_IF(t, 1);
2801 /*
2802 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2803 * now.
2804 */
2805 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2806 v->nmi_pending = 1;
2808 return 0;
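/*
 * Rough usage sketch (assumed interface, not shown in this file): a PV
 * guest normally reaches this via its NMI-callback registration hypercall
 * (HYPERVISOR_nmi_op with XENNMI_register_callback in the public headers
 * of this era), passing the virtual address of its NMI handler. The entry
 * lands in trap_ctxt[TRAP_nmi] like any other virtual IDT entry, with
 * event delivery masked on entry via TI_SET_IF above.
 */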
2811 long unregister_guest_nmi_callback(void)
2813 struct vcpu *v = current;
2814 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2816 memset(t, 0, sizeof(*t));
2818 return 0;
2821 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2823 struct trap_info cur;
2824 struct vcpu *curr = current;
2825 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2826 long rc = 0;
2828 /* If no table is presented then clear the entire virtual IDT. */
2829 if ( guest_handle_is_null(traps) )
2831 memset(dst, 0, 256 * sizeof(*dst));
2832 init_int80_direct_trap(curr);
2833 return 0;
2836 for ( ; ; )
2838 if ( hypercall_preempt_check() )
2840 rc = hypercall_create_continuation(
2841 __HYPERVISOR_set_trap_table, "h", traps);
2842 break;
2845 if ( copy_from_guest(&cur, traps, 1) )
2847 rc = -EFAULT;
2848 break;
2851 if ( cur.address == 0 )
2852 break;
2854 fixup_guest_code_selector(curr->domain, cur.cs);
2856 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2858 if ( cur.vector == 0x80 )
2859 init_int80_direct_trap(curr);
2861 guest_handle_add_offset(traps, 1);
2864 return rc;
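/*
 * Guest-side usage, as a minimal sketch (hypothetical handler names; the
 * selector is whatever kernel CS the guest runs with, and Xen adjusts it
 * above via fixup_guest_code_selector()):
 *
 *     struct trap_info traps[] = {
 *         { 14,   0, guest_cs, (unsigned long)pf_handler      },
 *         { 0x80, 3, guest_cs, (unsigned long)syscall_handler },
 *         { 0, 0, 0, 0 }      (an address of zero terminates the table)
 *     };
 *     HYPERVISOR_set_trap_table(traps);
 *
 * Entries are copied in one at a time with preemption checks; the 0x80
 * entry additionally arms the direct int80 path via init_int80_direct_trap().
 */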
2867 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2869 int i;
2870 struct vcpu *curr = current;
2872 switch ( reg )
2874 case 0:
2875 if ( !access_ok(value, sizeof(long)) )
2876 return -EPERM;
2877 if ( v == curr )
2878 write_debugreg(0, value);
2879 break;
2880 case 1:
2881 if ( !access_ok(value, sizeof(long)) )
2882 return -EPERM;
2883 if ( v == curr )
2884 write_debugreg(1, value);
2885 break;
2886 case 2:
2887 if ( !access_ok(value, sizeof(long)) )
2888 return -EPERM;
2889 if ( v == curr )
2890 write_debugreg(2, value);
2891 break;
2892 case 3:
2893 if ( !access_ok(value, sizeof(long)) )
2894 return -EPERM;
2895 if ( v == curr )
2896 write_debugreg(3, value);
2897 break;
2898 case 6:
2899 /*
2900 * DR6: Bits 4-11,16-31 reserved (set to 1).
2901 * Bit 12 reserved (set to 0).
2902 */
2903 value &= 0xffffefff; /* reserved bits => 0 */
2904 value |= 0xffff0ff0; /* reserved bits => 1 */
2905 if ( v == curr )
2906 write_debugreg(6, value);
2907 break;
2908 case 7:
2909 /*
2910 * DR7: Bit 10 reserved (set to 1).
2911 * Bits 11-12,14-15 reserved (set to 0).
2912 */
2913 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2914 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
2915 /*
2916 * Privileged bits:
2917 * GD (bit 13): must be 0.
2918 */
2919 if ( value & DR_GENERAL_DETECT )
2920 return -EPERM;
2921 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
2922 if ( value & DR7_ACTIVE_MASK )
2924 unsigned int io_enable = 0;
2926 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2928 if ( ((value >> i) & 3) == DR_IO )
2930 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
2931 return -EPERM;
2932 io_enable |= value & (3 << ((i - 16) >> 1));
2934 #ifdef __i386__
2935 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
2936 !boot_cpu_has(X86_FEATURE_LM)) &&
2937 (((value >> i) & 0xc) == DR_LEN_8) )
2938 return -EPERM;
2939 #endif
2942 /* Guest DR5 is a handy stash for I/O intercept information. */
2943 v->arch.guest_context.debugreg[5] = io_enable;
2944 value &= ~io_enable;
2946 /*
2947 * If DR7 was previously clear then we need to load all other
2948 * debug registers at this point as they were not restored during
2949 * context switch.
2950 */
2951 if ( (v == curr) &&
2952 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
2954 write_debugreg(0, v->arch.guest_context.debugreg[0]);
2955 write_debugreg(1, v->arch.guest_context.debugreg[1]);
2956 write_debugreg(2, v->arch.guest_context.debugreg[2]);
2957 write_debugreg(3, v->arch.guest_context.debugreg[3]);
2958 write_debugreg(6, v->arch.guest_context.debugreg[6]);
2961 if ( v == curr )
2962 write_debugreg(7, value);
2963 break;
2964 default:
2965 return -EINVAL;
2968 v->arch.guest_context.debugreg[reg] = value;
2969 return 0;
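/*
 * Worked example for the DR7 handling above (illustrative): if the guest
 * programs breakpoint 1 as an I/O breakpoint, the R/W1 field (DR7 bits
 * 21:20) equals DR_IO, so the loop sees i == 20 and collects the matching
 * L1/G1 enables, value & (3 << ((20 - 16) >> 1)) == value & 0xc, into
 * io_enable. Those bits are stashed in debugreg[5] and masked out of the
 * value written to the real DR7, so only non-I/O breakpoints are armed in
 * hardware; do_get_debugreg() below merges debugreg[5] back in when the
 * guest reads DR7. CR4.DE must be set or the request fails with -EPERM.
 */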
2972 long do_set_debugreg(int reg, unsigned long value)
2974 return set_debugreg(current, reg, value);
2977 unsigned long do_get_debugreg(int reg)
2979 struct vcpu *curr = current;
2981 switch ( reg )
2983 case 0 ... 3:
2984 case 6:
2985 return curr->arch.guest_context.debugreg[reg];
2986 case 7:
2987 return (curr->arch.guest_context.debugreg[7] |
2988 curr->arch.guest_context.debugreg[5]);
2989 case 4 ... 5:
2990 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
2991 curr->arch.guest_context.debugreg[reg + 2] : 0);
2994 return -EINVAL;
2997 /*
2998 * Local variables:
2999 * mode: C
3000 * c-set-style: "BSD"
3001 * c-basic-offset: 4
3002 * tab-width: 4
3003 * indent-tabs-mode: nil
3004 * End:
3005 */