ia64/xen-unstable: xen/arch/x86/traps.c @ 16427:fd3f6d814f6d

x86: single step after instruction emulation

Inject a single-step trap after emulating instructions if the guest's
EFLAGS.TF is set.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Nov 22 18:28:47 2007 +0000 (2007-11-22)
parents 66a7ff355762
children 69b56d3289f5
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
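Usage note (not part of the source): string_param() registers "nmi" as a Xen boot-time option, so the compiled-in default above can be overridden from the hypervisor command line, e.g. with nmi=ignore or nmi=dom0.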
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
91 asmlinkage void nmi(void);
92 asmlinkage void machine_check(void);
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
114 static int debug_stack_lines = 20;
115 integer_param("debug_stack_lines", debug_stack_lines);
117 static int opt_ler;
118 boolean_param("ler", opt_ler);
120 #ifdef CONFIG_X86_32
121 #define stack_words_per_line 8
122 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
123 #else
124 #define stack_words_per_line 4
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
126 #endif
128 static void show_guest_stack(struct cpu_user_regs *regs)
129 {
130 int i;
131 struct vcpu *curr = current;
132 unsigned long *stack, addr;
134 if ( is_hvm_vcpu(curr) )
135 return;
137 if ( is_pv_32on64_vcpu(curr) )
138 {
139 compat_show_guest_stack(regs, debug_stack_lines);
140 return;
141 }
143 if ( vm86_mode(regs) )
144 {
145 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
146 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
147 regs->ss, (uint16_t)(regs->esp & 0xffff));
148 }
149 else
150 {
151 stack = (unsigned long *)regs->esp;
152 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
153 }
155 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
156 {
157 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
158 break;
159 if ( get_user(addr, stack) )
160 {
161 if ( i != 0 )
162 printk("\n ");
163 printk("Fault while accessing guest memory.");
164 i = 1;
165 break;
166 }
167 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
168 printk("\n ");
169 printk(" %p", _p(addr));
170 stack++;
171 }
172 if ( i == 0 )
173 printk("Stack empty.");
174 printk("\n");
175 }
177 #if !defined(CONFIG_FRAME_POINTER)
179 static void show_trace(struct cpu_user_regs *regs)
180 {
181 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
183 printk("Xen call trace:\n ");
185 printk("[<%p>]", _p(regs->eip));
186 print_symbol(" %s\n ", regs->eip);
188 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
189 {
190 addr = *stack++;
191 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
192 {
193 printk("[<%p>]", _p(addr));
194 print_symbol(" %s\n ", addr);
195 }
196 }
198 printk("\n");
199 }
201 #else
203 static void show_trace(struct cpu_user_regs *regs)
204 {
205 unsigned long *frame, next, addr, low, high;
207 printk("Xen call trace:\n ");
209 printk("[<%p>]", _p(regs->eip));
210 print_symbol(" %s\n ", regs->eip);
212 /* Bounds for range of valid frame pointer. */
213 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
214 high = (low & ~(STACK_SIZE - 1)) +
215 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
217 /* The initial frame pointer. */
218 next = regs->ebp;
220 for ( ; ; )
221 {
222 /* Valid frame pointer? */
223 if ( (next < low) || (next >= high) )
224 {
225 /*
226 * Exception stack frames have a different layout, denoted by an
227 * inverted frame pointer.
228 */
229 next = ~next;
230 if ( (next < low) || (next >= high) )
231 break;
232 frame = (unsigned long *)next;
233 next = frame[0];
234 addr = frame[(offsetof(struct cpu_user_regs, eip) -
235 offsetof(struct cpu_user_regs, ebp))
236 / BYTES_PER_LONG];
237 }
238 else
239 {
240 /* Ordinary stack frame. */
241 frame = (unsigned long *)next;
242 next = frame[0];
243 addr = frame[1];
244 }
246 printk("[<%p>]", _p(addr));
247 print_symbol(" %s\n ", addr);
249 low = (unsigned long)&frame[2];
250 }
252 printk("\n");
253 }
255 #endif
257 void show_stack(struct cpu_user_regs *regs)
258 {
259 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
260 int i;
262 if ( guest_mode(regs) )
263 return show_guest_stack(regs);
265 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
267 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
268 {
269 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
270 break;
271 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
272 printk("\n ");
273 addr = *stack++;
274 printk(" %p", _p(addr));
275 }
276 if ( i == 0 )
277 printk("Stack empty.");
278 printk("\n");
280 show_trace(regs);
281 }
283 void show_stack_overflow(unsigned int cpu, unsigned long esp)
284 {
285 #ifdef MEMORY_GUARD
286 unsigned long esp_top, esp_bottom;
287 unsigned long *stack, addr;
289 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
290 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
292 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
293 (void *)esp_top, (void *)esp_bottom, (void *)esp,
294 (void *)init_tss[cpu].esp0);
296 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
297 if ( ((unsigned long)(esp - esp_top) > 512) &&
298 ((unsigned long)(esp_top - esp) > 512) )
299 {
300 printk("No stack overflow detected. Skipping stack trace.\n");
301 return;
302 }
304 if ( esp < esp_top )
305 esp = esp_top;
307 printk("Xen stack overflow (dumping trace %p-%p):\n ",
308 (void *)esp, (void *)esp_bottom);
310 stack = (unsigned long *)esp;
311 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
312 {
313 addr = *stack++;
314 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
315 {
316 printk("%p: [<%p>]", stack, _p(addr));
317 print_symbol(" %s\n ", addr);
318 }
319 }
321 printk("\n");
322 #endif
323 }
325 void show_execution_state(struct cpu_user_regs *regs)
326 {
327 show_registers(regs);
328 show_stack(regs);
329 }
331 char *trapstr(int trapnr)
332 {
333 static char *strings[] = {
334 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
335 "invalid opcode", "device not available", "double fault",
336 "coprocessor segment", "invalid tss", "segment not found",
337 "stack error", "general protection fault", "page fault",
338 "spurious interrupt", "coprocessor error", "alignment check",
339 "machine check", "simd error"
340 };
342 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
343 return "???";
345 return strings[trapnr];
346 }
348 /*
349 * This is called for faults at very unexpected times (e.g., when interrupts
350 * are disabled). In such situations we can't do much that is safe. We try to
351 * print out some tracing and then we just spin.
352 */
353 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
354 {
355 static DEFINE_PER_CPU(char, depth);
357 /*
358 * In some cases, we can end up in a vicious cycle of fatal_trap()s
359 * within fatal_trap()s. We give the problem a couple of iterations to
360 * bottom out, and then we just panic.
361 */
362 if ( ++this_cpu(depth) < 3 )
363 {
364 watchdog_disable();
365 console_start_sync();
367 show_execution_state(regs);
369 if ( trapnr == TRAP_page_fault )
370 {
371 unsigned long cr2 = read_cr2();
372 printk("Faulting linear address: %p\n", _p(cr2));
373 show_page_walk(cr2);
374 }
375 }
377 panic("FATAL TRAP: vector = %d (%s)\n"
378 "[error_code=%04x] %s\n",
379 trapnr, trapstr(trapnr), regs->error_code,
380 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
381 }
383 static int do_guest_trap(
384 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
385 {
386 struct vcpu *v = current;
387 struct trap_bounce *tb;
388 const struct trap_info *ti;
390 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
392 tb = &v->arch.trap_bounce;
393 ti = &v->arch.guest_context.trap_ctxt[trapnr];
395 tb->flags = TBF_EXCEPTION;
396 tb->cs = ti->cs;
397 tb->eip = ti->address;
399 if ( use_error_code )
400 {
401 tb->flags |= TBF_EXCEPTION_ERRCODE;
402 tb->error_code = regs->error_code;
403 }
405 if ( TI_GET_IF(ti) )
406 tb->flags |= TBF_INTERRUPT;
408 if ( unlikely(null_trap_bounce(v, tb)) )
409 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
410 "domain %d on VCPU %d [ec=%04x]\n",
411 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
412 regs->error_code);
414 return 0;
415 }
417 static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
418 {
419 regs->eip = eip;
420 regs->eflags &= ~X86_EFLAGS_RF;
421 if ( regs->eflags & X86_EFLAGS_TF )
422 {
423 current->arch.guest_context.debugreg[6] |= 0xffff4ff0;
424 do_guest_trap(TRAP_debug, regs, 0);
425 }
426 }
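An aside on the constant in instruction_done() (my reading of the DR6 layout, not stated in the source): 0xffff4ff0 is the architectural "all status bits clear" DR6 value 0xffff0ff0 with bit 14 (BS, single-step) added, so the guest's #DB handler sees the same debug status a real hardware single-step would have produced.

/* Illustrative decomposition of 0xffff4ff0 (these names are not from this file). */
#define DR6_RESERVED_ONES 0xffff0ff0UL  /* DR6 bits that read as one */
#define DR6_BS            (1UL << 14)   /* single-step (BS) status bit */
/* DR6_RESERVED_ONES | DR6_BS == 0xffff4ff0 */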
428 /*
429 * Called from asm to set up the NMI trapbounce info.
430 * Returns 0 if no callback is set up, else 1.
431 */
432 asmlinkage int set_guest_nmi_trapbounce(void)
433 {
434 struct vcpu *v = current;
435 struct trap_bounce *tb = &v->arch.trap_bounce;
436 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
437 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
438 return !null_trap_bounce(v, tb);
439 }
441 static inline int do_trap(
442 int trapnr, struct cpu_user_regs *regs, int use_error_code)
443 {
444 unsigned long fixup;
446 DEBUGGER_trap_entry(trapnr, regs);
448 if ( guest_mode(regs) )
449 return do_guest_trap(trapnr, regs, use_error_code);
451 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
452 {
453 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
454 trapnr, _p(regs->eip), _p(fixup));
455 regs->eip = fixup;
456 return 0;
457 }
459 DEBUGGER_trap_fatal(trapnr, regs);
461 show_execution_state(regs);
462 panic("FATAL TRAP: vector = %d (%s)\n"
463 "[error_code=%04x]\n",
464 trapnr, trapstr(trapnr), regs->error_code);
465 return 0;
466 }
468 #define DO_ERROR_NOCODE(trapnr, name) \
469 asmlinkage int do_##name(struct cpu_user_regs *regs) \
470 { \
471 return do_trap(trapnr, regs, 0); \
472 }
474 #define DO_ERROR(trapnr, name) \
475 asmlinkage int do_##name(struct cpu_user_regs *regs) \
476 { \
477 return do_trap(trapnr, regs, 1); \
478 }
480 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
481 DO_ERROR_NOCODE(TRAP_overflow, overflow)
482 DO_ERROR_NOCODE(TRAP_bounds, bounds)
483 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
484 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
485 DO_ERROR( TRAP_no_segment, segment_not_present)
486 DO_ERROR( TRAP_stack_error, stack_segment)
487 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
488 DO_ERROR( TRAP_alignment_check, alignment_check)
489 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
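For reference, one of the wrappers generated by the DO_ERROR macros above expands (hand-expanded here, not in the source) to a plain trampoline into do_trap():

asmlinkage int do_invalid_TSS(struct cpu_user_regs *regs)
{
    /* Third argument 1: forward the hardware error code to the guest. */
    return do_trap(TRAP_invalid_tss, regs, 1);
}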
491 int rdmsr_hypervisor_regs(
492 uint32_t idx, uint32_t *eax, uint32_t *edx)
493 {
494 idx -= 0x40000000;
495 if ( idx > 0 )
496 return 0;
498 switch ( idx )
499 {
500 case 0:
501 {
502 *eax = *edx = 0;
503 break;
504 }
505 default:
506 BUG();
507 }
509 return 1;
510 }
512 int wrmsr_hypervisor_regs(
513 uint32_t idx, uint32_t eax, uint32_t edx)
514 {
515 struct domain *d = current->domain;
517 idx -= 0x40000000;
518 if ( idx > 0 )
519 return 0;
521 switch ( idx )
522 {
523 case 0:
524 {
525 void *hypercall_page;
526 unsigned long mfn;
527 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
528 unsigned int idx = eax & 0xfff;
530 if ( idx > 0 )
531 {
532 gdprintk(XENLOG_WARNING,
533 "Dom%d: Out of range index %u to MSR %08x\n",
534 d->domain_id, idx, 0x40000000);
535 return 0;
536 }
538 mfn = gmfn_to_mfn(d, gmfn);
540 if ( !mfn_valid(mfn) ||
541 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
542 {
543 gdprintk(XENLOG_WARNING,
544 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
545 d->domain_id, gmfn, mfn, 0x40000000);
546 return 0;
547 }
549 hypercall_page = map_domain_page(mfn);
550 hypercall_page_initialise(d, hypercall_page);
551 unmap_domain_page(hypercall_page);
553 put_page_and_type(mfn_to_page(mfn));
554 break;
555 }
557 default:
558 BUG();
559 }
561 return 1;
562 }
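For illustration only: a guest installs its hypercall page by writing this MSR with the page's guest frame number, matching the decode above (gmfn in bits 63:12, page index in bits 11:0). A minimal guest-side sketch, assuming a bare wrmsr helper and that the MSR base 0x40000000 was obtained from the CPUID leaves below; none of these names come from this file:

#include <stdint.h>

static inline void wrmsr64(uint32_t msr, uint64_t val)
{
    asm volatile ( "wrmsr" : : "c" (msr),
                   "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32)) );
}

static void install_hypercall_page(uint64_t page_gfn)
{
    /* Value layout per wrmsr_hypervisor_regs(): (gfn << 12) | page index. */
    wrmsr64(0x40000000, (page_gfn << 12) | 0);
}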
564 int cpuid_hypervisor_leaves(
565 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
566 {
567 idx -= 0x40000000;
568 if ( idx > 2 )
569 return 0;
571 switch ( idx )
572 {
573 case 0:
574 *eax = 0x40000002; /* Largest leaf */
575 *ebx = 0x566e6558; /* Signature 1: "XenV" */
576 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
577 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
578 break;
580 case 1:
581 *eax = (xen_major_version() << 16) | xen_minor_version();
582 *ebx = 0; /* Reserved */
583 *ecx = 0; /* Reserved */
584 *edx = 0; /* Reserved */
585 break;
587 case 2:
588 *eax = 1; /* Number of hypercall-transfer pages */
589 *ebx = 0x40000000; /* MSR base address */
590 *ecx = 0; /* Features 1 */
591 *edx = 0; /* Features 2 */
592 break;
594 default:
595 BUG();
596 }
598 return 1;
599 }
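Guest-side view, for illustration: an HVM guest can detect Xen by checking the signature returned in ebx/ecx/edx for leaf 0x40000000, which the constants above spell out as "XenV" "MMXe" "nVMM", i.e. "XenVMMXenVMM" (a PV guest reaches these leaves through the forced-emulation path handled by emulate_forced_invalid_op() below). A sketch under those assumptions; the helper name is made up:

#include <stdint.h>
#include <string.h>

static int running_on_xen(void)
{
    uint32_t eax, sig[3];
    char s[13];

    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (sig[0]), "=c" (sig[1]), "=d" (sig[2])
                   : "0" (0x40000000) );
    memcpy(s, sig, 12);
    s[12] = '\0';
    return strcmp(s, "XenVMMXenVMM") == 0;
}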
601 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
602 {
603 char sig[5], instr[2];
604 uint32_t a, b, c, d;
605 unsigned long eip, rc;
607 a = regs->eax;
608 b = regs->ebx;
609 c = regs->ecx;
610 d = regs->edx;
611 eip = regs->eip;
613 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
614 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
615 {
616 propagate_page_fault(eip + sizeof(sig) - rc, 0);
617 return EXCRET_fault_fixed;
618 }
619 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
620 return 0;
621 eip += sizeof(sig);
623 /* We only emulate CPUID. */
624 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
625 {
626 propagate_page_fault(eip + sizeof(instr) - rc, 0);
627 return EXCRET_fault_fixed;
628 }
629 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
630 return 0;
631 eip += sizeof(instr);
633 asm (
634 "cpuid"
635 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
636 : "0" (a), "1" (b), "2" (c), "3" (d) );
638 if ( regs->eax == 1 )
639 {
640 /* Modify Feature Information. */
641 clear_bit(X86_FEATURE_VME, &d);
642 clear_bit(X86_FEATURE_DE, &d);
643 clear_bit(X86_FEATURE_PSE, &d);
644 clear_bit(X86_FEATURE_PGE, &d);
645 if ( !cpu_has_sep )
646 clear_bit(X86_FEATURE_SEP, &d);
647 #ifdef __i386__
648 if ( !supervisor_mode_kernel )
649 clear_bit(X86_FEATURE_SEP, &d);
650 #endif
651 if ( !IS_PRIV(current->domain) )
652 clear_bit(X86_FEATURE_MTRR, &d);
653 }
654 else if ( regs->eax == 0x80000001 )
655 {
656 /* Modify Feature Information. */
657 #ifdef __i386__
658 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
659 #endif
660 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
661 }
662 else
663 {
664 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
665 }
667 regs->eax = a;
668 regs->ebx = b;
669 regs->ecx = c;
670 regs->edx = d;
672 instruction_done(regs, eip);
674 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
676 return EXCRET_fault_fixed;
677 }
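From the guest's perspective, the forced-emulation sequence this function recognises is a ud2 (0f 0b) followed by the ASCII bytes "xen" and then a real cpuid (0f a2). A guest-side sketch reconstructed from the byte checks above; treat the macro spelling as an assumption rather than the guest headers' exact definition:

#include <stdint.h>

/* ud2 ; .ascii "xen"  ==  0f 0b 78 65 6e */
#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "

static void xen_cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
                      uint32_t *ecx, uint32_t *edx)
{
    /* The #UD is trapped by Xen, which spots the signature and emulates
     * the following CPUID with the feature bits adjusted above. */
    asm volatile ( XEN_EMULATE_PREFIX "cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (leaf) );
}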
679 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
680 {
681 struct bug_frame bug;
682 struct bug_frame_str bug_str;
683 char *filename, *predicate, *eip = (char *)regs->eip;
684 int rc, id, lineno;
686 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
688 if ( likely(guest_mode(regs)) )
689 {
690 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
691 return rc;
692 return do_guest_trap(TRAP_invalid_op, regs, 0);
693 }
695 if ( !is_kernel(eip) ||
696 __copy_from_user(&bug, eip, sizeof(bug)) ||
697 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
698 (bug.ret != 0xc2) )
699 goto die;
700 eip += sizeof(bug);
702 id = bug.id & 3;
704 if ( id == BUGFRAME_dump )
705 {
706 show_execution_state(regs);
707 regs->eip = (unsigned long)eip;
708 return EXCRET_fault_fixed;
709 }
711 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
712 if ( !is_kernel(eip) ||
713 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
714 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
715 goto die;
716 eip += sizeof(bug_str);
718 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
719 lineno = bug.id >> 2;
721 if ( id == BUGFRAME_warn )
722 {
723 printk("Xen WARN at %.50s:%d\n", filename, lineno);
724 show_execution_state(regs);
725 regs->eip = (unsigned long)eip;
726 return EXCRET_fault_fixed;
727 }
729 if ( id == BUGFRAME_bug )
730 {
731 printk("Xen BUG at %.50s:%d\n", filename, lineno);
732 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
733 show_execution_state(regs);
734 panic("Xen BUG at %.50s:%d\n", filename, lineno);
735 }
737 /* ASSERT: decode the predicate string pointer. */
738 ASSERT(id == BUGFRAME_assert);
739 if ( !is_kernel(eip) ||
740 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
741 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
742 goto die;
743 eip += sizeof(bug_str);
745 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
746 printk("Assertion '%s' failed at %.50s:%d\n",
747 predicate, filename, lineno);
748 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
749 show_execution_state(regs);
750 panic("Assertion '%s' failed at %.50s:%d\n",
751 predicate, filename, lineno);
753 die:
754 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
755 show_execution_state(regs);
756 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
757 return 0;
758 }
760 asmlinkage int do_int3(struct cpu_user_regs *regs)
761 {
762 DEBUGGER_trap_entry(TRAP_int3, regs);
764 if ( !guest_mode(regs) )
765 {
766 DEBUGGER_trap_fatal(TRAP_int3, regs);
767 show_execution_state(regs);
768 panic("FATAL TRAP: vector = 3 (Int3)\n");
769 }
771 return do_guest_trap(TRAP_int3, regs, 0);
772 }
774 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
775 {
776 extern fastcall void (*machine_check_vector)(
777 struct cpu_user_regs *, long error_code);
778 machine_check_vector(regs, regs->error_code);
779 }
781 void propagate_page_fault(unsigned long addr, u16 error_code)
782 {
783 struct trap_info *ti;
784 struct vcpu *v = current;
785 struct trap_bounce *tb = &v->arch.trap_bounce;
787 v->arch.guest_context.ctrlreg[2] = addr;
788 arch_set_cr2(v, addr);
790 /* Re-set error_code.user flag appropriately for the guest. */
791 error_code &= ~PFEC_user_mode;
792 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
793 error_code |= PFEC_user_mode;
795 trace_pv_page_fault(addr, error_code);
797 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
798 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
799 tb->error_code = error_code;
800 tb->cs = ti->cs;
801 tb->eip = ti->address;
802 if ( TI_GET_IF(ti) )
803 tb->flags |= TBF_INTERRUPT;
804 if ( unlikely(null_trap_bounce(v, tb)) )
805 {
806 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
807 v->domain->domain_id, v->vcpu_id, error_code);
808 show_page_walk(addr);
809 }
810 }
812 static int handle_gdt_ldt_mapping_fault(
813 unsigned long offset, struct cpu_user_regs *regs)
814 {
815 struct vcpu *curr = current;
816 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
817 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
818 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
820 /* Should never fault in another vcpu's area. */
821 BUG_ON(vcpu_area != curr->vcpu_id);
823 /* Byte offset within the gdt/ldt sub-area. */
824 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
826 if ( likely(is_ldt_area) )
827 {
828 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
829 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
830 {
831 if ( guest_mode(regs) )
832 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
833 regs->eip, offset);
834 }
835 else
836 {
837 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
838 if ( !guest_mode(regs) )
839 return 0;
840 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
841 propagate_page_fault(
842 curr->arch.guest_context.ldt_base + offset,
843 regs->error_code);
844 }
845 }
846 else
847 {
848 /* GDT fault: handle the fault as #GP(selector). */
849 regs->error_code = (u16)offset & ~7;
850 (void)do_general_protection(regs);
851 }
853 return EXCRET_fault_fixed;
854 }
856 #ifdef HYPERVISOR_VIRT_END
857 #define IN_HYPERVISOR_RANGE(va) \
858 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
859 #else
860 #define IN_HYPERVISOR_RANGE(va) \
861 (((va) >= HYPERVISOR_VIRT_START))
862 #endif
864 static int __spurious_page_fault(
865 unsigned long addr, struct cpu_user_regs *regs)
866 {
867 unsigned long mfn, cr3 = read_cr3();
868 #if CONFIG_PAGING_LEVELS >= 4
869 l4_pgentry_t l4e, *l4t;
870 #endif
871 #if CONFIG_PAGING_LEVELS >= 3
872 l3_pgentry_t l3e, *l3t;
873 #endif
874 l2_pgentry_t l2e, *l2t;
875 l1_pgentry_t l1e, *l1t;
876 unsigned int required_flags, disallowed_flags;
878 /* Reserved bit violations are never spurious faults. */
879 if ( regs->error_code & PFEC_reserved_bit )
880 return 0;
882 required_flags = _PAGE_PRESENT;
883 if ( regs->error_code & PFEC_write_access )
884 required_flags |= _PAGE_RW;
885 if ( regs->error_code & PFEC_user_mode )
886 required_flags |= _PAGE_USER;
888 disallowed_flags = 0;
889 if ( regs->error_code & PFEC_insn_fetch )
890 disallowed_flags |= _PAGE_NX;
892 mfn = cr3 >> PAGE_SHIFT;
894 #if CONFIG_PAGING_LEVELS >= 4
895 l4t = map_domain_page(mfn);
896 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
897 mfn = l4e_get_pfn(l4e);
898 unmap_domain_page(l4t);
899 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
900 (l4e_get_flags(l4e) & disallowed_flags) )
901 return 0;
902 #endif
904 #if CONFIG_PAGING_LEVELS >= 3
905 l3t = map_domain_page(mfn);
906 #ifdef CONFIG_X86_PAE
907 l3t += (cr3 & 0xFE0UL) >> 3;
908 #endif
909 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
910 mfn = l3e_get_pfn(l3e);
911 unmap_domain_page(l3t);
912 #ifdef CONFIG_X86_PAE
913 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
914 return 0;
915 #else
916 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
917 (l3e_get_flags(l3e) & disallowed_flags) )
918 return 0;
919 #endif
920 #endif
922 l2t = map_domain_page(mfn);
923 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
924 mfn = l2e_get_pfn(l2e);
925 unmap_domain_page(l2t);
926 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
927 (l2e_get_flags(l2e) & disallowed_flags) )
928 return 0;
929 if ( l2e_get_flags(l2e) & _PAGE_PSE )
930 {
931 l1e = l1e_empty(); /* define before use in debug tracing */
932 goto spurious;
933 }
935 l1t = map_domain_page(mfn);
936 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
937 mfn = l1e_get_pfn(l1e);
938 unmap_domain_page(l1t);
939 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
940 (l1e_get_flags(l1e) & disallowed_flags) )
941 return 0;
943 spurious:
944 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
945 "at addr %lx, e/c %04x\n",
946 current->domain->domain_id, current->vcpu_id,
947 addr, regs->error_code);
948 #if CONFIG_PAGING_LEVELS >= 4
949 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
950 #endif
951 #if CONFIG_PAGING_LEVELS >= 3
952 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
953 #endif
954 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
955 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
956 #ifndef NDEBUG
957 show_registers(regs);
958 #endif
959 return 1;
960 }
962 static int spurious_page_fault(
963 unsigned long addr, struct cpu_user_regs *regs)
964 {
965 unsigned long flags;
966 int is_spurious;
968 /*
969 * Disabling interrupts prevents TLB flushing, and hence prevents
970 * page tables from becoming invalid under our feet during the walk.
971 */
972 local_irq_save(flags);
973 is_spurious = __spurious_page_fault(addr, regs);
974 local_irq_restore(flags);
976 return is_spurious;
977 }
979 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
980 {
981 struct vcpu *v = current;
982 struct domain *d = v->domain;
984 /* No fixups in interrupt context or when interrupts are disabled. */
985 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
986 return 0;
988 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
989 {
990 if ( paging_mode_external(d) && guest_mode(regs) )
991 {
992 int ret = paging_fault(addr, regs);
993 if ( ret == EXCRET_fault_fixed )
994 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
995 return ret;
996 }
997 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
998 return handle_gdt_ldt_mapping_fault(
999 addr - GDT_LDT_VIRT_START, regs);
1000 return 0;
1003 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1004 guest_kernel_mode(v, regs) &&
1005 /* Do not check if access-protection fault since the page may
1006 legitimately be not present in shadow page tables */
1007 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
1008 ptwr_do_page_fault(v, addr, regs) )
1009 return EXCRET_fault_fixed;
1011 if ( paging_mode_enabled(d) )
1013 int ret = paging_fault(addr, regs);
1014 if ( ret == EXCRET_fault_fixed )
1015 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1016 return ret;
1019 return 0;
1022 /*
1023 * #PF error code:
1024 * Bit 0: Protection violation (=1) ; Page not present (=0)
1025 * Bit 1: Write access
1026 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1027 * Bit 3: Reserved bit violation
1028 * Bit 4: Instruction fetch
1029 */
1030 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
1032 unsigned long addr, fixup;
1033 int rc;
1035 addr = read_cr2();
1037 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1039 perfc_incr(page_faults);
1041 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
1042 return rc;
1044 if ( unlikely(!guest_mode(regs)) )
1046 if ( spurious_page_fault(addr, regs) )
1047 return EXCRET_not_a_fault;
1049 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1051 perfc_incr(copy_user_faults);
1052 regs->eip = fixup;
1053 return 0;
1056 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1058 show_execution_state(regs);
1059 show_page_walk(addr);
1060 panic("FATAL PAGE FAULT\n"
1061 "[error_code=%04x]\n"
1062 "Faulting linear address: %p\n",
1063 regs->error_code, _p(addr));
1066 propagate_page_fault(addr, regs->error_code);
1067 return 0;
1070 /*
1071 * Early handler to deal with spurious page faults. For example, consider a
1072 * routine that uses a mapping immediately after installing it (making it
1073 * present). The CPU may speculatively execute the memory access before
1074 * executing the PTE write. The instruction will then be marked to cause a
1075 * page fault when it is retired, despite the fact that the PTE is present and
1076 * correct at that point in time.
1077 */
1078 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1080 static int stuck;
1081 static unsigned long prev_eip, prev_cr2;
1082 unsigned long cr2 = read_cr2();
1084 BUG_ON(smp_processor_id() != 0);
1086 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1088 prev_eip = regs->eip;
1089 prev_cr2 = cr2;
1090 stuck = 0;
1091 return EXCRET_not_a_fault;
1094 if ( stuck++ == 1000 )
1095 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1096 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1098 return EXCRET_not_a_fault;
1101 long do_fpu_taskswitch(int set)
1103 struct vcpu *v = current;
1105 if ( set )
1107 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1108 stts();
1110 else
1112 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1113 if ( v->fpu_dirtied )
1114 clts();
1117 return 0;
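A brief cross-reference, not from this file: do_fpu_taskswitch() is the handler behind the PV fpu_taskswitch hypercall, which lets a guest kernel toggle its virtual CR0.TS without a privileged CR0 write. Assuming the usual wrapper from the public PV headers, a guest typically issues HYPERVISOR_fpu_taskswitch(1) when switching away from an FPU-using task (so the next FPU use raises #NM) and HYPERVISOR_fpu_taskswitch(0) once it has restored the FPU state, at which point the clts() above takes effect if fpu_dirtied is set.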
1120 static int read_descriptor(unsigned int sel,
1121 const struct vcpu *v,
1122 const struct cpu_user_regs * regs,
1123 unsigned long *base,
1124 unsigned long *limit,
1125 unsigned int *ar,
1126 unsigned int vm86attr)
1128 struct desc_struct desc;
1130 if ( !vm86_mode(regs) )
1132 if ( sel < 4)
1133 desc.b = desc.a = 0;
1134 else if ( __get_user(desc,
1135 (const struct desc_struct *)(!(sel & 4)
1136 ? GDT_VIRT_START(v)
1137 : LDT_VIRT_START(v))
1138 + (sel >> 3)) )
1139 return 0;
1140 if ( !(vm86attr & _SEGMENT_CODE) )
1141 desc.b &= ~_SEGMENT_L;
1143 else
1145 desc.a = (sel << 20) | 0xffff;
1146 desc.b = vm86attr | (sel >> 12);
1149 *ar = desc.b & 0x00f0ff00;
1150 if ( !(desc.b & _SEGMENT_L) )
1152 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1153 (desc.b & 0xff000000));
1154 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1155 if ( desc.b & _SEGMENT_G )
1156 *limit = ((*limit + 1) << 12) - 1;
1157 #ifndef NDEBUG
1158 if ( !vm86_mode(regs) && (sel > 3) )
1160 unsigned int a, l;
1161 unsigned char valid;
1163 asm volatile (
1164 "larl %2,%0 ; setz %1"
1165 : "=r" (a), "=rm" (valid) : "rm" (sel));
1166 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1167 asm volatile (
1168 "lsll %2,%0 ; setz %1"
1169 : "=r" (l), "=rm" (valid) : "rm" (sel));
1170 BUG_ON(valid && (l != *limit));
1172 #endif
1174 else
1176 *base = 0UL;
1177 *limit = ~0UL;
1180 return 1;
1183 #ifdef __x86_64__
1184 static int read_gate_descriptor(unsigned int gate_sel,
1185 const struct vcpu *v,
1186 unsigned int *sel,
1187 unsigned long *off,
1188 unsigned int *ar)
1190 struct desc_struct desc;
1191 const struct desc_struct *pdesc;
1194 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1195 GDT_VIRT_START(v) :
1196 LDT_VIRT_START(v))
1197 + (gate_sel >> 3);
1198 if ( gate_sel < 4 ||
1199 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1200 __get_user(desc, pdesc) )
1201 return 0;
1203 *sel = (desc.a >> 16) & 0x0000fffc;
1204 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1205 *ar = desc.b & 0x0000ffff;
1206 /*
1207 * check_descriptor() clears the DPL field and stores the
1208 * guest requested DPL in the selector's RPL field.
1209 */
1210 ASSERT(!(*ar & _SEGMENT_DPL));
1211 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1213 if ( !is_pv_32bit_vcpu(v) )
1215 if ( (*ar & 0x1f00) != 0x0c00 ||
1216 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1217 __get_user(desc, pdesc + 1) ||
1218 (desc.b & 0x1f00) )
1219 return 0;
1221 *off |= (unsigned long)desc.a << 32;
1222 return 1;
1225 switch ( *ar & 0x1f00 )
1227 case 0x0400:
1228 *off &= 0xffff;
1229 break;
1230 case 0x0c00:
1231 break;
1232 default:
1233 return 0;
1236 return 1;
1238 #endif
1240 /* Has the guest requested sufficient permission for this I/O access? */
1241 static inline int guest_io_okay(
1242 unsigned int port, unsigned int bytes,
1243 struct vcpu *v, struct cpu_user_regs *regs)
1245 #if defined(__x86_64__)
1246 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1247 int user_mode = !(v->arch.flags & TF_kernel_mode);
1248 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1249 #elif defined(__i386__)
1250 #define TOGGLE_MODE() ((void)0)
1251 #endif
1253 if ( !vm86_mode(regs) &&
1254 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1255 return 1;
1257 if ( v->arch.iobmp_limit > (port + bytes) )
1259 union { uint8_t bytes[2]; uint16_t mask; } x;
1261 /*
1262 * Grab permission bytes from guest space. Inaccessible bytes are
1263 * read as 0xff (no access allowed).
1264 */
1265 TOGGLE_MODE();
1266 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1267 port>>3, 2) )
1269 default: x.bytes[0] = ~0;
1270 case 1: x.bytes[1] = ~0;
1271 case 0: break;
1273 TOGGLE_MODE();
1275 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1276 return 1;
1279 return 0;
1282 /* Has the administrator granted sufficient permission for this I/O access? */
1283 static inline int admin_io_okay(
1284 unsigned int port, unsigned int bytes,
1285 struct vcpu *v, struct cpu_user_regs *regs)
1287 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1290 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1291 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1292 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1293 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1294 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1295 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1297 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1298 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1299 __attribute__((__regparm__(1)));
1300 unsigned long guest_to_host_gpr_switch(unsigned long)
1301 __attribute__((__regparm__(1)));
1303 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1305 /* Instruction fetch with error handling. */
1306 #define insn_fetch(type, base, eip, limit) \
1307 ({ unsigned long _rc, _ptr = (base) + (eip); \
1308 type _x; \
1309 if ( ad_default < 8 ) \
1310 _ptr = (unsigned int)_ptr; \
1311 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1312 goto fail; \
1313 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1314 { \
1315 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1316 return EXCRET_fault_fixed; \
1317 } \
1318 (eip) += sizeof(_x); _x; })
1320 #if defined(CONFIG_X86_32)
1321 # define read_sreg(regs, sr) ((regs)->sr)
1322 #elif defined(CONFIG_X86_64)
1323 # define read_sreg(regs, sr) read_segment_register(sr)
1324 #endif
1326 static int emulate_privileged_op(struct cpu_user_regs *regs)
1328 struct vcpu *v = current;
1329 unsigned long *reg, eip = regs->eip, res;
1330 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1331 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1332 unsigned int port, i, data_sel, ar, data, rc;
1333 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1334 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1335 ? regs->reg \
1336 : ad_bytes == 4 \
1337 ? (u32)regs->reg \
1338 : (u16)regs->reg)
1339 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1340 ? regs->reg = (val) \
1341 : ad_bytes == 4 \
1342 ? (*(u32 *)&regs->reg = (val)) \
1343 : (*(u16 *)&regs->reg = (val)))
1344 unsigned long code_base, code_limit;
1345 char io_emul_stub[16];
1346 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1347 u32 l, h, eax, edx;
1349 if ( !read_descriptor(regs->cs, v, regs,
1350 &code_base, &code_limit, &ar,
1351 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1352 goto fail;
1353 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1354 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1355 if ( !(ar & _SEGMENT_S) ||
1356 !(ar & _SEGMENT_P) ||
1357 !(ar & _SEGMENT_CODE) )
1358 goto fail;
1360 /* emulating only opcodes not allowing SS to be default */
1361 data_sel = read_sreg(regs, ds);
1363 /* Legacy prefixes. */
1364 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1366 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1368 case 0x66: /* operand-size override */
1369 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1370 continue;
1371 case 0x67: /* address-size override */
1372 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1373 continue;
1374 case 0x2e: /* CS override */
1375 data_sel = regs->cs;
1376 continue;
1377 case 0x3e: /* DS override */
1378 data_sel = read_sreg(regs, ds);
1379 continue;
1380 case 0x26: /* ES override */
1381 data_sel = read_sreg(regs, es);
1382 continue;
1383 case 0x64: /* FS override */
1384 data_sel = read_sreg(regs, fs);
1385 lm_ovr = lm_seg_fs;
1386 continue;
1387 case 0x65: /* GS override */
1388 data_sel = read_sreg(regs, gs);
1389 lm_ovr = lm_seg_gs;
1390 continue;
1391 case 0x36: /* SS override */
1392 data_sel = regs->ss;
1393 continue;
1394 case 0xf0: /* LOCK */
1395 lock = 1;
1396 continue;
1397 case 0xf2: /* REPNE/REPNZ */
1398 case 0xf3: /* REP/REPE/REPZ */
1399 rep_prefix = 1;
1400 continue;
1401 default:
1402 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1404 rex = opcode;
1405 continue;
1407 break;
1409 break;
1412 /* REX prefix. */
1413 if ( rex & 8 ) /* REX.W */
1414 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1415 modrm_reg = (rex & 4) << 1; /* REX.R */
1416 /* REX.X does not need to be decoded. */
1417 modrm_rm = (rex & 1) << 3; /* REX.B */
1419 if ( opcode == 0x0f )
1420 goto twobyte_opcode;
1422 if ( lock )
1423 goto fail;
1425 /* Input/Output String instructions. */
1426 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1428 unsigned long data_base, data_limit;
1430 if ( rep_prefix && (rd_ad(ecx) == 0) )
1431 goto done;
1433 if ( !(opcode & 2) )
1435 data_sel = read_sreg(regs, es);
1436 lm_ovr = lm_seg_none;
1439 if ( !(ar & _SEGMENT_L) )
1441 if ( !read_descriptor(data_sel, v, regs,
1442 &data_base, &data_limit, &ar,
1443 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1444 goto fail;
1445 if ( !(ar & _SEGMENT_S) ||
1446 !(ar & _SEGMENT_P) ||
1447 (opcode & 2 ?
1448 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1449 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1450 goto fail;
1452 #ifdef CONFIG_X86_64
1453 else
1455 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1457 switch ( lm_ovr )
1459 case lm_seg_none:
1460 data_base = 0UL;
1461 break;
1462 case lm_seg_fs:
1463 data_base = v->arch.guest_context.fs_base;
1464 break;
1465 case lm_seg_gs:
1466 if ( guest_kernel_mode(v, regs) )
1467 data_base = v->arch.guest_context.gs_base_kernel;
1468 else
1469 data_base = v->arch.guest_context.gs_base_user;
1470 break;
1473 else
1474 read_descriptor(data_sel, v, regs,
1475 &data_base, &data_limit, &ar,
1476 0);
1477 data_limit = ~0UL;
1478 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1480 #endif
1482 continue_io_string:
1483 switch ( opcode )
1485 case 0x6c: /* INSB */
1486 op_bytes = 1;
1487 case 0x6d: /* INSW/INSL */
1488 if ( data_limit < op_bytes - 1 ||
1489 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1490 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1491 goto fail;
1492 port = (u16)regs->edx;
1493 switch ( op_bytes )
1495 case 1:
1496 /* emulate PIT counter 2 */
1497 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1498 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1499 pv_pit_handler(port, 0, 0) : ~0));
1500 break;
1501 case 2:
1502 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1503 break;
1504 case 4:
1505 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1506 break;
1508 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1510 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1511 PFEC_write_access);
1512 return EXCRET_fault_fixed;
1514 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1515 break;
1517 case 0x6e: /* OUTSB */
1518 op_bytes = 1;
1519 case 0x6f: /* OUTSW/OUTSL */
1520 if ( data_limit < op_bytes - 1 ||
1521 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1522 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1523 goto fail;
1524 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1525 if ( rc != 0 )
1527 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1528 return EXCRET_fault_fixed;
1530 port = (u16)regs->edx;
1531 switch ( op_bytes )
1533 case 1:
1534 if ( guest_outb_okay(port, v, regs) )
1536 outb((u8)data, port);
1537 if ( pv_post_outb_hook )
1538 pv_post_outb_hook(port, data);
1540 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1541 pv_pit_handler(port, data, 1);
1542 break;
1543 case 2:
1544 if ( guest_outw_okay(port, v, regs) )
1545 outw((u16)data, port);
1546 break;
1547 case 4:
1548 if ( guest_outl_okay(port, v, regs) )
1549 outl((u32)data, port);
1550 break;
1552 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1553 break;
1556 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1558 if ( !hypercall_preempt_check() )
1559 goto continue_io_string;
1560 eip = regs->eip;
1563 goto done;
1566 /*
1567 * Very likely to be an I/O instruction (IN/OUT).
1568 * Build an on-stack stub to execute the instruction with full guest
1569 * GPR context. This is needed for some systems which (ab)use IN/OUT
1570 * to communicate with BIOS code in system-management mode.
1571 */
1572 #ifdef __x86_64__
1573 /* movq $host_to_guest_gpr_switch,%rcx */
1574 io_emul_stub[0] = 0x48;
1575 io_emul_stub[1] = 0xb9;
1576 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1577 /* callq *%rcx */
1578 io_emul_stub[10] = 0xff;
1579 io_emul_stub[11] = 0xd1;
1580 #else
1581 /* call host_to_guest_gpr_switch */
1582 io_emul_stub[0] = 0xe8;
1583 *(s32 *)&io_emul_stub[1] =
1584 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1585 /* 7 x nop */
1586 memset(&io_emul_stub[5], 0x90, 7);
1587 #endif
1588 /* data16 or nop */
1589 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1590 /* <io-access opcode> */
1591 io_emul_stub[13] = opcode;
1592 /* imm8 or nop */
1593 io_emul_stub[14] = 0x90;
1594 /* ret (jumps to guest_to_host_gpr_switch) */
1595 io_emul_stub[15] = 0xc3;
1597 /* Handy function-typed pointer to the stub. */
1598 io_emul = (void *)io_emul_stub;
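Concretely (illustrative, assuming the 64-bit layout built above and the opcode/imm8 patching done in the IN/OUT cases below), the stub generated for an "in $0x71,%al" would disassemble roughly as:

/*
 * 48 b9 <8-byte address>  movabs $host_to_guest_gpr_switch, %rcx
 * ff d1                   call   *%rcx    ; load the guest's GPRs
 * 90                      nop             ; 0x66 prefix here for 16-bit ops
 * e4 71                   in     $0x71, %al
 * c3                      ret             ; tails into guest_to_host_gpr_switch
 */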
1600 /* I/O Port and Interrupt Flag instructions. */
1601 switch ( opcode )
1603 case 0xe4: /* IN imm8,%al */
1604 op_bytes = 1;
1605 case 0xe5: /* IN imm8,%eax */
1606 port = insn_fetch(u8, code_base, eip, code_limit);
1607 io_emul_stub[14] = port; /* imm8 */
1608 exec_in:
1609 if ( !guest_io_okay(port, op_bytes, v, regs) )
1610 goto fail;
1611 switch ( op_bytes )
1613 case 1:
1614 if ( guest_inb_okay(port, v, regs) )
1615 io_emul(regs);
1616 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1618 regs->eax &= ~0xffUL;
1619 regs->eax |= pv_pit_handler(port, 0, 0);
1621 else
1622 regs->eax |= (u8)~0;
1623 break;
1624 case 2:
1625 if ( guest_inw_okay(port, v, regs) )
1626 io_emul(regs);
1627 else
1628 regs->eax |= (u16)~0;
1629 break;
1630 case 4:
1631 if ( guest_inl_okay(port, v, regs) )
1632 io_emul(regs);
1633 else
1634 regs->eax = (u32)~0;
1635 break;
1637 goto done;
1639 case 0xec: /* IN %dx,%al */
1640 op_bytes = 1;
1641 case 0xed: /* IN %dx,%eax */
1642 port = (u16)regs->edx;
1643 goto exec_in;
1645 case 0xe6: /* OUT %al,imm8 */
1646 op_bytes = 1;
1647 case 0xe7: /* OUT %eax,imm8 */
1648 port = insn_fetch(u8, code_base, eip, code_limit);
1649 io_emul_stub[14] = port; /* imm8 */
1650 exec_out:
1651 if ( !guest_io_okay(port, op_bytes, v, regs) )
1652 goto fail;
1653 switch ( op_bytes )
1655 case 1:
1656 if ( guest_outb_okay(port, v, regs) )
1658 io_emul(regs);
1659 if ( pv_post_outb_hook )
1660 pv_post_outb_hook(port, regs->eax);
1662 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1663 pv_pit_handler(port, regs->eax, 1);
1664 break;
1665 case 2:
1666 if ( guest_outw_okay(port, v, regs) )
1667 io_emul(regs);
1668 break;
1669 case 4:
1670 if ( guest_outl_okay(port, v, regs) )
1671 io_emul(regs);
1672 break;
1674 goto done;
1676 case 0xee: /* OUT %al,%dx */
1677 op_bytes = 1;
1678 case 0xef: /* OUT %eax,%dx */
1679 port = (u16)regs->edx;
1680 goto exec_out;
1682 case 0xfa: /* CLI */
1683 case 0xfb: /* STI */
1684 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1685 goto fail;
1686 /*
1687 * This is just too dangerous to allow, in my opinion. Consider if the
1688 * caller then tries to reenable interrupts using POPF: we can't trap
1689 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1690 * do for us. :-)
1691 */
1692 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1693 goto done;
1696 /* No decode of this single-byte opcode. */
1697 goto fail;
1699 twobyte_opcode:
1700 /* Two-byte opcodes only emulated from guest kernel. */
1701 if ( !guest_kernel_mode(v, regs) )
1702 goto fail;
1704 /* Privileged (ring 0) instructions. */
1705 opcode = insn_fetch(u8, code_base, eip, code_limit);
1706 if ( lock && (opcode & ~3) != 0x20 )
1707 goto fail;
1708 switch ( opcode )
1710 case 0x06: /* CLTS */
1711 (void)do_fpu_taskswitch(0);
1712 break;
1714 case 0x09: /* WBINVD */
1715 /* Ignore the instruction if unprivileged. */
1716 if ( !cache_flush_permitted(v->domain) )
1717 /* Non-physdev domain attempted WBINVD; ignore for now since
1718 newer linux uses this in some start-of-day timing loops */
1720 else
1721 wbinvd();
1722 break;
1724 case 0x20: /* MOV CR?,<reg> */
1725 opcode = insn_fetch(u8, code_base, eip, code_limit);
1726 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1727 modrm_rm |= (opcode >> 0) & 7;
1728 reg = decode_register(modrm_rm, regs, 0);
1729 switch ( modrm_reg )
1731 case 0: /* Read CR0 */
1732 *reg = (read_cr0() & ~X86_CR0_TS) |
1733 v->arch.guest_context.ctrlreg[0];
1734 break;
1736 case 2: /* Read CR2 */
1737 *reg = v->arch.guest_context.ctrlreg[2];
1738 break;
1740 case 3: /* Read CR3 */
1741 if ( !is_pv_32on64_vcpu(v) )
1742 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1743 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1744 #ifdef CONFIG_COMPAT
1745 else
1746 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1747 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1748 #endif
1749 break;
1751 case 4: /* Read CR4 */
1752 /*
1753 * Guests can read CR4 to see what features Xen has enabled. We
1754 * therefore lie about PGE & PSE as they are unavailable to guests.
1755 */
1756 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1757 break;
1759 default:
1760 goto fail;
1762 break;
1764 case 0x21: /* MOV DR?,<reg> */
1765 opcode = insn_fetch(u8, code_base, eip, code_limit);
1766 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1767 modrm_rm |= (opcode >> 0) & 7;
1768 reg = decode_register(modrm_rm, regs, 0);
1769 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1770 goto fail;
1771 *reg = res;
1772 break;
1774 case 0x22: /* MOV <reg>,CR? */
1775 opcode = insn_fetch(u8, code_base, eip, code_limit);
1776 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1777 modrm_rm |= (opcode >> 0) & 7;
1778 reg = decode_register(modrm_rm, regs, 0);
1779 switch ( modrm_reg )
1781 case 0: /* Write CR0 */
1782 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1784 gdprintk(XENLOG_WARNING,
1785 "Attempt to change unmodifiable CR0 flags.\n");
1786 goto fail;
1788 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1789 break;
1791 case 2: /* Write CR2 */
1792 v->arch.guest_context.ctrlreg[2] = *reg;
1793 arch_set_cr2(v, *reg);
1794 break;
1796 case 3: /* Write CR3 */
1797 LOCK_BIGLOCK(v->domain);
1798 if ( !is_pv_32on64_vcpu(v) )
1799 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1800 #ifdef CONFIG_COMPAT
1801 else
1802 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1803 #endif
1804 UNLOCK_BIGLOCK(v->domain);
1805 if ( rc == 0 ) /* not okay */
1806 goto fail;
1807 break;
1809 case 4: /* Write CR4 */
1810 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1811 write_cr4(pv_guest_cr4_to_real_cr4(
1812 v->arch.guest_context.ctrlreg[4]));
1813 break;
1815 default:
1816 goto fail;
1818 break;
1820 case 0x23: /* MOV <reg>,DR? */
1821 opcode = insn_fetch(u8, code_base, eip, code_limit);
1822 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1823 modrm_rm |= (opcode >> 0) & 7;
1824 reg = decode_register(modrm_rm, regs, 0);
1825 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1826 goto fail;
1827 break;
1829 case 0x30: /* WRMSR */
1830 eax = regs->eax;
1831 edx = regs->edx;
1832 res = ((u64)edx << 32) | eax;
1833 switch ( regs->ecx )
1835 #ifdef CONFIG_X86_64
1836 case MSR_FS_BASE:
1837 if ( is_pv_32on64_vcpu(v) )
1838 goto fail;
1839 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1840 goto fail;
1841 v->arch.guest_context.fs_base = res;
1842 break;
1843 case MSR_GS_BASE:
1844 if ( is_pv_32on64_vcpu(v) )
1845 goto fail;
1846 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1847 goto fail;
1848 v->arch.guest_context.gs_base_kernel = res;
1849 break;
1850 case MSR_SHADOW_GS_BASE:
1851 if ( is_pv_32on64_vcpu(v) )
1852 goto fail;
1853 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1854 goto fail;
1855 v->arch.guest_context.gs_base_user = res;
1856 break;
1857 #endif
1858 case MSR_K7_FID_VID_STATUS:
1859 case MSR_K7_FID_VID_CTL:
1860 case MSR_K8_PSTATE_LIMIT:
1861 case MSR_K8_PSTATE_CTRL:
1862 case MSR_K8_PSTATE_STATUS:
1863 case MSR_K8_PSTATE0:
1864 case MSR_K8_PSTATE1:
1865 case MSR_K8_PSTATE2:
1866 case MSR_K8_PSTATE3:
1867 case MSR_K8_PSTATE4:
1868 case MSR_K8_PSTATE5:
1869 case MSR_K8_PSTATE6:
1870 case MSR_K8_PSTATE7:
1871 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1872 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1873 wrmsr_safe(regs->ecx, eax, edx) )
1874 goto fail;
1875 break;
1876 case MSR_IA32_PERF_CTL:
1877 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1878 (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
1879 wrmsr_safe(regs->ecx, eax, edx) )
1880 goto fail;
1881 break;
1882 default:
1883 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1884 break;
1885 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1886 (eax != l) || (edx != h) )
1887 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1888 "%08x:%08x to %08x:%08x.\n",
1889 _p(regs->ecx), h, l, edx, eax);
1890 break;
1892 break;
1894 case 0x31: /* RDTSC */
1895 rdtsc(regs->eax, regs->edx);
1896 break;
1898 case 0x32: /* RDMSR */
1899 switch ( regs->ecx )
1901 #ifdef CONFIG_X86_64
1902 case MSR_FS_BASE:
1903 if ( is_pv_32on64_vcpu(v) )
1904 goto fail;
1905 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1906 regs->edx = v->arch.guest_context.fs_base >> 32;
1907 break;
1908 case MSR_GS_BASE:
1909 if ( is_pv_32on64_vcpu(v) )
1910 goto fail;
1911 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1912 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1913 break;
1914 case MSR_SHADOW_GS_BASE:
1915 if ( is_pv_32on64_vcpu(v) )
1916 goto fail;
1917 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1918 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1919 break;
1920 #endif
1921 case MSR_K7_FID_VID_CTL:
1922 case MSR_K7_FID_VID_STATUS:
1923 case MSR_K8_PSTATE_LIMIT:
1924 case MSR_K8_PSTATE_CTRL:
1925 case MSR_K8_PSTATE_STATUS:
1926 case MSR_K8_PSTATE0:
1927 case MSR_K8_PSTATE1:
1928 case MSR_K8_PSTATE2:
1929 case MSR_K8_PSTATE3:
1930 case MSR_K8_PSTATE4:
1931 case MSR_K8_PSTATE5:
1932 case MSR_K8_PSTATE6:
1933 case MSR_K8_PSTATE7:
1934 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1935 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1936 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1937 goto fail;
1938 break;
1939 case MSR_EFER:
1940 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1941 goto fail;
1942 break;
1943 default:
1944 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1946 regs->eax = l;
1947 regs->edx = h;
1948 break;
1950 /* Everyone can read the MSR space. */
1951 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1952 _p(regs->ecx));*/
1953 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1954 goto fail;
1955 break;
1957 break;
1959 default:
1960 goto fail;
1963 #undef wr_ad
1964 #undef rd_ad
1966 done:
1967 instruction_done(regs, eip);
1968 return EXCRET_fault_fixed;
1970 fail:
1971 return 0;
1974 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
1975 unsigned int esp, unsigned int decr)
1977 return (((esp - decr) < (esp - 1)) &&
1978 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
1981 static int emulate_gate_op(struct cpu_user_regs *regs)
1983 #ifdef __x86_64__
1984 struct vcpu *v = current;
1985 unsigned int sel, ar, dpl, nparm, opnd_sel;
1986 unsigned int op_default, op_bytes, ad_default, ad_bytes;
1987 unsigned long off, eip, opnd_off, base, limit;
1988 int jump;
1990 /* Check whether this fault is due to the use of a call gate. */
1991 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
1992 ((ar >> 13) & 3) < (regs->cs & 3) ||
1993 (ar & _SEGMENT_TYPE) != 0xc00 )
1994 return do_guest_trap(TRAP_gp_fault, regs, 1);
1995 if ( !(ar & _SEGMENT_P) )
1996 return do_guest_trap(TRAP_no_segment, regs, 1);
1997 dpl = (ar >> 13) & 3;
1998 nparm = ar & 0x1f;
2000 /*
2001 * Decode instruction (and perhaps operand) to determine RPL,
2002 * whether this is a jump or a call, and the call return offset.
2003 */
2004 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2005 !(ar & _SEGMENT_S) ||
2006 !(ar & _SEGMENT_P) ||
2007 !(ar & _SEGMENT_CODE) )
2008 return do_guest_trap(TRAP_gp_fault, regs, 1);
2010 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2011 ad_default = ad_bytes = op_default;
2012 opnd_sel = opnd_off = 0;
2013 jump = -1;
2014 for ( eip = regs->eip; eip - regs->_eip < 10; )
2016 switch ( insn_fetch(u8, base, eip, limit) )
2018 case 0x66: /* operand-size override */
2019 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2020 continue;
2021 case 0x67: /* address-size override */
2022 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2023 continue;
2024 case 0x2e: /* CS override */
2025 opnd_sel = regs->cs;
2026 ASSERT(opnd_sel);
2027 continue;
2028 case 0x3e: /* DS override */
2029 opnd_sel = read_sreg(regs, ds);
2030 if ( !opnd_sel )
2031 opnd_sel = dpl;
2032 continue;
2033 case 0x26: /* ES override */
2034 opnd_sel = read_sreg(regs, es);
2035 if ( !opnd_sel )
2036 opnd_sel = dpl;
2037 continue;
2038 case 0x64: /* FS override */
2039 opnd_sel = read_sreg(regs, fs);
2040 if ( !opnd_sel )
2041 opnd_sel = dpl;
2042 continue;
2043 case 0x65: /* GS override */
2044 opnd_sel = read_sreg(regs, gs);
2045 if ( !opnd_sel )
2046 opnd_sel = dpl;
2047 continue;
2048 case 0x36: /* SS override */
2049 opnd_sel = regs->ss;
2050 if ( !opnd_sel )
2051 opnd_sel = dpl;
2052 continue;
2053 case 0xea:
2054 ++jump;
2055 /* FALLTHROUGH */
2056 case 0x9a:
2057 ++jump;
2058 opnd_sel = regs->cs;
2059 opnd_off = eip;
2060 ad_bytes = ad_default;
2061 eip += op_bytes + 2;
2062 break;
2063 case 0xff:
2065 unsigned int modrm;
2067 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2069 case 0x28: case 0x68: case 0xa8:
2070 ++jump;
2071 /* FALLTHROUGH */
2072 case 0x18: case 0x58: case 0x98:
2073 ++jump;
2074 if ( ad_bytes != 2 )
2076 if ( (modrm & 7) == 4 )
2078 unsigned int sib = insn_fetch(u8, base, eip, limit);
2080 modrm = (modrm & ~7) | (sib & 7);
2081 if ( (sib >>= 3) != 4 )
2082 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2083 opnd_off <<= sib >> 3;
2085 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2086 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2087 else
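/* mod=00, r/m=101: disp32 with no base register. Forcing mod=10 and r/m=7 keeps DS as the default segment in the switch below and makes the displacement switch fetch the disp32. */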
2088 modrm |= 0x87;
2089 if ( !opnd_sel )
2091 switch ( modrm & 7 )
2093 default:
2094 opnd_sel = read_sreg(regs, ds);
2095 break;
2096 case 4: case 5:
2097 opnd_sel = regs->ss;
2098 break;
2102 else
2104 switch ( modrm & 7 )
2106 case 0: case 1: case 7:
2107 opnd_off = regs->ebx;
2108 break;
2109 case 6:
2110 if ( !(modrm & 0xc0) )
2111 modrm |= 0x80;
2112 else
2113 case 2: case 3:
2115 opnd_off = regs->ebp;
2116 if ( !opnd_sel )
2117 opnd_sel = regs->ss;
2119 break;
2121 if ( !opnd_sel )
2122 opnd_sel = read_sreg(regs, ds);
2123 switch ( modrm & 7 )
2125 case 0: case 2: case 4:
2126 opnd_off += regs->esi;
2127 break;
2128 case 1: case 3: case 5:
2129 opnd_off += regs->edi;
2130 break;
2133 switch ( modrm & 0xc0 )
2135 case 0x40:
2136 opnd_off += insn_fetch(s8, base, eip, limit);
2137 break;
2138 case 0x80:
2139 opnd_off += insn_fetch(s32, base, eip, limit);
2140 break;
2142 if ( ad_bytes == 4 )
2143 opnd_off = (unsigned int)opnd_off;
2144 else if ( ad_bytes == 2 )
2145 opnd_off = (unsigned short)opnd_off;
2146 break;
2149 break;
2151 break;
2154 if ( jump < 0 )
2156 fail:
2157 return do_guest_trap(TRAP_gp_fault, regs, 1);
2160 if ( (opnd_sel != regs->cs &&
2161 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2162 !(ar & _SEGMENT_S) ||
2163 !(ar & _SEGMENT_P) ||
2164 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2165 return do_guest_trap(TRAP_gp_fault, regs, 1);
2167 opnd_off += op_bytes;
2168 #define ad_default ad_bytes
2169 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2170 #undef ad_default
2171 ASSERT((opnd_sel & ~3) == regs->error_code);
2172 if ( dpl < (opnd_sel & 3) )
2173 return do_guest_trap(TRAP_gp_fault, regs, 1);
2175 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2176 !(ar & _SEGMENT_S) ||
2177 !(ar & _SEGMENT_CODE) ||
2178 (!jump || (ar & _SEGMENT_EC) ?
2179 ((ar >> 13) & 3) > (regs->cs & 3) :
2180 ((ar >> 13) & 3) != (regs->cs & 3)) )
2182 regs->error_code = sel;
2183 return do_guest_trap(TRAP_gp_fault, regs, 1);
2185 if ( !(ar & _SEGMENT_P) )
2187 regs->error_code = sel;
2188 return do_guest_trap(TRAP_no_segment, regs, 1);
2190 if ( off > limit )
2192 regs->error_code = 0;
2193 return do_guest_trap(TRAP_gp_fault, regs, 1);
2196 if ( !jump )
2198 unsigned int ss, esp, *stkp;
2199 int rc;
2200 #define push(item) do \
2201 { \
2202 --stkp; \
2203 esp -= 4; \
2204 rc = __put_user(item, stkp); \
2205 if ( rc ) \
2206 { \
2207 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2208 PFEC_write_access); \
2209 return 0; \
2210 } \
2211 } while ( 0 )
2213 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2215 sel |= (ar >> 13) & 3;
2216 /* Inner stack known only for kernel ring. */
2217 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2218 return do_guest_trap(TRAP_gp_fault, regs, 1);
2219 esp = v->arch.guest_context.kernel_sp;
2220 ss = v->arch.guest_context.kernel_ss;
2221 if ( (ss & 3) != (sel & 3) ||
2222 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2223 ((ar >> 13) & 3) != (sel & 3) ||
2224 !(ar & _SEGMENT_S) ||
2225 (ar & _SEGMENT_CODE) ||
2226 !(ar & _SEGMENT_WR) )
2228 regs->error_code = ss & ~3;
2229 return do_guest_trap(TRAP_invalid_tss, regs, 1);
2231 if ( !(ar & _SEGMENT_P) ||
2232 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2234 regs->error_code = ss & ~3;
2235 return do_guest_trap(TRAP_stack_error, regs, 1);
2237 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2238 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2239 return do_guest_trap(TRAP_gp_fault, regs, 1);
2240 push(regs->ss);
2241 push(regs->esp);
2242 if ( nparm )
2244 const unsigned int *ustkp;
2246 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2247 ((ar >> 13) & 3) != (regs->cs & 3) ||
2248 !(ar & _SEGMENT_S) ||
2249 (ar & _SEGMENT_CODE) ||
2250 !(ar & _SEGMENT_WR) ||
2251 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2252 return do_guest_trap(TRAP_gp_fault, regs, 1);
2253 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2254 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2255 return do_guest_trap(TRAP_gp_fault, regs, 1);
2256 do
2258 unsigned int parm;
2260 --ustkp;
2261 rc = __get_user(parm, ustkp);
2262 if ( rc )
2264 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2265 return 0;
2267 push(parm);
2268 } while ( --nparm );
2271 else
2273 sel |= (regs->cs & 3);
2274 esp = regs->esp;
2275 ss = regs->ss;
2276 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2277 ((ar >> 13) & 3) != (sel & 3) )
2278 return do_guest_trap(TRAP_gp_fault, regs, 1);
2279 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2281 regs->error_code = 0;
2282 return do_guest_trap(TRAP_stack_error, regs, 1);
2284 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2285 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2286 return do_guest_trap(TRAP_gp_fault, regs, 1);
2288 push(regs->cs);
2289 push(eip);
2290 #undef push
2291 regs->esp = esp;
2292 regs->ss = ss;
2294 else
2295 sel |= (regs->cs & 3);
2297 regs->cs = sel;
2298 instruction_done(regs, off);
2299 #endif
2301 return 0;
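emulate_gate_op() only runs for 32-on-64 PV guests: it re-reads the call gate named by the #GP error code, decodes the far CALL/JMP that referenced it, validates the target code segment and (for calls) the inner stack, and then performs the transfer itself. For reference, a standalone sketch of how the fields it consumes sit inside a 32-bit call-gate descriptor; the descriptor value is hypothetical and the bit layout is the architectural one, not a Xen structure:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical descriptor: offset 0x00401234, selector 0xe008,
     * DPL 3, present, 32-bit call gate (type 0xC), 2 dword parameters. */
    uint32_t lo = 0xe0081234, hi = 0x0040ec02;

    unsigned int offset   = (lo & 0xffff) | (hi & 0xffff0000);
    unsigned int selector = lo >> 16;
    unsigned int nparm    = hi & 0x1f;           /* cf. ar & 0x1f        */
    unsigned int type     = (hi >> 8) & 0xf;     /* 0xC = 32-bit gate    */
    unsigned int dpl      = (hi >> 13) & 3;      /* cf. (ar >> 13) & 3   */
    unsigned int present  = (hi >> 15) & 1;      /* cf. ar & _SEGMENT_P  */

    printf("sel=%#x off=%#x type=%#x dpl=%u p=%u nparm=%u\n",
           selector, offset, type, dpl, present, nparm);
    return 0;
}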
2304 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
2306 struct vcpu *v = current;
2307 unsigned long fixup;
2309 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2311 if ( regs->error_code & 1 )
2312 goto hardware_gp;
2314 if ( !guest_mode(regs) )
2315 goto gp_in_kernel;
2317 /*
2318 * Cunning trick to allow arbitrary "INT n" handling.
2320 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2321 * instruction from trapping to the appropriate vector, when that might not
2322 * be expected by Xen or the guest OS. For example, that entry might be for
2323 * a fault handler (unlike traps, faults don't increment EIP), or might
2324 * expect an error code on the stack (which a software trap never
2325 * provides), or might be a hardware interrupt handler that doesn't like
2326 * being called spuriously.
2328 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2329 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2330 * clear to indicate that it's a software fault, not hardware.
2332 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2333 * okay because they can only be triggered by an explicit DPL-checked
2334 * instruction. The DPL specified by the guest OS for these vectors is NOT
2335 * CHECKED!!
2336 */
2337 if ( (regs->error_code & 3) == 2 )
2339 /* This fault must be due to <INT n> instruction. */
2340 const struct trap_info *ti;
2341 unsigned char vector = regs->error_code >> 3;
2342 ti = &v->arch.guest_context.trap_ctxt[vector];
2343 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2345 regs->eip += 2;
2346 return do_guest_trap(vector, regs, 0);
2349 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2350 return emulate_gate_op(regs);
2352 /* Emulate some simple privileged and I/O instructions. */
2353 if ( (regs->error_code == 0) &&
2354 emulate_privileged_op(regs) )
2356 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2357 return 0;
2360 #if defined(__i386__)
2361 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2362 (regs->error_code == 0) &&
2363 gpf_emulate_4gb(regs) )
2365 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2366 return 0;
2368 #endif
2370 /* Pass on GPF as is. */
2371 return do_guest_trap(TRAP_gp_fault, regs, 1);
2373 gp_in_kernel:
2375 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2377 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2378 regs->error_code, _p(regs->eip), _p(fixup));
2379 regs->eip = fixup;
2380 return 0;
2383 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2385 hardware_gp:
2386 show_execution_state(regs);
2387 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2388 return 0;
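The "cunning trick" described above relies on the architectural #GP error-code layout: bit 0 (EXT) is clear for a software-raised fault, bit 1 is set when an IDT entry was the source, and the vector index sits in the remaining bits, which is why the handler recovers it as error_code >> 3. A standalone sketch with a made-up error code for INT 0x80:

#include <stdio.h>

int main(void)
{
    /* Hypothetical error code for a guest "int $0x80" hitting a DPL-0 IDT entry. */
    unsigned int error_code = (0x80 << 3) | 2;    /* IDT bit set, EXT bit clear */

    if ( (error_code & 3) == 2 )                  /* software fault via the IDT */
        printf("software INT, vector 0x%02x\n", error_code >> 3);
    return 0;
}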
2391 static void nmi_softirq(void)
2393 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2394 vcpu_kick(dom0->vcpu[0]);
2397 static void nmi_dom0_report(unsigned int reason_idx)
2399 struct domain *d;
2400 struct vcpu *v;
2402 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2403 return;
2405 set_bit(reason_idx, nmi_reason(d));
2407 if ( !test_and_set_bool(v->nmi_pending) )
2408 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2411 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2413 switch ( opt_nmi[0] )
2415 case 'd': /* 'dom0' */
2416 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2417 case 'i': /* 'ignore' */
2418 break;
2419 default: /* 'fatal' */
2420 console_force_unlock();
2421 printk("\n\nNMI - MEMORY ERROR\n");
2422 fatal_trap(TRAP_nmi, regs);
2425 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2426 mdelay(1);
2427 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2430 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2432 switch ( opt_nmi[0] )
2434 case 'd': /* 'dom0' */
2435 nmi_dom0_report(_XEN_NMIREASON_io_error);
2436 case 'i': /* 'ignore' */
2437 break;
2438 default: /* 'fatal' */
2439 console_force_unlock();
2440 printk("\n\nNMI - I/O ERROR\n");
2441 fatal_trap(TRAP_nmi, regs);
2444 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2445 mdelay(1);
2446 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2449 static void unknown_nmi_error(unsigned char reason)
2451 switch ( opt_nmi[0] )
2453 case 'd': /* 'dom0' */
2454 nmi_dom0_report(_XEN_NMIREASON_unknown);
2455 case 'i': /* 'ignore' */
2456 break;
2457 default: /* 'fatal' */
2458 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2459 printk("Dazed and confused, but trying to continue\n");
2460 printk("Do you have a strange power saving mode enabled?\n");
2461 kexec_crash();
2465 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2467 return 0;
2470 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2472 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2474 unsigned int cpu = smp_processor_id();
2475 unsigned char reason;
2477 ++nmi_count(cpu);
2479 if ( nmi_callback(regs, cpu) )
2480 return;
2482 if ( nmi_watchdog )
2483 nmi_watchdog_tick(regs);
2485 /* Only the BSP gets external NMIs from the system. */
2486 if ( cpu == 0 )
2488 reason = inb(0x61);
2489 if ( reason & 0x80 )
2490 mem_parity_error(regs);
2491 else if ( reason & 0x40 )
2492 io_check_error(regs);
2493 else if ( !nmi_watchdog )
2494 unknown_nmi_error((unsigned char)(reason&0xff));
2498 void set_nmi_callback(nmi_callback_t callback)
2500 nmi_callback = callback;
2503 void unset_nmi_callback(void)
2505 nmi_callback = dummy_nmi_callback;
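set_nmi_callback() lets another subsystem see every NMI before the watchdog and platform-error decoding in do_nmi(); a non-zero return swallows the NMI entirely. A hedged sketch of a hypothetical user (the names count_nmis and nmi_hits are illustrative, and the fragment assumes the declarations visible in this file):

/* Hypothetical per-CPU NMI counter; returning 0 lets do_nmi() continue
 * with the watchdog and port-0x61 decoding as usual. */
static unsigned long nmi_hits[NR_CPUS];

static int count_nmis(struct cpu_user_regs *regs, int cpu)
{
    nmi_hits[cpu]++;
    return 0;
}

/* Registration and removal, e.g. around a diagnostic run:
 *     set_nmi_callback(count_nmis);
 *     ...
 *     unset_nmi_callback();
 */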
2508 asmlinkage int do_device_not_available(struct cpu_user_regs *regs)
2510 struct vcpu *curr = current;
2512 BUG_ON(!guest_mode(regs));
2514 setup_fpu(curr);
2516 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2518 do_guest_trap(TRAP_no_device, regs, 0);
2519 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2521 else
2522 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2524 return EXCRET_fault_fixed;
2527 asmlinkage int do_debug(struct cpu_user_regs *regs)
2529 struct vcpu *v = current;
2531 DEBUGGER_trap_entry(TRAP_debug, regs);
2533 if ( !guest_mode(regs) )
2535 if ( regs->eflags & EF_TF )
2537 #ifdef __x86_64__
2538 void sysenter_entry(void);
2539 void sysenter_eflags_saved(void);
2540 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2541 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2542 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2543 goto out;
2544 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2545 #else
2546 WARN_ON(1);
2547 #endif
2548 regs->eflags &= ~EF_TF;
2550 else
2552 /*
2553 * We ignore watchpoints when they trigger within Xen. This may
2554 * happen when a buffer is passed to us which previously had a
2555 * watchpoint set on it. No need to bump EIP; the only faulting
2556 * trap is an instruction breakpoint, which can't happen to us.
2557 */
2558 WARN_ON(!search_exception_table(regs->eip));
2560 goto out;
2563 /* Save debug status register where guest OS can peek at it */
2564 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2566 ler_enable();
2567 return do_guest_trap(TRAP_debug, regs, 0);
2569 out:
2570 ler_enable();
2571 return EXCRET_not_a_fault;
2574 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2576 return EXCRET_not_a_fault;
2579 void set_intr_gate(unsigned int n, void *addr)
2581 int i;
2582 /* Keep secondary tables in sync with IRQ updates. */
2583 for ( i = 1; i < NR_CPUS; i++ )
2584 if ( idt_tables[i] != NULL )
2585 _set_gate(&idt_tables[i][n], 14, 0, addr);
2586 _set_gate(&idt_table[n], 14, 0, addr);
2589 void set_system_gate(unsigned int n, void *addr)
2591 _set_gate(idt_table+n,14,3,addr);
2594 void set_tss_desc(unsigned int n, void *addr)
2596 _set_tssldt_desc(
2597 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2598 (unsigned long)addr,
2599 offsetof(struct tss_struct, __cacheline_filler) - 1,
2600 9);
2601 #ifdef CONFIG_COMPAT
2602 _set_tssldt_desc(
2603 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2604 (unsigned long)addr,
2605 offsetof(struct tss_struct, __cacheline_filler) - 1,
2606 11);
2607 #endif
2610 void __devinit percpu_traps_init(void)
2612 subarch_percpu_traps_init();
2614 if ( !opt_ler )
2615 return;
2617 switch ( boot_cpu_data.x86_vendor )
2619 case X86_VENDOR_INTEL:
2620 switch ( boot_cpu_data.x86 )
2622 case 6:
2623 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2624 break;
2625 case 15:
2626 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2627 break;
2629 break;
2630 case X86_VENDOR_AMD:
2631 switch ( boot_cpu_data.x86 )
2633 case 6:
2634 case 15:
2635 case 16:
2636 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2637 break;
2639 break;
2642 ler_enable();
2645 void __init trap_init(void)
2647 /*
2648 * Note that interrupt gates are always used, rather than trap gates. We
2649 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2650 * first activation must have the "bad" value(s) for these registers and
2651 * we may lose them if another activation is installed before they are
2652 * saved. The page-fault handler also needs interrupts disabled until %cr2
2653 * has been read and saved on the stack.
2654 */
2655 set_intr_gate(TRAP_divide_error,&divide_error);
2656 set_intr_gate(TRAP_debug,&debug);
2657 set_intr_gate(TRAP_nmi,&nmi);
2658 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2659 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2660 set_intr_gate(TRAP_bounds,&bounds);
2661 set_intr_gate(TRAP_invalid_op,&invalid_op);
2662 set_intr_gate(TRAP_no_device,&device_not_available);
2663 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2664 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2665 set_intr_gate(TRAP_no_segment,&segment_not_present);
2666 set_intr_gate(TRAP_stack_error,&stack_segment);
2667 set_intr_gate(TRAP_gp_fault,&general_protection);
2668 set_intr_gate(TRAP_page_fault,&page_fault);
2669 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2670 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2671 set_intr_gate(TRAP_alignment_check,&alignment_check);
2672 set_intr_gate(TRAP_machine_check,&machine_check);
2673 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2675 /* CPU0 uses the master IDT. */
2676 idt_tables[0] = idt_table;
2678 percpu_traps_init();
2680 cpu_init();
2682 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2685 long register_guest_nmi_callback(unsigned long address)
2687 struct vcpu *v = current;
2688 struct domain *d = v->domain;
2689 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2691 t->vector = TRAP_nmi;
2692 t->flags = 0;
2693 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2694 t->address = address;
2695 TI_SET_IF(t, 1);
2697 /*
2698 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2699 * now.
2700 */
2701 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2702 v->nmi_pending = 1;
2704 return 0;
2707 long unregister_guest_nmi_callback(void)
2709 struct vcpu *v = current;
2710 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2712 memset(t, 0, sizeof(*t));
2714 return 0;
2717 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2719 struct trap_info cur;
2720 struct vcpu *curr = current;
2721 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2722 long rc = 0;
2724 /* If no table is presented then clear the entire virtual IDT. */
2725 if ( guest_handle_is_null(traps) )
2727 memset(dst, 0, 256 * sizeof(*dst));
2728 init_int80_direct_trap(curr);
2729 return 0;
2732 for ( ; ; )
2734 if ( hypercall_preempt_check() )
2736 rc = hypercall_create_continuation(
2737 __HYPERVISOR_set_trap_table, "h", traps);
2738 break;
2741 if ( copy_from_guest(&cur, traps, 1) )
2743 rc = -EFAULT;
2744 break;
2747 if ( cur.address == 0 )
2748 break;
2750 fixup_guest_code_selector(curr->domain, cur.cs);
2752 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2754 if ( cur.vector == 0x80 )
2755 init_int80_direct_trap(curr);
2757 guest_handle_add_offset(traps, 1);
2760 return rc;
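From the guest's point of view this is the set_trap_table hypercall: the guest hands over an array of trap_info entries (vector, flags, cs, address, in the order register_guest_nmi_callback() above fills them) terminated by an entry whose address is zero. A guest-side sketch; the handler symbols are hypothetical, HYPERVISOR_set_trap_table is assumed to be the usual PV hypercall wrapper, and flags is assumed to carry the DPL so that 3 marks an entry callable from user mode:

/* Hypothetical PV guest trap table: int3 and the 0x80 vector are left
 * callable from ring 3, the page-fault entry is kernel-only. */
static struct trap_info guest_trap_table[] = {
    {    3, 3, FLAT_KERNEL_CS, (unsigned long)int3_entry       },
    {   14, 0, FLAT_KERNEL_CS, (unsigned long)page_fault_entry },
    { 0x80, 3, FLAT_KERNEL_CS, (unsigned long)syscall_entry    },
    {    0, 0, 0, 0 }   /* address == 0 terminates the table   */
};

/* HYPERVISOR_set_trap_table(guest_trap_table); */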
2763 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2765 int i;
2766 struct vcpu *curr = current;
2768 switch ( reg )
2770 case 0:
2771 if ( !access_ok(value, sizeof(long)) )
2772 return -EPERM;
2773 if ( v == curr )
2774 write_debugreg(0, value);
2775 break;
2776 case 1:
2777 if ( !access_ok(value, sizeof(long)) )
2778 return -EPERM;
2779 if ( v == curr )
2780 write_debugreg(1, value);
2781 break;
2782 case 2:
2783 if ( !access_ok(value, sizeof(long)) )
2784 return -EPERM;
2785 if ( v == curr )
2786 write_debugreg(2, value);
2787 break;
2788 case 3:
2789 if ( !access_ok(value, sizeof(long)) )
2790 return -EPERM;
2791 if ( v == curr )
2792 write_debugreg(3, value);
2793 break;
2794 case 6:
2795 /*
2796 * DR6: Bits 4-11,16-31 reserved (set to 1).
2797 * Bit 12 reserved (set to 0).
2798 */
2799 value &= 0xffffefff; /* reserved bits => 0 */
2800 value |= 0xffff0ff0; /* reserved bits => 1 */
2801 if ( v == curr )
2802 write_debugreg(6, value);
2803 break;
2804 case 7:
2805 /*
2806 * DR7: Bit 10 reserved (set to 1).
2807 * Bits 11-12,14-15 reserved (set to 0).
2808 * Privileged bits:
2809 * GD (bit 13): must be 0.
2810 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2811 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2812 */
2813 /* DR7 == 0 => debugging disabled for this domain. */
2814 if ( value != 0 )
2816 value &= 0xffff27ff; /* reserved bits => 0 */
2817 value |= 0x00000400; /* reserved bits => 1 */
2818 if ( (value & (1<<13)) != 0 ) return -EPERM;
2819 for ( i = 0; i < 16; i += 2 )
2820 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2821 /*
2822 * If DR7 was previously clear then we need to load all other
2823 * debug registers at this point as they were not restored during
2824 * context switch.
2825 */
2826 if ( (v == curr) && (v->arch.guest_context.debugreg[7] == 0) )
2828 write_debugreg(0, v->arch.guest_context.debugreg[0]);
2829 write_debugreg(1, v->arch.guest_context.debugreg[1]);
2830 write_debugreg(2, v->arch.guest_context.debugreg[2]);
2831 write_debugreg(3, v->arch.guest_context.debugreg[3]);
2832 write_debugreg(6, v->arch.guest_context.debugreg[6]);
2835 if ( v == curr )
2836 write_debugreg(7, value);
2837 break;
2838 default:
2839 return -EINVAL;
2842 v->arch.guest_context.debugreg[reg] = value;
2843 return 0;
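The DR7 case above follows the architectural layout of the debug control register: bit 13 is GD, and for each of the four breakpoints the R/Wn and LENn fields occupy consecutive 2-bit slots from bit 16 upward, with the 10b combination treated as invalid here. A standalone sketch running the same scan over a hypothetical DR7 value:

#include <stdio.h>

int main(void)
{
    /* Hypothetical DR7: breakpoint 0 enabled locally, but R/W0 = 10b. */
    unsigned long value = 0x00020001ul;
    int i, bad = 0;

    if ( value & (1ul << 13) )            /* GD must be clear           */
        bad = 1;
    for ( i = 0; i < 16; i += 2 )         /* R/W0..3 and LEN0..3 fields */
        if ( ((value >> (i + 16)) & 3) == 2 )
            bad = 1;

    printf("%s\n", bad ? "rejected (-EPERM)" : "accepted");
    return 0;
}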
2846 long do_set_debugreg(int reg, unsigned long value)
2848 return set_debugreg(current, reg, value);
2851 unsigned long do_get_debugreg(int reg)
2853 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2854 return current->arch.guest_context.debugreg[reg];
2857 /*
2858 * Local variables:
2859 * mode: C
2860 * c-set-style: "BSD"
2861 * c-basic-offset: 4
2862 * tab-width: 4
2863 * indent-tabs-mode: nil
2864 * End:
2865 */