ia64/xen-unstable

xen/arch/x86/traps.c @ 16263:23582bcda6e1

x86: Clean up NMI delivery logic. Allow set_trap_table vector 2 to be
specified as not disabling event delivery, just like any other vector.
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Mon Oct 29 09:49:39 2007 +0000 (2007-10-29)
parents 3fe75ef9ca93
children 4034317507de
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints a diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
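/*
 * Usage (illustrative): the option is taken from the Xen boot command line,
 * e.g. booting with "nmi=ignore" or "nmi=fatal" selects the behaviour
 * described above. Debug builds default to "fatal", NDEBUG (release) builds
 * to "dom0".
 */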
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
91 asmlinkage void nmi(void);
92 asmlinkage void machine_check(void);
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
114 static int debug_stack_lines = 20;
115 integer_param("debug_stack_lines", debug_stack_lines);
117 static int opt_ler;
118 boolean_param("ler", opt_ler);
120 #ifdef CONFIG_X86_32
121 #define stack_words_per_line 8
122 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
123 #else
124 #define stack_words_per_line 4
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
126 #endif
128 static void show_guest_stack(struct cpu_user_regs *regs)
129 {
130 int i;
131 unsigned long *stack, addr;
133 if ( is_hvm_vcpu(current) )
134 return;
136 if ( is_pv_32on64_vcpu(current) )
137 {
138 compat_show_guest_stack(regs, debug_stack_lines);
139 return;
140 }
142 if ( vm86_mode(regs) )
143 {
144 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
145 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
146 regs->ss, (uint16_t)(regs->esp & 0xffff));
147 }
148 else
149 {
150 stack = (unsigned long *)regs->esp;
151 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
152 }
154 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
155 {
156 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
157 break;
158 if ( get_user(addr, stack) )
159 {
160 if ( i != 0 )
161 printk("\n ");
162 printk("Fault while accessing guest memory.");
163 i = 1;
164 break;
165 }
166 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
167 printk("\n ");
168 printk(" %p", _p(addr));
169 stack++;
170 }
171 if ( i == 0 )
172 printk("Stack empty.");
173 printk("\n");
174 }
176 #if !defined(CONFIG_FRAME_POINTER)
178 static void show_trace(struct cpu_user_regs *regs)
179 {
180 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
182 printk("Xen call trace:\n ");
184 printk("[<%p>]", _p(regs->eip));
185 print_symbol(" %s\n ", regs->eip);
187 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
188 {
189 addr = *stack++;
190 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
191 {
192 printk("[<%p>]", _p(addr));
193 print_symbol(" %s\n ", addr);
194 }
195 }
197 printk("\n");
198 }
200 #else
202 static void show_trace(struct cpu_user_regs *regs)
203 {
204 unsigned long *frame, next, addr, low, high;
206 printk("Xen call trace:\n ");
208 printk("[<%p>]", _p(regs->eip));
209 print_symbol(" %s\n ", regs->eip);
211 /* Bounds for range of valid frame pointer. */
212 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
213 high = (low & ~(STACK_SIZE - 1)) +
214 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
216 /* The initial frame pointer. */
217 next = regs->ebp;
219 for ( ; ; )
220 {
221 /* Valid frame pointer? */
222 if ( (next < low) || (next >= high) )
223 {
224 /*
225 * Exception stack frames have a different layout, denoted by an
226 * inverted frame pointer.
227 */
228 next = ~next;
229 if ( (next < low) || (next >= high) )
230 break;
231 frame = (unsigned long *)next;
232 next = frame[0];
233 addr = frame[(offsetof(struct cpu_user_regs, eip) -
234 offsetof(struct cpu_user_regs, ebp))
235 / BYTES_PER_LONG];
236 }
237 else
238 {
239 /* Ordinary stack frame. */
240 frame = (unsigned long *)next;
241 next = frame[0];
242 addr = frame[1];
243 }
245 printk("[<%p>]", _p(addr));
246 print_symbol(" %s\n ", addr);
248 low = (unsigned long)&frame[2];
249 }
251 printk("\n");
252 }
254 #endif
256 void show_stack(struct cpu_user_regs *regs)
257 {
258 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
259 int i;
261 if ( guest_mode(regs) )
262 return show_guest_stack(regs);
264 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
266 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
267 {
268 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
269 break;
270 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
271 printk("\n ");
272 addr = *stack++;
273 printk(" %p", _p(addr));
274 }
275 if ( i == 0 )
276 printk("Stack empty.");
277 printk("\n");
279 show_trace(regs);
280 }
282 void show_stack_overflow(unsigned int cpu, unsigned long esp)
283 {
284 #ifdef MEMORY_GUARD
285 unsigned long esp_top, esp_bottom;
286 unsigned long *stack, addr;
288 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
289 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
291 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
292 (void *)esp_top, (void *)esp_bottom, (void *)esp,
293 (void *)init_tss[cpu].esp0);
295 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
296 if ( ((unsigned long)(esp - esp_top) > 512) &&
297 ((unsigned long)(esp_top - esp) > 512) )
298 {
299 printk("No stack overflow detected. Skipping stack trace.\n");
300 return;
301 }
303 if ( esp < esp_top )
304 esp = esp_top;
306 printk("Xen stack overflow (dumping trace %p-%p):\n ",
307 (void *)esp, (void *)esp_bottom);
309 stack = (unsigned long *)esp;
310 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
311 {
312 addr = *stack++;
313 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
314 {
315 printk("%p: [<%p>]", stack, _p(addr));
316 print_symbol(" %s\n ", addr);
317 }
318 }
320 printk("\n");
321 #endif
322 }
324 void show_execution_state(struct cpu_user_regs *regs)
325 {
326 show_registers(regs);
327 show_stack(regs);
328 }
330 char *trapstr(int trapnr)
331 {
332 static char *strings[] = {
333 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
334 "invalid opcode", "device not available", "double fault",
335 "coprocessor segment", "invalid tss", "segment not found",
336 "stack error", "general protection fault", "page fault",
337 "spurious interrupt", "coprocessor error", "alignment check",
338 "machine check", "simd error"
339 };
341 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
342 return "???";
344 return strings[trapnr];
345 }
347 /*
348 * This is called for faults at very unexpected times (e.g., when interrupts
349 * are disabled). In such situations we can't do much that is safe. We try to
350 * print out some tracing and then we just spin.
351 */
352 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
353 {
354 static DEFINE_PER_CPU(char, depth);
356 /*
357 * In some cases, we can end up in a vicious cycle of fatal_trap()s
358 * within fatal_trap()s. We give the problem a couple of iterations to
359 * bottom out, and then we just panic.
360 */
361 if ( ++this_cpu(depth) < 3 )
362 {
363 watchdog_disable();
364 console_start_sync();
366 show_execution_state(regs);
368 if ( trapnr == TRAP_page_fault )
369 {
370 unsigned long cr2 = read_cr2();
371 printk("Faulting linear address: %p\n", _p(cr2));
372 show_page_walk(cr2);
373 }
374 }
376 panic("FATAL TRAP: vector = %d (%s)\n"
377 "[error_code=%04x] %s\n",
378 trapnr, trapstr(trapnr), regs->error_code,
379 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
380 }
382 static int do_guest_trap(
383 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
384 {
385 struct vcpu *v = current;
386 struct trap_bounce *tb;
387 const struct trap_info *ti;
389 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
391 tb = &v->arch.trap_bounce;
392 ti = &v->arch.guest_context.trap_ctxt[trapnr];
394 tb->flags = TBF_EXCEPTION;
395 tb->cs = ti->cs;
396 tb->eip = ti->address;
398 if ( use_error_code )
399 {
400 tb->flags |= TBF_EXCEPTION_ERRCODE;
401 tb->error_code = regs->error_code;
402 }
404 if ( TI_GET_IF(ti) )
405 tb->flags |= TBF_INTERRUPT;
407 if ( unlikely(null_trap_bounce(v, tb)) )
408 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
409 "domain %d on VCPU %d [ec=%04x]\n",
410 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
411 regs->error_code);
413 return 0;
414 }
416 /*
417 * Called from asm to set up the NMI trapbounce info.
418 * Returns 0 if no callback is set up, else 1.
419 */
420 asmlinkage int set_guest_nmi_trapbounce(void)
421 {
422 struct vcpu *v = current;
423 struct trap_bounce *tb = &v->arch.trap_bounce;
424 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
425 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
426 return !null_trap_bounce(v, tb);
427 }
429 static inline int do_trap(
430 int trapnr, struct cpu_user_regs *regs, int use_error_code)
431 {
432 unsigned long fixup;
434 DEBUGGER_trap_entry(trapnr, regs);
436 if ( guest_mode(regs) )
437 return do_guest_trap(trapnr, regs, use_error_code);
439 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
440 {
441 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
442 trapnr, _p(regs->eip), _p(fixup));
443 regs->eip = fixup;
444 return 0;
445 }
447 DEBUGGER_trap_fatal(trapnr, regs);
449 show_execution_state(regs);
450 panic("FATAL TRAP: vector = %d (%s)\n"
451 "[error_code=%04x]\n",
452 trapnr, trapstr(trapnr), regs->error_code);
453 return 0;
454 }
456 #define DO_ERROR_NOCODE(trapnr, name) \
457 asmlinkage int do_##name(struct cpu_user_regs *regs) \
458 { \
459 return do_trap(trapnr, regs, 0); \
460 }
462 #define DO_ERROR(trapnr, name) \
463 asmlinkage int do_##name(struct cpu_user_regs *regs) \
464 { \
465 return do_trap(trapnr, regs, 1); \
466 }
468 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
469 DO_ERROR_NOCODE(TRAP_overflow, overflow)
470 DO_ERROR_NOCODE(TRAP_bounds, bounds)
471 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
472 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
473 DO_ERROR( TRAP_no_segment, segment_not_present)
474 DO_ERROR( TRAP_stack_error, stack_segment)
475 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
476 DO_ERROR( TRAP_alignment_check, alignment_check)
477 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
479 int rdmsr_hypervisor_regs(
480 uint32_t idx, uint32_t *eax, uint32_t *edx)
481 {
482 idx -= 0x40000000;
483 if ( idx > 0 )
484 return 0;
486 switch ( idx )
487 {
488 case 0:
489 {
490 *eax = *edx = 0;
491 break;
492 }
493 default:
494 BUG();
495 }
497 return 1;
498 }
500 int wrmsr_hypervisor_regs(
501 uint32_t idx, uint32_t eax, uint32_t edx)
502 {
503 struct domain *d = current->domain;
505 idx -= 0x40000000;
506 if ( idx > 0 )
507 return 0;
509 switch ( idx )
510 {
511 case 0:
512 {
513 void *hypercall_page;
514 unsigned long mfn;
515 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
516 unsigned int idx = eax & 0xfff;
518 if ( idx > 0 )
519 {
520 gdprintk(XENLOG_WARNING,
521 "Dom%d: Out of range index %u to MSR %08x\n",
522 d->domain_id, idx, 0x40000000);
523 return 0;
524 }
526 mfn = gmfn_to_mfn(d, gmfn);
528 if ( !mfn_valid(mfn) ||
529 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
530 {
531 gdprintk(XENLOG_WARNING,
532 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
533 d->domain_id, gmfn, mfn, 0x40000000);
534 return 0;
535 }
537 hypercall_page = map_domain_page(mfn);
538 hypercall_page_initialise(d, hypercall_page);
539 unmap_domain_page(hypercall_page);
541 put_page_and_type(mfn_to_page(mfn));
542 break;
543 }
545 default:
546 BUG();
547 }
549 return 1;
550 }
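/*
 * Illustrative guest-side sequence (a sketch, not code from this file): the
 * guest discovers the MSR base (0x40000000) and page count from the CPUID
 * leaves below, then writes the guest frame number of one of its own pages,
 * shifted left by 12, with the low 12 bits selecting the page index:
 *
 *     wrmsr(0x40000000, (gmfn << 12) | 0);
 *
 * wrmsr_hypervisor_regs() above then maps that frame and fills it with
 * hypercall stubs via hypercall_page_initialise().
 */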
552 int cpuid_hypervisor_leaves(
553 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
554 {
555 idx -= 0x40000000;
556 if ( idx > 2 )
557 return 0;
559 switch ( idx )
560 {
561 case 0:
562 *eax = 0x40000002; /* Largest leaf */
563 *ebx = 0x566e6558; /* Signature 1: "XenV" */
564 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
565 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
566 break;
568 case 1:
569 *eax = (xen_major_version() << 16) | xen_minor_version();
570 *ebx = 0; /* Reserved */
571 *ecx = 0; /* Reserved */
572 *edx = 0; /* Reserved */
573 break;
575 case 2:
576 *eax = 1; /* Number of hypercall-transfer pages */
577 *ebx = 0x40000000; /* MSR base address */
578 *ecx = 0; /* Features 1 */
579 *edx = 0; /* Features 2 */
580 break;
582 default:
583 BUG();
584 }
586 return 1;
587 }
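/*
 * Illustrative guest-side detection (a sketch): CPUID with EAX=0x40000000
 * returns the signature "XenVMMXenVMM" in EBX:ECX:EDX and the largest extra
 * leaf (0x40000002) in EAX; leaf 0x40000001 then reports the Xen version as
 * (major << 16) | minor, and leaf 0x40000002 gives the number of hypercall
 * pages and the MSR base consumed by wrmsr_hypervisor_regs() above.
 */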
589 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
590 {
591 char sig[5], instr[2];
592 uint32_t a, b, c, d;
593 unsigned long eip, rc;
595 a = regs->eax;
596 b = regs->ebx;
597 c = regs->ecx;
598 d = regs->edx;
599 eip = regs->eip;
601 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
602 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
603 {
604 propagate_page_fault(eip + sizeof(sig) - rc, 0);
605 return EXCRET_fault_fixed;
606 }
607 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
608 return 0;
609 eip += sizeof(sig);
611 /* We only emulate CPUID. */
612 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
613 {
614 propagate_page_fault(eip + sizeof(instr) - rc, 0);
615 return EXCRET_fault_fixed;
616 }
617 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
618 return 0;
619 eip += sizeof(instr);
621 asm (
622 "cpuid"
623 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
624 : "0" (a), "1" (b), "2" (c), "3" (d) );
626 if ( regs->eax == 1 )
627 {
628 /* Modify Feature Information. */
629 clear_bit(X86_FEATURE_VME, &d);
630 clear_bit(X86_FEATURE_DE, &d);
631 clear_bit(X86_FEATURE_PSE, &d);
632 clear_bit(X86_FEATURE_PGE, &d);
633 if ( !cpu_has_sep )
634 clear_bit(X86_FEATURE_SEP, &d);
635 #ifdef __i386__
636 if ( !supervisor_mode_kernel )
637 clear_bit(X86_FEATURE_SEP, &d);
638 #endif
639 if ( !IS_PRIV(current->domain) )
640 clear_bit(X86_FEATURE_MTRR, &d);
641 }
642 else if ( regs->eax == 0x80000001 )
643 {
644 /* Modify Feature Information. */
645 #ifdef __i386__
646 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
647 #endif
648 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
649 }
650 else
651 {
652 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
653 }
655 regs->eax = a;
656 regs->ebx = b;
657 regs->ecx = c;
658 regs->edx = d;
659 regs->eip = eip;
660 regs->eflags &= ~X86_EFLAGS_RF;
662 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
664 return EXCRET_fault_fixed;
665 }
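/*
 * Illustrative guest-side use (a sketch, not from this file): a PV guest
 * that wants these filtered CPUID results emits the forced-emulation
 * sequence in place of a bare CPUID, e.g.
 *
 *     ud2a ; .ascii "xen" ; cpuid
 *
 * The resulting #UD enters do_invalid_op() below, the signature is matched
 * here, and CPUID is executed with VME/DE/PSE/PGE (and, for unprivileged
 * domains, MTRR) masked out of the feature bits.
 */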
667 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
668 {
669 struct bug_frame bug;
670 struct bug_frame_str bug_str;
671 char *filename, *predicate, *eip = (char *)regs->eip;
672 int rc, id, lineno;
674 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
676 if ( likely(guest_mode(regs)) )
677 {
678 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
679 return rc;
680 return do_guest_trap(TRAP_invalid_op, regs, 0);
681 }
683 if ( !is_kernel(eip) ||
684 __copy_from_user(&bug, eip, sizeof(bug)) ||
685 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
686 (bug.ret != 0xc2) )
687 goto die;
688 eip += sizeof(bug);
690 id = bug.id & 3;
692 if ( id == BUGFRAME_dump )
693 {
694 show_execution_state(regs);
695 regs->eip = (unsigned long)eip;
696 return EXCRET_fault_fixed;
697 }
699 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
700 if ( !is_kernel(eip) ||
701 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
702 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
703 goto die;
704 eip += sizeof(bug_str);
706 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
707 lineno = bug.id >> 2;
709 if ( id == BUGFRAME_warn )
710 {
711 printk("Xen WARN at %.50s:%d\n", filename, lineno);
712 show_execution_state(regs);
713 regs->eip = (unsigned long)eip;
714 return EXCRET_fault_fixed;
715 }
717 if ( id == BUGFRAME_bug )
718 {
719 printk("Xen BUG at %.50s:%d\n", filename, lineno);
720 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
721 show_execution_state(regs);
722 panic("Xen BUG at %.50s:%d\n", filename, lineno);
723 }
725 /* ASSERT: decode the predicate string pointer. */
726 ASSERT(id == BUGFRAME_assert);
727 if ( !is_kernel(eip) ||
728 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
729 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
730 goto die;
731 eip += sizeof(bug_str);
733 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
734 printk("Assertion '%s' failed at %.50s:%d\n",
735 predicate, filename, lineno);
736 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
737 show_execution_state(regs);
738 panic("Assertion '%s' failed at %.50s:%d\n",
739 predicate, filename, lineno);
741 die:
742 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
743 show_execution_state(regs);
744 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
745 return 0;
746 }
748 asmlinkage int do_int3(struct cpu_user_regs *regs)
749 {
750 DEBUGGER_trap_entry(TRAP_int3, regs);
752 if ( !guest_mode(regs) )
753 {
754 DEBUGGER_trap_fatal(TRAP_int3, regs);
755 show_execution_state(regs);
756 panic("FATAL TRAP: vector = 3 (Int3)\n");
757 }
759 return do_guest_trap(TRAP_int3, regs, 0);
760 }
762 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
763 {
764 extern fastcall void (*machine_check_vector)(
765 struct cpu_user_regs *, long error_code);
766 machine_check_vector(regs, regs->error_code);
767 }
769 void propagate_page_fault(unsigned long addr, u16 error_code)
770 {
771 struct trap_info *ti;
772 struct vcpu *v = current;
773 struct trap_bounce *tb = &v->arch.trap_bounce;
775 v->arch.guest_context.ctrlreg[2] = addr;
776 arch_set_cr2(v, addr);
778 /* Re-set error_code.user flag appropriately for the guest. */
779 error_code &= ~PFEC_user_mode;
780 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
781 error_code |= PFEC_user_mode;
783 trace_pv_page_fault(addr, error_code);
785 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
786 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
787 tb->error_code = error_code;
788 tb->cs = ti->cs;
789 tb->eip = ti->address;
790 if ( TI_GET_IF(ti) )
791 tb->flags |= TBF_INTERRUPT;
792 if ( unlikely(null_trap_bounce(v, tb)) )
793 {
794 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
795 v->domain->domain_id, v->vcpu_id, error_code);
796 show_page_walk(addr);
797 }
798 }
800 static int handle_gdt_ldt_mapping_fault(
801 unsigned long offset, struct cpu_user_regs *regs)
802 {
803 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
804 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
805 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
807 /* Should never fault in another vcpu's area. */
808 BUG_ON(vcpu_area != current->vcpu_id);
810 /* Byte offset within the gdt/ldt sub-area. */
811 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
813 if ( likely(is_ldt_area) )
814 {
815 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
816 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
817 {
818 if ( guest_mode(regs) )
819 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
820 regs->eip, offset);
821 }
822 else
823 {
824 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
825 if ( !guest_mode(regs) )
826 return 0;
827 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
828 propagate_page_fault(
829 current->arch.guest_context.ldt_base + offset,
830 regs->error_code);
831 }
832 }
833 else
834 {
835 /* GDT fault: handle the fault as #GP(selector). */
836 regs->error_code = (u16)offset & ~7;
837 (void)do_general_protection(regs);
838 }
840 return EXCRET_fault_fixed;
841 }
843 #ifdef HYPERVISOR_VIRT_END
844 #define IN_HYPERVISOR_RANGE(va) \
845 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
846 #else
847 #define IN_HYPERVISOR_RANGE(va) \
848 (((va) >= HYPERVISOR_VIRT_START))
849 #endif
851 static int __spurious_page_fault(
852 unsigned long addr, struct cpu_user_regs *regs)
853 {
854 unsigned long mfn, cr3 = read_cr3();
855 #if CONFIG_PAGING_LEVELS >= 4
856 l4_pgentry_t l4e, *l4t;
857 #endif
858 #if CONFIG_PAGING_LEVELS >= 3
859 l3_pgentry_t l3e, *l3t;
860 #endif
861 l2_pgentry_t l2e, *l2t;
862 l1_pgentry_t l1e, *l1t;
863 unsigned int required_flags, disallowed_flags;
865 /* Reserved bit violations are never spurious faults. */
866 if ( regs->error_code & PFEC_reserved_bit )
867 return 0;
869 required_flags = _PAGE_PRESENT;
870 if ( regs->error_code & PFEC_write_access )
871 required_flags |= _PAGE_RW;
872 if ( regs->error_code & PFEC_user_mode )
873 required_flags |= _PAGE_USER;
875 disallowed_flags = 0;
876 if ( regs->error_code & PFEC_insn_fetch )
877 disallowed_flags |= _PAGE_NX;
879 mfn = cr3 >> PAGE_SHIFT;
881 #if CONFIG_PAGING_LEVELS >= 4
882 l4t = map_domain_page(mfn);
883 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
884 mfn = l4e_get_pfn(l4e);
885 unmap_domain_page(l4t);
886 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
887 (l4e_get_flags(l4e) & disallowed_flags) )
888 return 0;
889 #endif
891 #if CONFIG_PAGING_LEVELS >= 3
892 l3t = map_domain_page(mfn);
893 #ifdef CONFIG_X86_PAE
894 l3t += (cr3 & 0xFE0UL) >> 3;
895 #endif
896 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
897 mfn = l3e_get_pfn(l3e);
898 unmap_domain_page(l3t);
899 #ifdef CONFIG_X86_PAE
900 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
901 return 0;
902 #else
903 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
904 (l3e_get_flags(l3e) & disallowed_flags) )
905 return 0;
906 #endif
907 #endif
909 l2t = map_domain_page(mfn);
910 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
911 mfn = l2e_get_pfn(l2e);
912 unmap_domain_page(l2t);
913 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
914 (l2e_get_flags(l2e) & disallowed_flags) )
915 return 0;
916 if ( l2e_get_flags(l2e) & _PAGE_PSE )
917 {
918 l1e = l1e_empty(); /* define before use in debug tracing */
919 goto spurious;
920 }
922 l1t = map_domain_page(mfn);
923 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
924 mfn = l1e_get_pfn(l1e);
925 unmap_domain_page(l1t);
926 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
927 (l1e_get_flags(l1e) & disallowed_flags) )
928 return 0;
930 spurious:
931 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
932 "at addr %lx, e/c %04x\n",
933 current->domain->domain_id, current->vcpu_id,
934 addr, regs->error_code);
935 #if CONFIG_PAGING_LEVELS >= 4
936 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
937 #endif
938 #if CONFIG_PAGING_LEVELS >= 3
939 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
940 #endif
941 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
942 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
943 #ifndef NDEBUG
944 show_registers(regs);
945 #endif
946 return 1;
947 }
949 static int spurious_page_fault(
950 unsigned long addr, struct cpu_user_regs *regs)
951 {
952 unsigned long flags;
953 int is_spurious;
955 /*
956 * Disabling interrupts prevents TLB flushing, and hence prevents
957 * page tables from becoming invalid under our feet during the walk.
958 */
959 local_irq_save(flags);
960 is_spurious = __spurious_page_fault(addr, regs);
961 local_irq_restore(flags);
963 return is_spurious;
964 }
966 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
967 {
968 struct vcpu *v = current;
969 struct domain *d = v->domain;
971 /* No fixups in interrupt context or when interrupts are disabled. */
972 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
973 return 0;
975 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
976 {
977 if ( paging_mode_external(d) && guest_mode(regs) )
978 {
979 int ret = paging_fault(addr, regs);
980 if ( ret == EXCRET_fault_fixed )
981 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
982 return ret;
983 }
984 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
985 return handle_gdt_ldt_mapping_fault(
986 addr - GDT_LDT_VIRT_START, regs);
987 return 0;
988 }
990 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
991 guest_kernel_mode(v, regs) &&
992 /* Do not check if access-protection fault since the page may
993 legitimately be not present in shadow page tables */
994 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
995 ptwr_do_page_fault(v, addr, regs) )
996 return EXCRET_fault_fixed;
998 if ( paging_mode_enabled(d) )
999 {
1000 int ret = paging_fault(addr, regs);
1001 if ( ret == EXCRET_fault_fixed )
1002 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1003 return ret;
1004 }
1006 return 0;
1007 }
1009 /*
1010 * #PF error code:
1011 * Bit 0: Protection violation (=1) ; Page not present (=0)
1012 * Bit 1: Write access
1013 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1014 * Bit 3: Reserved bit violation
1015 * Bit 4: Instruction fetch
1016 */
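/*
 * Worked example: an error code of 0x0003 (bits 0 and 1 set) is a
 * supervisor-mode write that hit a protection violation on a present
 * mapping; 0x0004 (bit 2) is a user-mode read of a not-present page.
 */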
1017 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
1018 {
1019 unsigned long addr, fixup;
1020 int rc;
1022 addr = read_cr2();
1024 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1026 perfc_incr(page_faults);
1028 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
1029 return rc;
1031 if ( unlikely(!guest_mode(regs)) )
1032 {
1033 if ( spurious_page_fault(addr, regs) )
1034 return EXCRET_not_a_fault;
1036 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1037 {
1038 perfc_incr(copy_user_faults);
1039 regs->eip = fixup;
1040 return 0;
1041 }
1043 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1045 show_execution_state(regs);
1046 show_page_walk(addr);
1047 panic("FATAL PAGE FAULT\n"
1048 "[error_code=%04x]\n"
1049 "Faulting linear address: %p\n",
1050 regs->error_code, _p(addr));
1051 }
1053 propagate_page_fault(addr, regs->error_code);
1054 return 0;
1055 }
1057 /*
1058 * Early handler to deal with spurious page faults. For example, consider a
1059 * routine that uses a mapping immediately after installing it (making it
1060 * present). The CPU may speculatively execute the memory access before
1061 * executing the PTE write. The instruction will then be marked to cause a
1062 * page fault when it is retired, despite the fact that the PTE is present and
1063 * correct at that point in time.
1064 */
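/*
 * A minimal sketch of the pattern described above (hypothetical code, not
 * taken from this file):
 *
 *     l1e_write(pl1e, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));  <- PTE becomes present
 *     x = *(volatile int *)va;                                <- may have been speculated earlier
 *
 * The speculated access can be flagged to fault even though the mapping is
 * valid by the time the instruction retires; the handler below simply
 * returns EXCRET_not_a_fault so the access is retried.
 */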
1065 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1066 {
1067 static int stuck;
1068 static unsigned long prev_eip, prev_cr2;
1069 unsigned long cr2 = read_cr2();
1071 BUG_ON(smp_processor_id() != 0);
1073 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1074 {
1075 prev_eip = regs->eip;
1076 prev_cr2 = cr2;
1077 stuck = 0;
1078 return EXCRET_not_a_fault;
1079 }
1081 if ( stuck++ == 1000 )
1082 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1083 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1085 return EXCRET_not_a_fault;
1086 }
1088 long do_fpu_taskswitch(int set)
1089 {
1090 struct vcpu *v = current;
1092 if ( set )
1093 {
1094 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1095 stts();
1096 }
1097 else
1098 {
1099 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1100 if ( v->fpu_dirtied )
1101 clts();
1102 }
1104 return 0;
1105 }
1107 static int read_descriptor(unsigned int sel,
1108 const struct vcpu *v,
1109 const struct cpu_user_regs * regs,
1110 unsigned long *base,
1111 unsigned long *limit,
1112 unsigned int *ar,
1113 unsigned int vm86attr)
1115 struct desc_struct desc;
1117 if ( !vm86_mode(regs) )
1119 if ( sel < 4)
1120 desc.b = desc.a = 0;
1121 else if ( __get_user(desc,
1122 (const struct desc_struct *)(!(sel & 4)
1123 ? GDT_VIRT_START(v)
1124 : LDT_VIRT_START(v))
1125 + (sel >> 3)) )
1126 return 0;
1127 if ( !(vm86attr & _SEGMENT_CODE) )
1128 desc.b &= ~_SEGMENT_L;
1130 else
1132 desc.a = (sel << 20) | 0xffff;
1133 desc.b = vm86attr | (sel >> 12);
1136 *ar = desc.b & 0x00f0ff00;
1137 if ( !(desc.b & _SEGMENT_L) )
1139 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1140 (desc.b & 0xff000000));
1141 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1142 if ( desc.b & _SEGMENT_G )
1143 *limit = ((*limit + 1) << 12) - 1;
1144 #ifndef NDEBUG
1145 if ( !vm86_mode(regs) && (sel > 3) )
1147 unsigned int a, l;
1148 unsigned char valid;
1150 asm volatile (
1151 "larl %2,%0 ; setz %1"
1152 : "=r" (a), "=rm" (valid) : "rm" (sel));
1153 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1154 asm volatile (
1155 "lsll %2,%0 ; setz %1"
1156 : "=r" (l), "=rm" (valid) : "rm" (sel));
1157 BUG_ON(valid && (l != *limit));
1159 #endif
1161 else
1163 *base = 0UL;
1164 *limit = ~0UL;
1167 return 1;
1170 #ifdef __x86_64__
1171 static int read_gate_descriptor(unsigned int gate_sel,
1172 const struct vcpu *v,
1173 unsigned int *sel,
1174 unsigned long *off,
1175 unsigned int *ar)
1177 struct desc_struct desc;
1178 const struct desc_struct *pdesc;
1181 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1182 GDT_VIRT_START(v) :
1183 LDT_VIRT_START(v))
1184 + (gate_sel >> 3);
1185 if ( gate_sel < 4 ||
1186 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1187 __get_user(desc, pdesc) )
1188 return 0;
1190 *sel = (desc.a >> 16) & 0x0000fffc;
1191 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1192 *ar = desc.b & 0x0000ffff;
1193 /*
1194 * check_descriptor() clears the DPL field and stores the
1195 * guest requested DPL in the selector's RPL field.
1196 */
1197 ASSERT(!(*ar & _SEGMENT_DPL));
1198 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1200 if ( !is_pv_32bit_vcpu(v) )
1202 if ( (*ar & 0x1f00) != 0x0c00 ||
1203 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1204 __get_user(desc, pdesc + 1) ||
1205 (desc.b & 0x1f00) )
1206 return 0;
1208 *off |= (unsigned long)desc.a << 32;
1209 return 1;
1212 switch ( *ar & 0x1f00 )
1214 case 0x0400:
1215 *off &= 0xffff;
1216 break;
1217 case 0x0c00:
1218 break;
1219 default:
1220 return 0;
1223 return 1;
1225 #endif
1227 /* Has the guest requested sufficient permission for this I/O access? */
1228 static inline int guest_io_okay(
1229 unsigned int port, unsigned int bytes,
1230 struct vcpu *v, struct cpu_user_regs *regs)
1232 #if defined(__x86_64__)
1233 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1234 int user_mode = !(v->arch.flags & TF_kernel_mode);
1235 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1236 #elif defined(__i386__)
1237 #define TOGGLE_MODE() ((void)0)
1238 #endif
1240 if ( !vm86_mode(regs) &&
1241 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1242 return 1;
1244 if ( v->arch.iobmp_limit > (port + bytes) )
1246 union { uint8_t bytes[2]; uint16_t mask; } x;
1248 /*
1249 * Grab permission bytes from guest space. Inaccessible bytes are
1250 * read as 0xff (no access allowed).
1251 */
1252 TOGGLE_MODE();
1253 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1254 port>>3, 2) )
1256 default: x.bytes[0] = ~0;
1257 case 1: x.bytes[1] = ~0;
1258 case 0: break;
1260 TOGGLE_MODE();
1262 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1263 return 1;
1266 return 0;
1269 /* Has the administrator granted sufficient permission for this I/O access? */
1270 static inline int admin_io_okay(
1271 unsigned int port, unsigned int bytes,
1272 struct vcpu *v, struct cpu_user_regs *regs)
1273 {
1274 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1275 }
1277 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1278 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1279 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1280 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1281 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1282 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
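/*
 * Note: the guest_inb/inw/inl_okay() and guest_outb/outw/outl_okay() checks
 * above all reduce to admin_io_okay(), i.e. touching the real port also
 * requires that the administrator granted the domain access to that port
 * range. Otherwise the emulation below returns ~0 for reads, drops writes,
 * or forwards the PIT/speaker ports (0x42, 0x43, 0x61) to pv_pit_handler().
 */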
1284 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1285 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1286 __attribute__((__regparm__(1)));
1287 unsigned long guest_to_host_gpr_switch(unsigned long)
1288 __attribute__((__regparm__(1)));
1290 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1292 /* Instruction fetch with error handling. */
1293 #define insn_fetch(type, base, eip, limit) \
1294 ({ unsigned long _rc, _ptr = (base) + (eip); \
1295 type _x; \
1296 if ( ad_default < 8 ) \
1297 _ptr = (unsigned int)_ptr; \
1298 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1299 goto fail; \
1300 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1301 { \
1302 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1303 return EXCRET_fault_fixed; \
1304 } \
1305 (eip) += sizeof(_x); _x; })
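/*
 * insn_fetch() is a statement expression: it evaluates to the fetched value
 * on success, jumps to the local "fail" label on a segment-limit violation,
 * and returns from the enclosing emulation function (after propagating a
 * page fault to the guest) if the bytes cannot be copied. Typical use, as
 * below:
 *
 *     opcode = insn_fetch(u8, code_base, eip, code_limit);
 */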
1307 #if defined(CONFIG_X86_32)
1308 # define read_sreg(regs, sr) ((regs)->sr)
1309 #elif defined(CONFIG_X86_64)
1310 # define read_sreg(regs, sr) read_segment_register(sr)
1311 #endif
1313 static int emulate_privileged_op(struct cpu_user_regs *regs)
1315 struct vcpu *v = current;
1316 unsigned long *reg, eip = regs->eip, res;
1317 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1318 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1319 unsigned int port, i, data_sel, ar, data, rc;
1320 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1321 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1322 ? regs->reg \
1323 : ad_bytes == 4 \
1324 ? (u32)regs->reg \
1325 : (u16)regs->reg)
1326 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1327 ? regs->reg = (val) \
1328 : ad_bytes == 4 \
1329 ? (*(u32 *)&regs->reg = (val)) \
1330 : (*(u16 *)&regs->reg = (val)))
1331 unsigned long code_base, code_limit;
1332 char io_emul_stub[16];
1333 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1334 u32 l, h, eax, edx;
1336 if ( !read_descriptor(regs->cs, v, regs,
1337 &code_base, &code_limit, &ar,
1338 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1339 goto fail;
1340 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1341 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1342 if ( !(ar & _SEGMENT_S) ||
1343 !(ar & _SEGMENT_P) ||
1344 !(ar & _SEGMENT_CODE) )
1345 goto fail;
1347 /* emulating only opcodes not allowing SS to be default */
1348 data_sel = read_sreg(regs, ds);
1350 /* Legacy prefixes. */
1351 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1353 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1355 case 0x66: /* operand-size override */
1356 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1357 continue;
1358 case 0x67: /* address-size override */
1359 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1360 continue;
1361 case 0x2e: /* CS override */
1362 data_sel = regs->cs;
1363 continue;
1364 case 0x3e: /* DS override */
1365 data_sel = read_sreg(regs, ds);
1366 continue;
1367 case 0x26: /* ES override */
1368 data_sel = read_sreg(regs, es);
1369 continue;
1370 case 0x64: /* FS override */
1371 data_sel = read_sreg(regs, fs);
1372 lm_ovr = lm_seg_fs;
1373 continue;
1374 case 0x65: /* GS override */
1375 data_sel = read_sreg(regs, gs);
1376 lm_ovr = lm_seg_gs;
1377 continue;
1378 case 0x36: /* SS override */
1379 data_sel = regs->ss;
1380 continue;
1381 case 0xf0: /* LOCK */
1382 lock = 1;
1383 continue;
1384 case 0xf2: /* REPNE/REPNZ */
1385 case 0xf3: /* REP/REPE/REPZ */
1386 rep_prefix = 1;
1387 continue;
1388 default:
1389 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1391 rex = opcode;
1392 continue;
1394 break;
1396 break;
1399 /* REX prefix. */
1400 if ( rex & 8 ) /* REX.W */
1401 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1402 modrm_reg = (rex & 4) << 1; /* REX.R */
1403 /* REX.X does not need to be decoded. */
1404 modrm_rm = (rex & 1) << 3; /* REX.B */
1406 if ( opcode == 0x0f )
1407 goto twobyte_opcode;
1409 if ( lock )
1410 goto fail;
1412 /* Input/Output String instructions. */
1413 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1415 unsigned long data_base, data_limit;
1417 if ( rep_prefix && (rd_ad(ecx) == 0) )
1418 goto done;
1420 if ( !(opcode & 2) )
1422 data_sel = read_sreg(regs, es);
1423 lm_ovr = lm_seg_none;
1426 if ( !(ar & _SEGMENT_L) )
1428 if ( !read_descriptor(data_sel, v, regs,
1429 &data_base, &data_limit, &ar,
1430 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1431 goto fail;
1432 if ( !(ar & _SEGMENT_S) ||
1433 !(ar & _SEGMENT_P) ||
1434 (opcode & 2 ?
1435 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1436 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1437 goto fail;
1439 #ifdef CONFIG_X86_64
1440 else
1442 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1444 switch ( lm_ovr )
1446 case lm_seg_none:
1447 data_base = 0UL;
1448 break;
1449 case lm_seg_fs:
1450 data_base = v->arch.guest_context.fs_base;
1451 break;
1452 case lm_seg_gs:
1453 if ( guest_kernel_mode(v, regs) )
1454 data_base = v->arch.guest_context.gs_base_kernel;
1455 else
1456 data_base = v->arch.guest_context.gs_base_user;
1457 break;
1460 else
1461 read_descriptor(data_sel, v, regs,
1462 &data_base, &data_limit, &ar,
1463 0);
1464 data_limit = ~0UL;
1465 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1467 #endif
1469 continue_io_string:
1470 switch ( opcode )
1472 case 0x6c: /* INSB */
1473 op_bytes = 1;
1474 case 0x6d: /* INSW/INSL */
1475 if ( data_limit < op_bytes - 1 ||
1476 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1477 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1478 goto fail;
1479 port = (u16)regs->edx;
1480 switch ( op_bytes )
1482 case 1:
1483 /* emulate PIT counter 2 */
1484 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1485 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1486 pv_pit_handler(port, 0, 0) : ~0));
1487 break;
1488 case 2:
1489 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1490 break;
1491 case 4:
1492 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1493 break;
1495 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1497 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1498 PFEC_write_access);
1499 return EXCRET_fault_fixed;
1501 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1502 break;
1504 case 0x6e: /* OUTSB */
1505 op_bytes = 1;
1506 case 0x6f: /* OUTSW/OUTSL */
1507 if ( data_limit < op_bytes - 1 ||
1508 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1509 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1510 goto fail;
1511 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1512 if ( rc != 0 )
1514 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1515 return EXCRET_fault_fixed;
1517 port = (u16)regs->edx;
1518 switch ( op_bytes )
1520 case 1:
1521 if ( guest_outb_okay(port, v, regs) )
1523 outb((u8)data, port);
1524 if ( pv_post_outb_hook )
1525 pv_post_outb_hook(port, data);
1527 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1528 pv_pit_handler(port, data, 1);
1529 break;
1530 case 2:
1531 if ( guest_outw_okay(port, v, regs) )
1532 outw((u16)data, port);
1533 break;
1534 case 4:
1535 if ( guest_outl_okay(port, v, regs) )
1536 outl((u32)data, port);
1537 break;
1539 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1540 break;
1543 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1545 if ( !hypercall_preempt_check() )
1546 goto continue_io_string;
1547 eip = regs->eip;
1550 goto done;
1553 /*
1554 * Very likely to be an I/O instruction (IN/OUT).
1555 * Build an on-stack stub to execute the instruction with full guest
1556 * GPR context. This is needed for some systems which (ab)use IN/OUT
1557 * to communicate with BIOS code in system-management mode.
1558 */
1559 #ifdef __x86_64__
1560 /* movq $host_to_guest_gpr_switch,%rcx */
1561 io_emul_stub[0] = 0x48;
1562 io_emul_stub[1] = 0xb9;
1563 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1564 /* callq *%rcx */
1565 io_emul_stub[10] = 0xff;
1566 io_emul_stub[11] = 0xd1;
1567 #else
1568 /* call host_to_guest_gpr_switch */
1569 io_emul_stub[0] = 0xe8;
1570 *(s32 *)&io_emul_stub[1] =
1571 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1572 /* 7 x nop */
1573 memset(&io_emul_stub[5], 0x90, 7);
1574 #endif
1575 /* data16 or nop */
1576 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1577 /* <io-access opcode> */
1578 io_emul_stub[13] = opcode;
1579 /* imm8 or nop */
1580 io_emul_stub[14] = 0x90;
1581 /* ret (jumps to guest_to_host_gpr_switch) */
1582 io_emul_stub[15] = 0xc3;
1584 /* Handy function-typed pointer to the stub. */
1585 io_emul = (void *)io_emul_stub;
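/*
 * Rough shape of the resulting 16-byte stub (x86-64 layout shown; byte 12 is
 * a NOP unless a 16-bit operand size is needed, and the imm8 slot is only
 * patched in for the IN/OUT imm8 forms):
 *
 *     movabs $host_to_guest_gpr_switch, %rcx
 *     call   *%rcx
 *     data16 (or nop)
 *     <in/out opcode byte>
 *     <imm8 port> (or nop)
 *     ret                      ; jumps to guest_to_host_gpr_switch
 */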
1587 /* I/O Port and Interrupt Flag instructions. */
1588 switch ( opcode )
1590 case 0xe4: /* IN imm8,%al */
1591 op_bytes = 1;
1592 case 0xe5: /* IN imm8,%eax */
1593 port = insn_fetch(u8, code_base, eip, code_limit);
1594 io_emul_stub[14] = port; /* imm8 */
1595 exec_in:
1596 if ( !guest_io_okay(port, op_bytes, v, regs) )
1597 goto fail;
1598 switch ( op_bytes )
1600 case 1:
1601 if ( guest_inb_okay(port, v, regs) )
1602 io_emul(regs);
1603 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1605 regs->eax &= ~0xffUL;
1606 regs->eax |= pv_pit_handler(port, 0, 0);
1608 else
1609 regs->eax |= (u8)~0;
1610 break;
1611 case 2:
1612 if ( guest_inw_okay(port, v, regs) )
1613 io_emul(regs);
1614 else
1615 regs->eax |= (u16)~0;
1616 break;
1617 case 4:
1618 if ( guest_inl_okay(port, v, regs) )
1619 io_emul(regs);
1620 else
1621 regs->eax = (u32)~0;
1622 break;
1624 goto done;
1626 case 0xec: /* IN %dx,%al */
1627 op_bytes = 1;
1628 case 0xed: /* IN %dx,%eax */
1629 port = (u16)regs->edx;
1630 goto exec_in;
1632 case 0xe6: /* OUT %al,imm8 */
1633 op_bytes = 1;
1634 case 0xe7: /* OUT %eax,imm8 */
1635 port = insn_fetch(u8, code_base, eip, code_limit);
1636 io_emul_stub[14] = port; /* imm8 */
1637 exec_out:
1638 if ( !guest_io_okay(port, op_bytes, v, regs) )
1639 goto fail;
1640 switch ( op_bytes )
1642 case 1:
1643 if ( guest_outb_okay(port, v, regs) )
1645 io_emul(regs);
1646 if ( pv_post_outb_hook )
1647 pv_post_outb_hook(port, regs->eax);
1649 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1650 pv_pit_handler(port, regs->eax, 1);
1651 break;
1652 case 2:
1653 if ( guest_outw_okay(port, v, regs) )
1654 io_emul(regs);
1655 break;
1656 case 4:
1657 if ( guest_outl_okay(port, v, regs) )
1658 io_emul(regs);
1659 break;
1661 goto done;
1663 case 0xee: /* OUT %al,%dx */
1664 op_bytes = 1;
1665 case 0xef: /* OUT %eax,%dx */
1666 port = (u16)regs->edx;
1667 goto exec_out;
1669 case 0xfa: /* CLI */
1670 case 0xfb: /* STI */
1671 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1672 goto fail;
1673 /*
1674 * This is just too dangerous to allow, in my opinion. Consider if the
1675 * caller then tries to reenable interrupts using POPF: we can't trap
1676 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1677 * do for us. :-)
1678 */
1679 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1680 goto done;
1683 /* No decode of this single-byte opcode. */
1684 goto fail;
1686 twobyte_opcode:
1687 /* Two-byte opcodes only emulated from guest kernel. */
1688 if ( !guest_kernel_mode(v, regs) )
1689 goto fail;
1691 /* Privileged (ring 0) instructions. */
1692 opcode = insn_fetch(u8, code_base, eip, code_limit);
1693 if ( lock && (opcode & ~3) != 0x20 )
1694 goto fail;
1695 switch ( opcode )
1697 case 0x06: /* CLTS */
1698 (void)do_fpu_taskswitch(0);
1699 break;
1701 case 0x09: /* WBINVD */
1702 /* Ignore the instruction if unprivileged. */
1703 if ( !cache_flush_permitted(v->domain) )
1704 /* Non-physdev domain attempted WBINVD; ignore for now since
1705 newer linux uses this in some start-of-day timing loops */
1707 else
1708 wbinvd();
1709 break;
1711 case 0x20: /* MOV CR?,<reg> */
1712 opcode = insn_fetch(u8, code_base, eip, code_limit);
1713 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1714 modrm_rm |= (opcode >> 0) & 7;
1715 reg = decode_register(modrm_rm, regs, 0);
1716 switch ( modrm_reg )
1718 case 0: /* Read CR0 */
1719 *reg = (read_cr0() & ~X86_CR0_TS) |
1720 v->arch.guest_context.ctrlreg[0];
1721 break;
1723 case 2: /* Read CR2 */
1724 *reg = v->arch.guest_context.ctrlreg[2];
1725 break;
1727 case 3: /* Read CR3 */
1728 if ( !is_pv_32on64_vcpu(v) )
1729 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1730 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1731 #ifdef CONFIG_COMPAT
1732 else
1733 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1734 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1735 #endif
1736 break;
1738 case 4: /* Read CR4 */
1739 /*
1740 * Guests can read CR4 to see what features Xen has enabled. We
1741 * therefore lie about PGE & PSE as they are unavailable to guests.
1742 */
1743 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1744 break;
1746 default:
1747 goto fail;
1749 break;
1751 case 0x21: /* MOV DR?,<reg> */
1752 opcode = insn_fetch(u8, code_base, eip, code_limit);
1753 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1754 modrm_rm |= (opcode >> 0) & 7;
1755 reg = decode_register(modrm_rm, regs, 0);
1756 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1757 goto fail;
1758 *reg = res;
1759 break;
1761 case 0x22: /* MOV <reg>,CR? */
1762 opcode = insn_fetch(u8, code_base, eip, code_limit);
1763 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1764 modrm_rm |= (opcode >> 0) & 7;
1765 reg = decode_register(modrm_rm, regs, 0);
1766 switch ( modrm_reg )
1768 case 0: /* Write CR0 */
1769 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1771 gdprintk(XENLOG_WARNING,
1772 "Attempt to change unmodifiable CR0 flags.\n");
1773 goto fail;
1775 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1776 break;
1778 case 2: /* Write CR2 */
1779 v->arch.guest_context.ctrlreg[2] = *reg;
1780 arch_set_cr2(v, *reg);
1781 break;
1783 case 3: /* Write CR3 */
1784 LOCK_BIGLOCK(v->domain);
1785 if ( !is_pv_32on64_vcpu(v) )
1786 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1787 #ifdef CONFIG_COMPAT
1788 else
1789 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1790 #endif
1791 UNLOCK_BIGLOCK(v->domain);
1792 if ( rc == 0 ) /* not okay */
1793 goto fail;
1794 break;
1796 case 4: /* Write CR4 */
1797 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1798 gdprintk(XENLOG_WARNING,
1799 "Attempt to change CR4 flags %08lx -> %08lx\n",
1800 read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE), *reg);
1801 break;
1803 default:
1804 goto fail;
1806 break;
1808 case 0x23: /* MOV <reg>,DR? */
1809 opcode = insn_fetch(u8, code_base, eip, code_limit);
1810 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1811 modrm_rm |= (opcode >> 0) & 7;
1812 reg = decode_register(modrm_rm, regs, 0);
1813 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1814 goto fail;
1815 break;
1817 case 0x30: /* WRMSR */
1818 eax = regs->eax;
1819 edx = regs->edx;
1820 res = ((u64)edx << 32) | eax;
1821 switch ( regs->ecx )
1823 #ifdef CONFIG_X86_64
1824 case MSR_FS_BASE:
1825 if ( is_pv_32on64_vcpu(v) )
1826 goto fail;
1827 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1828 goto fail;
1829 v->arch.guest_context.fs_base = res;
1830 break;
1831 case MSR_GS_BASE:
1832 if ( is_pv_32on64_vcpu(v) )
1833 goto fail;
1834 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1835 goto fail;
1836 v->arch.guest_context.gs_base_kernel = res;
1837 break;
1838 case MSR_SHADOW_GS_BASE:
1839 if ( is_pv_32on64_vcpu(v) )
1840 goto fail;
1841 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1842 goto fail;
1843 v->arch.guest_context.gs_base_user = res;
1844 break;
1845 #endif
1846 case MSR_K7_FID_VID_STATUS:
1847 case MSR_K7_FID_VID_CTL:
1848 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1849 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1850 wrmsr_safe(regs->ecx, eax, edx) )
1851 goto fail;
1852 break;
1853 case MSR_IA32_PERF_CTL:
1854 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1855 (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
1856 wrmsr_safe(regs->ecx, eax, edx) )
1857 goto fail;
1858 break;
1859 default:
1860 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1861 break;
1862 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1863 (eax != l) || (edx != h) )
1864 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1865 "%08x:%08x to %08x:%08x.\n",
1866 _p(regs->ecx), h, l, edx, eax);
1867 break;
1869 break;
1871 case 0x32: /* RDMSR */
1872 switch ( regs->ecx )
1874 #ifdef CONFIG_X86_64
1875 case MSR_FS_BASE:
1876 if ( is_pv_32on64_vcpu(v) )
1877 goto fail;
1878 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1879 regs->edx = v->arch.guest_context.fs_base >> 32;
1880 break;
1881 case MSR_GS_BASE:
1882 if ( is_pv_32on64_vcpu(v) )
1883 goto fail;
1884 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1885 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1886 break;
1887 case MSR_SHADOW_GS_BASE:
1888 if ( is_pv_32on64_vcpu(v) )
1889 goto fail;
1890 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1891 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1892 break;
1893 #endif
1894 case MSR_K7_FID_VID_CTL:
1895 case MSR_K7_FID_VID_STATUS:
1896 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1897 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1898 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1899 goto fail;
1900 break;
1901 case MSR_EFER:
1902 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1903 goto fail;
1904 break;
1905 default:
1906 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1908 regs->eax = l;
1909 regs->edx = h;
1910 break;
1912 /* Everyone can read the MSR space. */
1913 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1914 _p(regs->ecx));*/
1915 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1916 goto fail;
1917 break;
1919 break;
1921 default:
1922 goto fail;
1925 #undef wr_ad
1926 #undef rd_ad
1928 done:
1929 regs->eip = eip;
1930 regs->eflags &= ~X86_EFLAGS_RF;
1931 return EXCRET_fault_fixed;
1933 fail:
1934 return 0;
1935 }
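/*
 * Helper for the call-gate emulation below: "decr" bytes are about to be
 * pushed at offsets (esp - decr) .. (esp - 1). For a normal expand-up stack
 * segment the highest touched offset must lie within the limit; for an
 * expand-down segment (_SEGMENT_EC set) valid offsets lie strictly above the
 * limit, so the lowest touched offset must exceed it. The first clause
 * guards against esp/decr combinations that wrap.
 */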
1937 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
1938 unsigned int esp, unsigned int decr)
1939 {
1940 return (((esp - decr) < (esp - 1)) &&
1941 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
1942 }
1944 static int emulate_gate_op(struct cpu_user_regs *regs)
1946 #ifdef __x86_64__
1947 struct vcpu *v = current;
1948 unsigned int sel, ar, dpl, nparm, opnd_sel;
1949 unsigned int op_default, op_bytes, ad_default, ad_bytes;
1950 unsigned long off, eip, opnd_off, base, limit;
1951 int jump;
1953 /* Check whether this fault is due to the use of a call gate. */
1954 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
1955 ((ar >> 13) & 3) < (regs->cs & 3) ||
1956 (ar & _SEGMENT_TYPE) != 0xc00 )
1957 return do_guest_trap(TRAP_gp_fault, regs, 1);
1958 if ( !(ar & _SEGMENT_P) )
1959 return do_guest_trap(TRAP_no_segment, regs, 1);
1960 dpl = (ar >> 13) & 3;
1961 nparm = ar & 0x1f;
1963 /*
1964 * Decode instruction (and perhaps operand) to determine RPL,
1965 * whether this is a jump or a call, and the call return offset.
1966 */
1967 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
1968 !(ar & _SEGMENT_S) ||
1969 !(ar & _SEGMENT_P) ||
1970 !(ar & _SEGMENT_CODE) )
1971 return do_guest_trap(TRAP_gp_fault, regs, 1);
1973 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
1974 ad_default = ad_bytes = op_default;
1975 opnd_sel = opnd_off = 0;
1976 jump = -1;
1977 for ( eip = regs->eip; eip - regs->_eip < 10; )
1979 switch ( insn_fetch(u8, base, eip, limit) )
1981 case 0x66: /* operand-size override */
1982 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1983 continue;
1984 case 0x67: /* address-size override */
1985 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1986 continue;
1987 case 0x2e: /* CS override */
1988 opnd_sel = regs->cs;
1989 ASSERT(opnd_sel);
1990 continue;
1991 case 0x3e: /* DS override */
1992 opnd_sel = read_sreg(regs, ds);
1993 if ( !opnd_sel )
1994 opnd_sel = dpl;
1995 continue;
1996 case 0x26: /* ES override */
1997 opnd_sel = read_sreg(regs, es);
1998 if ( !opnd_sel )
1999 opnd_sel = dpl;
2000 continue;
2001 case 0x64: /* FS override */
2002 opnd_sel = read_sreg(regs, fs);
2003 if ( !opnd_sel )
2004 opnd_sel = dpl;
2005 continue;
2006 case 0x65: /* GS override */
2007 opnd_sel = read_sreg(regs, gs);
2008 if ( !opnd_sel )
2009 opnd_sel = dpl;
2010 continue;
2011 case 0x36: /* SS override */
2012 opnd_sel = regs->ss;
2013 if ( !opnd_sel )
2014 opnd_sel = dpl;
2015 continue;
2016 case 0xea:
2017 ++jump;
2018 /* FALLTHROUGH */
2019 case 0x9a:
2020 ++jump;
2021 opnd_sel = regs->cs;
2022 opnd_off = eip;
2023 ad_bytes = ad_default;
2024 eip += op_bytes + 2;
2025 break;
2026 case 0xff:
2028 unsigned int modrm;
2030 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2032 case 0x28: case 0x68: case 0xa8:
2033 ++jump;
2034 /* FALLTHROUGH */
2035 case 0x18: case 0x58: case 0x98:
2036 ++jump;
2037 if ( ad_bytes != 2 )
2039 if ( (modrm & 7) == 4 )
2041 unsigned int sib = insn_fetch(u8, base, eip, limit);
2043 modrm = (modrm & ~7) | (sib & 7);
2044 if ( (sib >>= 3) != 4 )
2045 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2046 opnd_off <<= sib >> 3;
2048 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2049 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2050 else
2051 modrm |= 0x87;
2052 if ( !opnd_sel )
2054 switch ( modrm & 7 )
2056 default:
2057 opnd_sel = read_sreg(regs, ds);
2058 break;
2059 case 4: case 5:
2060 opnd_sel = regs->ss;
2061 break;
2065 else
2067 switch ( modrm & 7 )
2069 case 0: case 1: case 7:
2070 opnd_off = regs->ebx;
2071 break;
2072 case 6:
2073 if ( !(modrm & 0xc0) )
2074 modrm |= 0x80;
2075 else
2076 case 2: case 3:
2078 opnd_off = regs->ebp;
2079 if ( !opnd_sel )
2080 opnd_sel = regs->ss;
2082 break;
2084 if ( !opnd_sel )
2085 opnd_sel = read_sreg(regs, ds);
2086 switch ( modrm & 7 )
2088 case 0: case 2: case 4:
2089 opnd_off += regs->esi;
2090 break;
2091 case 1: case 3: case 5:
2092 opnd_off += regs->edi;
2093 break;
2096 switch ( modrm & 0xc0 )
2098 case 0x40:
2099 opnd_off += insn_fetch(s8, base, eip, limit);
2100 break;
2101 case 0x80:
2102 opnd_off += insn_fetch(s32, base, eip, limit);
2103 break;
2105 if ( ad_bytes == 4 )
2106 opnd_off = (unsigned int)opnd_off;
2107 else if ( ad_bytes == 2 )
2108 opnd_off = (unsigned short)opnd_off;
2109 break;
2112 break;
2114 break;
2117 if ( jump < 0 )
2119 fail:
2120 return do_guest_trap(TRAP_gp_fault, regs, 1);
2123 if ( (opnd_sel != regs->cs &&
2124 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2125 !(ar & _SEGMENT_S) ||
2126 !(ar & _SEGMENT_P) ||
2127 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2128 return do_guest_trap(TRAP_gp_fault, regs, 1);
2130 opnd_off += op_bytes;
2131 #define ad_default ad_bytes
2132 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2133 #undef ad_default
2134 ASSERT((opnd_sel & ~3) == regs->error_code);
2135 if ( dpl < (opnd_sel & 3) )
2136 return do_guest_trap(TRAP_gp_fault, regs, 1);
2138 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2139 !(ar & _SEGMENT_S) ||
2140 !(ar & _SEGMENT_CODE) ||
2141 (!jump || (ar & _SEGMENT_EC) ?
2142 ((ar >> 13) & 3) > (regs->cs & 3) :
2143 ((ar >> 13) & 3) != (regs->cs & 3)) )
2145 regs->error_code = sel;
2146 return do_guest_trap(TRAP_gp_fault, regs, 1);
2148 if ( !(ar & _SEGMENT_P) )
2150 regs->error_code = sel;
2151 return do_guest_trap(TRAP_no_segment, regs, 1);
2153 if ( off > limit )
2155 regs->error_code = 0;
2156 return do_guest_trap(TRAP_gp_fault, regs, 1);
2159 if ( !jump )
2161 unsigned int ss, esp, *stkp;
2162 int rc;
2163 #define push(item) do \
2164 { \
2165 --stkp; \
2166 esp -= 4; \
2167 rc = __put_user(item, stkp); \
2168 if ( rc ) \
2169 { \
2170 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2171 PFEC_write_access); \
2172 return 0; \
2173 } \
2174 } while ( 0 )
2176 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2178 sel |= (ar >> 13) & 3;
2179 /* Inner stack known only for kernel ring. */
2180 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2181 return do_guest_trap(TRAP_gp_fault, regs, 1);
2182 esp = v->arch.guest_context.kernel_sp;
2183 ss = v->arch.guest_context.kernel_ss;
2184 if ( (ss & 3) != (sel & 3) ||
2185 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2186 ((ar >> 13) & 3) != (sel & 3) ||
2187 !(ar & _SEGMENT_S) ||
2188 (ar & _SEGMENT_CODE) ||
2189 !(ar & _SEGMENT_WR) )
2191 regs->error_code = ss & ~3;
2192 return do_guest_trap(TRAP_invalid_tss, regs, 1);
2194 if ( !(ar & _SEGMENT_P) ||
2195 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2197 regs->error_code = ss & ~3;
2198 return do_guest_trap(TRAP_stack_error, regs, 1);
2200 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2201 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2202 return do_guest_trap(TRAP_gp_fault, regs, 1);
2203 push(regs->ss);
2204 push(regs->esp);
2205 if ( nparm )
2207 const unsigned int *ustkp;
2209 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2210 ((ar >> 13) & 3) != (regs->cs & 3) ||
2211 !(ar & _SEGMENT_S) ||
2212 (ar & _SEGMENT_CODE) ||
2213 !(ar & _SEGMENT_WR) ||
2214 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2215 return do_guest_trap(TRAP_gp_fault, regs, 1);
2216 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2217 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2218 return do_guest_trap(TRAP_gp_fault, regs, 1);
2219 do
2221 unsigned int parm;
2223 --ustkp;
2224 rc = __get_user(parm, ustkp);
2225 if ( rc )
2227 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2228 return 0;
2230 push(parm);
2231 } while ( --nparm );
2234 else
2236 sel |= (regs->cs & 3);
2237 esp = regs->esp;
2238 ss = regs->ss;
2239 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2240 ((ar >> 13) & 3) != (sel & 3) )
2241 return do_guest_trap(TRAP_gp_fault, regs, 1);
2242 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2244 regs->error_code = 0;
2245 return do_guest_trap(TRAP_stack_error, regs, 1);
2247 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2248 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2249 return do_guest_trap(TRAP_gp_fault, regs, 1);
2251 push(regs->cs);
2252 push(eip);
2253 #undef push
2254 regs->esp = esp;
2255 regs->ss = ss;
2257 else
2258 sel |= (regs->cs & 3);
2260 regs->eip = off;
2261 regs->cs = sel;
2262 #endif
2264 return 0;
2265 }
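The gate and segment tests above all key off the packed access-rights word filled in by read_gate_descriptor() and read_descriptor(): the descriptor type sits in bits 8-11 (0xc00 being a 32-bit call gate), S in bit 12, the DPL in bits 13-14, P in bit 15 and, for call gates, the parameter count in the low five bits (nparm = ar & 0x1f). A self-contained sketch of that decoding follows; the mask names and exact bit positions are assumptions based on the conventional x86 descriptor layout, not copied from asm/desc.h.

/* Illustrative only, not part of traps.c: decoding the access-rights word
 * used by emulate_gate_op().  Bit positions follow the x86 descriptor
 * layout; the names are local to this sketch. */
#include <stdio.h>

#define SEG_TYPE_MASK  (0xfu << 8)   /* descriptor type nibble              */
#define SEG_S          (1u << 12)    /* 0 = system descriptor (e.g. a gate) */
#define SEG_DPL_SHIFT  13            /* two-bit descriptor privilege level  */
#define SEG_P          (1u << 15)    /* present                             */

static void decode_ar(unsigned int ar)
{
    printf("present=%u system=%u dpl=%u type=%#x params=%u\n",
           !!(ar & SEG_P), !(ar & SEG_S),
           (ar >> SEG_DPL_SHIFT) & 3,
           (ar & SEG_TYPE_MASK) >> 8, ar & 0x1f);
}

int main(void)
{
    /* A present, DPL-3, 32-bit call gate with one parameter: the type
     * nibble is 0xC, so (ar & type mask) == 0xc00, the value the checks
     * above insist on. */
    decode_ar(0xec01);
    return 0;
}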
2267 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
2269 struct vcpu *v = current;
2270 unsigned long fixup;
2272 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2274 if ( regs->error_code & 1 )
2275 goto hardware_gp;
2277 if ( !guest_mode(regs) )
2278 goto gp_in_kernel;
2280 /*
2281 * Cunning trick to allow arbitrary "INT n" handling.
2283 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2284 * instruction from trapping to the appropriate vector, when that might not
2285 * be expected by Xen or the guest OS. For example, that entry might be for
2286 * a fault handler (unlike traps, faults don't increment EIP), or might
2287 * expect an error code on the stack (which a software trap never
2288 * provides), or might be a hardware interrupt handler that doesn't like
2289 * being called spuriously.
2291 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2292 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2293 * clear to indicate that it's a software fault, not hardware.
2295 * NOTE: Vectors 3 and 4 are dealt with by their own handlers. This is
2296 * okay because they can only be triggered by an explicit DPL-checked
2297 * instruction. The DPL specified by the guest OS for these vectors is NOT
2298 * CHECKED!!
2299 */
2300 if ( (regs->error_code & 3) == 2 )
2302 /* This fault must be due to <INT n> instruction. */
2303 const struct trap_info *ti;
2304 unsigned char vector = regs->error_code >> 3;
2305 ti = &v->arch.guest_context.trap_ctxt[vector];
2306 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2308 regs->eip += 2;
2309 return do_guest_trap(vector, regs, 0);
2312 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2313 return emulate_gate_op(regs);
2315 /* Emulate some simple privileged and I/O instructions. */
2316 if ( (regs->error_code == 0) &&
2317 emulate_privileged_op(regs) )
2319 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2320 return 0;
2323 #if defined(__i386__)
2324 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2325 (regs->error_code == 0) &&
2326 gpf_emulate_4gb(regs) )
2328 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2329 return 0;
2331 #endif
2333 /* Pass on GPF as is. */
2334 return do_guest_trap(TRAP_gp_fault, regs, 1);
2336 gp_in_kernel:
2338 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2340 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2341 regs->error_code, _p(regs->eip), _p(fixup));
2342 regs->eip = fixup;
2343 return 0;
2346 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2348 hardware_gp:
2349 show_execution_state(regs);
2350 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2351 return 0;
2352 }
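The INT-n path above depends on the architectural encoding of the selector-style error code pushed for an IDT-referencing #GP: bits 3 and up carry the faulting vector, bit 1 (IDT) is set, and bit 0 (EXT) is clear for a software-originated fault, while regs->eip is advanced by 2 because "INT imm8" is a two-byte instruction. A tiny worked example of that decoding:

/* Illustrative only: picking apart the error code for a software INT n
 * that was refused by a DPL-0 IDT entry. */
#include <assert.h>

int main(void)
{
    unsigned int error_code = (0x80 << 3) | 2;  /* INT 0x80 blocked by the IDT */

    assert((error_code & 3) == 2);     /* IDT bit set, EXT bit clear     */
    assert((error_code >> 3) == 0x80); /* the vector the guest asked for */
    return 0;
}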
2354 static void nmi_softirq(void)
2355 {
2356 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2357 vcpu_kick(dom0->vcpu[0]);
2358 }
2360 static void nmi_dom0_report(unsigned int reason_idx)
2361 {
2362 struct domain *d;
2363 struct vcpu *v;
2365 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2366 return;
2368 set_bit(reason_idx, nmi_reason(d));
2370 if ( !test_and_set_bool(v->nmi_pending) )
2371 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2372 }
2374 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2375 {
2376 switch ( opt_nmi[0] )
2377 {
2378 case 'd': /* 'dom0' */
2379 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2380 case 'i': /* 'ignore' */
2381 break;
2382 default: /* 'fatal' */
2383 console_force_unlock();
2384 printk("\n\nNMI - MEMORY ERROR\n");
2385 fatal_trap(TRAP_nmi, regs);
2386 }
2388 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2389 mdelay(1);
2390 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2391 }
2393 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2394 {
2395 switch ( opt_nmi[0] )
2396 {
2397 case 'd': /* 'dom0' */
2398 nmi_dom0_report(_XEN_NMIREASON_io_error);
2399 case 'i': /* 'ignore' */
2400 break;
2401 default: /* 'fatal' */
2402 console_force_unlock();
2403 printk("\n\nNMI - I/O ERROR\n");
2404 fatal_trap(TRAP_nmi, regs);
2405 }
2407 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2408 mdelay(1);
2409 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2410 }
2412 static void unknown_nmi_error(unsigned char reason)
2413 {
2414 switch ( opt_nmi[0] )
2415 {
2416 case 'd': /* 'dom0' */
2417 nmi_dom0_report(_XEN_NMIREASON_unknown);
2418 case 'i': /* 'ignore' */
2419 break;
2420 default: /* 'fatal' */
2421 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2422 printk("Dazed and confused, but trying to continue\n");
2423 printk("Do you have a strange power saving mode enabled?\n");
2424 kexec_crash();
2425 }
2426 }
2428 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2429 {
2430 return 0;
2431 }
2433 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2435 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2436 {
2437 unsigned int cpu = smp_processor_id();
2438 unsigned char reason;
2440 ++nmi_count(cpu);
2442 if ( nmi_callback(regs, cpu) )
2443 return;
2445 if ( nmi_watchdog )
2446 nmi_watchdog_tick(regs);
2448 /* Only the BSP gets external NMIs from the system. */
2449 if ( cpu == 0 )
2450 {
2451 reason = inb(0x61);
2452 if ( reason & 0x80 )
2453 mem_parity_error(regs);
2454 else if ( reason & 0x40 )
2455 io_check_error(regs);
2456 else if ( !nmi_watchdog )
2457 unknown_nmi_error((unsigned char)(reason&0xff));
2458 }
2459 }
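The reason byte that do_nmi() reads from port 0x61 (the PC/AT system control port B) is what selects between the handlers above: bit 7 latches a memory parity/SERR error and bit 6 an I/O channel check, while writing bits 2 and 3 clears and masks those sources, which is what the outb() sequences in mem_parity_error() and io_check_error() do. A small sketch of those bit assignments; the constant names are local to the sketch, not Xen's:

/* Illustrative only: the port 0x61 bits used by the NMI paths above. */
#include <stdio.h>

#define NMI_PARITY_BIT   0x80   /* read: memory parity / SERR# latched */
#define NMI_IOCHK_BIT    0x40   /* read: I/O channel check latched     */
#define NMI_IOCHK_CLEAR  0x08   /* write: clear and mask IOCHK         */
#define NMI_PARITY_CLEAR 0x04   /* write: clear and mask parity check  */

static const char *classify_nmi(unsigned char reason)
{
    if ( reason & NMI_PARITY_BIT )
        return "memory parity error";
    if ( reason & NMI_IOCHK_BIT )
        return "I/O check error";
    return "unknown";
}

int main(void)
{
    printf("%s\n", classify_nmi(0x80));  /* prints "memory parity error" */
    return 0;
}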
2461 void set_nmi_callback(nmi_callback_t callback)
2462 {
2463 nmi_callback = callback;
2464 }
2466 void unset_nmi_callback(void)
2467 {
2468 nmi_callback = dummy_nmi_callback;
2469 }
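A handler installed with set_nmi_callback() sees every NMI before the watchdog and the reason-port logic, and a nonzero return makes do_nmi() stop right there. A minimal sketch of a caller follows; the handler and init/exit names are hypothetical and the example is not taken from Xen:

/* Illustrative only: the shape of a set_nmi_callback() user inside Xen.
 * my_nmi_handler, my_init and my_exit are hypothetical names. */
static int my_nmi_handler(struct cpu_user_regs *regs, int cpu)
{
    /* ... sample the interrupted context, bump a counter, etc. ... */
    return 1;  /* nonzero: do_nmi() skips its default processing */
}

static void my_init(void)
{
    set_nmi_callback(my_nmi_handler);
}

static void my_exit(void)
{
    unset_nmi_callback();
}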
2471 asmlinkage int do_device_not_available(struct cpu_user_regs *regs)
2472 {
2473 BUG_ON(!guest_mode(regs));
2475 setup_fpu(current);
2477 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2478 {
2479 do_guest_trap(TRAP_no_device, regs, 0);
2480 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2481 }
2482 else
2483 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2485 return EXCRET_fault_fixed;
2486 }
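Whether the #NM above is reflected to the guest depends on the guest's virtualised CR0.TS, which a PV kernel toggles with the fpu_taskswitch hypercall instead of writing %cr0. A minimal guest-side sketch, assuming the standard HYPERVISOR_fpu_taskswitch(int set) wrapper; the helper names are hypothetical:

/* Illustrative guest-side sketch, not Xen code: a PV kernel doing lazy FPU
 * switching flips its virtual CR0.TS with the fpu_taskswitch hypercall.
 * The pv_fpu_* helper names are hypothetical. */
static void pv_fpu_disable(void)          /* the PV equivalent of stts() */
{
    HYPERVISOR_fpu_taskswitch(1);         /* set virtual CR0.TS   */
}

static void pv_fpu_enable(void)           /* the PV equivalent of clts() */
{
    HYPERVISOR_fpu_taskswitch(0);         /* clear virtual CR0.TS */
}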
2488 asmlinkage int do_debug(struct cpu_user_regs *regs)
2490 unsigned long condition;
2491 struct vcpu *v = current;
2493 asm volatile ( "mov %%db6,%0" : "=r" (condition) );
2495 /* Mask out spurious debug traps due to lazy DR7 setting */
2496 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
2497 (v->arch.guest_context.debugreg[7] == 0) )
2499 asm volatile ( "mov %0,%%db7" : : "r" (0UL) );
2500 goto out;
2503 DEBUGGER_trap_entry(TRAP_debug, regs);
2505 if ( !guest_mode(regs) )
2507 #ifdef __x86_64__
2508 void sysenter_entry(void);
2509 void sysenter_eflags_saved(void);
2510 /* In SYSENTER entry path we cannot zap TF until EFLAGS is saved. */
2511 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2512 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2513 goto out;
2514 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2515 #else
2516 WARN_ON(1);
2517 #endif
2518 /* Clear TF just for absolute sanity. */
2519 regs->eflags &= ~EF_TF;
2520 /*
2521 * We ignore watchpoints when they trigger within Xen. This may happen
2522 * when a buffer is passed to us which previously had a watchpoint set
2523 * on it. No need to bump EIP; the only faulting trap is an instruction
2524 * breakpoint, which can't happen to us.
2525 */
2526 goto out;
2529 /* Save debug status register where guest OS can peek at it */
2530 v->arch.guest_context.debugreg[6] = condition;
2532 ler_enable();
2534 return do_guest_trap(TRAP_debug, regs, 0);
2536 out:
2537 ler_enable();
2538 return EXCRET_not_a_fault;
2539 }
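The condition value do_debug() reads from %db6 is a bitmask: the low four bits say which hardware breakpoint fired (the DR_TRAP0..DR_TRAP3 masks used above) and bit 14 flags a single-step trap. A small self-contained sketch, assuming the conventional debugreg.h values for those constants:

/* Illustrative only: the %db6 condition bits examined by do_debug().
 * Values assume the conventional debugreg.h definitions. */
#include <assert.h>

#define DR_TRAP0 0x0001   /* hardware breakpoint 0 hit */
#define DR_TRAP3 0x0008   /* hardware breakpoint 3 hit */
#define DR_STEP  0x4000   /* single-step (TF) trap     */

int main(void)
{
    unsigned long condition = DR_TRAP0 | DR_STEP;

    /* A breakpoint bit reported while the guest's DR7 is zero is exactly
     * what the "lazy DR7" test above discards as spurious. */
    assert(condition & (DR_TRAP0 | DR_TRAP3));
    assert(condition & DR_STEP);
    return 0;
}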
2541 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2542 {
2543 return EXCRET_not_a_fault;
2544 }
2546 void set_intr_gate(unsigned int n, void *addr)
2547 {
2548 int i;
2549 /* Keep secondary tables in sync with IRQ updates. */
2550 for ( i = 1; i < NR_CPUS; i++ )
2551 if ( idt_tables[i] != NULL )
2552 _set_gate(&idt_tables[i][n], 14, 0, addr);
2553 _set_gate(&idt_table[n], 14, 0, addr);
2554 }
2556 void set_system_gate(unsigned int n, void *addr)
2557 {
2558 _set_gate(idt_table+n,14,3,addr);
2559 }
2561 void set_task_gate(unsigned int n, unsigned int sel)
2562 {
2563 idt_table[n].a = sel << 16;
2564 idt_table[n].b = 0x8500;
2565 }
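set_intr_gate() and set_system_gate() pass 14 to _set_gate(), the interrupt-gate type, while set_task_gate() hand-assembles the descriptor's high word: 0x8500 means present (bit 15), DPL 0 (bits 13-14) and type 5, a task gate, with the TSS selector shifted into the top half of the low word. A quick check of that 0x8500 breakdown:

/* Illustrative only: how the 0x8500 written by set_task_gate() decomposes. */
#include <assert.h>

int main(void)
{
    unsigned int b = 0x8500;   /* high word of the gate descriptor */

    assert((b >> 15) & 1);             /* P   = 1: gate present    */
    assert(((b >> 13) & 3) == 0);      /* DPL = 0: hypervisor only */
    assert(((b >> 8) & 0x1f) == 5);    /* S=0, type 5: task gate   */
    return 0;
}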
2567 void set_tss_desc(unsigned int n, void *addr)
2569 _set_tssldt_desc(
2570 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2571 (unsigned long)addr,
2572 offsetof(struct tss_struct, __cacheline_filler) - 1,
2573 9);
2574 #ifdef CONFIG_COMPAT
2575 _set_tssldt_desc(
2576 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2577 (unsigned long)addr,
2578 offsetof(struct tss_struct, __cacheline_filler) - 1,
2579 11);
2580 #endif
2583 void __devinit percpu_traps_init(void)
2585 subarch_percpu_traps_init();
2587 if ( !opt_ler )
2588 return;
2590 switch ( boot_cpu_data.x86_vendor )
2592 case X86_VENDOR_INTEL:
2593 switch ( boot_cpu_data.x86 )
2595 case 6:
2596 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2597 break;
2598 case 15:
2599 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2600 break;
2602 break;
2603 case X86_VENDOR_AMD:
2604 switch ( boot_cpu_data.x86 )
2606 case 6:
2607 case 15:
2608 case 16:
2609 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2610 break;
2612 break;
2615 ler_enable();
2618 void __init trap_init(void)
2620 /*
2621 * Note that interrupt gates are always used, rather than trap gates. We
2622 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2623 * first activation must have the "bad" value(s) for these registers and
2624 * we may lose them if another activation is installed before they are
2625 * saved. The page-fault handler also needs interrupts disabled until %cr2
2626 * has been read and saved on the stack.
2627 */
2628 set_intr_gate(TRAP_divide_error,&divide_error);
2629 set_intr_gate(TRAP_debug,&debug);
2630 set_intr_gate(TRAP_nmi,&nmi);
2631 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2632 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2633 set_intr_gate(TRAP_bounds,&bounds);
2634 set_intr_gate(TRAP_invalid_op,&invalid_op);
2635 set_intr_gate(TRAP_no_device,&device_not_available);
2636 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2637 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2638 set_intr_gate(TRAP_no_segment,&segment_not_present);
2639 set_intr_gate(TRAP_stack_error,&stack_segment);
2640 set_intr_gate(TRAP_gp_fault,&general_protection);
2641 set_intr_gate(TRAP_page_fault,&page_fault);
2642 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2643 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2644 set_intr_gate(TRAP_alignment_check,&alignment_check);
2645 set_intr_gate(TRAP_machine_check,&machine_check);
2646 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2648 /* CPU0 uses the master IDT. */
2649 idt_tables[0] = idt_table;
2651 percpu_traps_init();
2653 cpu_init();
2655 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2656 }
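The gate type matters for the comment at the top of trap_init(): an interrupt gate (type 14) clears EFLAGS.IF on entry, so nothing can nest before the handler has saved the segment registers and %cr2, whereas a trap gate (type 15) would leave IF set. A one-line reminder of the two encodings:

/* Illustrative only: the gate type argument handed to _set_gate() above. */
enum gate_type {
    GATE_TYPE_INTERRUPT = 14,   /* EFLAGS.IF cleared on entry           */
    GATE_TYPE_TRAP      = 15,   /* EFLAGS.IF preserved; not used by Xen */
};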
2658 long register_guest_nmi_callback(unsigned long address)
2659 {
2660 struct vcpu *v = current;
2661 struct domain *d = current->domain;
2662 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2664 t->vector = TRAP_nmi;
2665 t->flags = 0;
2666 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2667 t->address = address;
2668 TI_SET_IF(t, 1);
2670 /*
2671 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2672 * now.
2673 */
2674 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2675 v->nmi_pending = 1;
2677 return 0;
2678 }
2680 long unregister_guest_nmi_callback(void)
2681 {
2682 struct vcpu *v = current;
2683 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2685 memset(t, 0, sizeof(*t));
2687 return 0;
2688 }
2690 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2691 {
2692 struct trap_info cur;
2693 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2694 long rc = 0;
2696 /* If no table is presented then clear the entire virtual IDT. */
2697 if ( guest_handle_is_null(traps) )
2698 {
2699 memset(dst, 0, 256 * sizeof(*dst));
2700 init_int80_direct_trap(current);
2701 return 0;
2702 }
2704 for ( ; ; )
2705 {
2706 if ( hypercall_preempt_check() )
2707 {
2708 rc = hypercall_create_continuation(
2709 __HYPERVISOR_set_trap_table, "h", traps);
2710 break;
2711 }
2713 if ( copy_from_guest(&cur, traps, 1) )
2714 {
2715 rc = -EFAULT;
2716 break;
2717 }
2719 if ( cur.address == 0 )
2720 break;
2722 fixup_guest_code_selector(current->domain, cur.cs);
2724 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2726 if ( cur.vector == 0x80 )
2727 init_int80_direct_trap(current);
2729 guest_handle_add_offset(traps, 1);
2730 }
2732 return rc;
2733 }
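Seen from the guest, the hypercall handled above replaces the lidt a native kernel would execute: the guest passes an array of trap_info_t entries (vector, flags, code selector, handler address, per the public ABI), terminated by an entry with a zero address, and the low bits of flags carry the DPL that permit_softint() later checks, so giving vector 0x80 a DPL of 3 is what lets user space issue INT 0x80. A hedged guest-side sketch; the handler symbols are hypothetical and FLAT_KERNEL_CS is the usual PV kernel code selector:

/* Illustrative guest-side sketch, not Xen code: handing a virtual IDT to
 * the hypervisor.  The *_entry handler symbols are hypothetical. */
static trap_info_t trap_table[] = {
    {    0, 0, FLAT_KERNEL_CS, (unsigned long)divide_error_entry },
    {   13, 0, FLAT_KERNEL_CS, (unsigned long)gp_fault_entry     },
    { 0x80, 3, FLAT_KERNEL_CS, (unsigned long)int80_entry        },
    { 0, 0, 0, 0 }  /* a zero address terminates the table (see above) */
};

static void install_trap_table(void)
{
    HYPERVISOR_set_trap_table(trap_table);
}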
2736 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2738 int i;
2740 switch ( reg )
2742 case 0:
2743 if ( !access_ok(value, sizeof(long)) )
2744 return -EPERM;
2745 if ( p == current )
2746 asm volatile ( "mov %0, %%db0" : : "r" (value) );
2747 break;
2748 case 1:
2749 if ( !access_ok(value, sizeof(long)) )
2750 return -EPERM;
2751 if ( p == current )
2752 asm volatile ( "mov %0, %%db1" : : "r" (value) );
2753 break;
2754 case 2:
2755 if ( !access_ok(value, sizeof(long)) )
2756 return -EPERM;
2757 if ( p == current )
2758 asm volatile ( "mov %0, %%db2" : : "r" (value) );
2759 break;
2760 case 3:
2761 if ( !access_ok(value, sizeof(long)) )
2762 return -EPERM;
2763 if ( p == current )
2764 asm volatile ( "mov %0, %%db3" : : "r" (value) );
2765 break;
2766 case 6:
2767 /*
2768 * DR6: Bits 4-11,16-31 reserved (set to 1).
2769 * Bit 12 reserved (set to 0).
2770 */
2771 value &= 0xffffefff; /* reserved bits => 0 */
2772 value |= 0xffff0ff0; /* reserved bits => 1 */
2773 if ( p == current )
2774 asm volatile ( "mov %0, %%db6" : : "r" (value) );
2775 break;
2776 case 7:
2777 /*
2778 * DR7: Bit 10 reserved (set to 1).
2779 * Bits 11-12,14-15 reserved (set to 0).
2780 * Privileged bits:
2781 * GD (bit 13): must be 0.
2782 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2783 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2784 */
2785 /* DR7 == 0 => debugging disabled for this domain. */
2786 if ( value != 0 )
2788 value &= 0xffff27ff; /* reserved bits => 0 */
2789 value |= 0x00000400; /* reserved bits => 1 */
2790 if ( (value & (1<<13)) != 0 ) return -EPERM;
2791 for ( i = 0; i < 16; i += 2 )
2792 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2794 if ( p == current )
2795 asm volatile ( "mov %0, %%db7" : : "r" (value) );
2796 break;
2797 default:
2798 return -EINVAL;
2801 p->arch.guest_context.debugreg[reg] = value;
2802 return 0;
2803 }
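Concretely, set_debugreg() accepts a DR7 only if GD (bit 13) is clear and none of the eight R/Wn or LENn two-bit fields in the high word equals binary 10; the remaining reserved bits are forced to their architectural values before %db7 is loaded. A worked example for a single execute breakpoint in slot 0:

/* Illustrative only: a DR7 value that passes the checks above, enabling
 * breakpoint slot 0 as an execute breakpoint. */
#include <assert.h>

int main(void)
{
    unsigned long dr7 = 1UL << 1;   /* G0: globally enable breakpoint 0 */
    /* R/W0 (bits 17:16) = 00 means break on execution, LEN0 (19:18) = 00. */

    assert((dr7 & (1UL << 13)) == 0);    /* GD must stay clear        */
    assert(((dr7 >> 16) & 3) != 2);      /* R/W0 may not be binary 10 */
    assert(((dr7 >> 18) & 3) != 2);      /* LEN0 may not be binary 10 */
    return 0;
}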
2805 long do_set_debugreg(int reg, unsigned long value)
2807 return set_debugreg(current, reg, value);
2810 unsigned long do_get_debugreg(int reg)
2812 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2813 return current->arch.guest_context.debugreg[reg];
2816 /*
2817 * Local variables:
2818 * mode: C
2819 * c-set-style: "BSD"
2820 * c-basic-offset: 4
2821 * tab-width: 4
2822 * indent-tabs-mode: nil
2823 * End:
2824 */