ia64/xen-unstable

view xen/arch/x86/traps.c @ 17965:14fd83fe71c3

Add facility to get notification of domain suspend by event channel.
This event channel will be notified when the domain transitions to the
suspended state, which can be much faster than raising VIRQ_DOM_EXC
and waiting for the notification to be propagated via xenstore.

No attempt is made here to prevent multiple subscribers (last one
wins), or to detect that the subscriber has gone away. Userspace tools
should take care.

Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jul 04 12:00:24 2008 +0100 (2008-07-04)
parents b55f6d42668d
children d711529e3de1
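As an illustration of how the userspace side might consume this notification (a minimal sketch only; the libxc/evtchn helper names below are assumptions about the tools, not part of this changeset):

#include <xenctrl.h>

/* Register the guest's suspend event channel with Xen, bind our end of the
 * interdomain channel in dom0, and block until it fires. Every helper name
 * here is assumed for illustration, not taken from this changeset. */
static int wait_for_domain_suspend(int xc_handle, int xce_handle,
                                   uint32_t domid, uint32_t suspend_port)
{
    int local;

    /* Ask Xen to notify this port when the domain suspends (assumed helper). */
    if ( xc_domain_subscribe_for_suspend(xc_handle, domid, suspend_port) )
        return -1;

    /* Bind our end of the channel and wait for the notification. */
    local = xc_evtchn_bind_interdomain(xce_handle, domid, suspend_port);
    if ( local < 0 )
        return -1;

    if ( xc_evtchn_pending(xce_handle) == local ) /* blocks until an event arrives */
        xc_evtchn_unmask(xce_handle, local);

    return 0;
}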
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
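/* Example (annotation): the default above can be overridden on the Xen
 * command line, e.g. booting with "nmi=ignore" selects the 'ignore'
 * behaviour described in the comment above. */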
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
259 #endif
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
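/*
 * Worked example (annotation, not part of the original file): with CR4.DE
 * set, DR0 = 0x60, breakpoint 0 enabled, and its DR7 control field set to
 * an I/O breakpoint of length 1 (DR_LEN_1), a 1-byte access to port 0x60
 * satisfies (start < port + len) && (start + width > port) above, so the
 * function returns a match mask of 1 << 0.
 */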
489 /*
490 * Called from asm to set up the NMI trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_nmi_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
497 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
498 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
499 return !null_trap_bounce(v, tb);
500 }
502 static inline void do_trap(
503 int trapnr, struct cpu_user_regs *regs, int use_error_code)
504 {
505 struct vcpu *curr = current;
506 unsigned long fixup;
508 DEBUGGER_trap_entry(trapnr, regs);
510 if ( guest_mode(regs) )
511 {
512 do_guest_trap(trapnr, regs, use_error_code);
513 return;
514 }
516 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
517 {
518 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
519 trapnr, _p(regs->eip), _p(fixup));
520 regs->eip = fixup;
521 return;
522 }
524 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
525 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
526 {
527 curr->arch.hvm_vcpu.fpu_exception_callback(
528 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
529 return;
530 }
532 DEBUGGER_trap_fatal(trapnr, regs);
534 show_execution_state(regs);
535 panic("FATAL TRAP: vector = %d (%s)\n"
536 "[error_code=%04x]\n",
537 trapnr, trapstr(trapnr), regs->error_code);
538 }
540 #define DO_ERROR_NOCODE(trapnr, name) \
541 asmlinkage void do_##name(struct cpu_user_regs *regs) \
542 { \
543 do_trap(trapnr, regs, 0); \
544 }
546 #define DO_ERROR(trapnr, name) \
547 asmlinkage void do_##name(struct cpu_user_regs *regs) \
548 { \
549 do_trap(trapnr, regs, 1); \
550 }
552 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
553 DO_ERROR_NOCODE(TRAP_overflow, overflow)
554 DO_ERROR_NOCODE(TRAP_bounds, bounds)
555 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
556 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
557 DO_ERROR( TRAP_no_segment, segment_not_present)
558 DO_ERROR( TRAP_stack_error, stack_segment)
559 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
560 DO_ERROR( TRAP_alignment_check, alignment_check)
561 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
563 int rdmsr_hypervisor_regs(
564 uint32_t idx, uint32_t *eax, uint32_t *edx)
565 {
566 idx -= 0x40000000;
567 if ( idx > 0 )
568 return 0;
570 switch ( idx )
571 {
572 case 0:
573 {
574 *eax = *edx = 0;
575 break;
576 }
577 default:
578 BUG();
579 }
581 return 1;
582 }
584 int wrmsr_hypervisor_regs(
585 uint32_t idx, uint32_t eax, uint32_t edx)
586 {
587 struct domain *d = current->domain;
589 idx -= 0x40000000;
590 if ( idx > 0 )
591 return 0;
593 switch ( idx )
594 {
595 case 0:
596 {
597 void *hypercall_page;
598 unsigned long mfn;
599 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
600 unsigned int idx = eax & 0xfff;
602 if ( idx > 0 )
603 {
604 gdprintk(XENLOG_WARNING,
605 "Out of range index %u to MSR %08x\n",
606 idx, 0x40000000);
607 return 0;
608 }
610 mfn = gmfn_to_mfn(d, gmfn);
612 if ( !mfn_valid(mfn) ||
613 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
614 {
615 gdprintk(XENLOG_WARNING,
616 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
617 gmfn, mfn, 0x40000000);
618 return 0;
619 }
621 hypercall_page = map_domain_page(mfn);
622 hypercall_page_initialise(d, hypercall_page);
623 unmap_domain_page(hypercall_page);
625 put_page_and_type(mfn_to_page(mfn));
626 break;
627 }
629 default:
630 BUG();
631 }
633 return 1;
634 }
636 int cpuid_hypervisor_leaves(
637 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
638 {
639 idx -= 0x40000000;
640 if ( idx > 2 )
641 return 0;
643 switch ( idx )
644 {
645 case 0:
646 *eax = 0x40000002; /* Largest leaf */
647 *ebx = XEN_CPUID_SIGNATURE_EBX;
648 *ecx = XEN_CPUID_SIGNATURE_ECX;
649 *edx = XEN_CPUID_SIGNATURE_EDX;
650 break;
652 case 1:
653 *eax = (xen_major_version() << 16) | xen_minor_version();
654 *ebx = 0; /* Reserved */
655 *ecx = 0; /* Reserved */
656 *edx = 0; /* Reserved */
657 break;
659 case 2:
660 *eax = 1; /* Number of hypercall-transfer pages */
661 *ebx = 0x40000000; /* MSR base address */
662 *ecx = 0; /* Features 1 */
663 *edx = 0; /* Features 2 */
664 if ( !is_hvm_vcpu(current) )
665 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
666 break;
668 default:
669 BUG();
670 }
672 return 1;
673 }
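/*
 * Illustrative guest-side sketch (not part of traps.c): how a guest can use
 * the CPUID leaves and MSR implemented above to install the hypercall page.
 * The helpers below are assumptions made for the example, not an existing API.
 */
static inline void xen_cpuid_leaf(uint32_t leaf, uint32_t *a, uint32_t *b,
                                  uint32_t *c, uint32_t *d)
{
    asm volatile ( "cpuid"
                   : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                   : "0" (leaf), "2" (0) );
}

static void install_hypercall_page(uint64_t page_gpa /* page-aligned */)
{
    uint32_t npages, msr_base, c, d;

    /* Leaf 0x40000002: eax = number of hypercall pages, ebx = MSR base. */
    xen_cpuid_leaf(0x40000002, &npages, &msr_base, &c, &d);

    /* wrmsr_hypervisor_regs() above decodes edx:eax as the page's physical
     * address, with the page index in the low 12 bits of eax (0 here). */
    asm volatile ( "wrmsr" : :
                   "c" (msr_base),
                   "a" ((uint32_t)page_gpa | 0),
                   "d" ((uint32_t)(page_gpa >> 32)) );
}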
675 static void pv_cpuid(struct cpu_user_regs *regs)
676 {
677 uint32_t a, b, c, d;
679 a = regs->eax;
680 b = regs->ebx;
681 c = regs->ecx;
682 d = regs->edx;
684 if ( current->domain->domain_id != 0 )
685 {
686 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
687 domain_cpuid(current->domain, a, b, &a, &b, &c, &d);
688 goto out;
689 }
691 asm (
692 "cpuid"
693 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
694 : "0" (a), "1" (b), "2" (c), "3" (d) );
696 if ( (regs->eax & 0x7fffffff) == 1 )
697 {
698 /* Modify Feature Information. */
699 __clear_bit(X86_FEATURE_VME, &d);
700 __clear_bit(X86_FEATURE_PSE, &d);
701 __clear_bit(X86_FEATURE_PGE, &d);
702 __clear_bit(X86_FEATURE_MCE, &d);
703 __clear_bit(X86_FEATURE_MCA, &d);
704 __clear_bit(X86_FEATURE_PSE36, &d);
705 }
706 switch ( (uint32_t)regs->eax )
707 {
708 case 1:
709 /* Modify Feature Information. */
710 if ( !cpu_has_sep )
711 __clear_bit(X86_FEATURE_SEP, &d);
712 #ifdef __i386__
713 if ( !supervisor_mode_kernel )
714 __clear_bit(X86_FEATURE_SEP, &d);
715 #endif
716 __clear_bit(X86_FEATURE_DS, &d);
717 __clear_bit(X86_FEATURE_ACC, &d);
718 __clear_bit(X86_FEATURE_PBE, &d);
720 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
721 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
722 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
723 __clear_bit(X86_FEATURE_VMXE % 32, &c);
724 __clear_bit(X86_FEATURE_SMXE % 32, &c);
725 __clear_bit(X86_FEATURE_TM2 % 32, &c);
726 if ( is_pv_32bit_vcpu(current) )
727 __clear_bit(X86_FEATURE_CX16 % 32, &c);
728 __clear_bit(X86_FEATURE_XTPR % 32, &c);
729 __clear_bit(X86_FEATURE_PDCM % 32, &c);
730 __clear_bit(X86_FEATURE_DCA % 32, &c);
731 break;
732 case 0x80000001:
733 /* Modify Feature Information. */
734 if ( is_pv_32bit_vcpu(current) )
735 {
736 __clear_bit(X86_FEATURE_LM % 32, &d);
737 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
738 }
739 #ifndef __i386__
740 if ( is_pv_32on64_vcpu(current) &&
741 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
742 #endif
743 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
744 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
745 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
747 __clear_bit(X86_FEATURE_SVME % 32, &c);
748 __clear_bit(X86_FEATURE_OSVW % 32, &c);
749 __clear_bit(X86_FEATURE_IBS % 32, &c);
750 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
751 __clear_bit(X86_FEATURE_WDT % 32, &c);
752 break;
753 case 5: /* MONITOR/MWAIT */
754 case 0xa: /* Architectural Performance Monitor Features */
755 case 0x8000000a: /* SVM revision and features */
756 case 0x8000001b: /* Instruction Based Sampling */
757 a = b = c = d = 0;
758 break;
759 default:
760 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
761 break;
762 }
764 out:
765 regs->eax = a;
766 regs->ebx = b;
767 regs->ecx = c;
768 regs->edx = d;
769 }
771 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
772 {
773 char sig[5], instr[2];
774 unsigned long eip, rc;
776 eip = regs->eip;
778 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
779 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
780 {
781 propagate_page_fault(eip + sizeof(sig) - rc, 0);
782 return EXCRET_fault_fixed;
783 }
784 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
785 return 0;
786 eip += sizeof(sig);
788 /* We only emulate CPUID. */
789 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
790 {
791 propagate_page_fault(eip + sizeof(instr) - rc, 0);
792 return EXCRET_fault_fixed;
793 }
794 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
795 return 0;
796 eip += sizeof(instr);
798 pv_cpuid(regs);
800 instruction_done(regs, eip, 0);
802 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
804 return EXCRET_fault_fixed;
805 }
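/*
 * Illustrative guest-side sketch (not part of traps.c): a PV guest requests
 * the emulation above by prefixing CPUID with the 5-byte signature
 * "ud2 ; .ascii \"xen\"" that emulate_forced_invalid_op() checks for.
 */
static inline void forced_cpuid(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                   : "0" (*a), "2" (*c) );
}
/*
 * The ud2 raises #UD, do_invalid_op() below calls emulate_forced_invalid_op(),
 * which runs pv_cpuid() and then advances EIP past the whole 7-byte sequence
 * via instruction_done().
 */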
807 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
808 {
809 struct bug_frame bug;
810 struct bug_frame_str bug_str;
811 char *filename, *predicate, *eip = (char *)regs->eip;
812 unsigned long fixup;
813 int id, lineno;
815 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
817 if ( likely(guest_mode(regs)) )
818 {
819 if ( !emulate_forced_invalid_op(regs) )
820 do_guest_trap(TRAP_invalid_op, regs, 0);
821 return;
822 }
824 if ( !is_kernel(eip) ||
825 __copy_from_user(&bug, eip, sizeof(bug)) ||
826 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
827 (bug.ret != 0xc2) )
828 goto die;
829 eip += sizeof(bug);
831 id = bug.id & 3;
833 if ( id == BUGFRAME_dump )
834 {
835 show_execution_state(regs);
836 regs->eip = (unsigned long)eip;
837 return;
838 }
840 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
841 if ( !is_kernel(eip) ||
842 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
843 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
844 goto die;
845 eip += sizeof(bug_str);
847 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
848 lineno = bug.id >> 2;
850 if ( id == BUGFRAME_warn )
851 {
852 printk("Xen WARN at %.50s:%d\n", filename, lineno);
853 show_execution_state(regs);
854 regs->eip = (unsigned long)eip;
855 return;
856 }
858 if ( id == BUGFRAME_bug )
859 {
860 printk("Xen BUG at %.50s:%d\n", filename, lineno);
861 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
862 show_execution_state(regs);
863 panic("Xen BUG at %.50s:%d\n", filename, lineno);
864 }
866 /* ASSERT: decode the predicate string pointer. */
867 ASSERT(id == BUGFRAME_assert);
868 if ( !is_kernel(eip) ||
869 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
870 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
871 goto die;
872 eip += sizeof(bug_str);
874 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
875 printk("Assertion '%s' failed at %.50s:%d\n",
876 predicate, filename, lineno);
877 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
878 show_execution_state(regs);
879 panic("Assertion '%s' failed at %.50s:%d\n",
880 predicate, filename, lineno);
882 die:
883 if ( (fixup = search_exception_table(regs->eip)) != 0 )
884 {
885 regs->eip = fixup;
886 return;
887 }
888 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
889 show_execution_state(regs);
890 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
891 }
893 asmlinkage void do_int3(struct cpu_user_regs *regs)
894 {
895 DEBUGGER_trap_entry(TRAP_int3, regs);
897 if ( !guest_mode(regs) )
898 {
899 debugger_trap_fatal(TRAP_int3, regs);
900 return;
901 }
903 do_guest_trap(TRAP_int3, regs, 0);
904 }
906 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
907 {
908 extern fastcall void (*machine_check_vector)(
909 struct cpu_user_regs *, long error_code);
910 machine_check_vector(regs, regs->error_code);
911 }
913 static void reserved_bit_page_fault(
914 unsigned long addr, struct cpu_user_regs *regs)
915 {
916 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
917 current->domain->domain_id, current->vcpu_id, regs->error_code);
918 show_page_walk(addr);
919 show_execution_state(regs);
920 }
922 void propagate_page_fault(unsigned long addr, u16 error_code)
923 {
924 struct trap_info *ti;
925 struct vcpu *v = current;
926 struct trap_bounce *tb = &v->arch.trap_bounce;
928 v->arch.guest_context.ctrlreg[2] = addr;
929 arch_set_cr2(v, addr);
931 /* Re-set error_code.user flag appropriately for the guest. */
932 error_code &= ~PFEC_user_mode;
933 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
934 error_code |= PFEC_user_mode;
936 trace_pv_page_fault(addr, error_code);
938 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
939 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
940 tb->error_code = error_code;
941 tb->cs = ti->cs;
942 tb->eip = ti->address;
943 if ( TI_GET_IF(ti) )
944 tb->flags |= TBF_INTERRUPT;
945 if ( unlikely(null_trap_bounce(v, tb)) )
946 {
947 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
948 v->domain->domain_id, v->vcpu_id, error_code);
949 show_page_walk(addr);
950 }
952 if ( unlikely(error_code & PFEC_reserved_bit) )
953 reserved_bit_page_fault(addr, guest_cpu_user_regs());
954 }
956 static int handle_gdt_ldt_mapping_fault(
957 unsigned long offset, struct cpu_user_regs *regs)
958 {
959 struct vcpu *curr = current;
960 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
961 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
962 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
964 /* Should never fault in another vcpu's area. */
965 BUG_ON(vcpu_area != curr->vcpu_id);
967 /* Byte offset within the gdt/ldt sub-area. */
968 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
970 if ( likely(is_ldt_area) )
971 {
972 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
973 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
974 {
975 if ( guest_mode(regs) )
976 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
977 regs->eip, offset);
978 }
979 else
980 {
981 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
982 if ( !guest_mode(regs) )
983 return 0;
984 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
985 propagate_page_fault(
986 curr->arch.guest_context.ldt_base + offset,
987 regs->error_code);
988 }
989 }
990 else
991 {
992 /* GDT fault: handle the fault as #GP(selector). */
993 regs->error_code = (u16)offset & ~7;
994 (void)do_general_protection(regs);
995 }
997 return EXCRET_fault_fixed;
998 }
1000 #ifdef HYPERVISOR_VIRT_END
1001 #define IN_HYPERVISOR_RANGE(va) \
1002 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1003 #else
1004 #define IN_HYPERVISOR_RANGE(va) \
1005 (((va) >= HYPERVISOR_VIRT_START))
1006 #endif
1008 static int __spurious_page_fault(
1009 unsigned long addr, struct cpu_user_regs *regs)
1010 {
1011 unsigned long mfn, cr3 = read_cr3();
1012 #if CONFIG_PAGING_LEVELS >= 4
1013 l4_pgentry_t l4e, *l4t;
1014 #endif
1015 #if CONFIG_PAGING_LEVELS >= 3
1016 l3_pgentry_t l3e, *l3t;
1017 #endif
1018 l2_pgentry_t l2e, *l2t;
1019 l1_pgentry_t l1e, *l1t;
1020 unsigned int required_flags, disallowed_flags;
1022 /*
1023 * We do not take spurious page faults in IRQ handlers as we do not
1024 * modify page tables in IRQ context. We therefore bail here because
1025 * map_domain_page() is not IRQ-safe.
1026 */
1027 if ( in_irq() )
1028 return 0;
1030 /* Reserved bit violations are never spurious faults. */
1031 if ( regs->error_code & PFEC_reserved_bit )
1032 return 0;
1034 required_flags = _PAGE_PRESENT;
1035 if ( regs->error_code & PFEC_write_access )
1036 required_flags |= _PAGE_RW;
1037 if ( regs->error_code & PFEC_user_mode )
1038 required_flags |= _PAGE_USER;
1040 disallowed_flags = 0;
1041 if ( regs->error_code & PFEC_insn_fetch )
1042 disallowed_flags |= _PAGE_NX;
1044 mfn = cr3 >> PAGE_SHIFT;
1046 #if CONFIG_PAGING_LEVELS >= 4
1047 l4t = map_domain_page(mfn);
1048 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1049 mfn = l4e_get_pfn(l4e);
1050 unmap_domain_page(l4t);
1051 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1052 (l4e_get_flags(l4e) & disallowed_flags) )
1053 return 0;
1054 #endif
1056 #if CONFIG_PAGING_LEVELS >= 3
1057 l3t = map_domain_page(mfn);
1058 #if CONFIG_PAGING_LEVELS == 3
1059 l3t += (cr3 & 0xFE0UL) >> 3;
1060 #endif
1061 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1062 mfn = l3e_get_pfn(l3e);
1063 unmap_domain_page(l3t);
1064 #if CONFIG_PAGING_LEVELS == 3
1065 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1066 return 0;
1067 #else
1068 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1069 (l3e_get_flags(l3e) & disallowed_flags) )
1070 return 0;
1071 #endif
1072 #endif
1074 l2t = map_domain_page(mfn);
1075 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1076 mfn = l2e_get_pfn(l2e);
1077 unmap_domain_page(l2t);
1078 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1079 (l2e_get_flags(l2e) & disallowed_flags) )
1080 return 0;
1081 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1082 {
1083 l1e = l1e_empty(); /* define before use in debug tracing */
1084 goto spurious;
1085 }
1087 l1t = map_domain_page(mfn);
1088 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1089 mfn = l1e_get_pfn(l1e);
1090 unmap_domain_page(l1t);
1091 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1092 (l1e_get_flags(l1e) & disallowed_flags) )
1093 return 0;
1095 spurious:
1096 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1097 "at addr %lx, e/c %04x\n",
1098 current->domain->domain_id, current->vcpu_id,
1099 addr, regs->error_code);
1100 #if CONFIG_PAGING_LEVELS >= 4
1101 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1102 #endif
1103 #if CONFIG_PAGING_LEVELS >= 3
1104 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1105 #endif
1106 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1107 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1108 #ifndef NDEBUG
1109 show_registers(regs);
1110 #endif
1111 return 1;
1112 }
1114 static int spurious_page_fault(
1115 unsigned long addr, struct cpu_user_regs *regs)
1116 {
1117 unsigned long flags;
1118 int is_spurious;
1120 /*
1121 * Disabling interrupts prevents TLB flushing, and hence prevents
1122 * page tables from becoming invalid under our feet during the walk.
1123 */
1124 local_irq_save(flags);
1125 is_spurious = __spurious_page_fault(addr, regs);
1126 local_irq_restore(flags);
1128 return is_spurious;
1129 }
1131 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1132 {
1133 struct vcpu *v = current;
1134 struct domain *d = v->domain;
1136 /* No fixups in interrupt context or when interrupts are disabled. */
1137 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1138 return 0;
1140 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1141 {
1142 if ( paging_mode_external(d) && guest_mode(regs) )
1143 {
1144 int ret = paging_fault(addr, regs);
1145 if ( ret == EXCRET_fault_fixed )
1146 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1147 return ret;
1148 }
1149 if ( !(regs->error_code & PFEC_reserved_bit) &&
1150 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1151 return handle_gdt_ldt_mapping_fault(
1152 addr - GDT_LDT_VIRT_START, regs);
1153 return 0;
1154 }
1156 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1157 guest_kernel_mode(v, regs) &&
1158 /* Do not check if access-protection fault since the page may
1159 legitimately be not present in shadow page tables */
1160 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1161 PFEC_write_access) &&
1162 ptwr_do_page_fault(v, addr, regs) )
1163 return EXCRET_fault_fixed;
1165 if ( paging_mode_enabled(d) )
1166 {
1167 int ret = paging_fault(addr, regs);
1168 if ( ret == EXCRET_fault_fixed )
1169 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1170 return ret;
1171 }
1173 return 0;
1174 }
1176 /*
1177 * #PF error code:
1178 * Bit 0: Protection violation (=1) ; Page not present (=0)
1179 * Bit 1: Write access
1180 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1181 * Bit 3: Reserved bit violation
1182 * Bit 4: Instruction fetch
1183 */
1184 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1185 {
1186 unsigned long addr, fixup;
1188 addr = read_cr2();
1190 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1192 perfc_incr(page_faults);
1194 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1195 return;
1197 if ( unlikely(!guest_mode(regs)) )
1198 {
1199 if ( spurious_page_fault(addr, regs) )
1200 return;
1202 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1203 {
1204 perfc_incr(copy_user_faults);
1205 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1206 reserved_bit_page_fault(addr, regs);
1207 regs->eip = fixup;
1208 return;
1209 }
1211 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1213 show_execution_state(regs);
1214 show_page_walk(addr);
1215 panic("FATAL PAGE FAULT\n"
1216 "[error_code=%04x]\n"
1217 "Faulting linear address: %p\n",
1218 regs->error_code, _p(addr));
1219 }
1221 propagate_page_fault(addr, regs->error_code);
1222 }
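/*
 * Worked example (annotation, not part of the original file): using the bit
 * layout documented above, ec=0x0002 is a write to a not-present page from
 * supervisor mode, while ec=0x0007 is a protection violation on a write
 * from user mode.
 */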
1224 /*
1225 * Early #PF handler to print CR2, error code, and stack.
1227 * We also deal with spurious faults here, even though they should never happen
1228 * during early boot (an issue was seen once, but was most likely a hardware
1229 * problem).
1230 */
1231 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1232 {
1233 static int stuck;
1234 static unsigned long prev_eip, prev_cr2;
1235 unsigned long cr2 = read_cr2();
1237 BUG_ON(smp_processor_id() != 0);
1239 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1240 {
1241 prev_eip = regs->eip;
1242 prev_cr2 = cr2;
1243 stuck = 0;
1244 return;
1245 }
1247 if ( stuck++ == 1000 )
1248 {
1249 unsigned long *stk = (unsigned long *)regs;
1250 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1251 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1252 printk("Stack dump: ");
1253 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1254 printk("%p ", _p(*stk++));
1255 for ( ; ; ) ;
1256 }
1257 }
1259 long do_fpu_taskswitch(int set)
1260 {
1261 struct vcpu *v = current;
1263 if ( set )
1264 {
1265 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1266 stts();
1267 }
1268 else
1269 {
1270 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1271 if ( v->fpu_dirtied )
1272 clts();
1273 }
1275 return 0;
1276 }
1278 static int read_descriptor(unsigned int sel,
1279 const struct vcpu *v,
1280 const struct cpu_user_regs * regs,
1281 unsigned long *base,
1282 unsigned long *limit,
1283 unsigned int *ar,
1284 unsigned int vm86attr)
1286 struct desc_struct desc;
1288 if ( !vm86_mode(regs) )
1290 if ( sel < 4)
1291 desc.b = desc.a = 0;
1292 else if ( __get_user(desc,
1293 (const struct desc_struct *)(!(sel & 4)
1294 ? GDT_VIRT_START(v)
1295 : LDT_VIRT_START(v))
1296 + (sel >> 3)) )
1297 return 0;
1298 if ( !(vm86attr & _SEGMENT_CODE) )
1299 desc.b &= ~_SEGMENT_L;
1301 else
1303 desc.a = (sel << 20) | 0xffff;
1304 desc.b = vm86attr | (sel >> 12);
1307 *ar = desc.b & 0x00f0ff00;
1308 if ( !(desc.b & _SEGMENT_L) )
1310 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1311 (desc.b & 0xff000000));
1312 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1313 if ( desc.b & _SEGMENT_G )
1314 *limit = ((*limit + 1) << 12) - 1;
1315 #ifndef NDEBUG
1316 if ( !vm86_mode(regs) && (sel > 3) )
1318 unsigned int a, l;
1319 unsigned char valid;
1321 asm volatile (
1322 "larl %2,%0 ; setz %1"
1323 : "=r" (a), "=rm" (valid) : "rm" (sel));
1324 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1325 asm volatile (
1326 "lsll %2,%0 ; setz %1"
1327 : "=r" (l), "=rm" (valid) : "rm" (sel));
1328 BUG_ON(valid && (l != *limit));
1330 #endif
1332 else
1334 *base = 0UL;
1335 *limit = ~0UL;
1338 return 1;
1341 #ifdef __x86_64__
1342 static int read_gate_descriptor(unsigned int gate_sel,
1343 const struct vcpu *v,
1344 unsigned int *sel,
1345 unsigned long *off,
1346 unsigned int *ar)
1348 struct desc_struct desc;
1349 const struct desc_struct *pdesc;
1352 pdesc = (const struct desc_struct *)
1353 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1354 + (gate_sel >> 3);
1355 if ( (gate_sel < 4) ||
1356 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1357 __get_user(desc, pdesc) )
1358 return 0;
1360 *sel = (desc.a >> 16) & 0x0000fffc;
1361 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1362 *ar = desc.b & 0x0000ffff;
1364 /*
1365 * check_descriptor() clears the DPL field and stores the
1366 * guest requested DPL in the selector's RPL field.
1367 */
1368 if ( *ar & _SEGMENT_DPL )
1369 return 0;
1370 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1372 if ( !is_pv_32bit_vcpu(v) )
1374 if ( (*ar & 0x1f00) != 0x0c00 ||
1375 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1376 __get_user(desc, pdesc + 1) ||
1377 (desc.b & 0x1f00) )
1378 return 0;
1380 *off |= (unsigned long)desc.a << 32;
1381 return 1;
1384 switch ( *ar & 0x1f00 )
1386 case 0x0400:
1387 *off &= 0xffff;
1388 break;
1389 case 0x0c00:
1390 break;
1391 default:
1392 return 0;
1395 return 1;
1397 #endif
1399 /* Has the guest requested sufficient permission for this I/O access? */
1400 static int guest_io_okay(
1401 unsigned int port, unsigned int bytes,
1402 struct vcpu *v, struct cpu_user_regs *regs)
1404 #if defined(__x86_64__)
1405 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1406 int user_mode = !(v->arch.flags & TF_kernel_mode);
1407 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1408 #elif defined(__i386__)
1409 #define TOGGLE_MODE() ((void)0)
1410 #endif
1412 if ( !vm86_mode(regs) &&
1413 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1414 return 1;
1416 if ( v->arch.iobmp_limit > (port + bytes) )
1418 union { uint8_t bytes[2]; uint16_t mask; } x;
1420 /*
1421 * Grab permission bytes from guest space. Inaccessible bytes are
1422 * read as 0xff (no access allowed).
1423 */
1424 TOGGLE_MODE();
1425 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1426 port>>3, 2) )
1428 default: x.bytes[0] = ~0;
1429 case 1: x.bytes[1] = ~0;
1430 case 0: break;
1432 TOGGLE_MODE();
1434 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1435 return 1;
1438 return 0;
1441 /* Has the administrator granted sufficient permission for this I/O access? */
1442 static int admin_io_okay(
1443 unsigned int port, unsigned int bytes,
1444 struct vcpu *v, struct cpu_user_regs *regs)
1446 /*
1447 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1448 * We never permit direct access to that register.
1449 */
1450 if ( (port == 0xcf8) && (bytes == 4) )
1451 return 0;
1453 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1456 static uint32_t guest_io_read(
1457 unsigned int port, unsigned int bytes,
1458 struct vcpu *v, struct cpu_user_regs *regs)
1460 extern uint32_t pci_conf_read(
1461 uint32_t cf8, uint8_t offset, uint8_t bytes);
1463 uint32_t data = 0;
1464 unsigned int shift = 0;
1466 if ( admin_io_okay(port, bytes, v, regs) )
1468 switch ( bytes )
1470 case 1: return inb(port);
1471 case 2: return inw(port);
1472 case 4: return inl(port);
1476 while ( bytes != 0 )
1478 unsigned int size = 1;
1479 uint32_t sub_data = 0xff;
1481 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1483 sub_data = pv_pit_handler(port, 0, 0);
1485 else if ( (port == 0xcf8) && (bytes == 4) )
1487 size = 4;
1488 sub_data = v->domain->arch.pci_cf8;
1490 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1492 size = min(bytes, 4 - (port & 3));
1493 if ( size == 3 )
1494 size = 2;
1495 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1498 if ( size == 4 )
1499 return sub_data;
1501 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1502 shift += size * 8;
1503 port += size;
1504 bytes -= size;
1507 return data;
1510 static void guest_io_write(
1511 unsigned int port, unsigned int bytes, uint32_t data,
1512 struct vcpu *v, struct cpu_user_regs *regs)
1514 extern void pci_conf_write(
1515 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1517 if ( admin_io_okay(port, bytes, v, regs) )
1519 switch ( bytes ) {
1520 case 1:
1521 outb((uint8_t)data, port);
1522 if ( pv_post_outb_hook )
1523 pv_post_outb_hook(port, (uint8_t)data);
1524 break;
1525 case 2:
1526 outw((uint16_t)data, port);
1527 break;
1528 case 4:
1529 outl(data, port);
1530 break;
1532 return;
1535 while ( bytes != 0 )
1537 unsigned int size = 1;
1539 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1541 pv_pit_handler(port, (uint8_t)data, 1);
1543 else if ( (port == 0xcf8) && (bytes == 4) )
1545 size = 4;
1546 v->domain->arch.pci_cf8 = data;
1548 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1550 size = min(bytes, 4 - (port & 3));
1551 if ( size == 3 )
1552 size = 2;
1553 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1556 if ( size == 4 )
1557 return;
1559 port += size;
1560 bytes -= size;
1561 data >>= size * 8;
1565 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1566 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1567 __attribute__((__regparm__(1)));
1568 unsigned long guest_to_host_gpr_switch(unsigned long)
1569 __attribute__((__regparm__(1)));
1571 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1573 /* Instruction fetch with error handling. */
1574 #define insn_fetch(type, base, eip, limit) \
1575 ({ unsigned long _rc, _ptr = (base) + (eip); \
1576 type _x; \
1577 if ( ad_default < 8 ) \
1578 _ptr = (unsigned int)_ptr; \
1579 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1580 goto fail; \
1581 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1582 { \
1583 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1584 goto skip; \
1585 } \
1586 (eip) += sizeof(_x); _x; })
1588 #if defined(CONFIG_X86_32)
1589 # define read_sreg(regs, sr) ((regs)->sr)
1590 #elif defined(CONFIG_X86_64)
1591 # define read_sreg(regs, sr) read_segment_register(sr)
1592 #endif
1594 static int emulate_privileged_op(struct cpu_user_regs *regs)
1596 struct vcpu *v = current;
1597 unsigned long *reg, eip = regs->eip, res;
1598 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1599 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1600 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1601 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1602 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1603 ? regs->reg \
1604 : ad_bytes == 4 \
1605 ? (u32)regs->reg \
1606 : (u16)regs->reg)
1607 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1608 ? regs->reg = (val) \
1609 : ad_bytes == 4 \
1610 ? (*(u32 *)&regs->reg = (val)) \
1611 : (*(u16 *)&regs->reg = (val)))
1612 unsigned long code_base, code_limit;
1613 char io_emul_stub[32];
1614 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1615 u32 l, h, eax, edx;
1617 if ( !read_descriptor(regs->cs, v, regs,
1618 &code_base, &code_limit, &ar,
1619 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1620 goto fail;
1621 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1622 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1623 if ( !(ar & _SEGMENT_S) ||
1624 !(ar & _SEGMENT_P) ||
1625 !(ar & _SEGMENT_CODE) )
1626 goto fail;
1628 /* emulating only opcodes not allowing SS to be default */
1629 data_sel = read_sreg(regs, ds);
1631 /* Legacy prefixes. */
1632 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1634 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1636 case 0x66: /* operand-size override */
1637 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1638 continue;
1639 case 0x67: /* address-size override */
1640 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1641 continue;
1642 case 0x2e: /* CS override */
1643 data_sel = regs->cs;
1644 continue;
1645 case 0x3e: /* DS override */
1646 data_sel = read_sreg(regs, ds);
1647 continue;
1648 case 0x26: /* ES override */
1649 data_sel = read_sreg(regs, es);
1650 continue;
1651 case 0x64: /* FS override */
1652 data_sel = read_sreg(regs, fs);
1653 lm_ovr = lm_seg_fs;
1654 continue;
1655 case 0x65: /* GS override */
1656 data_sel = read_sreg(regs, gs);
1657 lm_ovr = lm_seg_gs;
1658 continue;
1659 case 0x36: /* SS override */
1660 data_sel = regs->ss;
1661 continue;
1662 case 0xf0: /* LOCK */
1663 lock = 1;
1664 continue;
1665 case 0xf2: /* REPNE/REPNZ */
1666 case 0xf3: /* REP/REPE/REPZ */
1667 rep_prefix = 1;
1668 continue;
1669 default:
1670 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1672 rex = opcode;
1673 continue;
1675 break;
1677 break;
1680 /* REX prefix. */
1681 if ( rex & 8 ) /* REX.W */
1682 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1683 modrm_reg = (rex & 4) << 1; /* REX.R */
1684 /* REX.X does not need to be decoded. */
1685 modrm_rm = (rex & 1) << 3; /* REX.B */
1687 if ( opcode == 0x0f )
1688 goto twobyte_opcode;
1690 if ( lock )
1691 goto fail;
1693 /* Input/Output String instructions. */
1694 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1696 unsigned long data_base, data_limit;
1698 if ( rep_prefix && (rd_ad(ecx) == 0) )
1699 goto done;
1701 if ( !(opcode & 2) )
1703 data_sel = read_sreg(regs, es);
1704 lm_ovr = lm_seg_none;
1707 if ( !(ar & _SEGMENT_L) )
1709 if ( !read_descriptor(data_sel, v, regs,
1710 &data_base, &data_limit, &ar,
1711 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1712 _SEGMENT_P) )
1713 goto fail;
1714 if ( !(ar & _SEGMENT_S) ||
1715 !(ar & _SEGMENT_P) ||
1716 (opcode & 2 ?
1717 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1718 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1719 goto fail;
1721 #ifdef CONFIG_X86_64
1722 else
1724 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1726 switch ( lm_ovr )
1728 case lm_seg_none:
1729 data_base = 0UL;
1730 break;
1731 case lm_seg_fs:
1732 data_base = v->arch.guest_context.fs_base;
1733 break;
1734 case lm_seg_gs:
1735 if ( guest_kernel_mode(v, regs) )
1736 data_base = v->arch.guest_context.gs_base_kernel;
1737 else
1738 data_base = v->arch.guest_context.gs_base_user;
1739 break;
1742 else
1743 read_descriptor(data_sel, v, regs,
1744 &data_base, &data_limit, &ar,
1745 0);
1746 data_limit = ~0UL;
1747 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1749 #endif
1751 port = (u16)regs->edx;
1753 continue_io_string:
1754 switch ( opcode )
1756 case 0x6c: /* INSB */
1757 op_bytes = 1;
1758 case 0x6d: /* INSW/INSL */
1759 if ( (data_limit < (op_bytes - 1)) ||
1760 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1761 !guest_io_okay(port, op_bytes, v, regs) )
1762 goto fail;
1763 data = guest_io_read(port, op_bytes, v, regs);
1764 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1765 &data, op_bytes)) != 0 )
1767 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1768 PFEC_write_access);
1769 return EXCRET_fault_fixed;
1771 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1772 ? -op_bytes : op_bytes));
1773 break;
1775 case 0x6e: /* OUTSB */
1776 op_bytes = 1;
1777 case 0x6f: /* OUTSW/OUTSL */
1778 if ( (data_limit < (op_bytes - 1)) ||
1779 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1780 !guest_io_okay(port, op_bytes, v, regs) )
1781 goto fail;
1782 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1783 op_bytes)) != 0 )
1785 propagate_page_fault(data_base + rd_ad(esi)
1786 + op_bytes - rc, 0);
1787 return EXCRET_fault_fixed;
1789 guest_io_write(port, op_bytes, data, v, regs);
1790 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1791 ? -op_bytes : op_bytes));
1792 break;
1795 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1797 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1799 if ( !bpmatch && !hypercall_preempt_check() )
1800 goto continue_io_string;
1801 eip = regs->eip;
1804 goto done;
1807 /*
1808 * Very likely to be an I/O instruction (IN/OUT).
1809 * Build an on-stack stub to execute the instruction with full guest
1810 * GPR context. This is needed for some systems which (ab)use IN/OUT
1811 * to communicate with BIOS code in system-management mode.
1812 */
1813 #ifdef __x86_64__
1814 /* movq $host_to_guest_gpr_switch,%rcx */
1815 io_emul_stub[0] = 0x48;
1816 io_emul_stub[1] = 0xb9;
1817 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1818 /* callq *%rcx */
1819 io_emul_stub[10] = 0xff;
1820 io_emul_stub[11] = 0xd1;
1821 #else
1822 /* call host_to_guest_gpr_switch */
1823 io_emul_stub[0] = 0xe8;
1824 *(s32 *)&io_emul_stub[1] =
1825 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1826 /* 7 x nop */
1827 memset(&io_emul_stub[5], 0x90, 7);
1828 #endif
1829 /* data16 or nop */
1830 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1831 /* <io-access opcode> */
1832 io_emul_stub[13] = opcode;
1833 /* imm8 or nop */
1834 io_emul_stub[14] = 0x90;
1835 /* ret (jumps to guest_to_host_gpr_switch) */
1836 io_emul_stub[15] = 0xc3;
1838 /* Handy function-typed pointer to the stub. */
1839 io_emul = (void *)io_emul_stub;
1841 if ( ioemul_handle_quirk )
1842 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1844 /* I/O Port and Interrupt Flag instructions. */
1845 switch ( opcode )
1847 case 0xe4: /* IN imm8,%al */
1848 op_bytes = 1;
1849 case 0xe5: /* IN imm8,%eax */
1850 port = insn_fetch(u8, code_base, eip, code_limit);
1851 io_emul_stub[14] = port; /* imm8 */
1852 exec_in:
1853 if ( !guest_io_okay(port, op_bytes, v, regs) )
1854 goto fail;
1855 if ( admin_io_okay(port, op_bytes, v, regs) )
1857 io_emul(regs);
1859 else
1861 if ( op_bytes == 4 )
1862 regs->eax = 0;
1863 else
1864 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1865 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1867 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1868 goto done;
1870 case 0xec: /* IN %dx,%al */
1871 op_bytes = 1;
1872 case 0xed: /* IN %dx,%eax */
1873 port = (u16)regs->edx;
1874 goto exec_in;
1876 case 0xe6: /* OUT %al,imm8 */
1877 op_bytes = 1;
1878 case 0xe7: /* OUT %eax,imm8 */
1879 port = insn_fetch(u8, code_base, eip, code_limit);
1880 io_emul_stub[14] = port; /* imm8 */
1881 exec_out:
1882 if ( !guest_io_okay(port, op_bytes, v, regs) )
1883 goto fail;
1884 if ( admin_io_okay(port, op_bytes, v, regs) )
1886 io_emul(regs);
1887 if ( (op_bytes == 1) && pv_post_outb_hook )
1888 pv_post_outb_hook(port, regs->eax);
1890 else
1892 guest_io_write(port, op_bytes, regs->eax, v, regs);
1894 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1895 goto done;
1897 case 0xee: /* OUT %al,%dx */
1898 op_bytes = 1;
1899 case 0xef: /* OUT %eax,%dx */
1900 port = (u16)regs->edx;
1901 goto exec_out;
1903 case 0xfa: /* CLI */
1904 case 0xfb: /* STI */
1905 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1906 goto fail;
1907 /*
1908 * This is just too dangerous to allow, in my opinion. Consider if the
1909 * caller then tries to reenable interrupts using POPF: we can't trap
1910 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1911 * do for us. :-)
1912 */
1913 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1914 goto done;
1917 /* No decode of this single-byte opcode. */
1918 goto fail;
1920 twobyte_opcode:
1921 /* Two-byte opcodes only emulated from guest kernel. */
1922 if ( !guest_kernel_mode(v, regs) )
1923 goto fail;
1925 /* Privileged (ring 0) instructions. */
1926 opcode = insn_fetch(u8, code_base, eip, code_limit);
1927 if ( lock && (opcode & ~3) != 0x20 )
1928 goto fail;
1929 switch ( opcode )
1931 case 0x06: /* CLTS */
1932 (void)do_fpu_taskswitch(0);
1933 break;
1935 case 0x09: /* WBINVD */
1936 /* Ignore the instruction if unprivileged. */
1937 if ( !cache_flush_permitted(v->domain) )
1938 /* Non-physdev domain attempted WBINVD; ignore for now since
1939 newer linux uses this in some start-of-day timing loops */
1941 else
1942 wbinvd();
1943 break;
1945 case 0x20: /* MOV CR?,<reg> */
1946 opcode = insn_fetch(u8, code_base, eip, code_limit);
1947 if ( opcode < 0xc0 )
1948 goto fail;
1949 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1950 modrm_rm |= (opcode >> 0) & 7;
1951 reg = decode_register(modrm_rm, regs, 0);
1952 switch ( modrm_reg )
1954 case 0: /* Read CR0 */
1955 *reg = (read_cr0() & ~X86_CR0_TS) |
1956 v->arch.guest_context.ctrlreg[0];
1957 break;
1959 case 2: /* Read CR2 */
1960 *reg = v->arch.guest_context.ctrlreg[2];
1961 break;
1963 case 3: /* Read CR3 */
1964 if ( !is_pv_32on64_vcpu(v) )
1965 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1966 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1967 #ifdef CONFIG_COMPAT
1968 else
1969 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1970 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1971 #endif
1972 break;
1974 case 4: /* Read CR4 */
1975 /*
1976 * Guests can read CR4 to see what features Xen has enabled. We
1977 * therefore lie about PGE & PSE as they are unavailable to guests.
1978 */
1979 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1980 break;
1982 default:
1983 goto fail;
1985 break;
1987 case 0x21: /* MOV DR?,<reg> */
1988 opcode = insn_fetch(u8, code_base, eip, code_limit);
1989 if ( opcode < 0xc0 )
1990 goto fail;
1991 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1992 modrm_rm |= (opcode >> 0) & 7;
1993 reg = decode_register(modrm_rm, regs, 0);
1994 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1995 goto fail;
1996 *reg = res;
1997 break;
1999 case 0x22: /* MOV <reg>,CR? */
2000 opcode = insn_fetch(u8, code_base, eip, code_limit);
2001 if ( opcode < 0xc0 )
2002 goto fail;
2003 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2004 modrm_rm |= (opcode >> 0) & 7;
2005 reg = decode_register(modrm_rm, regs, 0);
2006 switch ( modrm_reg )
2008 case 0: /* Write CR0 */
2009 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2011 gdprintk(XENLOG_WARNING,
2012 "Attempt to change unmodifiable CR0 flags.\n");
2013 goto fail;
2015 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2016 break;
2018 case 2: /* Write CR2 */
2019 v->arch.guest_context.ctrlreg[2] = *reg;
2020 arch_set_cr2(v, *reg);
2021 break;
2023 case 3: /* Write CR3 */
2024 domain_lock(v->domain);
2025 if ( !is_pv_32on64_vcpu(v) )
2026 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2027 #ifdef CONFIG_COMPAT
2028 else
2029 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2030 #endif
2031 domain_unlock(v->domain);
2032 if ( rc == 0 ) /* not okay */
2033 goto fail;
2034 break;
2036 case 4: /* Write CR4 */
2037 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2038 write_cr4(pv_guest_cr4_to_real_cr4(
2039 v->arch.guest_context.ctrlreg[4]));
2040 break;
2042 default:
2043 goto fail;
2045 break;
2047 case 0x23: /* MOV <reg>,DR? */
2048 opcode = insn_fetch(u8, code_base, eip, code_limit);
2049 if ( opcode < 0xc0 )
2050 goto fail;
2051 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2052 modrm_rm |= (opcode >> 0) & 7;
2053 reg = decode_register(modrm_rm, regs, 0);
2054 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2055 goto fail;
2056 break;
2058 case 0x30: /* WRMSR */
2059 eax = regs->eax;
2060 edx = regs->edx;
2061 res = ((u64)edx << 32) | eax;
2062 switch ( (u32)regs->ecx )
2064 #ifdef CONFIG_X86_64
2065 case MSR_FS_BASE:
2066 if ( is_pv_32on64_vcpu(v) )
2067 goto fail;
2068 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2069 goto fail;
2070 v->arch.guest_context.fs_base = res;
2071 break;
2072 case MSR_GS_BASE:
2073 if ( is_pv_32on64_vcpu(v) )
2074 goto fail;
2075 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2076 goto fail;
2077 v->arch.guest_context.gs_base_kernel = res;
2078 break;
2079 case MSR_SHADOW_GS_BASE:
2080 if ( is_pv_32on64_vcpu(v) )
2081 goto fail;
2082 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2083 goto fail;
2084 v->arch.guest_context.gs_base_user = res;
2085 break;
2086 #endif
2087 case MSR_K7_FID_VID_STATUS:
2088 case MSR_K7_FID_VID_CTL:
2089 case MSR_K8_PSTATE_LIMIT:
2090 case MSR_K8_PSTATE_CTRL:
2091 case MSR_K8_PSTATE_STATUS:
2092 case MSR_K8_PSTATE0:
2093 case MSR_K8_PSTATE1:
2094 case MSR_K8_PSTATE2:
2095 case MSR_K8_PSTATE3:
2096 case MSR_K8_PSTATE4:
2097 case MSR_K8_PSTATE5:
2098 case MSR_K8_PSTATE6:
2099 case MSR_K8_PSTATE7:
2100 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2101 goto fail;
2102 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2103 break;
2104 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2105 goto fail;
2106 break;
2107 case MSR_IA32_PERF_CTL:
2108 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2109 goto fail;
2110 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2111 break;
2112 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2113 goto fail;
2114 break;
2115 default:
2116 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2117 break;
2118 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2119 (eax != l) || (edx != h) )
2120 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2121 "%08x:%08x to %08x:%08x.\n",
2122 _p(regs->ecx), h, l, edx, eax);
2123 break;
2125 break;
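/*
 * WRMSR policy, roughly: a handful of MSRs are handled explicitly above,
 * writes to the Xen-defined MSR range go through wrmsr_hypervisor_regs(),
 * and any other write is silently dropped -- a warning is only logged if
 * the guest actually tried to change the current value.
 */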
2127 case 0x31: /* RDTSC */
2128 rdtsc(regs->eax, regs->edx);
2129 break;
2131 case 0x32: /* RDMSR */
2132 switch ( (u32)regs->ecx )
2134 #ifdef CONFIG_X86_64
2135 case MSR_FS_BASE:
2136 if ( is_pv_32on64_vcpu(v) )
2137 goto fail;
2138 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2139 regs->edx = v->arch.guest_context.fs_base >> 32;
2140 break;
2141 case MSR_GS_BASE:
2142 if ( is_pv_32on64_vcpu(v) )
2143 goto fail;
2144 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2145 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2146 break;
2147 case MSR_SHADOW_GS_BASE:
2148 if ( is_pv_32on64_vcpu(v) )
2149 goto fail;
2150 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2151 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2152 break;
2153 #endif
2154 case MSR_K7_FID_VID_CTL:
2155 case MSR_K7_FID_VID_STATUS:
2156 case MSR_K8_PSTATE_LIMIT:
2157 case MSR_K8_PSTATE_CTRL:
2158 case MSR_K8_PSTATE_STATUS:
2159 case MSR_K8_PSTATE0:
2160 case MSR_K8_PSTATE1:
2161 case MSR_K8_PSTATE2:
2162 case MSR_K8_PSTATE3:
2163 case MSR_K8_PSTATE4:
2164 case MSR_K8_PSTATE5:
2165 case MSR_K8_PSTATE6:
2166 case MSR_K8_PSTATE7:
2167 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2168 goto fail;
2169 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2171 regs->eax = regs->edx = 0;
2172 break;
2174 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2175 goto fail;
2176 break;
2177 case MSR_EFER:
2178 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2179 goto fail;
2180 break;
2181 case MSR_IA32_MISC_ENABLE:
2182 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2183 goto fail;
2184 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2185 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2186 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2187 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2188 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2189 break;
2190 default:
2191 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2193 regs->eax = l;
2194 regs->edx = h;
2195 break;
2197 /* Everyone can read the MSR space. */
2198 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2199 _p(regs->ecx));*/
2200 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2201 goto fail;
2202 break;
2204 break;
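/*
 * RDMSR policy, roughly: FS/GS bases are served from the saved guest
 * context, the K7/K8 P-state values and IA32_MISC_ENABLE are filtered, and
 * everything else falls through to rdmsr_safe(), i.e. reads of the MSR
 * space are generally permitted.
 */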
2206 default:
2207 goto fail;
2210 #undef wr_ad
2211 #undef rd_ad
2213 done:
2214 instruction_done(regs, eip, bpmatch);
2215 skip:
2216 return EXCRET_fault_fixed;
2218 fail:
2219 return 0;
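/*
 * Return convention for emulate_privileged_op(): EXCRET_fault_fixed tells
 * do_general_protection() below that the faulting instruction was emulated
 * (or deliberately skipped), while 0 means the #GP should be forwarded to
 * the guest as-is.
 */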
2222 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2223 unsigned int esp, unsigned int decr)
2225 return (((esp - decr) < (esp - 1)) &&
2226 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
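/*
 * check_stack_limit(): the bytes about to be pushed span [esp-decr, esp-1].
 * The first clause rejects the case where 'esp - decr' wraps below zero.
 * For a normal (expand-up) stack segment the highest written byte, esp-1,
 * must lie within the limit; for an expand-down segment (_SEGMENT_EC) the
 * lowest written byte, esp-decr, must lie strictly above the limit.
 */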
2229 static void emulate_gate_op(struct cpu_user_regs *regs)
2231 #ifdef __x86_64__
2232 struct vcpu *v = current;
2233 unsigned int sel, ar, dpl, nparm, opnd_sel;
2234 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2235 unsigned long off, eip, opnd_off, base, limit;
2236 int jump;
2238 /* Check whether this fault is due to the use of a call gate. */
2239 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2240 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2241 ((ar & _SEGMENT_TYPE) != 0xc00) )
2243 do_guest_trap(TRAP_gp_fault, regs, 1);
2244 return;
2246 if ( !(ar & _SEGMENT_P) )
2248 do_guest_trap(TRAP_no_segment, regs, 1);
2249 return;
2251 dpl = (ar >> 13) & 3;
2252 nparm = ar & 0x1f;
2254 /*
2255 * Decode instruction (and perhaps operand) to determine RPL,
2256 * whether this is a jump or a call, and the call return offset.
2257 */
2258 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2259 !(ar & _SEGMENT_S) ||
2260 !(ar & _SEGMENT_P) ||
2261 !(ar & _SEGMENT_CODE) )
2263 do_guest_trap(TRAP_gp_fault, regs, 1);
2264 return;
2267 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2268 ad_default = ad_bytes = op_default;
2269 opnd_sel = opnd_off = 0;
2270 jump = -1;
2271 for ( eip = regs->eip; eip - regs->_eip < 10; )
2273 switch ( insn_fetch(u8, base, eip, limit) )
2275 case 0x66: /* operand-size override */
2276 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2277 continue;
2278 case 0x67: /* address-size override */
2279 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2280 continue;
2281 case 0x2e: /* CS override */
2282 opnd_sel = regs->cs;
2283 ASSERT(opnd_sel);
2284 continue;
2285 case 0x3e: /* DS override */
2286 opnd_sel = read_sreg(regs, ds);
2287 if ( !opnd_sel )
2288 opnd_sel = dpl;
2289 continue;
2290 case 0x26: /* ES override */
2291 opnd_sel = read_sreg(regs, es);
2292 if ( !opnd_sel )
2293 opnd_sel = dpl;
2294 continue;
2295 case 0x64: /* FS override */
2296 opnd_sel = read_sreg(regs, fs);
2297 if ( !opnd_sel )
2298 opnd_sel = dpl;
2299 continue;
2300 case 0x65: /* GS override */
2301 opnd_sel = read_sreg(regs, gs);
2302 if ( !opnd_sel )
2303 opnd_sel = dpl;
2304 continue;
2305 case 0x36: /* SS override */
2306 opnd_sel = regs->ss;
2307 if ( !opnd_sel )
2308 opnd_sel = dpl;
2309 continue;
2310 case 0xea:
2311 ++jump;
2312 /* FALLTHROUGH */
2313 case 0x9a:
2314 ++jump;
2315 opnd_sel = regs->cs;
2316 opnd_off = eip;
2317 ad_bytes = ad_default;
2318 eip += op_bytes + 2;
2319 break;
2320 case 0xff:
2322 unsigned int modrm;
2324 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2326 case 0x28: case 0x68: case 0xa8:
2327 ++jump;
2328 /* FALLTHROUGH */
2329 case 0x18: case 0x58: case 0x98:
2330 ++jump;
2331 if ( ad_bytes != 2 )
2333 if ( (modrm & 7) == 4 )
2335 unsigned int sib;
2336 sib = insn_fetch(u8, base, eip, limit);
2338 modrm = (modrm & ~7) | (sib & 7);
2339 if ( (sib >>= 3) != 4 )
2340 opnd_off = *(unsigned long *)
2341 decode_register(sib & 7, regs, 0);
2342 opnd_off <<= sib >> 3;
2344 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2345 opnd_off += *(unsigned long *)
2346 decode_register(modrm & 7, regs, 0);
2347 else
2348 modrm |= 0x87;
2349 if ( !opnd_sel )
2351 switch ( modrm & 7 )
2353 default:
2354 opnd_sel = read_sreg(regs, ds);
2355 break;
2356 case 4: case 5:
2357 opnd_sel = regs->ss;
2358 break;
2362 else
2364 switch ( modrm & 7 )
2366 case 0: case 1: case 7:
2367 opnd_off = regs->ebx;
2368 break;
2369 case 6:
2370 if ( !(modrm & 0xc0) )
2371 modrm |= 0x80;
2372 else
2373 case 2: case 3:
2375 opnd_off = regs->ebp;
2376 if ( !opnd_sel )
2377 opnd_sel = regs->ss;
2379 break;
2381 if ( !opnd_sel )
2382 opnd_sel = read_sreg(regs, ds);
2383 switch ( modrm & 7 )
2385 case 0: case 2: case 4:
2386 opnd_off += regs->esi;
2387 break;
2388 case 1: case 3: case 5:
2389 opnd_off += regs->edi;
2390 break;
2393 switch ( modrm & 0xc0 )
2395 case 0x40:
2396 opnd_off += insn_fetch(s8, base, eip, limit);
2397 break;
2398 case 0x80:
2399 opnd_off += insn_fetch(s32, base, eip, limit);
2400 break;
2402 if ( ad_bytes == 4 )
2403 opnd_off = (unsigned int)opnd_off;
2404 else if ( ad_bytes == 2 )
2405 opnd_off = (unsigned short)opnd_off;
2406 break;
2409 break;
2411 break;
2414 if ( jump < 0 )
2416 fail:
2417 do_guest_trap(TRAP_gp_fault, regs, 1);
2418 skip:
2419 return;
2422 if ( (opnd_sel != regs->cs &&
2423 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2424 !(ar & _SEGMENT_S) ||
2425 !(ar & _SEGMENT_P) ||
2426 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2428 do_guest_trap(TRAP_gp_fault, regs, 1);
2429 return;
2432 opnd_off += op_bytes;
2433 #define ad_default ad_bytes
2434 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2435 #undef ad_default
2436 ASSERT((opnd_sel & ~3) == regs->error_code);
2437 if ( dpl < (opnd_sel & 3) )
2439 do_guest_trap(TRAP_gp_fault, regs, 1);
2440 return;
2443 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2444 !(ar & _SEGMENT_S) ||
2445 !(ar & _SEGMENT_CODE) ||
2446 (!jump || (ar & _SEGMENT_EC) ?
2447 ((ar >> 13) & 3) > (regs->cs & 3) :
2448 ((ar >> 13) & 3) != (regs->cs & 3)) )
2450 regs->error_code = sel;
2451 do_guest_trap(TRAP_gp_fault, regs, 1);
2452 return;
2454 if ( !(ar & _SEGMENT_P) )
2456 regs->error_code = sel;
2457 do_guest_trap(TRAP_no_segment, regs, 1);
2458 return;
2460 if ( off > limit )
2462 regs->error_code = 0;
2463 do_guest_trap(TRAP_gp_fault, regs, 1);
2464 return;
2467 if ( !jump )
2469 unsigned int ss, esp, *stkp;
2470 int rc;
2471 #define push(item) do \
2472 { \
2473 --stkp; \
2474 esp -= 4; \
2475 rc = __put_user(item, stkp); \
2476 if ( rc ) \
2477 { \
2478 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2479 PFEC_write_access); \
2480 return; \
2481 } \
2482 } while ( 0 )
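/*
 * push() emulates a 4-byte push onto the guest stack being built up: 'stkp'
 * and 'esp' both move down, and a faulting __put_user() is converted into a
 * guest page fault via propagate_page_fault() before bailing out.
 */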
2484 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2486 sel |= (ar >> 13) & 3;
2487 /* Inner stack known only for kernel ring. */
2488 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2490 do_guest_trap(TRAP_gp_fault, regs, 1);
2491 return;
2493 esp = v->arch.guest_context.kernel_sp;
2494 ss = v->arch.guest_context.kernel_ss;
2495 if ( (ss & 3) != (sel & 3) ||
2496 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2497 ((ar >> 13) & 3) != (sel & 3) ||
2498 !(ar & _SEGMENT_S) ||
2499 (ar & _SEGMENT_CODE) ||
2500 !(ar & _SEGMENT_WR) )
2502 regs->error_code = ss & ~3;
2503 do_guest_trap(TRAP_invalid_tss, regs, 1);
2504 return;
2506 if ( !(ar & _SEGMENT_P) ||
2507 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2509 regs->error_code = ss & ~3;
2510 do_guest_trap(TRAP_stack_error, regs, 1);
2511 return;
2513 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2514 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2516 do_guest_trap(TRAP_gp_fault, regs, 1);
2517 return;
2519 push(regs->ss);
2520 push(regs->esp);
2521 if ( nparm )
2523 const unsigned int *ustkp;
2525 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2526 ((ar >> 13) & 3) != (regs->cs & 3) ||
2527 !(ar & _SEGMENT_S) ||
2528 (ar & _SEGMENT_CODE) ||
2529 !(ar & _SEGMENT_WR) ||
2530 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2531 return do_guest_trap(TRAP_gp_fault, regs, 1);
2532 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2533 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2535 do_guest_trap(TRAP_gp_fault, regs, 1);
2536 return;
2538 do
2540 unsigned int parm;
2542 --ustkp;
2543 rc = __get_user(parm, ustkp);
2544 if ( rc )
2546 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2547 return;
2549 push(parm);
2550 } while ( --nparm );
2553 else
2555 sel |= (regs->cs & 3);
2556 esp = regs->esp;
2557 ss = regs->ss;
2558 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2559 ((ar >> 13) & 3) != (sel & 3) )
2561 do_guest_trap(TRAP_gp_fault, regs, 1);
2562 return;
2564 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2566 regs->error_code = 0;
2567 do_guest_trap(TRAP_stack_error, regs, 1);
2568 return;
2570 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2571 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2573 do_guest_trap(TRAP_gp_fault, regs, 1);
2574 return;
2577 push(regs->cs);
2578 push(eip);
2579 #undef push
2580 regs->esp = esp;
2581 regs->ss = ss;
2583 else
2584 sel |= (regs->cs & 3);
2586 regs->cs = sel;
2587 instruction_done(regs, off, 0);
2588 #endif
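/*
 * emulate_gate_op() as a whole: when a far CALL/JMP through a call gate
 * faults in a 32-on-64 PV guest, do_general_protection() below hands the
 * #GP (whose error code carries the gate selector) to this routine, which
 * decodes the instruction, validates the gate and its target code segment,
 * switches to the inner (kernel) stack and copies any parameters for a
 * call, and finally resumes execution at the gate's target.
 */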
2591 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2593 struct vcpu *v = current;
2594 unsigned long fixup;
2596 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2598 if ( regs->error_code & 1 )
2599 goto hardware_gp;
2601 if ( !guest_mode(regs) )
2602 goto gp_in_kernel;
2604 /*
2605 * Cunning trick to allow arbitrary "INT n" handling.
2607 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2608 * instruction from trapping to the appropriate vector, when that might not
2609 * be expected by Xen or the guest OS. For example, that entry might be for
2610 * a fault handler (unlike traps, faults don't increment EIP), or might
2611 * expect an error code on the stack (which a software trap never
2612 * provides), or might be a hardware interrupt handler that doesn't like
2613 * being called spuriously.
2615 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2616 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2617 * clear to indicate that it's a software fault, not hardware.
2619 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2620 * okay because they can only be triggered by an explicit DPL-checked
2621 * instruction. The DPL specified by the guest OS for these vectors is NOT
2622 * CHECKED!!
2623 */
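/*
 * Worked example of the encoding described above: a guest executing
 * "int $0x80" against a DPL-0 IDT entry arrives here with
 * error_code = (0x80 << 3) | 2 = 0x402 -- bit 1 set (IDT entry), bit 0
 * clear (software, not external), and the vector recovered below as
 * error_code >> 3.
 */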
2624 if ( (regs->error_code & 3) == 2 )
2626 /* This fault must be due to <INT n> instruction. */
2627 const struct trap_info *ti;
2628 unsigned char vector = regs->error_code >> 3;
2629 ti = &v->arch.guest_context.trap_ctxt[vector];
2630 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2632 regs->eip += 2;
2633 do_guest_trap(vector, regs, 0);
2634 return;
2637 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2639 emulate_gate_op(regs);
2640 return;
2643 /* Emulate some simple privileged and I/O instructions. */
2644 if ( (regs->error_code == 0) &&
2645 emulate_privileged_op(regs) )
2647 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2648 return;
2651 #if defined(__i386__)
2652 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2653 (regs->error_code == 0) &&
2654 gpf_emulate_4gb(regs) )
2656 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2657 return;
2659 #endif
2661 /* Pass on GPF as is. */
2662 do_guest_trap(TRAP_gp_fault, regs, 1);
2663 return;
2665 gp_in_kernel:
2667 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2669 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2670 regs->error_code, _p(regs->eip), _p(fixup));
2671 regs->eip = fixup;
2672 return;
2675 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2677 hardware_gp:
2678 show_execution_state(regs);
2679 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2682 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2684 static void nmi_mce_softirq(void)
2686 int cpu = smp_processor_id();
2687 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2688 cpumask_t affinity;
2690 BUG_ON(st == NULL);
2691 BUG_ON(st->vcpu == NULL);
2693 /* Set the tmp value unconditionally, so that
2694 * the check in the iret hypercall works. */
2695 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2697 if ((cpu != st->processor)
2698 || (st->processor != st->vcpu->processor))
2700 /* We are on a different physical cpu.
2701 * Make sure to wakeup the vcpu on the
2702 * specified processor.
2703 */
2704 cpus_clear(affinity);
2705 cpu_set(st->processor, affinity);
2706 vcpu_set_affinity(st->vcpu, &affinity);
2708 /* Affinity is restored in the iret hypercall. */
2711 /* Only used to defer wakeup of domain/vcpu to
2712 * a safe (non-NMI/MCE) context.
2713 */
2714 vcpu_kick(st->vcpu);
2717 static void nmi_dom0_report(unsigned int reason_idx)
2719 struct domain *d = dom0;
2721 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2722 return;
2724 set_bit(reason_idx, nmi_reason(d));
2726 send_guest_trap(d, 0, TRAP_nmi);
2729 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2731 switch ( opt_nmi[0] )
2733 case 'd': /* 'dom0' */
2734 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2735 case 'i': /* 'ignore' */
2736 break;
2737 default: /* 'fatal' */
2738 console_force_unlock();
2739 printk("\n\nNMI - MEMORY ERROR\n");
2740 fatal_trap(TRAP_nmi, regs);
2743 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2744 mdelay(1);
2745 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2748 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2750 switch ( opt_nmi[0] )
2752 case 'd': /* 'dom0' */
2753 nmi_dom0_report(_XEN_NMIREASON_io_error);
2754 case 'i': /* 'ignore' */
2755 break;
2756 default: /* 'fatal' */
2757 console_force_unlock();
2758 printk("\n\nNMI - I/O ERROR\n");
2759 fatal_trap(TRAP_nmi, regs);
2762 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2763 mdelay(1);
2764 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
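/*
 * Both handlers above poke the legacy NMI status/control port 0x61: on a
 * read, bits 7 and 6 report a memory parity/SERR or IOCHK error
 * respectively (these are the bits tested in do_nmi() below), while the
 * writes briefly mask and then re-enable the corresponding check to
 * acknowledge the condition.
 */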
2767 static void unknown_nmi_error(unsigned char reason)
2769 switch ( opt_nmi[0] )
2771 case 'd': /* 'dom0' */
2772 nmi_dom0_report(_XEN_NMIREASON_unknown);
2773 case 'i': /* 'ignore' */
2774 break;
2775 default: /* 'fatal' */
2776 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2777 printk("Dazed and confused, but trying to continue\n");
2778 printk("Do you have a strange power saving mode enabled?\n");
2779 kexec_crash();
2783 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2785 return 0;
2788 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2790 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2792 unsigned int cpu = smp_processor_id();
2793 unsigned char reason;
2795 ++nmi_count(cpu);
2797 if ( nmi_callback(regs, cpu) )
2798 return;
2800 if ( nmi_watchdog )
2801 nmi_watchdog_tick(regs);
2803 /* Only the BSP gets external NMIs from the system. */
2804 if ( cpu == 0 )
2806 reason = inb(0x61);
2807 if ( reason & 0x80 )
2808 mem_parity_error(regs);
2809 else if ( reason & 0x40 )
2810 io_check_error(regs);
2811 else if ( !nmi_watchdog )
2812 unknown_nmi_error((unsigned char)(reason&0xff));
2816 void set_nmi_callback(nmi_callback_t callback)
2818 nmi_callback = callback;
2821 void unset_nmi_callback(void)
2823 nmi_callback = dummy_nmi_callback;
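/*
 * A minimal sketch of how the hook above is meant to be used (the names
 * here are hypothetical): returning nonzero from the callback tells
 * do_nmi() that the NMI has been fully handled and stops any further
 * processing on that CPU.
 */
#if 0
static int example_nmi_cb(struct cpu_user_regs *regs, int cpu)
{
    /* Inspect 'regs'/'cpu' as needed; claim the NMI. */
    return 1;
}

static void example_install(void)
{
    set_nmi_callback(example_nmi_cb);
    /* ... window during which NMIs are intercepted ... */
    unset_nmi_callback();
}
#endif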
2826 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2828 struct vcpu *curr = current;
2830 BUG_ON(!guest_mode(regs));
2832 setup_fpu(curr);
2834 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2836 do_guest_trap(TRAP_no_device, regs, 0);
2837 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2839 else
2840 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2842 return;
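/*
 * Lazy FPU handling: Xen leaves CR0.TS set after a context switch so the
 * guest's first FPU use traps here and setup_fpu() restores its state. If
 * the guest had set its own (virtual) CR0.TS via the CR0 emulation above,
 * the #NM is additionally reflected to the guest so it can run its own
 * lazy-switch logic.
 */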
2845 asmlinkage void do_debug(struct cpu_user_regs *regs)
2847 struct vcpu *v = current;
2849 DEBUGGER_trap_entry(TRAP_debug, regs);
2851 if ( !guest_mode(regs) )
2853 if ( regs->eflags & EF_TF )
2855 #ifdef __x86_64__
2856 void sysenter_entry(void);
2857 void sysenter_eflags_saved(void);
2858 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2859 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2860 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2861 goto out;
2862 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2863 #else
2864 WARN_ON(1);
2865 #endif
2866 regs->eflags &= ~EF_TF;
2868 else
2870 /*
2871 * We ignore watchpoints when they trigger within Xen. This may
2872 * happen when a buffer is passed to us which previously had a
2873 * watchpoint set on it. No need to bump EIP; the only faulting
2874 * trap is an instruction breakpoint, which can't happen to us.
2875 */
2876 WARN_ON(!search_exception_table(regs->eip));
2878 goto out;
2881 /* Save debug status register where guest OS can peek at it */
2882 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2884 ler_enable();
2885 do_guest_trap(TRAP_debug, regs, 0);
2886 return;
2888 out:
2889 ler_enable();
2890 return;
2893 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2897 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2899 int i;
2900 /* Keep secondary tables in sync with IRQ updates. */
2901 for ( i = 1; i < NR_CPUS; i++ )
2902 if ( idt_tables[i] != NULL )
2903 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2904 _set_gate(&idt_table[n], 14, dpl, addr);
2907 static void set_swint_gate(unsigned int n, void *addr)
2909 __set_intr_gate(n, 3, addr);
2912 void set_intr_gate(unsigned int n, void *addr)
2914 __set_intr_gate(n, 0, addr);
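/*
 * Gate type 14 is an interrupt gate (interrupts are disabled on entry,
 * which the comment in trap_init() below relies on). set_swint_gate() uses
 * DPL 3 so that "int3"/"into" issued from any guest privilege level reach
 * their handlers directly, matching the note about vectors 3 and 4 in
 * do_general_protection() above.
 */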
2917 void set_tss_desc(unsigned int n, void *addr)
2919 _set_tssldt_desc(
2920 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2921 (unsigned long)addr,
2922 offsetof(struct tss_struct, __cacheline_filler) - 1,
2923 9);
2924 #ifdef CONFIG_COMPAT
2925 _set_tssldt_desc(
2926 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2927 (unsigned long)addr,
2928 offsetof(struct tss_struct, __cacheline_filler) - 1,
2929 11);
2930 #endif
2933 void __devinit percpu_traps_init(void)
2935 subarch_percpu_traps_init();
2937 if ( !opt_ler )
2938 return;
2940 switch ( boot_cpu_data.x86_vendor )
2942 case X86_VENDOR_INTEL:
2943 switch ( boot_cpu_data.x86 )
2945 case 6:
2946 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2947 break;
2948 case 15:
2949 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2950 break;
2952 break;
2953 case X86_VENDOR_AMD:
2954 switch ( boot_cpu_data.x86 )
2956 case 6:
2957 case 15:
2958 case 16:
2959 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2960 break;
2962 break;
2965 ler_enable();
2968 void __init trap_init(void)
2970 /*
2971 * Note that interrupt gates are always used, rather than trap gates. We
2972 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2973 * first activation must have the "bad" value(s) for these registers and
2974 * we may lose them if another activation is installed before they are
2975 * saved. The page-fault handler also needs interrupts disabled until %cr2
2976 * has been read and saved on the stack.
2977 */
2978 set_intr_gate(TRAP_divide_error,&divide_error);
2979 set_intr_gate(TRAP_debug,&debug);
2980 set_intr_gate(TRAP_nmi,&nmi);
2981 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
2982 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2983 set_intr_gate(TRAP_bounds,&bounds);
2984 set_intr_gate(TRAP_invalid_op,&invalid_op);
2985 set_intr_gate(TRAP_no_device,&device_not_available);
2986 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2987 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2988 set_intr_gate(TRAP_no_segment,&segment_not_present);
2989 set_intr_gate(TRAP_stack_error,&stack_segment);
2990 set_intr_gate(TRAP_gp_fault,&general_protection);
2991 set_intr_gate(TRAP_page_fault,&page_fault);
2992 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2993 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2994 set_intr_gate(TRAP_alignment_check,&alignment_check);
2995 set_intr_gate(TRAP_machine_check,&machine_check);
2996 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2998 /* CPU0 uses the master IDT. */
2999 idt_tables[0] = idt_table;
3001 percpu_traps_init();
3003 cpu_init();
3005 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3008 long register_guest_nmi_callback(unsigned long address)
3010 struct vcpu *v = current;
3011 struct domain *d = v->domain;
3012 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3014 t->vector = TRAP_nmi;
3015 t->flags = 0;
3016 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
3017 t->address = address;
3018 TI_SET_IF(t, 1);
3020 /*
3021 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3022 * now.
3023 */
3024 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3025 v->nmi_pending = 1;
3027 return 0;
3030 long unregister_guest_nmi_callback(void)
3032 struct vcpu *v = current;
3033 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3035 memset(t, 0, sizeof(*t));
3037 return 0;
3040 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3042 struct vcpu *v;
3043 struct softirq_trap *st;
3045 BUG_ON(d == NULL);
3046 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3047 v = d->vcpu[vcpuid];
3049 switch (trap_nr) {
3050 case TRAP_nmi:
3051 if ( !test_and_set_bool(v->nmi_pending) ) {
3052 st = &per_cpu(softirq_trap, smp_processor_id());
3053 st->domain = dom0;
3054 st->vcpu = dom0->vcpu[0];
3055 st->processor = st->vcpu->processor;
3057 /* not safe to wake up a vcpu here */
3058 raise_softirq(NMI_MCE_SOFTIRQ);
3059 return 0;
3061 break;
3064 /* delivery failed */
3065 return -EIO;
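/*
 * send_guest_trap() may be called from NMI context (see nmi_dom0_report()
 * above), where waking a vcpu directly is unsafe, so it only latches
 * nmi_pending, records the target in the per-cpu softirq_trap slot and
 * raises NMI_MCE_SOFTIRQ; nmi_mce_softirq() then performs the actual kick
 * from a normal context.
 */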
3069 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3071 struct trap_info cur;
3072 struct vcpu *curr = current;
3073 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3074 long rc = 0;
3076 /* If no table is presented then clear the entire virtual IDT. */
3077 if ( guest_handle_is_null(traps) )
3079 memset(dst, 0, 256 * sizeof(*dst));
3080 init_int80_direct_trap(curr);
3081 return 0;
3084 for ( ; ; )
3086 if ( hypercall_preempt_check() )
3088 rc = hypercall_create_continuation(
3089 __HYPERVISOR_set_trap_table, "h", traps);
3090 break;
3093 if ( copy_from_guest(&cur, traps, 1) )
3095 rc = -EFAULT;
3096 break;
3099 if ( cur.address == 0 )
3100 break;
3102 fixup_guest_code_selector(curr->domain, cur.cs);
3104 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3106 if ( cur.vector == 0x80 )
3107 init_int80_direct_trap(curr);
3109 guest_handle_add_offset(traps, 1);
3112 return rc;
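/*
 * A sketch of the guest side of this hypercall (the handler symbols and the
 * code selector below are hypothetical): the table is a list of trap_info
 * entries terminated by one with address == 0, and the low bits of 'flags'
 * give the DPL from which the vector may be raised with "int".
 */
#if 0
static const struct trap_info example_traps[] = {
    { .vector = 14,   .flags = 0, .cs = EXAMPLE_KERNEL_CS,
      .address = (unsigned long)example_page_fault_entry },
    { .vector = 0x80, .flags = 3, .cs = EXAMPLE_KERNEL_CS,
      .address = (unsigned long)example_syscall_entry },
    { .vector = 0,    .flags = 0, .cs = 0, .address = 0 }
};

static void example_register_traps(void)
{
    HYPERVISOR_set_trap_table(example_traps);
}
#endif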
3115 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3117 int i;
3118 struct vcpu *curr = current;
3120 switch ( reg )
3122 case 0:
3123 if ( !access_ok(value, sizeof(long)) )
3124 return -EPERM;
3125 if ( v == curr )
3126 write_debugreg(0, value);
3127 break;
3128 case 1:
3129 if ( !access_ok(value, sizeof(long)) )
3130 return -EPERM;
3131 if ( v == curr )
3132 write_debugreg(1, value);
3133 break;
3134 case 2:
3135 if ( !access_ok(value, sizeof(long)) )
3136 return -EPERM;
3137 if ( v == curr )
3138 write_debugreg(2, value);
3139 break;
3140 case 3:
3141 if ( !access_ok(value, sizeof(long)) )
3142 return -EPERM;
3143 if ( v == curr )
3144 write_debugreg(3, value);
3145 break;
3146 case 6:
3147 /*
3148 * DR6: Bits 4-11,16-31 reserved (set to 1).
3149 * Bit 12 reserved (set to 0).
3150 */
3151 value &= 0xffffefff; /* reserved bits => 0 */
3152 value |= 0xffff0ff0; /* reserved bits => 1 */
3153 if ( v == curr )
3154 write_debugreg(6, value);
3155 break;
3156 case 7:
3157 /*
3158 * DR7: Bit 10 reserved (set to 1).
3159 * Bits 11-12,14-15 reserved (set to 0).
3160 */
3161 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3162 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3163 /*
3164 * Privileged bits:
3165 * GD (bit 13): must be 0.
3166 */
3167 if ( value & DR_GENERAL_DETECT )
3168 return -EPERM;
3169 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3170 if ( value & DR7_ACTIVE_MASK )
3172 unsigned int io_enable = 0;
3174 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3176 if ( ((value >> i) & 3) == DR_IO )
3178 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3179 return -EPERM;
3180 io_enable |= value & (3 << ((i - 16) >> 1));
3182 #ifdef __i386__
3183 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3184 !boot_cpu_has(X86_FEATURE_LM)) &&
3185 (((value >> i) & 0xc) == DR_LEN_8) )
3186 return -EPERM;
3187 #endif
3190 /* Guest DR5 is a handy stash for I/O intercept information. */
3191 v->arch.guest_context.debugreg[5] = io_enable;
3192 value &= ~io_enable;
3194 /*
3195 * If DR7 was previously clear then we need to load all other
3196 * debug registers at this point as they were not restored during
3197 * context switch.
3198 */
3199 if ( (v == curr) &&
3200 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3202 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3203 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3204 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3205 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3206 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3209 if ( v == curr )
3210 write_debugreg(7, value);
3211 break;
3212 default:
3213 return -EINVAL;
3216 v->arch.guest_context.debugreg[reg] = value;
3217 return 0;
3220 long do_set_debugreg(int reg, unsigned long value)
3222 return set_debugreg(current, reg, value);
3225 unsigned long do_get_debugreg(int reg)
3227 struct vcpu *curr = current;
3229 switch ( reg )
3231 case 0 ... 3:
3232 case 6:
3233 return curr->arch.guest_context.debugreg[reg];
3234 case 7:
3235 return (curr->arch.guest_context.debugreg[7] |
3236 curr->arch.guest_context.debugreg[5]);
3237 case 4 ... 5:
3238 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3239 curr->arch.guest_context.debugreg[reg + 2] : 0);
3242 return -EINVAL;
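/*
 * The "4 ... 5" case and the DR7 read above tie in with set_debugreg():
 * the enable bits of I/O breakpoints (which are only permitted with CR4.DE
 * set) are stripped from the value loaded into the real DR7 and parked in
 * the otherwise-unused debugreg[5] slot, and the DR7 read merges them back
 * in so the guest still sees the value it wrote.
 */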
3245 /*
3246 * Local variables:
3247 * mode: C
3248 * c-set-style: "BSD"
3249 * c-basic-offset: 4
3250 * tab-width: 4
3251 * indent-tabs-mode: nil
3252 * End:
3253 */