ia64/xen-unstable

xen/arch/x86/traps.c @ 17576:0eb471aa24dc

Enable Px/Cx related CPUID/MSR bits for dom0 to get correct Px/Cx info.

Signed-off-by: Wei Gang <gang.wei@intel.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Mon May 05 10:16:06 2008 +0100 (2008-05-05)
parents  defbab4dba1a
children 777f294e3be8

line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
65 #include <public/arch-x86/cpuid.h>
67 /*
68 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
69 * fatal: Xen prints diagnostic message and then hangs.
70 * dom0: The NMI is virtualised to DOM0.
71 * ignore: The NMI error is cleared and ignored.
72 */
73 #ifdef NDEBUG
74 char opt_nmi[10] = "dom0";
75 #else
76 char opt_nmi[10] = "fatal";
77 #endif
78 string_param("nmi", opt_nmi);
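/*
 * Usage note (illustrative, not in the original file): booting Xen with
 * "nmi=ignore" on its command line selects the third behaviour described
 * above; the default is "dom0" in release builds and "fatal" in debug
 * builds, per the #ifdef NDEBUG above.
 */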
80 DEFINE_PER_CPU(u32, ler_msr);
82 /* Master table, used by CPU0. */
83 idt_entry_t idt_table[IDT_ENTRIES];
85 /* Pointer to the IDT of every CPU. */
86 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
88 #define DECLARE_TRAP_HANDLER(_name) \
89 asmlinkage void _name(void); \
90 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
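/*
 * Illustrative expansion (not part of the original source): for example,
 * DECLARE_TRAP_HANDLER(page_fault) produces roughly
 *
 *     asmlinkage void page_fault(void);
 *     asmlinkage void do_page_fault(struct cpu_user_regs *regs);
 *
 * i.e. one prototype for the assembly entry stub and one for the C handler
 * that the stub invokes with the saved register frame.
 */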
92 DECLARE_TRAP_HANDLER(divide_error);
93 DECLARE_TRAP_HANDLER(debug);
94 DECLARE_TRAP_HANDLER(nmi);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(machine_check);
109 DECLARE_TRAP_HANDLER(alignment_check);
110 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
112 long do_set_debugreg(int reg, unsigned long value);
113 unsigned long do_get_debugreg(int reg);
114 void (*ioemul_handle_quirk)(
115 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
117 static int debug_stack_lines = 20;
118 integer_param("debug_stack_lines", debug_stack_lines);
120 static int opt_ler;
121 boolean_param("ler", opt_ler);
123 #ifdef CONFIG_X86_32
124 #define stack_words_per_line 8
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
126 #else
127 #define stack_words_per_line 4
128 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
129 #endif
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 struct vcpu *curr = current;
135 unsigned long *stack, addr;
137 if ( is_hvm_vcpu(curr) )
138 return;
140 if ( is_pv_32on64_vcpu(curr) )
141 {
142 compat_show_guest_stack(regs, debug_stack_lines);
143 return;
144 }
146 if ( vm86_mode(regs) )
147 {
148 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
149 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
150 regs->ss, (uint16_t)(regs->esp & 0xffff));
151 }
152 else
153 {
154 stack = (unsigned long *)regs->esp;
155 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
156 }
158 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
159 {
160 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
161 break;
162 if ( get_user(addr, stack) )
163 {
164 if ( i != 0 )
165 printk("\n ");
166 printk("Fault while accessing guest memory.");
167 i = 1;
168 break;
169 }
170 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
171 printk("\n ");
172 printk(" %p", _p(addr));
173 stack++;
174 }
175 if ( i == 0 )
176 printk("Stack empty.");
177 printk("\n");
178 }
180 #if !defined(CONFIG_FRAME_POINTER)
182 static void show_trace(struct cpu_user_regs *regs)
183 {
184 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
186 printk("Xen call trace:\n ");
188 printk("[<%p>]", _p(regs->eip));
189 print_symbol(" %s\n ", regs->eip);
191 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
192 {
193 addr = *stack++;
194 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
195 {
196 printk("[<%p>]", _p(addr));
197 print_symbol(" %s\n ", addr);
198 }
199 }
201 printk("\n");
202 }
204 #else
206 static void show_trace(struct cpu_user_regs *regs)
207 {
208 unsigned long *frame, next, addr, low, high;
210 printk("Xen call trace:\n ");
212 printk("[<%p>]", _p(regs->eip));
213 print_symbol(" %s\n ", regs->eip);
215 /* Bounds for range of valid frame pointer. */
216 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
217 high = (low & ~(STACK_SIZE - 1)) +
218 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
220 /* The initial frame pointer. */
221 next = regs->ebp;
223 for ( ; ; )
224 {
225 /* Valid frame pointer? */
226 if ( (next < low) || (next >= high) )
227 {
228 /*
229 * Exception stack frames have a different layout, denoted by an
230 * inverted frame pointer.
231 */
232 next = ~next;
233 if ( (next < low) || (next >= high) )
234 break;
235 frame = (unsigned long *)next;
236 next = frame[0];
237 addr = frame[(offsetof(struct cpu_user_regs, eip) -
238 offsetof(struct cpu_user_regs, ebp))
239 / BYTES_PER_LONG];
240 }
241 else
242 {
243 /* Ordinary stack frame. */
244 frame = (unsigned long *)next;
245 next = frame[0];
246 addr = frame[1];
247 }
249 printk("[<%p>]", _p(addr));
250 print_symbol(" %s\n ", addr);
252 low = (unsigned long)&frame[2];
253 }
255 printk("\n");
256 }
258 #endif
260 void show_stack(struct cpu_user_regs *regs)
261 {
262 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
263 int i;
265 if ( guest_mode(regs) )
266 return show_guest_stack(regs);
268 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
270 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
271 {
272 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
273 break;
274 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
275 printk("\n ");
276 addr = *stack++;
277 printk(" %p", _p(addr));
278 }
279 if ( i == 0 )
280 printk("Stack empty.");
281 printk("\n");
283 show_trace(regs);
284 }
286 void show_stack_overflow(unsigned int cpu, unsigned long esp)
287 {
288 #ifdef MEMORY_GUARD
289 unsigned long esp_top, esp_bottom;
290 unsigned long *stack, addr;
292 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
293 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
295 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
296 (void *)esp_top, (void *)esp_bottom, (void *)esp,
297 (void *)init_tss[cpu].esp0);
299 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
300 if ( ((unsigned long)(esp - esp_top) > 512) &&
301 ((unsigned long)(esp_top - esp) > 512) )
302 {
303 printk("No stack overflow detected. Skipping stack trace.\n");
304 return;
305 }
307 if ( esp < esp_top )
308 esp = esp_top;
310 printk("Xen stack overflow (dumping trace %p-%p):\n ",
311 (void *)esp, (void *)esp_bottom);
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 char *trapstr(int trapnr)
335 {
336 static char *strings[] = {
337 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
338 "invalid opcode", "device not available", "double fault",
339 "coprocessor segment", "invalid tss", "segment not found",
340 "stack error", "general protection fault", "page fault",
341 "spurious interrupt", "coprocessor error", "alignment check",
342 "machine check", "simd error"
343 };
345 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
346 return "???";
348 return strings[trapnr];
349 }
351 /*
352 * This is called for faults at very unexpected times (e.g., when interrupts
353 * are disabled). In such situations we can't do much that is safe. We try to
354 * print out some tracing and then we just spin.
355 */
356 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
357 {
358 static DEFINE_PER_CPU(char, depth);
360 /*
361 * In some cases, we can end up in a vicious cycle of fatal_trap()s
362 * within fatal_trap()s. We give the problem a couple of iterations to
363 * bottom out, and then we just panic.
364 */
365 if ( ++this_cpu(depth) < 3 )
366 {
367 watchdog_disable();
368 console_start_sync();
370 show_execution_state(regs);
372 if ( trapnr == TRAP_page_fault )
373 {
374 unsigned long cr2 = read_cr2();
375 printk("Faulting linear address: %p\n", _p(cr2));
376 show_page_walk(cr2);
377 }
378 }
380 panic("FATAL TRAP: vector = %d (%s)\n"
381 "[error_code=%04x] %s\n",
382 trapnr, trapstr(trapnr), regs->error_code,
383 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
384 }
386 static void do_guest_trap(
387 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
388 {
389 struct vcpu *v = current;
390 struct trap_bounce *tb;
391 const struct trap_info *ti;
393 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
395 tb = &v->arch.trap_bounce;
396 ti = &v->arch.guest_context.trap_ctxt[trapnr];
398 tb->flags = TBF_EXCEPTION;
399 tb->cs = ti->cs;
400 tb->eip = ti->address;
402 if ( use_error_code )
403 {
404 tb->flags |= TBF_EXCEPTION_ERRCODE;
405 tb->error_code = regs->error_code;
406 }
408 if ( TI_GET_IF(ti) )
409 tb->flags |= TBF_INTERRUPT;
411 if ( unlikely(null_trap_bounce(v, tb)) )
412 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
413 "on VCPU %d [ec=%04x]\n",
414 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
415 }
417 static void instruction_done(
418 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
419 {
420 regs->eip = eip;
421 regs->eflags &= ~X86_EFLAGS_RF;
422 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
423 {
424 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
425 if ( regs->eflags & X86_EFLAGS_TF )
426 current->arch.guest_context.debugreg[6] |= 0x4000;
427 do_guest_trap(TRAP_debug, regs, 0);
428 }
429 }
431 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
432 unsigned int port, unsigned int len)
433 {
434 unsigned int width, i, match = 0;
435 unsigned long start;
437 if ( !(v->arch.guest_context.debugreg[5]) ||
438 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
439 return 0;
441 for ( i = 0; i < 4; i++ )
442 {
443 if ( !(v->arch.guest_context.debugreg[5] &
444 (3 << (i * DR_ENABLE_SIZE))) )
445 continue;
447 start = v->arch.guest_context.debugreg[i];
448 width = 0;
450 switch ( (v->arch.guest_context.debugreg[7] >>
451 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
452 {
453 case DR_LEN_1: width = 1; break;
454 case DR_LEN_2: width = 2; break;
455 case DR_LEN_4: width = 4; break;
456 case DR_LEN_8: width = 8; break;
457 }
459 if ( (start < (port + len)) && ((start + width) > port) )
460 match |= 1 << i;
461 }
463 return match;
464 }
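/*
 * Worked example (illustrative only): with DR0 = 0x60, the corresponding DR7
 * slot programmed for a 2-byte I/O breakpoint, the enable shadowed in
 * debugreg[5] and CR4.DE set, a 2-byte access starting at port 0x5f or 0x60
 * overlaps the watched range [0x60, 0x62) and sets bit 0 of the returned
 * match mask, while an access starting at port 0x62 does not overlap and
 * yields 0.
 */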
466 /*
467 * Called from asm to set up the NMI trapbounce info.
468 * Returns 0 if no callback is set up, else 1.
469 */
470 asmlinkage int set_guest_nmi_trapbounce(void)
471 {
472 struct vcpu *v = current;
473 struct trap_bounce *tb = &v->arch.trap_bounce;
474 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
475 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
476 return !null_trap_bounce(v, tb);
477 }
479 static inline void do_trap(
480 int trapnr, struct cpu_user_regs *regs, int use_error_code)
481 {
482 struct vcpu *curr = current;
483 unsigned long fixup;
485 DEBUGGER_trap_entry(trapnr, regs);
487 if ( guest_mode(regs) )
488 {
489 do_guest_trap(trapnr, regs, use_error_code);
490 return;
491 }
493 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
494 {
495 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
496 trapnr, _p(regs->eip), _p(fixup));
497 regs->eip = fixup;
498 return;
499 }
501 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
502 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
503 {
504 curr->arch.hvm_vcpu.fpu_exception_callback(
505 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
506 return;
507 }
509 DEBUGGER_trap_fatal(trapnr, regs);
511 show_execution_state(regs);
512 panic("FATAL TRAP: vector = %d (%s)\n"
513 "[error_code=%04x]\n",
514 trapnr, trapstr(trapnr), regs->error_code);
515 }
517 #define DO_ERROR_NOCODE(trapnr, name) \
518 asmlinkage void do_##name(struct cpu_user_regs *regs) \
519 { \
520 do_trap(trapnr, regs, 0); \
521 }
523 #define DO_ERROR(trapnr, name) \
524 asmlinkage void do_##name(struct cpu_user_regs *regs) \
525 { \
526 do_trap(trapnr, regs, 1); \
527 }
529 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
530 DO_ERROR_NOCODE(TRAP_overflow, overflow)
531 DO_ERROR_NOCODE(TRAP_bounds, bounds)
532 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
533 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
534 DO_ERROR( TRAP_no_segment, segment_not_present)
535 DO_ERROR( TRAP_stack_error, stack_segment)
536 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
537 DO_ERROR( TRAP_alignment_check, alignment_check)
538 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
540 int rdmsr_hypervisor_regs(
541 uint32_t idx, uint32_t *eax, uint32_t *edx)
542 {
543 idx -= 0x40000000;
544 if ( idx > 0 )
545 return 0;
547 switch ( idx )
548 {
549 case 0:
550 {
551 *eax = *edx = 0;
552 break;
553 }
554 default:
555 BUG();
556 }
558 return 1;
559 }
561 int wrmsr_hypervisor_regs(
562 uint32_t idx, uint32_t eax, uint32_t edx)
563 {
564 struct domain *d = current->domain;
566 idx -= 0x40000000;
567 if ( idx > 0 )
568 return 0;
570 switch ( idx )
571 {
572 case 0:
573 {
574 void *hypercall_page;
575 unsigned long mfn;
576 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
577 unsigned int idx = eax & 0xfff;
579 if ( idx > 0 )
580 {
581 gdprintk(XENLOG_WARNING,
582 "Out of range index %u to MSR %08x\n",
583 idx, 0x40000000);
584 return 0;
585 }
587 mfn = gmfn_to_mfn(d, gmfn);
589 if ( !mfn_valid(mfn) ||
590 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
591 {
592 gdprintk(XENLOG_WARNING,
593 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
594 gmfn, mfn, 0x40000000);
595 return 0;
596 }
598 hypercall_page = map_domain_page(mfn);
599 hypercall_page_initialise(d, hypercall_page);
600 unmap_domain_page(hypercall_page);
602 put_page_and_type(mfn_to_page(mfn));
603 break;
604 }
606 default:
607 BUG();
608 }
610 return 1;
611 }
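/*
 * Guest-side sketch (not part of this file): a PV guest typically discovers
 * the MSR base via CPUID leaf 0x40000002 (see cpuid_hypervisor_leaves below)
 * and then writes the guest-physical address of a page-aligned page to be
 * turned into the hypercall page, e.g.
 *
 *     wrmsrl(0x40000000, __pa(hypercall_page));
 *
 * A page-aligned address has its low 12 bits clear, so this lands in the
 * idx == 0 case handled above.
 */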
613 int cpuid_hypervisor_leaves(
614 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
615 {
616 idx -= 0x40000000;
617 if ( idx > 2 )
618 return 0;
620 switch ( idx )
621 {
622 case 0:
623 *eax = 0x40000002; /* Largest leaf */
624 *ebx = XEN_CPUID_SIGNATURE_EBX;
625 *ecx = XEN_CPUID_SIGNATURE_ECX;
626 *edx = XEN_CPUID_SIGNATURE_EDX;
627 break;
629 case 1:
630 *eax = (xen_major_version() << 16) | xen_minor_version();
631 *ebx = 0; /* Reserved */
632 *ecx = 0; /* Reserved */
633 *edx = 0; /* Reserved */
634 break;
636 case 2:
637 *eax = 1; /* Number of hypercall-transfer pages */
638 *ebx = 0x40000000; /* MSR base address */
639 *ecx = 0; /* Features 1 */
640 *edx = 0; /* Features 2 */
641 if ( !is_hvm_vcpu(current) )
642 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
643 break;
645 default:
646 BUG();
647 }
649 return 1;
650 }
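/*
 * Guest-side sketch (illustrative): the hypervisor leaves above are reached
 * with CPUID at 0x40000000-0x40000002, e.g.
 *
 *     uint32_t eax, ebx, ecx, edx;
 *     cpuid(0x40000001, &eax, &ebx, &ecx, &edx);
 *     printk("Xen version %u.%u\n", eax >> 16, eax & 0xffff);
 *
 * PV guests reach this code via the forced-emulation prefix handled by
 * emulate_forced_invalid_op() below, since raw CPUID is not trapped for them.
 */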
652 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
653 {
654 char sig[5], instr[2];
655 uint32_t a, b, c, d;
656 unsigned long eip, rc;
658 a = regs->eax;
659 b = regs->ebx;
660 c = regs->ecx;
661 d = regs->edx;
662 eip = regs->eip;
664 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
665 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
666 {
667 propagate_page_fault(eip + sizeof(sig) - rc, 0);
668 return EXCRET_fault_fixed;
669 }
670 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
671 return 0;
672 eip += sizeof(sig);
674 /* We only emulate CPUID. */
675 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
676 {
677 propagate_page_fault(eip + sizeof(instr) - rc, 0);
678 return EXCRET_fault_fixed;
679 }
680 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
681 return 0;
682 eip += sizeof(instr);
684 asm (
685 "cpuid"
686 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
687 : "0" (a), "1" (b), "2" (c), "3" (d) );
689 if ( (regs->eax & 0x7fffffff) == 1 )
690 {
691 /* Modify Feature Information. */
692 __clear_bit(X86_FEATURE_VME, &d);
693 __clear_bit(X86_FEATURE_PSE, &d);
694 __clear_bit(X86_FEATURE_PGE, &d);
695 __clear_bit(X86_FEATURE_MCE, &d);
696 __clear_bit(X86_FEATURE_MCA, &d);
697 if ( !IS_PRIV(current->domain) )
698 __clear_bit(X86_FEATURE_MTRR, &d);
699 __clear_bit(X86_FEATURE_PSE36, &d);
700 }
701 switch ( (uint32_t)regs->eax )
702 {
703 case 1:
704 /* Modify Feature Information. */
705 if ( !cpu_has_sep )
706 __clear_bit(X86_FEATURE_SEP, &d);
707 #ifdef __i386__
708 if ( !supervisor_mode_kernel )
709 __clear_bit(X86_FEATURE_SEP, &d);
710 #endif
711 __clear_bit(X86_FEATURE_DS, &d);
712 __clear_bit(X86_FEATURE_ACC, &d);
713 __clear_bit(X86_FEATURE_PBE, &d);
715 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
716 if ( !IS_PRIV(current->domain) )
717 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
718 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
719 __clear_bit(X86_FEATURE_VMXE % 32, &c);
720 __clear_bit(X86_FEATURE_SMXE % 32, &c);
721 if ( !IS_PRIV(current->domain) )
722 __clear_bit(X86_FEATURE_EST % 32, &c);
723 __clear_bit(X86_FEATURE_TM2 % 32, &c);
724 if ( is_pv_32bit_vcpu(current) )
725 __clear_bit(X86_FEATURE_CX16 % 32, &c);
726 __clear_bit(X86_FEATURE_XTPR % 32, &c);
727 __clear_bit(X86_FEATURE_PDCM % 32, &c);
728 __clear_bit(X86_FEATURE_DCA % 32, &c);
729 break;
730 case 0x80000001:
731 /* Modify Feature Information. */
732 if ( is_pv_32bit_vcpu(current) )
733 {
734 __clear_bit(X86_FEATURE_LM % 32, &d);
735 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
736 }
737 #ifndef __i386__
738 if ( is_pv_32on64_vcpu(current) &&
739 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
740 #endif
741 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
742 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
743 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
745 __clear_bit(X86_FEATURE_SVME % 32, &c);
746 __clear_bit(X86_FEATURE_OSVW % 32, &c);
747 __clear_bit(X86_FEATURE_IBS % 32, &c);
748 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
749 __clear_bit(X86_FEATURE_WDT % 32, &c);
750 break;
751 case 5: /* MONITOR/MWAIT */
752 case 0xa: /* Architectural Performance Monitor Features */
753 case 0x8000000a: /* SVM revision and features */
754 case 0x8000001b: /* Instruction Based Sampling */
755 a = b = c = d = 0;
756 break;
757 default:
758 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
759 break;
760 }
762 regs->eax = a;
763 regs->ebx = b;
764 regs->ecx = c;
765 regs->edx = d;
767 instruction_done(regs, eip, 0);
769 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
771 return EXCRET_fault_fixed;
772 }
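/*
 * For reference (not in the original source): the signature checked above is
 * what a PV guest emits when it wants Xen to filter CPUID for it, roughly
 *
 *     asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
 *                    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 *                    : "0" (leaf) );
 *
 * The ud2a raises #UD, do_invalid_op() sees guest mode and calls
 * emulate_forced_invalid_op(), which recognises "\x0f\x0bxen", executes CPUID
 * itself, and applies the feature masking above before returning.
 */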
774 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
775 {
776 struct bug_frame bug;
777 struct bug_frame_str bug_str;
778 char *filename, *predicate, *eip = (char *)regs->eip;
779 unsigned long fixup;
780 int id, lineno;
782 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
784 if ( likely(guest_mode(regs)) )
785 {
786 if ( !emulate_forced_invalid_op(regs) )
787 do_guest_trap(TRAP_invalid_op, regs, 0);
788 return;
789 }
791 if ( !is_kernel(eip) ||
792 __copy_from_user(&bug, eip, sizeof(bug)) ||
793 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
794 (bug.ret != 0xc2) )
795 goto die;
796 eip += sizeof(bug);
798 id = bug.id & 3;
800 if ( id == BUGFRAME_dump )
801 {
802 show_execution_state(regs);
803 regs->eip = (unsigned long)eip;
804 return;
805 }
807 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
808 if ( !is_kernel(eip) ||
809 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
810 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
811 goto die;
812 eip += sizeof(bug_str);
814 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
815 lineno = bug.id >> 2;
817 if ( id == BUGFRAME_warn )
818 {
819 printk("Xen WARN at %.50s:%d\n", filename, lineno);
820 show_execution_state(regs);
821 regs->eip = (unsigned long)eip;
822 return;
823 }
825 if ( id == BUGFRAME_bug )
826 {
827 printk("Xen BUG at %.50s:%d\n", filename, lineno);
828 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
829 show_execution_state(regs);
830 panic("Xen BUG at %.50s:%d\n", filename, lineno);
831 }
833 /* ASSERT: decode the predicate string pointer. */
834 ASSERT(id == BUGFRAME_assert);
835 if ( !is_kernel(eip) ||
836 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
837 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
838 goto die;
839 eip += sizeof(bug_str);
841 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
842 printk("Assertion '%s' failed at %.50s:%d\n",
843 predicate, filename, lineno);
844 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
845 show_execution_state(regs);
846 panic("Assertion '%s' failed at %.50s:%d\n",
847 predicate, filename, lineno);
849 die:
850 if ( (fixup = search_exception_table(regs->eip)) != 0 )
851 {
852 regs->eip = fixup;
853 return;
854 }
855 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
856 show_execution_state(regs);
857 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
858 }
860 asmlinkage void do_int3(struct cpu_user_regs *regs)
861 {
862 DEBUGGER_trap_entry(TRAP_int3, regs);
864 if ( !guest_mode(regs) )
865 {
866 debugger_trap_fatal(TRAP_int3, regs);
867 return;
868 }
870 do_guest_trap(TRAP_int3, regs, 0);
871 }
873 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
874 {
875 extern fastcall void (*machine_check_vector)(
876 struct cpu_user_regs *, long error_code);
877 machine_check_vector(regs, regs->error_code);
878 }
880 static void reserved_bit_page_fault(
881 unsigned long addr, struct cpu_user_regs *regs)
882 {
883 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
884 current->domain->domain_id, current->vcpu_id, regs->error_code);
885 show_page_walk(addr);
886 show_execution_state(regs);
887 }
889 void propagate_page_fault(unsigned long addr, u16 error_code)
890 {
891 struct trap_info *ti;
892 struct vcpu *v = current;
893 struct trap_bounce *tb = &v->arch.trap_bounce;
895 v->arch.guest_context.ctrlreg[2] = addr;
896 arch_set_cr2(v, addr);
898 /* Re-set error_code.user flag appropriately for the guest. */
899 error_code &= ~PFEC_user_mode;
900 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
901 error_code |= PFEC_user_mode;
903 trace_pv_page_fault(addr, error_code);
905 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
906 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
907 tb->error_code = error_code;
908 tb->cs = ti->cs;
909 tb->eip = ti->address;
910 if ( TI_GET_IF(ti) )
911 tb->flags |= TBF_INTERRUPT;
912 if ( unlikely(null_trap_bounce(v, tb)) )
913 {
914 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
915 v->domain->domain_id, v->vcpu_id, error_code);
916 show_page_walk(addr);
917 }
919 if ( unlikely(error_code & PFEC_reserved_bit) )
920 reserved_bit_page_fault(addr, guest_cpu_user_regs());
921 }
923 static int handle_gdt_ldt_mapping_fault(
924 unsigned long offset, struct cpu_user_regs *regs)
925 {
926 struct vcpu *curr = current;
927 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
928 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
929 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
931 /* Should never fault in another vcpu's area. */
932 BUG_ON(vcpu_area != curr->vcpu_id);
934 /* Byte offset within the gdt/ldt sub-area. */
935 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
937 if ( likely(is_ldt_area) )
938 {
939 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
940 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
941 {
942 if ( guest_mode(regs) )
943 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
944 regs->eip, offset);
945 }
946 else
947 {
948 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
949 if ( !guest_mode(regs) )
950 return 0;
951 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
952 propagate_page_fault(
953 curr->arch.guest_context.ldt_base + offset,
954 regs->error_code);
955 }
956 }
957 else
958 {
959 /* GDT fault: handle the fault as #GP(selector). */
960 regs->error_code = (u16)offset & ~7;
961 (void)do_general_protection(regs);
962 }
964 return EXCRET_fault_fixed;
965 }
967 #ifdef HYPERVISOR_VIRT_END
968 #define IN_HYPERVISOR_RANGE(va) \
969 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
970 #else
971 #define IN_HYPERVISOR_RANGE(va) \
972 (((va) >= HYPERVISOR_VIRT_START))
973 #endif
975 static int __spurious_page_fault(
976 unsigned long addr, struct cpu_user_regs *regs)
977 {
978 unsigned long mfn, cr3 = read_cr3();
979 #if CONFIG_PAGING_LEVELS >= 4
980 l4_pgentry_t l4e, *l4t;
981 #endif
982 #if CONFIG_PAGING_LEVELS >= 3
983 l3_pgentry_t l3e, *l3t;
984 #endif
985 l2_pgentry_t l2e, *l2t;
986 l1_pgentry_t l1e, *l1t;
987 unsigned int required_flags, disallowed_flags;
989 /*
990 * We do not take spurious page faults in IRQ handlers as we do not
991 * modify page tables in IRQ context. We therefore bail here because
992 * map_domain_page() is not IRQ-safe.
993 */
994 if ( in_irq() )
995 return 0;
997 /* Reserved bit violations are never spurious faults. */
998 if ( regs->error_code & PFEC_reserved_bit )
999 return 0;
1001 required_flags = _PAGE_PRESENT;
1002 if ( regs->error_code & PFEC_write_access )
1003 required_flags |= _PAGE_RW;
1004 if ( regs->error_code & PFEC_user_mode )
1005 required_flags |= _PAGE_USER;
1007 disallowed_flags = 0;
1008 if ( regs->error_code & PFEC_insn_fetch )
1009 disallowed_flags |= _PAGE_NX;
1011 mfn = cr3 >> PAGE_SHIFT;
1013 #if CONFIG_PAGING_LEVELS >= 4
1014 l4t = map_domain_page(mfn);
1015 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1016 mfn = l4e_get_pfn(l4e);
1017 unmap_domain_page(l4t);
1018 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1019 (l4e_get_flags(l4e) & disallowed_flags) )
1020 return 0;
1021 #endif
1023 #if CONFIG_PAGING_LEVELS >= 3
1024 l3t = map_domain_page(mfn);
1025 #ifdef CONFIG_X86_PAE
1026 l3t += (cr3 & 0xFE0UL) >> 3;
1027 #endif
1028 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1029 mfn = l3e_get_pfn(l3e);
1030 unmap_domain_page(l3t);
1031 #ifdef CONFIG_X86_PAE
1032 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1033 return 0;
1034 #else
1035 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1036 (l3e_get_flags(l3e) & disallowed_flags) )
1037 return 0;
1038 #endif
1039 #endif
1041 l2t = map_domain_page(mfn);
1042 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1043 mfn = l2e_get_pfn(l2e);
1044 unmap_domain_page(l2t);
1045 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1046 (l2e_get_flags(l2e) & disallowed_flags) )
1047 return 0;
1048 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1049 {
1050 l1e = l1e_empty(); /* define before use in debug tracing */
1051 goto spurious;
1052 }
1054 l1t = map_domain_page(mfn);
1055 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1056 mfn = l1e_get_pfn(l1e);
1057 unmap_domain_page(l1t);
1058 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1059 (l1e_get_flags(l1e) & disallowed_flags) )
1060 return 0;
1062 spurious:
1063 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1064 "at addr %lx, e/c %04x\n",
1065 current->domain->domain_id, current->vcpu_id,
1066 addr, regs->error_code);
1067 #if CONFIG_PAGING_LEVELS >= 4
1068 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1069 #endif
1070 #if CONFIG_PAGING_LEVELS >= 3
1071 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1072 #endif
1073 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1074 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1075 #ifndef NDEBUG
1076 show_registers(regs);
1077 #endif
1078 return 1;
1079 }
1081 static int spurious_page_fault(
1082 unsigned long addr, struct cpu_user_regs *regs)
1083 {
1084 unsigned long flags;
1085 int is_spurious;
1087 /*
1088 * Disabling interrupts prevents TLB flushing, and hence prevents
1089 * page tables from becoming invalid under our feet during the walk.
1090 */
1091 local_irq_save(flags);
1092 is_spurious = __spurious_page_fault(addr, regs);
1093 local_irq_restore(flags);
1095 return is_spurious;
1096 }
1098 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1099 {
1100 struct vcpu *v = current;
1101 struct domain *d = v->domain;
1103 /* No fixups in interrupt context or when interrupts are disabled. */
1104 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1105 return 0;
1107 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1108 {
1109 if ( paging_mode_external(d) && guest_mode(regs) )
1110 {
1111 int ret = paging_fault(addr, regs);
1112 if ( ret == EXCRET_fault_fixed )
1113 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1114 return ret;
1115 }
1116 if ( !(regs->error_code & PFEC_reserved_bit) &&
1117 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1118 return handle_gdt_ldt_mapping_fault(
1119 addr - GDT_LDT_VIRT_START, regs);
1120 return 0;
1121 }
1123 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1124 guest_kernel_mode(v, regs) &&
1125 /* Do not check if access-protection fault since the page may
1126 legitimately be not present in shadow page tables */
1127 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1128 PFEC_write_access) &&
1129 ptwr_do_page_fault(v, addr, regs) )
1130 return EXCRET_fault_fixed;
1132 if ( paging_mode_enabled(d) )
1133 {
1134 int ret = paging_fault(addr, regs);
1135 if ( ret == EXCRET_fault_fixed )
1136 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1137 return ret;
1138 }
1140 return 0;
1141 }
1143 /*
1144 * #PF error code:
1145 * Bit 0: Protection violation (=1) ; Page not present (=0)
1146 * Bit 1: Write access
1147 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1148 * Bit 3: Reserved bit violation
1149 * Bit 4: Instruction fetch
1150 */
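/*
 * Example (illustrative): an error code of 0x0003 therefore means a write to
 * a present page from supervisor mode, while 0x0004 means a read of a
 * not-present page from user mode.
 */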
1151 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1152 {
1153 unsigned long addr, fixup;
1155 addr = read_cr2();
1157 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1159 perfc_incr(page_faults);
1161 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1162 return;
1164 if ( unlikely(!guest_mode(regs)) )
1165 {
1166 if ( spurious_page_fault(addr, regs) )
1167 return;
1169 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1170 {
1171 perfc_incr(copy_user_faults);
1172 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1173 reserved_bit_page_fault(addr, regs);
1174 regs->eip = fixup;
1175 return;
1176 }
1178 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1180 show_execution_state(regs);
1181 show_page_walk(addr);
1182 panic("FATAL PAGE FAULT\n"
1183 "[error_code=%04x]\n"
1184 "Faulting linear address: %p\n",
1185 regs->error_code, _p(addr));
1186 }
1188 propagate_page_fault(addr, regs->error_code);
1189 }
1191 /*
1192 * Early #PF handler to print CR2, error code, and stack.
1194 * We also deal with spurious faults here, even though they should never happen
1195 * during early boot (an issue was seen once, but was most likely a hardware
1196 * problem).
1197 */
1198 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1199 {
1200 static int stuck;
1201 static unsigned long prev_eip, prev_cr2;
1202 unsigned long cr2 = read_cr2();
1204 BUG_ON(smp_processor_id() != 0);
1206 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1207 {
1208 prev_eip = regs->eip;
1209 prev_cr2 = cr2;
1210 stuck = 0;
1211 return;
1212 }
1214 if ( stuck++ == 1000 )
1215 {
1216 unsigned long *stk = (unsigned long *)regs;
1217 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1218 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1219 printk("Stack dump: ");
1220 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1221 printk("%p ", _p(*stk++));
1222 for ( ; ; ) ;
1223 }
1224 }
1226 long do_fpu_taskswitch(int set)
1227 {
1228 struct vcpu *v = current;
1230 if ( set )
1231 {
1232 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1233 stts();
1234 }
1235 else
1236 {
1237 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1238 if ( v->fpu_dirtied )
1239 clts();
1240 }
1242 return 0;
1243 }
1245 static int read_descriptor(unsigned int sel,
1246 const struct vcpu *v,
1247 const struct cpu_user_regs * regs,
1248 unsigned long *base,
1249 unsigned long *limit,
1250 unsigned int *ar,
1251 unsigned int vm86attr)
1253 struct desc_struct desc;
1255 if ( !vm86_mode(regs) )
1257 if ( sel < 4)
1258 desc.b = desc.a = 0;
1259 else if ( __get_user(desc,
1260 (const struct desc_struct *)(!(sel & 4)
1261 ? GDT_VIRT_START(v)
1262 : LDT_VIRT_START(v))
1263 + (sel >> 3)) )
1264 return 0;
1265 if ( !(vm86attr & _SEGMENT_CODE) )
1266 desc.b &= ~_SEGMENT_L;
1268 else
1270 desc.a = (sel << 20) | 0xffff;
1271 desc.b = vm86attr | (sel >> 12);
1274 *ar = desc.b & 0x00f0ff00;
1275 if ( !(desc.b & _SEGMENT_L) )
1277 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1278 (desc.b & 0xff000000));
1279 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1280 if ( desc.b & _SEGMENT_G )
1281 *limit = ((*limit + 1) << 12) - 1;
1282 #ifndef NDEBUG
1283 if ( !vm86_mode(regs) && (sel > 3) )
1285 unsigned int a, l;
1286 unsigned char valid;
1288 asm volatile (
1289 "larl %2,%0 ; setz %1"
1290 : "=r" (a), "=rm" (valid) : "rm" (sel));
1291 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1292 asm volatile (
1293 "lsll %2,%0 ; setz %1"
1294 : "=r" (l), "=rm" (valid) : "rm" (sel));
1295 BUG_ON(valid && (l != *limit));
1297 #endif
1299 else
1301 *base = 0UL;
1302 *limit = ~0UL;
1305 return 1;
1308 #ifdef __x86_64__
1309 static int read_gate_descriptor(unsigned int gate_sel,
1310 const struct vcpu *v,
1311 unsigned int *sel,
1312 unsigned long *off,
1313 unsigned int *ar)
1315 struct desc_struct desc;
1316 const struct desc_struct *pdesc;
1319 pdesc = (const struct desc_struct *)
1320 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1321 + (gate_sel >> 3);
1322 if ( (gate_sel < 4) ||
1323 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1324 __get_user(desc, pdesc) )
1325 return 0;
1327 *sel = (desc.a >> 16) & 0x0000fffc;
1328 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1329 *ar = desc.b & 0x0000ffff;
1331 /*
1332 * check_descriptor() clears the DPL field and stores the
1333 * guest requested DPL in the selector's RPL field.
1334 */
1335 if ( *ar & _SEGMENT_DPL )
1336 return 0;
1337 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1339 if ( !is_pv_32bit_vcpu(v) )
1341 if ( (*ar & 0x1f00) != 0x0c00 ||
1342 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1343 __get_user(desc, pdesc + 1) ||
1344 (desc.b & 0x1f00) )
1345 return 0;
1347 *off |= (unsigned long)desc.a << 32;
1348 return 1;
1351 switch ( *ar & 0x1f00 )
1353 case 0x0400:
1354 *off &= 0xffff;
1355 break;
1356 case 0x0c00:
1357 break;
1358 default:
1359 return 0;
1362 return 1;
1364 #endif
1366 /* Has the guest requested sufficient permission for this I/O access? */
1367 static int guest_io_okay(
1368 unsigned int port, unsigned int bytes,
1369 struct vcpu *v, struct cpu_user_regs *regs)
1371 #if defined(__x86_64__)
1372 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1373 int user_mode = !(v->arch.flags & TF_kernel_mode);
1374 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1375 #elif defined(__i386__)
1376 #define TOGGLE_MODE() ((void)0)
1377 #endif
1379 if ( !vm86_mode(regs) &&
1380 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1381 return 1;
1383 if ( v->arch.iobmp_limit > (port + bytes) )
1385 union { uint8_t bytes[2]; uint16_t mask; } x;
1387 /*
1388 * Grab permission bytes from guest space. Inaccessible bytes are
1389 * read as 0xff (no access allowed).
1390 */
1391 TOGGLE_MODE();
1392 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1393 port>>3, 2) )
1395 default: x.bytes[0] = ~0;
1396 case 1: x.bytes[1] = ~0;
1397 case 0: break;
1399 TOGGLE_MODE();
1401 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1402 return 1;
1405 return 0;
1408 /* Has the administrator granted sufficient permission for this I/O access? */
1409 static int admin_io_okay(
1410 unsigned int port, unsigned int bytes,
1411 struct vcpu *v, struct cpu_user_regs *regs)
1412 {
1413 /*
1414 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1415 * We never permit direct access to that register.
1416 */
1417 if ( (port == 0xcf8) && (bytes == 4) )
1418 return 0;
1420 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1421 }
1423 static uint32_t guest_io_read(
1424 unsigned int port, unsigned int bytes,
1425 struct vcpu *v, struct cpu_user_regs *regs)
1427 extern uint32_t pci_conf_read(
1428 uint32_t cf8, uint8_t offset, uint8_t bytes);
1430 uint32_t data = 0;
1431 unsigned int shift = 0;
1433 if ( admin_io_okay(port, bytes, v, regs) )
1435 switch ( bytes )
1437 case 1: return inb(port);
1438 case 2: return inw(port);
1439 case 4: return inl(port);
1443 while ( bytes != 0 )
1445 unsigned int size = 1;
1446 uint32_t sub_data = 0xff;
1448 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1450 sub_data = pv_pit_handler(port, 0, 0);
1452 else if ( (port == 0xcf8) && (bytes == 4) )
1454 size = 4;
1455 sub_data = v->domain->arch.pci_cf8;
1457 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1459 size = min(bytes, 4 - (port & 3));
1460 if ( size == 3 )
1461 size = 2;
1462 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1465 if ( size == 4 )
1466 return sub_data;
1468 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1469 shift += size * 8;
1470 port += size;
1471 bytes -= size;
1474 return data;
1477 static void guest_io_write(
1478 unsigned int port, unsigned int bytes, uint32_t data,
1479 struct vcpu *v, struct cpu_user_regs *regs)
1481 extern void pci_conf_write(
1482 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1484 if ( admin_io_okay(port, bytes, v, regs) )
1486 switch ( bytes ) {
1487 case 1:
1488 outb((uint8_t)data, port);
1489 if ( pv_post_outb_hook )
1490 pv_post_outb_hook(port, (uint8_t)data);
1491 break;
1492 case 2:
1493 outw((uint16_t)data, port);
1494 break;
1495 case 4:
1496 outl(data, port);
1497 break;
1499 return;
1502 while ( bytes != 0 )
1504 unsigned int size = 1;
1506 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1508 pv_pit_handler(port, (uint8_t)data, 1);
1510 else if ( (port == 0xcf8) && (bytes == 4) )
1512 size = 4;
1513 v->domain->arch.pci_cf8 = data;
1515 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1517 size = min(bytes, 4 - (port & 3));
1518 if ( size == 3 )
1519 size = 2;
1520 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1523 if ( size == 4 )
1524 return;
1526 port += size;
1527 bytes -= size;
1528 data >>= size * 8;
1532 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1533 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1534 __attribute__((__regparm__(1)));
1535 unsigned long guest_to_host_gpr_switch(unsigned long)
1536 __attribute__((__regparm__(1)));
1538 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1540 /* Instruction fetch with error handling. */
1541 #define insn_fetch(type, base, eip, limit) \
1542 ({ unsigned long _rc, _ptr = (base) + (eip); \
1543 type _x; \
1544 if ( ad_default < 8 ) \
1545 _ptr = (unsigned int)_ptr; \
1546 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1547 goto fail; \
1548 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1549 { \
1550 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1551 goto skip; \
1552 } \
1553 (eip) += sizeof(_x); _x; })
1555 #if defined(CONFIG_X86_32)
1556 # define read_sreg(regs, sr) ((regs)->sr)
1557 #elif defined(CONFIG_X86_64)
1558 # define read_sreg(regs, sr) read_segment_register(sr)
1559 #endif
1561 static int emulate_privileged_op(struct cpu_user_regs *regs)
1563 struct vcpu *v = current;
1564 unsigned long *reg, eip = regs->eip, res;
1565 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1566 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1567 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1568 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1569 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1570 ? regs->reg \
1571 : ad_bytes == 4 \
1572 ? (u32)regs->reg \
1573 : (u16)regs->reg)
1574 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1575 ? regs->reg = (val) \
1576 : ad_bytes == 4 \
1577 ? (*(u32 *)&regs->reg = (val)) \
1578 : (*(u16 *)&regs->reg = (val)))
1579 unsigned long code_base, code_limit;
1580 char io_emul_stub[32];
1581 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1582 u32 l, h, eax, edx;
1584 if ( !read_descriptor(regs->cs, v, regs,
1585 &code_base, &code_limit, &ar,
1586 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1587 goto fail;
1588 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1589 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1590 if ( !(ar & _SEGMENT_S) ||
1591 !(ar & _SEGMENT_P) ||
1592 !(ar & _SEGMENT_CODE) )
1593 goto fail;
1595 /* emulating only opcodes not allowing SS to be default */
1596 data_sel = read_sreg(regs, ds);
1598 /* Legacy prefixes. */
1599 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1601 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1603 case 0x66: /* operand-size override */
1604 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1605 continue;
1606 case 0x67: /* address-size override */
1607 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1608 continue;
1609 case 0x2e: /* CS override */
1610 data_sel = regs->cs;
1611 continue;
1612 case 0x3e: /* DS override */
1613 data_sel = read_sreg(regs, ds);
1614 continue;
1615 case 0x26: /* ES override */
1616 data_sel = read_sreg(regs, es);
1617 continue;
1618 case 0x64: /* FS override */
1619 data_sel = read_sreg(regs, fs);
1620 lm_ovr = lm_seg_fs;
1621 continue;
1622 case 0x65: /* GS override */
1623 data_sel = read_sreg(regs, gs);
1624 lm_ovr = lm_seg_gs;
1625 continue;
1626 case 0x36: /* SS override */
1627 data_sel = regs->ss;
1628 continue;
1629 case 0xf0: /* LOCK */
1630 lock = 1;
1631 continue;
1632 case 0xf2: /* REPNE/REPNZ */
1633 case 0xf3: /* REP/REPE/REPZ */
1634 rep_prefix = 1;
1635 continue;
1636 default:
1637 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1639 rex = opcode;
1640 continue;
1642 break;
1644 break;
1647 /* REX prefix. */
1648 if ( rex & 8 ) /* REX.W */
1649 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1650 modrm_reg = (rex & 4) << 1; /* REX.R */
1651 /* REX.X does not need to be decoded. */
1652 modrm_rm = (rex & 1) << 3; /* REX.B */
1654 if ( opcode == 0x0f )
1655 goto twobyte_opcode;
1657 if ( lock )
1658 goto fail;
1660 /* Input/Output String instructions. */
1661 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1663 unsigned long data_base, data_limit;
1665 if ( rep_prefix && (rd_ad(ecx) == 0) )
1666 goto done;
1668 if ( !(opcode & 2) )
1670 data_sel = read_sreg(regs, es);
1671 lm_ovr = lm_seg_none;
1674 if ( !(ar & _SEGMENT_L) )
1676 if ( !read_descriptor(data_sel, v, regs,
1677 &data_base, &data_limit, &ar,
1678 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1679 _SEGMENT_P) )
1680 goto fail;
1681 if ( !(ar & _SEGMENT_S) ||
1682 !(ar & _SEGMENT_P) ||
1683 (opcode & 2 ?
1684 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1685 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1686 goto fail;
1688 #ifdef CONFIG_X86_64
1689 else
1691 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1693 switch ( lm_ovr )
1695 case lm_seg_none:
1696 data_base = 0UL;
1697 break;
1698 case lm_seg_fs:
1699 data_base = v->arch.guest_context.fs_base;
1700 break;
1701 case lm_seg_gs:
1702 if ( guest_kernel_mode(v, regs) )
1703 data_base = v->arch.guest_context.gs_base_kernel;
1704 else
1705 data_base = v->arch.guest_context.gs_base_user;
1706 break;
1709 else
1710 read_descriptor(data_sel, v, regs,
1711 &data_base, &data_limit, &ar,
1712 0);
1713 data_limit = ~0UL;
1714 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1716 #endif
1718 port = (u16)regs->edx;
1720 continue_io_string:
1721 switch ( opcode )
1723 case 0x6c: /* INSB */
1724 op_bytes = 1;
1725 case 0x6d: /* INSW/INSL */
1726 if ( (data_limit < (op_bytes - 1)) ||
1727 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1728 !guest_io_okay(port, op_bytes, v, regs) )
1729 goto fail;
1730 data = guest_io_read(port, op_bytes, v, regs);
1731 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1732 &data, op_bytes)) != 0 )
1734 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1735 PFEC_write_access);
1736 return EXCRET_fault_fixed;
1738 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1739 ? -op_bytes : op_bytes));
1740 break;
1742 case 0x6e: /* OUTSB */
1743 op_bytes = 1;
1744 case 0x6f: /* OUTSW/OUTSL */
1745 if ( (data_limit < (op_bytes - 1)) ||
1746 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1747 !guest_io_okay(port, op_bytes, v, regs) )
1748 goto fail;
1749 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1750 op_bytes)) != 0 )
1752 propagate_page_fault(data_base + rd_ad(esi)
1753 + op_bytes - rc, 0);
1754 return EXCRET_fault_fixed;
1756 guest_io_write(port, op_bytes, data, v, regs);
1757 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1758 ? -op_bytes : op_bytes));
1759 break;
1762 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1764 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1766 if ( !bpmatch && !hypercall_preempt_check() )
1767 goto continue_io_string;
1768 eip = regs->eip;
1771 goto done;
1774 /*
1775 * Very likely to be an I/O instruction (IN/OUT).
1776 * Build an on-stack stub to execute the instruction with full guest
1777 * GPR context. This is needed for some systems which (ab)use IN/OUT
1778 * to communicate with BIOS code in system-management mode.
1779 */
1780 #ifdef __x86_64__
1781 /* movq $host_to_guest_gpr_switch,%rcx */
1782 io_emul_stub[0] = 0x48;
1783 io_emul_stub[1] = 0xb9;
1784 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1785 /* callq *%rcx */
1786 io_emul_stub[10] = 0xff;
1787 io_emul_stub[11] = 0xd1;
1788 #else
1789 /* call host_to_guest_gpr_switch */
1790 io_emul_stub[0] = 0xe8;
1791 *(s32 *)&io_emul_stub[1] =
1792 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1793 /* 7 x nop */
1794 memset(&io_emul_stub[5], 0x90, 7);
1795 #endif
1796 /* data16 or nop */
1797 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1798 /* <io-access opcode> */
1799 io_emul_stub[13] = opcode;
1800 /* imm8 or nop */
1801 io_emul_stub[14] = 0x90;
1802 /* ret (jumps to guest_to_host_gpr_switch) */
1803 io_emul_stub[15] = 0xc3;
1805 /* Handy function-typed pointer to the stub. */
1806 io_emul = (void *)io_emul_stub;
1808 if ( ioemul_handle_quirk )
1809 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1811 /* I/O Port and Interrupt Flag instructions. */
1812 switch ( opcode )
1814 case 0xe4: /* IN imm8,%al */
1815 op_bytes = 1;
1816 case 0xe5: /* IN imm8,%eax */
1817 port = insn_fetch(u8, code_base, eip, code_limit);
1818 io_emul_stub[14] = port; /* imm8 */
1819 exec_in:
1820 if ( !guest_io_okay(port, op_bytes, v, regs) )
1821 goto fail;
1822 if ( admin_io_okay(port, op_bytes, v, regs) )
1824 io_emul(regs);
1826 else
1828 if ( op_bytes == 4 )
1829 regs->eax = 0;
1830 else
1831 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1832 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1834 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1835 goto done;
1837 case 0xec: /* IN %dx,%al */
1838 op_bytes = 1;
1839 case 0xed: /* IN %dx,%eax */
1840 port = (u16)regs->edx;
1841 goto exec_in;
1843 case 0xe6: /* OUT %al,imm8 */
1844 op_bytes = 1;
1845 case 0xe7: /* OUT %eax,imm8 */
1846 port = insn_fetch(u8, code_base, eip, code_limit);
1847 io_emul_stub[14] = port; /* imm8 */
1848 exec_out:
1849 if ( !guest_io_okay(port, op_bytes, v, regs) )
1850 goto fail;
1851 if ( admin_io_okay(port, op_bytes, v, regs) )
1853 io_emul(regs);
1854 if ( (op_bytes == 1) && pv_post_outb_hook )
1855 pv_post_outb_hook(port, regs->eax);
1857 else
1859 guest_io_write(port, op_bytes, regs->eax, v, regs);
1861 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1862 goto done;
1864 case 0xee: /* OUT %al,%dx */
1865 op_bytes = 1;
1866 case 0xef: /* OUT %eax,%dx */
1867 port = (u16)regs->edx;
1868 goto exec_out;
1870 case 0xfa: /* CLI */
1871 case 0xfb: /* STI */
1872 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1873 goto fail;
1874 /*
1875 * This is just too dangerous to allow, in my opinion. Consider if the
1876 * caller then tries to reenable interrupts using POPF: we can't trap
1877 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1878 * do for us. :-)
1879 */
1880 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1881 goto done;
1884 /* No decode of this single-byte opcode. */
1885 goto fail;
1887 twobyte_opcode:
1888 /* Two-byte opcodes only emulated from guest kernel. */
1889 if ( !guest_kernel_mode(v, regs) )
1890 goto fail;
1892 /* Privileged (ring 0) instructions. */
1893 opcode = insn_fetch(u8, code_base, eip, code_limit);
1894 if ( lock && (opcode & ~3) != 0x20 )
1895 goto fail;
1896 switch ( opcode )
1898 case 0x06: /* CLTS */
1899 (void)do_fpu_taskswitch(0);
1900 break;
1902 case 0x09: /* WBINVD */
1903 /* Ignore the instruction if unprivileged. */
1904 if ( !cache_flush_permitted(v->domain) )
1905 /* Non-physdev domain attempted WBINVD; ignore for now since
1906 newer linux uses this in some start-of-day timing loops */
1908 else
1909 wbinvd();
1910 break;
1912 case 0x20: /* MOV CR?,<reg> */
1913 opcode = insn_fetch(u8, code_base, eip, code_limit);
1914 if ( opcode < 0xc0 )
1915 goto fail;
1916 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1917 modrm_rm |= (opcode >> 0) & 7;
1918 reg = decode_register(modrm_rm, regs, 0);
1919 switch ( modrm_reg )
1921 case 0: /* Read CR0 */
1922 *reg = (read_cr0() & ~X86_CR0_TS) |
1923 v->arch.guest_context.ctrlreg[0];
1924 break;
1926 case 2: /* Read CR2 */
1927 *reg = v->arch.guest_context.ctrlreg[2];
1928 break;
1930 case 3: /* Read CR3 */
1931 if ( !is_pv_32on64_vcpu(v) )
1932 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1933 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1934 #ifdef CONFIG_COMPAT
1935 else
1936 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1937 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1938 #endif
1939 break;
1941 case 4: /* Read CR4 */
1942 /*
1943 * Guests can read CR4 to see what features Xen has enabled. We
1944 * therefore lie about PGE & PSE as they are unavailable to guests.
1945 */
1946 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1947 break;
1949 default:
1950 goto fail;
1952 break;
1954 case 0x21: /* MOV DR?,<reg> */
1955 opcode = insn_fetch(u8, code_base, eip, code_limit);
1956 if ( opcode < 0xc0 )
1957 goto fail;
1958 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1959 modrm_rm |= (opcode >> 0) & 7;
1960 reg = decode_register(modrm_rm, regs, 0);
1961 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1962 goto fail;
1963 *reg = res;
1964 break;
1966 case 0x22: /* MOV <reg>,CR? */
1967 opcode = insn_fetch(u8, code_base, eip, code_limit);
1968 if ( opcode < 0xc0 )
1969 goto fail;
1970 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1971 modrm_rm |= (opcode >> 0) & 7;
1972 reg = decode_register(modrm_rm, regs, 0);
1973 switch ( modrm_reg )
1975 case 0: /* Write CR0 */
1976 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1978 gdprintk(XENLOG_WARNING,
1979 "Attempt to change unmodifiable CR0 flags.\n");
1980 goto fail;
1982 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1983 break;
1985 case 2: /* Write CR2 */
1986 v->arch.guest_context.ctrlreg[2] = *reg;
1987 arch_set_cr2(v, *reg);
1988 break;
1990 case 3: /* Write CR3 */
1991 domain_lock(v->domain);
1992 if ( !is_pv_32on64_vcpu(v) )
1993 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1994 #ifdef CONFIG_COMPAT
1995 else
1996 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1997 #endif
1998 domain_unlock(v->domain);
1999 if ( rc == 0 ) /* not okay */
2000 goto fail;
2001 break;
2003 case 4: /* Write CR4 */
2004 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2005 write_cr4(pv_guest_cr4_to_real_cr4(
2006 v->arch.guest_context.ctrlreg[4]));
2007 break;
2009 default:
2010 goto fail;
2012 break;
2014 case 0x23: /* MOV <reg>,DR? */
2015 opcode = insn_fetch(u8, code_base, eip, code_limit);
2016 if ( opcode < 0xc0 )
2017 goto fail;
2018 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2019 modrm_rm |= (opcode >> 0) & 7;
2020 reg = decode_register(modrm_rm, regs, 0);
2021 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2022 goto fail;
2023 break;
2025 case 0x30: /* WRMSR */
2026 eax = regs->eax;
2027 edx = regs->edx;
2028 res = ((u64)edx << 32) | eax;
2029 switch ( (u32)regs->ecx )
2031 #ifdef CONFIG_X86_64
2032 case MSR_FS_BASE:
2033 if ( is_pv_32on64_vcpu(v) )
2034 goto fail;
2035 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2036 goto fail;
2037 v->arch.guest_context.fs_base = res;
2038 break;
2039 case MSR_GS_BASE:
2040 if ( is_pv_32on64_vcpu(v) )
2041 goto fail;
2042 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2043 goto fail;
2044 v->arch.guest_context.gs_base_kernel = res;
2045 break;
2046 case MSR_SHADOW_GS_BASE:
2047 if ( is_pv_32on64_vcpu(v) )
2048 goto fail;
2049 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2050 goto fail;
2051 v->arch.guest_context.gs_base_user = res;
2052 break;
2053 #endif
2054 case MSR_K7_FID_VID_STATUS:
2055 case MSR_K7_FID_VID_CTL:
2056 case MSR_K8_PSTATE_LIMIT:
2057 case MSR_K8_PSTATE_CTRL:
2058 case MSR_K8_PSTATE_STATUS:
2059 case MSR_K8_PSTATE0:
2060 case MSR_K8_PSTATE1:
2061 case MSR_K8_PSTATE2:
2062 case MSR_K8_PSTATE3:
2063 case MSR_K8_PSTATE4:
2064 case MSR_K8_PSTATE5:
2065 case MSR_K8_PSTATE6:
2066 case MSR_K8_PSTATE7:
2067 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2068 goto fail;
2069 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2070 break;
2071 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2072 goto fail;
2073 break;
2074 case MSR_IA32_PERF_CTL:
2075 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2076 goto fail;
2077 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2078 break;
2079 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2080 goto fail;
2081 break;
2082 default:
2083 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2084 break;
2085 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2086 (eax != l) || (edx != h) )
2087 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2088 "%08x:%08x to %08x:%08x.\n",
2089 _p(regs->ecx), h, l, edx, eax);
2090 break;
2092 break;
2094 case 0x31: /* RDTSC */
2095 rdtsc(regs->eax, regs->edx);
2096 break;
2098 case 0x32: /* RDMSR */
2099 switch ( (u32)regs->ecx )
2101 #ifdef CONFIG_X86_64
2102 case MSR_FS_BASE:
2103 if ( is_pv_32on64_vcpu(v) )
2104 goto fail;
2105 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2106 regs->edx = v->arch.guest_context.fs_base >> 32;
2107 break;
2108 case MSR_GS_BASE:
2109 if ( is_pv_32on64_vcpu(v) )
2110 goto fail;
2111 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2112 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2113 break;
2114 case MSR_SHADOW_GS_BASE:
2115 if ( is_pv_32on64_vcpu(v) )
2116 goto fail;
2117 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2118 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2119 break;
2120 #endif
2121 case MSR_K7_FID_VID_CTL:
2122 case MSR_K7_FID_VID_STATUS:
2123 case MSR_K8_PSTATE_LIMIT:
2124 case MSR_K8_PSTATE_CTRL:
2125 case MSR_K8_PSTATE_STATUS:
2126 case MSR_K8_PSTATE0:
2127 case MSR_K8_PSTATE1:
2128 case MSR_K8_PSTATE2:
2129 case MSR_K8_PSTATE3:
2130 case MSR_K8_PSTATE4:
2131 case MSR_K8_PSTATE5:
2132 case MSR_K8_PSTATE6:
2133 case MSR_K8_PSTATE7:
2134 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2135 goto fail;
2136 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2138 regs->eax = regs->edx = 0;
2139 break;
2141 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2142 goto fail;
2143 break;
2144 case MSR_EFER:
2145 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2146 goto fail;
2147 break;
2148 case MSR_IA32_MISC_ENABLE:
2149 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2150 goto fail;
2151 regs->eax &= ~MSR_IA32_MISC_ENABLE_PERF_AVAIL;
2152 if ( !IS_PRIV(current->domain) )
2153 regs->eax &= ~MSR_IA32_MISC_ENABLE_MONITOR_ENABLE;
2154 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2155 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2156 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2157 break;
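/*
 * The MSR_IA32_MISC_ENABLE read above is sanitised rather than passed
 * through: performance-monitoring availability is hidden, MONITOR/MWAIT
 * enable is only exposed to the privileged domain, and BTS, PEBS and
 * xTPR are reported as unavailable/disabled.
 */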
2158 default:
2159 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2161 regs->eax = l;
2162 regs->edx = h;
2163 break;
2165 /* Everyone can read the MSR space. */
2166 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2167 _p(regs->ecx));*/
2168 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2169 goto fail;
2170 break;
2172 break;
2174 default:
2175 goto fail;
2178 #undef wr_ad
2179 #undef rd_ad
2181 done:
2182 instruction_done(regs, eip, bpmatch);
2183 skip:
2184 return EXCRET_fault_fixed;
2186 fail:
2187 return 0;
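/*
 * Return convention for emulate_privileged_op(): EXCRET_fault_fixed means
 * the faulting instruction was emulated (or deliberately skipped), so the
 * caller must not inject #GP into the guest; a return of zero sends the
 * fault down the normal delivery path in do_general_protection() further
 * below.
 */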
2190 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2191 unsigned int esp, unsigned int decr)
2193 return (((esp - decr) < (esp - 1)) &&
2194 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
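/*
 * check_stack_limit() above returns nonzero when a push of 'decr' bytes
 * at 'esp' stays inside the stack segment: the bytes touched span
 * [esp - decr, esp - 1], the first clause rejects wraparound below zero,
 * and the second requires that range to sit within the limit for an
 * expand-up segment (esp - 1 <= limit) or wholly above it for an
 * expand-down one (esp - decr > limit).  Worked example: esp == 0x1000
 * and decr == 16 against a 64KiB expand-up limit passes (0xfff <= 0xffff),
 * while esp == 0x8 with the same decr fails the wraparound test.
 */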
2197 static void emulate_gate_op(struct cpu_user_regs *regs)
2199 #ifdef __x86_64__
2200 struct vcpu *v = current;
2201 unsigned int sel, ar, dpl, nparm, opnd_sel;
2202 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2203 unsigned long off, eip, opnd_off, base, limit;
2204 int jump;
2206 /* Check whether this fault is due to the use of a call gate. */
2207 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2208 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2209 ((ar & _SEGMENT_TYPE) != 0xc00) )
2211 do_guest_trap(TRAP_gp_fault, regs, 1);
2212 return;
2214 if ( !(ar & _SEGMENT_P) )
2216 do_guest_trap(TRAP_no_segment, regs, 1);
2217 return;
2219 dpl = (ar >> 13) & 3;
2220 nparm = ar & 0x1f;
2222 /*
2223 * Decode instruction (and perhaps operand) to determine RPL,
2224 * whether this is a jump or a call, and the call return offset.
2225 */
2226 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2227 !(ar & _SEGMENT_S) ||
2228 !(ar & _SEGMENT_P) ||
2229 !(ar & _SEGMENT_CODE) )
2231 do_guest_trap(TRAP_gp_fault, regs, 1);
2232 return;
2235 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2236 ad_default = ad_bytes = op_default;
2237 opnd_sel = opnd_off = 0;
2238 jump = -1;
2239 for ( eip = regs->eip; eip - regs->_eip < 10; )
2241 switch ( insn_fetch(u8, base, eip, limit) )
2243 case 0x66: /* operand-size override */
2244 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2245 continue;
2246 case 0x67: /* address-size override */
2247 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2248 continue;
2249 case 0x2e: /* CS override */
2250 opnd_sel = regs->cs;
2251 ASSERT(opnd_sel);
2252 continue;
2253 case 0x3e: /* DS override */
2254 opnd_sel = read_sreg(regs, ds);
2255 if ( !opnd_sel )
2256 opnd_sel = dpl;
2257 continue;
2258 case 0x26: /* ES override */
2259 opnd_sel = read_sreg(regs, es);
2260 if ( !opnd_sel )
2261 opnd_sel = dpl;
2262 continue;
2263 case 0x64: /* FS override */
2264 opnd_sel = read_sreg(regs, fs);
2265 if ( !opnd_sel )
2266 opnd_sel = dpl;
2267 continue;
2268 case 0x65: /* GS override */
2269 opnd_sel = read_sreg(regs, gs);
2270 if ( !opnd_sel )
2271 opnd_sel = dpl;
2272 continue;
2273 case 0x36: /* SS override */
2274 opnd_sel = regs->ss;
2275 if ( !opnd_sel )
2276 opnd_sel = dpl;
2277 continue;
2278 case 0xea:
2279 ++jump;
2280 /* FALLTHROUGH */
2281 case 0x9a:
2282 ++jump;
2283 opnd_sel = regs->cs;
2284 opnd_off = eip;
2285 ad_bytes = ad_default;
2286 eip += op_bytes + 2;
2287 break;
2288 case 0xff:
2290 unsigned int modrm;
2292 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2294 case 0x28: case 0x68: case 0xa8:
2295 ++jump;
2296 /* FALLTHROUGH */
2297 case 0x18: case 0x58: case 0x98:
2298 ++jump;
2299 if ( ad_bytes != 2 )
2301 if ( (modrm & 7) == 4 )
2303 unsigned int sib;
2304 sib = insn_fetch(u8, base, eip, limit);
2306 modrm = (modrm & ~7) | (sib & 7);
2307 if ( (sib >>= 3) != 4 )
2308 opnd_off = *(unsigned long *)
2309 decode_register(sib & 7, regs, 0);
2310 opnd_off <<= sib >> 3;
2312 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2313 opnd_off += *(unsigned long *)
2314 decode_register(modrm & 7, regs, 0);
2315 else
2316 modrm |= 0x87;
2317 if ( !opnd_sel )
2319 switch ( modrm & 7 )
2321 default:
2322 opnd_sel = read_sreg(regs, ds);
2323 break;
2324 case 4: case 5:
2325 opnd_sel = regs->ss;
2326 break;
2330 else
2332 switch ( modrm & 7 )
2334 case 0: case 1: case 7:
2335 opnd_off = regs->ebx;
2336 break;
2337 case 6:
2338 if ( !(modrm & 0xc0) )
2339 modrm |= 0x80;
2340 else
2341 case 2: case 3:
2343 opnd_off = regs->ebp;
2344 if ( !opnd_sel )
2345 opnd_sel = regs->ss;
2347 break;
2349 if ( !opnd_sel )
2350 opnd_sel = read_sreg(regs, ds);
2351 switch ( modrm & 7 )
2353 case 0: case 2: case 4:
2354 opnd_off += regs->esi;
2355 break;
2356 case 1: case 3: case 5:
2357 opnd_off += regs->edi;
2358 break;
2361 switch ( modrm & 0xc0 )
2363 case 0x40:
2364 opnd_off += insn_fetch(s8, base, eip, limit);
2365 break;
2366 case 0x80:
2367 opnd_off += insn_fetch(s32, base, eip, limit);
2368 break;
2370 if ( ad_bytes == 4 )
2371 opnd_off = (unsigned int)opnd_off;
2372 else if ( ad_bytes == 2 )
2373 opnd_off = (unsigned short)opnd_off;
2374 break;
2377 break;
2379 break;
2382 if ( jump < 0 )
2384 fail:
2385 do_guest_trap(TRAP_gp_fault, regs, 1);
2386 skip:
2387 return;
2390 if ( (opnd_sel != regs->cs &&
2391 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2392 !(ar & _SEGMENT_S) ||
2393 !(ar & _SEGMENT_P) ||
2394 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2396 do_guest_trap(TRAP_gp_fault, regs, 1);
2397 return;
2400 opnd_off += op_bytes;
2401 #define ad_default ad_bytes
2402 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2403 #undef ad_default
2404 ASSERT((opnd_sel & ~3) == regs->error_code);
2405 if ( dpl < (opnd_sel & 3) )
2407 do_guest_trap(TRAP_gp_fault, regs, 1);
2408 return;
2411 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2412 !(ar & _SEGMENT_S) ||
2413 !(ar & _SEGMENT_CODE) ||
2414 (!jump || (ar & _SEGMENT_EC) ?
2415 ((ar >> 13) & 3) > (regs->cs & 3) :
2416 ((ar >> 13) & 3) != (regs->cs & 3)) )
2418 regs->error_code = sel;
2419 do_guest_trap(TRAP_gp_fault, regs, 1);
2420 return;
2422 if ( !(ar & _SEGMENT_P) )
2424 regs->error_code = sel;
2425 do_guest_trap(TRAP_no_segment, regs, 1);
2426 return;
2428 if ( off > limit )
2430 regs->error_code = 0;
2431 do_guest_trap(TRAP_gp_fault, regs, 1);
2432 return;
2435 if ( !jump )
2437 unsigned int ss, esp, *stkp;
2438 int rc;
2439 #define push(item) do \
2440 { \
2441 --stkp; \
2442 esp -= 4; \
2443 rc = __put_user(item, stkp); \
2444 if ( rc ) \
2445 { \
2446 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2447 PFEC_write_access); \
2448 return; \
2449 } \
2450 } while ( 0 )
2452 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2454 sel |= (ar >> 13) & 3;
2455 /* Inner stack known only for kernel ring. */
2456 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2458 do_guest_trap(TRAP_gp_fault, regs, 1);
2459 return;
2461 esp = v->arch.guest_context.kernel_sp;
2462 ss = v->arch.guest_context.kernel_ss;
2463 if ( (ss & 3) != (sel & 3) ||
2464 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2465 ((ar >> 13) & 3) != (sel & 3) ||
2466 !(ar & _SEGMENT_S) ||
2467 (ar & _SEGMENT_CODE) ||
2468 !(ar & _SEGMENT_WR) )
2470 regs->error_code = ss & ~3;
2471 do_guest_trap(TRAP_invalid_tss, regs, 1);
2472 return;
2474 if ( !(ar & _SEGMENT_P) ||
2475 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2477 regs->error_code = ss & ~3;
2478 do_guest_trap(TRAP_stack_error, regs, 1);
2479 return;
2481 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2482 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2484 do_guest_trap(TRAP_gp_fault, regs, 1);
2485 return;
2487 push(regs->ss);
2488 push(regs->esp);
2489 if ( nparm )
2491 const unsigned int *ustkp;
2493 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2494 ((ar >> 13) & 3) != (regs->cs & 3) ||
2495 !(ar & _SEGMENT_S) ||
2496 (ar & _SEGMENT_CODE) ||
2497 !(ar & _SEGMENT_WR) ||
2498 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2499 return do_guest_trap(TRAP_gp_fault, regs, 1);
2500 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2501 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2503 do_guest_trap(TRAP_gp_fault, regs, 1);
2504 return;
2506 do
2508 unsigned int parm;
2510 --ustkp;
2511 rc = __get_user(parm, ustkp);
2512 if ( rc )
2514 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2515 return;
2517 push(parm);
2518 } while ( --nparm );
2521 else
2523 sel |= (regs->cs & 3);
2524 esp = regs->esp;
2525 ss = regs->ss;
2526 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2527 ((ar >> 13) & 3) != (sel & 3) )
2529 do_guest_trap(TRAP_gp_fault, regs, 1);
2530 return;
2532 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2534 regs->error_code = 0;
2535 do_guest_trap(TRAP_stack_error, regs, 1);
2536 return;
2538 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2539 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2541 do_guest_trap(TRAP_gp_fault, regs, 1);
2542 return;
2545 push(regs->cs);
2546 push(eip);
2547 #undef push
2548 regs->esp = esp;
2549 regs->ss = ss;
2551 else
2552 sel |= (regs->cs & 3);
2554 regs->cs = sel;
2555 instruction_done(regs, off, 0);
2556 #endif
2559 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2561 struct vcpu *v = current;
2562 unsigned long fixup;
2564 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2566 if ( regs->error_code & 1 )
2567 goto hardware_gp;
2569 if ( !guest_mode(regs) )
2570 goto gp_in_kernel;
2572 /*
2573 * Cunning trick to allow arbitrary "INT n" handling.
2575 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2576 * instruction from trapping to the appropriate vector, when that might not
2577 * be expected by Xen or the guest OS. For example, that entry might be for
2578 * a fault handler (unlike traps, faults don't increment EIP), or might
2579 * expect an error code on the stack (which a software trap never
2580 * provides), or might be a hardware interrupt handler that doesn't like
2581 * being called spuriously.
2583 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2584 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2585 * clear to indicate that it's a software fault, not hardware.
2587 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2588 * okay because they can only be triggered by an explicit DPL-checked
2589 * instruction. The DPL specified by the guest OS for these vectors is NOT
2590 * CHECKED!!
2591 */
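/*
 * Worked example (illustrative): a guest "int $0x80" hits a vector whose
 * IDT DPL was forced to 0, so the CPU raises #GP with error code
 * (0x80 << 3) | 2 == 0x402.  The test below sees (0x402 & 3) == 2,
 * recovers the vector as 0x402 >> 3 == 0x80, advances EIP past the
 * two-byte "int imm8" encoding, and reflects the trap through the guest's
 * virtual IDT provided the DPL the guest registered permits it.
 */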
2592 if ( (regs->error_code & 3) == 2 )
2594 /* This fault must be due to an <INT n> instruction. */
2595 const struct trap_info *ti;
2596 unsigned char vector = regs->error_code >> 3;
2597 ti = &v->arch.guest_context.trap_ctxt[vector];
2598 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2600 regs->eip += 2;
2601 do_guest_trap(vector, regs, 0);
2602 return;
2605 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2607 emulate_gate_op(regs);
2608 return;
2611 /* Emulate some simple privileged and I/O instructions. */
2612 if ( (regs->error_code == 0) &&
2613 emulate_privileged_op(regs) )
2615 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2616 return;
2619 #if defined(__i386__)
2620 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2621 (regs->error_code == 0) &&
2622 gpf_emulate_4gb(regs) )
2624 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2625 return;
2627 #endif
2629 /* Pass on GPF as is. */
2630 do_guest_trap(TRAP_gp_fault, regs, 1);
2631 return;
2633 gp_in_kernel:
2635 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2637 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2638 regs->error_code, _p(regs->eip), _p(fixup));
2639 regs->eip = fixup;
2640 return;
2643 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2645 hardware_gp:
2646 show_execution_state(regs);
2647 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2650 static void nmi_action(unsigned long unused)
2652 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2653 vcpu_kick(dom0->vcpu[0]);
2656 static DECLARE_TASKLET(nmi_tasklet, nmi_action, 0);
2658 static void nmi_dom0_report(unsigned int reason_idx)
2660 struct domain *d;
2661 struct vcpu *v;
2663 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2664 return;
2666 set_bit(reason_idx, nmi_reason(d));
2668 if ( !test_and_set_bool(v->nmi_pending) )
2669 tasklet_schedule(&nmi_tasklet); /* not safe to wake a vcpu here */
2672 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2674 switch ( opt_nmi[0] )
2676 case 'd': /* 'dom0' */
2677 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2678 case 'i': /* 'ignore' */
2679 break;
2680 default: /* 'fatal' */
2681 console_force_unlock();
2682 printk("\n\nNMI - MEMORY ERROR\n");
2683 fatal_trap(TRAP_nmi, regs);
2686 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2687 mdelay(1);
2688 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2691 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2693 switch ( opt_nmi[0] )
2695 case 'd': /* 'dom0' */
2696 nmi_dom0_report(_XEN_NMIREASON_io_error);
2697 case 'i': /* 'ignore' */
2698 break;
2699 default: /* 'fatal' */
2700 console_force_unlock();
2701 printk("\n\nNMI - I/O ERROR\n");
2702 fatal_trap(TRAP_nmi, regs);
2705 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2706 mdelay(1);
2707 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
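/*
 * Port 0x61 usage in the two handlers above (standard PC/AT "System
 * Control Port B" layout, assumed here): bit 7 reports a parity/SERR#
 * NMI, bit 6 an IOCHK NMI, writing 1 to bit 3 clears and disables the
 * IOCHK source, writing 1 to bit 2 clears and disables the parity source,
 * and bits 0-1 (PIT gate 2 / speaker data) are preserved by the 0x0f,
 * 0x0b and 0x07 masks.
 */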
2710 static void unknown_nmi_error(unsigned char reason)
2712 switch ( opt_nmi[0] )
2714 case 'd': /* 'dom0' */
2715 nmi_dom0_report(_XEN_NMIREASON_unknown);
2716 case 'i': /* 'ignore' */
2717 break;
2718 default: /* 'fatal' */
2719 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2720 printk("Dazed and confused, but trying to continue\n");
2721 printk("Do you have a strange power saving mode enabled?\n");
2722 kexec_crash();
2726 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2728 return 0;
2731 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2733 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2735 unsigned int cpu = smp_processor_id();
2736 unsigned char reason;
2738 ++nmi_count(cpu);
2740 if ( nmi_callback(regs, cpu) )
2741 return;
2743 if ( nmi_watchdog )
2744 nmi_watchdog_tick(regs);
2746 /* Only the BSP gets external NMIs from the system. */
2747 if ( cpu == 0 )
2749 reason = inb(0x61);
2750 if ( reason & 0x80 )
2751 mem_parity_error(regs);
2752 else if ( reason & 0x40 )
2753 io_check_error(regs);
2754 else if ( !nmi_watchdog )
2755 unknown_nmi_error((unsigned char)(reason&0xff));
2759 void set_nmi_callback(nmi_callback_t callback)
2761 nmi_callback = callback;
2764 void unset_nmi_callback(void)
2766 nmi_callback = dummy_nmi_callback;
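/*
 * Usage sketch for the callback hooks above (the handler name is
 * hypothetical):
 *
 *     static int my_nmi_handler(struct cpu_user_regs *regs, int cpu)
 *     {
 *         return 1;    // nonzero: event consumed, do_nmi() does no more
 *     }
 *
 *     set_nmi_callback(my_nmi_handler);
 *     ... window during which NMIs are intercepted ...
 *     unset_nmi_callback();
 */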
2769 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2771 struct vcpu *curr = current;
2773 BUG_ON(!guest_mode(regs));
2775 setup_fpu(curr);
2777 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2779 do_guest_trap(TRAP_no_device, regs, 0);
2780 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2782 else
2783 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2785 return;
2788 asmlinkage void do_debug(struct cpu_user_regs *regs)
2790 struct vcpu *v = current;
2792 DEBUGGER_trap_entry(TRAP_debug, regs);
2794 if ( !guest_mode(regs) )
2796 if ( regs->eflags & EF_TF )
2798 #ifdef __x86_64__
2799 void sysenter_entry(void);
2800 void sysenter_eflags_saved(void);
2801 /* In the SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2802 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2803 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2804 goto out;
2805 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2806 #else
2807 WARN_ON(1);
2808 #endif
2809 regs->eflags &= ~EF_TF;
2811 else
2813 /*
2814 * We ignore watchpoints when they trigger within Xen. This may
2815 * happen when a buffer is passed to us which previously had a
2816 * watchpoint set on it. No need to bump EIP; the only faulting
2817 * trap is an instruction breakpoint, which can't happen to us.
2818 */
2819 WARN_ON(!search_exception_table(regs->eip));
2821 goto out;
2825 /* Save the debug status register where the guest OS can peek at it. */
2825 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2827 ler_enable();
2828 do_guest_trap(TRAP_debug, regs, 0);
2829 return;
2831 out:
2832 ler_enable();
2833 return;
2836 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2840 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2842 int i;
2843 /* Keep secondary tables in sync with IRQ updates. */
2844 for ( i = 1; i < NR_CPUS; i++ )
2845 if ( idt_tables[i] != NULL )
2846 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2847 _set_gate(&idt_table[n], 14, dpl, addr);
2850 static void set_swint_gate(unsigned int n, void *addr)
2852 __set_intr_gate(n, 3, addr);
2855 void set_intr_gate(unsigned int n, void *addr)
2857 __set_intr_gate(n, 0, addr);
2860 void set_tss_desc(unsigned int n, void *addr)
2862 _set_tssldt_desc(
2863 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2864 (unsigned long)addr,
2865 offsetof(struct tss_struct, __cacheline_filler) - 1,
2866 9);
2867 #ifdef CONFIG_COMPAT
2868 _set_tssldt_desc(
2869 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2870 (unsigned long)addr,
2871 offsetof(struct tss_struct, __cacheline_filler) - 1,
2872 11);
2873 #endif
2876 void __devinit percpu_traps_init(void)
2878 subarch_percpu_traps_init();
2880 if ( !opt_ler )
2881 return;
2883 switch ( boot_cpu_data.x86_vendor )
2885 case X86_VENDOR_INTEL:
2886 switch ( boot_cpu_data.x86 )
2888 case 6:
2889 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2890 break;
2891 case 15:
2892 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2893 break;
2895 break;
2896 case X86_VENDOR_AMD:
2897 switch ( boot_cpu_data.x86 )
2899 case 6:
2900 case 15:
2901 case 16:
2902 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2903 break;
2905 break;
2908 ler_enable();
2911 void __init trap_init(void)
2913 /*
2914 * Note that interrupt gates are always used, rather than trap gates. We
2915 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2916 * first activation must have the "bad" value(s) for these registers and
2917 * we may lose them if another activation is installed before they are
2918 * saved. The page-fault handler also needs interrupts disabled until %cr2
2919 * has been read and saved on the stack.
2920 */
2921 set_intr_gate(TRAP_divide_error,&divide_error);
2922 set_intr_gate(TRAP_debug,&debug);
2923 set_intr_gate(TRAP_nmi,&nmi);
2924 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
2925 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2926 set_intr_gate(TRAP_bounds,&bounds);
2927 set_intr_gate(TRAP_invalid_op,&invalid_op);
2928 set_intr_gate(TRAP_no_device,&device_not_available);
2929 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2930 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2931 set_intr_gate(TRAP_no_segment,&segment_not_present);
2932 set_intr_gate(TRAP_stack_error,&stack_segment);
2933 set_intr_gate(TRAP_gp_fault,&general_protection);
2934 set_intr_gate(TRAP_page_fault,&page_fault);
2935 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2936 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2937 set_intr_gate(TRAP_alignment_check,&alignment_check);
2938 set_intr_gate(TRAP_machine_check,&machine_check);
2939 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2941 /* CPU0 uses the master IDT. */
2942 idt_tables[0] = idt_table;
2944 percpu_traps_init();
2946 cpu_init();
2949 long register_guest_nmi_callback(unsigned long address)
2951 struct vcpu *v = current;
2952 struct domain *d = v->domain;
2953 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2955 t->vector = TRAP_nmi;
2956 t->flags = 0;
2957 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2958 t->address = address;
2959 TI_SET_IF(t, 1);
2961 /*
2962 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2963 * now.
2964 */
2965 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2966 v->nmi_pending = 1;
2968 return 0;
2971 long unregister_guest_nmi_callback(void)
2973 struct vcpu *v = current;
2974 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2976 memset(t, 0, sizeof(*t));
2978 return 0;
2981 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
2983 struct trap_info cur;
2984 struct vcpu *curr = current;
2985 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2986 long rc = 0;
2988 /* If no table is presented then clear the entire virtual IDT. */
2989 if ( guest_handle_is_null(traps) )
2991 memset(dst, 0, 256 * sizeof(*dst));
2992 init_int80_direct_trap(curr);
2993 return 0;
2996 for ( ; ; )
2998 if ( hypercall_preempt_check() )
3000 rc = hypercall_create_continuation(
3001 __HYPERVISOR_set_trap_table, "h", traps);
3002 break;
3005 if ( copy_from_guest(&cur, traps, 1) )
3007 rc = -EFAULT;
3008 break;
3011 if ( cur.address == 0 )
3012 break;
3014 fixup_guest_code_selector(curr->domain, cur.cs);
3016 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3018 if ( cur.vector == 0x80 )
3019 init_int80_direct_trap(curr);
3021 guest_handle_add_offset(traps, 1);
3024 return rc;
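/*
 * Guest-side usage sketch (illustrative; the entry points are
 * hypothetical): the virtual IDT is passed as an array of trap_info
 * entries terminated by one with a zero address, which is what the
 * cur.address == 0 check above relies on.
 *
 *     static struct trap_info traps[] = {
 *         { TRAP_page_fault, 0, FLAT_KERNEL_CS, (unsigned long)pf_entry },
 *         { 0x80,            3, FLAT_KERNEL_CS, (unsigned long)int80_entry },
 *         { 0, 0, 0, 0 }
 *     };
 *     HYPERVISOR_set_trap_table(traps);   // usual guest-side wrapper
 */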
3027 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3029 int i;
3030 struct vcpu *curr = current;
3032 switch ( reg )
3034 case 0:
3035 if ( !access_ok(value, sizeof(long)) )
3036 return -EPERM;
3037 if ( v == curr )
3038 write_debugreg(0, value);
3039 break;
3040 case 1:
3041 if ( !access_ok(value, sizeof(long)) )
3042 return -EPERM;
3043 if ( v == curr )
3044 write_debugreg(1, value);
3045 break;
3046 case 2:
3047 if ( !access_ok(value, sizeof(long)) )
3048 return -EPERM;
3049 if ( v == curr )
3050 write_debugreg(2, value);
3051 break;
3052 case 3:
3053 if ( !access_ok(value, sizeof(long)) )
3054 return -EPERM;
3055 if ( v == curr )
3056 write_debugreg(3, value);
3057 break;
3058 case 6:
3059 /*
3060 * DR6: Bits 4-11,16-31 reserved (set to 1).
3061 * Bit 12 reserved (set to 0).
3062 */
3063 value &= 0xffffefff; /* reserved bits => 0 */
3064 value |= 0xffff0ff0; /* reserved bits => 1 */
3065 if ( v == curr )
3066 write_debugreg(6, value);
3067 break;
3068 case 7:
3069 /*
3070 * DR7: Bit 10 reserved (set to 1).
3071 * Bits 11-12,14-15 reserved (set to 0).
3072 */
3073 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3074 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3075 /*
3076 * Privileged bits:
3077 * GD (bit 13): must be 0.
3078 */
3079 if ( value & DR_GENERAL_DETECT )
3080 return -EPERM;
3081 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3082 if ( value & DR7_ACTIVE_MASK )
3084 unsigned int io_enable = 0;
3086 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3088 if ( ((value >> i) & 3) == DR_IO )
3090 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3091 return -EPERM;
3092 io_enable |= value & (3 << ((i - 16) >> 1));
3094 #ifdef __i386__
3095 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3096 !boot_cpu_has(X86_FEATURE_LM)) &&
3097 (((value >> i) & 0xc) == DR_LEN_8) )
3098 return -EPERM;
3099 #endif
3102 /* Guest DR5 is a handy stash for I/O intercept information. */
3103 v->arch.guest_context.debugreg[5] = io_enable;
3104 value &= ~io_enable;
3106 /*
3107 * If DR7 was previously clear then we need to load all other
3108 * debug registers at this point as they were not restored during
3109 * context switch.
3110 */
3111 if ( (v == curr) &&
3112 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3114 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3115 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3116 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3117 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3118 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3121 if ( v == curr )
3122 write_debugreg(7, value);
3123 break;
3124 default:
3125 return -EINVAL;
3128 v->arch.guest_context.debugreg[reg] = value;
3129 return 0;
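/*
 * Worked example for the DR5 stash above: programming breakpoint 0 as an
 * I/O breakpoint sets the DR7.RW0 field (bits 16-17) to DR_IO, which is
 * only legal with CR4.DE set.  set_debugreg() strips the matching L0/G0
 * enable bits from the value loaded into the real DR7, parks them in
 * debugreg[5], and do_get_debugreg() below ORs them back in, so the guest
 * reads back exactly the DR7 image it wrote.
 */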
3132 long do_set_debugreg(int reg, unsigned long value)
3134 return set_debugreg(current, reg, value);
3137 unsigned long do_get_debugreg(int reg)
3139 struct vcpu *curr = current;
3141 switch ( reg )
3143 case 0 ... 3:
3144 case 6:
3145 return curr->arch.guest_context.debugreg[reg];
3146 case 7:
3147 return (curr->arch.guest_context.debugreg[7] |
3148 curr->arch.guest_context.debugreg[5]);
3149 case 4 ... 5:
3150 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3151 curr->arch.guest_context.debugreg[reg + 2] : 0);
3154 return -EINVAL;
3157 /*
3158 * Local variables:
3159 * mode: C
3160 * c-set-style: "BSD"
3161 * c-basic-offset: 4
3162 * tab-width: 4
3163 * indent-tabs-mode: nil
3164 * End:
3165 */