ia64/xen-unstable

xen/arch/x86/traps.c @ 17403:324f772239a7

x86_64: Be more careful in emulating 32-bit call gates.

An assertion could legitimately fire.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Apr 08 09:46:57 2008 +0100 (2008-04-08)
parents 4b157affc08f
children 80ba1b427032
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
65 #include <public/arch-x86/cpuid.h>
67 /*
68 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
69 * fatal: Xen prints diagnostic message and then hangs.
70 * dom0: The NMI is virtualised to DOM0.
71 * ignore: The NMI error is cleared and ignored.
72 */
73 #ifdef NDEBUG
74 char opt_nmi[10] = "dom0";
75 #else
76 char opt_nmi[10] = "fatal";
77 #endif
78 string_param("nmi", opt_nmi);
80 DEFINE_PER_CPU(u32, ler_msr);
82 /* Master table, used by CPU0. */
83 idt_entry_t idt_table[IDT_ENTRIES];
85 /* Pointer to the IDT of every CPU. */
86 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
88 #define DECLARE_TRAP_HANDLER(_name) \
89 asmlinkage void _name(void); \
90 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
92 DECLARE_TRAP_HANDLER(divide_error);
93 DECLARE_TRAP_HANDLER(debug);
94 DECLARE_TRAP_HANDLER(nmi);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(machine_check);
109 DECLARE_TRAP_HANDLER(alignment_check);
110 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
112 long do_set_debugreg(int reg, unsigned long value);
113 unsigned long do_get_debugreg(int reg);
114 void (*ioemul_handle_quirk)(
115 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
117 static int debug_stack_lines = 20;
118 integer_param("debug_stack_lines", debug_stack_lines);
120 static int opt_ler;
121 boolean_param("ler", opt_ler);
123 #ifdef CONFIG_X86_32
124 #define stack_words_per_line 8
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
126 #else
127 #define stack_words_per_line 4
128 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
129 #endif
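/* Note: on x86-32 a same-privilege fault pushes no SS/ESP, so the interrupted
 * stack continues at the address of the saved frame's esp field itself; on
 * x86-64 RSP is always pushed and can be dereferenced directly. */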
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 struct vcpu *curr = current;
135 unsigned long *stack, addr;
137 if ( is_hvm_vcpu(curr) )
138 return;
140 if ( is_pv_32on64_vcpu(curr) )
141 {
142 compat_show_guest_stack(regs, debug_stack_lines);
143 return;
144 }
146 if ( vm86_mode(regs) )
147 {
148 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
149 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
150 regs->ss, (uint16_t)(regs->esp & 0xffff));
151 }
152 else
153 {
154 stack = (unsigned long *)regs->esp;
155 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
156 }
158 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
159 {
160 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
161 break;
162 if ( get_user(addr, stack) )
163 {
164 if ( i != 0 )
165 printk("\n ");
166 printk("Fault while accessing guest memory.");
167 i = 1;
168 break;
169 }
170 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
171 printk("\n ");
172 printk(" %p", _p(addr));
173 stack++;
174 }
175 if ( i == 0 )
176 printk("Stack empty.");
177 printk("\n");
178 }
180 #if !defined(CONFIG_FRAME_POINTER)
182 static void show_trace(struct cpu_user_regs *regs)
183 {
184 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
186 printk("Xen call trace:\n ");
188 printk("[<%p>]", _p(regs->eip));
189 print_symbol(" %s\n ", regs->eip);
191 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
192 {
193 addr = *stack++;
194 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
195 {
196 printk("[<%p>]", _p(addr));
197 print_symbol(" %s\n ", addr);
198 }
199 }
201 printk("\n");
202 }
204 #else
206 static void show_trace(struct cpu_user_regs *regs)
207 {
208 unsigned long *frame, next, addr, low, high;
210 printk("Xen call trace:\n ");
212 printk("[<%p>]", _p(regs->eip));
213 print_symbol(" %s\n ", regs->eip);
215 /* Bounds for range of valid frame pointer. */
216 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
217 high = (low & ~(STACK_SIZE - 1)) +
218 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
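/* i.e. a frame pointer is plausible only if it lies between just below the
 * pre-exception stack pointer and the top of the primary stack, which sits
 * immediately beneath the per-cpu struct cpu_info. */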
220 /* The initial frame pointer. */
221 next = regs->ebp;
223 for ( ; ; )
224 {
225 /* Valid frame pointer? */
226 if ( (next < low) || (next >= high) )
227 {
228 /*
229 * Exception stack frames have a different layout, denoted by an
230 * inverted frame pointer.
231 */
232 next = ~next;
233 if ( (next < low) || (next >= high) )
234 break;
235 frame = (unsigned long *)next;
236 next = frame[0];
237 addr = frame[(offsetof(struct cpu_user_regs, eip) -
238 offsetof(struct cpu_user_regs, ebp))
239 / BYTES_PER_LONG];
240 }
241 else
242 {
243 /* Ordinary stack frame. */
244 frame = (unsigned long *)next;
245 next = frame[0];
246 addr = frame[1];
247 }
249 printk("[<%p>]", _p(addr));
250 print_symbol(" %s\n ", addr);
252 low = (unsigned long)&frame[2];
253 }
255 printk("\n");
256 }
258 #endif
260 void show_stack(struct cpu_user_regs *regs)
261 {
262 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
263 int i;
265 if ( guest_mode(regs) )
266 return show_guest_stack(regs);
268 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
270 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
271 {
272 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
273 break;
274 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
275 printk("\n ");
276 addr = *stack++;
277 printk(" %p", _p(addr));
278 }
279 if ( i == 0 )
280 printk("Stack empty.");
281 printk("\n");
283 show_trace(regs);
284 }
286 void show_stack_overflow(unsigned int cpu, unsigned long esp)
287 {
288 #ifdef MEMORY_GUARD
289 unsigned long esp_top, esp_bottom;
290 unsigned long *stack, addr;
292 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
293 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
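/* Stacks are STACK_SIZE-aligned and grow downwards: esp_bottom is one byte
 * past the highest address of this stack, esp_top the base of the primary
 * stack area. */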
295 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
296 (void *)esp_top, (void *)esp_bottom, (void *)esp,
297 (void *)init_tss[cpu].esp0);
299 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
300 if ( ((unsigned long)(esp - esp_top) > 512) &&
301 ((unsigned long)(esp_top - esp) > 512) )
302 {
303 printk("No stack overflow detected. Skipping stack trace.\n");
304 return;
305 }
307 if ( esp < esp_top )
308 esp = esp_top;
310 printk("Xen stack overflow (dumping trace %p-%p):\n ",
311 (void *)esp, (void *)esp_bottom);
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 char *trapstr(int trapnr)
335 {
336 static char *strings[] = {
337 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
338 "invalid opcode", "device not available", "double fault",
339 "coprocessor segment", "invalid tss", "segment not found",
340 "stack error", "general protection fault", "page fault",
341 "spurious interrupt", "coprocessor error", "alignment check",
342 "machine check", "simd error"
343 };
345 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
346 return "???";
348 return strings[trapnr];
349 }
351 /*
352 * This is called for faults at very unexpected times (e.g., when interrupts
353 * are disabled). In such situations we can't do much that is safe. We try to
354 * print out some tracing and then we just spin.
355 */
356 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
357 {
358 static DEFINE_PER_CPU(char, depth);
360 /*
361 * In some cases, we can end up in a vicious cycle of fatal_trap()s
362 * within fatal_trap()s. We give the problem a couple of iterations to
363 * bottom out, and then we just panic.
364 */
365 if ( ++this_cpu(depth) < 3 )
366 {
367 watchdog_disable();
368 console_start_sync();
370 show_execution_state(regs);
372 if ( trapnr == TRAP_page_fault )
373 {
374 unsigned long cr2 = read_cr2();
375 printk("Faulting linear address: %p\n", _p(cr2));
376 show_page_walk(cr2);
377 }
378 }
380 panic("FATAL TRAP: vector = %d (%s)\n"
381 "[error_code=%04x] %s\n",
382 trapnr, trapstr(trapnr), regs->error_code,
383 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
384 }
386 static void do_guest_trap(
387 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
388 {
389 struct vcpu *v = current;
390 struct trap_bounce *tb;
391 const struct trap_info *ti;
393 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
395 tb = &v->arch.trap_bounce;
396 ti = &v->arch.guest_context.trap_ctxt[trapnr];
398 tb->flags = TBF_EXCEPTION;
399 tb->cs = ti->cs;
400 tb->eip = ti->address;
402 if ( use_error_code )
403 {
404 tb->flags |= TBF_EXCEPTION_ERRCODE;
405 tb->error_code = regs->error_code;
406 }
408 if ( TI_GET_IF(ti) )
409 tb->flags |= TBF_INTERRUPT;
411 if ( unlikely(null_trap_bounce(v, tb)) )
412 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
413 "on VCPU %d [ec=%04x]\n",
414 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
415 }
417 static void instruction_done(
418 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
419 {
420 regs->eip = eip;
421 regs->eflags &= ~X86_EFLAGS_RF;
422 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
423 {
424 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
425 if ( regs->eflags & X86_EFLAGS_TF )
426 current->arch.guest_context.debugreg[6] |= 0x4000;
427 do_guest_trap(TRAP_debug, regs, 0);
428 }
429 }
431 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
432 unsigned int port, unsigned int len)
433 {
434 unsigned int width, i, match = 0;
435 unsigned long start;
437 if ( !(v->arch.guest_context.debugreg[5]) ||
438 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
439 return 0;
441 for ( i = 0; i < 4; i++ )
442 {
443 if ( !(v->arch.guest_context.debugreg[5] &
444 (3 << (i * DR_ENABLE_SIZE))) )
445 continue;
447 start = v->arch.guest_context.debugreg[i];
448 width = 0;
450 switch ( (v->arch.guest_context.debugreg[7] >>
451 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
452 {
453 case DR_LEN_1: width = 1; break;
454 case DR_LEN_2: width = 2; break;
455 case DR_LEN_4: width = 4; break;
456 case DR_LEN_8: width = 8; break;
457 }
459 if ( (start < (port + len)) && ((start + width) > port) )
460 match |= 1 << i;
461 }
463 return match;
464 }
466 /*
467 * Called from asm to set up the NMI trapbounce info.
468 * Returns 0 if no callback is set up, else 1.
469 */
470 asmlinkage int set_guest_nmi_trapbounce(void)
471 {
472 struct vcpu *v = current;
473 struct trap_bounce *tb = &v->arch.trap_bounce;
474 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
475 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
476 return !null_trap_bounce(v, tb);
477 }
479 static inline void do_trap(
480 int trapnr, struct cpu_user_regs *regs, int use_error_code)
481 {
482 unsigned long fixup;
484 DEBUGGER_trap_entry(trapnr, regs);
486 if ( guest_mode(regs) )
487 {
488 do_guest_trap(trapnr, regs, use_error_code);
489 return;
490 }
492 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
493 {
494 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
495 trapnr, _p(regs->eip), _p(fixup));
496 regs->eip = fixup;
497 return;
498 }
500 DEBUGGER_trap_fatal(trapnr, regs);
502 show_execution_state(regs);
503 panic("FATAL TRAP: vector = %d (%s)\n"
504 "[error_code=%04x]\n",
505 trapnr, trapstr(trapnr), regs->error_code);
506 }
508 #define DO_ERROR_NOCODE(trapnr, name) \
509 asmlinkage void do_##name(struct cpu_user_regs *regs) \
510 { \
511 do_trap(trapnr, regs, 0); \
512 }
514 #define DO_ERROR(trapnr, name) \
515 asmlinkage void do_##name(struct cpu_user_regs *regs) \
516 { \
517 do_trap(trapnr, regs, 1); \
518 }
520 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
521 DO_ERROR_NOCODE(TRAP_overflow, overflow)
522 DO_ERROR_NOCODE(TRAP_bounds, bounds)
523 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
524 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
525 DO_ERROR( TRAP_no_segment, segment_not_present)
526 DO_ERROR( TRAP_stack_error, stack_segment)
527 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
528 DO_ERROR( TRAP_alignment_check, alignment_check)
529 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
531 int rdmsr_hypervisor_regs(
532 uint32_t idx, uint32_t *eax, uint32_t *edx)
533 {
534 idx -= 0x40000000;
535 if ( idx > 0 )
536 return 0;
538 switch ( idx )
539 {
540 case 0:
541 {
542 *eax = *edx = 0;
543 break;
544 }
545 default:
546 BUG();
547 }
549 return 1;
550 }
552 int wrmsr_hypervisor_regs(
553 uint32_t idx, uint32_t eax, uint32_t edx)
554 {
555 struct domain *d = current->domain;
557 idx -= 0x40000000;
558 if ( idx > 0 )
559 return 0;
561 switch ( idx )
562 {
563 case 0:
564 {
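/* The 64-bit value written is (gmfn << 12) | index: the guest frame to
 * initialise as a hypercall page, plus which transfer page it is. Only
 * index 0 is accepted, matching the single page reported by
 * cpuid_hypervisor_leaves() below. */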
565 void *hypercall_page;
566 unsigned long mfn;
567 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
568 unsigned int idx = eax & 0xfff;
570 if ( idx > 0 )
571 {
572 gdprintk(XENLOG_WARNING,
573 "Out of range index %u to MSR %08x\n",
574 idx, 0x40000000);
575 return 0;
576 }
578 mfn = gmfn_to_mfn(d, gmfn);
580 if ( !mfn_valid(mfn) ||
581 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
582 {
583 gdprintk(XENLOG_WARNING,
584 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
585 gmfn, mfn, 0x40000000);
586 return 0;
587 }
589 hypercall_page = map_domain_page(mfn);
590 hypercall_page_initialise(d, hypercall_page);
591 unmap_domain_page(hypercall_page);
593 put_page_and_type(mfn_to_page(mfn));
594 break;
595 }
597 default:
598 BUG();
599 }
601 return 1;
602 }
604 int cpuid_hypervisor_leaves(
605 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
606 {
607 idx -= 0x40000000;
608 if ( idx > 2 )
609 return 0;
611 switch ( idx )
612 {
613 case 0:
614 *eax = 0x40000002; /* Largest leaf */
615 *ebx = XEN_CPUID_SIGNATURE_EBX;
616 *ecx = XEN_CPUID_SIGNATURE_ECX;
617 *edx = XEN_CPUID_SIGNATURE_EDX;
618 break;
620 case 1:
621 *eax = (xen_major_version() << 16) | xen_minor_version();
622 *ebx = 0; /* Reserved */
623 *ecx = 0; /* Reserved */
624 *edx = 0; /* Reserved */
625 break;
627 case 2:
628 *eax = 1; /* Number of hypercall-transfer pages */
629 *ebx = 0x40000000; /* MSR base address */
630 *ecx = 0; /* Features 1 */
631 *edx = 0; /* Features 2 */
632 if ( !is_hvm_vcpu(current) )
633 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
634 break;
636 default:
637 BUG();
638 }
640 return 1;
641 }
643 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
644 {
645 char sig[5], instr[2];
646 uint32_t a, b, c, d;
647 unsigned long eip, rc;
649 a = regs->eax;
650 b = regs->ebx;
651 c = regs->ecx;
652 d = regs->edx;
653 eip = regs->eip;
655 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
656 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
657 {
658 propagate_page_fault(eip + sizeof(sig) - rc, 0);
659 return EXCRET_fault_fixed;
660 }
661 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
662 return 0;
663 eip += sizeof(sig);
665 /* We only emulate CPUID. */
666 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
667 {
668 propagate_page_fault(eip + sizeof(instr) - rc, 0);
669 return EXCRET_fault_fixed;
670 }
671 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
672 return 0;
673 eip += sizeof(instr);
675 asm (
676 "cpuid"
677 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
678 : "0" (a), "1" (b), "2" (c), "3" (d) );
680 if ( (regs->eax & 0x7fffffff) == 1 )
681 {
682 /* Modify Feature Information. */
683 __clear_bit(X86_FEATURE_VME, &d);
684 __clear_bit(X86_FEATURE_PSE, &d);
685 __clear_bit(X86_FEATURE_PGE, &d);
686 __clear_bit(X86_FEATURE_MCE, &d);
687 __clear_bit(X86_FEATURE_MCA, &d);
688 if ( !IS_PRIV(current->domain) )
689 __clear_bit(X86_FEATURE_MTRR, &d);
690 __clear_bit(X86_FEATURE_PSE36, &d);
691 }
692 switch ( (uint32_t)regs->eax )
693 {
694 case 1:
695 /* Modify Feature Information. */
696 if ( !cpu_has_sep )
697 __clear_bit(X86_FEATURE_SEP, &d);
698 #ifdef __i386__
699 if ( !supervisor_mode_kernel )
700 __clear_bit(X86_FEATURE_SEP, &d);
701 #endif
702 __clear_bit(X86_FEATURE_DS, &d);
703 __clear_bit(X86_FEATURE_ACC, &d);
704 __clear_bit(X86_FEATURE_PBE, &d);
706 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
707 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
708 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
709 __clear_bit(X86_FEATURE_VMXE % 32, &c);
710 __clear_bit(X86_FEATURE_SMXE % 32, &c);
711 __clear_bit(X86_FEATURE_EST % 32, &c);
712 __clear_bit(X86_FEATURE_TM2 % 32, &c);
713 if ( is_pv_32bit_vcpu(current) )
714 __clear_bit(X86_FEATURE_CX16 % 32, &c);
715 __clear_bit(X86_FEATURE_XTPR % 32, &c);
716 __clear_bit(X86_FEATURE_PDCM % 32, &c);
717 __clear_bit(X86_FEATURE_DCA % 32, &c);
718 break;
719 case 0x80000001:
720 /* Modify Feature Information. */
721 if ( is_pv_32bit_vcpu(current) )
722 {
723 __clear_bit(X86_FEATURE_LM % 32, &d);
724 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
725 }
726 #ifndef __i386__
727 if ( is_pv_32on64_vcpu(current) &&
728 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
729 #endif
730 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
731 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
732 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
734 __clear_bit(X86_FEATURE_SVME % 32, &c);
735 __clear_bit(X86_FEATURE_OSVW % 32, &c);
736 __clear_bit(X86_FEATURE_IBS % 32, &c);
737 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
738 __clear_bit(X86_FEATURE_WDT % 32, &c);
739 break;
740 case 5: /* MONITOR/MWAIT */
741 case 0xa: /* Architectural Performance Monitor Features */
742 case 0x8000000a: /* SVM revision and features */
743 case 0x8000001b: /* Instruction Based Sampling */
744 a = b = c = d = 0;
745 break;
746 default:
747 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
748 break;
749 }
751 regs->eax = a;
752 regs->ebx = b;
753 regs->ecx = c;
754 regs->edx = d;
756 instruction_done(regs, eip, 0);
758 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
760 return EXCRET_fault_fixed;
761 }
763 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
764 {
765 struct bug_frame bug;
766 struct bug_frame_str bug_str;
767 char *filename, *predicate, *eip = (char *)regs->eip;
768 unsigned long fixup;
769 int id, lineno;
771 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
773 if ( likely(guest_mode(regs)) )
774 {
775 if ( !emulate_forced_invalid_op(regs) )
776 do_guest_trap(TRAP_invalid_op, regs, 0);
777 return;
778 }
780 if ( !is_kernel(eip) ||
781 __copy_from_user(&bug, eip, sizeof(bug)) ||
782 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
783 (bug.ret != 0xc2) )
784 goto die;
785 eip += sizeof(bug);
787 id = bug.id & 3;
789 if ( id == BUGFRAME_dump )
790 {
791 show_execution_state(regs);
792 regs->eip = (unsigned long)eip;
793 return;
794 }
796 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
797 if ( !is_kernel(eip) ||
798 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
799 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
800 goto die;
801 eip += sizeof(bug_str);
803 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
804 lineno = bug.id >> 2;
806 if ( id == BUGFRAME_warn )
807 {
808 printk("Xen WARN at %.50s:%d\n", filename, lineno);
809 show_execution_state(regs);
810 regs->eip = (unsigned long)eip;
811 return;
812 }
814 if ( id == BUGFRAME_bug )
815 {
816 printk("Xen BUG at %.50s:%d\n", filename, lineno);
817 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
818 show_execution_state(regs);
819 panic("Xen BUG at %.50s:%d\n", filename, lineno);
820 }
822 /* ASSERT: decode the predicate string pointer. */
823 ASSERT(id == BUGFRAME_assert);
824 if ( !is_kernel(eip) ||
825 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
826 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
827 goto die;
828 eip += sizeof(bug_str);
830 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
831 printk("Assertion '%s' failed at %.50s:%d\n",
832 predicate, filename, lineno);
833 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
834 show_execution_state(regs);
835 panic("Assertion '%s' failed at %.50s:%d\n",
836 predicate, filename, lineno);
838 die:
839 if ( (fixup = search_exception_table(regs->eip)) != 0 )
840 {
841 regs->eip = fixup;
842 return;
843 }
844 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
845 show_execution_state(regs);
846 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
847 }
849 asmlinkage void do_int3(struct cpu_user_regs *regs)
850 {
851 DEBUGGER_trap_entry(TRAP_int3, regs);
853 if ( !guest_mode(regs) )
854 {
855 debugger_trap_fatal(TRAP_int3, regs);
856 return;
857 }
859 do_guest_trap(TRAP_int3, regs, 0);
860 }
862 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
863 {
864 extern fastcall void (*machine_check_vector)(
865 struct cpu_user_regs *, long error_code);
866 machine_check_vector(regs, regs->error_code);
867 }
869 static void reserved_bit_page_fault(
870 unsigned long addr, struct cpu_user_regs *regs)
871 {
872 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
873 current->domain->domain_id, current->vcpu_id, regs->error_code);
874 show_page_walk(addr);
875 show_execution_state(regs);
876 }
878 void propagate_page_fault(unsigned long addr, u16 error_code)
879 {
880 struct trap_info *ti;
881 struct vcpu *v = current;
882 struct trap_bounce *tb = &v->arch.trap_bounce;
884 v->arch.guest_context.ctrlreg[2] = addr;
885 arch_set_cr2(v, addr);
887 /* Re-set error_code.user flag appropriately for the guest. */
888 error_code &= ~PFEC_user_mode;
889 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
890 error_code |= PFEC_user_mode;
892 trace_pv_page_fault(addr, error_code);
894 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
895 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
896 tb->error_code = error_code;
897 tb->cs = ti->cs;
898 tb->eip = ti->address;
899 if ( TI_GET_IF(ti) )
900 tb->flags |= TBF_INTERRUPT;
901 if ( unlikely(null_trap_bounce(v, tb)) )
902 {
903 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
904 v->domain->domain_id, v->vcpu_id, error_code);
905 show_page_walk(addr);
906 }
908 if ( unlikely(error_code & PFEC_reserved_bit) )
909 reserved_bit_page_fault(addr, guest_cpu_user_regs());
910 }
912 static int handle_gdt_ldt_mapping_fault(
913 unsigned long offset, struct cpu_user_regs *regs)
914 {
915 struct vcpu *curr = current;
916 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
917 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
918 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
920 /* Should never fault in another vcpu's area. */
921 BUG_ON(vcpu_area != curr->vcpu_id);
923 /* Byte offset within the gdt/ldt sub-area. */
924 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
926 if ( likely(is_ldt_area) )
927 {
928 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
929 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
930 {
931 if ( guest_mode(regs) )
932 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
933 regs->eip, offset);
934 }
935 else
936 {
937 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
938 if ( !guest_mode(regs) )
939 return 0;
940 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
941 propagate_page_fault(
942 curr->arch.guest_context.ldt_base + offset,
943 regs->error_code);
944 }
945 }
946 else
947 {
948 /* GDT fault: handle the fault as #GP(selector). */
949 regs->error_code = (u16)offset & ~7;
950 (void)do_general_protection(regs);
951 }
953 return EXCRET_fault_fixed;
954 }
956 #ifdef HYPERVISOR_VIRT_END
957 #define IN_HYPERVISOR_RANGE(va) \
958 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
959 #else
960 #define IN_HYPERVISOR_RANGE(va) \
961 (((va) >= HYPERVISOR_VIRT_START))
962 #endif
964 static int __spurious_page_fault(
965 unsigned long addr, struct cpu_user_regs *regs)
966 {
967 unsigned long mfn, cr3 = read_cr3();
968 #if CONFIG_PAGING_LEVELS >= 4
969 l4_pgentry_t l4e, *l4t;
970 #endif
971 #if CONFIG_PAGING_LEVELS >= 3
972 l3_pgentry_t l3e, *l3t;
973 #endif
974 l2_pgentry_t l2e, *l2t;
975 l1_pgentry_t l1e, *l1t;
976 unsigned int required_flags, disallowed_flags;
978 /*
979 * We do not take spurious page faults in IRQ handlers as we do not
980 * modify page tables in IRQ context. We therefore bail here because
981 * map_domain_page() is not IRQ-safe.
982 */
983 if ( in_irq() )
984 return 0;
986 /* Reserved bit violations are never spurious faults. */
987 if ( regs->error_code & PFEC_reserved_bit )
988 return 0;
990 required_flags = _PAGE_PRESENT;
991 if ( regs->error_code & PFEC_write_access )
992 required_flags |= _PAGE_RW;
993 if ( regs->error_code & PFEC_user_mode )
994 required_flags |= _PAGE_USER;
996 disallowed_flags = 0;
997 if ( regs->error_code & PFEC_insn_fetch )
998 disallowed_flags |= _PAGE_NX;
1000 mfn = cr3 >> PAGE_SHIFT;
1002 #if CONFIG_PAGING_LEVELS >= 4
1003 l4t = map_domain_page(mfn);
1004 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1005 mfn = l4e_get_pfn(l4e);
1006 unmap_domain_page(l4t);
1007 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1008 (l4e_get_flags(l4e) & disallowed_flags) )
1009 return 0;
1010 #endif
1012 #if CONFIG_PAGING_LEVELS >= 3
1013 l3t = map_domain_page(mfn);
1014 #ifdef CONFIG_X86_PAE
1015 l3t += (cr3 & 0xFE0UL) >> 3;
1016 #endif
1017 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1018 mfn = l3e_get_pfn(l3e);
1019 unmap_domain_page(l3t);
1020 #ifdef CONFIG_X86_PAE
1021 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1022 return 0;
1023 #else
1024 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1025 (l3e_get_flags(l3e) & disallowed_flags) )
1026 return 0;
1027 #endif
1028 #endif
1030 l2t = map_domain_page(mfn);
1031 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1032 mfn = l2e_get_pfn(l2e);
1033 unmap_domain_page(l2t);
1034 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1035 (l2e_get_flags(l2e) & disallowed_flags) )
1036 return 0;
1037 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1038 {
1039 l1e = l1e_empty(); /* define before use in debug tracing */
1040 goto spurious;
1041 }
1043 l1t = map_domain_page(mfn);
1044 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1045 mfn = l1e_get_pfn(l1e);
1046 unmap_domain_page(l1t);
1047 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1048 (l1e_get_flags(l1e) & disallowed_flags) )
1049 return 0;
1051 spurious:
1052 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1053 "at addr %lx, e/c %04x\n",
1054 current->domain->domain_id, current->vcpu_id,
1055 addr, regs->error_code);
1056 #if CONFIG_PAGING_LEVELS >= 4
1057 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1058 #endif
1059 #if CONFIG_PAGING_LEVELS >= 3
1060 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1061 #endif
1062 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1063 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1064 #ifndef NDEBUG
1065 show_registers(regs);
1066 #endif
1067 return 1;
1068 }
1070 static int spurious_page_fault(
1071 unsigned long addr, struct cpu_user_regs *regs)
1072 {
1073 unsigned long flags;
1074 int is_spurious;
1076 /*
1077 * Disabling interrupts prevents TLB flushing, and hence prevents
1078 * page tables from becoming invalid under our feet during the walk.
1079 */
1080 local_irq_save(flags);
1081 is_spurious = __spurious_page_fault(addr, regs);
1082 local_irq_restore(flags);
1084 return is_spurious;
1085 }
1087 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1088 {
1089 struct vcpu *v = current;
1090 struct domain *d = v->domain;
1092 /* No fixups in interrupt context or when interrupts are disabled. */
1093 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1094 return 0;
1096 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1097 {
1098 if ( paging_mode_external(d) && guest_mode(regs) )
1099 {
1100 int ret = paging_fault(addr, regs);
1101 if ( ret == EXCRET_fault_fixed )
1102 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1103 return ret;
1104 }
1105 if ( !(regs->error_code & PFEC_reserved_bit) &&
1106 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1107 return handle_gdt_ldt_mapping_fault(
1108 addr - GDT_LDT_VIRT_START, regs);
1109 return 0;
1110 }
1112 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1113 guest_kernel_mode(v, regs) &&
1114 /* Do not check if access-protection fault since the page may
1115 legitimately be not present in shadow page tables */
1116 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1117 PFEC_write_access) &&
1118 ptwr_do_page_fault(v, addr, regs) )
1119 return EXCRET_fault_fixed;
1121 if ( paging_mode_enabled(d) )
1122 {
1123 int ret = paging_fault(addr, regs);
1124 if ( ret == EXCRET_fault_fixed )
1125 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1126 return ret;
1127 }
1129 return 0;
1130 }
1132 /*
1133 * #PF error code:
1134 * Bit 0: Protection violation (=1) ; Page not present (=0)
1135 * Bit 1: Write access
1136 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1137 * Bit 3: Reserved bit violation
1138 * Bit 4: Instruction fetch
1139 */
1140 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1141 {
1142 unsigned long addr, fixup;
1144 addr = read_cr2();
1146 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1148 perfc_incr(page_faults);
1150 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1151 return;
1153 if ( unlikely(!guest_mode(regs)) )
1154 {
1155 if ( spurious_page_fault(addr, regs) )
1156 return;
1158 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1159 {
1160 perfc_incr(copy_user_faults);
1161 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1162 reserved_bit_page_fault(addr, regs);
1163 regs->eip = fixup;
1164 return;
1165 }
1167 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1169 show_execution_state(regs);
1170 show_page_walk(addr);
1171 panic("FATAL PAGE FAULT\n"
1172 "[error_code=%04x]\n"
1173 "Faulting linear address: %p\n",
1174 regs->error_code, _p(addr));
1175 }
1177 propagate_page_fault(addr, regs->error_code);
1178 }
1180 /*
1181 * Early #PF handler to print CR2, error code, and stack.
1183 * We also deal with spurious faults here, even though they should never happen
1184 * during early boot (an issue was seen once, but was most likely a hardware
1185 * problem).
1186 */
1187 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1188 {
1189 static int stuck;
1190 static unsigned long prev_eip, prev_cr2;
1191 unsigned long cr2 = read_cr2();
1193 BUG_ON(smp_processor_id() != 0);
1195 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1196 {
1197 prev_eip = regs->eip;
1198 prev_cr2 = cr2;
1199 stuck = 0;
1200 return;
1201 }
1203 if ( stuck++ == 1000 )
1204 {
1205 unsigned long *stk = (unsigned long *)regs;
1206 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1207 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1208 printk("Stack dump: ");
1209 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1210 printk("%p ", _p(*stk++));
1211 for ( ; ; ) ;
1212 }
1213 }
1215 long do_fpu_taskswitch(int set)
1216 {
1217 struct vcpu *v = current;
1219 if ( set )
1220 {
1221 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1222 stts();
1223 }
1224 else
1225 {
1226 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1227 if ( v->fpu_dirtied )
1228 clts();
1229 }
1231 return 0;
1232 }
1234 static int read_descriptor(unsigned int sel,
1235 const struct vcpu *v,
1236 const struct cpu_user_regs * regs,
1237 unsigned long *base,
1238 unsigned long *limit,
1239 unsigned int *ar,
1240 unsigned int vm86attr)
1242 struct desc_struct desc;
1244 if ( !vm86_mode(regs) )
1246 if ( sel < 4)
1247 desc.b = desc.a = 0;
1248 else if ( __get_user(desc,
1249 (const struct desc_struct *)(!(sel & 4)
1250 ? GDT_VIRT_START(v)
1251 : LDT_VIRT_START(v))
1252 + (sel >> 3)) )
1253 return 0;
1254 if ( !(vm86attr & _SEGMENT_CODE) )
1255 desc.b &= ~_SEGMENT_L;
1257 else
1259 desc.a = (sel << 20) | 0xffff;
1260 desc.b = vm86attr | (sel >> 12);
1263 *ar = desc.b & 0x00f0ff00;
1264 if ( !(desc.b & _SEGMENT_L) )
1266 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1267 (desc.b & 0xff000000));
1268 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1269 if ( desc.b & _SEGMENT_G )
1270 *limit = ((*limit + 1) << 12) - 1;
1271 #ifndef NDEBUG
1272 if ( !vm86_mode(regs) && (sel > 3) )
1274 unsigned int a, l;
1275 unsigned char valid;
1277 asm volatile (
1278 "larl %2,%0 ; setz %1"
1279 : "=r" (a), "=rm" (valid) : "rm" (sel));
1280 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1281 asm volatile (
1282 "lsll %2,%0 ; setz %1"
1283 : "=r" (l), "=rm" (valid) : "rm" (sel));
1284 BUG_ON(valid && (l != *limit));
1286 #endif
1288 else
1290 *base = 0UL;
1291 *limit = ~0UL;
1294 return 1;
1297 #ifdef __x86_64__
1298 static int read_gate_descriptor(unsigned int gate_sel,
1299 const struct vcpu *v,
1300 unsigned int *sel,
1301 unsigned long *off,
1302 unsigned int *ar)
1303 {
1304 struct desc_struct desc;
1305 const struct desc_struct *pdesc;
1308 pdesc = (const struct desc_struct *)
1309 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1310 + (gate_sel >> 3);
1311 if ( (gate_sel < 4) ||
1312 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1313 __get_user(desc, pdesc) )
1314 return 0;
1316 *sel = (desc.a >> 16) & 0x0000fffc;
1317 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1318 *ar = desc.b & 0x0000ffff;
1320 /*
1321 * check_descriptor() clears the DPL field and stores the
1322 * guest requested DPL in the selector's RPL field.
1323 */
1324 if ( *ar & _SEGMENT_DPL )
1325 return 0;
1326 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1328 if ( !is_pv_32bit_vcpu(v) )
1329 {
1330 if ( (*ar & 0x1f00) != 0x0c00 ||
1331 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1332 __get_user(desc, pdesc + 1) ||
1333 (desc.b & 0x1f00) )
1334 return 0;
1336 *off |= (unsigned long)desc.a << 32;
1337 return 1;
1338 }
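/* Compat (32-bit PV) guests: accept only 16-bit (type 0x4) and 32-bit
 * (type 0xC) call gates; a 16-bit gate carries just a 16-bit offset. */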
1340 switch ( *ar & 0x1f00 )
1341 {
1342 case 0x0400:
1343 *off &= 0xffff;
1344 break;
1345 case 0x0c00:
1346 break;
1347 default:
1348 return 0;
1349 }
1351 return 1;
1352 }
1353 #endif
1355 /* Has the guest requested sufficient permission for this I/O access? */
1356 static inline int guest_io_okay(
1357 unsigned int port, unsigned int bytes,
1358 struct vcpu *v, struct cpu_user_regs *regs)
1360 #if defined(__x86_64__)
1361 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1362 int user_mode = !(v->arch.flags & TF_kernel_mode);
1363 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1364 #elif defined(__i386__)
1365 #define TOGGLE_MODE() ((void)0)
1366 #endif
1368 if ( !vm86_mode(regs) &&
1369 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1370 return 1;
1372 if ( v->arch.iobmp_limit > (port + bytes) )
1374 union { uint8_t bytes[2]; uint16_t mask; } x;
1376 /*
1377 * Grab permission bytes from guest space. Inaccessible bytes are
1378 * read as 0xff (no access allowed).
1379 */
1380 TOGGLE_MODE();
1381 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1382 port>>3, 2) )
1384 default: x.bytes[0] = ~0;
1385 case 1: x.bytes[1] = ~0;
1386 case 0: break;
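/* __copy_from_guest_offset() returns the number of bytes it failed to copy,
 * so the fall-through above marks exactly the uncopied trailing bytes as
 * 0xff, i.e. "no access" for the ports they cover. */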
1388 TOGGLE_MODE();
1390 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1391 return 1;
1394 return 0;
1397 /* Has the administrator granted sufficient permission for this I/O access? */
1398 static inline int admin_io_okay(
1399 unsigned int port, unsigned int bytes,
1400 struct vcpu *v, struct cpu_user_regs *regs)
1401 {
1402 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1403 }
1405 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1406 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1407 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1408 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1409 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1410 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1412 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1413 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1414 __attribute__((__regparm__(1)));
1415 unsigned long guest_to_host_gpr_switch(unsigned long)
1416 __attribute__((__regparm__(1)));
1418 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1420 /* Instruction fetch with error handling. */
1421 #define insn_fetch(type, base, eip, limit) \
1422 ({ unsigned long _rc, _ptr = (base) + (eip); \
1423 type _x; \
1424 if ( ad_default < 8 ) \
1425 _ptr = (unsigned int)_ptr; \
1426 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1427 goto fail; \
1428 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1429 { \
1430 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1431 goto skip; \
1432 } \
1433 (eip) += sizeof(_x); _x; })
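/* A fetch beyond the code segment limit takes the 'fail' path (the caller
 * injects #GP); a faulting copy propagates #PF to the guest and takes the
 * 'skip' path, leaving %eip unchanged so the instruction is re-executed
 * once the guest has resolved the fault. */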
1435 #if defined(CONFIG_X86_32)
1436 # define read_sreg(regs, sr) ((regs)->sr)
1437 #elif defined(CONFIG_X86_64)
1438 # define read_sreg(regs, sr) read_segment_register(sr)
1439 #endif
1441 static int emulate_privileged_op(struct cpu_user_regs *regs)
1443 struct vcpu *v = current;
1444 unsigned long *reg, eip = regs->eip, res;
1445 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1446 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1447 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1448 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1449 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1450 ? regs->reg \
1451 : ad_bytes == 4 \
1452 ? (u32)regs->reg \
1453 : (u16)regs->reg)
1454 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1455 ? regs->reg = (val) \
1456 : ad_bytes == 4 \
1457 ? (*(u32 *)&regs->reg = (val)) \
1458 : (*(u16 *)&regs->reg = (val)))
1459 unsigned long code_base, code_limit;
1460 char io_emul_stub[32];
1461 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1462 u32 l, h, eax, edx;
1464 if ( !read_descriptor(regs->cs, v, regs,
1465 &code_base, &code_limit, &ar,
1466 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1467 goto fail;
1468 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1469 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1470 if ( !(ar & _SEGMENT_S) ||
1471 !(ar & _SEGMENT_P) ||
1472 !(ar & _SEGMENT_CODE) )
1473 goto fail;
1475 /* emulating only opcodes not allowing SS to be default */
1476 data_sel = read_sreg(regs, ds);
1478 /* Legacy prefixes. */
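/* Note the loop increment: 'rex == opcode || (rex = 0)' discards a stashed
 * REX prefix unless it was the byte consumed immediately beforehand, i.e.
 * REX only takes effect when it directly precedes the opcode. */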
1479 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1480 {
1481 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1482 {
1483 case 0x66: /* operand-size override */
1484 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1485 continue;
1486 case 0x67: /* address-size override */
1487 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1488 continue;
1489 case 0x2e: /* CS override */
1490 data_sel = regs->cs;
1491 continue;
1492 case 0x3e: /* DS override */
1493 data_sel = read_sreg(regs, ds);
1494 continue;
1495 case 0x26: /* ES override */
1496 data_sel = read_sreg(regs, es);
1497 continue;
1498 case 0x64: /* FS override */
1499 data_sel = read_sreg(regs, fs);
1500 lm_ovr = lm_seg_fs;
1501 continue;
1502 case 0x65: /* GS override */
1503 data_sel = read_sreg(regs, gs);
1504 lm_ovr = lm_seg_gs;
1505 continue;
1506 case 0x36: /* SS override */
1507 data_sel = regs->ss;
1508 continue;
1509 case 0xf0: /* LOCK */
1510 lock = 1;
1511 continue;
1512 case 0xf2: /* REPNE/REPNZ */
1513 case 0xf3: /* REP/REPE/REPZ */
1514 rep_prefix = 1;
1515 continue;
1516 default:
1517 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1518 {
1519 rex = opcode;
1520 continue;
1521 }
1522 break;
1523 }
1524 break;
1525 }
1527 /* REX prefix. */
1528 if ( rex & 8 ) /* REX.W */
1529 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1530 modrm_reg = (rex & 4) << 1; /* REX.R */
1531 /* REX.X does not need to be decoded. */
1532 modrm_rm = (rex & 1) << 3; /* REX.B */
1534 if ( opcode == 0x0f )
1535 goto twobyte_opcode;
1537 if ( lock )
1538 goto fail;
1540 /* Input/Output String instructions. */
1541 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1543 unsigned long data_base, data_limit;
1545 if ( rep_prefix && (rd_ad(ecx) == 0) )
1546 goto done;
1548 if ( !(opcode & 2) )
1550 data_sel = read_sreg(regs, es);
1551 lm_ovr = lm_seg_none;
1554 if ( !(ar & _SEGMENT_L) )
1556 if ( !read_descriptor(data_sel, v, regs,
1557 &data_base, &data_limit, &ar,
1558 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1559 goto fail;
1560 if ( !(ar & _SEGMENT_S) ||
1561 !(ar & _SEGMENT_P) ||
1562 (opcode & 2 ?
1563 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1564 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1565 goto fail;
1567 #ifdef CONFIG_X86_64
1568 else
1570 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1572 switch ( lm_ovr )
1574 case lm_seg_none:
1575 data_base = 0UL;
1576 break;
1577 case lm_seg_fs:
1578 data_base = v->arch.guest_context.fs_base;
1579 break;
1580 case lm_seg_gs:
1581 if ( guest_kernel_mode(v, regs) )
1582 data_base = v->arch.guest_context.gs_base_kernel;
1583 else
1584 data_base = v->arch.guest_context.gs_base_user;
1585 break;
1588 else
1589 read_descriptor(data_sel, v, regs,
1590 &data_base, &data_limit, &ar,
1591 0);
1592 data_limit = ~0UL;
1593 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1595 #endif
1597 port = (u16)regs->edx;
1599 continue_io_string:
1600 switch ( opcode )
1602 case 0x6c: /* INSB */
1603 op_bytes = 1;
1604 case 0x6d: /* INSW/INSL */
1605 if ( data_limit < op_bytes - 1 ||
1606 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1607 !guest_io_okay(port, op_bytes, v, regs) )
1608 goto fail;
1609 switch ( op_bytes )
1611 case 1:
1612 /* emulate PIT counter 2 */
1613 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1614 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1615 pv_pit_handler(port, 0, 0) : ~0));
1616 break;
1617 case 2:
1618 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1619 break;
1620 case 4:
1621 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1622 break;
1624 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1626 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1627 PFEC_write_access);
1628 return EXCRET_fault_fixed;
1630 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1631 break;
1633 case 0x6e: /* OUTSB */
1634 op_bytes = 1;
1635 case 0x6f: /* OUTSW/OUTSL */
1636 if ( data_limit < op_bytes - 1 ||
1637 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1638 !guest_io_okay(port, op_bytes, v, regs) )
1639 goto fail;
1640 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1641 if ( rc != 0 )
1643 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1644 return EXCRET_fault_fixed;
1646 switch ( op_bytes )
1648 case 1:
1649 if ( guest_outb_okay(port, v, regs) )
1651 outb((u8)data, port);
1652 if ( pv_post_outb_hook )
1653 pv_post_outb_hook(port, data);
1655 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1656 pv_pit_handler(port, data, 1);
1657 break;
1658 case 2:
1659 if ( guest_outw_okay(port, v, regs) )
1660 outw((u16)data, port);
1661 break;
1662 case 4:
1663 if ( guest_outl_okay(port, v, regs) )
1664 outl((u32)data, port);
1665 break;
1667 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1668 break;
1671 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1673 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1675 if ( !bpmatch && !hypercall_preempt_check() )
1676 goto continue_io_string;
1677 eip = regs->eip;
1680 goto done;
1683 /*
1684 * Very likely to be an I/O instruction (IN/OUT).
1685 * Build an on-stack stub to execute the instruction with full guest
1686 * GPR context. This is needed for some systems which (ab)use IN/OUT
1687 * to communicate with BIOS code in system-management mode.
1688 */
1689 #ifdef __x86_64__
1690 /* movq $host_to_guest_gpr_switch,%rcx */
1691 io_emul_stub[0] = 0x48;
1692 io_emul_stub[1] = 0xb9;
1693 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1694 /* callq *%rcx */
1695 io_emul_stub[10] = 0xff;
1696 io_emul_stub[11] = 0xd1;
1697 #else
1698 /* call host_to_guest_gpr_switch */
1699 io_emul_stub[0] = 0xe8;
1700 *(s32 *)&io_emul_stub[1] =
1701 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1702 /* 7 x nop */
1703 memset(&io_emul_stub[5], 0x90, 7);
1704 #endif
1705 /* data16 or nop */
1706 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1707 /* <io-access opcode> */
1708 io_emul_stub[13] = opcode;
1709 /* imm8 or nop */
1710 io_emul_stub[14] = 0x90;
1711 /* ret (jumps to guest_to_host_gpr_switch) */
1712 io_emul_stub[15] = 0xc3;
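/* The finished stub is therefore: call host_to_guest_gpr_switch;
 * [data16] <in/out opcode> [imm8]; ret -- so the port access runs with the
 * guest's GPR values loaded, and the final ret (via guest_to_host_gpr_switch,
 * as noted above) restores Xen's register state. */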
1714 /* Handy function-typed pointer to the stub. */
1715 io_emul = (void *)io_emul_stub;
1717 if ( ioemul_handle_quirk )
1718 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1720 /* I/O Port and Interrupt Flag instructions. */
1721 switch ( opcode )
1723 case 0xe4: /* IN imm8,%al */
1724 op_bytes = 1;
1725 case 0xe5: /* IN imm8,%eax */
1726 port = insn_fetch(u8, code_base, eip, code_limit);
1727 io_emul_stub[14] = port; /* imm8 */
1728 exec_in:
1729 if ( !guest_io_okay(port, op_bytes, v, regs) )
1730 goto fail;
1731 switch ( op_bytes )
1733 case 1:
1734 if ( guest_inb_okay(port, v, regs) )
1735 io_emul(regs);
1736 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1738 regs->eax &= ~0xffUL;
1739 regs->eax |= pv_pit_handler(port, 0, 0);
1741 else
1742 regs->eax |= (u8)~0;
1743 break;
1744 case 2:
1745 if ( guest_inw_okay(port, v, regs) )
1746 io_emul(regs);
1747 else
1748 regs->eax |= (u16)~0;
1749 break;
1750 case 4:
1751 if ( guest_inl_okay(port, v, regs) )
1752 io_emul(regs);
1753 else
1754 regs->eax = (u32)~0;
1755 break;
1757 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1758 goto done;
1760 case 0xec: /* IN %dx,%al */
1761 op_bytes = 1;
1762 case 0xed: /* IN %dx,%eax */
1763 port = (u16)regs->edx;
1764 goto exec_in;
1766 case 0xe6: /* OUT %al,imm8 */
1767 op_bytes = 1;
1768 case 0xe7: /* OUT %eax,imm8 */
1769 port = insn_fetch(u8, code_base, eip, code_limit);
1770 io_emul_stub[14] = port; /* imm8 */
1771 exec_out:
1772 if ( !guest_io_okay(port, op_bytes, v, regs) )
1773 goto fail;
1774 switch ( op_bytes )
1776 case 1:
1777 if ( guest_outb_okay(port, v, regs) )
1779 io_emul(regs);
1780 if ( pv_post_outb_hook )
1781 pv_post_outb_hook(port, regs->eax);
1783 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1784 pv_pit_handler(port, regs->eax, 1);
1785 break;
1786 case 2:
1787 if ( guest_outw_okay(port, v, regs) )
1788 io_emul(regs);
1789 break;
1790 case 4:
1791 if ( guest_outl_okay(port, v, regs) )
1792 io_emul(regs);
1793 break;
1795 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1796 goto done;
1798 case 0xee: /* OUT %al,%dx */
1799 op_bytes = 1;
1800 case 0xef: /* OUT %eax,%dx */
1801 port = (u16)regs->edx;
1802 goto exec_out;
1804 case 0xfa: /* CLI */
1805 case 0xfb: /* STI */
1806 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1807 goto fail;
1808 /*
1809 * This is just too dangerous to allow, in my opinion. Consider if the
1810 * caller then tries to reenable interrupts using POPF: we can't trap
1811 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1812 * do for us. :-)
1813 */
1814 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1815 goto done;
1818 /* No decode of this single-byte opcode. */
1819 goto fail;
1821 twobyte_opcode:
1822 /* Two-byte opcodes only emulated from guest kernel. */
1823 if ( !guest_kernel_mode(v, regs) )
1824 goto fail;
1826 /* Privileged (ring 0) instructions. */
1827 opcode = insn_fetch(u8, code_base, eip, code_limit);
1828 if ( lock && (opcode & ~3) != 0x20 )
1829 goto fail;
1830 switch ( opcode )
1832 case 0x06: /* CLTS */
1833 (void)do_fpu_taskswitch(0);
1834 break;
1836 case 0x09: /* WBINVD */
1837 /* Ignore the instruction if unprivileged. */
1838 if ( !cache_flush_permitted(v->domain) )
1839 /* Non-physdev domain attempted WBINVD; ignore for now since
1840 newer linux uses this in some start-of-day timing loops */
1842 else
1843 wbinvd();
1844 break;
1846 case 0x20: /* MOV CR?,<reg> */
1847 opcode = insn_fetch(u8, code_base, eip, code_limit);
1848 if ( opcode < 0xc0 )
1849 goto fail;
1850 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1851 modrm_rm |= (opcode >> 0) & 7;
1852 reg = decode_register(modrm_rm, regs, 0);
1853 switch ( modrm_reg )
1855 case 0: /* Read CR0 */
1856 *reg = (read_cr0() & ~X86_CR0_TS) |
1857 v->arch.guest_context.ctrlreg[0];
1858 break;
1860 case 2: /* Read CR2 */
1861 *reg = v->arch.guest_context.ctrlreg[2];
1862 break;
1864 case 3: /* Read CR3 */
1865 if ( !is_pv_32on64_vcpu(v) )
1866 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1867 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1868 #ifdef CONFIG_COMPAT
1869 else
1870 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1871 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1872 #endif
1873 break;
1875 case 4: /* Read CR4 */
1876 /*
1877 * Guests can read CR4 to see what features Xen has enabled. We
1878 * therefore lie about PGE & PSE as they are unavailable to guests.
1879 */
1880 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1881 break;
1883 default:
1884 goto fail;
1886 break;
1888 case 0x21: /* MOV DR?,<reg> */
1889 opcode = insn_fetch(u8, code_base, eip, code_limit);
1890 if ( opcode < 0xc0 )
1891 goto fail;
1892 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1893 modrm_rm |= (opcode >> 0) & 7;
1894 reg = decode_register(modrm_rm, regs, 0);
1895 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1896 goto fail;
1897 *reg = res;
1898 break;
1900 case 0x22: /* MOV <reg>,CR? */
1901 opcode = insn_fetch(u8, code_base, eip, code_limit);
1902 if ( opcode < 0xc0 )
1903 goto fail;
1904 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1905 modrm_rm |= (opcode >> 0) & 7;
1906 reg = decode_register(modrm_rm, regs, 0);
1907 switch ( modrm_reg )
1909 case 0: /* Write CR0 */
1910 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1912 gdprintk(XENLOG_WARNING,
1913 "Attempt to change unmodifiable CR0 flags.\n");
1914 goto fail;
1916 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1917 break;
1919 case 2: /* Write CR2 */
1920 v->arch.guest_context.ctrlreg[2] = *reg;
1921 arch_set_cr2(v, *reg);
1922 break;
1924 case 3: /* Write CR3 */
1925 LOCK_BIGLOCK(v->domain);
1926 if ( !is_pv_32on64_vcpu(v) )
1927 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1928 #ifdef CONFIG_COMPAT
1929 else
1930 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1931 #endif
1932 UNLOCK_BIGLOCK(v->domain);
1933 if ( rc == 0 ) /* not okay */
1934 goto fail;
1935 break;
1937 case 4: /* Write CR4 */
1938 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1939 write_cr4(pv_guest_cr4_to_real_cr4(
1940 v->arch.guest_context.ctrlreg[4]));
1941 break;
1943 default:
1944 goto fail;
1946 break;
1948 case 0x23: /* MOV <reg>,DR? */
1949 opcode = insn_fetch(u8, code_base, eip, code_limit);
1950 if ( opcode < 0xc0 )
1951 goto fail;
1952 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1953 modrm_rm |= (opcode >> 0) & 7;
1954 reg = decode_register(modrm_rm, regs, 0);
1955 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1956 goto fail;
1957 break;
1959 case 0x30: /* WRMSR */
1960 eax = regs->eax;
1961 edx = regs->edx;
1962 res = ((u64)edx << 32) | eax;
1963 switch ( (u32)regs->ecx )
1965 #ifdef CONFIG_X86_64
1966 case MSR_FS_BASE:
1967 if ( is_pv_32on64_vcpu(v) )
1968 goto fail;
1969 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1970 goto fail;
1971 v->arch.guest_context.fs_base = res;
1972 break;
1973 case MSR_GS_BASE:
1974 if ( is_pv_32on64_vcpu(v) )
1975 goto fail;
1976 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1977 goto fail;
1978 v->arch.guest_context.gs_base_kernel = res;
1979 break;
1980 case MSR_SHADOW_GS_BASE:
1981 if ( is_pv_32on64_vcpu(v) )
1982 goto fail;
1983 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1984 goto fail;
1985 v->arch.guest_context.gs_base_user = res;
1986 break;
1987 #endif
1988 case MSR_K7_FID_VID_STATUS:
1989 case MSR_K7_FID_VID_CTL:
1990 case MSR_K8_PSTATE_LIMIT:
1991 case MSR_K8_PSTATE_CTRL:
1992 case MSR_K8_PSTATE_STATUS:
1993 case MSR_K8_PSTATE0:
1994 case MSR_K8_PSTATE1:
1995 case MSR_K8_PSTATE2:
1996 case MSR_K8_PSTATE3:
1997 case MSR_K8_PSTATE4:
1998 case MSR_K8_PSTATE5:
1999 case MSR_K8_PSTATE6:
2000 case MSR_K8_PSTATE7:
2001 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2002 goto fail;
2003 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2004 break;
2005 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2006 goto fail;
2007 break;
2008 case MSR_IA32_PERF_CTL:
2009 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2010 goto fail;
2011 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2012 break;
2013 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2014 goto fail;
2015 break;
2016 default:
2017 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2018 break;
2019 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2020 (eax != l) || (edx != h) )
2021 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2022 "%08x:%08x to %08x:%08x.\n",
2023 _p(regs->ecx), h, l, edx, eax);
2024 break;
2026 break;
2028 case 0x31: /* RDTSC */
2029 rdtsc(regs->eax, regs->edx);
2030 break;
2032 case 0x32: /* RDMSR */
2033 switch ( (u32)regs->ecx )
2035 #ifdef CONFIG_X86_64
2036 case MSR_FS_BASE:
2037 if ( is_pv_32on64_vcpu(v) )
2038 goto fail;
2039 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2040 regs->edx = v->arch.guest_context.fs_base >> 32;
2041 break;
2042 case MSR_GS_BASE:
2043 if ( is_pv_32on64_vcpu(v) )
2044 goto fail;
2045 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2046 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2047 break;
2048 case MSR_SHADOW_GS_BASE:
2049 if ( is_pv_32on64_vcpu(v) )
2050 goto fail;
2051 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2052 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2053 break;
2054 #endif
2055 case MSR_K7_FID_VID_CTL:
2056 case MSR_K7_FID_VID_STATUS:
2057 case MSR_K8_PSTATE_LIMIT:
2058 case MSR_K8_PSTATE_CTRL:
2059 case MSR_K8_PSTATE_STATUS:
2060 case MSR_K8_PSTATE0:
2061 case MSR_K8_PSTATE1:
2062 case MSR_K8_PSTATE2:
2063 case MSR_K8_PSTATE3:
2064 case MSR_K8_PSTATE4:
2065 case MSR_K8_PSTATE5:
2066 case MSR_K8_PSTATE6:
2067 case MSR_K8_PSTATE7:
2068 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2069 goto fail;
2070 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2072 regs->eax = regs->edx = 0;
2073 break;
2075 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2076 goto fail;
2077 break;
2078 case MSR_EFER:
2079 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2080 goto fail;
2081 break;
2082 case MSR_IA32_MISC_ENABLE:
2083 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2084 goto fail;
2085 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2086 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2087 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2088 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2089 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2090 break;
2091 default:
2092 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2093 {
2094 regs->eax = l;
2095 regs->edx = h;
2096 break;
2097 }
2098 /* Everyone can read the MSR space. */
2099 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2100 _p(regs->ecx));*/
2101 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2102 goto fail;
2103 break;
2104 }
2105 break;
2107 default:
2108 goto fail;
2111 #undef wr_ad
2112 #undef rd_ad
2114 done:
2115 instruction_done(regs, eip, bpmatch);
2116 skip:
2117 return EXCRET_fault_fixed;
2119 fail:
2120 return 0;
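/*
 * check_stack_limit() returns non-zero if pushing 'decr' bytes at
 * 'esp' stays inside the stack segment: the first clause rejects
 * wrap-around, the second applies the limit the right way round for
 * normal versus expand-down (_SEGMENT_EC) segments. E.g. a 4-byte
 * push at esp 0x1000 against an expand-up limit of 0xffff checks
 * 0xffc < 0xfff and 0xfff <= 0xffff.
 */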
2123 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2124 unsigned int esp, unsigned int decr)
2126 return (((esp - decr) < (esp - 1)) &&
2127 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
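/*
 * emulate_gate_op() is reached from do_general_protection() when a
 * 32-on-64 PV guest far-calls or far-jumps through a call gate; such
 * gates cannot be used directly while the guest runs under a 64-bit
 * Xen, so the access faults with the gate selector in the error code
 * and the control transfer is emulated here: decode the instruction,
 * validate the gate and target segments, and build the new frame.
 */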
2130 static void emulate_gate_op(struct cpu_user_regs *regs)
2132 #ifdef __x86_64__
2133 struct vcpu *v = current;
2134 unsigned int sel, ar, dpl, nparm, opnd_sel;
2135 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2136 unsigned long off, eip, opnd_off, base, limit;
2137 int jump;
2139 /* Check whether this fault is due to the use of a call gate. */
2140 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2141 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2142 ((ar & _SEGMENT_TYPE) != 0xc00) )
2144 do_guest_trap(TRAP_gp_fault, regs, 1);
2145 return;
2147 if ( !(ar & _SEGMENT_P) )
2149 do_guest_trap(TRAP_no_segment, regs, 1);
2150 return;
2152 dpl = (ar >> 13) & 3;
2153 nparm = ar & 0x1f;
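/*
 * Gate descriptor fields: sel:off is the target entry point, dpl the
 * gate's privilege level, and nparm the 5-bit count (0-31) of
 * parameter dwords to copy when calling to a more-privileged ring.
 */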
2155 /*
2156 * Decode instruction (and perhaps operand) to determine RPL,
2157 * whether this is a jump or a call, and the call return offset.
2158 */
2159 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2160 !(ar & _SEGMENT_S) ||
2161 !(ar & _SEGMENT_P) ||
2162 !(ar & _SEGMENT_CODE) )
2164 do_guest_trap(TRAP_gp_fault, regs, 1);
2165 return;
2168 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2169 ad_default = ad_bytes = op_default;
2170 opnd_sel = opnd_off = 0;
2171 jump = -1;
2172 for ( eip = regs->eip; eip - regs->_eip < 10; )
2174 switch ( insn_fetch(u8, base, eip, limit) )
2176 case 0x66: /* operand-size override */
2177 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2178 continue;
2179 case 0x67: /* address-size override */
2180 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2181 continue;
2182 case 0x2e: /* CS override */
2183 opnd_sel = regs->cs;
2184 ASSERT(opnd_sel);
2185 continue;
2186 case 0x3e: /* DS override */
2187 opnd_sel = read_sreg(regs, ds);
2188 if ( !opnd_sel )
2189 opnd_sel = dpl;
2190 continue;
2191 case 0x26: /* ES override */
2192 opnd_sel = read_sreg(regs, es);
2193 if ( !opnd_sel )
2194 opnd_sel = dpl;
2195 continue;
2196 case 0x64: /* FS override */
2197 opnd_sel = read_sreg(regs, fs);
2198 if ( !opnd_sel )
2199 opnd_sel = dpl;
2200 continue;
2201 case 0x65: /* GS override */
2202 opnd_sel = read_sreg(regs, gs);
2203 if ( !opnd_sel )
2204 opnd_sel = dpl;
2205 continue;
2206 case 0x36: /* SS override */
2207 opnd_sel = regs->ss;
2208 if ( !opnd_sel )
2209 opnd_sel = dpl;
2210 continue;
2211 case 0xea:
2212 ++jump;
2213 /* FALLTHROUGH */
2214 case 0x9a:
2215 ++jump;
2216 opnd_sel = regs->cs;
2217 opnd_off = eip;
2218 ad_bytes = ad_default;
2219 eip += op_bytes + 2;
2220 break;
2221 case 0xff:
2223 unsigned int modrm;
2225 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2227 case 0x28: case 0x68: case 0xa8:
2228 ++jump;
2229 /* FALLTHROUGH */
2230 case 0x18: case 0x58: case 0x98:
2231 ++jump;
2232 if ( ad_bytes != 2 )
2234 if ( (modrm & 7) == 4 )
2236 unsigned int sib;
2237 sib = insn_fetch(u8, base, eip, limit);
2239 modrm = (modrm & ~7) | (sib & 7);
2240 if ( (sib >>= 3) != 4 )
2241 opnd_off = *(unsigned long *)
2242 decode_register(sib & 7, regs, 0);
2243 opnd_off <<= sib >> 3;
2245 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2246 opnd_off += *(unsigned long *)
2247 decode_register(modrm & 7, regs, 0);
2248 else
2249 modrm |= 0x87;
2250 if ( !opnd_sel )
2252 switch ( modrm & 7 )
2254 default:
2255 opnd_sel = read_sreg(regs, ds);
2256 break;
2257 case 4: case 5:
2258 opnd_sel = regs->ss;
2259 break;
2263 else
2265 switch ( modrm & 7 )
2267 case 0: case 1: case 7:
2268 opnd_off = regs->ebx;
2269 break;
2270 case 6:
2271 if ( !(modrm & 0xc0) )
2272 modrm |= 0x80;
2273 else
2274 case 2: case 3:
2276 opnd_off = regs->ebp;
2277 if ( !opnd_sel )
2278 opnd_sel = regs->ss;
2280 break;
2282 if ( !opnd_sel )
2283 opnd_sel = read_sreg(regs, ds);
2284 switch ( modrm & 7 )
2286 case 0: case 2: case 4:
2287 opnd_off += regs->esi;
2288 break;
2289 case 1: case 3: case 5:
2290 opnd_off += regs->edi;
2291 break;
2294 switch ( modrm & 0xc0 )
2296 case 0x40:
2297 opnd_off += insn_fetch(s8, base, eip, limit);
2298 break;
2299 case 0x80:
2300 opnd_off += insn_fetch(s32, base, eip, limit);
2301 break;
2303 if ( ad_bytes == 4 )
2304 opnd_off = (unsigned int)opnd_off;
2305 else if ( ad_bytes == 2 )
2306 opnd_off = (unsigned short)opnd_off;
2307 break;
2310 break;
2312 break;
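/*
 * After the decode loop, jump == 0 means a far CALL was found and
 * jump == 1 a far JMP; if it is still -1 no far-transfer opcode was
 * decoded within the 10-byte window and the guest just gets #GP. For
 * the indirect FF /3 and FF /5 forms, opnd_sel:opnd_off locate the
 * far-pointer operand in guest memory.
 */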
2315 if ( jump < 0 )
2316 {
2317 fail:
2318 do_guest_trap(TRAP_gp_fault, regs, 1);
2319 skip:
2320 return;
2321 }
2323 if ( (opnd_sel != regs->cs &&
2324 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2325 !(ar & _SEGMENT_S) ||
2326 !(ar & _SEGMENT_P) ||
2327 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2329 do_guest_trap(TRAP_gp_fault, regs, 1);
2330 return;
2333 opnd_off += op_bytes;
2334 #define ad_default ad_bytes
2335 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2336 #undef ad_default
2337 ASSERT((opnd_sel & ~3) == regs->error_code);
2338 if ( dpl < (opnd_sel & 3) )
2340 do_guest_trap(TRAP_gp_fault, regs, 1);
2341 return;
2344 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2345 !(ar & _SEGMENT_S) ||
2346 !(ar & _SEGMENT_CODE) ||
2347 (!jump || (ar & _SEGMENT_EC) ?
2348 ((ar >> 13) & 3) > (regs->cs & 3) :
2349 ((ar >> 13) & 3) != (regs->cs & 3)) )
2351 regs->error_code = sel;
2352 do_guest_trap(TRAP_gp_fault, regs, 1);
2353 return;
2355 if ( !(ar & _SEGMENT_P) )
2357 regs->error_code = sel;
2358 do_guest_trap(TRAP_no_segment, regs, 1);
2359 return;
2361 if ( off > limit )
2363 regs->error_code = 0;
2364 do_guest_trap(TRAP_gp_fault, regs, 1);
2365 return;
2368 if ( !jump )
2370 unsigned int ss, esp, *stkp;
2371 int rc;
2372 #define push(item) do \
2373 { \
2374 --stkp; \
2375 esp -= 4; \
2376 rc = __put_user(item, stkp); \
2377 if ( rc ) \
2378 { \
2379 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2380 PFEC_write_access); \
2381 return; \
2382 } \
2383 } while ( 0 )
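/*
 * Far CALL through the gate: when the target segment is more
 * privileged than the caller, switch to the stack the guest registered
 * for its kernel ring (kernel_ss/kernel_sp), push the old SS:ESP, copy
 * across the gate's parameter dwords, then push the return CS:EIP.
 * For a same-privilege call, CS:EIP is pushed on the current stack.
 */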
2385 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2387 sel |= (ar >> 13) & 3;
2388 /* Inner stack known only for kernel ring. */
2389 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2391 do_guest_trap(TRAP_gp_fault, regs, 1);
2392 return;
2394 esp = v->arch.guest_context.kernel_sp;
2395 ss = v->arch.guest_context.kernel_ss;
2396 if ( (ss & 3) != (sel & 3) ||
2397 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2398 ((ar >> 13) & 3) != (sel & 3) ||
2399 !(ar & _SEGMENT_S) ||
2400 (ar & _SEGMENT_CODE) ||
2401 !(ar & _SEGMENT_WR) )
2403 regs->error_code = ss & ~3;
2404 do_guest_trap(TRAP_invalid_tss, regs, 1);
2405 return;
2407 if ( !(ar & _SEGMENT_P) ||
2408 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2410 regs->error_code = ss & ~3;
2411 do_guest_trap(TRAP_stack_error, regs, 1);
2412 return;
2414 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2415 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2417 do_guest_trap(TRAP_gp_fault, regs, 1);
2418 return;
2420 push(regs->ss);
2421 push(regs->esp);
2422 if ( nparm )
2424 const unsigned int *ustkp;
2426 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2427 ((ar >> 13) & 3) != (regs->cs & 3) ||
2428 !(ar & _SEGMENT_S) ||
2429 (ar & _SEGMENT_CODE) ||
2430 !(ar & _SEGMENT_WR) ||
2431 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2432 return do_guest_trap(TRAP_gp_fault, regs, 1);
2433 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2434 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2436 do_guest_trap(TRAP_gp_fault, regs, 1);
2437 return;
2439 do
2441 unsigned int parm;
2443 --ustkp;
2444 rc = __get_user(parm, ustkp);
2445 if ( rc )
2447 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2448 return;
2450 push(parm);
2451 } while ( --nparm );
2454 else
2456 sel |= (regs->cs & 3);
2457 esp = regs->esp;
2458 ss = regs->ss;
2459 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2460 ((ar >> 13) & 3) != (sel & 3) )
2462 do_guest_trap(TRAP_gp_fault, regs, 1);
2463 return;
2465 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2467 regs->error_code = 0;
2468 do_guest_trap(TRAP_stack_error, regs, 1);
2469 return;
2471 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2472 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2474 do_guest_trap(TRAP_gp_fault, regs, 1);
2475 return;
2478 push(regs->cs);
2479 push(eip);
2480 #undef push
2481 regs->esp = esp;
2482 regs->ss = ss;
2484 else
2485 sel |= (regs->cs & 3);
2487 regs->cs = sel;
2488 instruction_done(regs, off, 0);
2489 #endif
2492 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2494 struct vcpu *v = current;
2495 unsigned long fixup;
2497 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2499 if ( regs->error_code & 1 )
2500 goto hardware_gp;
2502 if ( !guest_mode(regs) )
2503 goto gp_in_kernel;
2505 /*
2506 * Cunning trick to allow arbitrary "INT n" handling.
2508 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2509 * instruction from trapping to the appropriate vector, when that might not
2510 * be expected by Xen or the guest OS. For example, that entry might be for
2511 * a fault handler (unlike traps, faults don't increment EIP), or might
2512 * expect an error code on the stack (which a software trap never
2513 * provides), or might be a hardware interrupt handler that doesn't like
2514 * being called spuriously.
2516 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2517 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2518 * clear to indicate that it's a software fault, not hardware.
2520 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2521 * okay because they can only be triggered by an explicit DPL-checked
2522 * instruction. The DPL specified by the guest OS for these vectors is NOT
2523 * CHECKED!!
2524 */
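/*
 * Example: INT 0x80 executed by the guest arrives here with
 * error_code = (0x80 << 3) | 2 = 0x402; the vector is recovered below
 * as error_code >> 3 and, if the guest's virtual IDT permits it from
 * the current privilege level, the trap is bounced to the guest with
 * EIP advanced past the two-byte instruction.
 */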
2525 if ( (regs->error_code & 3) == 2 )
2526 {
2527 /* This fault must be due to an <INT n> instruction. */
2528 const struct trap_info *ti;
2529 unsigned char vector = regs->error_code >> 3;
2530 ti = &v->arch.guest_context.trap_ctxt[vector];
2531 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2532 {
2533 regs->eip += 2;
2534 do_guest_trap(vector, regs, 0);
2535 return;
2536 }
2537 }
2538 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2539 {
2540 emulate_gate_op(regs);
2541 return;
2542 }
2544 /* Emulate some simple privileged and I/O instructions. */
2545 if ( (regs->error_code == 0) &&
2546 emulate_privileged_op(regs) )
2547 {
2548 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2549 return;
2550 }
2552 #if defined(__i386__)
2553 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2554 (regs->error_code == 0) &&
2555 gpf_emulate_4gb(regs) )
2556 {
2557 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2558 return;
2559 }
2560 #endif
2562 /* Pass on GPF as is. */
2563 do_guest_trap(TRAP_gp_fault, regs, 1);
2564 return;
2566 gp_in_kernel:
2568 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2570 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2571 regs->error_code, _p(regs->eip), _p(fixup));
2572 regs->eip = fixup;
2573 return;
2576 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2578 hardware_gp:
2579 show_execution_state(regs);
2580 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2583 static void nmi_softirq(void)
2585 /* Only used to defer wakeup of dom0's vcpu0 to a safe (non-NMI) context. */
2586 vcpu_kick(dom0->vcpu[0]);
2589 static void nmi_dom0_report(unsigned int reason_idx)
2591 struct domain *d;
2592 struct vcpu *v;
2594 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2595 return;
2597 set_bit(reason_idx, nmi_reason(d));
2599 if ( !test_and_set_bool(v->nmi_pending) )
2600 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2603 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2605 switch ( opt_nmi[0] )
2607 case 'd': /* 'dom0' */
2608 nmi_dom0_report(_XEN_NMIREASON_parity_error);
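/* fall through: after reporting to dom0, also ignore locally. */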
2609 case 'i': /* 'ignore' */
2610 break;
2611 default: /* 'fatal' */
2612 console_force_unlock();
2613 printk("\n\nNMI - MEMORY ERROR\n");
2614 fatal_trap(TRAP_nmi, regs);
2617 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2618 mdelay(1);
2619 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2622 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2624 switch ( opt_nmi[0] )
2626 case 'd': /* 'dom0' */
2627 nmi_dom0_report(_XEN_NMIREASON_io_error);
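/* fall through */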
2628 case 'i': /* 'ignore' */
2629 break;
2630 default: /* 'fatal' */
2631 console_force_unlock();
2632 printk("\n\nNMI - I/O ERROR\n");
2633 fatal_trap(TRAP_nmi, regs);
2636 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2637 mdelay(1);
2638 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2641 static void unknown_nmi_error(unsigned char reason)
2643 switch ( opt_nmi[0] )
2645 case 'd': /* 'dom0' */
2646 nmi_dom0_report(_XEN_NMIREASON_unknown);
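/* fall through */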
2647 case 'i': /* 'ignore' */
2648 break;
2649 default: /* 'fatal' */
2650 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2651 printk("Dazed and confused, but trying to continue\n");
2652 printk("Do you have a strange power saving mode enabled?\n");
2653 kexec_crash();
2657 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2659 return 0;
2662 static nmi_callback_t nmi_callback = dummy_nmi_callback;
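/*
 * NMI dispatch: a registered callback (e.g. the oprofile or crash
 * shutdown code) gets first refusal. Otherwise the watchdog is ticked
 * and, on the boot CPU only, port 0x61 is sampled to classify the NMI
 * as a memory parity error, an I/O check error, or unknown.
 */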
2664 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2666 unsigned int cpu = smp_processor_id();
2667 unsigned char reason;
2669 ++nmi_count(cpu);
2671 if ( nmi_callback(regs, cpu) )
2672 return;
2674 if ( nmi_watchdog )
2675 nmi_watchdog_tick(regs);
2677 /* Only the BSP gets external NMIs from the system. */
2678 if ( cpu == 0 )
2679 {
2680 reason = inb(0x61);
2681 if ( reason & 0x80 )
2682 mem_parity_error(regs);
2683 else if ( reason & 0x40 )
2684 io_check_error(regs);
2685 else if ( !nmi_watchdog )
2686 unknown_nmi_error((unsigned char)(reason&0xff));
2687 }
2690 void set_nmi_callback(nmi_callback_t callback)
2692 nmi_callback = callback;
2695 void unset_nmi_callback(void)
2697 nmi_callback = dummy_nmi_callback;
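/*
 * #NM (device not available): FPU state is switched lazily, so the
 * first FPU/SSE use after a context switch lands here. setup_fpu()
 * restores the state; the trap is additionally reflected to the guest
 * only if the guest itself had CR0.TS set, and the recorded TS bit is
 * cleared at the same time.
 */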
2700 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2702 struct vcpu *curr = current;
2704 BUG_ON(!guest_mode(regs));
2706 setup_fpu(curr);
2708 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2709 {
2710 do_guest_trap(TRAP_no_device, regs, 0);
2711 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2712 }
2713 else
2714 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2716 return;
2719 asmlinkage void do_debug(struct cpu_user_regs *regs)
2721 struct vcpu *v = current;
2723 DEBUGGER_trap_entry(TRAP_debug, regs);
2725 if ( !guest_mode(regs) )
2726 {
2727 if ( regs->eflags & EF_TF )
2728 {
2729 #ifdef __x86_64__
2730 void sysenter_entry(void);
2731 void sysenter_eflags_saved(void);
2732 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2733 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2734 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2735 goto out;
2736 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2737 #else
2738 WARN_ON(1);
2739 #endif
2740 regs->eflags &= ~EF_TF;
2741 }
2742 else
2743 {
2744 /*
2745 * We ignore watchpoints when they trigger within Xen. This may
2746 * happen when a buffer is passed to us which previously had a
2747 * watchpoint set on it. No need to bump EIP; the only faulting
2748 * trap is an instruction breakpoint, which can't happen to us.
2749 */
2750 WARN_ON(!search_exception_table(regs->eip));
2751 }
2752 goto out;
2753 }
2755 /* Save debug status register where the guest OS can peek at it. */
2756 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2758 ler_enable();
2759 do_guest_trap(TRAP_debug, regs, 0);
2760 return;
2762 out:
2763 ler_enable();
2764 return;
2767 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2771 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2773 int i;
2774 /* Keep secondary tables in sync with IRQ updates. */
2775 for ( i = 1; i < NR_CPUS; i++ )
2776 if ( idt_tables[i] != NULL )
2777 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2778 _set_gate(&idt_table[n], 14, dpl, addr);
2781 static void set_swint_gate(unsigned int n, void *addr)
2783 __set_intr_gate(n, 3, addr);
2786 void set_intr_gate(unsigned int n, void *addr)
2788 __set_intr_gate(n, 0, addr);
2791 void set_tss_desc(unsigned int n, void *addr)
2793 _set_tssldt_desc(
2794 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2795 (unsigned long)addr,
2796 offsetof(struct tss_struct, __cacheline_filler) - 1,
2797 9);
2798 #ifdef CONFIG_COMPAT
2799 _set_tssldt_desc(
2800 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2801 (unsigned long)addr,
2802 offsetof(struct tss_struct, __cacheline_filler) - 1,
2803 11);
2804 #endif
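/*
 * Optional "last exception record" reporting: when opt_ler is given,
 * choose the vendor/family-specific MSR holding the last exception
 * source IP so ler_enable() can switch the facility on. Unlisted
 * CPU families leave ler_msr at zero and LER stays disabled.
 */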
2807 void __devinit percpu_traps_init(void)
2809 subarch_percpu_traps_init();
2811 if ( !opt_ler )
2812 return;
2814 switch ( boot_cpu_data.x86_vendor )
2816 case X86_VENDOR_INTEL:
2817 switch ( boot_cpu_data.x86 )
2819 case 6:
2820 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2821 break;
2822 case 15:
2823 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2824 break;
2826 break;
2827 case X86_VENDOR_AMD:
2828 switch ( boot_cpu_data.x86 )
2830 case 6:
2831 case 15:
2832 case 16:
2833 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2834 break;
2836 break;
2839 ler_enable();
2842 void __init trap_init(void)
2844 /*
2845 * Note that interrupt gates are always used, rather than trap gates. We
2846 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2847 * first activation must have the "bad" value(s) for these registers and
2848 * we may lose them if another activation is installed before they are
2849 * saved. The page-fault handler also needs interrupts disabled until %cr2
2850 * has been read and saved on the stack.
2851 */
2852 set_intr_gate(TRAP_divide_error,&divide_error);
2853 set_intr_gate(TRAP_debug,&debug);
2854 set_intr_gate(TRAP_nmi,&nmi);
2855 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
2856 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2857 set_intr_gate(TRAP_bounds,&bounds);
2858 set_intr_gate(TRAP_invalid_op,&invalid_op);
2859 set_intr_gate(TRAP_no_device,&device_not_available);
2860 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2861 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2862 set_intr_gate(TRAP_no_segment,&segment_not_present);
2863 set_intr_gate(TRAP_stack_error,&stack_segment);
2864 set_intr_gate(TRAP_gp_fault,&general_protection);
2865 set_intr_gate(TRAP_page_fault,&page_fault);
2866 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2867 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2868 set_intr_gate(TRAP_alignment_check,&alignment_check);
2869 set_intr_gate(TRAP_machine_check,&machine_check);
2870 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2872 /* CPU0 uses the master IDT. */
2873 idt_tables[0] = idt_table;
2875 percpu_traps_init();
2877 cpu_init();
2879 open_softirq(NMI_SOFTIRQ, nmi_softirq);
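/*
 * Guest NMI callback registration: the handler is recorded in the
 * vcpu's virtual IDT slot for vector 2, flagged with TI_SET_IF so it
 * is entered with event delivery disabled, interrupt-gate style.
 * Registering on vcpu0 re-raises any NMI reason bits that were latched
 * while no handler was present.
 */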
2882 long register_guest_nmi_callback(unsigned long address)
2884 struct vcpu *v = current;
2885 struct domain *d = v->domain;
2886 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2888 t->vector = TRAP_nmi;
2889 t->flags = 0;
2890 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2891 t->address = address;
2892 TI_SET_IF(t, 1);
2894 /*
2895 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2896 * now.
2897 */
2898 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2899 v->nmi_pending = 1;
2901 return 0;
2904 long unregister_guest_nmi_callback(void)
2906 struct vcpu *v = current;
2907 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2909 memset(t, 0, sizeof(*t));
2911 return 0;
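/*
 * HYPERVISOR_set_trap_table: copy guest-supplied virtual IDT entries
 * into trap_ctxt, stopping at a zero-address terminator; a NULL handle
 * clears the whole table. The loop is preemptible, resuming through a
 * hypercall continuation, and vector 0x80 also refreshes the direct
 * int80 trap setup.
 */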
2914 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
2916 struct trap_info cur;
2917 struct vcpu *curr = current;
2918 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2919 long rc = 0;
2921 /* If no table is presented then clear the entire virtual IDT. */
2922 if ( guest_handle_is_null(traps) )
2924 memset(dst, 0, 256 * sizeof(*dst));
2925 init_int80_direct_trap(curr);
2926 return 0;
2929 for ( ; ; )
2931 if ( hypercall_preempt_check() )
2932 {
2933 rc = hypercall_create_continuation(
2934 __HYPERVISOR_set_trap_table, "h", traps);
2935 break;
2936 }
2938 if ( copy_from_guest(&cur, traps, 1) )
2939 {
2940 rc = -EFAULT;
2941 break;
2942 }
2944 if ( cur.address == 0 )
2945 break;
2947 fixup_guest_code_selector(curr->domain, cur.cs);
2949 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2951 if ( cur.vector == 0x80 )
2952 init_int80_direct_trap(curr);
2954 guest_handle_add_offset(traps, 1);
2957 return rc;
2960 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2962 int i;
2963 struct vcpu *curr = current;
2965 switch ( reg )
2967 case 0:
2968 if ( !access_ok(value, sizeof(long)) )
2969 return -EPERM;
2970 if ( v == curr )
2971 write_debugreg(0, value);
2972 break;
2973 case 1:
2974 if ( !access_ok(value, sizeof(long)) )
2975 return -EPERM;
2976 if ( v == curr )
2977 write_debugreg(1, value);
2978 break;
2979 case 2:
2980 if ( !access_ok(value, sizeof(long)) )
2981 return -EPERM;
2982 if ( v == curr )
2983 write_debugreg(2, value);
2984 break;
2985 case 3:
2986 if ( !access_ok(value, sizeof(long)) )
2987 return -EPERM;
2988 if ( v == curr )
2989 write_debugreg(3, value);
2990 break;
2991 case 6:
2992 /*
2993 * DR6: Bits 4-11,16-31 reserved (set to 1).
2994 * Bit 12 reserved (set to 0).
2995 */
2996 value &= 0xffffefff; /* reserved bits => 0 */
2997 value |= 0xffff0ff0; /* reserved bits => 1 */
2998 if ( v == curr )
2999 write_debugreg(6, value);
3000 break;
3001 case 7:
3002 /*
3003 * DR7: Bit 10 reserved (set to 1).
3004 * Bits 11-12,14-15 reserved (set to 0).
3005 */
3006 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3007 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3008 /*
3009 * Privileged bits:
3010 * GD (bit 13): must be 0.
3011 */
3012 if ( value & DR_GENERAL_DETECT )
3013 return -EPERM;
3014 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3015 if ( value & DR7_ACTIVE_MASK )
3016 {
3017 unsigned int io_enable = 0;
3019 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3020 {
3021 if ( ((value >> i) & 3) == DR_IO )
3022 {
3023 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3024 return -EPERM;
3025 io_enable |= value & (3 << ((i - 16) >> 1));
3026 }
3027 #ifdef __i386__
3028 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3029 !boot_cpu_has(X86_FEATURE_LM)) &&
3030 (((value >> i) & 0xc) == DR_LEN_8) )
3031 return -EPERM;
3032 #endif
3033 }
3035 /* Guest DR5 is a handy stash for I/O intercept information. */
3036 v->arch.guest_context.debugreg[5] = io_enable;
3037 value &= ~io_enable;
3039 /*
3040 * If DR7 was previously clear then we need to load all other
3041 * debug registers at this point as they were not restored during
3042 * context switch.
3043 */
3044 if ( (v == curr) &&
3045 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3046 {
3047 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3048 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3049 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3050 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3051 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3052 }
3053 }
3054 if ( v == curr )
3055 write_debugreg(7, value);
3056 break;
3057 default:
3058 return -EINVAL;
3061 v->arch.guest_context.debugreg[reg] = value;
3062 return 0;
3065 long do_set_debugreg(int reg, unsigned long value)
3067 return set_debugreg(current, reg, value);
3070 unsigned long do_get_debugreg(int reg)
3072 struct vcpu *curr = current;
3074 switch ( reg )
3076 case 0 ... 3:
3077 case 6:
3078 return curr->arch.guest_context.debugreg[reg];
3079 case 7:
3080 return (curr->arch.guest_context.debugreg[7] |
3081 curr->arch.guest_context.debugreg[5]);
3082 case 4 ... 5:
3083 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3084 curr->arch.guest_context.debugreg[reg + 2] : 0);
3087 return -EINVAL;
3090 /*
3091 * Local variables:
3092 * mode: C
3093 * c-set-style: "BSD"
3094 * c-basic-offset: 4
3095 * tab-width: 4
3096 * indent-tabs-mode: nil
3097 * End:
3098 */