ia64/xen-unstable: xen/arch/x86/traps.c @ 18806:ed8524f4a044

x86: Re-initialise HPET on resume from S3

Signed-off-by: Guanqun Lu <guanqun.lu@intel.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author: Keir Fraser <keir.fraser@citrix.com>
Date: Tue Nov 18 15:55:14 2008 +0000 (2008-11-18)
Parents: 5fd51e1e9c79
Children: 4107618ee0d8
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
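/* Selected on the Xen command line via the "nmi=" option registered below, e.g. "nmi=ignore". */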
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
259 #endif
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
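/*
 * check_guest_io_breakpoint() below returns a DR6-style mask (bit i set for
 * DR_i) of the guest's enabled I/O breakpoints whose byte range overlaps the
 * accessed port range [port, port+len). I/O breakpoints are only honoured
 * when the guest has set CR4.DE; callers feed the returned mask to
 * instruction_done() above, which ORs it into the guest's view of DR6.
 */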
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
489 /*
490 * Called from asm to set up the MCE trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_machinecheck_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
498 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
499 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
500 return !null_trap_bounce(v, tb);
501 }
503 /*
504 * Called from asm to set up the NMI trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_nmi_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
511 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
512 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
513 return !null_trap_bounce(v, tb);
514 }
516 static inline void do_trap(
517 int trapnr, struct cpu_user_regs *regs, int use_error_code)
518 {
519 struct vcpu *curr = current;
520 unsigned long fixup;
522 DEBUGGER_trap_entry(trapnr, regs);
524 if ( guest_mode(regs) )
525 {
526 do_guest_trap(trapnr, regs, use_error_code);
527 return;
528 }
530 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
531 {
532 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
533 trapnr, _p(regs->eip), _p(fixup));
534 regs->eip = fixup;
535 return;
536 }
538 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
539 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
540 {
541 curr->arch.hvm_vcpu.fpu_exception_callback(
542 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
543 return;
544 }
546 DEBUGGER_trap_fatal(trapnr, regs);
548 show_execution_state(regs);
549 panic("FATAL TRAP: vector = %d (%s)\n"
550 "[error_code=%04x]\n",
551 trapnr, trapstr(trapnr), regs->error_code);
552 }
554 #define DO_ERROR_NOCODE(trapnr, name) \
555 asmlinkage void do_##name(struct cpu_user_regs *regs) \
556 { \
557 do_trap(trapnr, regs, 0); \
558 }
560 #define DO_ERROR(trapnr, name) \
561 asmlinkage void do_##name(struct cpu_user_regs *regs) \
562 { \
563 do_trap(trapnr, regs, 1); \
564 }
566 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
567 DO_ERROR_NOCODE(TRAP_overflow, overflow)
568 DO_ERROR_NOCODE(TRAP_bounds, bounds)
569 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
570 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
571 DO_ERROR( TRAP_no_segment, segment_not_present)
572 DO_ERROR( TRAP_stack_error, stack_segment)
573 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
574 DO_ERROR( TRAP_alignment_check, alignment_check)
575 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
577 int rdmsr_hypervisor_regs(
578 uint32_t idx, uint32_t *eax, uint32_t *edx)
579 {
580 struct domain *d = current->domain;
581 /* Optionally shift out of the way of Viridian architectural MSRs. */
582 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
584 idx -= base;
585 if ( idx > 0 )
586 return 0;
588 switch ( idx )
589 {
590 case 0:
591 {
592 *eax = *edx = 0;
593 break;
594 }
595 default:
596 BUG();
597 }
599 return 1;
600 }
602 int wrmsr_hypervisor_regs(
603 uint32_t idx, uint32_t eax, uint32_t edx)
604 {
605 struct domain *d = current->domain;
606 /* Optionally shift out of the way of Viridian architectural MSRs. */
607 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
609 idx -= base;
610 if ( idx > 0 )
611 return 0;
613 switch ( idx )
614 {
615 case 0:
616 {
617 void *hypercall_page;
618 unsigned long mfn;
619 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
620 unsigned int idx = eax & 0xfff;
622 if ( idx > 0 )
623 {
624 gdprintk(XENLOG_WARNING,
625 "Out of range index %u to MSR %08x\n",
626 idx, 0x40000000);
627 return 0;
628 }
630 mfn = gmfn_to_mfn(d, gmfn);
632 if ( !mfn_valid(mfn) ||
633 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
634 {
635 gdprintk(XENLOG_WARNING,
636 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
637 gmfn, mfn, base + idx);
638 return 0;
639 }
641 hypercall_page = map_domain_page(mfn);
642 hypercall_page_initialise(d, hypercall_page);
643 unmap_domain_page(hypercall_page);
645 put_page_and_type(mfn_to_page(mfn));
646 break;
647 }
649 default:
650 BUG();
651 }
653 return 1;
654 }
656 int cpuid_hypervisor_leaves(
657 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
658 {
659 struct domain *d = current->domain;
660 /* Optionally shift out of the way of Viridian architectural leaves. */
661 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
663 idx -= base;
664 if ( idx > 2 )
665 return 0;
667 switch ( idx )
668 {
669 case 0:
670 *eax = base + 2; /* Largest leaf */
671 *ebx = XEN_CPUID_SIGNATURE_EBX;
672 *ecx = XEN_CPUID_SIGNATURE_ECX;
673 *edx = XEN_CPUID_SIGNATURE_EDX;
674 break;
676 case 1:
677 *eax = (xen_major_version() << 16) | xen_minor_version();
678 *ebx = 0; /* Reserved */
679 *ecx = 0; /* Reserved */
680 *edx = 0; /* Reserved */
681 break;
683 case 2:
684 *eax = 1; /* Number of hypercall-transfer pages */
685 *ebx = 0x40000000; /* MSR base address */
686 if ( is_viridian_domain(d) )
687 *ebx = 0x40000200;
688 *ecx = 0; /* Features 1 */
689 *edx = 0; /* Features 2 */
690 if ( !is_hvm_vcpu(current) )
691 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
692 break;
694 default:
695 BUG();
696 }
698 return 1;
699 }
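/*
 * Illustrative guest-side sketch (not part of this file): how a guest uses
 * the leaves above together with wrmsr_hypervisor_regs() to have Xen write
 * hypercall stubs into a page of its choosing. Assumes the non-Viridian
 * base of 0x40000000, stdint-style types, and cpuid()/wrmsrl()-style
 * helpers; error handling and signature checking are omitted.
 */
static void guest_init_hypercall_page(uint64_t hypercall_page_gpa)
{
    uint32_t eax, ebx, ecx, edx;

    /* Leaf base+2: EAX = number of hypercall pages, EBX = MSR index. */
    cpuid(0x40000002, &eax, &ebx, &ecx, &edx);

    /* Low 12 bits of the written value select which hypercall page (0 here). */
    wrmsrl(ebx, hypercall_page_gpa | 0);
}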
701 static void pv_cpuid(struct cpu_user_regs *regs)
702 {
703 uint32_t a, b, c, d;
705 a = regs->eax;
706 b = regs->ebx;
707 c = regs->ecx;
708 d = regs->edx;
710 if ( current->domain->domain_id != 0 )
711 {
712 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
713 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
714 goto out;
715 }
717 asm (
718 "cpuid"
719 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
720 : "0" (a), "1" (b), "2" (c), "3" (d) );
722 if ( (regs->eax & 0x7fffffff) == 1 )
723 {
724 /* Modify Feature Information. */
725 __clear_bit(X86_FEATURE_VME, &d);
726 if ( !opt_allow_hugepage )
727 __clear_bit(X86_FEATURE_PSE, &d);
728 __clear_bit(X86_FEATURE_PGE, &d);
729 __clear_bit(X86_FEATURE_MCE, &d);
730 __clear_bit(X86_FEATURE_MCA, &d);
731 __clear_bit(X86_FEATURE_PSE36, &d);
732 }
733 switch ( (uint32_t)regs->eax )
734 {
735 case 1:
736 /* Modify Feature Information. */
737 if ( !cpu_has_sep )
738 __clear_bit(X86_FEATURE_SEP, &d);
739 #ifdef __i386__
740 if ( !supervisor_mode_kernel )
741 __clear_bit(X86_FEATURE_SEP, &d);
742 #endif
743 __clear_bit(X86_FEATURE_DS, &d);
744 __clear_bit(X86_FEATURE_ACC, &d);
745 __clear_bit(X86_FEATURE_PBE, &d);
747 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
748 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
749 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
750 __clear_bit(X86_FEATURE_VMXE % 32, &c);
751 __clear_bit(X86_FEATURE_SMXE % 32, &c);
752 __clear_bit(X86_FEATURE_TM2 % 32, &c);
753 if ( is_pv_32bit_vcpu(current) )
754 __clear_bit(X86_FEATURE_CX16 % 32, &c);
755 __clear_bit(X86_FEATURE_XTPR % 32, &c);
756 __clear_bit(X86_FEATURE_PDCM % 32, &c);
757 __clear_bit(X86_FEATURE_DCA % 32, &c);
758 break;
759 case 0x80000001:
760 /* Modify Feature Information. */
761 if ( is_pv_32bit_vcpu(current) )
762 {
763 __clear_bit(X86_FEATURE_LM % 32, &d);
764 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
765 }
766 #ifndef __i386__
767 if ( is_pv_32on64_vcpu(current) &&
768 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
769 #endif
770 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
771 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
772 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
774 __clear_bit(X86_FEATURE_SVME % 32, &c);
775 __clear_bit(X86_FEATURE_OSVW % 32, &c);
776 __clear_bit(X86_FEATURE_IBS % 32, &c);
777 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
778 __clear_bit(X86_FEATURE_WDT % 32, &c);
779 break;
780 case 5: /* MONITOR/MWAIT */
781 case 0xa: /* Architectural Performance Monitor Features */
782 case 0x8000000a: /* SVM revision and features */
783 case 0x8000001b: /* Instruction Based Sampling */
784 a = b = c = d = 0;
785 break;
786 default:
787 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
788 break;
789 }
791 out:
792 regs->eax = a;
793 regs->ebx = b;
794 regs->ecx = c;
795 regs->edx = d;
796 }
798 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
799 {
800 char sig[5], instr[2];
801 unsigned long eip, rc;
803 eip = regs->eip;
805 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
806 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
807 {
808 propagate_page_fault(eip + sizeof(sig) - rc, 0);
809 return EXCRET_fault_fixed;
810 }
811 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
812 return 0;
813 eip += sizeof(sig);
815 /* We only emulate CPUID. */
816 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
817 {
818 propagate_page_fault(eip + sizeof(instr) - rc, 0);
819 return EXCRET_fault_fixed;
820 }
821 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
822 return 0;
823 eip += sizeof(instr);
825 pv_cpuid(regs);
827 instruction_done(regs, eip, 0);
829 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
831 return EXCRET_fault_fixed;
832 }
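/*
 * Illustrative guest-side sketch (not part of this file): a PV guest opts in
 * to the emulation above by prefixing CPUID with the "forced emulation"
 * signature checked in emulate_forced_invalid_op(): ud2 (0x0f 0x0b) followed
 * by the ASCII bytes "xen". A minimal GCC-style wrapper might look like:
 */
static inline void forced_cpuid(uint32_t *eax, uint32_t *ebx,
                                uint32_t *ecx, uint32_t *edx)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (*eax), "2" (*ecx) );
}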
834 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
835 {
836 struct bug_frame bug;
837 struct bug_frame_str bug_str;
838 char *filename, *predicate, *eip = (char *)regs->eip;
839 unsigned long fixup;
840 int id, lineno;
842 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
844 if ( likely(guest_mode(regs)) )
845 {
846 if ( !emulate_forced_invalid_op(regs) )
847 do_guest_trap(TRAP_invalid_op, regs, 0);
848 return;
849 }
851 if ( !is_kernel(eip) ||
852 __copy_from_user(&bug, eip, sizeof(bug)) ||
853 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
854 (bug.ret != 0xc2) )
855 goto die;
856 eip += sizeof(bug);
858 id = bug.id & 3;
860 if ( id == BUGFRAME_dump )
861 {
862 show_execution_state(regs);
863 regs->eip = (unsigned long)eip;
864 return;
865 }
867 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
868 if ( !is_kernel(eip) ||
869 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
870 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
871 goto die;
872 eip += sizeof(bug_str);
874 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
875 lineno = bug.id >> 2;
877 if ( id == BUGFRAME_warn )
878 {
879 printk("Xen WARN at %.50s:%d\n", filename, lineno);
880 show_execution_state(regs);
881 regs->eip = (unsigned long)eip;
882 return;
883 }
885 if ( id == BUGFRAME_bug )
886 {
887 printk("Xen BUG at %.50s:%d\n", filename, lineno);
888 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
889 show_execution_state(regs);
890 panic("Xen BUG at %.50s:%d\n", filename, lineno);
891 }
893 /* ASSERT: decode the predicate string pointer. */
894 ASSERT(id == BUGFRAME_assert);
895 if ( !is_kernel(eip) ||
896 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
897 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
898 goto die;
899 eip += sizeof(bug_str);
901 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
902 printk("Assertion '%s' failed at %.50s:%d\n",
903 predicate, filename, lineno);
904 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
905 show_execution_state(regs);
906 panic("Assertion '%s' failed at %.50s:%d\n",
907 predicate, filename, lineno);
909 die:
910 if ( (fixup = search_exception_table(regs->eip)) != 0 )
911 {
912 regs->eip = fixup;
913 return;
914 }
915 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
916 show_execution_state(regs);
917 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
918 }
920 asmlinkage void do_int3(struct cpu_user_regs *regs)
921 {
922 DEBUGGER_trap_entry(TRAP_int3, regs);
924 if ( !guest_mode(regs) )
925 {
926 debugger_trap_fatal(TRAP_int3, regs);
927 return;
928 }
930 do_guest_trap(TRAP_int3, regs, 0);
931 }
933 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
934 {
935 machine_check_vector(regs, regs->error_code);
936 }
938 static void reserved_bit_page_fault(
939 unsigned long addr, struct cpu_user_regs *regs)
940 {
941 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
942 current->domain->domain_id, current->vcpu_id, regs->error_code);
943 show_page_walk(addr);
944 show_execution_state(regs);
945 }
947 void propagate_page_fault(unsigned long addr, u16 error_code)
948 {
949 struct trap_info *ti;
950 struct vcpu *v = current;
951 struct trap_bounce *tb = &v->arch.trap_bounce;
953 v->arch.guest_context.ctrlreg[2] = addr;
954 arch_set_cr2(v, addr);
956 /* Re-set error_code.user flag appropriately for the guest. */
957 error_code &= ~PFEC_user_mode;
958 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
959 error_code |= PFEC_user_mode;
961 trace_pv_page_fault(addr, error_code);
963 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
964 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
965 tb->error_code = error_code;
966 tb->cs = ti->cs;
967 tb->eip = ti->address;
968 if ( TI_GET_IF(ti) )
969 tb->flags |= TBF_INTERRUPT;
970 if ( unlikely(null_trap_bounce(v, tb)) )
971 {
972 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
973 v->domain->domain_id, v->vcpu_id, error_code);
974 show_page_walk(addr);
975 }
977 if ( unlikely(error_code & PFEC_reserved_bit) )
978 reserved_bit_page_fault(addr, guest_cpu_user_regs());
979 }
981 static int handle_gdt_ldt_mapping_fault(
982 unsigned long offset, struct cpu_user_regs *regs)
983 {
983 {
984 struct vcpu *curr = current;
985 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
986 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
987 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
989 /* Should never fault in another vcpu's area. */
990 BUG_ON(vcpu_area != curr->vcpu_id);
992 /* Byte offset within the gdt/ldt sub-area. */
993 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
995 if ( likely(is_ldt_area) )
996 {
997 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
998 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
999 {
1000 if ( guest_mode(regs) )
1001 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1002 regs->eip, offset);
1003 }
1004 else
1005 {
1006 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1007 if ( !guest_mode(regs) )
1008 return 0;
1009 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1010 propagate_page_fault(
1011 curr->arch.guest_context.ldt_base + offset,
1012 regs->error_code);
1013 }
1014 }
1015 else
1016 {
1017 /* GDT fault: handle the fault as #GP(selector). */
1018 regs->error_code = (u16)offset & ~7;
1019 (void)do_general_protection(regs);
1020 }
1022 return EXCRET_fault_fixed;
1023 }
1025 #ifdef HYPERVISOR_VIRT_END
1026 #define IN_HYPERVISOR_RANGE(va) \
1027 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1028 #else
1029 #define IN_HYPERVISOR_RANGE(va) \
1030 (((va) >= HYPERVISOR_VIRT_START))
1031 #endif
1033 static int __spurious_page_fault(
1034 unsigned long addr, unsigned int error_code)
1035 {
1036 unsigned long mfn, cr3 = read_cr3();
1037 #if CONFIG_PAGING_LEVELS >= 4
1038 l4_pgentry_t l4e, *l4t;
1039 #endif
1040 #if CONFIG_PAGING_LEVELS >= 3
1041 l3_pgentry_t l3e, *l3t;
1042 #endif
1043 l2_pgentry_t l2e, *l2t;
1044 l1_pgentry_t l1e, *l1t;
1045 unsigned int required_flags, disallowed_flags;
1047 /*
1048 * We do not take spurious page faults in IRQ handlers as we do not
1049 * modify page tables in IRQ context. We therefore bail here because
1050 * map_domain_page() is not IRQ-safe.
1051 */
1052 if ( in_irq() )
1053 return 0;
1055 /* Reserved bit violations are never spurious faults. */
1056 if ( error_code & PFEC_reserved_bit )
1057 return 0;
1059 required_flags = _PAGE_PRESENT;
1060 if ( error_code & PFEC_write_access )
1061 required_flags |= _PAGE_RW;
1062 if ( error_code & PFEC_user_mode )
1063 required_flags |= _PAGE_USER;
1065 disallowed_flags = 0;
1066 if ( error_code & PFEC_insn_fetch )
1067 disallowed_flags |= _PAGE_NX;
1069 mfn = cr3 >> PAGE_SHIFT;
1071 #if CONFIG_PAGING_LEVELS >= 4
1072 l4t = map_domain_page(mfn);
1073 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1074 mfn = l4e_get_pfn(l4e);
1075 unmap_domain_page(l4t);
1076 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1077 (l4e_get_flags(l4e) & disallowed_flags) )
1078 return 0;
1079 #endif
1081 #if CONFIG_PAGING_LEVELS >= 3
1082 l3t = map_domain_page(mfn);
1083 #if CONFIG_PAGING_LEVELS == 3
1084 l3t += (cr3 & 0xFE0UL) >> 3;
1085 #endif
1086 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1087 mfn = l3e_get_pfn(l3e);
1088 unmap_domain_page(l3t);
1089 #if CONFIG_PAGING_LEVELS == 3
1090 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1091 return 0;
1092 #else
1093 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1094 (l3e_get_flags(l3e) & disallowed_flags) )
1095 return 0;
1096 #endif
1097 #endif
1099 l2t = map_domain_page(mfn);
1100 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1101 mfn = l2e_get_pfn(l2e);
1102 unmap_domain_page(l2t);
1103 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1104 (l2e_get_flags(l2e) & disallowed_flags) )
1105 return 0;
1106 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1107 {
1108 l1e = l1e_empty(); /* define before use in debug tracing */
1109 goto spurious;
1110 }
1112 l1t = map_domain_page(mfn);
1113 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1114 mfn = l1e_get_pfn(l1e);
1115 unmap_domain_page(l1t);
1116 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1117 (l1e_get_flags(l1e) & disallowed_flags) )
1118 return 0;
1120 spurious:
1121 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1122 "at addr %lx, e/c %04x\n",
1123 current->domain->domain_id, current->vcpu_id,
1124 addr, error_code);
1125 #if CONFIG_PAGING_LEVELS >= 4
1126 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1127 #endif
1128 #if CONFIG_PAGING_LEVELS >= 3
1129 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1130 #endif
1131 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1132 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1133 return 1;
1134 }
1136 static int spurious_page_fault(
1137 unsigned long addr, unsigned int error_code)
1138 {
1139 unsigned long flags;
1140 int is_spurious;
1142 /*
1143 * Disabling interrupts prevents TLB flushing, and hence prevents
1144 * page tables from becoming invalid under our feet during the walk.
1145 */
1146 local_irq_save(flags);
1147 is_spurious = __spurious_page_fault(addr, error_code);
1148 local_irq_restore(flags);
1150 return is_spurious;
1151 }
1153 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1154 {
1155 struct vcpu *v = current;
1156 struct domain *d = v->domain;
1158 /* No fixups in interrupt context or when interrupts are disabled. */
1159 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1160 return 0;
1162 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1163 {
1164 if ( paging_mode_external(d) && guest_mode(regs) )
1165 {
1166 int ret = paging_fault(addr, regs);
1167 if ( ret == EXCRET_fault_fixed )
1168 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1169 return ret;
1170 }
1171 if ( !(regs->error_code & PFEC_reserved_bit) &&
1172 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1173 return handle_gdt_ldt_mapping_fault(
1174 addr - GDT_LDT_VIRT_START, regs);
1175 return 0;
1176 }
1178 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1179 guest_kernel_mode(v, regs) &&
1180 /* Do not check if access-protection fault since the page may
1181 legitimately be not present in shadow page tables */
1182 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1183 PFEC_write_access) &&
1184 ptwr_do_page_fault(v, addr, regs) )
1185 return EXCRET_fault_fixed;
1187 if ( paging_mode_enabled(d) )
1188 {
1189 int ret = paging_fault(addr, regs);
1190 if ( ret == EXCRET_fault_fixed )
1191 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1192 return ret;
1193 }
1195 return 0;
1196 }
1198 /*
1199 * #PF error code:
1200 * Bit 0: Protection violation (=1) ; Page not present (=0)
1201 * Bit 1: Write access
1202 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1203 * Bit 3: Reserved bit violation
1204 * Bit 4: Instruction fetch
1205 */
1206 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1207 {
1208 unsigned long addr, fixup;
1209 unsigned int error_code;
1211 addr = read_cr2();
1213 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1214 error_code = regs->error_code;
1216 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1218 perfc_incr(page_faults);
1220 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1221 return;
1223 if ( unlikely(!guest_mode(regs)) )
1224 {
1225 if ( spurious_page_fault(addr, error_code) )
1226 return;
1228 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1229 {
1230 perfc_incr(copy_user_faults);
1231 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1232 reserved_bit_page_fault(addr, regs);
1233 regs->eip = fixup;
1234 return;
1235 }
1237 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1239 show_execution_state(regs);
1240 show_page_walk(addr);
1241 panic("FATAL PAGE FAULT\n"
1242 "[error_code=%04x]\n"
1243 "Faulting linear address: %p\n",
1244 error_code, _p(addr));
1245 }
1247 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1248 && spurious_page_fault(addr, error_code)) )
1249 return;
1251 propagate_page_fault(addr, regs->error_code);
1252 }
1254 /*
1255 * Early #PF handler to print CR2, error code, and stack.
1257 * We also deal with spurious faults here, even though they should never happen
1258 * during early boot (an issue was seen once, but was most likely a hardware
1259 * problem).
1260 */
1261 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1262 {
1263 static int stuck;
1264 static unsigned long prev_eip, prev_cr2;
1265 unsigned long cr2 = read_cr2();
1267 BUG_ON(smp_processor_id() != 0);
1269 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1270 {
1271 prev_eip = regs->eip;
1272 prev_cr2 = cr2;
1273 stuck = 0;
1274 return;
1275 }
1277 if ( stuck++ == 1000 )
1278 {
1279 unsigned long *stk = (unsigned long *)regs;
1280 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1281 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1282 printk("Stack dump: ");
1283 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1284 printk("%p ", _p(*stk++));
1285 for ( ; ; ) ;
1286 }
1287 }
1289 long do_fpu_taskswitch(int set)
1290 {
1291 struct vcpu *v = current;
1293 if ( set )
1294 {
1295 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1296 stts();
1297 }
1298 else
1299 {
1300 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1301 if ( v->fpu_dirtied )
1302 clts();
1303 }
1305 return 0;
1306 }
1308 static int read_descriptor(unsigned int sel,
1309 const struct vcpu *v,
1310 const struct cpu_user_regs * regs,
1311 unsigned long *base,
1312 unsigned long *limit,
1313 unsigned int *ar,
1314 unsigned int vm86attr)
1316 struct desc_struct desc;
1318 if ( !vm86_mode(regs) )
1320 if ( sel < 4)
1321 desc.b = desc.a = 0;
1322 else if ( __get_user(desc,
1323 (const struct desc_struct *)(!(sel & 4)
1324 ? GDT_VIRT_START(v)
1325 : LDT_VIRT_START(v))
1326 + (sel >> 3)) )
1327 return 0;
1328 if ( !(vm86attr & _SEGMENT_CODE) )
1329 desc.b &= ~_SEGMENT_L;
1331 else
1333 desc.a = (sel << 20) | 0xffff;
1334 desc.b = vm86attr | (sel >> 12);
1337 *ar = desc.b & 0x00f0ff00;
1338 if ( !(desc.b & _SEGMENT_L) )
1340 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1341 (desc.b & 0xff000000));
1342 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1343 if ( desc.b & _SEGMENT_G )
1344 *limit = ((*limit + 1) << 12) - 1;
1345 #ifndef NDEBUG
1346 if ( !vm86_mode(regs) && (sel > 3) )
1348 unsigned int a, l;
1349 unsigned char valid;
1351 asm volatile (
1352 "larl %2,%0 ; setz %1"
1353 : "=r" (a), "=rm" (valid) : "rm" (sel));
1354 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1355 asm volatile (
1356 "lsll %2,%0 ; setz %1"
1357 : "=r" (l), "=rm" (valid) : "rm" (sel));
1358 BUG_ON(valid && (l != *limit));
1360 #endif
1362 else
1364 *base = 0UL;
1365 *limit = ~0UL;
1368 return 1;
1371 #ifdef __x86_64__
1372 static int read_gate_descriptor(unsigned int gate_sel,
1373 const struct vcpu *v,
1374 unsigned int *sel,
1375 unsigned long *off,
1376 unsigned int *ar)
1378 struct desc_struct desc;
1379 const struct desc_struct *pdesc;
1382 pdesc = (const struct desc_struct *)
1383 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1384 + (gate_sel >> 3);
1385 if ( (gate_sel < 4) ||
1386 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1387 __get_user(desc, pdesc) )
1388 return 0;
1390 *sel = (desc.a >> 16) & 0x0000fffc;
1391 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1392 *ar = desc.b & 0x0000ffff;
1394 /*
1395 * check_descriptor() clears the DPL field and stores the
1396 * guest requested DPL in the selector's RPL field.
1397 */
1398 if ( *ar & _SEGMENT_DPL )
1399 return 0;
1400 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1402 if ( !is_pv_32bit_vcpu(v) )
1404 if ( (*ar & 0x1f00) != 0x0c00 ||
1405 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1406 __get_user(desc, pdesc + 1) ||
1407 (desc.b & 0x1f00) )
1408 return 0;
1410 *off |= (unsigned long)desc.a << 32;
1411 return 1;
1414 switch ( *ar & 0x1f00 )
1416 case 0x0400:
1417 *off &= 0xffff;
1418 break;
1419 case 0x0c00:
1420 break;
1421 default:
1422 return 0;
1425 return 1;
1427 #endif
1429 /* Has the guest requested sufficient permission for this I/O access? */
1430 static int guest_io_okay(
1431 unsigned int port, unsigned int bytes,
1432 struct vcpu *v, struct cpu_user_regs *regs)
1434 #if defined(__x86_64__)
1435 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1436 int user_mode = !(v->arch.flags & TF_kernel_mode);
1437 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1438 #elif defined(__i386__)
1439 #define TOGGLE_MODE() ((void)0)
1440 #endif
1442 if ( !vm86_mode(regs) &&
1443 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1444 return 1;
1446 if ( v->arch.iobmp_limit > (port + bytes) )
1448 union { uint8_t bytes[2]; uint16_t mask; } x;
1450 /*
1451 * Grab permission bytes from guest space. Inaccessible bytes are
1452 * read as 0xff (no access allowed).
1453 */
1454 TOGGLE_MODE();
1455 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1456 port>>3, 2) )
1458 default: x.bytes[0] = ~0;
1459 case 1: x.bytes[1] = ~0;
1460 case 0: break;
1462 TOGGLE_MODE();
1464 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1465 return 1;
1468 return 0;
1471 /* Has the administrator granted sufficient permission for this I/O access? */
1472 static int admin_io_okay(
1473 unsigned int port, unsigned int bytes,
1474 struct vcpu *v, struct cpu_user_regs *regs)
1476 /*
1477 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1478 * We never permit direct access to that register.
1479 */
1480 if ( (port == 0xcf8) && (bytes == 4) )
1481 return 0;
1483 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1486 static uint32_t guest_io_read(
1487 unsigned int port, unsigned int bytes,
1488 struct vcpu *v, struct cpu_user_regs *regs)
1490 extern uint32_t pci_conf_read(
1491 uint32_t cf8, uint8_t offset, uint8_t bytes);
1493 uint32_t data = 0;
1494 unsigned int shift = 0;
1496 if ( admin_io_okay(port, bytes, v, regs) )
1498 switch ( bytes )
1500 case 1: return inb(port);
1501 case 2: return inw(port);
1502 case 4: return inl(port);
1506 while ( bytes != 0 )
1508 unsigned int size = 1;
1509 uint32_t sub_data = 0xff;
1511 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1513 sub_data = pv_pit_handler(port, 0, 0);
1515 else if ( (port == 0xcf8) && (bytes == 4) )
1517 size = 4;
1518 sub_data = v->domain->arch.pci_cf8;
1520 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1522 size = min(bytes, 4 - (port & 3));
1523 if ( size == 3 )
1524 size = 2;
1525 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1528 if ( size == 4 )
1529 return sub_data;
1531 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1532 shift += size * 8;
1533 port += size;
1534 bytes -= size;
1537 return data;
1540 static void guest_io_write(
1541 unsigned int port, unsigned int bytes, uint32_t data,
1542 struct vcpu *v, struct cpu_user_regs *regs)
1544 extern void pci_conf_write(
1545 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1547 if ( admin_io_okay(port, bytes, v, regs) )
1549 switch ( bytes ) {
1550 case 1:
1551 outb((uint8_t)data, port);
1552 if ( pv_post_outb_hook )
1553 pv_post_outb_hook(port, (uint8_t)data);
1554 break;
1555 case 2:
1556 outw((uint16_t)data, port);
1557 break;
1558 case 4:
1559 outl(data, port);
1560 break;
1562 return;
1565 while ( bytes != 0 )
1567 unsigned int size = 1;
1569 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1571 pv_pit_handler(port, (uint8_t)data, 1);
1573 else if ( (port == 0xcf8) && (bytes == 4) )
1575 size = 4;
1576 v->domain->arch.pci_cf8 = data;
1578 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1580 size = min(bytes, 4 - (port & 3));
1581 if ( size == 3 )
1582 size = 2;
1583 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1586 if ( size == 4 )
1587 return;
1589 port += size;
1590 bytes -= size;
1591 data >>= size * 8;
1595 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1596 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1597 __attribute__((__regparm__(1)));
1598 unsigned long guest_to_host_gpr_switch(unsigned long)
1599 __attribute__((__regparm__(1)));
1601 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1603 /* Instruction fetch with error handling. */
1604 #define insn_fetch(type, base, eip, limit) \
1605 ({ unsigned long _rc, _ptr = (base) + (eip); \
1606 type _x; \
1607 if ( ad_default < 8 ) \
1608 _ptr = (unsigned int)_ptr; \
1609 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1610 goto fail; \
1611 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1612 { \
1613 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1614 goto skip; \
1615 } \
1616 (eip) += sizeof(_x); _x; })
1618 #if defined(CONFIG_X86_32)
1619 # define read_sreg(regs, sr) ((regs)->sr)
1620 #elif defined(CONFIG_X86_64)
1621 # define read_sreg(regs, sr) read_segment_register(sr)
1622 #endif
1624 static int emulate_privileged_op(struct cpu_user_regs *regs)
1626 struct vcpu *v = current;
1627 unsigned long *reg, eip = regs->eip, res;
1628 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1629 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1630 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1631 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1632 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1633 ? regs->reg \
1634 : ad_bytes == 4 \
1635 ? (u32)regs->reg \
1636 : (u16)regs->reg)
1637 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1638 ? regs->reg = (val) \
1639 : ad_bytes == 4 \
1640 ? (*(u32 *)&regs->reg = (val)) \
1641 : (*(u16 *)&regs->reg = (val)))
1642 unsigned long code_base, code_limit;
1643 char io_emul_stub[32];
1644 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1645 u32 l, h, eax, edx;
1647 if ( !read_descriptor(regs->cs, v, regs,
1648 &code_base, &code_limit, &ar,
1649 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1650 goto fail;
1651 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1652 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1653 if ( !(ar & _SEGMENT_S) ||
1654 !(ar & _SEGMENT_P) ||
1655 !(ar & _SEGMENT_CODE) )
1656 goto fail;
1658 /* emulating only opcodes not allowing SS to be default */
1659 data_sel = read_sreg(regs, ds);
1661 /* Legacy prefixes. */
1662 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1664 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1666 case 0x66: /* operand-size override */
1667 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1668 continue;
1669 case 0x67: /* address-size override */
1670 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1671 continue;
1672 case 0x2e: /* CS override */
1673 data_sel = regs->cs;
1674 continue;
1675 case 0x3e: /* DS override */
1676 data_sel = read_sreg(regs, ds);
1677 continue;
1678 case 0x26: /* ES override */
1679 data_sel = read_sreg(regs, es);
1680 continue;
1681 case 0x64: /* FS override */
1682 data_sel = read_sreg(regs, fs);
1683 lm_ovr = lm_seg_fs;
1684 continue;
1685 case 0x65: /* GS override */
1686 data_sel = read_sreg(regs, gs);
1687 lm_ovr = lm_seg_gs;
1688 continue;
1689 case 0x36: /* SS override */
1690 data_sel = regs->ss;
1691 continue;
1692 case 0xf0: /* LOCK */
1693 lock = 1;
1694 continue;
1695 case 0xf2: /* REPNE/REPNZ */
1696 case 0xf3: /* REP/REPE/REPZ */
1697 rep_prefix = 1;
1698 continue;
1699 default:
1700 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1702 rex = opcode;
1703 continue;
1705 break;
1707 break;
1710 /* REX prefix. */
1711 if ( rex & 8 ) /* REX.W */
1712 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1713 modrm_reg = (rex & 4) << 1; /* REX.R */
1714 /* REX.X does not need to be decoded. */
1715 modrm_rm = (rex & 1) << 3; /* REX.B */
1717 if ( opcode == 0x0f )
1718 goto twobyte_opcode;
1720 if ( lock )
1721 goto fail;
1723 /* Input/Output String instructions. */
1724 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1726 unsigned long data_base, data_limit;
1728 if ( rep_prefix && (rd_ad(ecx) == 0) )
1729 goto done;
1731 if ( !(opcode & 2) )
1733 data_sel = read_sreg(regs, es);
1734 lm_ovr = lm_seg_none;
1737 if ( !(ar & _SEGMENT_L) )
1739 if ( !read_descriptor(data_sel, v, regs,
1740 &data_base, &data_limit, &ar,
1741 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1742 _SEGMENT_P) )
1743 goto fail;
1744 if ( !(ar & _SEGMENT_S) ||
1745 !(ar & _SEGMENT_P) ||
1746 (opcode & 2 ?
1747 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1748 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1749 goto fail;
1751 #ifdef CONFIG_X86_64
1752 else
1754 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1756 switch ( lm_ovr )
1758 case lm_seg_none:
1759 data_base = 0UL;
1760 break;
1761 case lm_seg_fs:
1762 data_base = v->arch.guest_context.fs_base;
1763 break;
1764 case lm_seg_gs:
1765 if ( guest_kernel_mode(v, regs) )
1766 data_base = v->arch.guest_context.gs_base_kernel;
1767 else
1768 data_base = v->arch.guest_context.gs_base_user;
1769 break;
1772 else
1773 read_descriptor(data_sel, v, regs,
1774 &data_base, &data_limit, &ar,
1775 0);
1776 data_limit = ~0UL;
1777 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1779 #endif
1781 port = (u16)regs->edx;
1783 continue_io_string:
1784 switch ( opcode )
1786 case 0x6c: /* INSB */
1787 op_bytes = 1;
1788 case 0x6d: /* INSW/INSL */
1789 if ( (data_limit < (op_bytes - 1)) ||
1790 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1791 !guest_io_okay(port, op_bytes, v, regs) )
1792 goto fail;
1793 data = guest_io_read(port, op_bytes, v, regs);
1794 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1795 &data, op_bytes)) != 0 )
1797 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1798 PFEC_write_access);
1799 return EXCRET_fault_fixed;
1801 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1802 ? -op_bytes : op_bytes));
1803 break;
1805 case 0x6e: /* OUTSB */
1806 op_bytes = 1;
1807 case 0x6f: /* OUTSW/OUTSL */
1808 if ( (data_limit < (op_bytes - 1)) ||
1809 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1810 !guest_io_okay(port, op_bytes, v, regs) )
1811 goto fail;
1812 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1813 op_bytes)) != 0 )
1815 propagate_page_fault(data_base + rd_ad(esi)
1816 + op_bytes - rc, 0);
1817 return EXCRET_fault_fixed;
1819 guest_io_write(port, op_bytes, data, v, regs);
1820 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1821 ? -op_bytes : op_bytes));
1822 break;
1825 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1827 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1829 if ( !bpmatch && !hypercall_preempt_check() )
1830 goto continue_io_string;
1831 eip = regs->eip;
1834 goto done;
1837 /*
1838 * Very likely to be an I/O instruction (IN/OUT).
1839 * Build an on-stack stub to execute the instruction with full guest
1840 * GPR context. This is needed for some systems which (ab)use IN/OUT
1841 * to communicate with BIOS code in system-management mode.
1842 */
1843 #ifdef __x86_64__
1844 /* movq $host_to_guest_gpr_switch,%rcx */
1845 io_emul_stub[0] = 0x48;
1846 io_emul_stub[1] = 0xb9;
1847 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1848 /* callq *%rcx */
1849 io_emul_stub[10] = 0xff;
1850 io_emul_stub[11] = 0xd1;
1851 #else
1852 /* call host_to_guest_gpr_switch */
1853 io_emul_stub[0] = 0xe8;
1854 *(s32 *)&io_emul_stub[1] =
1855 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1856 /* 7 x nop */
1857 memset(&io_emul_stub[5], 0x90, 7);
1858 #endif
1859 /* data16 or nop */
1860 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1861 /* <io-access opcode> */
1862 io_emul_stub[13] = opcode;
1863 /* imm8 or nop */
1864 io_emul_stub[14] = 0x90;
1865 /* ret (jumps to guest_to_host_gpr_switch) */
1866 io_emul_stub[15] = 0xc3;
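/*
 * Resulting x86-64 stub layout, e.g. for "in $imm8,%al" with op_bytes == 1:
 *   48 b9 <8-byte address of host_to_guest_gpr_switch>   movabs $...,%rcx
 *   ff d1                                                callq  *%rcx
 *   90 e4 <imm8>                                         nop; in $imm8,%al
 *   c3                                                   ret
 */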
1868 /* Handy function-typed pointer to the stub. */
1869 io_emul = (void *)io_emul_stub;
1871 if ( ioemul_handle_quirk )
1872 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1874 /* I/O Port and Interrupt Flag instructions. */
1875 switch ( opcode )
1877 case 0xe4: /* IN imm8,%al */
1878 op_bytes = 1;
1879 case 0xe5: /* IN imm8,%eax */
1880 port = insn_fetch(u8, code_base, eip, code_limit);
1881 io_emul_stub[14] = port; /* imm8 */
1882 exec_in:
1883 if ( !guest_io_okay(port, op_bytes, v, regs) )
1884 goto fail;
1885 if ( admin_io_okay(port, op_bytes, v, regs) )
1887 io_emul(regs);
1889 else
1891 if ( op_bytes == 4 )
1892 regs->eax = 0;
1893 else
1894 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1895 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1897 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1898 goto done;
1900 case 0xec: /* IN %dx,%al */
1901 op_bytes = 1;
1902 case 0xed: /* IN %dx,%eax */
1903 port = (u16)regs->edx;
1904 goto exec_in;
1906 case 0xe6: /* OUT %al,imm8 */
1907 op_bytes = 1;
1908 case 0xe7: /* OUT %eax,imm8 */
1909 port = insn_fetch(u8, code_base, eip, code_limit);
1910 io_emul_stub[14] = port; /* imm8 */
1911 exec_out:
1912 if ( !guest_io_okay(port, op_bytes, v, regs) )
1913 goto fail;
1914 if ( admin_io_okay(port, op_bytes, v, regs) )
1916 io_emul(regs);
1917 if ( (op_bytes == 1) && pv_post_outb_hook )
1918 pv_post_outb_hook(port, regs->eax);
1920 else
1922 guest_io_write(port, op_bytes, regs->eax, v, regs);
1924 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1925 goto done;
1927 case 0xee: /* OUT %al,%dx */
1928 op_bytes = 1;
1929 case 0xef: /* OUT %eax,%dx */
1930 port = (u16)regs->edx;
1931 goto exec_out;
1933 case 0xfa: /* CLI */
1934 case 0xfb: /* STI */
1935 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1936 goto fail;
1937 /*
1938 * This is just too dangerous to allow, in my opinion. Consider if the
1939 * caller then tries to reenable interrupts using POPF: we can't trap
1940 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1941 * do for us. :-)
1942 */
1943 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1944 goto done;
1947 /* No decode of this single-byte opcode. */
1948 goto fail;
1950 twobyte_opcode:
1951 /* Two-byte opcodes only emulated from guest kernel. */
1952 if ( !guest_kernel_mode(v, regs) )
1953 goto fail;
1955 /* Privileged (ring 0) instructions. */
1956 opcode = insn_fetch(u8, code_base, eip, code_limit);
1957 if ( lock && (opcode & ~3) != 0x20 )
1958 goto fail;
1959 switch ( opcode )
1961 case 0x06: /* CLTS */
1962 (void)do_fpu_taskswitch(0);
1963 break;
1965 case 0x09: /* WBINVD */
1966 /* Ignore the instruction if unprivileged. */
1967 if ( !cache_flush_permitted(v->domain) )
1968 /* Non-physdev domain attempted WBINVD; ignore for now since
1969 newer linux uses this in some start-of-day timing loops */
1971 else
1972 wbinvd();
1973 break;
1975 case 0x20: /* MOV CR?,<reg> */
1976 opcode = insn_fetch(u8, code_base, eip, code_limit);
1977 if ( opcode < 0xc0 )
1978 goto fail;
1979 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1980 modrm_rm |= (opcode >> 0) & 7;
1981 reg = decode_register(modrm_rm, regs, 0);
1982 switch ( modrm_reg )
1984 case 0: /* Read CR0 */
1985 *reg = (read_cr0() & ~X86_CR0_TS) |
1986 v->arch.guest_context.ctrlreg[0];
1987 break;
1989 case 2: /* Read CR2 */
1990 *reg = v->arch.guest_context.ctrlreg[2];
1991 break;
1993 case 3: /* Read CR3 */
1994 if ( !is_pv_32on64_vcpu(v) )
1995 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1996 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1997 #ifdef CONFIG_COMPAT
1998 else
1999 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2000 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
2001 #endif
2002 break;
2004 case 4: /* Read CR4 */
2005 /*
2006 * Guests can read CR4 to see what features Xen has enabled. We
2007 * therefore lie about PGE as it is unavailable to guests.
2008 * Also disallow PSE if hugepages are not enabled.
2009 */
2010 *reg = read_cr4() & ~X86_CR4_PGE;
2011 if ( !opt_allow_hugepage )
2012 *reg &= ~X86_CR4_PSE;
2013 break;
2015 default:
2016 goto fail;
2018 break;
2020 case 0x21: /* MOV DR?,<reg> */
2021 opcode = insn_fetch(u8, code_base, eip, code_limit);
2022 if ( opcode < 0xc0 )
2023 goto fail;
2024 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2025 modrm_rm |= (opcode >> 0) & 7;
2026 reg = decode_register(modrm_rm, regs, 0);
2027 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2028 goto fail;
2029 *reg = res;
2030 break;
2032 case 0x22: /* MOV <reg>,CR? */
2033 opcode = insn_fetch(u8, code_base, eip, code_limit);
2034 if ( opcode < 0xc0 )
2035 goto fail;
2036 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2037 modrm_rm |= (opcode >> 0) & 7;
2038 reg = decode_register(modrm_rm, regs, 0);
2039 switch ( modrm_reg )
2041 case 0: /* Write CR0 */
2042 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2044 gdprintk(XENLOG_WARNING,
2045 "Attempt to change unmodifiable CR0 flags.\n");
2046 goto fail;
2048 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2049 break;
2051 case 2: /* Write CR2 */
2052 v->arch.guest_context.ctrlreg[2] = *reg;
2053 arch_set_cr2(v, *reg);
2054 break;
2056 case 3: /* Write CR3 */
2057 domain_lock(v->domain);
2058 if ( !is_pv_32on64_vcpu(v) )
2059 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2060 #ifdef CONFIG_COMPAT
2061 else
2062 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2063 #endif
2064 domain_unlock(v->domain);
2065 if ( rc == 0 ) /* not okay */
2066 goto fail;
2067 break;
2069 case 4: /* Write CR4 */
2070 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2071 write_cr4(pv_guest_cr4_to_real_cr4(
2072 v->arch.guest_context.ctrlreg[4]));
2073 break;
2075 default:
2076 goto fail;
2078 break;
2080 case 0x23: /* MOV <reg>,DR? */
2081 opcode = insn_fetch(u8, code_base, eip, code_limit);
2082 if ( opcode < 0xc0 )
2083 goto fail;
2084 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2085 modrm_rm |= (opcode >> 0) & 7;
2086 reg = decode_register(modrm_rm, regs, 0);
2087 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2088 goto fail;
2089 break;
2091 case 0x30: /* WRMSR */
2092 eax = regs->eax;
2093 edx = regs->edx;
2094 res = ((u64)edx << 32) | eax;
2095 switch ( (u32)regs->ecx )
2097 #ifdef CONFIG_X86_64
2098 case MSR_FS_BASE:
2099 if ( is_pv_32on64_vcpu(v) )
2100 goto fail;
2101 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2102 goto fail;
2103 v->arch.guest_context.fs_base = res;
2104 break;
2105 case MSR_GS_BASE:
2106 if ( is_pv_32on64_vcpu(v) )
2107 goto fail;
2108 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2109 goto fail;
2110 v->arch.guest_context.gs_base_kernel = res;
2111 break;
2112 case MSR_SHADOW_GS_BASE:
2113 if ( is_pv_32on64_vcpu(v) )
2114 goto fail;
2115 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2116 goto fail;
2117 v->arch.guest_context.gs_base_user = res;
2118 break;
2119 #endif
2120 case MSR_K7_FID_VID_STATUS:
2121 case MSR_K7_FID_VID_CTL:
2122 case MSR_K8_PSTATE_LIMIT:
2123 case MSR_K8_PSTATE_CTRL:
2124 case MSR_K8_PSTATE_STATUS:
2125 case MSR_K8_PSTATE0:
2126 case MSR_K8_PSTATE1:
2127 case MSR_K8_PSTATE2:
2128 case MSR_K8_PSTATE3:
2129 case MSR_K8_PSTATE4:
2130 case MSR_K8_PSTATE5:
2131 case MSR_K8_PSTATE6:
2132 case MSR_K8_PSTATE7:
2133 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2134 goto fail;
2135 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2136 break;
2137 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2138 goto fail;
2139 break;
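/*
 * MSR_AMD64_NB_CFG: only meaningful on AMD family 0x10/0x11. Writes
 * from unprivileged domains are silently dropped; a privileged domain
 * may only toggle the CF8 extended-config-space enable bit, and any
 * other change is reported as invalid and discarded.
 */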
2140 case MSR_AMD64_NB_CFG:
2141 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2142 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2143 goto fail;
2144 if ( !IS_PRIV(v->domain) )
2145 break;
2146 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2147 (eax != l) ||
2148 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2149 goto invalid;
2150 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2151 goto fail;
2152 break;
2153 case MSR_FAM10H_MMIO_CONF_BASE:
2154 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2155 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2156 goto fail;
2157 if ( !IS_PRIV(v->domain) )
2158 break;
2159 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2160 (((((u64)h << 32) | l) ^ res) &
2161 ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
2162 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2163 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2164 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2165 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2166 goto invalid;
2167 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2168 goto fail;
2169 break;
2170 case MSR_IA32_PERF_CTL:
2171 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2172 goto fail;
2173 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2174 break;
2175 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2176 goto fail;
2177 break;
2178 case MSR_IA32_THERM_CONTROL:
2179 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2180 goto fail;
2181 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2182 goto fail;
2183 break;
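/*
 * Anything else: first offer the write to wrmsr_hypervisor_regs()
 * (Xen-defined MSRs such as the hypercall-page setup MSR); otherwise
 * the write is silently dropped, with a warning logged only if the
 * value differs from the MSR's current contents.
 */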
2184 default:
2185 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2186 break;
2187 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2188 (eax != l) || (edx != h) )
2189 invalid:
2190 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2191 "%08x:%08x to %08x:%08x.\n",
2192 _p(regs->ecx), h, l, edx, eax);
2193 break;
2195 break;
2197 case 0x31: /* RDTSC */
2198 rdtsc(regs->eax, regs->edx);
2199 break;
2201 case 0x32: /* RDMSR */
2202 switch ( (u32)regs->ecx )
2204 #ifdef CONFIG_X86_64
2205 case MSR_FS_BASE:
2206 if ( is_pv_32on64_vcpu(v) )
2207 goto fail;
2208 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2209 regs->edx = v->arch.guest_context.fs_base >> 32;
2210 break;
2211 case MSR_GS_BASE:
2212 if ( is_pv_32on64_vcpu(v) )
2213 goto fail;
2214 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2215 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2216 break;
2217 case MSR_SHADOW_GS_BASE:
2218 if ( is_pv_32on64_vcpu(v) )
2219 goto fail;
2220 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2221 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2222 break;
2223 #endif
2224 case MSR_K7_FID_VID_CTL:
2225 case MSR_K7_FID_VID_STATUS:
2226 case MSR_K8_PSTATE_LIMIT:
2227 case MSR_K8_PSTATE_CTRL:
2228 case MSR_K8_PSTATE_STATUS:
2229 case MSR_K8_PSTATE0:
2230 case MSR_K8_PSTATE1:
2231 case MSR_K8_PSTATE2:
2232 case MSR_K8_PSTATE3:
2233 case MSR_K8_PSTATE4:
2234 case MSR_K8_PSTATE5:
2235 case MSR_K8_PSTATE6:
2236 case MSR_K8_PSTATE7:
2237 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2238 goto fail;
2239 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2241 regs->eax = regs->edx = 0;
2242 break;
2244 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2245 goto fail;
2246 break;
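/*
 * MISC_ENABLE reads are passed through with capability bits adjusted:
 * the perfmon-available and MONITOR-enable bits are cleared, and BTS,
 * PEBS and xTPR are reported as unavailable/disabled, reflecting what
 * a PV guest can actually use.
 */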
2247 case MSR_IA32_MISC_ENABLE:
2248 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2249 goto fail;
2250 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2251 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2252 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2253 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2254 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2255 break;
2256 case MSR_EFER:
2257 case MSR_IA32_THERM_CONTROL:
2258 case MSR_AMD_PATCHLEVEL:
2259 default:
2260 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2262 regs->eax = l;
2263 regs->edx = h;
2264 break;
2266 /* Everyone can read the MSR space. */
2267 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2268 _p(regs->ecx));*/
2269 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2270 goto fail;
2271 break;
2273 break;
2275 default:
2276 goto fail;
2279 #undef wr_ad
2280 #undef rd_ad
2282 done:
2283 instruction_done(regs, eip, bpmatch);
2284 skip:
2285 return EXCRET_fault_fixed;
2287 fail:
2288 return 0;
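/*
 * check_stack_limit(): true if pushing 'decr' bytes at 'esp' stays
 * inside the stack segment. The first term rejects a zero-length or
 * wrapping push; the second applies the limit check, inverted for
 * expand-down (_SEGMENT_EC) segments.
 */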
2291 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2292 unsigned int esp, unsigned int decr)
2294 return (((esp - decr) < (esp - 1)) &&
2295 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
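/*
 * Emulate a far CALL/JMP through a call gate on behalf of a 32-on-64
 * PV guest: decode the faulting instruction, validate the gate and the
 * target code segment, and, for a call, switch to the guest kernel
 * stack and copy any gate parameters before transferring control.
 */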
2298 static void emulate_gate_op(struct cpu_user_regs *regs)
2300 #ifdef __x86_64__
2301 struct vcpu *v = current;
2302 unsigned int sel, ar, dpl, nparm, opnd_sel;
2303 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2304 unsigned long off, eip, opnd_off, base, limit;
2305 int jump;
2307 /* Check whether this fault is due to the use of a call gate. */
2308 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2309 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2310 ((ar & _SEGMENT_TYPE) != 0xc00) )
2312 do_guest_trap(TRAP_gp_fault, regs, 1);
2313 return;
2315 if ( !(ar & _SEGMENT_P) )
2317 do_guest_trap(TRAP_no_segment, regs, 1);
2318 return;
2320 dpl = (ar >> 13) & 3;
2321 nparm = ar & 0x1f;
2323 /*
2324 * Decode instruction (and perhaps operand) to determine RPL,
2325 * whether this is a jump or a call, and the call return offset.
2326 */
2327 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2328 !(ar & _SEGMENT_S) ||
2329 !(ar & _SEGMENT_P) ||
2330 !(ar & _SEGMENT_CODE) )
2332 do_guest_trap(TRAP_gp_fault, regs, 1);
2333 return;
2336 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2337 ad_default = ad_bytes = op_default;
2338 opnd_sel = opnd_off = 0;
2339 jump = -1;
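/*
 * 'jump' is bumped once for a far CALL and twice for a far JMP by the
 * opcode cases below, so it ends up 0 for a call, positive for a jump,
 * and stays negative if no far transfer was decoded.
 */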
2340 for ( eip = regs->eip; eip - regs->_eip < 10; )
2342 switch ( insn_fetch(u8, base, eip, limit) )
2344 case 0x66: /* operand-size override */
2345 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2346 continue;
2347 case 0x67: /* address-size override */
2348 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2349 continue;
2350 case 0x2e: /* CS override */
2351 opnd_sel = regs->cs;
2352 ASSERT(opnd_sel);
2353 continue;
2354 case 0x3e: /* DS override */
2355 opnd_sel = read_sreg(regs, ds);
2356 if ( !opnd_sel )
2357 opnd_sel = dpl;
2358 continue;
2359 case 0x26: /* ES override */
2360 opnd_sel = read_sreg(regs, es);
2361 if ( !opnd_sel )
2362 opnd_sel = dpl;
2363 continue;
2364 case 0x64: /* FS override */
2365 opnd_sel = read_sreg(regs, fs);
2366 if ( !opnd_sel )
2367 opnd_sel = dpl;
2368 continue;
2369 case 0x65: /* GS override */
2370 opnd_sel = read_sreg(regs, gs);
2371 if ( !opnd_sel )
2372 opnd_sel = dpl;
2373 continue;
2374 case 0x36: /* SS override */
2375 opnd_sel = regs->ss;
2376 if ( !opnd_sel )
2377 opnd_sel = dpl;
2378 continue;
2379 case 0xea:
2380 ++jump;
2381 /* FALLTHROUGH */
2382 case 0x9a:
2383 ++jump;
2384 opnd_sel = regs->cs;
2385 opnd_off = eip;
2386 ad_bytes = ad_default;
2387 eip += op_bytes + 2;
2388 break;
2389 case 0xff:
2391 unsigned int modrm;
2393 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2395 case 0x28: case 0x68: case 0xa8:
2396 ++jump;
2397 /* FALLTHROUGH */
2398 case 0x18: case 0x58: case 0x98:
2399 ++jump;
2400 if ( ad_bytes != 2 )
2402 if ( (modrm & 7) == 4 )
2404 unsigned int sib;
2405 sib = insn_fetch(u8, base, eip, limit);
2407 modrm = (modrm & ~7) | (sib & 7);
2408 if ( (sib >>= 3) != 4 )
2409 opnd_off = *(unsigned long *)
2410 decode_register(sib & 7, regs, 0);
2411 opnd_off <<= sib >> 3;
2413 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2414 opnd_off += *(unsigned long *)
2415 decode_register(modrm & 7, regs, 0);
2416 else
2417 modrm |= 0x87;
2418 if ( !opnd_sel )
2420 switch ( modrm & 7 )
2422 default:
2423 opnd_sel = read_sreg(regs, ds);
2424 break;
2425 case 4: case 5:
2426 opnd_sel = regs->ss;
2427 break;
2431 else
2433 switch ( modrm & 7 )
2435 case 0: case 1: case 7:
2436 opnd_off = regs->ebx;
2437 break;
2438 case 6:
2439 if ( !(modrm & 0xc0) )
2440 modrm |= 0x80;
2441 else
2442 case 2: case 3:
2444 opnd_off = regs->ebp;
2445 if ( !opnd_sel )
2446 opnd_sel = regs->ss;
2448 break;
2450 if ( !opnd_sel )
2451 opnd_sel = read_sreg(regs, ds);
2452 switch ( modrm & 7 )
2454 case 0: case 2: case 4:
2455 opnd_off += regs->esi;
2456 break;
2457 case 1: case 3: case 5:
2458 opnd_off += regs->edi;
2459 break;
2462 switch ( modrm & 0xc0 )
2464 case 0x40:
2465 opnd_off += insn_fetch(s8, base, eip, limit);
2466 break;
2467 case 0x80:
2468 opnd_off += insn_fetch(s32, base, eip, limit);
2469 break;
2471 if ( ad_bytes == 4 )
2472 opnd_off = (unsigned int)opnd_off;
2473 else if ( ad_bytes == 2 )
2474 opnd_off = (unsigned short)opnd_off;
2475 break;
2478 break;
2480 break;
2483 if ( jump < 0 )
2485 fail:
2486 do_guest_trap(TRAP_gp_fault, regs, 1);
2487 skip:
2488 return;
2491 if ( (opnd_sel != regs->cs &&
2492 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2493 !(ar & _SEGMENT_S) ||
2494 !(ar & _SEGMENT_P) ||
2495 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2497 do_guest_trap(TRAP_gp_fault, regs, 1);
2498 return;
2501 opnd_off += op_bytes;
2502 #define ad_default ad_bytes
2503 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2504 #undef ad_default
2505 ASSERT((opnd_sel & ~3) == regs->error_code);
2506 if ( dpl < (opnd_sel & 3) )
2508 do_guest_trap(TRAP_gp_fault, regs, 1);
2509 return;
2512 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2513 !(ar & _SEGMENT_S) ||
2514 !(ar & _SEGMENT_CODE) ||
2515 (!jump || (ar & _SEGMENT_EC) ?
2516 ((ar >> 13) & 3) > (regs->cs & 3) :
2517 ((ar >> 13) & 3) != (regs->cs & 3)) )
2519 regs->error_code = sel;
2520 do_guest_trap(TRAP_gp_fault, regs, 1);
2521 return;
2523 if ( !(ar & _SEGMENT_P) )
2525 regs->error_code = sel;
2526 do_guest_trap(TRAP_no_segment, regs, 1);
2527 return;
2529 if ( off > limit )
2531 regs->error_code = 0;
2532 do_guest_trap(TRAP_gp_fault, regs, 1);
2533 return;
2536 if ( !jump )
2538 unsigned int ss, esp, *stkp;
2539 int rc;
2540 #define push(item) do \
2541 { \
2542 --stkp; \
2543 esp -= 4; \
2544 rc = __put_user(item, stkp); \
2545 if ( rc ) \
2546 { \
2547 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2548 PFEC_write_access); \
2549 return; \
2550 } \
2551 } while ( 0 )
2553 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2555 sel |= (ar >> 13) & 3;
2556 /* Inner stack known only for kernel ring. */
2557 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2559 do_guest_trap(TRAP_gp_fault, regs, 1);
2560 return;
2562 esp = v->arch.guest_context.kernel_sp;
2563 ss = v->arch.guest_context.kernel_ss;
2564 if ( (ss & 3) != (sel & 3) ||
2565 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2566 ((ar >> 13) & 3) != (sel & 3) ||
2567 !(ar & _SEGMENT_S) ||
2568 (ar & _SEGMENT_CODE) ||
2569 !(ar & _SEGMENT_WR) )
2571 regs->error_code = ss & ~3;
2572 do_guest_trap(TRAP_invalid_tss, regs, 1);
2573 return;
2575 if ( !(ar & _SEGMENT_P) ||
2576 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2578 regs->error_code = ss & ~3;
2579 do_guest_trap(TRAP_stack_error, regs, 1);
2580 return;
2582 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2583 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2585 do_guest_trap(TRAP_gp_fault, regs, 1);
2586 return;
2588 push(regs->ss);
2589 push(regs->esp);
2590 if ( nparm )
2592 const unsigned int *ustkp;
2594 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2595 ((ar >> 13) & 3) != (regs->cs & 3) ||
2596 !(ar & _SEGMENT_S) ||
2597 (ar & _SEGMENT_CODE) ||
2598 !(ar & _SEGMENT_WR) ||
2599 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2600 return do_guest_trap(TRAP_gp_fault, regs, 1);
2601 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2602 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2604 do_guest_trap(TRAP_gp_fault, regs, 1);
2605 return;
2607 do
2609 unsigned int parm;
2611 --ustkp;
2612 rc = __get_user(parm, ustkp);
2613 if ( rc )
2615 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2616 return;
2618 push(parm);
2619 } while ( --nparm );
2622 else
2624 sel |= (regs->cs & 3);
2625 esp = regs->esp;
2626 ss = regs->ss;
2627 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2628 ((ar >> 13) & 3) != (sel & 3) )
2630 do_guest_trap(TRAP_gp_fault, regs, 1);
2631 return;
2633 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2635 regs->error_code = 0;
2636 do_guest_trap(TRAP_stack_error, regs, 1);
2637 return;
2639 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2640 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2642 do_guest_trap(TRAP_gp_fault, regs, 1);
2643 return;
2646 push(regs->cs);
2647 push(eip);
2648 #undef push
2649 regs->esp = esp;
2650 regs->ss = ss;
2652 else
2653 sel |= (regs->cs & 3);
2655 regs->cs = sel;
2656 instruction_done(regs, off, 0);
2657 #endif
2660 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2662 struct vcpu *v = current;
2663 unsigned long fixup;
2665 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2667 if ( regs->error_code & 1 )
2668 goto hardware_gp;
2670 if ( !guest_mode(regs) )
2671 goto gp_in_kernel;
2673 /*
2674 * Cunning trick to allow arbitrary "INT n" handling.
2676 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2677 * instruction from trapping to the appropriate vector, when that might not
2678 * be expected by Xen or the guest OS. For example, that entry might be for
2679 * a fault handler (unlike traps, faults don't increment EIP), or might
2680 * expect an error code on the stack (which a software trap never
2681 * provides), or might be a hardware interrupt handler that doesn't like
2682 * being called spuriously.
2684 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2685 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2686 * clear to indicate that it's a software fault, not hardware.
2688 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2689 * okay because they can only be triggered by an explicit DPL-checked
2690 * instruction. The DPL specified by the guest OS for these vectors is NOT
2691 * CHECKED!!
2692 */
2693 if ( (regs->error_code & 3) == 2 )
2695 /* This fault must be due to an <INT n> instruction. */
2696 const struct trap_info *ti;
2697 unsigned char vector = regs->error_code >> 3;
2698 ti = &v->arch.guest_context.trap_ctxt[vector];
2699 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2701 regs->eip += 2;
2702 do_guest_trap(vector, regs, 0);
2703 return;
2706 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2708 emulate_gate_op(regs);
2709 return;
2712 /* Emulate some simple privileged and I/O instructions. */
2713 if ( (regs->error_code == 0) &&
2714 emulate_privileged_op(regs) )
2716 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2717 return;
2720 #if defined(__i386__)
2721 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2722 (regs->error_code == 0) &&
2723 gpf_emulate_4gb(regs) )
2725 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2726 return;
2728 #endif
2730 /* Pass on GPF as is. */
2731 do_guest_trap(TRAP_gp_fault, regs, 1);
2732 return;
2734 gp_in_kernel:
2736 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2738 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2739 regs->error_code, _p(regs->eip), _p(fixup));
2740 regs->eip = fixup;
2741 return;
2744 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2746 hardware_gp:
2747 show_execution_state(regs);
2748 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2751 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2753 static void nmi_mce_softirq(void)
2755 int cpu = smp_processor_id();
2756 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2757 cpumask_t affinity;
2759 BUG_ON(st == NULL);
2760 BUG_ON(st->vcpu == NULL);
2762 /* Set the tmp value unconditionally, so that
2763 * the check in the iret hypercall works. */
2764 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2766 if ((cpu != st->processor)
2767 || (st->processor != st->vcpu->processor))
2769 /* We are on a different physical CPU.
2770 * Make sure to wake up the vcpu on the
2771 * specified processor.
2772 */
2773 cpus_clear(affinity);
2774 cpu_set(st->processor, affinity);
2775 vcpu_set_affinity(st->vcpu, &affinity);
2777 /* Affinity is restored in the iret hypercall. */
2780 /* Only used to defer wakeup of domain/vcpu to
2781 * a safe (non-NMI/MCE) context.
2782 */
2783 vcpu_kick(st->vcpu);
2786 static void nmi_dom0_report(unsigned int reason_idx)
2788 struct domain *d = dom0;
2790 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2791 return;
2793 set_bit(reason_idx, nmi_reason(d));
2795 send_guest_trap(d, 0, TRAP_nmi);
2798 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2800 switch ( opt_nmi[0] )
2802 case 'd': /* 'dom0' */
2803 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2804 case 'i': /* 'ignore' */
2805 break;
2806 default: /* 'fatal' */
2807 console_force_unlock();
2808 printk("\n\nNMI - MEMORY ERROR\n");
2809 fatal_trap(TRAP_nmi, regs);
2812 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2813 mdelay(1);
2814 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2817 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2819 switch ( opt_nmi[0] )
2821 case 'd': /* 'dom0' */
2822 nmi_dom0_report(_XEN_NMIREASON_io_error);
2823 case 'i': /* 'ignore' */
2824 break;
2825 default: /* 'fatal' */
2826 console_force_unlock();
2827 printk("\n\nNMI - I/O ERROR\n");
2828 fatal_trap(TRAP_nmi, regs);
2831 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2832 mdelay(1);
2833 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2836 static void unknown_nmi_error(unsigned char reason)
2838 switch ( opt_nmi[0] )
2840 case 'd': /* 'dom0' */
2841 nmi_dom0_report(_XEN_NMIREASON_unknown);
2842 case 'i': /* 'ignore' */
2843 break;
2844 default: /* 'fatal' */
2845 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2846 printk("Dazed and confused, but trying to continue\n");
2847 printk("Do you have a strange power saving mode enabled?\n");
2848 kexec_crash();
2852 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2854 return 0;
2857 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2859 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2861 unsigned int cpu = smp_processor_id();
2862 unsigned char reason;
2864 ++nmi_count(cpu);
2866 if ( nmi_callback(regs, cpu) )
2867 return;
2869 if ( nmi_watchdog )
2870 nmi_watchdog_tick(regs);
2872 /* Only the BSP gets external NMIs from the system. */
2873 if ( cpu == 0 )
2875 reason = inb(0x61);
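/*
 * System control port B: bit 7 flags a memory parity/SERR error,
 * bit 6 an I/O channel check (IOCHK).
 */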
2876 if ( reason & 0x80 )
2877 mem_parity_error(regs);
2878 else if ( reason & 0x40 )
2879 io_check_error(regs);
2880 else if ( !nmi_watchdog )
2881 unknown_nmi_error((unsigned char)(reason&0xff));
2885 void set_nmi_callback(nmi_callback_t callback)
2887 nmi_callback = callback;
2890 void unset_nmi_callback(void)
2892 nmi_callback = dummy_nmi_callback;
2895 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2897 struct vcpu *curr = current;
2899 BUG_ON(!guest_mode(regs));
2901 setup_fpu(curr);
2903 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2905 do_guest_trap(TRAP_no_device, regs, 0);
2906 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2908 else
2909 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2911 return;
2914 asmlinkage void do_debug(struct cpu_user_regs *regs)
2916 struct vcpu *v = current;
2918 DEBUGGER_trap_entry(TRAP_debug, regs);
2920 if ( !guest_mode(regs) )
2922 if ( regs->eflags & EF_TF )
2924 #ifdef __x86_64__
2925 void sysenter_entry(void);
2926 void sysenter_eflags_saved(void);
2927 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2928 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2929 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2930 goto out;
2931 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2932 #else
2933 WARN_ON(1);
2934 #endif
2935 regs->eflags &= ~EF_TF;
2937 else
2939 /*
2940 * We ignore watchpoints when they trigger within Xen. This may
2941 * happen when a buffer is passed to us which previously had a
2942 * watchpoint set on it. No need to bump EIP; the only faulting
2943 * trap is an instruction breakpoint, which can't happen to us.
2944 */
2945 WARN_ON(!search_exception_table(regs->eip));
2947 goto out;
2950 /* Save debug status register where guest OS can peek at it */
2951 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2953 ler_enable();
2954 do_guest_trap(TRAP_debug, regs, 0);
2955 return;
2957 out:
2958 ler_enable();
2959 return;
2962 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2966 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2968 int i;
2969 /* Keep secondary tables in sync with IRQ updates. */
2970 for ( i = 1; i < NR_CPUS; i++ )
2971 if ( idt_tables[i] != NULL )
2972 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2973 _set_gate(&idt_table[n], 14, dpl, addr);
2976 static void set_swint_gate(unsigned int n, void *addr)
2978 __set_intr_gate(n, 3, addr);
2981 void set_intr_gate(unsigned int n, void *addr)
2983 __set_intr_gate(n, 0, addr);
2986 void set_tss_desc(unsigned int n, void *addr)
2988 _set_tssldt_desc(
2989 per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
2990 (unsigned long)addr,
2991 offsetof(struct tss_struct, __cacheline_filler) - 1,
2992 9);
2993 #ifdef CONFIG_COMPAT
2994 _set_tssldt_desc(
2995 per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
2996 (unsigned long)addr,
2997 offsetof(struct tss_struct, __cacheline_filler) - 1,
2998 11);
2999 #endif
3002 void __devinit percpu_traps_init(void)
3004 subarch_percpu_traps_init();
3006 if ( !opt_ler )
3007 return;
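/*
 * Pick the MSR that records the last exception/interrupt source IP for
 * this CPU, used when the 'ler' option is enabled: family-6 Intel parts
 * and AMD K7/K8/Fam10h use MSR_IA32_LASTINTFROMIP, while Pentium 4 uses
 * MSR_P4_LER_FROM_LIP.
 */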
3009 switch ( boot_cpu_data.x86_vendor )
3011 case X86_VENDOR_INTEL:
3012 switch ( boot_cpu_data.x86 )
3014 case 6:
3015 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3016 break;
3017 case 15:
3018 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3019 break;
3021 break;
3022 case X86_VENDOR_AMD:
3023 switch ( boot_cpu_data.x86 )
3025 case 6:
3026 case 15:
3027 case 16:
3028 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3029 break;
3031 break;
3034 ler_enable();
3037 void __init trap_init(void)
3039 /*
3040 * Note that interrupt gates are always used, rather than trap gates. We
3041 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3042 * first activation must have the "bad" value(s) for these registers and
3043 * we may lose them if another activation is installed before they are
3044 * saved. The page-fault handler also needs interrupts disabled until %cr2
3045 * has been read and saved on the stack.
3046 */
3047 set_intr_gate(TRAP_divide_error,&divide_error);
3048 set_intr_gate(TRAP_debug,&debug);
3049 set_intr_gate(TRAP_nmi,&nmi);
3050 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3051 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3052 set_intr_gate(TRAP_bounds,&bounds);
3053 set_intr_gate(TRAP_invalid_op,&invalid_op);
3054 set_intr_gate(TRAP_no_device,&device_not_available);
3055 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3056 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3057 set_intr_gate(TRAP_no_segment,&segment_not_present);
3058 set_intr_gate(TRAP_stack_error,&stack_segment);
3059 set_intr_gate(TRAP_gp_fault,&general_protection);
3060 set_intr_gate(TRAP_page_fault,&page_fault);
3061 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3062 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3063 set_intr_gate(TRAP_alignment_check,&alignment_check);
3064 set_intr_gate(TRAP_machine_check,&machine_check);
3065 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3067 /* CPU0 uses the master IDT. */
3068 idt_tables[0] = idt_table;
3070 percpu_traps_init();
3072 cpu_init();
3074 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3077 long register_guest_nmi_callback(unsigned long address)
3079 struct vcpu *v = current;
3080 struct domain *d = v->domain;
3081 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3083 t->vector = TRAP_nmi;
3084 t->flags = 0;
3085 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
3086 t->address = address;
3087 TI_SET_IF(t, 1);
3089 /*
3090 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3091 * now.
3092 */
3093 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3094 v->nmi_pending = 1;
3096 return 0;
3099 long unregister_guest_nmi_callback(void)
3101 struct vcpu *v = current;
3102 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3104 memset(t, 0, sizeof(*t));
3106 return 0;
3109 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3111 struct vcpu *v;
3112 struct trap_info *t;
3114 BUG_ON(d == NULL);
3115 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3117 /* Sanity check - XXX should be more fine-grained. */
3118 BUG_ON(trap_nr > TRAP_syscall);
3120 v = d->vcpu[vcpuid];
3121 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3123 return (t->address != 0);
3127 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3129 struct vcpu *v;
3130 struct softirq_trap *st;
3132 BUG_ON(d == NULL);
3133 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3134 v = d->vcpu[vcpuid];
3136 switch (trap_nr) {
3137 case TRAP_nmi:
3138 if ( !test_and_set_bool(v->nmi_pending) ) {
3139 st = &per_cpu(softirq_trap, smp_processor_id());
3140 st->domain = dom0;
3141 st->vcpu = dom0->vcpu[0];
3142 st->processor = st->vcpu->processor;
3144 /* not safe to wake up a vcpu here */
3145 raise_softirq(NMI_MCE_SOFTIRQ);
3146 return 0;
3148 break;
3150 case TRAP_machine_check:
3152 /* We are called by the machine check (exception or polling) handlers
3153 * on the physical CPU that reported a machine check error. */
3155 if ( !test_and_set_bool(v->mce_pending) ) {
3156 st = &per_cpu(softirq_trap, smp_processor_id());
3157 st->domain = d;
3158 st->vcpu = v;
3159 st->processor = v->processor;
3161 /* not safe to wake up a vcpu here */
3162 raise_softirq(NMI_MCE_SOFTIRQ);
3163 return 0;
3165 break;
3168 /* delivery failed */
3169 return -EIO;
3173 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3175 struct trap_info cur;
3176 struct vcpu *curr = current;
3177 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3178 long rc = 0;
3180 /* If no table is presented then clear the entire virtual IDT. */
3181 if ( guest_handle_is_null(traps) )
3183 memset(dst, 0, 256 * sizeof(*dst));
3184 init_int80_direct_trap(curr);
3185 return 0;
3188 for ( ; ; )
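/*
 * Copy one trap_info entry per iteration. If a preemption point is
 * reached, a continuation of this hypercall is created so the remaining
 * entries are handled on re-entry; a zero handler address terminates
 * the table.
 */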
3190 if ( hypercall_preempt_check() )
3192 rc = hypercall_create_continuation(
3193 __HYPERVISOR_set_trap_table, "h", traps);
3194 break;
3197 if ( copy_from_guest(&cur, traps, 1) )
3199 rc = -EFAULT;
3200 break;
3203 if ( cur.address == 0 )
3204 break;
3206 fixup_guest_code_selector(curr->domain, cur.cs);
3208 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3210 if ( cur.vector == 0x80 )
3211 init_int80_direct_trap(curr);
3213 guest_handle_add_offset(traps, 1);
3216 return rc;
3219 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3221 int i;
3222 struct vcpu *curr = current;
3224 switch ( reg )
3226 case 0:
3227 if ( !access_ok(value, sizeof(long)) )
3228 return -EPERM;
3229 if ( v == curr )
3230 write_debugreg(0, value);
3231 break;
3232 case 1:
3233 if ( !access_ok(value, sizeof(long)) )
3234 return -EPERM;
3235 if ( v == curr )
3236 write_debugreg(1, value);
3237 break;
3238 case 2:
3239 if ( !access_ok(value, sizeof(long)) )
3240 return -EPERM;
3241 if ( v == curr )
3242 write_debugreg(2, value);
3243 break;
3244 case 3:
3245 if ( !access_ok(value, sizeof(long)) )
3246 return -EPERM;
3247 if ( v == curr )
3248 write_debugreg(3, value);
3249 break;
3250 case 6:
3251 /*
3252 * DR6: Bits 4-11,16-31 reserved (set to 1).
3253 * Bit 12 reserved (set to 0).
3254 */
3255 value &= 0xffffefff; /* reserved bits => 0 */
3256 value |= 0xffff0ff0; /* reserved bits => 1 */
3257 if ( v == curr )
3258 write_debugreg(6, value);
3259 break;
3260 case 7:
3261 /*
3262 * DR7: Bit 10 reserved (set to 1).
3263 * Bits 11-12,14-15 reserved (set to 0).
3264 */
3265 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3266 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3267 /*
3268 * Privileged bits:
3269 * GD (bit 13): must be 0.
3270 */
3271 if ( value & DR_GENERAL_DETECT )
3272 return -EPERM;
3273 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3274 if ( value & DR7_ACTIVE_MASK )
3276 unsigned int io_enable = 0;
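/*
 * Scan the four breakpoint condition fields. I/O breakpoints
 * (R/W == DR_IO) require CR4.DE; their local/global enable bits are
 * collected in io_enable and stripped from the value loaded into the
 * real DR7, being stashed in the unused debugreg[5] slot instead.
 */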
3278 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3280 if ( ((value >> i) & 3) == DR_IO )
3282 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3283 return -EPERM;
3284 io_enable |= value & (3 << ((i - 16) >> 1));
3286 #ifdef __i386__
3287 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3288 !boot_cpu_has(X86_FEATURE_LM)) &&
3289 (((value >> i) & 0xc) == DR_LEN_8) )
3290 return -EPERM;
3291 #endif
3294 /* Guest DR5 is a handy stash for I/O intercept information. */
3295 v->arch.guest_context.debugreg[5] = io_enable;
3296 value &= ~io_enable;
3298 /*
3299 * If DR7 was previously clear then we need to load all other
3300 * debug registers at this point as they were not restored during
3301 * context switch.
3302 */
3303 if ( (v == curr) &&
3304 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3306 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3307 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3308 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3309 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3310 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3313 if ( v == curr )
3314 write_debugreg(7, value);
3315 break;
3316 default:
3317 return -EINVAL;
3320 v->arch.guest_context.debugreg[reg] = value;
3321 return 0;
3324 long do_set_debugreg(int reg, unsigned long value)
3326 return set_debugreg(current, reg, value);
3329 unsigned long do_get_debugreg(int reg)
3331 struct vcpu *curr = current;
3333 switch ( reg )
3335 case 0 ... 3:
3336 case 6:
3337 return curr->arch.guest_context.debugreg[reg];
3338 case 7:
3339 return (curr->arch.guest_context.debugreg[7] |
3340 curr->arch.guest_context.debugreg[5]);
3341 case 4 ... 5:
3342 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3343 curr->arch.guest_context.debugreg[reg + 2] : 0);
3346 return -EINVAL;
3349 /*
3350 * Local variables:
3351 * mode: C
3352 * c-set-style: "BSD"
3353 * c-basic-offset: 4
3354 * tab-width: 4
3355 * indent-tabs-mode: nil
3356 * End:
3357 */