ia64/xen-unstable: xen/arch/x86/traps.c @ changeset 18641:22c89412fc8c

x86 cpuid: leaf 4 sub-index goes in %ecx, not %ebx

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Wed Oct 15 15:58:09 2008 +0100 (2008-10-15)
parents  3603e95245fa
children 85ba96069dfb

line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
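/*
 * Illustration (not part of this file): a minimal sketch of how the parsed
 * "nmi=" string above could be mapped to a behaviour at NMI time, e.g. when
 * booting Xen with "nmi=ignore".  The helper name nmi_action_from_opt() is
 * hypothetical.
 */
static inline int nmi_action_from_opt(const char *opt)
{
    if ( !strcmp(opt, "ignore") )
        return 0; /* clear the error and carry on */
    if ( !strcmp(opt, "dom0") )
        return 1; /* virtualise the NMI to DOM0 */
    return 2;     /* "fatal": print diagnostics and hang */
}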
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
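/*
 * For reference, DECLARE_TRAP_HANDLER(page_fault) expands to the pair of
 * prototypes below: the low-level entry point (defined in assembly) and the
 * C handler that the entry point hands the saved register frame to.
 *
 *     asmlinkage void page_fault(void);
 *     asmlinkage void do_page_fault(struct cpu_user_regs *regs);
 */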
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
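/*
 * Layout assumed by the walker above.  An ordinary frame built by
 * "push %ebp; mov %esp,%ebp" looks like:
 *
 *     frame[1]   return address in the caller
 *     frame[0]   caller's saved frame pointer (the next frame to visit)
 *
 * Exception stack frames are denoted by an inverted frame pointer (see the
 * comment in the loop), so a value outside [low, high) is inverted and, if
 * it then lands on the stack, treated as a pointer to the saved %ebp slot of
 * a struct cpu_user_regs; the interrupted %eip is fetched from the slot
 * (offsetof(eip) - offsetof(ebp)) / BYTES_PER_LONG words above it.
 */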
259 #endif
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
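/*
 * Worked example for the range computation above, assuming STACK_SIZE is
 * 32KiB (0x8000): for esp = 0xffff830000127f08,
 *     esp_bottom = (esp | 0x7fff) + 1 = 0xffff830000128000
 *     esp_top    = esp_bottom - PRIMARY_STACK_SIZE
 * i.e. esp_bottom is the next STACK_SIZE boundary above esp and esp_top is
 * the lowest valid address of the primary stack; the trace is only dumped
 * when esp has strayed to within 512 bytes of esp_top.
 */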
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
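/*
 * Worked example for the overlap test above: with I/O breakpoint 0 enabled,
 * DR0 = 0x3f8 and a length field of DR_LEN_2 (width 2), an access to port
 * 0x3f9 of any size satisfies (0x3f8 < port + len) && (0x3fa > port), so
 * bit 0 is set in the returned mask and later folded into the guest's
 * virtual %dr6 by instruction_done().
 */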
489 /*
490 * Called from asm to set up the MCE trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_machinecheck_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
498 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
499 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
500 return !null_trap_bounce(v, tb);
501 }
503 /*
504 * Called from asm to set up the NMI trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_nmi_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
511 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
512 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
513 return !null_trap_bounce(v, tb);
514 }
516 static inline void do_trap(
517 int trapnr, struct cpu_user_regs *regs, int use_error_code)
518 {
519 struct vcpu *curr = current;
520 unsigned long fixup;
522 DEBUGGER_trap_entry(trapnr, regs);
524 if ( guest_mode(regs) )
525 {
526 do_guest_trap(trapnr, regs, use_error_code);
527 return;
528 }
530 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
531 {
532 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
533 trapnr, _p(regs->eip), _p(fixup));
534 regs->eip = fixup;
535 return;
536 }
538 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
539 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
540 {
541 curr->arch.hvm_vcpu.fpu_exception_callback(
542 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
543 return;
544 }
546 DEBUGGER_trap_fatal(trapnr, regs);
548 show_execution_state(regs);
549 panic("FATAL TRAP: vector = %d (%s)\n"
550 "[error_code=%04x]\n",
551 trapnr, trapstr(trapnr), regs->error_code);
552 }
554 #define DO_ERROR_NOCODE(trapnr, name) \
555 asmlinkage void do_##name(struct cpu_user_regs *regs) \
556 { \
557 do_trap(trapnr, regs, 0); \
558 }
560 #define DO_ERROR(trapnr, name) \
561 asmlinkage void do_##name(struct cpu_user_regs *regs) \
562 { \
563 do_trap(trapnr, regs, 1); \
564 }
566 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
567 DO_ERROR_NOCODE(TRAP_overflow, overflow)
568 DO_ERROR_NOCODE(TRAP_bounds, bounds)
569 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
570 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
571 DO_ERROR( TRAP_no_segment, segment_not_present)
572 DO_ERROR( TRAP_stack_error, stack_segment)
573 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
574 DO_ERROR( TRAP_alignment_check, alignment_check)
575 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
577 int rdmsr_hypervisor_regs(
578 uint32_t idx, uint32_t *eax, uint32_t *edx)
579 {
580 struct domain *d = current->domain;
581 /* Optionally shift out of the way of Viridian architectural MSRs. */
582 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
584 idx -= base;
585 if ( idx > 0 )
586 return 0;
588 switch ( idx )
589 {
590 case 0:
591 {
592 *eax = *edx = 0;
593 break;
594 }
595 default:
596 BUG();
597 }
599 return 1;
600 }
602 int wrmsr_hypervisor_regs(
603 uint32_t idx, uint32_t eax, uint32_t edx)
604 {
605 struct domain *d = current->domain;
606 /* Optionally shift out of the way of Viridian architectural MSRs. */
607 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
609 idx -= base;
610 if ( idx > 0 )
611 return 0;
613 switch ( idx )
614 {
615 case 0:
616 {
617 void *hypercall_page;
618 unsigned long mfn;
619 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
620 unsigned int idx = eax & 0xfff;
622 if ( idx > 0 )
623 {
624 gdprintk(XENLOG_WARNING,
625 "Out of range index %u to MSR %08x\n",
626 idx, 0x40000000);
627 return 0;
628 }
630 mfn = gmfn_to_mfn(d, gmfn);
632 if ( !mfn_valid(mfn) ||
633 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
634 {
635 gdprintk(XENLOG_WARNING,
636 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
637 gmfn, mfn, base + idx);
638 return 0;
639 }
641 hypercall_page = map_domain_page(mfn);
642 hypercall_page_initialise(d, hypercall_page);
643 unmap_domain_page(hypercall_page);
645 put_page_and_type(mfn_to_page(mfn));
646 break;
647 }
649 default:
650 BUG();
651 }
653 return 1;
654 }
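/*
 * Illustration (not part of this file): how a guest could drive the MSR
 * handled above to have Xen initialise its hypercall page.  A minimal
 * sketch assuming a non-Viridian domain (MSR base 0x40000000) and a
 * hypothetical, page-aligned guest-physical address hypercall_page_gpa;
 * the low 12 bits of %eax carry the page index (0 for a single page).
 */
static inline void example_init_hypercall_page(uint64_t hypercall_page_gpa)
{
    uint32_t eax = (uint32_t)hypercall_page_gpa; /* index 0 in low 12 bits */
    uint32_t edx = (uint32_t)(hypercall_page_gpa >> 32);

    asm volatile ( "wrmsr" : : "c" (0x40000000), "a" (eax), "d" (edx) );
}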
656 int cpuid_hypervisor_leaves(
657 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
658 {
659 struct domain *d = current->domain;
660 /* Optionally shift out of the way of Viridian architectural leaves. */
661 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
663 idx -= base;
664 if ( idx > 2 )
665 return 0;
667 switch ( idx )
668 {
669 case 0:
670 *eax = base + 2; /* Largest leaf */
671 *ebx = XEN_CPUID_SIGNATURE_EBX;
672 *ecx = XEN_CPUID_SIGNATURE_ECX;
673 *edx = XEN_CPUID_SIGNATURE_EDX;
674 break;
676 case 1:
677 *eax = (xen_major_version() << 16) | xen_minor_version();
678 *ebx = 0; /* Reserved */
679 *ecx = 0; /* Reserved */
680 *edx = 0; /* Reserved */
681 break;
683 case 2:
684 *eax = 1; /* Number of hypercall-transfer pages */
685 *ebx = 0x40000000; /* MSR base address */
686 if ( is_viridian_domain(d) )
687 *ebx = 0x40000200;
688 *ecx = 0; /* Features 1 */
689 *edx = 0; /* Features 2 */
690 if ( !is_hvm_vcpu(current) )
691 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
692 break;
694 default:
695 BUG();
696 }
698 return 1;
699 }
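/*
 * Illustration (not part of this file): how an HVM guest (or any caller
 * whose CPUID is intercepted) probes the leaves implemented above; a PV
 * guest would prepend the forced-emulation signature handled further down
 * in this file.  A minimal sketch assuming the non-Viridian base of
 * 0x40000000; the helper name is hypothetical.
 */
static inline int example_running_on_xen(void)
{
    uint32_t eax, ebx, ecx, edx;

    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "0" (0x40000000) );

    return (ebx == XEN_CPUID_SIGNATURE_EBX) &&
           (ecx == XEN_CPUID_SIGNATURE_ECX) &&
           (edx == XEN_CPUID_SIGNATURE_EDX) &&
           (eax >= 0x40000002); /* leaves 1 and 2 are present */
}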
701 static void pv_cpuid(struct cpu_user_regs *regs)
702 {
703 uint32_t a, b, c, d;
705 a = regs->eax;
706 b = regs->ebx;
707 c = regs->ecx;
708 d = regs->edx;
710 if ( current->domain->domain_id != 0 )
711 {
712 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
713 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
714 goto out;
715 }
717 asm (
718 "cpuid"
719 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
720 : "0" (a), "1" (b), "2" (c), "3" (d) );
722 if ( (regs->eax & 0x7fffffff) == 1 )
723 {
724 /* Modify Feature Information. */
725 __clear_bit(X86_FEATURE_VME, &d);
726 __clear_bit(X86_FEATURE_PSE, &d);
727 __clear_bit(X86_FEATURE_PGE, &d);
728 __clear_bit(X86_FEATURE_MCE, &d);
729 __clear_bit(X86_FEATURE_MCA, &d);
730 __clear_bit(X86_FEATURE_PSE36, &d);
731 }
732 switch ( (uint32_t)regs->eax )
733 {
734 case 1:
735 /* Modify Feature Information. */
736 if ( !cpu_has_sep )
737 __clear_bit(X86_FEATURE_SEP, &d);
738 #ifdef __i386__
739 if ( !supervisor_mode_kernel )
740 __clear_bit(X86_FEATURE_SEP, &d);
741 #endif
742 __clear_bit(X86_FEATURE_DS, &d);
743 __clear_bit(X86_FEATURE_ACC, &d);
744 __clear_bit(X86_FEATURE_PBE, &d);
746 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
747 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
748 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
749 __clear_bit(X86_FEATURE_VMXE % 32, &c);
750 __clear_bit(X86_FEATURE_SMXE % 32, &c);
751 __clear_bit(X86_FEATURE_TM2 % 32, &c);
752 if ( is_pv_32bit_vcpu(current) )
753 __clear_bit(X86_FEATURE_CX16 % 32, &c);
754 __clear_bit(X86_FEATURE_XTPR % 32, &c);
755 __clear_bit(X86_FEATURE_PDCM % 32, &c);
756 __clear_bit(X86_FEATURE_DCA % 32, &c);
757 break;
758 case 0x80000001:
759 /* Modify Feature Information. */
760 if ( is_pv_32bit_vcpu(current) )
761 {
762 __clear_bit(X86_FEATURE_LM % 32, &d);
763 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
764 }
765 #ifndef __i386__
766 if ( is_pv_32on64_vcpu(current) &&
767 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
768 #endif
769 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
770 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
771 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
773 __clear_bit(X86_FEATURE_SVME % 32, &c);
774 __clear_bit(X86_FEATURE_OSVW % 32, &c);
775 __clear_bit(X86_FEATURE_IBS % 32, &c);
776 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
777 __clear_bit(X86_FEATURE_WDT % 32, &c);
778 break;
779 case 5: /* MONITOR/MWAIT */
780 case 0xa: /* Architectural Performance Monitor Features */
781 case 0x8000000a: /* SVM revision and features */
782 case 0x8000001b: /* Instruction Based Sampling */
783 a = b = c = d = 0;
784 break;
785 default:
786 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
787 break;
788 }
790 out:
791 regs->eax = a;
792 regs->ebx = b;
793 regs->ecx = c;
794 regs->edx = d;
795 }
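/*
 * Note on the changeset in the header ("x86 cpuid: leaf 4 sub-index goes in
 * %ecx, not %ebx"): sub-indexed leaves such as leaf 4 (deterministic cache
 * parameters) take their sub-leaf number in %ecx alongside the leaf in %eax,
 * which is why pv_cpuid() above forwards regs->ecx as the sub-index argument
 * to domain_cpuid().
 */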
797 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
798 {
799 char sig[5], instr[2];
800 unsigned long eip, rc;
802 eip = regs->eip;
804 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
805 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
806 {
807 propagate_page_fault(eip + sizeof(sig) - rc, 0);
808 return EXCRET_fault_fixed;
809 }
810 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
811 return 0;
812 eip += sizeof(sig);
814 /* We only emulate CPUID. */
815 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
816 {
817 propagate_page_fault(eip + sizeof(instr) - rc, 0);
818 return EXCRET_fault_fixed;
819 }
820 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
821 return 0;
822 eip += sizeof(instr);
824 pv_cpuid(regs);
826 instruction_done(regs, eip, 0);
828 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
830 return EXCRET_fault_fixed;
831 }
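/*
 * Illustration (not part of this file): the guest-side sequence recognised
 * above.  A PV guest that wants its CPUID filtered by pv_cpuid() prefixes
 * the instruction with ud2 followed by the ASCII bytes "xen"
 * (0f 0b 78 65 6e).  The macro and function names below are hypothetical.
 */
#define EXAMPLE_XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "

static inline void example_xen_cpuid(uint32_t leaf, uint32_t subleaf,
                                     uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d)
{
    asm volatile ( EXAMPLE_XEN_EMULATE_PREFIX "cpuid"
                   : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                   : "0" (leaf), "2" (subleaf) );
}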
833 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
834 {
835 struct bug_frame bug;
836 struct bug_frame_str bug_str;
837 char *filename, *predicate, *eip = (char *)regs->eip;
838 unsigned long fixup;
839 int id, lineno;
841 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
843 if ( likely(guest_mode(regs)) )
844 {
845 if ( !emulate_forced_invalid_op(regs) )
846 do_guest_trap(TRAP_invalid_op, regs, 0);
847 return;
848 }
850 if ( !is_kernel(eip) ||
851 __copy_from_user(&bug, eip, sizeof(bug)) ||
852 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
853 (bug.ret != 0xc2) )
854 goto die;
855 eip += sizeof(bug);
857 id = bug.id & 3;
859 if ( id == BUGFRAME_dump )
860 {
861 show_execution_state(regs);
862 regs->eip = (unsigned long)eip;
863 return;
864 }
866 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
867 if ( !is_kernel(eip) ||
868 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
869 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
870 goto die;
871 eip += sizeof(bug_str);
873 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
874 lineno = bug.id >> 2;
876 if ( id == BUGFRAME_warn )
877 {
878 printk("Xen WARN at %.50s:%d\n", filename, lineno);
879 show_execution_state(regs);
880 regs->eip = (unsigned long)eip;
881 return;
882 }
884 if ( id == BUGFRAME_bug )
885 {
886 printk("Xen BUG at %.50s:%d\n", filename, lineno);
887 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
888 show_execution_state(regs);
889 panic("Xen BUG at %.50s:%d\n", filename, lineno);
890 }
892 /* ASSERT: decode the predicate string pointer. */
893 ASSERT(id == BUGFRAME_assert);
894 if ( !is_kernel(eip) ||
895 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
896 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
897 goto die;
898 eip += sizeof(bug_str);
900 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
901 printk("Assertion '%s' failed at %.50s:%d\n",
902 predicate, filename, lineno);
903 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
904 show_execution_state(regs);
905 panic("Assertion '%s' failed at %.50s:%d\n",
906 predicate, filename, lineno);
908 die:
909 if ( (fixup = search_exception_table(regs->eip)) != 0 )
910 {
911 regs->eip = fixup;
912 return;
913 }
914 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
915 show_execution_state(regs);
916 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
917 }
919 asmlinkage void do_int3(struct cpu_user_regs *regs)
920 {
921 DEBUGGER_trap_entry(TRAP_int3, regs);
923 if ( !guest_mode(regs) )
924 {
925 debugger_trap_fatal(TRAP_int3, regs);
926 return;
927 }
929 do_guest_trap(TRAP_int3, regs, 0);
930 }
932 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
933 {
934 machine_check_vector(regs, regs->error_code);
935 }
937 static void reserved_bit_page_fault(
938 unsigned long addr, struct cpu_user_regs *regs)
939 {
940 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
941 current->domain->domain_id, current->vcpu_id, regs->error_code);
942 show_page_walk(addr);
943 show_execution_state(regs);
944 }
946 void propagate_page_fault(unsigned long addr, u16 error_code)
947 {
948 struct trap_info *ti;
949 struct vcpu *v = current;
950 struct trap_bounce *tb = &v->arch.trap_bounce;
952 v->arch.guest_context.ctrlreg[2] = addr;
953 arch_set_cr2(v, addr);
955 /* Re-set error_code.user flag appropriately for the guest. */
956 error_code &= ~PFEC_user_mode;
957 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
958 error_code |= PFEC_user_mode;
960 trace_pv_page_fault(addr, error_code);
962 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
963 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
964 tb->error_code = error_code;
965 tb->cs = ti->cs;
966 tb->eip = ti->address;
967 if ( TI_GET_IF(ti) )
968 tb->flags |= TBF_INTERRUPT;
969 if ( unlikely(null_trap_bounce(v, tb)) )
970 {
971 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
972 v->domain->domain_id, v->vcpu_id, error_code);
973 show_page_walk(addr);
974 }
976 if ( unlikely(error_code & PFEC_reserved_bit) )
977 reserved_bit_page_fault(addr, guest_cpu_user_regs());
978 }
980 static int handle_gdt_ldt_mapping_fault(
981 unsigned long offset, struct cpu_user_regs *regs)
982 {
983 struct vcpu *curr = current;
984 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
985 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
986 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
988 /* Should never fault in another vcpu's area. */
989 BUG_ON(vcpu_area != curr->vcpu_id);
991 /* Byte offset within the gdt/ldt sub-area. */
992 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
994 if ( likely(is_ldt_area) )
995 {
996 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
997 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
998 {
999 if ( guest_mode(regs) )
1000 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1001 regs->eip, offset);
1002 }
1003 else
1004 {
1005 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1006 if ( !guest_mode(regs) )
1007 return 0;
1008 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1009 propagate_page_fault(
1010 curr->arch.guest_context.ldt_base + offset,
1011 regs->error_code);
1012 }
1013 }
1014 else
1015 {
1016 /* GDT fault: handle the fault as #GP(selector). */
1017 regs->error_code = (u16)offset & ~7;
1018 (void)do_general_protection(regs);
1019 }
1021 return EXCRET_fault_fixed;
1022 }
1024 #ifdef HYPERVISOR_VIRT_END
1025 #define IN_HYPERVISOR_RANGE(va) \
1026 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1027 #else
1028 #define IN_HYPERVISOR_RANGE(va) \
1029 (((va) >= HYPERVISOR_VIRT_START))
1030 #endif
1032 static int __spurious_page_fault(
1033 unsigned long addr, struct cpu_user_regs *regs)
1035 unsigned long mfn, cr3 = read_cr3();
1036 #if CONFIG_PAGING_LEVELS >= 4
1037 l4_pgentry_t l4e, *l4t;
1038 #endif
1039 #if CONFIG_PAGING_LEVELS >= 3
1040 l3_pgentry_t l3e, *l3t;
1041 #endif
1042 l2_pgentry_t l2e, *l2t;
1043 l1_pgentry_t l1e, *l1t;
1044 unsigned int required_flags, disallowed_flags;
1046 /*
1047 * We do not take spurious page faults in IRQ handlers as we do not
1048 * modify page tables in IRQ context. We therefore bail here because
1049 * map_domain_page() is not IRQ-safe.
1050 */
1051 if ( in_irq() )
1052 return 0;
1054 /* Reserved bit violations are never spurious faults. */
1055 if ( regs->error_code & PFEC_reserved_bit )
1056 return 0;
1058 required_flags = _PAGE_PRESENT;
1059 if ( regs->error_code & PFEC_write_access )
1060 required_flags |= _PAGE_RW;
1061 if ( regs->error_code & PFEC_user_mode )
1062 required_flags |= _PAGE_USER;
1064 disallowed_flags = 0;
1065 if ( regs->error_code & PFEC_insn_fetch )
1066 disallowed_flags |= _PAGE_NX;
1068 mfn = cr3 >> PAGE_SHIFT;
1070 #if CONFIG_PAGING_LEVELS >= 4
1071 l4t = map_domain_page(mfn);
1072 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1073 mfn = l4e_get_pfn(l4e);
1074 unmap_domain_page(l4t);
1075 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1076 (l4e_get_flags(l4e) & disallowed_flags) )
1077 return 0;
1078 #endif
1080 #if CONFIG_PAGING_LEVELS >= 3
1081 l3t = map_domain_page(mfn);
1082 #if CONFIG_PAGING_LEVELS == 3
1083 l3t += (cr3 & 0xFE0UL) >> 3;
1084 #endif
1085 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1086 mfn = l3e_get_pfn(l3e);
1087 unmap_domain_page(l3t);
1088 #if CONFIG_PAGING_LEVELS == 3
1089 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1090 return 0;
1091 #else
1092 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1093 (l3e_get_flags(l3e) & disallowed_flags) )
1094 return 0;
1095 #endif
1096 #endif
1098 l2t = map_domain_page(mfn);
1099 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1100 mfn = l2e_get_pfn(l2e);
1101 unmap_domain_page(l2t);
1102 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1103 (l2e_get_flags(l2e) & disallowed_flags) )
1104 return 0;
1105 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1106 {
1107 l1e = l1e_empty(); /* define before use in debug tracing */
1108 goto spurious;
1109 }
1111 l1t = map_domain_page(mfn);
1112 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1113 mfn = l1e_get_pfn(l1e);
1114 unmap_domain_page(l1t);
1115 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1116 (l1e_get_flags(l1e) & disallowed_flags) )
1117 return 0;
1119 spurious:
1120 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1121 "at addr %lx, e/c %04x\n",
1122 current->domain->domain_id, current->vcpu_id,
1123 addr, regs->error_code);
1124 #if CONFIG_PAGING_LEVELS >= 4
1125 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1126 #endif
1127 #if CONFIG_PAGING_LEVELS >= 3
1128 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1129 #endif
1130 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1131 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1132 #ifndef NDEBUG
1133 show_registers(regs);
1134 #endif
1135 return 1;
1138 static int spurious_page_fault(
1139 unsigned long addr, struct cpu_user_regs *regs)
1141 unsigned long flags;
1142 int is_spurious;
1144 /*
1145 * Disabling interrupts prevents TLB flushing, and hence prevents
1146 * page tables from becoming invalid under our feet during the walk.
1147 */
1148 local_irq_save(flags);
1149 is_spurious = __spurious_page_fault(addr, regs);
1150 local_irq_restore(flags);
1152 return is_spurious;
1155 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1157 struct vcpu *v = current;
1158 struct domain *d = v->domain;
1160 /* No fixups in interrupt context or when interrupts are disabled. */
1161 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1162 return 0;
1164 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1166 if ( paging_mode_external(d) && guest_mode(regs) )
1168 int ret = paging_fault(addr, regs);
1169 if ( ret == EXCRET_fault_fixed )
1170 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1171 return ret;
1173 if ( !(regs->error_code & PFEC_reserved_bit) &&
1174 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1175 return handle_gdt_ldt_mapping_fault(
1176 addr - GDT_LDT_VIRT_START, regs);
1177 return 0;
1180 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1181 guest_kernel_mode(v, regs) &&
1182 /* Do not check if access-protection fault since the page may
1183 legitimately be not present in shadow page tables */
1184 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1185 PFEC_write_access) &&
1186 ptwr_do_page_fault(v, addr, regs) )
1187 return EXCRET_fault_fixed;
1189 if ( paging_mode_enabled(d) )
1191 int ret = paging_fault(addr, regs);
1192 if ( ret == EXCRET_fault_fixed )
1193 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1194 return ret;
1197 return 0;
1200 /*
1201 * #PF error code:
1202 * Bit 0: Protection violation (=1) ; Page not present (=0)
1203 * Bit 1: Write access
1204 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1205 * Bit 3: Reserved bit violation
1206 * Bit 4: Instruction fetch
1207 */
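/*
 * Example decodes: ec=0x0002 is a supervisor-mode write to a not-present
 * page; ec=0x0007 is a user-mode write that hit a protection violation.  A
 * set bit 3 (PFEC_reserved_bit) always means a corrupt page-table entry,
 * which is why such faults are reported by reserved_bit_page_fault() above.
 */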
1208 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1209 {
1210 unsigned long addr, fixup;
1212 addr = read_cr2();
1214 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1216 perfc_incr(page_faults);
1218 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1219 return;
1221 if ( unlikely(!guest_mode(regs)) )
1222 {
1223 if ( spurious_page_fault(addr, regs) )
1224 return;
1226 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1227 {
1228 perfc_incr(copy_user_faults);
1229 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1230 reserved_bit_page_fault(addr, regs);
1231 regs->eip = fixup;
1232 return;
1233 }
1235 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1237 show_execution_state(regs);
1238 show_page_walk(addr);
1239 panic("FATAL PAGE FAULT\n"
1240 "[error_code=%04x]\n"
1241 "Faulting linear address: %p\n",
1242 regs->error_code, _p(addr));
1243 }
1245 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1246 && spurious_page_fault(addr, regs)) )
1247 return;
1249 propagate_page_fault(addr, regs->error_code);
1250 }
1252 /*
1253 * Early #PF handler to print CR2, error code, and stack.
1255 * We also deal with spurious faults here, even though they should never happen
1256 * during early boot (an issue was seen once, but was most likely a hardware
1257 * problem).
1258 */
1259 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1261 static int stuck;
1262 static unsigned long prev_eip, prev_cr2;
1263 unsigned long cr2 = read_cr2();
1265 BUG_ON(smp_processor_id() != 0);
1267 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1269 prev_eip = regs->eip;
1270 prev_cr2 = cr2;
1271 stuck = 0;
1272 return;
1275 if ( stuck++ == 1000 )
1277 unsigned long *stk = (unsigned long *)regs;
1278 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1279 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1280 printk("Stack dump: ");
1281 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1282 printk("%p ", _p(*stk++));
1283 for ( ; ; ) ;
1287 long do_fpu_taskswitch(int set)
1289 struct vcpu *v = current;
1291 if ( set )
1293 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1294 stts();
1296 else
1298 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1299 if ( v->fpu_dirtied )
1300 clts();
1303 return 0;
1306 static int read_descriptor(unsigned int sel,
1307 const struct vcpu *v,
1308 const struct cpu_user_regs * regs,
1309 unsigned long *base,
1310 unsigned long *limit,
1311 unsigned int *ar,
1312 unsigned int vm86attr)
1314 struct desc_struct desc;
1316 if ( !vm86_mode(regs) )
1318 if ( sel < 4)
1319 desc.b = desc.a = 0;
1320 else if ( __get_user(desc,
1321 (const struct desc_struct *)(!(sel & 4)
1322 ? GDT_VIRT_START(v)
1323 : LDT_VIRT_START(v))
1324 + (sel >> 3)) )
1325 return 0;
1326 if ( !(vm86attr & _SEGMENT_CODE) )
1327 desc.b &= ~_SEGMENT_L;
1329 else
1331 desc.a = (sel << 20) | 0xffff;
1332 desc.b = vm86attr | (sel >> 12);
1335 *ar = desc.b & 0x00f0ff00;
1336 if ( !(desc.b & _SEGMENT_L) )
1338 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1339 (desc.b & 0xff000000));
1340 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1341 if ( desc.b & _SEGMENT_G )
1342 *limit = ((*limit + 1) << 12) - 1;
1343 #ifndef NDEBUG
1344 if ( !vm86_mode(regs) && (sel > 3) )
1346 unsigned int a, l;
1347 unsigned char valid;
1349 asm volatile (
1350 "larl %2,%0 ; setz %1"
1351 : "=r" (a), "=rm" (valid) : "rm" (sel));
1352 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1353 asm volatile (
1354 "lsll %2,%0 ; setz %1"
1355 : "=r" (l), "=rm" (valid) : "rm" (sel));
1356 BUG_ON(valid && (l != *limit));
1358 #endif
1360 else
1362 *base = 0UL;
1363 *limit = ~0UL;
1366 return 1;
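/*
 * Worked example for the decode above: a flat 32-bit code descriptor with
 * desc.a = 0x0000ffff, desc.b = 0x00cf9a00 yields base = 0, a raw limit of
 * 0xfffff with _SEGMENT_G set, hence *limit = ((0xfffff + 1) << 12) - 1 =
 * 0xffffffff, and *ar = desc.b & 0x00f0ff00 = 0x00c09a00 (present, DPL 0,
 * readable code, 32-bit default operand size).
 */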
1369 #ifdef __x86_64__
1370 static int read_gate_descriptor(unsigned int gate_sel,
1371 const struct vcpu *v,
1372 unsigned int *sel,
1373 unsigned long *off,
1374 unsigned int *ar)
1376 struct desc_struct desc;
1377 const struct desc_struct *pdesc;
1380 pdesc = (const struct desc_struct *)
1381 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1382 + (gate_sel >> 3);
1383 if ( (gate_sel < 4) ||
1384 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1385 __get_user(desc, pdesc) )
1386 return 0;
1388 *sel = (desc.a >> 16) & 0x0000fffc;
1389 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1390 *ar = desc.b & 0x0000ffff;
1392 /*
1393 * check_descriptor() clears the DPL field and stores the
1394 * guest requested DPL in the selector's RPL field.
1395 */
1396 if ( *ar & _SEGMENT_DPL )
1397 return 0;
1398 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1400 if ( !is_pv_32bit_vcpu(v) )
1402 if ( (*ar & 0x1f00) != 0x0c00 ||
1403 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1404 __get_user(desc, pdesc + 1) ||
1405 (desc.b & 0x1f00) )
1406 return 0;
1408 *off |= (unsigned long)desc.a << 32;
1409 return 1;
1412 switch ( *ar & 0x1f00 )
1414 case 0x0400:
1415 *off &= 0xffff;
1416 break;
1417 case 0x0c00:
1418 break;
1419 default:
1420 return 0;
1423 return 1;
1425 #endif
1427 /* Has the guest requested sufficient permission for this I/O access? */
1428 static int guest_io_okay(
1429 unsigned int port, unsigned int bytes,
1430 struct vcpu *v, struct cpu_user_regs *regs)
1432 #if defined(__x86_64__)
1433 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1434 int user_mode = !(v->arch.flags & TF_kernel_mode);
1435 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1436 #elif defined(__i386__)
1437 #define TOGGLE_MODE() ((void)0)
1438 #endif
1440 if ( !vm86_mode(regs) &&
1441 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1442 return 1;
1444 if ( v->arch.iobmp_limit > (port + bytes) )
1446 union { uint8_t bytes[2]; uint16_t mask; } x;
1448 /*
1449 * Grab permission bytes from guest space. Inaccessible bytes are
1450 * read as 0xff (no access allowed).
1451 */
1452 TOGGLE_MODE();
1453 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1454 port>>3, 2) )
1456 default: x.bytes[0] = ~0;
1457 case 1: x.bytes[1] = ~0;
1458 case 0: break;
1460 TOGGLE_MODE();
1462 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1463 return 1;
1466 return 0;
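/*
 * Worked example for the bitmap test above: for port = 0x3f9, bytes = 2 the
 * two permission bytes at iobmp offset 0x3f9 >> 3 = 0x7f are fetched, and
 * the access is allowed only if bits (0x3f9 & 7) = 1 and 2 of that 16-bit
 * window are clear, i.e. (x.mask & (0x3 << 1)) == 0.
 */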
1469 /* Has the administrator granted sufficient permission for this I/O access? */
1470 static int admin_io_okay(
1471 unsigned int port, unsigned int bytes,
1472 struct vcpu *v, struct cpu_user_regs *regs)
1474 /*
1475 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1476 * We never permit direct access to that register.
1477 */
1478 if ( (port == 0xcf8) && (bytes == 4) )
1479 return 0;
1481 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1484 static uint32_t guest_io_read(
1485 unsigned int port, unsigned int bytes,
1486 struct vcpu *v, struct cpu_user_regs *regs)
1488 extern uint32_t pci_conf_read(
1489 uint32_t cf8, uint8_t offset, uint8_t bytes);
1491 uint32_t data = 0;
1492 unsigned int shift = 0;
1494 if ( admin_io_okay(port, bytes, v, regs) )
1496 switch ( bytes )
1498 case 1: return inb(port);
1499 case 2: return inw(port);
1500 case 4: return inl(port);
1504 while ( bytes != 0 )
1506 unsigned int size = 1;
1507 uint32_t sub_data = 0xff;
1509 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1511 sub_data = pv_pit_handler(port, 0, 0);
1513 else if ( (port == 0xcf8) && (bytes == 4) )
1515 size = 4;
1516 sub_data = v->domain->arch.pci_cf8;
1518 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1520 size = min(bytes, 4 - (port & 3));
1521 if ( size == 3 )
1522 size = 2;
1523 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1526 if ( size == 4 )
1527 return sub_data;
1529 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1530 shift += size * 8;
1531 port += size;
1532 bytes -= size;
1535 return data;
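/*
 * Illustration (not part of this file): the 0xcf8/0xcfc protocol that the
 * read helper above and the write helper below virtualise for privileged
 * guests.  From the guest's side a 32-bit config read looks like this
 * (helper name hypothetical):
 */
static inline uint32_t example_pci_conf_read32(unsigned int bus,
                                               unsigned int dev,
                                               unsigned int func,
                                               unsigned int reg)
{
    uint32_t cf8 = 0x80000000u | (bus << 16) | (dev << 11) |
                   (func << 8) | (reg & 0xfc);

    outl(cf8, 0xcf8);  /* latched into arch.pci_cf8 by guest_io_write() */
    return inl(0xcfc); /* satisfied via pci_conf_read() in guest_io_read() */
}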
1538 static void guest_io_write(
1539 unsigned int port, unsigned int bytes, uint32_t data,
1540 struct vcpu *v, struct cpu_user_regs *regs)
1542 extern void pci_conf_write(
1543 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1545 if ( admin_io_okay(port, bytes, v, regs) )
1547 switch ( bytes ) {
1548 case 1:
1549 outb((uint8_t)data, port);
1550 if ( pv_post_outb_hook )
1551 pv_post_outb_hook(port, (uint8_t)data);
1552 break;
1553 case 2:
1554 outw((uint16_t)data, port);
1555 break;
1556 case 4:
1557 outl(data, port);
1558 break;
1560 return;
1563 while ( bytes != 0 )
1565 unsigned int size = 1;
1567 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1569 pv_pit_handler(port, (uint8_t)data, 1);
1571 else if ( (port == 0xcf8) && (bytes == 4) )
1573 size = 4;
1574 v->domain->arch.pci_cf8 = data;
1576 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1578 size = min(bytes, 4 - (port & 3));
1579 if ( size == 3 )
1580 size = 2;
1581 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1584 if ( size == 4 )
1585 return;
1587 port += size;
1588 bytes -= size;
1589 data >>= size * 8;
1593 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1594 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1595 __attribute__((__regparm__(1)));
1596 unsigned long guest_to_host_gpr_switch(unsigned long)
1597 __attribute__((__regparm__(1)));
1599 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1601 /* Instruction fetch with error handling. */
1602 #define insn_fetch(type, base, eip, limit) \
1603 ({ unsigned long _rc, _ptr = (base) + (eip); \
1604 type _x; \
1605 if ( ad_default < 8 ) \
1606 _ptr = (unsigned int)_ptr; \
1607 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1608 goto fail; \
1609 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1610 { \
1611 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1612 goto skip; \
1613 } \
1614 (eip) += sizeof(_x); _x; })
1616 #if defined(CONFIG_X86_32)
1617 # define read_sreg(regs, sr) ((regs)->sr)
1618 #elif defined(CONFIG_X86_64)
1619 # define read_sreg(regs, sr) read_segment_register(sr)
1620 #endif
1622 static int emulate_privileged_op(struct cpu_user_regs *regs)
1624 struct vcpu *v = current;
1625 unsigned long *reg, eip = regs->eip, res;
1626 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1627 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1628 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1629 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1630 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1631 ? regs->reg \
1632 : ad_bytes == 4 \
1633 ? (u32)regs->reg \
1634 : (u16)regs->reg)
1635 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1636 ? regs->reg = (val) \
1637 : ad_bytes == 4 \
1638 ? (*(u32 *)&regs->reg = (val)) \
1639 : (*(u16 *)&regs->reg = (val)))
1640 unsigned long code_base, code_limit;
1641 char io_emul_stub[32];
1642 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1643 u32 l, h, eax, edx;
1645 if ( !read_descriptor(regs->cs, v, regs,
1646 &code_base, &code_limit, &ar,
1647 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1648 goto fail;
1649 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1650 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1651 if ( !(ar & _SEGMENT_S) ||
1652 !(ar & _SEGMENT_P) ||
1653 !(ar & _SEGMENT_CODE) )
1654 goto fail;
1656 /* emulating only opcodes not allowing SS to be default */
1657 data_sel = read_sreg(regs, ds);
1659 /* Legacy prefixes. */
1660 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1662 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1664 case 0x66: /* operand-size override */
1665 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1666 continue;
1667 case 0x67: /* address-size override */
1668 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1669 continue;
1670 case 0x2e: /* CS override */
1671 data_sel = regs->cs;
1672 continue;
1673 case 0x3e: /* DS override */
1674 data_sel = read_sreg(regs, ds);
1675 continue;
1676 case 0x26: /* ES override */
1677 data_sel = read_sreg(regs, es);
1678 continue;
1679 case 0x64: /* FS override */
1680 data_sel = read_sreg(regs, fs);
1681 lm_ovr = lm_seg_fs;
1682 continue;
1683 case 0x65: /* GS override */
1684 data_sel = read_sreg(regs, gs);
1685 lm_ovr = lm_seg_gs;
1686 continue;
1687 case 0x36: /* SS override */
1688 data_sel = regs->ss;
1689 continue;
1690 case 0xf0: /* LOCK */
1691 lock = 1;
1692 continue;
1693 case 0xf2: /* REPNE/REPNZ */
1694 case 0xf3: /* REP/REPE/REPZ */
1695 rep_prefix = 1;
1696 continue;
1697 default:
1698 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1700 rex = opcode;
1701 continue;
1703 break;
1705 break;
1708 /* REX prefix. */
1709 if ( rex & 8 ) /* REX.W */
1710 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1711 modrm_reg = (rex & 4) << 1; /* REX.R */
1712 /* REX.X does not need to be decoded. */
1713 modrm_rm = (rex & 1) << 3; /* REX.B */
1715 if ( opcode == 0x0f )
1716 goto twobyte_opcode;
1718 if ( lock )
1719 goto fail;
1721 /* Input/Output String instructions. */
1722 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1724 unsigned long data_base, data_limit;
1726 if ( rep_prefix && (rd_ad(ecx) == 0) )
1727 goto done;
1729 if ( !(opcode & 2) )
1731 data_sel = read_sreg(regs, es);
1732 lm_ovr = lm_seg_none;
1735 if ( !(ar & _SEGMENT_L) )
1737 if ( !read_descriptor(data_sel, v, regs,
1738 &data_base, &data_limit, &ar,
1739 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1740 _SEGMENT_P) )
1741 goto fail;
1742 if ( !(ar & _SEGMENT_S) ||
1743 !(ar & _SEGMENT_P) ||
1744 (opcode & 2 ?
1745 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1746 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1747 goto fail;
1749 #ifdef CONFIG_X86_64
1750 else
1752 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1754 switch ( lm_ovr )
1756 case lm_seg_none:
1757 data_base = 0UL;
1758 break;
1759 case lm_seg_fs:
1760 data_base = v->arch.guest_context.fs_base;
1761 break;
1762 case lm_seg_gs:
1763 if ( guest_kernel_mode(v, regs) )
1764 data_base = v->arch.guest_context.gs_base_kernel;
1765 else
1766 data_base = v->arch.guest_context.gs_base_user;
1767 break;
1770 else
1771 read_descriptor(data_sel, v, regs,
1772 &data_base, &data_limit, &ar,
1773 0);
1774 data_limit = ~0UL;
1775 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1777 #endif
1779 port = (u16)regs->edx;
1781 continue_io_string:
1782 switch ( opcode )
1784 case 0x6c: /* INSB */
1785 op_bytes = 1;
1786 case 0x6d: /* INSW/INSL */
1787 if ( (data_limit < (op_bytes - 1)) ||
1788 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1789 !guest_io_okay(port, op_bytes, v, regs) )
1790 goto fail;
1791 data = guest_io_read(port, op_bytes, v, regs);
1792 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1793 &data, op_bytes)) != 0 )
1795 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1796 PFEC_write_access);
1797 return EXCRET_fault_fixed;
1799 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1800 ? -op_bytes : op_bytes));
1801 break;
1803 case 0x6e: /* OUTSB */
1804 op_bytes = 1;
1805 case 0x6f: /* OUTSW/OUTSL */
1806 if ( (data_limit < (op_bytes - 1)) ||
1807 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1808 !guest_io_okay(port, op_bytes, v, regs) )
1809 goto fail;
1810 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1811 op_bytes)) != 0 )
1813 propagate_page_fault(data_base + rd_ad(esi)
1814 + op_bytes - rc, 0);
1815 return EXCRET_fault_fixed;
1817 guest_io_write(port, op_bytes, data, v, regs);
1818 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1819 ? -op_bytes : op_bytes));
1820 break;
1823 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1825 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1827 if ( !bpmatch && !hypercall_preempt_check() )
1828 goto continue_io_string;
1829 eip = regs->eip;
1832 goto done;
1835 /*
1836 * Very likely to be an I/O instruction (IN/OUT).
1837 * Build an on-stack stub to execute the instruction with full guest
1838 * GPR context. This is needed for some systems which (ab)use IN/OUT
1839 * to communicate with BIOS code in system-management mode.
1840 */
1841 #ifdef __x86_64__
1842 /* movq $host_to_guest_gpr_switch,%rcx */
1843 io_emul_stub[0] = 0x48;
1844 io_emul_stub[1] = 0xb9;
1845 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1846 /* callq *%rcx */
1847 io_emul_stub[10] = 0xff;
1848 io_emul_stub[11] = 0xd1;
1849 #else
1850 /* call host_to_guest_gpr_switch */
1851 io_emul_stub[0] = 0xe8;
1852 *(s32 *)&io_emul_stub[1] =
1853 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1854 /* 7 x nop */
1855 memset(&io_emul_stub[5], 0x90, 7);
1856 #endif
1857 /* data16 or nop */
1858 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1859 /* <io-access opcode> */
1860 io_emul_stub[13] = opcode;
1861 /* imm8 or nop */
1862 io_emul_stub[14] = 0x90;
1863 /* ret (jumps to guest_to_host_gpr_switch) */
1864 io_emul_stub[15] = 0xc3;
1866 /* Handy function-typed pointer to the stub. */
1867 io_emul = (void *)io_emul_stub;
1869 if ( ioemul_handle_quirk )
1870 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
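/*
 * For reference, on x86-64 the 16-byte stub built above for "in $0x71,%al"
 * (opcode 0xe4, op_bytes == 1) ends up as:
 *
 *     48 b9 <8-byte address>   movq $host_to_guest_gpr_switch,%rcx
 *     ff d1                    callq *%rcx   (load the guest's GPRs)
 *     90                       nop           (would be 66 for a 16-bit access)
 *     e4 71                    in $0x71,%al  (imm8 at [14], patched below)
 *     c3                       ret           (via guest_to_host_gpr_switch)
 */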
1872 /* I/O Port and Interrupt Flag instructions. */
1873 switch ( opcode )
1875 case 0xe4: /* IN imm8,%al */
1876 op_bytes = 1;
1877 case 0xe5: /* IN imm8,%eax */
1878 port = insn_fetch(u8, code_base, eip, code_limit);
1879 io_emul_stub[14] = port; /* imm8 */
1880 exec_in:
1881 if ( !guest_io_okay(port, op_bytes, v, regs) )
1882 goto fail;
1883 if ( admin_io_okay(port, op_bytes, v, regs) )
1885 io_emul(regs);
1887 else
1889 if ( op_bytes == 4 )
1890 regs->eax = 0;
1891 else
1892 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1893 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1895 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1896 goto done;
1898 case 0xec: /* IN %dx,%al */
1899 op_bytes = 1;
1900 case 0xed: /* IN %dx,%eax */
1901 port = (u16)regs->edx;
1902 goto exec_in;
1904 case 0xe6: /* OUT %al,imm8 */
1905 op_bytes = 1;
1906 case 0xe7: /* OUT %eax,imm8 */
1907 port = insn_fetch(u8, code_base, eip, code_limit);
1908 io_emul_stub[14] = port; /* imm8 */
1909 exec_out:
1910 if ( !guest_io_okay(port, op_bytes, v, regs) )
1911 goto fail;
1912 if ( admin_io_okay(port, op_bytes, v, regs) )
1914 io_emul(regs);
1915 if ( (op_bytes == 1) && pv_post_outb_hook )
1916 pv_post_outb_hook(port, regs->eax);
1918 else
1920 guest_io_write(port, op_bytes, regs->eax, v, regs);
1922 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1923 goto done;
1925 case 0xee: /* OUT %al,%dx */
1926 op_bytes = 1;
1927 case 0xef: /* OUT %eax,%dx */
1928 port = (u16)regs->edx;
1929 goto exec_out;
1931 case 0xfa: /* CLI */
1932 case 0xfb: /* STI */
1933 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1934 goto fail;
1935 /*
1936 * This is just too dangerous to allow, in my opinion. Consider if the
1937 * caller then tries to reenable interrupts using POPF: we can't trap
1938 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1939 * do for us. :-)
1940 */
1941 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1942 goto done;
1945 /* No decode of this single-byte opcode. */
1946 goto fail;
1948 twobyte_opcode:
1949 /* Two-byte opcodes only emulated from guest kernel. */
1950 if ( !guest_kernel_mode(v, regs) )
1951 goto fail;
1953 /* Privileged (ring 0) instructions. */
1954 opcode = insn_fetch(u8, code_base, eip, code_limit);
1955 if ( lock && (opcode & ~3) != 0x20 )
1956 goto fail;
1957 switch ( opcode )
1959 case 0x06: /* CLTS */
1960 (void)do_fpu_taskswitch(0);
1961 break;
1963 case 0x09: /* WBINVD */
1964 /* Ignore the instruction if unprivileged. */
1965 if ( !cache_flush_permitted(v->domain) )
1966 /* Non-physdev domain attempted WBINVD; ignore for now since
1967 newer linux uses this in some start-of-day timing loops */
1969 else
1970 wbinvd();
1971 break;
1973 case 0x20: /* MOV CR?,<reg> */
1974 opcode = insn_fetch(u8, code_base, eip, code_limit);
1975 if ( opcode < 0xc0 )
1976 goto fail;
1977 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1978 modrm_rm |= (opcode >> 0) & 7;
1979 reg = decode_register(modrm_rm, regs, 0);
1980 switch ( modrm_reg )
1982 case 0: /* Read CR0 */
1983 *reg = (read_cr0() & ~X86_CR0_TS) |
1984 v->arch.guest_context.ctrlreg[0];
1985 break;
1987 case 2: /* Read CR2 */
1988 *reg = v->arch.guest_context.ctrlreg[2];
1989 break;
1991 case 3: /* Read CR3 */
1992 if ( !is_pv_32on64_vcpu(v) )
1993 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1994 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1995 #ifdef CONFIG_COMPAT
1996 else
1997 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1998 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1999 #endif
2000 break;
2002 case 4: /* Read CR4 */
2003 /*
2004 * Guests can read CR4 to see what features Xen has enabled. We
2005 * therefore lie about PGE & PSE as they are unavailable to guests.
2006 */
2007 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
2008 break;
2010 default:
2011 goto fail;
2013 break;
2015 case 0x21: /* MOV DR?,<reg> */
2016 opcode = insn_fetch(u8, code_base, eip, code_limit);
2017 if ( opcode < 0xc0 )
2018 goto fail;
2019 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2020 modrm_rm |= (opcode >> 0) & 7;
2021 reg = decode_register(modrm_rm, regs, 0);
2022 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2023 goto fail;
2024 *reg = res;
2025 break;
2027 case 0x22: /* MOV <reg>,CR? */
2028 opcode = insn_fetch(u8, code_base, eip, code_limit);
2029 if ( opcode < 0xc0 )
2030 goto fail;
2031 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2032 modrm_rm |= (opcode >> 0) & 7;
2033 reg = decode_register(modrm_rm, regs, 0);
2034 switch ( modrm_reg )
2036 case 0: /* Write CR0 */
2037 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2039 gdprintk(XENLOG_WARNING,
2040 "Attempt to change unmodifiable CR0 flags.\n");
2041 goto fail;
2043 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2044 break;
2046 case 2: /* Write CR2 */
2047 v->arch.guest_context.ctrlreg[2] = *reg;
2048 arch_set_cr2(v, *reg);
2049 break;
2051 case 3: /* Write CR3 */
2052 domain_lock(v->domain);
2053 if ( !is_pv_32on64_vcpu(v) )
2054 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2055 #ifdef CONFIG_COMPAT
2056 else
2057 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2058 #endif
2059 domain_unlock(v->domain);
2060 if ( rc == 0 ) /* not okay */
2061 goto fail;
2062 break;
2064 case 4: /* Write CR4 */
2065 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2066 write_cr4(pv_guest_cr4_to_real_cr4(
2067 v->arch.guest_context.ctrlreg[4]));
2068 break;
2070 default:
2071 goto fail;
2073 break;
2075 case 0x23: /* MOV <reg>,DR? */
2076 opcode = insn_fetch(u8, code_base, eip, code_limit);
2077 if ( opcode < 0xc0 )
2078 goto fail;
2079 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2080 modrm_rm |= (opcode >> 0) & 7;
2081 reg = decode_register(modrm_rm, regs, 0);
2082 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2083 goto fail;
2084 break;
2086 case 0x30: /* WRMSR */
2087 eax = regs->eax;
2088 edx = regs->edx;
2089 res = ((u64)edx << 32) | eax;
2090 switch ( (u32)regs->ecx )
2092 #ifdef CONFIG_X86_64
2093 case MSR_FS_BASE:
2094 if ( is_pv_32on64_vcpu(v) )
2095 goto fail;
2096 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2097 goto fail;
2098 v->arch.guest_context.fs_base = res;
2099 break;
2100 case MSR_GS_BASE:
2101 if ( is_pv_32on64_vcpu(v) )
2102 goto fail;
2103 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2104 goto fail;
2105 v->arch.guest_context.gs_base_kernel = res;
2106 break;
2107 case MSR_SHADOW_GS_BASE:
2108 if ( is_pv_32on64_vcpu(v) )
2109 goto fail;
2110 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2111 goto fail;
2112 v->arch.guest_context.gs_base_user = res;
2113 break;
2114 #endif
2115 case MSR_K7_FID_VID_STATUS:
2116 case MSR_K7_FID_VID_CTL:
2117 case MSR_K8_PSTATE_LIMIT:
2118 case MSR_K8_PSTATE_CTRL:
2119 case MSR_K8_PSTATE_STATUS:
2120 case MSR_K8_PSTATE0:
2121 case MSR_K8_PSTATE1:
2122 case MSR_K8_PSTATE2:
2123 case MSR_K8_PSTATE3:
2124 case MSR_K8_PSTATE4:
2125 case MSR_K8_PSTATE5:
2126 case MSR_K8_PSTATE6:
2127 case MSR_K8_PSTATE7:
2128 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2129 goto fail;
2130 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2131 break;
2132 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2133 goto fail;
2134 break;
2135 case MSR_AMD64_NB_CFG:
2136 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2137 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2138 goto fail;
2139 if ( !IS_PRIV(v->domain) )
2140 break;
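/*
 * Dom0 may toggle only the CF8 extended-config-space enable bit of
 * NB_CFG; every other bit must match the value currently in the MSR,
 * otherwise the write is reported as invalid.
 */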
2141 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2142 (eax != l) ||
2143 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2144 goto invalid;
2145 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2146 goto fail;
2147 break;
2148 case MSR_FAM10H_MMIO_CONF_BASE:
2149 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2150 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2151 goto fail;
2152 if ( !IS_PRIV(v->domain) )
2153 break;
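/*
 * Dom0 may change only the enable bit, the bus-range field and the MMIO
 * base address of MMIO_CONF_BASE; any other difference from the current
 * hardware value is reported as invalid.
 */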
2154 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2155 (((((u64)h << 32) | l) ^ res) &
2156 ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
2157 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2158 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2159 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2160 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2161 goto invalid;
2162 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2163 goto fail;
2164 break;
2165 case MSR_IA32_PERF_CTL:
2166 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2167 goto fail;
2168 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2169 break;
2170 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2171 goto fail;
2172 break;
2173 case MSR_IA32_THERM_CONTROL:
2174 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2175 goto fail;
2176 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2177 goto fail;
2178 break;
2179 default:
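/*
 * Unknown MSRs: let Xen-defined hypervisor MSRs through, silently drop
 * writes that would not change the current value, and warn about (but
 * still discard) everything else.
 */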
2180 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2181 break;
2182 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2183 (eax != l) || (edx != h) )
2184 invalid:
2185 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2186 "%08x:%08x to %08x:%08x.\n",
2187 _p(regs->ecx), h, l, edx, eax);
2188 break;
2190 break;
2192 case 0x31: /* RDTSC */
2193 rdtsc(regs->eax, regs->edx);
2194 break;
2196 case 0x32: /* RDMSR */
2197 switch ( (u32)regs->ecx )
2199 #ifdef CONFIG_X86_64
2200 case MSR_FS_BASE:
2201 if ( is_pv_32on64_vcpu(v) )
2202 goto fail;
2203 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2204 regs->edx = v->arch.guest_context.fs_base >> 32;
2205 break;
2206 case MSR_GS_BASE:
2207 if ( is_pv_32on64_vcpu(v) )
2208 goto fail;
2209 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2210 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2211 break;
2212 case MSR_SHADOW_GS_BASE:
2213 if ( is_pv_32on64_vcpu(v) )
2214 goto fail;
2215 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2216 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2217 break;
2218 #endif
2219 case MSR_K7_FID_VID_CTL:
2220 case MSR_K7_FID_VID_STATUS:
2221 case MSR_K8_PSTATE_LIMIT:
2222 case MSR_K8_PSTATE_CTRL:
2223 case MSR_K8_PSTATE_STATUS:
2224 case MSR_K8_PSTATE0:
2225 case MSR_K8_PSTATE1:
2226 case MSR_K8_PSTATE2:
2227 case MSR_K8_PSTATE3:
2228 case MSR_K8_PSTATE4:
2229 case MSR_K8_PSTATE5:
2230 case MSR_K8_PSTATE6:
2231 case MSR_K8_PSTATE7:
2232 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2233 goto fail;
2234 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2236 regs->eax = regs->edx = 0;
2237 break;
2239 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2240 goto fail;
2241 break;
2242 case MSR_IA32_MISC_ENABLE:
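/*
 * Present a sanitised MISC_ENABLE to the guest: hide performance
 * monitoring and MONITOR/MWAIT, and report BTS/PEBS as unavailable and
 * xTPR messages as disabled.
 */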
2243 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2244 goto fail;
2245 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2246 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2247 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2248 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2249 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2250 break;
2251 case MSR_EFER:
2252 case MSR_IA32_THERM_CONTROL:
2253 case MSR_AMD_PATCHLEVEL:
2254 default:
2255 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2257 regs->eax = l;
2258 regs->edx = h;
2259 break;
2261 /* Everyone can read the MSR space. */
2262 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2263 _p(regs->ecx));*/
2264 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2265 goto fail;
2266 break;
2268 break;
2270 default:
2271 goto fail;
2274 #undef wr_ad
2275 #undef rd_ad
2277 done:
2278 instruction_done(regs, eip, bpmatch);
2279 skip:
2280 return EXCRET_fault_fixed;
2282 fail:
2283 return 0;
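/*
 * check_stack_limit(): true if 'decr' bytes can be pushed below 'esp'
 * without leaving the stack segment.  The first clause rejects ESP
 * wrap-around; the limit test is inverted for expand-down (_SEGMENT_EC)
 * segments, whose valid offsets lie above 'limit'.
 */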
2286 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2287 unsigned int esp, unsigned int decr)
2289 return (((esp - decr) < (esp - 1)) &&
2290 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2293 static void emulate_gate_op(struct cpu_user_regs *regs)
2295 #ifdef __x86_64__
2296 struct vcpu *v = current;
2297 unsigned int sel, ar, dpl, nparm, opnd_sel;
2298 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2299 unsigned long off, eip, opnd_off, base, limit;
2300 int jump;
2302 /* Check whether this fault is due to the use of a call gate. */
2303 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2304 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2305 ((ar & _SEGMENT_TYPE) != 0xc00) )
2307 do_guest_trap(TRAP_gp_fault, regs, 1);
2308 return;
2310 if ( !(ar & _SEGMENT_P) )
2312 do_guest_trap(TRAP_no_segment, regs, 1);
2313 return;
2315 dpl = (ar >> 13) & 3;
2316 nparm = ar & 0x1f;
2318 /*
2319 * Decode instruction (and perhaps operand) to determine RPL,
2320 * whether this is a jump or a call, and the call return offset.
2321 */
2322 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2323 !(ar & _SEGMENT_S) ||
2324 !(ar & _SEGMENT_P) ||
2325 !(ar & _SEGMENT_CODE) )
2327 do_guest_trap(TRAP_gp_fault, regs, 1);
2328 return;
2331 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2332 ad_default = ad_bytes = op_default;
2333 opnd_sel = opnd_off = 0;
2334 jump = -1;
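/*
 * Decode loop: 'jump' ends up 1 for a far JMP (0xea, or 0xff /5), 0 for
 * a far CALL (0x9a, or 0xff /3), and stays -1 if no such opcode is found
 * within the first 10 bytes, in which case the emulation fails.
 */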
2335 for ( eip = regs->eip; eip - regs->_eip < 10; )
2337 switch ( insn_fetch(u8, base, eip, limit) )
2339 case 0x66: /* operand-size override */
2340 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2341 continue;
2342 case 0x67: /* address-size override */
2343 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2344 continue;
2345 case 0x2e: /* CS override */
2346 opnd_sel = regs->cs;
2347 ASSERT(opnd_sel);
2348 continue;
2349 case 0x3e: /* DS override */
2350 opnd_sel = read_sreg(regs, ds);
2351 if ( !opnd_sel )
2352 opnd_sel = dpl;
2353 continue;
2354 case 0x26: /* ES override */
2355 opnd_sel = read_sreg(regs, es);
2356 if ( !opnd_sel )
2357 opnd_sel = dpl;
2358 continue;
2359 case 0x64: /* FS override */
2360 opnd_sel = read_sreg(regs, fs);
2361 if ( !opnd_sel )
2362 opnd_sel = dpl;
2363 continue;
2364 case 0x65: /* GS override */
2365 opnd_sel = read_sreg(regs, gs);
2366 if ( !opnd_sel )
2367 opnd_sel = dpl;
2368 continue;
2369 case 0x36: /* SS override */
2370 opnd_sel = regs->ss;
2371 if ( !opnd_sel )
2372 opnd_sel = dpl;
2373 continue;
2374 case 0xea:
2375 ++jump;
2376 /* FALLTHROUGH */
2377 case 0x9a:
2378 ++jump;
2379 opnd_sel = regs->cs;
2380 opnd_off = eip;
2381 ad_bytes = ad_default;
2382 eip += op_bytes + 2;
2383 break;
2384 case 0xff:
2386 unsigned int modrm;
2388 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2390 case 0x28: case 0x68: case 0xa8:
2391 ++jump;
2392 /* FALLTHROUGH */
2393 case 0x18: case 0x58: case 0x98:
2394 ++jump;
2395 if ( ad_bytes != 2 )
2397 if ( (modrm & 7) == 4 )
2399 unsigned int sib;
2400 sib = insn_fetch(u8, base, eip, limit);
2402 modrm = (modrm & ~7) | (sib & 7);
2403 if ( (sib >>= 3) != 4 )
2404 opnd_off = *(unsigned long *)
2405 decode_register(sib & 7, regs, 0);
2406 opnd_off <<= sib >> 3;
2408 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2409 opnd_off += *(unsigned long *)
2410 decode_register(modrm & 7, regs, 0);
2411 else
2412 modrm |= 0x87;
2413 if ( !opnd_sel )
2415 switch ( modrm & 7 )
2417 default:
2418 opnd_sel = read_sreg(regs, ds);
2419 break;
2420 case 4: case 5:
2421 opnd_sel = regs->ss;
2422 break;
2426 else
2428 switch ( modrm & 7 )
2430 case 0: case 1: case 7:
2431 opnd_off = regs->ebx;
2432 break;
2433 case 6:
2434 if ( !(modrm & 0xc0) )
2435 modrm |= 0x80;
2436 else
2437 case 2: case 3:
2439 opnd_off = regs->ebp;
2440 if ( !opnd_sel )
2441 opnd_sel = regs->ss;
2443 break;
2445 if ( !opnd_sel )
2446 opnd_sel = read_sreg(regs, ds);
2447 switch ( modrm & 7 )
2449 case 0: case 2: case 4:
2450 opnd_off += regs->esi;
2451 break;
2452 case 1: case 3: case 5:
2453 opnd_off += regs->edi;
2454 break;
2457 switch ( modrm & 0xc0 )
2459 case 0x40:
2460 opnd_off += insn_fetch(s8, base, eip, limit);
2461 break;
2462 case 0x80:
2463 opnd_off += insn_fetch(s32, base, eip, limit);
2464 break;
2466 if ( ad_bytes == 4 )
2467 opnd_off = (unsigned int)opnd_off;
2468 else if ( ad_bytes == 2 )
2469 opnd_off = (unsigned short)opnd_off;
2470 break;
2473 break;
2475 break;
2478 if ( jump < 0 )
2480 fail:
2481 do_guest_trap(TRAP_gp_fault, regs, 1);
2482 skip:
2483 return;
2486 if ( (opnd_sel != regs->cs &&
2487 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2488 !(ar & _SEGMENT_S) ||
2489 !(ar & _SEGMENT_P) ||
2490 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2492 do_guest_trap(TRAP_gp_fault, regs, 1);
2493 return;
2496 opnd_off += op_bytes;
2497 #define ad_default ad_bytes
2498 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2499 #undef ad_default
2500 ASSERT((opnd_sel & ~3) == regs->error_code);
2501 if ( dpl < (opnd_sel & 3) )
2503 do_guest_trap(TRAP_gp_fault, regs, 1);
2504 return;
2507 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2508 !(ar & _SEGMENT_S) ||
2509 !(ar & _SEGMENT_CODE) ||
2510 (!jump || (ar & _SEGMENT_EC) ?
2511 ((ar >> 13) & 3) > (regs->cs & 3) :
2512 ((ar >> 13) & 3) != (regs->cs & 3)) )
2514 regs->error_code = sel;
2515 do_guest_trap(TRAP_gp_fault, regs, 1);
2516 return;
2518 if ( !(ar & _SEGMENT_P) )
2520 regs->error_code = sel;
2521 do_guest_trap(TRAP_no_segment, regs, 1);
2522 return;
2524 if ( off > limit )
2526 regs->error_code = 0;
2527 do_guest_trap(TRAP_gp_fault, regs, 1);
2528 return;
2531 if ( !jump )
2533 unsigned int ss, esp, *stkp;
2534 int rc;
2535 #define push(item) do \
2536 { \
2537 --stkp; \
2538 esp -= 4; \
2539 rc = __put_user(item, stkp); \
2540 if ( rc ) \
2541 { \
2542 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2543 PFEC_write_access); \
2544 return; \
2545 } \
2546 } while ( 0 )
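/*
 * Far call through the gate: build the frame on the target stack
 * (pushing the old SS:ESP and copying any call-gate parameters when
 * switching to an inner ring), then push the return CS:EIP.
 */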
2548 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2550 sel |= (ar >> 13) & 3;
2551 /* Inner stack known only for kernel ring. */
2552 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2554 do_guest_trap(TRAP_gp_fault, regs, 1);
2555 return;
2557 esp = v->arch.guest_context.kernel_sp;
2558 ss = v->arch.guest_context.kernel_ss;
2559 if ( (ss & 3) != (sel & 3) ||
2560 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2561 ((ar >> 13) & 3) != (sel & 3) ||
2562 !(ar & _SEGMENT_S) ||
2563 (ar & _SEGMENT_CODE) ||
2564 !(ar & _SEGMENT_WR) )
2566 regs->error_code = ss & ~3;
2567 do_guest_trap(TRAP_invalid_tss, regs, 1);
2568 return;
2570 if ( !(ar & _SEGMENT_P) ||
2571 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2573 regs->error_code = ss & ~3;
2574 do_guest_trap(TRAP_stack_error, regs, 1);
2575 return;
2577 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2578 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2580 do_guest_trap(TRAP_gp_fault, regs, 1);
2581 return;
2583 push(regs->ss);
2584 push(regs->esp);
2585 if ( nparm )
2587 const unsigned int *ustkp;
2589 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2590 ((ar >> 13) & 3) != (regs->cs & 3) ||
2591 !(ar & _SEGMENT_S) ||
2592 (ar & _SEGMENT_CODE) ||
2593 !(ar & _SEGMENT_WR) ||
2594 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2595 return do_guest_trap(TRAP_gp_fault, regs, 1);
2596 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2597 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2599 do_guest_trap(TRAP_gp_fault, regs, 1);
2600 return;
2602 do
2604 unsigned int parm;
2606 --ustkp;
2607 rc = __get_user(parm, ustkp);
2608 if ( rc )
2610 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2611 return;
2613 push(parm);
2614 } while ( --nparm );
2617 else
2619 sel |= (regs->cs & 3);
2620 esp = regs->esp;
2621 ss = regs->ss;
2622 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2623 ((ar >> 13) & 3) != (sel & 3) )
2625 do_guest_trap(TRAP_gp_fault, regs, 1);
2626 return;
2628 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2630 regs->error_code = 0;
2631 do_guest_trap(TRAP_stack_error, regs, 1);
2632 return;
2634 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2635 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2637 do_guest_trap(TRAP_gp_fault, regs, 1);
2638 return;
2641 push(regs->cs);
2642 push(eip);
2643 #undef push
2644 regs->esp = esp;
2645 regs->ss = ss;
2647 else
2648 sel |= (regs->cs & 3);
2650 regs->cs = sel;
2651 instruction_done(regs, off, 0);
2652 #endif
2655 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2657 struct vcpu *v = current;
2658 unsigned long fixup;
2660 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2662 if ( regs->error_code & 1 )
2663 goto hardware_gp;
2665 if ( !guest_mode(regs) )
2666 goto gp_in_kernel;
2668 /*
2669 * Cunning trick to allow arbitrary "INT n" handling.
2671 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2672 * instruction from trapping to the appropriate vector, when that might not
2673 * be expected by Xen or the guest OS. For example, that entry might be for
2674 * a fault handler (unlike traps, faults don't increment EIP), or might
2675 * expect an error code on the stack (which a software trap never
2676 * provides), or might be a hardware interrupt handler that doesn't like
2677 * being called spuriously.
2679 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2680 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2681 * clear to indicate that it's a software fault, not hardware.
2683 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2684 * okay because they can only be triggered by an explicit DPL-checked
2685 * instruction. The DPL specified by the guest OS for these vectors is NOT
2686 * CHECKED!!
2687 */
2688 if ( (regs->error_code & 3) == 2 )
2690 /* This fault must be due to <INT n> instruction. */
2691 const struct trap_info *ti;
2692 unsigned char vector = regs->error_code >> 3;
2693 ti = &v->arch.guest_context.trap_ctxt[vector];
2694 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2696 regs->eip += 2;
2697 do_guest_trap(vector, regs, 0);
2698 return;
2701 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2703 emulate_gate_op(regs);
2704 return;
2707 /* Emulate some simple privileged and I/O instructions. */
2708 if ( (regs->error_code == 0) &&
2709 emulate_privileged_op(regs) )
2711 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2712 return;
2715 #if defined(__i386__)
2716 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2717 (regs->error_code == 0) &&
2718 gpf_emulate_4gb(regs) )
2720 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2721 return;
2723 #endif
2725 /* Pass on GPF as is. */
2726 do_guest_trap(TRAP_gp_fault, regs, 1);
2727 return;
2729 gp_in_kernel:
2731 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2733 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2734 regs->error_code, _p(regs->eip), _p(fixup));
2735 regs->eip = fixup;
2736 return;
2739 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2741 hardware_gp:
2742 show_execution_state(regs);
2743 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2746 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2748 static void nmi_mce_softirq(void)
2750 int cpu = smp_processor_id();
2751 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2752 cpumask_t affinity;
2754 BUG_ON(st == NULL);
2755 BUG_ON(st->vcpu == NULL);
2757 /* Record the vcpu's current affinity in cpu_affinity_tmp unconditionally,
2758 * so that the affinity-restore check in the iret hypercall always works. */
2759 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2761 if ((cpu != st->processor)
2762 || (st->processor != st->vcpu->processor))
2764 /* We are on a different physical cpu.
2765 * Make sure to wakeup the vcpu on the
2766 * specified processor.
2767 */
2768 cpus_clear(affinity);
2769 cpu_set(st->processor, affinity);
2770 vcpu_set_affinity(st->vcpu, &affinity);
2772 /* Affinity is restored in the iret hypercall. */
2775 /* Only used to defer wakeup of domain/vcpu to
2776 * a safe (non-NMI/MCE) context.
2777 */
2778 vcpu_kick(st->vcpu);
2781 static void nmi_dom0_report(unsigned int reason_idx)
2783 struct domain *d = dom0;
2785 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2786 return;
2788 set_bit(reason_idx, nmi_reason(d));
2790 send_guest_trap(d, 0, TRAP_nmi);
2793 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2795 switch ( opt_nmi[0] )
2797 case 'd': /* 'dom0' */
2798 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2799 case 'i': /* 'ignore' */
2800 break;
2801 default: /* 'fatal' */
2802 console_force_unlock();
2803 printk("\n\nNMI - MEMORY ERROR\n");
2804 fatal_trap(TRAP_nmi, regs);
2807 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2808 mdelay(1);
2809 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2812 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2814 switch ( opt_nmi[0] )
2816 case 'd': /* 'dom0' */
2817 nmi_dom0_report(_XEN_NMIREASON_io_error);
2818 case 'i': /* 'ignore' */
2819 break;
2820 default: /* 'fatal' */
2821 console_force_unlock();
2822 printk("\n\nNMI - I/O ERROR\n");
2823 fatal_trap(TRAP_nmi, regs);
2826 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2827 mdelay(1);
2828 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2831 static void unknown_nmi_error(unsigned char reason)
2833 switch ( opt_nmi[0] )
2835 case 'd': /* 'dom0' */
2836 nmi_dom0_report(_XEN_NMIREASON_unknown);
2837 case 'i': /* 'ignore' */
2838 break;
2839 default: /* 'fatal' */
2840 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2841 printk("Dazed and confused, but trying to continue\n");
2842 printk("Do you have a strange power saving mode enabled?\n");
2843 kexec_crash();
2847 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2849 return 0;
2852 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2854 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2856 unsigned int cpu = smp_processor_id();
2857 unsigned char reason;
2859 ++nmi_count(cpu);
2861 if ( nmi_callback(regs, cpu) )
2862 return;
2864 if ( nmi_watchdog )
2865 nmi_watchdog_tick(regs);
2867 /* Only the BSP gets external NMIs from the system. */
2868 if ( cpu == 0 )
2870 reason = inb(0x61);
2871 if ( reason & 0x80 )
2872 mem_parity_error(regs);
2873 else if ( reason & 0x40 )
2874 io_check_error(regs);
2875 else if ( !nmi_watchdog )
2876 unknown_nmi_error((unsigned char)(reason&0xff));
2880 void set_nmi_callback(nmi_callback_t callback)
2882 nmi_callback = callback;
2885 void unset_nmi_callback(void)
2887 nmi_callback = dummy_nmi_callback;
2890 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2892 struct vcpu *curr = current;
2894 BUG_ON(!guest_mode(regs));
2896 setup_fpu(curr);
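/*
 * setup_fpu() restores the FPU state and clears the real CR0.TS.  If the
 * guest's virtual TS bit is set, #NM is forwarded to the guest below and
 * the virtual bit is cleared; otherwise this was simply a lazy FPU state
 * restore on the guest's behalf.
 */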
2898 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2900 do_guest_trap(TRAP_no_device, regs, 0);
2901 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2903 else
2904 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2906 return;
2909 asmlinkage void do_debug(struct cpu_user_regs *regs)
2911 struct vcpu *v = current;
2913 DEBUGGER_trap_entry(TRAP_debug, regs);
2915 if ( !guest_mode(regs) )
2917 if ( regs->eflags & EF_TF )
2919 #ifdef __x86_64__
2920 void sysenter_entry(void);
2921 void sysenter_eflags_saved(void);
2922 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2923 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2924 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2925 goto out;
2926 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2927 #else
2928 WARN_ON(1);
2929 #endif
2930 regs->eflags &= ~EF_TF;
2932 else
2934 /*
2935 * We ignore watchpoints when they trigger within Xen. This may
2936 * happen when a buffer is passed to us which previously had a
2937 * watchpoint set on it. No need to bump EIP; the only faulting
2938 * trap is an instruction breakpoint, which can't happen to us.
2939 */
2940 WARN_ON(!search_exception_table(regs->eip));
2942 goto out;
2946 /* Save the debug status register (DR6) where the guest OS can peek at it. */
2946 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2948 ler_enable();
2949 do_guest_trap(TRAP_debug, regs, 0);
2950 return;
2952 out:
2953 ler_enable();
2954 return;
2957 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2961 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2963 int i;
2964 /* Keep secondary tables in sync with IRQ updates. */
2965 for ( i = 1; i < NR_CPUS; i++ )
2966 if ( idt_tables[i] != NULL )
2967 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2968 _set_gate(&idt_table[n], 14, dpl, addr);
2971 static void set_swint_gate(unsigned int n, void *addr)
2973 __set_intr_gate(n, 3, addr);
2976 void set_intr_gate(unsigned int n, void *addr)
2978 __set_intr_gate(n, 0, addr);
2981 void set_tss_desc(unsigned int n, void *addr)
2983 _set_tssldt_desc(
2984 per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
2985 (unsigned long)addr,
2986 offsetof(struct tss_struct, __cacheline_filler) - 1,
2987 9);
2988 #ifdef CONFIG_COMPAT
2989 _set_tssldt_desc(
2990 per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
2991 (unsigned long)addr,
2992 offsetof(struct tss_struct, __cacheline_filler) - 1,
2993 11);
2994 #endif
2997 void __devinit percpu_traps_init(void)
2999 subarch_percpu_traps_init();
3001 if ( !opt_ler )
3002 return;
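/*
 * 'ler' was requested on the command line: select which MSR holds the
 * last-exception-record "from" address on this CPU family, then turn
 * the feature on via ler_enable() below.
 */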
3004 switch ( boot_cpu_data.x86_vendor )
3006 case X86_VENDOR_INTEL:
3007 switch ( boot_cpu_data.x86 )
3009 case 6:
3010 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3011 break;
3012 case 15:
3013 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3014 break;
3016 break;
3017 case X86_VENDOR_AMD:
3018 switch ( boot_cpu_data.x86 )
3020 case 6:
3021 case 15:
3022 case 16:
3023 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3024 break;
3026 break;
3029 ler_enable();
3032 void __init trap_init(void)
3034 /*
3035 * Note that interrupt gates are always used, rather than trap gates. We
3036 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3037 * first activation must have the "bad" value(s) for these registers and
3038 * we may lose them if another activation is installed before they are
3039 * saved. The page-fault handler also needs interrupts disabled until %cr2
3040 * has been read and saved on the stack.
3041 */
3042 set_intr_gate(TRAP_divide_error,&divide_error);
3043 set_intr_gate(TRAP_debug,&debug);
3044 set_intr_gate(TRAP_nmi,&nmi);
3045 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3046 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3047 set_intr_gate(TRAP_bounds,&bounds);
3048 set_intr_gate(TRAP_invalid_op,&invalid_op);
3049 set_intr_gate(TRAP_no_device,&device_not_available);
3050 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3051 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3052 set_intr_gate(TRAP_no_segment,&segment_not_present);
3053 set_intr_gate(TRAP_stack_error,&stack_segment);
3054 set_intr_gate(TRAP_gp_fault,&general_protection);
3055 set_intr_gate(TRAP_page_fault,&page_fault);
3056 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3057 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3058 set_intr_gate(TRAP_alignment_check,&alignment_check);
3059 set_intr_gate(TRAP_machine_check,&machine_check);
3060 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3062 /* CPU0 uses the master IDT. */
3063 idt_tables[0] = idt_table;
3065 percpu_traps_init();
3067 cpu_init();
3069 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3072 long register_guest_nmi_callback(unsigned long address)
3074 struct vcpu *v = current;
3075 struct domain *d = v->domain;
3076 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3078 t->vector = TRAP_nmi;
3079 t->flags = 0;
3080 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
3081 t->address = address;
3082 TI_SET_IF(t, 1);
3084 /*
3085 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3086 * now.
3087 */
3088 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3089 v->nmi_pending = 1;
3091 return 0;
3094 long unregister_guest_nmi_callback(void)
3096 struct vcpu *v = current;
3097 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3099 memset(t, 0, sizeof(*t));
3101 return 0;
3104 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3106 struct vcpu *v;
3107 struct trap_info *t;
3109 BUG_ON(d == NULL);
3110 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3112 /* Sanity check - XXX should be more fine grained. */
3113 BUG_ON(trap_nr > TRAP_syscall);
3115 v = d->vcpu[vcpuid];
3116 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3118 return (t->address != 0);
3122 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3124 struct vcpu *v;
3125 struct softirq_trap *st;
3127 BUG_ON(d == NULL);
3128 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3129 v = d->vcpu[vcpuid];
3131 switch (trap_nr) {
3132 case TRAP_nmi:
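/*
 * In this path NMIs are only ever routed to dom0's vcpu0, hence the
 * hard-coded target below; the actual wakeup is deferred to
 * NMI_MCE_SOFTIRQ because it is not safe from NMI context.
 */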
3133 if ( !test_and_set_bool(v->nmi_pending) ) {
3134 st = &per_cpu(softirq_trap, smp_processor_id());
3135 st->domain = dom0;
3136 st->vcpu = dom0->vcpu[0];
3137 st->processor = st->vcpu->processor;
3139 /* not safe to wake up a vcpu here */
3140 raise_softirq(NMI_MCE_SOFTIRQ);
3141 return 0;
3143 break;
3145 case TRAP_machine_check:
3147 /* We are called by the machine check (exception or polling) handlers
3148 * on the physical CPU that reported a machine check error. */
3150 if ( !test_and_set_bool(v->mce_pending) ) {
3151 st = &per_cpu(softirq_trap, smp_processor_id());
3152 st->domain = d;
3153 st->vcpu = v;
3154 st->processor = v->processor;
3156 /* not safe to wake up a vcpu here */
3157 raise_softirq(NMI_MCE_SOFTIRQ);
3158 return 0;
3160 break;
3163 /* delivery failed */
3164 return -EIO;
3168 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3170 struct trap_info cur;
3171 struct vcpu *curr = current;
3172 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3173 long rc = 0;
3175 /* If no table is presented then clear the entire virtual IDT. */
3176 if ( guest_handle_is_null(traps) )
3178 memset(dst, 0, 256 * sizeof(*dst));
3179 init_int80_direct_trap(curr);
3180 return 0;
3183 for ( ; ; )
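/*
 * Copy the guest's trap table one entry at a time; if the hypercall
 * runs long, create a continuation so it can be restarted later at the
 * current offset into the handle.
 */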
3185 if ( hypercall_preempt_check() )
3187 rc = hypercall_create_continuation(
3188 __HYPERVISOR_set_trap_table, "h", traps);
3189 break;
3192 if ( copy_from_guest(&cur, traps, 1) )
3194 rc = -EFAULT;
3195 break;
3198 if ( cur.address == 0 )
3199 break;
3201 fixup_guest_code_selector(curr->domain, cur.cs);
3203 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3205 if ( cur.vector == 0x80 )
3206 init_int80_direct_trap(curr);
3208 guest_handle_add_offset(traps, 1);
3211 return rc;
3214 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3216 int i;
3217 struct vcpu *curr = current;
3219 switch ( reg )
3221 case 0:
3222 if ( !access_ok(value, sizeof(long)) )
3223 return -EPERM;
3224 if ( v == curr )
3225 write_debugreg(0, value);
3226 break;
3227 case 1:
3228 if ( !access_ok(value, sizeof(long)) )
3229 return -EPERM;
3230 if ( v == curr )
3231 write_debugreg(1, value);
3232 break;
3233 case 2:
3234 if ( !access_ok(value, sizeof(long)) )
3235 return -EPERM;
3236 if ( v == curr )
3237 write_debugreg(2, value);
3238 break;
3239 case 3:
3240 if ( !access_ok(value, sizeof(long)) )
3241 return -EPERM;
3242 if ( v == curr )
3243 write_debugreg(3, value);
3244 break;
3245 case 6:
3246 /*
3247 * DR6: Bits 4-11,16-31 reserved (set to 1).
3248 * Bit 12 reserved (set to 0).
3249 */
3250 value &= 0xffffefff; /* reserved bits => 0 */
3251 value |= 0xffff0ff0; /* reserved bits => 1 */
3252 if ( v == curr )
3253 write_debugreg(6, value);
3254 break;
3255 case 7:
3256 /*
3257 * DR7: Bit 10 reserved (set to 1).
3258 * Bits 11-12,14-15 reserved (set to 0).
3259 */
3260 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3261 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3262 /*
3263 * Privileged bits:
3264 * GD (bit 13): must be 0.
3265 */
3266 if ( value & DR_GENERAL_DETECT )
3267 return -EPERM;
3268 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3269 if ( value & DR7_ACTIVE_MASK )
3271 unsigned int io_enable = 0;
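/*
 * Scan the four breakpoint condition fields in DR7.  I/O breakpoints
 * (DR_IO) require CR4.DE; their enable bits are collected in io_enable,
 * stripped from the value written to the real DR7, and stashed in the
 * guest's virtual DR5 for the I/O intercept code.
 */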
3273 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3275 if ( ((value >> i) & 3) == DR_IO )
3277 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3278 return -EPERM;
3279 io_enable |= value & (3 << ((i - 16) >> 1));
3281 #ifdef __i386__
3282 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3283 !boot_cpu_has(X86_FEATURE_LM)) &&
3284 (((value >> i) & 0xc) == DR_LEN_8) )
3285 return -EPERM;
3286 #endif
3289 /* Guest DR5 is a handy stash for I/O intercept information. */
3290 v->arch.guest_context.debugreg[5] = io_enable;
3291 value &= ~io_enable;
3293 /*
3294 * If DR7 was previously clear then we need to load all other
3295 * debug registers at this point as they were not restored during
3296 * context switch.
3297 */
3298 if ( (v == curr) &&
3299 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3301 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3302 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3303 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3304 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3305 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3308 if ( v == curr )
3309 write_debugreg(7, value);
3310 break;
3311 default:
3312 return -EINVAL;
3315 v->arch.guest_context.debugreg[reg] = value;
3316 return 0;
3319 long do_set_debugreg(int reg, unsigned long value)
3321 return set_debugreg(current, reg, value);
3324 unsigned long do_get_debugreg(int reg)
3326 struct vcpu *curr = current;
3328 switch ( reg )
3330 case 0 ... 3:
3331 case 6:
3332 return curr->arch.guest_context.debugreg[reg];
3333 case 7:
3334 return (curr->arch.guest_context.debugreg[7] |
3335 curr->arch.guest_context.debugreg[5]);
3336 case 4 ... 5:
3337 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3338 curr->arch.guest_context.debugreg[reg + 2] : 0);
3341 return -EINVAL;
3344 /*
3345 * Local variables:
3346 * mode: C
3347 * c-set-style: "BSD"
3348 * c-basic-offset: 4
3349 * tab-width: 4
3350 * indent-tabs-mode: nil
3351 * End:
3352 */