ia64/xen-unstable

xen/arch/x86/traps.c @ 18481:a5cc38391afb

ACPI: Grant dom0 access to the MSR_IA32_THERM_CONTROL MSR

The purpose is to support dom0 throttling control via this MSR.

Signed-off-by: Wei Gang <gang.wei@intel.com>
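
The change itself is the new MSR_IA32_THERM_CONTROL case in the WRMSR leg of emulate_privileged_op() (source lines 2157-2162 in the listing below): the write is permitted only on Intel CPUs and is passed straight through with wrmsr_safe(), the emulation failing if the hardware write faults.

    case MSR_IA32_THERM_CONTROL:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
            goto fail;
        if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
            goto fail;
        break;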
author   Keir Fraser <keir.fraser@citrix.com>
date     Thu Sep 11 16:51:13 2008 +0100
parents  1e98ea5c8604
children 88445b184dc6
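
As a rough sketch of how the dom0 kernel might then throttle a CPU (not part of this changeset; it assumes Linux-style rdmsrl()/wrmsrl() helpers and the architectural layout of IA32_THERM_CONTROL, MSR 0x19a, where bits 3:1 hold the on-demand duty cycle and bit 4 enables modulation):

    /* Hypothetical dom0-side helper: program on-demand clock modulation.
     * duty_code is the 3-bit duty-cycle encoding (1..7); 0 is reserved. */
    #define MSR_IA32_THERM_CONTROL 0x0000019a

    static void set_cpu_throttle(unsigned int duty_code)
    {
        u64 val;

        rdmsrl(MSR_IA32_THERM_CONTROL, val);
        val &= ~0x1eULL;                     /* clear duty cycle (3:1) and enable (4) */
        val |= (u64)(duty_code & 7) << 1;    /* requested duty cycle */
        val |= 1ULL << 4;                    /* enable on-demand clock modulation */
        wrmsrl(MSR_IA32_THERM_CONTROL, val);
    }

Under Xen the WRMSR traps into the hypervisor, which is exactly the path the new MSR_IA32_THERM_CONTROL case lets through.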
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
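/* Example: booting Xen with "nmi=ignore" on the hypervisor command line
 * overrides the default chosen above. */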
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
259 #endif
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
489 /*
490 * Called from asm to set up the MCE trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_machinecheck_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
498 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
499 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
500 return !null_trap_bounce(v, tb);
501 }
503 /*
504 * Called from asm to set up the NMI trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_nmi_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
511 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
512 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
513 return !null_trap_bounce(v, tb);
514 }
516 static inline void do_trap(
517 int trapnr, struct cpu_user_regs *regs, int use_error_code)
518 {
519 struct vcpu *curr = current;
520 unsigned long fixup;
522 DEBUGGER_trap_entry(trapnr, regs);
524 if ( guest_mode(regs) )
525 {
526 do_guest_trap(trapnr, regs, use_error_code);
527 return;
528 }
530 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
531 {
532 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
533 trapnr, _p(regs->eip), _p(fixup));
534 regs->eip = fixup;
535 return;
536 }
538 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
539 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
540 {
541 curr->arch.hvm_vcpu.fpu_exception_callback(
542 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
543 return;
544 }
546 DEBUGGER_trap_fatal(trapnr, regs);
548 show_execution_state(regs);
549 panic("FATAL TRAP: vector = %d (%s)\n"
550 "[error_code=%04x]\n",
551 trapnr, trapstr(trapnr), regs->error_code);
552 }
554 #define DO_ERROR_NOCODE(trapnr, name) \
555 asmlinkage void do_##name(struct cpu_user_regs *regs) \
556 { \
557 do_trap(trapnr, regs, 0); \
558 }
560 #define DO_ERROR(trapnr, name) \
561 asmlinkage void do_##name(struct cpu_user_regs *regs) \
562 { \
563 do_trap(trapnr, regs, 1); \
564 }
566 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
567 DO_ERROR_NOCODE(TRAP_overflow, overflow)
568 DO_ERROR_NOCODE(TRAP_bounds, bounds)
569 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
570 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
571 DO_ERROR( TRAP_no_segment, segment_not_present)
572 DO_ERROR( TRAP_stack_error, stack_segment)
573 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
574 DO_ERROR( TRAP_alignment_check, alignment_check)
575 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
577 int rdmsr_hypervisor_regs(
578 uint32_t idx, uint32_t *eax, uint32_t *edx)
579 {
580 idx -= 0x40000000;
581 if ( idx > 0 )
582 return 0;
584 switch ( idx )
585 {
586 case 0:
587 {
588 *eax = *edx = 0;
589 break;
590 }
591 default:
592 BUG();
593 }
595 return 1;
596 }
598 int wrmsr_hypervisor_regs(
599 uint32_t idx, uint32_t eax, uint32_t edx)
600 {
601 struct domain *d = current->domain;
603 idx -= 0x40000000;
604 if ( idx > 0 )
605 return 0;
607 switch ( idx )
608 {
609 case 0:
610 {
611 void *hypercall_page;
612 unsigned long mfn;
613 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
614 unsigned int idx = eax & 0xfff;
616 if ( idx > 0 )
617 {
618 gdprintk(XENLOG_WARNING,
619 "Out of range index %u to MSR %08x\n",
620 idx, 0x40000000);
621 return 0;
622 }
624 mfn = gmfn_to_mfn(d, gmfn);
626 if ( !mfn_valid(mfn) ||
627 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
628 {
629 gdprintk(XENLOG_WARNING,
630 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
631 gmfn, mfn, 0x40000000);
632 return 0;
633 }
635 hypercall_page = map_domain_page(mfn);
636 hypercall_page_initialise(d, hypercall_page);
637 unmap_domain_page(hypercall_page);
639 put_page_and_type(mfn_to_page(mfn));
640 break;
641 }
643 default:
644 BUG();
645 }
647 return 1;
648 }
650 int cpuid_hypervisor_leaves(
651 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
652 {
653 idx -= 0x40000000;
654 if ( idx > 2 )
655 return 0;
657 switch ( idx )
658 {
659 case 0:
660 *eax = 0x40000002; /* Largest leaf */
661 *ebx = XEN_CPUID_SIGNATURE_EBX;
662 *ecx = XEN_CPUID_SIGNATURE_ECX;
663 *edx = XEN_CPUID_SIGNATURE_EDX;
664 break;
666 case 1:
667 *eax = (xen_major_version() << 16) | xen_minor_version();
668 *ebx = 0; /* Reserved */
669 *ecx = 0; /* Reserved */
670 *edx = 0; /* Reserved */
671 break;
673 case 2:
674 *eax = 1; /* Number of hypercall-transfer pages */
675 *ebx = 0x40000000; /* MSR base address */
676 *ecx = 0; /* Features 1 */
677 *edx = 0; /* Features 2 */
678 if ( !is_hvm_vcpu(current) )
679 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
680 break;
682 default:
683 BUG();
684 }
686 return 1;
687 }
689 static void pv_cpuid(struct cpu_user_regs *regs)
690 {
691 uint32_t a, b, c, d;
693 a = regs->eax;
694 b = regs->ebx;
695 c = regs->ecx;
696 d = regs->edx;
698 if ( current->domain->domain_id != 0 )
699 {
700 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
701 domain_cpuid(current->domain, a, b, &a, &b, &c, &d);
702 goto out;
703 }
705 asm (
706 "cpuid"
707 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
708 : "0" (a), "1" (b), "2" (c), "3" (d) );
710 if ( (regs->eax & 0x7fffffff) == 1 )
711 {
712 /* Modify Feature Information. */
713 __clear_bit(X86_FEATURE_VME, &d);
714 __clear_bit(X86_FEATURE_PSE, &d);
715 __clear_bit(X86_FEATURE_PGE, &d);
716 __clear_bit(X86_FEATURE_MCE, &d);
717 __clear_bit(X86_FEATURE_MCA, &d);
718 __clear_bit(X86_FEATURE_PSE36, &d);
719 }
720 switch ( (uint32_t)regs->eax )
721 {
722 case 1:
723 /* Modify Feature Information. */
724 if ( !cpu_has_sep )
725 __clear_bit(X86_FEATURE_SEP, &d);
726 #ifdef __i386__
727 if ( !supervisor_mode_kernel )
728 __clear_bit(X86_FEATURE_SEP, &d);
729 #endif
730 __clear_bit(X86_FEATURE_DS, &d);
731 __clear_bit(X86_FEATURE_ACC, &d);
732 __clear_bit(X86_FEATURE_PBE, &d);
734 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
735 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
736 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
737 __clear_bit(X86_FEATURE_VMXE % 32, &c);
738 __clear_bit(X86_FEATURE_SMXE % 32, &c);
739 __clear_bit(X86_FEATURE_TM2 % 32, &c);
740 if ( is_pv_32bit_vcpu(current) )
741 __clear_bit(X86_FEATURE_CX16 % 32, &c);
742 __clear_bit(X86_FEATURE_XTPR % 32, &c);
743 __clear_bit(X86_FEATURE_PDCM % 32, &c);
744 __clear_bit(X86_FEATURE_DCA % 32, &c);
745 break;
746 case 0x80000001:
747 /* Modify Feature Information. */
748 if ( is_pv_32bit_vcpu(current) )
749 {
750 __clear_bit(X86_FEATURE_LM % 32, &d);
751 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
752 }
753 #ifndef __i386__
754 if ( is_pv_32on64_vcpu(current) &&
755 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
756 #endif
757 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
758 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
759 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
761 __clear_bit(X86_FEATURE_SVME % 32, &c);
762 __clear_bit(X86_FEATURE_OSVW % 32, &c);
763 __clear_bit(X86_FEATURE_IBS % 32, &c);
764 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
765 __clear_bit(X86_FEATURE_WDT % 32, &c);
766 break;
767 case 5: /* MONITOR/MWAIT */
768 case 0xa: /* Architectural Performance Monitor Features */
769 case 0x8000000a: /* SVM revision and features */
770 case 0x8000001b: /* Instruction Based Sampling */
771 a = b = c = d = 0;
772 break;
773 default:
774 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
775 break;
776 }
778 out:
779 regs->eax = a;
780 regs->ebx = b;
781 regs->ecx = c;
782 regs->edx = d;
783 }
785 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
786 {
787 char sig[5], instr[2];
788 unsigned long eip, rc;
790 eip = regs->eip;
792 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
793 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
794 {
795 propagate_page_fault(eip + sizeof(sig) - rc, 0);
796 return EXCRET_fault_fixed;
797 }
798 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
799 return 0;
800 eip += sizeof(sig);
802 /* We only emulate CPUID. */
803 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
804 {
805 propagate_page_fault(eip + sizeof(instr) - rc, 0);
806 return EXCRET_fault_fixed;
807 }
808 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
809 return 0;
810 eip += sizeof(instr);
812 pv_cpuid(regs);
814 instruction_done(regs, eip, 0);
816 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
818 return EXCRET_fault_fixed;
819 }
821 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
822 {
823 struct bug_frame bug;
824 struct bug_frame_str bug_str;
825 char *filename, *predicate, *eip = (char *)regs->eip;
826 unsigned long fixup;
827 int id, lineno;
829 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
831 if ( likely(guest_mode(regs)) )
832 {
833 if ( !emulate_forced_invalid_op(regs) )
834 do_guest_trap(TRAP_invalid_op, regs, 0);
835 return;
836 }
838 if ( !is_kernel(eip) ||
839 __copy_from_user(&bug, eip, sizeof(bug)) ||
840 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
841 (bug.ret != 0xc2) )
842 goto die;
843 eip += sizeof(bug);
845 id = bug.id & 3;
847 if ( id == BUGFRAME_dump )
848 {
849 show_execution_state(regs);
850 regs->eip = (unsigned long)eip;
851 return;
852 }
854 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
855 if ( !is_kernel(eip) ||
856 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
857 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
858 goto die;
859 eip += sizeof(bug_str);
861 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
862 lineno = bug.id >> 2;
864 if ( id == BUGFRAME_warn )
865 {
866 printk("Xen WARN at %.50s:%d\n", filename, lineno);
867 show_execution_state(regs);
868 regs->eip = (unsigned long)eip;
869 return;
870 }
872 if ( id == BUGFRAME_bug )
873 {
874 printk("Xen BUG at %.50s:%d\n", filename, lineno);
875 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
876 show_execution_state(regs);
877 panic("Xen BUG at %.50s:%d\n", filename, lineno);
878 }
880 /* ASSERT: decode the predicate string pointer. */
881 ASSERT(id == BUGFRAME_assert);
882 if ( !is_kernel(eip) ||
883 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
884 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
885 goto die;
886 eip += sizeof(bug_str);
888 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
889 printk("Assertion '%s' failed at %.50s:%d\n",
890 predicate, filename, lineno);
891 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
892 show_execution_state(regs);
893 panic("Assertion '%s' failed at %.50s:%d\n",
894 predicate, filename, lineno);
896 die:
897 if ( (fixup = search_exception_table(regs->eip)) != 0 )
898 {
899 regs->eip = fixup;
900 return;
901 }
902 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
903 show_execution_state(regs);
904 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
905 }
907 asmlinkage void do_int3(struct cpu_user_regs *regs)
908 {
909 DEBUGGER_trap_entry(TRAP_int3, regs);
911 if ( !guest_mode(regs) )
912 {
913 debugger_trap_fatal(TRAP_int3, regs);
914 return;
915 }
917 do_guest_trap(TRAP_int3, regs, 0);
918 }
920 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
921 {
922 machine_check_vector(regs, regs->error_code);
923 }
925 static void reserved_bit_page_fault(
926 unsigned long addr, struct cpu_user_regs *regs)
927 {
928 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
929 current->domain->domain_id, current->vcpu_id, regs->error_code);
930 show_page_walk(addr);
931 show_execution_state(regs);
932 }
934 void propagate_page_fault(unsigned long addr, u16 error_code)
935 {
936 struct trap_info *ti;
937 struct vcpu *v = current;
938 struct trap_bounce *tb = &v->arch.trap_bounce;
940 v->arch.guest_context.ctrlreg[2] = addr;
941 arch_set_cr2(v, addr);
943 /* Re-set error_code.user flag appropriately for the guest. */
944 error_code &= ~PFEC_user_mode;
945 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
946 error_code |= PFEC_user_mode;
948 trace_pv_page_fault(addr, error_code);
950 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
951 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
952 tb->error_code = error_code;
953 tb->cs = ti->cs;
954 tb->eip = ti->address;
955 if ( TI_GET_IF(ti) )
956 tb->flags |= TBF_INTERRUPT;
957 if ( unlikely(null_trap_bounce(v, tb)) )
958 {
959 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
960 v->domain->domain_id, v->vcpu_id, error_code);
961 show_page_walk(addr);
962 }
964 if ( unlikely(error_code & PFEC_reserved_bit) )
965 reserved_bit_page_fault(addr, guest_cpu_user_regs());
966 }
968 static int handle_gdt_ldt_mapping_fault(
969 unsigned long offset, struct cpu_user_regs *regs)
970 {
971 struct vcpu *curr = current;
972 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
973 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
974 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
976 /* Should never fault in another vcpu's area. */
977 BUG_ON(vcpu_area != curr->vcpu_id);
979 /* Byte offset within the gdt/ldt sub-area. */
980 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
982 if ( likely(is_ldt_area) )
983 {
984 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
985 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
986 {
987 if ( guest_mode(regs) )
988 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
989 regs->eip, offset);
990 }
991 else
992 {
993 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
994 if ( !guest_mode(regs) )
995 return 0;
996 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
997 propagate_page_fault(
998 curr->arch.guest_context.ldt_base + offset,
999 regs->error_code);
1002 else
1004 /* GDT fault: handle the fault as #GP(selector). */
1005 regs->error_code = (u16)offset & ~7;
1006 (void)do_general_protection(regs);
1009 return EXCRET_fault_fixed;
1012 #ifdef HYPERVISOR_VIRT_END
1013 #define IN_HYPERVISOR_RANGE(va) \
1014 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1015 #else
1016 #define IN_HYPERVISOR_RANGE(va) \
1017 (((va) >= HYPERVISOR_VIRT_START))
1018 #endif
1020 static int __spurious_page_fault(
1021 unsigned long addr, struct cpu_user_regs *regs)
1023 unsigned long mfn, cr3 = read_cr3();
1024 #if CONFIG_PAGING_LEVELS >= 4
1025 l4_pgentry_t l4e, *l4t;
1026 #endif
1027 #if CONFIG_PAGING_LEVELS >= 3
1028 l3_pgentry_t l3e, *l3t;
1029 #endif
1030 l2_pgentry_t l2e, *l2t;
1031 l1_pgentry_t l1e, *l1t;
1032 unsigned int required_flags, disallowed_flags;
1034 /*
1035 * We do not take spurious page faults in IRQ handlers as we do not
1036 * modify page tables in IRQ context. We therefore bail here because
1037 * map_domain_page() is not IRQ-safe.
1038 */
1039 if ( in_irq() )
1040 return 0;
1042 /* Reserved bit violations are never spurious faults. */
1043 if ( regs->error_code & PFEC_reserved_bit )
1044 return 0;
1046 required_flags = _PAGE_PRESENT;
1047 if ( regs->error_code & PFEC_write_access )
1048 required_flags |= _PAGE_RW;
1049 if ( regs->error_code & PFEC_user_mode )
1050 required_flags |= _PAGE_USER;
1052 disallowed_flags = 0;
1053 if ( regs->error_code & PFEC_insn_fetch )
1054 disallowed_flags |= _PAGE_NX;
1056 mfn = cr3 >> PAGE_SHIFT;
1058 #if CONFIG_PAGING_LEVELS >= 4
1059 l4t = map_domain_page(mfn);
1060 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1061 mfn = l4e_get_pfn(l4e);
1062 unmap_domain_page(l4t);
1063 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1064 (l4e_get_flags(l4e) & disallowed_flags) )
1065 return 0;
1066 #endif
1068 #if CONFIG_PAGING_LEVELS >= 3
1069 l3t = map_domain_page(mfn);
1070 #if CONFIG_PAGING_LEVELS == 3
1071 l3t += (cr3 & 0xFE0UL) >> 3;
1072 #endif
1073 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1074 mfn = l3e_get_pfn(l3e);
1075 unmap_domain_page(l3t);
1076 #if CONFIG_PAGING_LEVELS == 3
1077 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1078 return 0;
1079 #else
1080 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1081 (l3e_get_flags(l3e) & disallowed_flags) )
1082 return 0;
1083 #endif
1084 #endif
1086 l2t = map_domain_page(mfn);
1087 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1088 mfn = l2e_get_pfn(l2e);
1089 unmap_domain_page(l2t);
1090 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1091 (l2e_get_flags(l2e) & disallowed_flags) )
1092 return 0;
1093 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1095 l1e = l1e_empty(); /* define before use in debug tracing */
1096 goto spurious;
1099 l1t = map_domain_page(mfn);
1100 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1101 mfn = l1e_get_pfn(l1e);
1102 unmap_domain_page(l1t);
1103 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1104 (l1e_get_flags(l1e) & disallowed_flags) )
1105 return 0;
1107 spurious:
1108 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1109 "at addr %lx, e/c %04x\n",
1110 current->domain->domain_id, current->vcpu_id,
1111 addr, regs->error_code);
1112 #if CONFIG_PAGING_LEVELS >= 4
1113 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1114 #endif
1115 #if CONFIG_PAGING_LEVELS >= 3
1116 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1117 #endif
1118 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1119 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1120 #ifndef NDEBUG
1121 show_registers(regs);
1122 #endif
1123 return 1;
1126 static int spurious_page_fault(
1127 unsigned long addr, struct cpu_user_regs *regs)
1129 unsigned long flags;
1130 int is_spurious;
1132 /*
1133 * Disabling interrupts prevents TLB flushing, and hence prevents
1134 * page tables from becoming invalid under our feet during the walk.
1135 */
1136 local_irq_save(flags);
1137 is_spurious = __spurious_page_fault(addr, regs);
1138 local_irq_restore(flags);
1140 return is_spurious;
1143 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1145 struct vcpu *v = current;
1146 struct domain *d = v->domain;
1148 /* No fixups in interrupt context or when interrupts are disabled. */
1149 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1150 return 0;
1152 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1154 if ( paging_mode_external(d) && guest_mode(regs) )
1156 int ret = paging_fault(addr, regs);
1157 if ( ret == EXCRET_fault_fixed )
1158 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1159 return ret;
1161 if ( !(regs->error_code & PFEC_reserved_bit) &&
1162 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1163 return handle_gdt_ldt_mapping_fault(
1164 addr - GDT_LDT_VIRT_START, regs);
1165 return 0;
1168 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1169 guest_kernel_mode(v, regs) &&
1170 /* Do not check if access-protection fault since the page may
1171 legitimately be not present in shadow page tables */
1172 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1173 PFEC_write_access) &&
1174 ptwr_do_page_fault(v, addr, regs) )
1175 return EXCRET_fault_fixed;
1177 if ( paging_mode_enabled(d) )
1179 int ret = paging_fault(addr, regs);
1180 if ( ret == EXCRET_fault_fixed )
1181 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1182 return ret;
1185 return 0;
1188 /*
1189 * #PF error code:
1190 * Bit 0: Protection violation (=1) ; Page not present (=0)
1191 * Bit 1: Write access
1192 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1193 * Bit 3: Reserved bit violation
1194 * Bit 4: Instruction fetch
1195 */
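/* Example: an error code of 0003 denotes a write (bit 1) to a present page
 * (bit 0 set, i.e. a protection violation) from supervisor mode (bit 2 clear). */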
1196 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1198 unsigned long addr, fixup;
1200 addr = read_cr2();
1202 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1204 perfc_incr(page_faults);
1206 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1207 return;
1209 if ( unlikely(!guest_mode(regs)) )
1211 if ( spurious_page_fault(addr, regs) )
1212 return;
1214 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1216 perfc_incr(copy_user_faults);
1217 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1218 reserved_bit_page_fault(addr, regs);
1219 regs->eip = fixup;
1220 return;
1223 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1225 show_execution_state(regs);
1226 show_page_walk(addr);
1227 panic("FATAL PAGE FAULT\n"
1228 "[error_code=%04x]\n"
1229 "Faulting linear address: %p\n",
1230 regs->error_code, _p(addr));
1233 propagate_page_fault(addr, regs->error_code);
1236 /*
1237 * Early #PF handler to print CR2, error code, and stack.
1239 * We also deal with spurious faults here, even though they should never happen
1240 * during early boot (an issue was seen once, but was most likely a hardware
1241 * problem).
1242 */
1243 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1245 static int stuck;
1246 static unsigned long prev_eip, prev_cr2;
1247 unsigned long cr2 = read_cr2();
1249 BUG_ON(smp_processor_id() != 0);
1251 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1253 prev_eip = regs->eip;
1254 prev_cr2 = cr2;
1255 stuck = 0;
1256 return;
1259 if ( stuck++ == 1000 )
1261 unsigned long *stk = (unsigned long *)regs;
1262 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1263 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1264 printk("Stack dump: ");
1265 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1266 printk("%p ", _p(*stk++));
1267 for ( ; ; ) ;
1271 long do_fpu_taskswitch(int set)
1273 struct vcpu *v = current;
1275 if ( set )
1277 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1278 stts();
1280 else
1282 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1283 if ( v->fpu_dirtied )
1284 clts();
1287 return 0;
1290 static int read_descriptor(unsigned int sel,
1291 const struct vcpu *v,
1292 const struct cpu_user_regs * regs,
1293 unsigned long *base,
1294 unsigned long *limit,
1295 unsigned int *ar,
1296 unsigned int vm86attr)
1298 struct desc_struct desc;
1300 if ( !vm86_mode(regs) )
1302 if ( sel < 4)
1303 desc.b = desc.a = 0;
1304 else if ( __get_user(desc,
1305 (const struct desc_struct *)(!(sel & 4)
1306 ? GDT_VIRT_START(v)
1307 : LDT_VIRT_START(v))
1308 + (sel >> 3)) )
1309 return 0;
1310 if ( !(vm86attr & _SEGMENT_CODE) )
1311 desc.b &= ~_SEGMENT_L;
1313 else
1315 desc.a = (sel << 20) | 0xffff;
1316 desc.b = vm86attr | (sel >> 12);
1319 *ar = desc.b & 0x00f0ff00;
1320 if ( !(desc.b & _SEGMENT_L) )
1322 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1323 (desc.b & 0xff000000));
1324 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1325 if ( desc.b & _SEGMENT_G )
1326 *limit = ((*limit + 1) << 12) - 1;
1327 #ifndef NDEBUG
1328 if ( !vm86_mode(regs) && (sel > 3) )
1330 unsigned int a, l;
1331 unsigned char valid;
1333 asm volatile (
1334 "larl %2,%0 ; setz %1"
1335 : "=r" (a), "=rm" (valid) : "rm" (sel));
1336 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1337 asm volatile (
1338 "lsll %2,%0 ; setz %1"
1339 : "=r" (l), "=rm" (valid) : "rm" (sel));
1340 BUG_ON(valid && (l != *limit));
1342 #endif
1344 else
1346 *base = 0UL;
1347 *limit = ~0UL;
1350 return 1;
1353 #ifdef __x86_64__
1354 static int read_gate_descriptor(unsigned int gate_sel,
1355 const struct vcpu *v,
1356 unsigned int *sel,
1357 unsigned long *off,
1358 unsigned int *ar)
1360 struct desc_struct desc;
1361 const struct desc_struct *pdesc;
1364 pdesc = (const struct desc_struct *)
1365 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1366 + (gate_sel >> 3);
1367 if ( (gate_sel < 4) ||
1368 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1369 __get_user(desc, pdesc) )
1370 return 0;
1372 *sel = (desc.a >> 16) & 0x0000fffc;
1373 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1374 *ar = desc.b & 0x0000ffff;
1376 /*
1377 * check_descriptor() clears the DPL field and stores the
1378 * guest requested DPL in the selector's RPL field.
1379 */
1380 if ( *ar & _SEGMENT_DPL )
1381 return 0;
1382 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1384 if ( !is_pv_32bit_vcpu(v) )
1386 if ( (*ar & 0x1f00) != 0x0c00 ||
1387 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1388 __get_user(desc, pdesc + 1) ||
1389 (desc.b & 0x1f00) )
1390 return 0;
1392 *off |= (unsigned long)desc.a << 32;
1393 return 1;
1396 switch ( *ar & 0x1f00 )
1398 case 0x0400:
1399 *off &= 0xffff;
1400 break;
1401 case 0x0c00:
1402 break;
1403 default:
1404 return 0;
1407 return 1;
1409 #endif
1411 /* Has the guest requested sufficient permission for this I/O access? */
1412 static int guest_io_okay(
1413 unsigned int port, unsigned int bytes,
1414 struct vcpu *v, struct cpu_user_regs *regs)
1416 #if defined(__x86_64__)
1417 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1418 int user_mode = !(v->arch.flags & TF_kernel_mode);
1419 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1420 #elif defined(__i386__)
1421 #define TOGGLE_MODE() ((void)0)
1422 #endif
1424 if ( !vm86_mode(regs) &&
1425 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1426 return 1;
1428 if ( v->arch.iobmp_limit > (port + bytes) )
1430 union { uint8_t bytes[2]; uint16_t mask; } x;
1432 /*
1433 * Grab permission bytes from guest space. Inaccessible bytes are
1434 * read as 0xff (no access allowed).
1435 */
1436 TOGGLE_MODE();
1437 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1438 port>>3, 2) )
1440 default: x.bytes[0] = ~0;
1441 case 1: x.bytes[1] = ~0;
1442 case 0: break;
1444 TOGGLE_MODE();
1446 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1447 return 1;
1450 return 0;
1453 /* Has the administrator granted sufficient permission for this I/O access? */
1454 static int admin_io_okay(
1455 unsigned int port, unsigned int bytes,
1456 struct vcpu *v, struct cpu_user_regs *regs)
1458 /*
1459 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1460 * We never permit direct access to that register.
1461 */
1462 if ( (port == 0xcf8) && (bytes == 4) )
1463 return 0;
1465 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1468 static uint32_t guest_io_read(
1469 unsigned int port, unsigned int bytes,
1470 struct vcpu *v, struct cpu_user_regs *regs)
1472 extern uint32_t pci_conf_read(
1473 uint32_t cf8, uint8_t offset, uint8_t bytes);
1475 uint32_t data = 0;
1476 unsigned int shift = 0;
1478 if ( admin_io_okay(port, bytes, v, regs) )
1480 switch ( bytes )
1482 case 1: return inb(port);
1483 case 2: return inw(port);
1484 case 4: return inl(port);
1488 while ( bytes != 0 )
1490 unsigned int size = 1;
1491 uint32_t sub_data = 0xff;
1493 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1495 sub_data = pv_pit_handler(port, 0, 0);
1497 else if ( (port == 0xcf8) && (bytes == 4) )
1499 size = 4;
1500 sub_data = v->domain->arch.pci_cf8;
1502 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1504 size = min(bytes, 4 - (port & 3));
1505 if ( size == 3 )
1506 size = 2;
1507 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1510 if ( size == 4 )
1511 return sub_data;
1513 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1514 shift += size * 8;
1515 port += size;
1516 bytes -= size;
1519 return data;
1522 static void guest_io_write(
1523 unsigned int port, unsigned int bytes, uint32_t data,
1524 struct vcpu *v, struct cpu_user_regs *regs)
1526 extern void pci_conf_write(
1527 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1529 if ( admin_io_okay(port, bytes, v, regs) )
1531 switch ( bytes ) {
1532 case 1:
1533 outb((uint8_t)data, port);
1534 if ( pv_post_outb_hook )
1535 pv_post_outb_hook(port, (uint8_t)data);
1536 break;
1537 case 2:
1538 outw((uint16_t)data, port);
1539 break;
1540 case 4:
1541 outl(data, port);
1542 break;
1544 return;
1547 while ( bytes != 0 )
1549 unsigned int size = 1;
1551 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1553 pv_pit_handler(port, (uint8_t)data, 1);
1555 else if ( (port == 0xcf8) && (bytes == 4) )
1557 size = 4;
1558 v->domain->arch.pci_cf8 = data;
1560 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1562 size = min(bytes, 4 - (port & 3));
1563 if ( size == 3 )
1564 size = 2;
1565 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1568 if ( size == 4 )
1569 return;
1571 port += size;
1572 bytes -= size;
1573 data >>= size * 8;
1577 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1578 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1579 __attribute__((__regparm__(1)));
1580 unsigned long guest_to_host_gpr_switch(unsigned long)
1581 __attribute__((__regparm__(1)));
1583 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1585 /* Instruction fetch with error handling. */
1586 #define insn_fetch(type, base, eip, limit) \
1587 ({ unsigned long _rc, _ptr = (base) + (eip); \
1588 type _x; \
1589 if ( ad_default < 8 ) \
1590 _ptr = (unsigned int)_ptr; \
1591 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1592 goto fail; \
1593 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1594 { \
1595 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1596 goto skip; \
1597 } \
1598 (eip) += sizeof(_x); _x; })
1600 #if defined(CONFIG_X86_32)
1601 # define read_sreg(regs, sr) ((regs)->sr)
1602 #elif defined(CONFIG_X86_64)
1603 # define read_sreg(regs, sr) read_segment_register(sr)
1604 #endif
1606 static int emulate_privileged_op(struct cpu_user_regs *regs)
1608 struct vcpu *v = current;
1609 unsigned long *reg, eip = regs->eip, res;
1610 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1611 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1612 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1613 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1614 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1615 ? regs->reg \
1616 : ad_bytes == 4 \
1617 ? (u32)regs->reg \
1618 : (u16)regs->reg)
1619 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1620 ? regs->reg = (val) \
1621 : ad_bytes == 4 \
1622 ? (*(u32 *)&regs->reg = (val)) \
1623 : (*(u16 *)&regs->reg = (val)))
1624 unsigned long code_base, code_limit;
1625 char io_emul_stub[32];
1626 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1627 u32 l, h, eax, edx;
1629 if ( !read_descriptor(regs->cs, v, regs,
1630 &code_base, &code_limit, &ar,
1631 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1632 goto fail;
1633 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1634 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1635 if ( !(ar & _SEGMENT_S) ||
1636 !(ar & _SEGMENT_P) ||
1637 !(ar & _SEGMENT_CODE) )
1638 goto fail;
1640 /* emulating only opcodes not allowing SS to be default */
1641 data_sel = read_sreg(regs, ds);
1643 /* Legacy prefixes. */
1644 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1646 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1648 case 0x66: /* operand-size override */
1649 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1650 continue;
1651 case 0x67: /* address-size override */
1652 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1653 continue;
1654 case 0x2e: /* CS override */
1655 data_sel = regs->cs;
1656 continue;
1657 case 0x3e: /* DS override */
1658 data_sel = read_sreg(regs, ds);
1659 continue;
1660 case 0x26: /* ES override */
1661 data_sel = read_sreg(regs, es);
1662 continue;
1663 case 0x64: /* FS override */
1664 data_sel = read_sreg(regs, fs);
1665 lm_ovr = lm_seg_fs;
1666 continue;
1667 case 0x65: /* GS override */
1668 data_sel = read_sreg(regs, gs);
1669 lm_ovr = lm_seg_gs;
1670 continue;
1671 case 0x36: /* SS override */
1672 data_sel = regs->ss;
1673 continue;
1674 case 0xf0: /* LOCK */
1675 lock = 1;
1676 continue;
1677 case 0xf2: /* REPNE/REPNZ */
1678 case 0xf3: /* REP/REPE/REPZ */
1679 rep_prefix = 1;
1680 continue;
1681 default:
1682 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1684 rex = opcode;
1685 continue;
1687 break;
1689 break;
1692 /* REX prefix. */
1693 if ( rex & 8 ) /* REX.W */
1694 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1695 modrm_reg = (rex & 4) << 1; /* REX.R */
1696 /* REX.X does not need to be decoded. */
1697 modrm_rm = (rex & 1) << 3; /* REX.B */
1699 if ( opcode == 0x0f )
1700 goto twobyte_opcode;
1702 if ( lock )
1703 goto fail;
1705 /* Input/Output String instructions. */
1706 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1708 unsigned long data_base, data_limit;
1710 if ( rep_prefix && (rd_ad(ecx) == 0) )
1711 goto done;
1713 if ( !(opcode & 2) )
1715 data_sel = read_sreg(regs, es);
1716 lm_ovr = lm_seg_none;
1719 if ( !(ar & _SEGMENT_L) )
1721 if ( !read_descriptor(data_sel, v, regs,
1722 &data_base, &data_limit, &ar,
1723 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1724 _SEGMENT_P) )
1725 goto fail;
1726 if ( !(ar & _SEGMENT_S) ||
1727 !(ar & _SEGMENT_P) ||
1728 (opcode & 2 ?
1729 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1730 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1731 goto fail;
1733 #ifdef CONFIG_X86_64
1734 else
1736 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1738 switch ( lm_ovr )
1740 case lm_seg_none:
1741 data_base = 0UL;
1742 break;
1743 case lm_seg_fs:
1744 data_base = v->arch.guest_context.fs_base;
1745 break;
1746 case lm_seg_gs:
1747 if ( guest_kernel_mode(v, regs) )
1748 data_base = v->arch.guest_context.gs_base_kernel;
1749 else
1750 data_base = v->arch.guest_context.gs_base_user;
1751 break;
1754 else
1755 read_descriptor(data_sel, v, regs,
1756 &data_base, &data_limit, &ar,
1757 0);
1758 data_limit = ~0UL;
1759 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1761 #endif
1763 port = (u16)regs->edx;
1765 continue_io_string:
1766 switch ( opcode )
1768 case 0x6c: /* INSB */
1769 op_bytes = 1;
1770 case 0x6d: /* INSW/INSL */
1771 if ( (data_limit < (op_bytes - 1)) ||
1772 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1773 !guest_io_okay(port, op_bytes, v, regs) )
1774 goto fail;
1775 data = guest_io_read(port, op_bytes, v, regs);
1776 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1777 &data, op_bytes)) != 0 )
1779 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1780 PFEC_write_access);
1781 return EXCRET_fault_fixed;
1783 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1784 ? -op_bytes : op_bytes));
1785 break;
1787 case 0x6e: /* OUTSB */
1788 op_bytes = 1;
1789 case 0x6f: /* OUTSW/OUTSL */
1790 if ( (data_limit < (op_bytes - 1)) ||
1791 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1792 !guest_io_okay(port, op_bytes, v, regs) )
1793 goto fail;
1794 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1795 op_bytes)) != 0 )
1797 propagate_page_fault(data_base + rd_ad(esi)
1798 + op_bytes - rc, 0);
1799 return EXCRET_fault_fixed;
1801 guest_io_write(port, op_bytes, data, v, regs);
1802 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1803 ? -op_bytes : op_bytes));
1804 break;
1807 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1809 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1811 if ( !bpmatch && !hypercall_preempt_check() )
1812 goto continue_io_string;
1813 eip = regs->eip;
1816 goto done;
1819 /*
1820 * Very likely to be an I/O instruction (IN/OUT).
1821 * Build an on-stack stub to execute the instruction with full guest
1822 * GPR context. This is needed for some systems which (ab)use IN/OUT
1823 * to communicate with BIOS code in system-management mode.
1824 */
1825 #ifdef __x86_64__
1826 /* movq $host_to_guest_gpr_switch,%rcx */
1827 io_emul_stub[0] = 0x48;
1828 io_emul_stub[1] = 0xb9;
1829 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1830 /* callq *%rcx */
1831 io_emul_stub[10] = 0xff;
1832 io_emul_stub[11] = 0xd1;
1833 #else
1834 /* call host_to_guest_gpr_switch */
1835 io_emul_stub[0] = 0xe8;
1836 *(s32 *)&io_emul_stub[1] =
1837 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1838 /* 7 x nop */
1839 memset(&io_emul_stub[5], 0x90, 7);
1840 #endif
1841 /* data16 or nop */
1842 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1843 /* <io-access opcode> */
1844 io_emul_stub[13] = opcode;
1845 /* imm8 or nop */
1846 io_emul_stub[14] = 0x90;
1847 /* ret (jumps to guest_to_host_gpr_switch) */
1848 io_emul_stub[15] = 0xc3;
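/* For example, emulating "in $0x71,%al" on x86-64 yields the 16-byte stub
 * 48 b9 <8-byte address of host_to_guest_gpr_switch> ff d1 90 e4 71 c3. */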
1850 /* Handy function-typed pointer to the stub. */
1851 io_emul = (void *)io_emul_stub;
1853 if ( ioemul_handle_quirk )
1854 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1856 /* I/O Port and Interrupt Flag instructions. */
1857 switch ( opcode )
1859 case 0xe4: /* IN imm8,%al */
1860 op_bytes = 1;
1861 case 0xe5: /* IN imm8,%eax */
1862 port = insn_fetch(u8, code_base, eip, code_limit);
1863 io_emul_stub[14] = port; /* imm8 */
1864 exec_in:
1865 if ( !guest_io_okay(port, op_bytes, v, regs) )
1866 goto fail;
1867 if ( admin_io_okay(port, op_bytes, v, regs) )
1869 io_emul(regs);
1871 else
1873 if ( op_bytes == 4 )
1874 regs->eax = 0;
1875 else
1876 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1877 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1879 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1880 goto done;
1882 case 0xec: /* IN %dx,%al */
1883 op_bytes = 1;
1884 case 0xed: /* IN %dx,%eax */
1885 port = (u16)regs->edx;
1886 goto exec_in;
1888 case 0xe6: /* OUT %al,imm8 */
1889 op_bytes = 1;
1890 case 0xe7: /* OUT %eax,imm8 */
1891 port = insn_fetch(u8, code_base, eip, code_limit);
1892 io_emul_stub[14] = port; /* imm8 */
1893 exec_out:
1894 if ( !guest_io_okay(port, op_bytes, v, regs) )
1895 goto fail;
1896 if ( admin_io_okay(port, op_bytes, v, regs) )
1898 io_emul(regs);
1899 if ( (op_bytes == 1) && pv_post_outb_hook )
1900 pv_post_outb_hook(port, regs->eax);
1902 else
1904 guest_io_write(port, op_bytes, regs->eax, v, regs);
1906 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1907 goto done;
1909 case 0xee: /* OUT %al,%dx */
1910 op_bytes = 1;
1911 case 0xef: /* OUT %eax,%dx */
1912 port = (u16)regs->edx;
1913 goto exec_out;
1915 case 0xfa: /* CLI */
1916 case 0xfb: /* STI */
1917 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1918 goto fail;
1919 /*
1920 * This is just too dangerous to allow, in my opinion. Consider if the
1921 * caller then tries to reenable interrupts using POPF: we can't trap
1922 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1923 * do for us. :-)
1924 */
1925 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1926 goto done;
1929 /* No decode of this single-byte opcode. */
1930 goto fail;
1932 twobyte_opcode:
1933 /* Two-byte opcodes only emulated from guest kernel. */
1934 if ( !guest_kernel_mode(v, regs) )
1935 goto fail;
1937 /* Privileged (ring 0) instructions. */
1938 opcode = insn_fetch(u8, code_base, eip, code_limit);
1939 if ( lock && (opcode & ~3) != 0x20 )
1940 goto fail;
1941 switch ( opcode )
1943 case 0x06: /* CLTS */
1944 (void)do_fpu_taskswitch(0);
1945 break;
1947 case 0x09: /* WBINVD */
1948 /* Ignore the instruction if unprivileged. */
1949 if ( !cache_flush_permitted(v->domain) )
1950 /* Non-physdev domain attempted WBINVD; ignore for now since
1951 newer linux uses this in some start-of-day timing loops */
1953 else
1954 wbinvd();
1955 break;
1957 case 0x20: /* MOV CR?,<reg> */
1958 opcode = insn_fetch(u8, code_base, eip, code_limit);
1959 if ( opcode < 0xc0 )
1960 goto fail;
1961 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1962 modrm_rm |= (opcode >> 0) & 7;
1963 reg = decode_register(modrm_rm, regs, 0);
1964 switch ( modrm_reg )
1966 case 0: /* Read CR0 */
1967 *reg = (read_cr0() & ~X86_CR0_TS) |
1968 v->arch.guest_context.ctrlreg[0];
1969 break;
1971 case 2: /* Read CR2 */
1972 *reg = v->arch.guest_context.ctrlreg[2];
1973 break;
1975 case 3: /* Read CR3 */
1976 if ( !is_pv_32on64_vcpu(v) )
1977 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1978 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1979 #ifdef CONFIG_COMPAT
1980 else
1981 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1982 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1983 #endif
1984 break;
1986 case 4: /* Read CR4 */
1987 /*
1988 * Guests can read CR4 to see what features Xen has enabled. We
1989 * therefore lie about PGE & PSE as they are unavailable to guests.
1990 */
1991 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1992 break;
1994 default:
1995 goto fail;
1997 break;
1999 case 0x21: /* MOV DR?,<reg> */
2000 opcode = insn_fetch(u8, code_base, eip, code_limit);
2001 if ( opcode < 0xc0 )
2002 goto fail;
2003 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2004 modrm_rm |= (opcode >> 0) & 7;
2005 reg = decode_register(modrm_rm, regs, 0);
2006 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2007 goto fail;
2008 *reg = res;
2009 break;
2011 case 0x22: /* MOV <reg>,CR? */
2012 opcode = insn_fetch(u8, code_base, eip, code_limit);
2013 if ( opcode < 0xc0 )
2014 goto fail;
2015 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2016 modrm_rm |= (opcode >> 0) & 7;
2017 reg = decode_register(modrm_rm, regs, 0);
2018 switch ( modrm_reg )
2020 case 0: /* Write CR0 */
2021 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2023 gdprintk(XENLOG_WARNING,
2024 "Attempt to change unmodifiable CR0 flags.\n");
2025 goto fail;
2027 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2028 break;
2030 case 2: /* Write CR2 */
2031 v->arch.guest_context.ctrlreg[2] = *reg;
2032 arch_set_cr2(v, *reg);
2033 break;
2035 case 3: /* Write CR3 */
2036 domain_lock(v->domain);
2037 if ( !is_pv_32on64_vcpu(v) )
2038 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2039 #ifdef CONFIG_COMPAT
2040 else
2041 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2042 #endif
2043 domain_unlock(v->domain);
2044 if ( rc == 0 ) /* not okay */
2045 goto fail;
2046 break;
2048 case 4: /* Write CR4 */
2049 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2050 write_cr4(pv_guest_cr4_to_real_cr4(
2051 v->arch.guest_context.ctrlreg[4]));
2052 break;
2054 default:
2055 goto fail;
2057 break;
2059 case 0x23: /* MOV <reg>,DR? */
2060 opcode = insn_fetch(u8, code_base, eip, code_limit);
2061 if ( opcode < 0xc0 )
2062 goto fail;
2063 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2064 modrm_rm |= (opcode >> 0) & 7;
2065 reg = decode_register(modrm_rm, regs, 0);
2066 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2067 goto fail;
2068 break;
2070 case 0x30: /* WRMSR */
2071 eax = regs->eax;
2072 edx = regs->edx;
2073 res = ((u64)edx << 32) | eax;
2074 switch ( (u32)regs->ecx )
2076 #ifdef CONFIG_X86_64
2077 case MSR_FS_BASE:
2078 if ( is_pv_32on64_vcpu(v) )
2079 goto fail;
2080 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2081 goto fail;
2082 v->arch.guest_context.fs_base = res;
2083 break;
2084 case MSR_GS_BASE:
2085 if ( is_pv_32on64_vcpu(v) )
2086 goto fail;
2087 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2088 goto fail;
2089 v->arch.guest_context.gs_base_kernel = res;
2090 break;
2091 case MSR_SHADOW_GS_BASE:
2092 if ( is_pv_32on64_vcpu(v) )
2093 goto fail;
2094 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2095 goto fail;
2096 v->arch.guest_context.gs_base_user = res;
2097 break;
2098 #endif
2099 case MSR_K7_FID_VID_STATUS:
2100 case MSR_K7_FID_VID_CTL:
2101 case MSR_K8_PSTATE_LIMIT:
2102 case MSR_K8_PSTATE_CTRL:
2103 case MSR_K8_PSTATE_STATUS:
2104 case MSR_K8_PSTATE0:
2105 case MSR_K8_PSTATE1:
2106 case MSR_K8_PSTATE2:
2107 case MSR_K8_PSTATE3:
2108 case MSR_K8_PSTATE4:
2109 case MSR_K8_PSTATE5:
2110 case MSR_K8_PSTATE6:
2111 case MSR_K8_PSTATE7:
2112 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2113 goto fail;
2114 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2115 break;
2116 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2117 goto fail;
2118 break;
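/*
 * The two AMD Fam10h/11h configuration MSRs below are only writable in
 * a restricted way: unprivileged domains have the write silently
 * ignored, and even a privileged domain may only change a small
 * whitelisted set of bits (the CF8 extended-config enable for NB_CFG;
 * the enable, bus-range and base fields for MMIO_CONF_BASE). Any other
 * change is logged and discarded via the 'invalid' path.
 */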
2119 case MSR_AMD64_NB_CFG:
2120 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2121 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2122 goto fail;
2123 if ( !IS_PRIV(v->domain) )
2124 break;
2125 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2126 (eax != l) ||
2127 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2128 goto invalid;
2129 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2130 goto fail;
2131 break;
2132 case MSR_FAM10H_MMIO_CONF_BASE:
2133 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2134 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2135 goto fail;
2136 if ( !IS_PRIV(v->domain) )
2137 break;
2138 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2139 (((((u64)h << 32) | l) ^ res) &
2140 ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
2141 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2142 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2143 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2144 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2145 goto invalid;
2146 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2147 goto fail;
2148 break;
2149 case MSR_IA32_PERF_CTL:
2150 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2151 goto fail;
2152 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2153 break;
2154 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2155 goto fail;
2156 break;
2157 case MSR_IA32_THERM_CONTROL:
2158 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2159 goto fail;
2160 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2161 goto fail;
2162 break;
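/*
 * Default WRMSR policy, roughly: first give the hypervisor-virtualised
 * MSR range a chance (wrmsr_hypervisor_regs); failing that, a write is
 * tolerated only if it would not change the current value, and anything
 * else is warned about and dropped rather than reaching hardware.
 */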
2163 default:
2164 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2165 break;
2166 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2167 (eax != l) || (edx != h) )
2168 invalid:
2169 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2170 "%08x:%08x to %08x:%08x.\n",
2171 _p(regs->ecx), h, l, edx, eax);
2172 break;
2174 break;
2176 case 0x31: /* RDTSC */
2177 rdtsc(regs->eax, regs->edx);
2178 break;
2180 case 0x32: /* RDMSR */
2181 switch ( (u32)regs->ecx )
2183 #ifdef CONFIG_X86_64
2184 case MSR_FS_BASE:
2185 if ( is_pv_32on64_vcpu(v) )
2186 goto fail;
2187 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2188 regs->edx = v->arch.guest_context.fs_base >> 32;
2189 break;
2190 case MSR_GS_BASE:
2191 if ( is_pv_32on64_vcpu(v) )
2192 goto fail;
2193 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2194 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2195 break;
2196 case MSR_SHADOW_GS_BASE:
2197 if ( is_pv_32on64_vcpu(v) )
2198 goto fail;
2199 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2200 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2201 break;
2202 #endif
2203 case MSR_K7_FID_VID_CTL:
2204 case MSR_K7_FID_VID_STATUS:
2205 case MSR_K8_PSTATE_LIMIT:
2206 case MSR_K8_PSTATE_CTRL:
2207 case MSR_K8_PSTATE_STATUS:
2208 case MSR_K8_PSTATE0:
2209 case MSR_K8_PSTATE1:
2210 case MSR_K8_PSTATE2:
2211 case MSR_K8_PSTATE3:
2212 case MSR_K8_PSTATE4:
2213 case MSR_K8_PSTATE5:
2214 case MSR_K8_PSTATE6:
2215 case MSR_K8_PSTATE7:
2216 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2217 goto fail;
2218 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2220 regs->eax = regs->edx = 0;
2221 break;
2223 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2224 goto fail;
2225 break;
2226 case MSR_EFER:
2227 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2228 goto fail;
2229 break;
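/*
 * Reads of IA32_MISC_ENABLE are sanitised below: the perfmon-available
 * and MONITOR enable bits are cleared, and BTS, PEBS and xTPR are
 * reported as unavailable/disabled, presumably so guests do not try to
 * use features Xen does not virtualise for them.
 */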
2230 case MSR_IA32_MISC_ENABLE:
2231 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2232 goto fail;
2233 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2234 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2235 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2236 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2237 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2238 break;
2239 case MSR_IA32_THERM_CONTROL:
2240 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2241 goto fail;
2242 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2243 goto fail;
2244 break;
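/*
 * Default RDMSR policy: hypervisor-synthesised MSRs are handled first
 * (rdmsr_hypervisor_regs); any other index is simply read from
 * hardware, failing only if the access itself faults.
 */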
2245 default:
2246 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2248 regs->eax = l;
2249 regs->edx = h;
2250 break;
2252 /* Everyone can read the MSR space. */
2253 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2254 _p(regs->ecx));*/
2255 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2256 goto fail;
2257 break;
2259 break;
2261 default:
2262 goto fail;
2265 #undef wr_ad
2266 #undef rd_ad
2268 done:
2269 instruction_done(regs, eip, bpmatch);
2270 skip:
2271 return EXCRET_fault_fixed;
2273 fail:
2274 return 0;
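/*
 * check_stack_limit(): returns non-zero if a push of 'decr' bytes
 * starting at 'esp' stays within the stack segment. The first clause
 * guards against the decremented pointer wrapping; the second applies
 * the limit with the usual expand-up/expand-down (_SEGMENT_EC)
 * distinction, since for an expand-down stack valid offsets lie above
 * the limit rather than below it.
 */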
2277 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2278 unsigned int esp, unsigned int decr)
2280 return (((esp - decr) < (esp - 1)) &&
2281 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2284 static void emulate_gate_op(struct cpu_user_regs *regs)
2286 #ifdef __x86_64__
2287 struct vcpu *v = current;
2288 unsigned int sel, ar, dpl, nparm, opnd_sel;
2289 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2290 unsigned long off, eip, opnd_off, base, limit;
2291 int jump;
2293 /* Check whether this fault is due to the use of a call gate. */
2294 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2295 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2296 ((ar & _SEGMENT_TYPE) != 0xc00) )
2298 do_guest_trap(TRAP_gp_fault, regs, 1);
2299 return;
2301 if ( !(ar & _SEGMENT_P) )
2303 do_guest_trap(TRAP_no_segment, regs, 1);
2304 return;
2306 dpl = (ar >> 13) & 3;
2307 nparm = ar & 0x1f;
2309 /*
2310 * Decode instruction (and perhaps operand) to determine RPL,
2311 * whether this is a jump or a call, and the call return offset.
2312 */
2313 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2314 !(ar & _SEGMENT_S) ||
2315 !(ar & _SEGMENT_P) ||
2316 !(ar & _SEGMENT_CODE) )
2318 do_guest_trap(TRAP_gp_fault, regs, 1);
2319 return;
2322 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2323 ad_default = ad_bytes = op_default;
2324 opnd_sel = opnd_off = 0;
2325 jump = -1;
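/*
 * The loop below scans up to 10 instruction bytes for a far transfer:
 * direct far JMP (0xea) or CALL (0x9a), or the indirect FF /5 (jmp) and
 * FF /3 (call) forms, collecting any segment-override and
 * operand/address-size prefixes along the way. The 'jump' variable
 * encodes the result: it stays negative if no far transfer was found,
 * ends up 0 for a call and positive for a jmp.
 */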
2326 for ( eip = regs->eip; eip - regs->_eip < 10; )
2328 switch ( insn_fetch(u8, base, eip, limit) )
2330 case 0x66: /* operand-size override */
2331 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2332 continue;
2333 case 0x67: /* address-size override */
2334 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2335 continue;
2336 case 0x2e: /* CS override */
2337 opnd_sel = regs->cs;
2338 ASSERT(opnd_sel);
2339 continue;
2340 case 0x3e: /* DS override */
2341 opnd_sel = read_sreg(regs, ds);
2342 if ( !opnd_sel )
2343 opnd_sel = dpl;
2344 continue;
2345 case 0x26: /* ES override */
2346 opnd_sel = read_sreg(regs, es);
2347 if ( !opnd_sel )
2348 opnd_sel = dpl;
2349 continue;
2350 case 0x64: /* FS override */
2351 opnd_sel = read_sreg(regs, fs);
2352 if ( !opnd_sel )
2353 opnd_sel = dpl;
2354 continue;
2355 case 0x65: /* GS override */
2356 opnd_sel = read_sreg(regs, gs);
2357 if ( !opnd_sel )
2358 opnd_sel = dpl;
2359 continue;
2360 case 0x36: /* SS override */
2361 opnd_sel = regs->ss;
2362 if ( !opnd_sel )
2363 opnd_sel = dpl;
2364 continue;
2365 case 0xea:
2366 ++jump;
2367 /* FALLTHROUGH */
2368 case 0x9a:
2369 ++jump;
2370 opnd_sel = regs->cs;
2371 opnd_off = eip;
2372 ad_bytes = ad_default;
2373 eip += op_bytes + 2;
2374 break;
2375 case 0xff:
2377 unsigned int modrm;
2379 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2381 case 0x28: case 0x68: case 0xa8:
2382 ++jump;
2383 /* FALLTHROUGH */
2384 case 0x18: case 0x58: case 0x98:
2385 ++jump;
2386 if ( ad_bytes != 2 )
2388 if ( (modrm & 7) == 4 )
2390 unsigned int sib;
2391 sib = insn_fetch(u8, base, eip, limit);
2393 modrm = (modrm & ~7) | (sib & 7);
2394 if ( (sib >>= 3) != 4 )
2395 opnd_off = *(unsigned long *)
2396 decode_register(sib & 7, regs, 0);
2397 opnd_off <<= sib >> 3;
2399 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2400 opnd_off += *(unsigned long *)
2401 decode_register(modrm & 7, regs, 0);
2402 else
2403 modrm |= 0x87;
2404 if ( !opnd_sel )
2406 switch ( modrm & 7 )
2408 default:
2409 opnd_sel = read_sreg(regs, ds);
2410 break;
2411 case 4: case 5:
2412 opnd_sel = regs->ss;
2413 break;
2417 else
2419 switch ( modrm & 7 )
2421 case 0: case 1: case 7:
2422 opnd_off = regs->ebx;
2423 break;
2424 case 6:
2425 if ( !(modrm & 0xc0) )
2426 modrm |= 0x80;
2427 else
2428 case 2: case 3:
2430 opnd_off = regs->ebp;
2431 if ( !opnd_sel )
2432 opnd_sel = regs->ss;
2434 break;
2436 if ( !opnd_sel )
2437 opnd_sel = read_sreg(regs, ds);
2438 switch ( modrm & 7 )
2440 case 0: case 2: case 4:
2441 opnd_off += regs->esi;
2442 break;
2443 case 1: case 3: case 5:
2444 opnd_off += regs->edi;
2445 break;
2448 switch ( modrm & 0xc0 )
2450 case 0x40:
2451 opnd_off += insn_fetch(s8, base, eip, limit);
2452 break;
2453 case 0x80:
2454 opnd_off += insn_fetch(s32, base, eip, limit);
2455 break;
2457 if ( ad_bytes == 4 )
2458 opnd_off = (unsigned int)opnd_off;
2459 else if ( ad_bytes == 2 )
2460 opnd_off = (unsigned short)opnd_off;
2461 break;
2464 break;
2466 break;
2469 if ( jump < 0 )
2471 fail:
2472 do_guest_trap(TRAP_gp_fault, regs, 1);
2473 skip:
2474 return;
2477 if ( (opnd_sel != regs->cs &&
2478 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2479 !(ar & _SEGMENT_S) ||
2480 !(ar & _SEGMENT_P) ||
2481 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2483 do_guest_trap(TRAP_gp_fault, regs, 1);
2484 return;
2487 opnd_off += op_bytes;
2488 #define ad_default ad_bytes
2489 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2490 #undef ad_default
2491 ASSERT((opnd_sel & ~3) == regs->error_code);
2492 if ( dpl < (opnd_sel & 3) )
2494 do_guest_trap(TRAP_gp_fault, regs, 1);
2495 return;
2498 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2499 !(ar & _SEGMENT_S) ||
2500 !(ar & _SEGMENT_CODE) ||
2501 (!jump || (ar & _SEGMENT_EC) ?
2502 ((ar >> 13) & 3) > (regs->cs & 3) :
2503 ((ar >> 13) & 3) != (regs->cs & 3)) )
2505 regs->error_code = sel;
2506 do_guest_trap(TRAP_gp_fault, regs, 1);
2507 return;
2509 if ( !(ar & _SEGMENT_P) )
2511 regs->error_code = sel;
2512 do_guest_trap(TRAP_no_segment, regs, 1);
2513 return;
2515 if ( off > limit )
2517 regs->error_code = 0;
2518 do_guest_trap(TRAP_gp_fault, regs, 1);
2519 return;
2522 if ( !jump )
2524 unsigned int ss, esp, *stkp;
2525 int rc;
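/*
 * push() below emulates a 32-bit push onto the guest stack being built:
 * it decrements the stack pointer, writes the item with __put_user(),
 * and on a fault forwards a write page fault to the guest before
 * abandoning the emulation.
 */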
2526 #define push(item) do \
2527 { \
2528 --stkp; \
2529 esp -= 4; \
2530 rc = __put_user(item, stkp); \
2531 if ( rc ) \
2532 { \
2533 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2534 PFEC_write_access); \
2535 return; \
2536 } \
2537 } while ( 0 )
2539 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2541 sel |= (ar >> 13) & 3;
2542 /* Inner stack known only for kernel ring. */
2543 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2545 do_guest_trap(TRAP_gp_fault, regs, 1);
2546 return;
2548 esp = v->arch.guest_context.kernel_sp;
2549 ss = v->arch.guest_context.kernel_ss;
2550 if ( (ss & 3) != (sel & 3) ||
2551 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2552 ((ar >> 13) & 3) != (sel & 3) ||
2553 !(ar & _SEGMENT_S) ||
2554 (ar & _SEGMENT_CODE) ||
2555 !(ar & _SEGMENT_WR) )
2557 regs->error_code = ss & ~3;
2558 do_guest_trap(TRAP_invalid_tss, regs, 1);
2559 return;
2561 if ( !(ar & _SEGMENT_P) ||
2562 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2564 regs->error_code = ss & ~3;
2565 do_guest_trap(TRAP_stack_error, regs, 1);
2566 return;
2568 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2569 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2571 do_guest_trap(TRAP_gp_fault, regs, 1);
2572 return;
2574 push(regs->ss);
2575 push(regs->esp);
2576 if ( nparm )
2578 const unsigned int *ustkp;
2580 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2581 ((ar >> 13) & 3) != (regs->cs & 3) ||
2582 !(ar & _SEGMENT_S) ||
2583 (ar & _SEGMENT_CODE) ||
2584 !(ar & _SEGMENT_WR) ||
2585 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2586 return do_guest_trap(TRAP_gp_fault, regs, 1);
2587 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2588 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2590 do_guest_trap(TRAP_gp_fault, regs, 1);
2591 return;
2593 do
2595 unsigned int parm;
2597 --ustkp;
2598 rc = __get_user(parm, ustkp);
2599 if ( rc )
2601 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2602 return;
2604 push(parm);
2605 } while ( --nparm );
2608 else
2610 sel |= (regs->cs & 3);
2611 esp = regs->esp;
2612 ss = regs->ss;
2613 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2614 ((ar >> 13) & 3) != (sel & 3) )
2616 do_guest_trap(TRAP_gp_fault, regs, 1);
2617 return;
2619 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2621 regs->error_code = 0;
2622 do_guest_trap(TRAP_stack_error, regs, 1);
2623 return;
2625 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2626 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2628 do_guest_trap(TRAP_gp_fault, regs, 1);
2629 return;
2632 push(regs->cs);
2633 push(eip);
2634 #undef push
2635 regs->esp = esp;
2636 regs->ss = ss;
2638 else
2639 sel |= (regs->cs & 3);
2641 regs->cs = sel;
2642 instruction_done(regs, off, 0);
2643 #endif
2646 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2648 struct vcpu *v = current;
2649 unsigned long fixup;
2651 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2653 if ( regs->error_code & 1 )
2654 goto hardware_gp;
2656 if ( !guest_mode(regs) )
2657 goto gp_in_kernel;
2659 /*
2660 * Cunning trick to allow arbitrary "INT n" handling.
2662 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2663 * instruction from trapping to the appropriate vector, when that might not
2664 * be expected by Xen or the guest OS. For example, that entry might be for
2665 * a fault handler (unlike traps, faults don't increment EIP), or might
2666 * expect an error code on the stack (which a software trap never
2667 * provides), or might be a hardware interrupt handler that doesn't like
2668 * being called spuriously.
2670 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2671 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2672 * clear to indicate that it's a software fault, not hardware.
2674 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2675 * okay because they can only be triggered by an explicit DPL-checked
2676 * instruction. The DPL specified by the guest OS for these vectors is NOT
2677 * CHECKED!!
2678 */
2679 if ( (regs->error_code & 3) == 2 )
2681 /* This fault must be due to an <INT n> instruction. */
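/* The error code for an IDT-referencing software fault is
 * (vector << 3) | 2, so the vector is recovered below by shifting
 * right three bits. */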
2682 const struct trap_info *ti;
2683 unsigned char vector = regs->error_code >> 3;
2684 ti = &v->arch.guest_context.trap_ctxt[vector];
2685 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2687 regs->eip += 2;
2688 do_guest_trap(vector, regs, 0);
2689 return;
2692 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2694 emulate_gate_op(regs);
2695 return;
2698 /* Emulate some simple privileged and I/O instructions. */
2699 if ( (regs->error_code == 0) &&
2700 emulate_privileged_op(regs) )
2702 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2703 return;
2706 #if defined(__i386__)
2707 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2708 (regs->error_code == 0) &&
2709 gpf_emulate_4gb(regs) )
2711 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2712 return;
2714 #endif
2716 /* Pass on GPF as is. */
2717 do_guest_trap(TRAP_gp_fault, regs, 1);
2718 return;
2720 gp_in_kernel:
2722 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2724 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2725 regs->error_code, _p(regs->eip), _p(fixup));
2726 regs->eip = fixup;
2727 return;
2730 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2732 hardware_gp:
2733 show_execution_state(regs);
2734 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2737 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2739 static void nmi_mce_softirq(void)
2741 int cpu = smp_processor_id();
2742 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2743 cpumask_t affinity;
2745 BUG_ON(st == NULL);
2746 BUG_ON(st->vcpu == NULL);
2748 /* Set the temporary affinity (cpu_affinity_tmp) unconditionally,
2749 * so that the check in the iret hypercall works. */
2750 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2752 if ( (cpu != st->processor) ||
2753 (st->processor != st->vcpu->processor) )
2755 /* We are on a different physical CPU.
2756 * Make sure to wake up the vcpu on the
2757 * specified processor.
2758 */
2759 cpus_clear(affinity);
2760 cpu_set(st->processor, affinity);
2761 vcpu_set_affinity(st->vcpu, &affinity);
2763 /* Affinity is restored in the iret hypercall. */
2766 /* Only used to defer wakeup of domain/vcpu to
2767 * a safe (non-NMI/MCE) context.
2768 */
2769 vcpu_kick(st->vcpu);
2772 static void nmi_dom0_report(unsigned int reason_idx)
2774 struct domain *d = dom0;
2776 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2777 return;
2779 set_bit(reason_idx, nmi_reason(d));
2781 send_guest_trap(d, 0, TRAP_nmi);
2784 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2786 switch ( opt_nmi[0] )
2788 case 'd': /* 'dom0' */
2789 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2790 case 'i': /* 'ignore' */
2791 break;
2792 default: /* 'fatal' */
2793 console_force_unlock();
2794 printk("\n\nNMI - MEMORY ERROR\n");
2795 fatal_trap(TRAP_nmi, regs);
2798 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2799 mdelay(1);
2800 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2803 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2805 switch ( opt_nmi[0] )
2807 case 'd': /* 'dom0' */
2808 nmi_dom0_report(_XEN_NMIREASON_io_error);
2809 case 'i': /* 'ignore' */
2810 break;
2811 default: /* 'fatal' */
2812 console_force_unlock();
2813 printk("\n\nNMI - I/O ERROR\n");
2814 fatal_trap(TRAP_nmi, regs);
2817 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2818 mdelay(1);
2819 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2822 static void unknown_nmi_error(unsigned char reason)
2824 switch ( opt_nmi[0] )
2826 case 'd': /* 'dom0' */
2827 nmi_dom0_report(_XEN_NMIREASON_unknown);
2828 case 'i': /* 'ignore' */
2829 break;
2830 default: /* 'fatal' */
2831 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2832 printk("Dazed and confused, but trying to continue\n");
2833 printk("Do you have a strange power saving mode enabled?\n");
2834 kexec_crash();
2838 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2840 return 0;
2843 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2845 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2847 unsigned int cpu = smp_processor_id();
2848 unsigned char reason;
2850 ++nmi_count(cpu);
2852 if ( nmi_callback(regs, cpu) )
2853 return;
2855 if ( nmi_watchdog )
2856 nmi_watchdog_tick(regs);
2858 /* Only the BSP gets external NMIs from the system. */
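/*
 * Port 0x61 is the legacy NMI status/control port: bit 7 reports a
 * memory parity/SERR error and bit 6 an I/O check (IOCHK) error. The
 * handlers above clear the latched condition by briefly toggling the
 * corresponding enable bits at the same port.
 */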
2859 if ( cpu == 0 )
2861 reason = inb(0x61);
2862 if ( reason & 0x80 )
2863 mem_parity_error(regs);
2864 else if ( reason & 0x40 )
2865 io_check_error(regs);
2866 else if ( !nmi_watchdog )
2867 unknown_nmi_error((unsigned char)(reason&0xff));
2871 void set_nmi_callback(nmi_callback_t callback)
2873 nmi_callback = callback;
2876 void unset_nmi_callback(void)
2878 nmi_callback = dummy_nmi_callback;
2881 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2883 struct vcpu *curr = current;
2885 BUG_ON(!guest_mode(regs));
2887 setup_fpu(curr);
2889 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2891 do_guest_trap(TRAP_no_device, regs, 0);
2892 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2894 else
2895 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2897 return;
2900 asmlinkage void do_debug(struct cpu_user_regs *regs)
2902 struct vcpu *v = current;
2904 DEBUGGER_trap_entry(TRAP_debug, regs);
2906 if ( !guest_mode(regs) )
2908 if ( regs->eflags & EF_TF )
2910 #ifdef __x86_64__
2911 void sysenter_entry(void);
2912 void sysenter_eflags_saved(void);
2913 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2914 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2915 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2916 goto out;
2917 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2918 #else
2919 WARN_ON(1);
2920 #endif
2921 regs->eflags &= ~EF_TF;
2923 else
2925 /*
2926 * We ignore watchpoints when they trigger within Xen. This may
2927 * happen when a buffer is passed to us which previously had a
2928 * watchpoint set on it. No need to bump EIP; the only faulting
2929 * trap is an instruction breakpoint, which can't happen to us.
2930 */
2931 WARN_ON(!search_exception_table(regs->eip));
2933 goto out;
2936 /* Save debug status register where guest OS can peek at it */
2937 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2939 ler_enable();
2940 do_guest_trap(TRAP_debug, regs, 0);
2941 return;
2943 out:
2944 ler_enable();
2945 return;
2948 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2952 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2954 int i;
2955 /* Keep secondary tables in sync with IRQ updates. */
2956 for ( i = 1; i < NR_CPUS; i++ )
2957 if ( idt_tables[i] != NULL )
2958 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2959 _set_gate(&idt_table[n], 14, dpl, addr);
2962 static void set_swint_gate(unsigned int n, void *addr)
2964 __set_intr_gate(n, 3, addr);
2967 void set_intr_gate(unsigned int n, void *addr)
2969 __set_intr_gate(n, 0, addr);
2972 void set_tss_desc(unsigned int n, void *addr)
2974 _set_tssldt_desc(
2975 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2976 (unsigned long)addr,
2977 offsetof(struct tss_struct, __cacheline_filler) - 1,
2978 9);
2979 #ifdef CONFIG_COMPAT
2980 _set_tssldt_desc(
2981 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2982 (unsigned long)addr,
2983 offsetof(struct tss_struct, __cacheline_filler) - 1,
2984 11);
2985 #endif
2988 void __devinit percpu_traps_init(void)
2990 subarch_percpu_traps_init();
2992 if ( !opt_ler )
2993 return;
2995 switch ( boot_cpu_data.x86_vendor )
2997 case X86_VENDOR_INTEL:
2998 switch ( boot_cpu_data.x86 )
3000 case 6:
3001 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3002 break;
3003 case 15:
3004 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3005 break;
3007 break;
3008 case X86_VENDOR_AMD:
3009 switch ( boot_cpu_data.x86 )
3011 case 6:
3012 case 15:
3013 case 16:
3014 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3015 break;
3017 break;
3020 ler_enable();
3023 void __init trap_init(void)
3025 /*
3026 * Note that interrupt gates are always used, rather than trap gates. We
3027 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3028 * first activation must have the "bad" value(s) for these registers and
3029 * we may lose them if another activation is installed before they are
3030 * saved. The page-fault handler also needs interrupts disabled until %cr2
3031 * has been read and saved on the stack.
3032 */
3033 set_intr_gate(TRAP_divide_error,&divide_error);
3034 set_intr_gate(TRAP_debug,&debug);
3035 set_intr_gate(TRAP_nmi,&nmi);
3036 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3037 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3038 set_intr_gate(TRAP_bounds,&bounds);
3039 set_intr_gate(TRAP_invalid_op,&invalid_op);
3040 set_intr_gate(TRAP_no_device,&device_not_available);
3041 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3042 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3043 set_intr_gate(TRAP_no_segment,&segment_not_present);
3044 set_intr_gate(TRAP_stack_error,&stack_segment);
3045 set_intr_gate(TRAP_gp_fault,&general_protection);
3046 set_intr_gate(TRAP_page_fault,&page_fault);
3047 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3048 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3049 set_intr_gate(TRAP_alignment_check,&alignment_check);
3050 set_intr_gate(TRAP_machine_check,&machine_check);
3051 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3053 /* CPU0 uses the master IDT. */
3054 idt_tables[0] = idt_table;
3056 percpu_traps_init();
3058 cpu_init();
3060 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3063 long register_guest_nmi_callback(unsigned long address)
3065 struct vcpu *v = current;
3066 struct domain *d = v->domain;
3067 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3069 t->vector = TRAP_nmi;
3070 t->flags = 0;
3071 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
3072 t->address = address;
3073 TI_SET_IF(t, 1);
3075 /*
3076 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3077 * now.
3078 */
3079 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3080 v->nmi_pending = 1;
3082 return 0;
3085 long unregister_guest_nmi_callback(void)
3087 struct vcpu *v = current;
3088 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3090 memset(t, 0, sizeof(*t));
3092 return 0;
3095 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3097 struct vcpu *v;
3098 struct trap_info *t;
3100 BUG_ON(d == NULL);
3101 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3103 /* Sanity check - XXX should be more fine-grained. */
3104 BUG_ON(trap_nr > TRAP_syscall);
3106 v = d->vcpu[vcpuid];
3107 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3109 return (t->address != 0);
3113 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3115 struct vcpu *v;
3116 struct softirq_trap *st;
3118 BUG_ON(d == NULL);
3119 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3120 v = d->vcpu[vcpuid];
3122 switch (trap_nr) {
3123 case TRAP_nmi:
3124 if ( !test_and_set_bool(v->nmi_pending) ) {
3125 st = &per_cpu(softirq_trap, smp_processor_id());
3126 st->domain = dom0;
3127 st->vcpu = dom0->vcpu[0];
3128 st->processor = st->vcpu->processor;
3130 /* not safe to wake up a vcpu here */
3131 raise_softirq(NMI_MCE_SOFTIRQ);
3132 return 0;
3134 break;
3136 case TRAP_machine_check:
3138 /* We are called by the machine check (exception or polling) handlers
3139 * on the physical CPU that reported a machine check error. */
3141 if ( !test_and_set_bool(v->mce_pending) ) {
3142 st = &per_cpu(softirq_trap, smp_processor_id());
3143 st->domain = d;
3144 st->vcpu = v;
3145 st->processor = v->processor;
3147 /* not safe to wake up a vcpu here */
3148 raise_softirq(NMI_MCE_SOFTIRQ);
3149 return 0;
3151 break;
3154 /* delivery failed */
3155 return -EIO;
3159 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3161 struct trap_info cur;
3162 struct vcpu *curr = current;
3163 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3164 long rc = 0;
3166 /* If no table is presented then clear the entire virtual IDT. */
3167 if ( guest_handle_is_null(traps) )
3169 memset(dst, 0, 256 * sizeof(*dst));
3170 init_int80_direct_trap(curr);
3171 return 0;
3174 for ( ; ; )
3176 if ( hypercall_preempt_check() )
3178 rc = hypercall_create_continuation(
3179 __HYPERVISOR_set_trap_table, "h", traps);
3180 break;
3183 if ( copy_from_guest(&cur, traps, 1) )
3185 rc = -EFAULT;
3186 break;
3189 if ( cur.address == 0 )
3190 break;
3192 fixup_guest_code_selector(curr->domain, cur.cs);
3194 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3196 if ( cur.vector == 0x80 )
3197 init_int80_direct_trap(curr);
3199 guest_handle_add_offset(traps, 1);
3202 return rc;
3205 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3207 int i;
3208 struct vcpu *curr = current;
3210 switch ( reg )
3212 case 0:
3213 if ( !access_ok(value, sizeof(long)) )
3214 return -EPERM;
3215 if ( v == curr )
3216 write_debugreg(0, value);
3217 break;
3218 case 1:
3219 if ( !access_ok(value, sizeof(long)) )
3220 return -EPERM;
3221 if ( v == curr )
3222 write_debugreg(1, value);
3223 break;
3224 case 2:
3225 if ( !access_ok(value, sizeof(long)) )
3226 return -EPERM;
3227 if ( v == curr )
3228 write_debugreg(2, value);
3229 break;
3230 case 3:
3231 if ( !access_ok(value, sizeof(long)) )
3232 return -EPERM;
3233 if ( v == curr )
3234 write_debugreg(3, value);
3235 break;
3236 case 6:
3237 /*
3238 * DR6: Bits 4-11,16-31 reserved (set to 1).
3239 * Bit 12 reserved (set to 0).
3240 */
3241 value &= 0xffffefff; /* reserved bits => 0 */
3242 value |= 0xffff0ff0; /* reserved bits => 1 */
3243 if ( v == curr )
3244 write_debugreg(6, value);
3245 break;
3246 case 7:
3247 /*
3248 * DR7: Bit 10 reserved (set to 1).
3249 * Bits 11-12,14-15 reserved (set to 0).
3250 */
3251 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3252 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3253 /*
3254 * Privileged bits:
3255 * GD (bit 13): must be 0.
3256 */
3257 if ( value & DR_GENERAL_DETECT )
3258 return -EPERM;
3259 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3260 if ( value & DR7_ACTIVE_MASK )
3262 unsigned int io_enable = 0;
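/*
 * DR7 layout, as relied upon here: bits 0-7 are the per-breakpoint
 * local/global enables (DR7_ACTIVE_MASK), and starting at bit 16
 * (DR_CONTROL_SHIFT) each of the four breakpoints has a 4-bit
 * (DR_CONTROL_SIZE) type/length field. A type of DR_IO (I/O breakpoint)
 * is only architecturally valid with CR4.DE set, hence the check below;
 * the enables for I/O breakpoints are stripped from the real DR7 and
 * stashed in the virtual DR5 instead.
 */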
3264 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3266 if ( ((value >> i) & 3) == DR_IO )
3268 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3269 return -EPERM;
3270 io_enable |= value & (3 << ((i - 16) >> 1));
3272 #ifdef __i386__
3273 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3274 !boot_cpu_has(X86_FEATURE_LM)) &&
3275 (((value >> i) & 0xc) == DR_LEN_8) )
3276 return -EPERM;
3277 #endif
3280 /* Guest DR5 is a handy stash for I/O intercept information. */
3281 v->arch.guest_context.debugreg[5] = io_enable;
3282 value &= ~io_enable;
3284 /*
3285 * If DR7 was previously clear then we need to load all other
3286 * debug registers at this point as they were not restored during
3287 * context switch.
3288 */
3289 if ( (v == curr) &&
3290 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3292 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3293 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3294 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3295 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3296 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3299 if ( v == curr )
3300 write_debugreg(7, value);
3301 break;
3302 default:
3303 return -EINVAL;
3306 v->arch.guest_context.debugreg[reg] = value;
3307 return 0;
3310 long do_set_debugreg(int reg, unsigned long value)
3312 return set_debugreg(current, reg, value);
3315 unsigned long do_get_debugreg(int reg)
3317 struct vcpu *curr = current;
3319 switch ( reg )
3321 case 0 ... 3:
3322 case 6:
3323 return curr->arch.guest_context.debugreg[reg];
3324 case 7:
3325 return (curr->arch.guest_context.debugreg[7] |
3326 curr->arch.guest_context.debugreg[5]);
3327 case 4 ... 5:
3328 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3329 curr->arch.guest_context.debugreg[reg + 2] : 0);
3332 return -EINVAL;
3335 /*
3336 * Local variables:
3337 * mode: C
3338 * c-set-style: "BSD"
3339 * c-basic-offset: 4
3340 * tab-width: 4
3341 * indent-tabs-mode: nil
3342 * End:
3343 */