ia64/xen-unstable

view xen/arch/x86/traps.c @ 10892:0d2ba35c0cf2

[XEN] Add hypercall support for HVM guests. This is of
limited use at the moment, since all of the hypercalls
fail: copy_from_user does not yet work correctly in HVM
domains.

Signed-off-by: Steven Smith <ssmith@xensource.com>

Add a CPUID hypervisor platform interface at leaf
0x40000000. Allow hypercall transfer page to be filled
in via MSR 0x40000000.

Signed-off-by: Keir Fraser <keir@xensource.com>
author: kfraser@localhost.localdomain
date: Tue Aug 01 17:18:05 2006 +0100 (2006-08-01)
parents: 2d2ed4d9b1c1
children: 16aa4b417c6b
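
For context, a minimal sketch of how an HVM guest might use the interface this
changeset adds. The cpuid() and wrmsr() helpers and the guest frame number of a
page-aligned buffer are assumptions for illustration, not part of this patch;
the MSR layout follows wrmsr_hypervisor_regs() in the listing below.

#include <stdint.h>

extern void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
                  uint32_t *c, uint32_t *d);               /* assumed helper */
extern void wrmsr(uint32_t msr, uint32_t lo, uint32_t hi); /* assumed helper */

static int init_xen_hypercalls(uint64_t gmfn)   /* frame of a page-aligned buffer */
{
    uint32_t eax, ebx, ecx, edx;

    /* Leaf 0x40000000: signature "XenVMMXenVMM" in ebx:ecx:edx. */
    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
    if ( ebx != 0x566e6558 || ecx != 0x65584d4d || edx != 0x4d4d566e )
        return 0;                                /* not running on Xen */

    /* Leaf 0x40000002: eax = number of hypercall pages, ebx = MSR base. */
    cpuid(0x40000002, &eax, &ebx, &ecx, &edx);

    /* MSR value = (guest frame << 12) | page index; only index 0 exists. */
    wrmsr(ebx, (uint32_t)(gmfn << 12), (uint32_t)(gmfn >> 20));
    return 1;
}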
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/reboot.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <asm/shadow.h>
49 #include <asm/system.h>
50 #include <asm/io.h>
51 #include <asm/atomic.h>
52 #include <asm/desc.h>
53 #include <asm/debugreg.h>
54 #include <asm/smp.h>
55 #include <asm/flushtlb.h>
56 #include <asm/uaccess.h>
57 #include <asm/i387.h>
58 #include <asm/debugger.h>
59 #include <asm/msr.h>
60 #include <asm/x86_emulate.h>
62 /*
63 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
64 * fatal: Xen prints diagnostic message and then hangs.
65 * dom0: The NMI is virtualised to DOM0.
66 * ignore: The NMI error is cleared and ignored.
67 */
68 #ifdef NDEBUG
69 char opt_nmi[10] = "dom0";
70 #else
71 char opt_nmi[10] = "fatal";
72 #endif
73 string_param("nmi", opt_nmi);
75 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
76 idt_entry_t idt_table[IDT_ENTRIES];
78 #define DECLARE_TRAP_HANDLER(_name) \
79 asmlinkage void _name(void); \
80 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
82 asmlinkage void nmi(void);
83 DECLARE_TRAP_HANDLER(divide_error);
84 DECLARE_TRAP_HANDLER(debug);
85 DECLARE_TRAP_HANDLER(int3);
86 DECLARE_TRAP_HANDLER(overflow);
87 DECLARE_TRAP_HANDLER(bounds);
88 DECLARE_TRAP_HANDLER(invalid_op);
89 DECLARE_TRAP_HANDLER(device_not_available);
90 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
91 DECLARE_TRAP_HANDLER(invalid_TSS);
92 DECLARE_TRAP_HANDLER(segment_not_present);
93 DECLARE_TRAP_HANDLER(stack_segment);
94 DECLARE_TRAP_HANDLER(general_protection);
95 DECLARE_TRAP_HANDLER(page_fault);
96 DECLARE_TRAP_HANDLER(coprocessor_error);
97 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
98 DECLARE_TRAP_HANDLER(alignment_check);
99 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
100 DECLARE_TRAP_HANDLER(machine_check);
102 long do_set_debugreg(int reg, unsigned long value);
103 unsigned long do_get_debugreg(int reg);
105 static int debug_stack_lines = 20;
106 integer_param("debug_stack_lines", debug_stack_lines);
108 #ifdef CONFIG_X86_32
109 #define stack_words_per_line 8
110 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
111 #else
112 #define stack_words_per_line 4
113 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
114 #endif
116 int is_kernel_text(unsigned long addr)
117 {
118 extern char _stext, _etext;
119 if (addr >= (unsigned long) &_stext &&
120 addr <= (unsigned long) &_etext)
121 return 1;
122 return 0;
124 }
126 unsigned long kernel_text_end(void)
127 {
128 extern char _etext;
129 return (unsigned long) &_etext;
130 }
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
137 if ( hvm_guest(current) )
138 return;
140 if ( vm86_mode(regs) )
141 {
142 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
143 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
144 regs->ss, (uint16_t)(regs->esp & 0xffff));
145 }
146 else
147 {
148 stack = (unsigned long *)regs->esp;
149 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
150 }
152 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
153 {
154 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
155 break;
156 if ( get_user(addr, stack) )
157 {
158 if ( i != 0 )
159 printk("\n ");
160 printk("Fault while accessing guest memory.");
161 i = 1;
162 break;
163 }
164 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
165 printk("\n ");
166 printk(" %p", _p(addr));
167 stack++;
168 }
169 if ( i == 0 )
170 printk("Stack empty.");
171 printk("\n");
172 }
174 #ifdef NDEBUG
176 static void show_trace(struct cpu_user_regs *regs)
177 {
178 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
180 printk("Xen call trace:\n ");
182 printk("[<%p>]", _p(regs->eip));
183 print_symbol(" %s\n ", regs->eip);
185 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
186 {
187 addr = *stack++;
188 if ( is_kernel_text(addr) )
189 {
190 printk("[<%p>]", _p(addr));
191 print_symbol(" %s\n ", addr);
192 }
193 }
195 printk("\n");
196 }
198 #else
200 static void show_trace(struct cpu_user_regs *regs)
201 {
202 unsigned long *frame, next, addr, low, high;
204 printk("Xen call trace:\n ");
206 printk("[<%p>]", _p(regs->eip));
207 print_symbol(" %s\n ", regs->eip);
209 /* Bounds for range of valid frame pointer. */
210 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
211 high = (low & ~(STACK_SIZE - 1)) +
212 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
214 /* The initial frame pointer. */
215 next = regs->ebp;
217 for ( ; ; )
218 {
219 /* Valid frame pointer? */
220 if ( (next < low) || (next >= high) )
221 {
222 /*
223 * Exception stack frames have a different layout, denoted by an
224 * inverted frame pointer.
225 */
226 next = ~next;
227 if ( (next < low) || (next >= high) )
228 break;
229 frame = (unsigned long *)next;
230 next = frame[0];
231 addr = frame[(offsetof(struct cpu_user_regs, eip) -
232 offsetof(struct cpu_user_regs, ebp))
233 / BYTES_PER_LONG];
234 }
235 else
236 {
237 /* Ordinary stack frame. */
238 frame = (unsigned long *)next;
239 next = frame[0];
240 addr = frame[1];
241 }
243 printk("[<%p>]", _p(addr));
244 print_symbol(" %s\n ", addr);
246 low = (unsigned long)&frame[2];
247 }
249 printk("\n");
250 }
252 #endif
254 void show_stack(struct cpu_user_regs *regs)
255 {
256 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
257 int i;
259 if ( guest_mode(regs) )
260 return show_guest_stack(regs);
262 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
264 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
265 {
266 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
267 break;
268 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
269 printk("\n ");
270 addr = *stack++;
271 printk(" %p", _p(addr));
272 }
273 if ( i == 0 )
274 printk("Stack empty.");
275 printk("\n");
277 show_trace(regs);
278 }
280 void show_stack_overflow(unsigned long esp)
281 {
282 #ifdef MEMORY_GUARD
283 unsigned long esp_top;
284 unsigned long *stack, addr;
286 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
288 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
289 if ( ((unsigned long)(esp - esp_top) > 512) &&
290 ((unsigned long)(esp_top - esp) > 512) )
291 return;
293 if ( esp < esp_top )
294 esp = esp_top;
296 printk("Xen stack overflow:\n ");
298 stack = (unsigned long *)esp;
299 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
300 {
301 addr = *stack++;
302 if ( is_kernel_text(addr) )
303 {
304 printk("%p: [<%p>]", stack, _p(addr));
305 print_symbol(" %s\n ", addr);
306 }
307 }
309 printk("\n");
310 #endif
311 }
313 void show_execution_state(struct cpu_user_regs *regs)
314 {
315 show_registers(regs);
316 show_stack(regs);
317 }
319 /*
320 * This is called for faults at very unexpected times (e.g., when interrupts
321 * are disabled). In such situations we can't do much that is safe. We try to
322 * print out some tracing and then we just spin.
323 */
324 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
325 {
326 int cpu = smp_processor_id();
327 unsigned long cr2;
328 static char *trapstr[] = {
329 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
330 "invalid opcode", "device not available", "double fault",
331 "coprocessor segment", "invalid tss", "segment not found",
332 "stack error", "general protection fault", "page fault",
333 "spurious interrupt", "coprocessor error", "alignment check",
334 "machine check", "simd error"
335 };
337 watchdog_disable();
338 console_start_sync();
340 show_execution_state(regs);
342 if ( trapnr == TRAP_page_fault )
343 {
344 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
345 printk("Faulting linear address: %p\n", _p(cr2));
346 show_page_walk(cr2);
347 }
349 printk("************************************\n");
350 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
351 cpu, trapnr, trapstr[trapnr], regs->error_code,
352 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
353 printk("System shutting down -- need manual reset.\n");
354 printk("************************************\n");
356 (void)debugger_trap_fatal(trapnr, regs);
358 /* Lock up the console to prevent spurious output from other CPUs. */
359 console_force_lock();
361 /* Wait for manual reset. */
362 machine_halt();
363 }
365 static inline int do_trap(int trapnr, char *str,
366 struct cpu_user_regs *regs,
367 int use_error_code)
368 {
369 struct vcpu *v = current;
370 struct trap_bounce *tb = &v->arch.trap_bounce;
371 struct trap_info *ti;
372 unsigned long fixup;
374 DEBUGGER_trap_entry(trapnr, regs);
376 if ( !guest_mode(regs) )
377 goto xen_fault;
379 ti = &current->arch.guest_context.trap_ctxt[trapnr];
380 tb->flags = TBF_EXCEPTION;
381 tb->cs = ti->cs;
382 tb->eip = ti->address;
383 if ( use_error_code )
384 {
385 tb->flags |= TBF_EXCEPTION_ERRCODE;
386 tb->error_code = regs->error_code;
387 }
388 if ( TI_GET_IF(ti) )
389 tb->flags |= TBF_INTERRUPT;
390 return 0;
392 xen_fault:
394 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
395 {
396 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
397 regs->eip = fixup;
398 return 0;
399 }
401 DEBUGGER_trap_fatal(trapnr, regs);
403 show_execution_state(regs);
404 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
405 "[error_code=%04x]\n",
406 smp_processor_id(), trapnr, str, regs->error_code);
407 return 0;
408 }
410 #define DO_ERROR_NOCODE(trapnr, str, name) \
411 asmlinkage int do_##name(struct cpu_user_regs *regs) \
412 { \
413 return do_trap(trapnr, str, regs, 0); \
414 }
416 #define DO_ERROR(trapnr, str, name) \
417 asmlinkage int do_##name(struct cpu_user_regs *regs) \
418 { \
419 return do_trap(trapnr, str, regs, 1); \
420 }
422 DO_ERROR_NOCODE( 0, "divide error", divide_error)
423 DO_ERROR_NOCODE( 4, "overflow", overflow)
424 DO_ERROR_NOCODE( 5, "bounds", bounds)
425 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
426 DO_ERROR(10, "invalid TSS", invalid_TSS)
427 DO_ERROR(11, "segment not present", segment_not_present)
428 DO_ERROR(12, "stack segment", stack_segment)
429 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
430 DO_ERROR(17, "alignment check", alignment_check)
431 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
433 int rdmsr_hypervisor_regs(
434 uint32_t idx, uint32_t *eax, uint32_t *edx)
435 {
436 idx -= 0x40000000;
437 if ( idx > 0 )
438 return 0;
440 *eax = *edx = 0;
441 return 1;
442 }
444 int wrmsr_hypervisor_regs(
445 uint32_t idx, uint32_t eax, uint32_t edx)
446 {
447 struct domain *d = current->domain;
449 idx -= 0x40000000;
450 if ( idx > 0 )
451 return 0;
453 switch ( idx )
454 {
455 case 0:
456 {
457 void *hypercall_page;
458 unsigned long mfn;
459 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
460 unsigned int idx = eax & 0xfff;
462 if ( idx > 0 )
463 {
464 DPRINTK("Dom%d: Out of range index %u to MSR %08x\n",
465 d->domain_id, idx, 0x40000000);
466 return 0;
467 }
469 mfn = gmfn_to_mfn(d, gmfn);
471 if ( !mfn_valid(mfn) ||
472 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
473 {
474 DPRINTK("Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
475 d->domain_id, gmfn, mfn, 0x40000000);
476 return 0;
477 }
479 hypercall_page = map_domain_page(mfn);
480 hypercall_page_initialise(d, hypercall_page);
481 unmap_domain_page(hypercall_page);
483 put_page_and_type(mfn_to_page(mfn));
484 break;
485 }
487 default:
488 BUG();
489 }
491 return 1;
492 }
494 int cpuid_hypervisor_leaves(
495 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
496 {
497 idx -= 0x40000000;
498 if ( idx > 2 )
499 return 0;
501 switch ( idx )
502 {
503 case 0:
504 *eax = 0x40000002; /* Largest leaf */
505 *ebx = 0x566e6558; /* Signature 1: "XenV" */
506 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
507 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
508 break;
510 case 1:
511 *eax = (xen_major_version() << 16) | xen_minor_version();
512 *ebx = 0; /* Reserved */
513 *ecx = 0; /* Reserved */
514 *edx = 0; /* Reserved */
515 break;
517 case 2:
518 *eax = 1; /* Number of hypercall-transfer pages */
519 *ebx = 0x40000000; /* MSR base address */
520 *ecx = 0; /* Features 1 */
521 *edx = 0; /* Features 2 */
522 break;
524 default:
525 BUG();
526 }
528 return 1;
529 }
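/*
 * Illustration (sketch, not part of the original file): the three signature
 * registers returned by leaf 0x40000000 spell "XenVMMXenVMM" when laid out in
 * ebx, ecx, edx order, and leaf 0x40000001 packs the Xen version as
 * (major << 16) | minor.  A guest-side probe, assuming a cpuid() helper,
 * might look like this:
 */
#if 0
    uint32_t eax, ebx, ecx, edx;
    char signature[13];

    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
    memcpy(signature + 0, &ebx, 4);
    memcpy(signature + 4, &ecx, 4);
    memcpy(signature + 8, &edx, 4);
    signature[12] = '\0';                      /* "XenVMMXenVMM" */

    cpuid(0x40000001, &eax, &ebx, &ecx, &edx);
    printk("Running on Xen %u.%u\n", eax >> 16, eax & 0xffff);
#endif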
531 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
532 {
533 char sig[5], instr[2];
534 uint32_t a, b, c, d;
535 unsigned long eip, rc;
537 a = regs->eax;
538 b = regs->ebx;
539 c = regs->ecx;
540 d = regs->edx;
541 eip = regs->eip;
543 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
544 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
545 {
546 propagate_page_fault(eip + sizeof(sig) - rc, 0);
547 return EXCRET_fault_fixed;
548 }
549 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
550 return 0;
551 eip += sizeof(sig);
553 /* We only emulate CPUID. */
554 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
555 {
556 propagate_page_fault(eip + sizeof(instr) - rc, 0);
557 return EXCRET_fault_fixed;
558 }
559 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
560 return 0;
561 eip += sizeof(instr);
563 __asm__ (
564 "cpuid"
565 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
566 : "0" (a), "1" (b), "2" (c), "3" (d) );
568 if ( regs->eax == 1 )
569 {
570 /* Modify Feature Information. */
571 clear_bit(X86_FEATURE_VME, &d);
572 clear_bit(X86_FEATURE_DE, &d);
573 clear_bit(X86_FEATURE_PSE, &d);
574 clear_bit(X86_FEATURE_PGE, &d);
575 if ( !supervisor_mode_kernel )
576 clear_bit(X86_FEATURE_SEP, &d);
577 if ( !IS_PRIV(current->domain) )
578 clear_bit(X86_FEATURE_MTRR, &d);
579 }
580 else
581 {
582 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
583 }
585 regs->eax = a;
586 regs->ebx = b;
587 regs->ecx = c;
588 regs->edx = d;
589 regs->eip = eip;
591 return EXCRET_fault_fixed;
592 }
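/*
 * Illustration (sketch): a PV guest requests this filtered CPUID emulation by
 * prefixing the instruction with the forced-emulation signature matched above
 * (ud2 followed by the bytes "xen").  A guest-side wrapper might be:
 */
#if 0
static inline void xen_cpuid(uint32_t *eax, uint32_t *ebx,
                             uint32_t *ecx, uint32_t *edx)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (*eax), "1" (*ebx), "2" (*ecx), "3" (*edx) );
}
#endif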
594 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
595 {
596 struct vcpu *v = current;
597 struct trap_bounce *tb = &v->arch.trap_bounce;
598 struct trap_info *ti;
599 int rc;
601 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
603 if ( unlikely(!guest_mode(regs)) )
604 {
605 char sig[5];
606 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
607 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
608 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
609 {
610 show_execution_state(regs);
611 regs->eip += sizeof(sig);
612 return EXCRET_fault_fixed;
613 }
614 printk("%02x %02x %02x %02x %02x\n",
615 (unsigned char)sig[0],
616 (unsigned char)sig[1],
617 (unsigned char)sig[2],
618 (unsigned char)sig[3],
619 (unsigned char)sig[4]);
620 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
621 show_execution_state(regs);
622 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
623 smp_processor_id(), TRAP_invalid_op);
624 }
626 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
627 return rc;
629 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
630 tb->flags = TBF_EXCEPTION;
631 tb->cs = ti->cs;
632 tb->eip = ti->address;
633 if ( TI_GET_IF(ti) )
634 tb->flags |= TBF_INTERRUPT;
636 return 0;
637 }
639 asmlinkage int do_int3(struct cpu_user_regs *regs)
640 {
641 struct vcpu *v = current;
642 struct trap_bounce *tb = &v->arch.trap_bounce;
643 struct trap_info *ti;
645 DEBUGGER_trap_entry(TRAP_int3, regs);
647 if ( !guest_mode(regs) )
648 {
649 DEBUGGER_trap_fatal(TRAP_int3, regs);
650 show_execution_state(regs);
651 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
652 }
654 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
655 tb->flags = TBF_EXCEPTION;
656 tb->cs = ti->cs;
657 tb->eip = ti->address;
658 if ( TI_GET_IF(ti) )
659 tb->flags |= TBF_INTERRUPT;
661 return 0;
662 }
664 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
665 {
666 fatal_trap(TRAP_machine_check, regs);
667 return 0;
668 }
670 void propagate_page_fault(unsigned long addr, u16 error_code)
671 {
672 struct trap_info *ti;
673 struct vcpu *v = current;
674 struct trap_bounce *tb = &v->arch.trap_bounce;
676 v->arch.guest_context.ctrlreg[2] = addr;
677 v->vcpu_info->arch.cr2 = addr;
679 /* Re-set error_code.user flag appropriately for the guest. */
680 error_code &= ~PGERR_user_mode;
681 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
682 error_code |= PGERR_user_mode;
684 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
685 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
686 tb->error_code = error_code;
687 tb->cs = ti->cs;
688 tb->eip = ti->address;
689 if ( TI_GET_IF(ti) )
690 tb->flags |= TBF_INTERRUPT;
691 }
693 static int handle_gdt_ldt_mapping_fault(
694 unsigned long offset, struct cpu_user_regs *regs)
695 {
696 extern int map_ldt_shadow_page(unsigned int);
698 struct vcpu *v = current;
699 struct domain *d = v->domain;
700 int ret;
702 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
703 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
704 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
706 /* Should never fault in another vcpu's area. */
707 BUG_ON(vcpu_area != current->vcpu_id);
709 /* Byte offset within the gdt/ldt sub-area. */
710 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
712 if ( likely(is_ldt_area) )
713 {
714 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
715 LOCK_BIGLOCK(d);
716 cleanup_writable_pagetable(d);
717 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
718 UNLOCK_BIGLOCK(d);
720 if ( unlikely(ret == 0) )
721 {
722 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
723 if ( !guest_mode(regs) )
724 return 0;
725 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
726 propagate_page_fault(
727 v->arch.guest_context.ldt_base + offset, regs->error_code);
728 }
729 }
730 else
731 {
732 /* GDT fault: handle the fault as #GP(selector). */
733 regs->error_code = (u16)offset & ~7;
734 (void)do_general_protection(regs);
735 }
737 return EXCRET_fault_fixed;
738 }
740 #ifdef HYPERVISOR_VIRT_END
741 #define IN_HYPERVISOR_RANGE(va) \
742 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
743 #else
744 #define IN_HYPERVISOR_RANGE(va) \
745 (((va) >= HYPERVISOR_VIRT_START))
746 #endif
748 static int __spurious_page_fault(
749 unsigned long addr, struct cpu_user_regs *regs)
750 {
751 unsigned long mfn, cr3 = read_cr3();
752 #if CONFIG_PAGING_LEVELS >= 4
753 l4_pgentry_t l4e, *l4t;
754 #endif
755 #if CONFIG_PAGING_LEVELS >= 3
756 l3_pgentry_t l3e, *l3t;
757 #endif
758 l2_pgentry_t l2e, *l2t;
759 l1_pgentry_t l1e, *l1t;
760 unsigned int required_flags, disallowed_flags;
762 /* Reserved bit violations are never spurious faults. */
763 if ( regs->error_code & PGERR_reserved_bit )
764 return 0;
766 required_flags = _PAGE_PRESENT;
767 if ( regs->error_code & PGERR_write_access )
768 required_flags |= _PAGE_RW;
769 if ( regs->error_code & PGERR_user_mode )
770 required_flags |= _PAGE_USER;
772 disallowed_flags = 0;
773 if ( regs->error_code & PGERR_instr_fetch )
774 disallowed_flags |= _PAGE_NX;
776 mfn = cr3 >> PAGE_SHIFT;
778 #if CONFIG_PAGING_LEVELS >= 4
779 l4t = map_domain_page(mfn);
780 l4e = l4t[l4_table_offset(addr)];
781 mfn = l4e_get_pfn(l4e);
782 unmap_domain_page(l4t);
783 if ( !(l4e_get_flags(l4e) & required_flags) ||
784 (l4e_get_flags(l4e) & disallowed_flags) )
785 return 0;
786 #endif
788 #if CONFIG_PAGING_LEVELS >= 3
789 l3t = map_domain_page(mfn);
790 #ifdef CONFIG_X86_PAE
791 l3t += (cr3 & 0xFE0UL) >> 3;
792 #endif
793 l3e = l3t[l3_table_offset(addr)];
794 mfn = l3e_get_pfn(l3e);
795 unmap_domain_page(l3t);
796 #ifdef CONFIG_X86_PAE
797 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
798 return 0;
799 #else
800 if ( !(l3e_get_flags(l3e) & required_flags) ||
801 (l3e_get_flags(l3e) & disallowed_flags) )
802 return 0;
803 #endif
804 #endif
806 l2t = map_domain_page(mfn);
807 l2e = l2t[l2_table_offset(addr)];
808 mfn = l2e_get_pfn(l2e);
809 unmap_domain_page(l2t);
810 if ( !(l2e_get_flags(l2e) & required_flags) ||
811 (l2e_get_flags(l2e) & disallowed_flags) )
812 return 0;
813 if ( l2e_get_flags(l2e) & _PAGE_PSE )
814 {
815 l1e = l1e_empty(); /* define before use in debug tracing */
816 goto spurious;
817 }
819 l1t = map_domain_page(mfn);
820 l1e = l1t[l1_table_offset(addr)];
821 mfn = l1e_get_pfn(l1e);
822 unmap_domain_page(l1t);
823 if ( !(l1e_get_flags(l1e) & required_flags) ||
824 (l1e_get_flags(l1e) & disallowed_flags) )
825 return 0;
827 spurious:
828 DPRINTK("Spurious fault in domain %u:%u at addr %lx, e/c %04x\n",
829 current->domain->domain_id, current->vcpu_id,
830 addr, regs->error_code);
831 #if CONFIG_PAGING_LEVELS >= 4
832 DPRINTK(" l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
833 #endif
834 #if CONFIG_PAGING_LEVELS >= 3
835 DPRINTK(" l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
836 #endif
837 DPRINTK(" l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
838 DPRINTK(" l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
839 #ifndef NDEBUG
840 show_registers(regs);
841 #endif
842 return 1;
843 }
845 static int spurious_page_fault(
846 unsigned long addr, struct cpu_user_regs *regs)
847 {
848 struct domain *d = current->domain;
849 int is_spurious;
851 LOCK_BIGLOCK(d);
852 cleanup_writable_pagetable(d);
853 is_spurious = __spurious_page_fault(addr, regs);
854 UNLOCK_BIGLOCK(d);
856 return is_spurious;
857 }
859 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
860 {
861 struct vcpu *v = current;
862 struct domain *d = v->domain;
864 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
865 {
866 if ( shadow_mode_external(d) && guest_mode(regs) )
867 return shadow_fault(addr, regs);
868 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
869 return handle_gdt_ldt_mapping_fault(
870 addr - GDT_LDT_VIRT_START, regs);
871 /*
872 * Do not propagate spurious faults in the hypervisor area to the
873 * guest. It cannot fix them up.
874 */
875 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
876 }
878 if ( unlikely(shadow_mode_enabled(d)) )
879 return shadow_fault(addr, regs);
881 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
882 {
883 LOCK_BIGLOCK(d);
884 if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
885 unlikely(l2_linear_offset(addr) ==
886 d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
887 {
888 ptwr_flush(d, PTWR_PT_ACTIVE);
889 UNLOCK_BIGLOCK(d);
890 return EXCRET_fault_fixed;
891 }
893 /*
894 * Note it is *not* safe to check PGERR_page_present here. It can be
895 * clear, due to unhooked page table, when we would otherwise expect
896 * it to be set. We have an aversion to trusting that flag in Xen, and
897 * guests ought to be leery too.
898 */
899 if ( guest_kernel_mode(v, regs) &&
900 (regs->error_code & PGERR_write_access) &&
901 ptwr_do_page_fault(d, addr, regs) )
902 {
903 UNLOCK_BIGLOCK(d);
904 return EXCRET_fault_fixed;
905 }
906 UNLOCK_BIGLOCK(d);
907 }
909 return 0;
910 }
912 /*
913 * #PF error code:
914 * Bit 0: Protection violation (=1) ; Page not present (=0)
915 * Bit 1: Write access
916 * Bit 2: User mode (=1) ; Supervisor mode (=0)
917 * Bit 3: Reserved bit violation
918 * Bit 4: Instruction fetch
919 */
920 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
921 {
922 unsigned long addr, fixup;
923 int rc;
925 ASSERT(!in_irq());
927 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
929 DEBUGGER_trap_entry(TRAP_page_fault, regs);
931 perfc_incrc(page_faults);
933 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
934 return rc;
936 if ( unlikely(!guest_mode(regs)) )
937 {
938 if ( spurious_page_fault(addr, regs) )
939 return EXCRET_not_a_fault;
941 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
942 {
943 perfc_incrc(copy_user_faults);
944 regs->eip = fixup;
945 return 0;
946 }
948 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
950 show_execution_state(regs);
951 show_page_walk(addr);
952 panic("CPU%d FATAL PAGE FAULT\n"
953 "[error_code=%04x]\n"
954 "Faulting linear address: %p\n",
955 smp_processor_id(), regs->error_code, _p(addr));
956 }
958 propagate_page_fault(addr, regs->error_code);
959 return 0;
960 }
962 long do_fpu_taskswitch(int set)
963 {
964 struct vcpu *v = current;
966 if ( set )
967 {
968 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
969 stts();
970 }
971 else
972 {
973 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
974 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
975 clts();
976 }
978 return 0;
979 }
981 /* Has the guest requested sufficient permission for this I/O access? */
982 static inline int guest_io_okay(
983 unsigned int port, unsigned int bytes,
984 struct vcpu *v, struct cpu_user_regs *regs)
985 {
986 u16 x;
987 #if defined(__x86_64__)
988 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
989 int user_mode = !(v->arch.flags & TF_kernel_mode);
990 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
991 #elif defined(__i386__)
992 #define TOGGLE_MODE() ((void)0)
993 #endif
995 if ( !vm86_mode(regs) &&
996 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
997 return 1;
999 if ( v->arch.iobmp_limit > (port + bytes) )
1000 {
1001 TOGGLE_MODE();
1002 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
1003 TOGGLE_MODE();
1004 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
1005 return 1;
1006 }
1008 return 0;
1009 }
1011 /* Has the administrator granted sufficient permission for this I/O access? */
1012 static inline int admin_io_okay(
1013 unsigned int port, unsigned int bytes,
1014 struct vcpu *v, struct cpu_user_regs *regs)
1015 {
1016 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1017 }
1019 /* Check admin limits. Silently fail the access if it is disallowed. */
1020 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
1021 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
1022 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
1023 #define outb_user(_v, _p, _d, _r) \
1024 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
1025 #define outw_user(_v, _p, _d, _r) \
1026 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
1027 #define outl_user(_v, _p, _d, _r) \
1028 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
1030 /* Instruction fetch with error handling. */
1031 #define insn_fetch(_type, _size, _ptr) \
1032 ({ unsigned long _rc, _x; \
1033 if ( (_rc = copy_from_user(&_x, (_type *)eip, sizeof(_type))) != 0 ) \
1034 { \
1035 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
1036 return EXCRET_fault_fixed; \
1037 } \
1038 eip += _size; (_type)_x; })
1040 static int emulate_privileged_op(struct cpu_user_regs *regs)
1042 struct vcpu *v = current;
1043 unsigned long *reg, eip = regs->eip, res;
1044 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
1045 unsigned int port, i, op_bytes = 4, data, rc;
1046 u32 l, h;
1048 /* Legacy prefixes. */
1049 for ( i = 0; i < 8; i++ )
1051 switch ( opcode = insn_fetch(u8, 1, eip) )
1053 case 0x66: /* operand-size override */
1054 op_bytes ^= 6; /* switch between 2/4 bytes */
1055 break;
1056 case 0x67: /* address-size override */
1057 case 0x2e: /* CS override */
1058 case 0x3e: /* DS override */
1059 case 0x26: /* ES override */
1060 case 0x64: /* FS override */
1061 case 0x65: /* GS override */
1062 case 0x36: /* SS override */
1063 case 0xf0: /* LOCK */
1064 case 0xf2: /* REPNE/REPNZ */
1065 break;
1066 case 0xf3: /* REP/REPE/REPZ */
1067 rep_prefix = 1;
1068 break;
1069 default:
1070 goto done_prefixes;
1073 done_prefixes:
1075 #ifdef __x86_64__
1076 /* REX prefix. */
1077 if ( (opcode & 0xf0) == 0x40 )
1079 modrm_reg = (opcode & 4) << 1; /* REX.R */
1080 modrm_rm = (opcode & 1) << 3; /* REX.B */
1082 /* REX.W and REX.X do not need to be decoded. */
1083 opcode = insn_fetch(u8, 1, eip);
1085 #endif
1087 /* Input/Output String instructions. */
1088 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1090 if ( rep_prefix && (regs->ecx == 0) )
1091 goto done;
1093 continue_io_string:
1094 switch ( opcode )
1096 case 0x6c: /* INSB */
1097 op_bytes = 1;
1098 case 0x6d: /* INSW/INSL */
1099 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1100 goto fail;
1101 switch ( op_bytes )
1103 case 1:
1104 data = (u8)inb_user((u16)regs->edx, v, regs);
1105 break;
1106 case 2:
1107 data = (u16)inw_user((u16)regs->edx, v, regs);
1108 break;
1109 case 4:
1110 data = (u32)inl_user((u16)regs->edx, v, regs);
1111 break;
1113 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1115 propagate_page_fault(regs->edi + op_bytes - rc,
1116 PGERR_write_access);
1117 return EXCRET_fault_fixed;
1119 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1120 break;
1122 case 0x6e: /* OUTSB */
1123 op_bytes = 1;
1124 case 0x6f: /* OUTSW/OUTSL */
1125 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1126 goto fail;
1127 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1128 if ( rc != 0 )
1130 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1131 return EXCRET_fault_fixed;
1133 switch ( op_bytes )
1135 case 1:
1136 outb_user((u8)data, (u16)regs->edx, v, regs);
1137 break;
1138 case 2:
1139 outw_user((u16)data, (u16)regs->edx, v, regs);
1140 break;
1141 case 4:
1142 outl_user((u32)data, (u16)regs->edx, v, regs);
1143 break;
1145 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1146 break;
1149 if ( rep_prefix && (--regs->ecx != 0) )
1151 if ( !hypercall_preempt_check() )
1152 goto continue_io_string;
1153 eip = regs->eip;
1156 goto done;
1159 /* I/O Port and Interrupt Flag instructions. */
1160 switch ( opcode )
1162 case 0xe4: /* IN imm8,%al */
1163 op_bytes = 1;
1164 case 0xe5: /* IN imm8,%eax */
1165 port = insn_fetch(u8, 1, eip);
1166 exec_in:
1167 if ( !guest_io_okay(port, op_bytes, v, regs) )
1168 goto fail;
1169 switch ( op_bytes )
1171 case 1:
1172 regs->eax &= ~0xffUL;
1173 regs->eax |= (u8)inb_user(port, v, regs);
1174 break;
1175 case 2:
1176 regs->eax &= ~0xffffUL;
1177 regs->eax |= (u16)inw_user(port, v, regs);
1178 break;
1179 case 4:
1180 regs->eax = (u32)inl_user(port, v, regs);
1181 break;
1183 goto done;
1185 case 0xec: /* IN %dx,%al */
1186 op_bytes = 1;
1187 case 0xed: /* IN %dx,%eax */
1188 port = (u16)regs->edx;
1189 goto exec_in;
1191 case 0xe6: /* OUT %al,imm8 */
1192 op_bytes = 1;
1193 case 0xe7: /* OUT %eax,imm8 */
1194 port = insn_fetch(u8, 1, eip);
1195 exec_out:
1196 if ( !guest_io_okay(port, op_bytes, v, regs) )
1197 goto fail;
1198 switch ( op_bytes )
1200 case 1:
1201 outb_user((u8)regs->eax, port, v, regs);
1202 break;
1203 case 2:
1204 outw_user((u16)regs->eax, port, v, regs);
1205 break;
1206 case 4:
1207 outl_user((u32)regs->eax, port, v, regs);
1208 break;
1210 goto done;
1212 case 0xee: /* OUT %al,%dx */
1213 op_bytes = 1;
1214 case 0xef: /* OUT %eax,%dx */
1215 port = (u16)regs->edx;
1216 goto exec_out;
1218 case 0xfa: /* CLI */
1219 case 0xfb: /* STI */
1220 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1221 goto fail;
1222 /*
1223 * This is just too dangerous to allow, in my opinion. Consider if the
1224 * caller then tries to reenable interrupts using POPF: we can't trap
1225 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1226 * do for us. :-)
1227 */
1228 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1229 goto done;
1231 case 0x0f: /* Two-byte opcode */
1232 break;
1234 default:
1235 goto fail;
1238 /* Remaining instructions only emulated from guest kernel. */
1239 if ( !guest_kernel_mode(v, regs) )
1240 goto fail;
1242 /* Privileged (ring 0) instructions. */
1243 opcode = insn_fetch(u8, 1, eip);
1244 switch ( opcode )
1246 case 0x06: /* CLTS */
1247 (void)do_fpu_taskswitch(0);
1248 break;
1250 case 0x09: /* WBINVD */
1251 /* Ignore the instruction if unprivileged. */
1252 if ( !cache_flush_permitted(v->domain) )
1253 /* Non-physdev domain attempted WBINVD; ignore for now since
1254 newer linux uses this in some start-of-day timing loops */
1256 else
1257 wbinvd();
1258 break;
1260 case 0x20: /* MOV CR?,<reg> */
1261 opcode = insn_fetch(u8, 1, eip);
1262 modrm_reg |= (opcode >> 3) & 7;
1263 modrm_rm |= (opcode >> 0) & 7;
1264 reg = decode_register(modrm_rm, regs, 0);
1265 switch ( modrm_reg )
1267 case 0: /* Read CR0 */
1268 *reg = (read_cr0() & ~X86_CR0_TS) |
1269 v->arch.guest_context.ctrlreg[0];
1270 break;
1272 case 2: /* Read CR2 */
1273 *reg = v->arch.guest_context.ctrlreg[2];
1274 break;
1276 case 3: /* Read CR3 */
1277 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1278 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1279 break;
1281 case 4: /* Read CR4 */
1282 /*
1283 * Guests can read CR4 to see what features Xen has enabled. We
1284 * therefore lie about PGE & PSE as they are unavailable to guests.
1285 */
1286 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1287 break;
1289 default:
1290 goto fail;
1292 break;
1294 case 0x21: /* MOV DR?,<reg> */
1295 opcode = insn_fetch(u8, 1, eip);
1296 modrm_reg |= (opcode >> 3) & 7;
1297 modrm_rm |= (opcode >> 0) & 7;
1298 reg = decode_register(modrm_rm, regs, 0);
1299 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1300 goto fail;
1301 *reg = res;
1302 break;
1304 case 0x22: /* MOV <reg>,CR? */
1305 opcode = insn_fetch(u8, 1, eip);
1306 modrm_reg |= (opcode >> 3) & 7;
1307 modrm_rm |= (opcode >> 0) & 7;
1308 reg = decode_register(modrm_rm, regs, 0);
1309 switch ( modrm_reg )
1311 case 0: /* Write CR0 */
1312 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1314 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1315 goto fail;
1317 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1318 break;
1320 case 2: /* Write CR2 */
1321 v->arch.guest_context.ctrlreg[2] = *reg;
1322 v->vcpu_info->arch.cr2 = *reg;
1323 break;
1325 case 3: /* Write CR3 */
1326 LOCK_BIGLOCK(v->domain);
1327 cleanup_writable_pagetable(v->domain);
1328 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1329 UNLOCK_BIGLOCK(v->domain);
1330 break;
1332 case 4:
1333 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1335 DPRINTK("Attempt to change CR4 flags.\n");
1336 goto fail;
1338 break;
1340 default:
1341 goto fail;
1343 break;
1345 case 0x23: /* MOV <reg>,DR? */
1346 opcode = insn_fetch(u8, 1, eip);
1347 modrm_reg |= (opcode >> 3) & 7;
1348 modrm_rm |= (opcode >> 0) & 7;
1349 reg = decode_register(modrm_rm, regs, 0);
1350 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1351 goto fail;
1352 break;
1354 case 0x30: /* WRMSR */
1355 switch ( regs->ecx )
1357 #ifdef CONFIG_X86_64
1358 case MSR_FS_BASE:
1359 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1360 goto fail;
1361 v->arch.guest_context.fs_base =
1362 ((u64)regs->edx << 32) | regs->eax;
1363 break;
1364 case MSR_GS_BASE:
1365 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1366 goto fail;
1367 v->arch.guest_context.gs_base_kernel =
1368 ((u64)regs->edx << 32) | regs->eax;
1369 break;
1370 case MSR_SHADOW_GS_BASE:
1371 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1372 goto fail;
1373 v->arch.guest_context.gs_base_user =
1374 ((u64)regs->edx << 32) | regs->eax;
1375 break;
1376 #endif
1377 default:
1378 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1379 break;
1381 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1382 (regs->eax != l) || (regs->edx != h) )
1383 DPRINTK("Domain attempted WRMSR %p from "
1384 "%08x:%08x to %08lx:%08lx.\n",
1385 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1386 break;
1388 break;
1390 case 0x32: /* RDMSR */
1391 switch ( regs->ecx )
1393 #ifdef CONFIG_X86_64
1394 case MSR_FS_BASE:
1395 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1396 regs->edx = v->arch.guest_context.fs_base >> 32;
1397 break;
1398 case MSR_GS_BASE:
1399 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1400 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1401 break;
1402 case MSR_SHADOW_GS_BASE:
1403 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1404 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1405 break;
1406 #endif
1407 case MSR_EFER:
1408 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1409 goto fail;
1410 break;
1411 default:
1412 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1414 regs->eax = l;
1415 regs->edx = h;
1416 break;
1418 /* Everyone can read the MSR space. */
1419 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1420 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1421 goto fail;
1422 break;
1424 break;
1426 default:
1427 goto fail;
1430 done:
1431 regs->eip = eip;
1432 return EXCRET_fault_fixed;
1434 fail:
1435 return 0;
1438 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1440 struct vcpu *v = current;
1441 struct trap_bounce *tb = &v->arch.trap_bounce;
1442 struct trap_info *ti;
1443 unsigned long fixup;
1445 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1447 if ( regs->error_code & 1 )
1448 goto hardware_gp;
1450 if ( !guest_mode(regs) )
1451 goto gp_in_kernel;
1453 /*
1454 * Cunning trick to allow arbitrary "INT n" handling.
1456 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1457 * instruction from trapping to the appropriate vector, when that might not
1458 * be expected by Xen or the guest OS. For example, that entry might be for
1459 * a fault handler (unlike traps, faults don't increment EIP), or might
1460 * expect an error code on the stack (which a software trap never
1461 * provides), or might be a hardware interrupt handler that doesn't like
1462 * being called spuriously.
1464 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1465 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1466 * clear to indicate that it's a software fault, not hardware.
1468 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1469 * okay because they can only be triggered by an explicit DPL-checked
1470 * instruction. The DPL specified by the guest OS for these vectors is NOT
1471 * CHECKED!!
1472 */
1473 if ( (regs->error_code & 3) == 2 )
1475 /* This fault must be due to <INT n> instruction. */
1476 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1477 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1479 tb->flags = TBF_EXCEPTION;
1480 regs->eip += 2;
1481 goto finish_propagation;
1485 /* Emulate some simple privileged and I/O instructions. */
1486 if ( (regs->error_code == 0) &&
1487 emulate_privileged_op(regs) )
1488 return 0;
1490 #if defined(__i386__)
1491 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1492 (regs->error_code == 0) &&
1493 gpf_emulate_4gb(regs) )
1494 return 0;
1495 #endif
1497 /* Pass on GPF as is. */
1498 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1499 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1500 tb->error_code = regs->error_code;
1501 finish_propagation:
1502 tb->cs = ti->cs;
1503 tb->eip = ti->address;
1504 if ( TI_GET_IF(ti) )
1505 tb->flags |= TBF_INTERRUPT;
1506 return 0;
1508 gp_in_kernel:
1510 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1512 DPRINTK("GPF (%04x): %p -> %p\n",
1513 regs->error_code, _p(regs->eip), _p(fixup));
1514 regs->eip = fixup;
1515 return 0;
1518 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1520 hardware_gp:
1521 show_execution_state(regs);
1522 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1523 smp_processor_id(), regs->error_code);
1524 return 0;
1527 static void nmi_softirq(void)
1529 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1530 vcpu_kick(dom0->vcpu[0]);
1533 static void nmi_dom0_report(unsigned int reason_idx)
1535 struct domain *d;
1536 struct vcpu *v;
1538 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1539 return;
1541 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1543 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1544 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1547 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1549 switch ( opt_nmi[0] )
1551 case 'd': /* 'dom0' */
1552 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1553 case 'i': /* 'ignore' */
1554 break;
1555 default: /* 'fatal' */
1556 console_force_unlock();
1557 printk("\n\nNMI - MEMORY ERROR\n");
1558 fatal_trap(TRAP_nmi, regs);
1561 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1562 mdelay(1);
1563 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1566 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1568 switch ( opt_nmi[0] )
1570 case 'd': /* 'dom0' */
1571 nmi_dom0_report(_XEN_NMIREASON_io_error);
1572 case 'i': /* 'ignore' */
1573 break;
1574 default: /* 'fatal' */
1575 console_force_unlock();
1576 printk("\n\nNMI - I/O ERROR\n");
1577 fatal_trap(TRAP_nmi, regs);
1580 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1581 mdelay(1);
1582 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1585 static void unknown_nmi_error(unsigned char reason)
1587 switch ( opt_nmi[0] )
1589 case 'd': /* 'dom0' */
1590 nmi_dom0_report(_XEN_NMIREASON_unknown);
1591 case 'i': /* 'ignore' */
1592 break;
1593 default: /* 'fatal' */
1594 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1595 printk("Dazed and confused, but trying to continue\n");
1596 printk("Do you have a strange power saving mode enabled?\n");
1600 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1601 {
1602 return 0;
1603 }
1605 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1607 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1609 unsigned int cpu = smp_processor_id();
1610 unsigned char reason;
1612 ++nmi_count(cpu);
1614 if ( nmi_callback(regs, cpu) )
1615 return;
1617 if ( nmi_watchdog )
1618 nmi_watchdog_tick(regs);
1620 /* Only the BSP gets external NMIs from the system. */
1621 if ( cpu == 0 )
1623 reason = inb(0x61);
1624 if ( reason & 0x80 )
1625 mem_parity_error(regs);
1626 else if ( reason & 0x40 )
1627 io_check_error(regs);
1628 else if ( !nmi_watchdog )
1629 unknown_nmi_error((unsigned char)(reason&0xff));
1633 void set_nmi_callback(nmi_callback_t callback)
1634 {
1635 nmi_callback = callback;
1636 }
1638 void unset_nmi_callback(void)
1639 {
1640 nmi_callback = dummy_nmi_callback;
1641 }
1643 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1645 struct trap_bounce *tb;
1646 struct trap_info *ti;
1648 setup_fpu(current);
1650 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1652 tb = &current->arch.trap_bounce;
1653 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1655 tb->flags = TBF_EXCEPTION;
1656 tb->cs = ti->cs;
1657 tb->eip = ti->address;
1658 if ( TI_GET_IF(ti) )
1659 tb->flags |= TBF_INTERRUPT;
1661 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1664 return EXCRET_fault_fixed;
1667 asmlinkage int do_debug(struct cpu_user_regs *regs)
1669 unsigned long condition;
1670 struct vcpu *v = current;
1671 struct trap_bounce *tb = &v->arch.trap_bounce;
1672 struct trap_info *ti;
1674 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1676 /* Mask out spurious debug traps due to lazy DR7 setting */
1677 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1678 (v->arch.guest_context.debugreg[7] == 0) )
1680 __asm__("mov %0,%%db7" : : "r" (0UL));
1681 goto out;
1684 DEBUGGER_trap_entry(TRAP_debug, regs);
1686 if ( !guest_mode(regs) )
1688 /* Clear TF just for absolute sanity. */
1689 regs->eflags &= ~EF_TF;
1690 /*
1691 * We ignore watchpoints when they trigger within Xen. This may happen
1692 * when a buffer is passed to us which previously had a watchpoint set
1693 * on it. No need to bump EIP; the only faulting trap is an instruction
1694 * breakpoint, which can't happen to us.
1695 */
1696 goto out;
1699 /* Save debug status register where guest OS can peek at it */
1700 v->arch.guest_context.debugreg[6] = condition;
1702 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1703 tb->flags = TBF_EXCEPTION;
1704 tb->cs = ti->cs;
1705 tb->eip = ti->address;
1706 if ( TI_GET_IF(ti) )
1707 tb->flags |= TBF_INTERRUPT;
1709 out:
1710 return EXCRET_not_a_fault;
1713 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1714 {
1715 return EXCRET_not_a_fault;
1716 }
1718 void set_intr_gate(unsigned int n, void *addr)
1719 {
1720 #ifdef __i386__
1721 int i;
1722 /* Keep secondary tables in sync with IRQ updates. */
1723 for ( i = 1; i < NR_CPUS; i++ )
1724 if ( idt_tables[i] != NULL )
1725 _set_gate(&idt_tables[i][n], 14, 0, addr);
1726 #endif
1727 _set_gate(&idt_table[n], 14, 0, addr);
1728 }
1730 void set_system_gate(unsigned int n, void *addr)
1731 {
1732 _set_gate(idt_table+n,14,3,addr);
1733 }
1735 void set_task_gate(unsigned int n, unsigned int sel)
1736 {
1737 idt_table[n].a = sel << 16;
1738 idt_table[n].b = 0x8500;
1739 }
1741 void set_tss_desc(unsigned int n, void *addr)
1742 {
1743 _set_tssldt_desc(
1744 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1745 (unsigned long)addr,
1746 offsetof(struct tss_struct, __cacheline_filler) - 1,
1747 9);
1748 }
1750 void __init trap_init(void)
1752 extern void percpu_traps_init(void);
1754 /*
1755 * Note that interrupt gates are always used, rather than trap gates. We
1756 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1757 * first activation must have the "bad" value(s) for these registers and
1758 * we may lose them if another activation is installed before they are
1759 * saved. The page-fault handler also needs interrupts disabled until %cr2
1760 * has been read and saved on the stack.
1761 */
1762 set_intr_gate(TRAP_divide_error,&divide_error);
1763 set_intr_gate(TRAP_debug,&debug);
1764 set_intr_gate(TRAP_nmi,&nmi);
1765 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1766 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1767 set_intr_gate(TRAP_bounds,&bounds);
1768 set_intr_gate(TRAP_invalid_op,&invalid_op);
1769 set_intr_gate(TRAP_no_device,&device_not_available);
1770 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1771 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1772 set_intr_gate(TRAP_no_segment,&segment_not_present);
1773 set_intr_gate(TRAP_stack_error,&stack_segment);
1774 set_intr_gate(TRAP_gp_fault,&general_protection);
1775 set_intr_gate(TRAP_page_fault,&page_fault);
1776 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1777 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1778 set_intr_gate(TRAP_alignment_check,&alignment_check);
1779 set_intr_gate(TRAP_machine_check,&machine_check);
1780 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1782 percpu_traps_init();
1784 cpu_init();
1786 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1790 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1792 struct trap_info cur;
1793 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1794 long rc = 0;
1796 /* If no table is presented then clear the entire virtual IDT. */
1797 if ( guest_handle_is_null(traps) )
1799 memset(dst, 0, 256 * sizeof(*dst));
1800 init_int80_direct_trap(current);
1801 return 0;
1804 for ( ; ; )
1806 if ( hypercall_preempt_check() )
1808 rc = hypercall_create_continuation(
1809 __HYPERVISOR_set_trap_table, "h", traps);
1810 break;
1813 if ( copy_from_guest(&cur, traps, 1) )
1815 rc = -EFAULT;
1816 break;
1819 if ( cur.address == 0 )
1820 break;
1822 fixup_guest_code_selector(cur.cs);
1824 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1826 if ( cur.vector == 0x80 )
1827 init_int80_direct_trap(current);
1829 guest_handle_add_offset(traps, 1);
1832 return rc;
1833 }
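/*
 * Illustration (sketch): a PV guest registers its virtual IDT by passing a
 * table of trap_info entries terminated by an entry whose address is zero.
 * The HYPERVISOR_set_trap_table() wrapper, the GUEST_KERNEL_CS selector and
 * the *_entry handlers are guest-side assumptions for illustration only.
 */
#if 0
    static struct trap_info trap_table[] = {
        {  0,   0, GUEST_KERNEL_CS, (unsigned long)divide_error_entry },
        { 14,   0, GUEST_KERNEL_CS, (unsigned long)page_fault_entry   },
        { 0x80, 3, GUEST_KERNEL_CS, (unsigned long)int80_entry        }, /* DPL 3 */
        {  0,   0, 0, 0 }                      /* terminator: address == 0 */
    };

    HYPERVISOR_set_trap_table(trap_table);
#endif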
1836 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1838 int i;
1840 switch ( reg )
1842 case 0:
1843 if ( !access_ok(value, sizeof(long)) )
1844 return -EPERM;
1845 if ( p == current )
1846 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1847 break;
1848 case 1:
1849 if ( !access_ok(value, sizeof(long)) )
1850 return -EPERM;
1851 if ( p == current )
1852 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1853 break;
1854 case 2:
1855 if ( !access_ok(value, sizeof(long)) )
1856 return -EPERM;
1857 if ( p == current )
1858 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1859 break;
1860 case 3:
1861 if ( !access_ok(value, sizeof(long)) )
1862 return -EPERM;
1863 if ( p == current )
1864 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1865 break;
1866 case 6:
1867 /*
1868 * DR6: Bits 4-11,16-31 reserved (set to 1).
1869 * Bit 12 reserved (set to 0).
1870 */
1871 value &= 0xffffefff; /* reserved bits => 0 */
1872 value |= 0xffff0ff0; /* reserved bits => 1 */
1873 if ( p == current )
1874 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1875 break;
1876 case 7:
1877 /*
1878 * DR7: Bit 10 reserved (set to 1).
1879 * Bits 11-12,14-15 reserved (set to 0).
1880 * Privileged bits:
1881 * GD (bit 13): must be 0.
1882 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1883 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1884 */
1885 /* DR7 == 0 => debugging disabled for this domain. */
1886 if ( value != 0 )
1888 value &= 0xffff27ff; /* reserved bits => 0 */
1889 value |= 0x00000400; /* reserved bits => 1 */
1890 if ( (value & (1<<13)) != 0 ) return -EPERM;
1891 for ( i = 0; i < 16; i += 2 )
1892 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1894 if ( p == current )
1895 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1896 break;
1897 default:
1898 return -EINVAL;
1901 p->arch.guest_context.debugreg[reg] = value;
1902 return 0;
1903 }
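/*
 * Worked example (sketch): arming a 4-byte write watchpoint in DR0 passes
 * the checks above with, for instance,
 *
 *     set_debugreg(v, 0, watch_addr);     (linear address to watch)
 *     set_debugreg(v, 7, 0x000d0002);     (G0=1, R/W0=01 write, LEN0=11 4-byte)
 *
 * GD (bit 13) is clear and no R/Wn or LENn field holds the forbidden value
 * 10b, so neither call returns -EPERM.
 */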
1905 long do_set_debugreg(int reg, unsigned long value)
1906 {
1907 return set_debugreg(current, reg, value);
1908 }
1910 unsigned long do_get_debugreg(int reg)
1911 {
1912 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1913 return current->arch.guest_context.debugreg[reg];
1914 }
1916 /*
1917 * Local variables:
1918 * mode: C
1919 * c-set-style: "BSD"
1920 * c-basic-offset: 4
1921 * tab-width: 4
1922 * indent-tabs-mode: nil
1923 * End:
1924 */