ia64/xen-unstable

changeset 9043:ee8041b0ab86

Add a compile-time option to enable domain 0 running in ring 0.

In this mode only a single guest kernel is supported.

This mode only works for x86/32 (not x86/64).

Signed-off-by: Ian Campbell <Ian.Campbell@XenSource.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Feb 27 15:52:43 2006 +0100 (2006-02-27)
parents 6060937db0fe
children 911f3ecd975e
files xen/arch/x86/Makefile xen/arch/x86/Rules.mk xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/arch/x86/traps.c xen/arch/x86/x86_32/asm-offsets.c xen/arch/x86/x86_32/entry.S xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_32/supervisor_mode_kernel.S xen/arch/x86/x86_32/traps.c xen/arch/x86/x86_64/mm.c xen/common/dom0_ops.c xen/common/kernel.c xen/include/asm-ia64/config.h xen/include/asm-x86/config.h xen/include/asm-x86/desc.h xen/include/asm-x86/x86_32/asm_defns.h
line diff
     1.1 --- a/xen/arch/x86/Makefile	Mon Feb 27 11:02:00 2006 +0000
     1.2 +++ b/xen/arch/x86/Makefile	Mon Feb 27 15:52:43 2006 +0100
     1.3 @@ -33,6 +33,10 @@ ifeq ($(TARGET_SUBARCH),x86_32)
     1.4   endif
     1.5  endif
     1.6  
     1.7 +ifneq ($(supervisor_mode_kernel),y)
     1.8 +OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
     1.9 +endif
    1.10 +
    1.11  OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
    1.12  OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS))
    1.13  
     2.1 --- a/xen/arch/x86/Rules.mk	Mon Feb 27 11:02:00 2006 +0000
     2.2 +++ b/xen/arch/x86/Rules.mk	Mon Feb 27 15:52:43 2006 +0100
     2.3 @@ -6,6 +6,7 @@
     2.4  # 'make clean' before rebuilding.
     2.5  #
     2.6  pae ?= n
     2.7 +supervisor_mode_kernel ?= n
     2.8  
     2.9  CFLAGS  += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
    2.10  CFLAGS  += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
    2.11 @@ -32,6 +33,9 @@ ifeq ($(pae),y)
    2.12  CFLAGS  += -DCONFIG_X86_PAE=1
    2.13  endif
    2.14  endif
    2.15 +ifeq ($(supervisor_mode_kernel),y)
    2.16 +CFLAGS  += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
    2.17 +endif
    2.18  
    2.19  ifeq ($(TARGET_SUBARCH),x86_64)
    2.20  CFLAGS  += -m64 -mno-red-zone -fpic -fno-reorder-blocks
     3.1 --- a/xen/arch/x86/domain.c	Mon Feb 27 11:02:00 2006 +0000
     3.2 +++ b/xen/arch/x86/domain.c	Mon Feb 27 15:52:43 2006 +0100
     3.3 @@ -351,17 +351,17 @@ int arch_set_info_guest(
     3.4  
     3.5      if ( !(c->flags & VGCF_HVM_GUEST) )
     3.6      {
     3.7 -        fixup_guest_selector(c->user_regs.ss);
     3.8 -        fixup_guest_selector(c->kernel_ss);
     3.9 -        fixup_guest_selector(c->user_regs.cs);
    3.10 +        fixup_guest_stack_selector(c->user_regs.ss);
    3.11 +        fixup_guest_stack_selector(c->kernel_ss);
    3.12 +        fixup_guest_code_selector(c->user_regs.cs);
    3.13  
    3.14  #ifdef __i386__
    3.15 -        fixup_guest_selector(c->event_callback_cs);
    3.16 -        fixup_guest_selector(c->failsafe_callback_cs);
    3.17 +        fixup_guest_code_selector(c->event_callback_cs);
    3.18 +        fixup_guest_code_selector(c->failsafe_callback_cs);
    3.19  #endif
    3.20  
    3.21          for ( i = 0; i < 256; i++ )
    3.22 -            fixup_guest_selector(c->trap_ctxt[i].cs);
    3.23 +            fixup_guest_code_selector(c->trap_ctxt[i].cs);
    3.24      }
    3.25      else if ( !hvm_enabled )
    3.26        return -EINVAL;
    3.27 @@ -847,7 +847,11 @@ unsigned long __hypercall_create_continu
    3.28          regs       = guest_cpu_user_regs();
    3.29  #if defined(__i386__)
    3.30          regs->eax  = op;
    3.31 -        regs->eip -= 2;  /* re-execute 'int 0x82' */
    3.32 +
    3.33 +        if ( supervisor_mode_kernel )
    3.34 +            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
    3.35 +        else
    3.36 +            regs->eip -= 2;   /* re-execute 'int 0x82' */
    3.37  
    3.38          for ( i = 0; i < nr_args; i++ )
    3.39          {
     4.1 --- a/xen/arch/x86/domain_build.c	Mon Feb 27 11:02:00 2006 +0000
     4.2 +++ b/xen/arch/x86/domain_build.c	Mon Feb 27 15:52:43 2006 +0100
     4.3 @@ -793,6 +793,17 @@ int construct_dom0(struct domain *d,
     4.4          update_pagetables(v);
     4.5      }
     4.6  
     4.7 +    if ( supervisor_mode_kernel )
     4.8 +    {
     4.9 +        v->arch.guest_context.kernel_ss &= ~3;
    4.10 +        v->arch.guest_context.user_regs.ss &= ~3;
    4.11 +        v->arch.guest_context.user_regs.es &= ~3;
    4.12 +        v->arch.guest_context.user_regs.ds &= ~3;
    4.13 +        v->arch.guest_context.user_regs.fs &= ~3;
    4.14 +        v->arch.guest_context.user_regs.gs &= ~3;
    4.15 +        printk("Dom0 runs in ring 0 (supervisor mode)\n");
    4.16 +    }
    4.17 +
    4.18      rc = 0;
    4.19  
    4.20      /* DOM0 is permitted full I/O capabilities. */
     5.1 --- a/xen/arch/x86/traps.c	Mon Feb 27 11:02:00 2006 +0000
     5.2 +++ b/xen/arch/x86/traps.c	Mon Feb 27 15:52:43 2006 +0100
     5.3 @@ -1429,7 +1429,7 @@ long do_set_trap_table(struct trap_info 
     5.4          if ( cur.address == 0 )
     5.5              break;
     5.6  
     5.7 -        fixup_guest_selector(cur.cs);
     5.8 +        fixup_guest_code_selector(cur.cs);
     5.9  
    5.10          memcpy(&dst[cur.vector], &cur, sizeof(cur));
    5.11  
     6.1 --- a/xen/arch/x86/x86_32/asm-offsets.c	Mon Feb 27 11:02:00 2006 +0000
     6.2 +++ b/xen/arch/x86/x86_32/asm-offsets.c	Mon Feb 27 15:52:43 2006 +0100
     6.3 @@ -72,6 +72,13 @@ void __dummy__(void)
     6.4      DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     6.5      BLANK();
     6.6  
     6.7 +    OFFSET(TSS_ss0, struct tss_struct, ss0);
     6.8 +    OFFSET(TSS_esp0, struct tss_struct, esp0);
     6.9 +    OFFSET(TSS_ss1, struct tss_struct, ss1);
    6.10 +    OFFSET(TSS_esp1, struct tss_struct, esp1);
    6.11 +    DEFINE(TSS_sizeof, sizeof(struct tss_struct));
    6.12 +    BLANK();
    6.13 +
    6.14      OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
    6.15      OFFSET(VCPU_svm_hsa_pa,  struct vcpu, arch.hvm_svm.host_save_pa);
    6.16      OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
     7.1 --- a/xen/arch/x86/x86_32/entry.S	Mon Feb 27 11:02:00 2006 +0000
     7.2 +++ b/xen/arch/x86/x86_32/entry.S	Mon Feb 27 15:52:43 2006 +0100
     7.3 @@ -77,6 +77,13 @@
     7.4  restore_all_guest:
     7.5          testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
     7.6          jnz  restore_all_vm86
     7.7 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
     7.8 +        testl $2,UREGS_cs(%esp)
     7.9 +        jnz   1f
    7.10 +        call  restore_ring0_guest
    7.11 +        jmp   restore_all_vm86
    7.12 +1:
    7.13 +#endif
    7.14  FLT1:   mov  UREGS_ds(%esp),%ds
    7.15  FLT2:   mov  UREGS_es(%esp),%es
    7.16  FLT3:   mov  UREGS_fs(%esp),%fs
    7.17 @@ -157,6 +164,7 @@ restore_all_xen:
    7.18          ALIGN
    7.19  ENTRY(hypercall)
    7.20          subl $4,%esp
    7.21 +        FIXUP_RING0_GUEST_STACK
    7.22  	SAVE_ALL(b)
    7.23          sti
    7.24          GET_CURRENT(%ebx)
    7.25 @@ -294,6 +302,11 @@ FLT14:  movl %eax,%gs:(%esi)
    7.26          popl %eax
    7.27          shll $16,%eax                    # Bits 16-23: saved_upcall_mask
    7.28          movw UREGS_cs+4(%esp),%ax        # Bits  0-15: CS
    7.29 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    7.30 +        testw $2,%ax
    7.31 +        jnz  FLT15
    7.32 +        and  $~3,%ax                     # RPL 1 -> RPL 0
    7.33 +#endif
    7.34  FLT15:  movl %eax,%gs:4(%esi) 
    7.35          test $0x00FF0000,%eax            # Bits 16-23: saved_upcall_mask
    7.36          setz %ch                         # %ch == !saved_upcall_mask
    7.37 @@ -388,6 +401,7 @@ ENTRY(divide_error)
    7.38  	pushl $TRAP_divide_error<<16
    7.39  	ALIGN
    7.40  error_code:
    7.41 +        FIXUP_RING0_GUEST_STACK
    7.42          SAVE_ALL_NOSEGREGS(a)
    7.43          SET_XEN_SEGMENTS(a)
    7.44          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
    7.45 @@ -505,6 +519,10 @@ ENTRY(spurious_interrupt_bug)
    7.46  	jmp error_code
    7.47  
    7.48  ENTRY(nmi)
    7.49 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    7.50 +        # NMI entry protocol is incompatible with guest kernel in ring 0.
    7.51 +        iret
    7.52 +#else
    7.53          # Save state but do not trash the segment registers!
    7.54          # We may otherwise be unable to reload them or copy them to ring 1. 
    7.55  	pushl %eax
    7.56 @@ -546,6 +564,7 @@ 1:      movl  %ss:APIC_ICR(%eax),%ebx
    7.57          movl  $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
    7.58                  TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
    7.59          jmp   restore_all_xen
    7.60 +#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
    7.61  
    7.62  ENTRY(setup_vm86_frame)
    7.63          # Copies the entire stack frame forwards by 16 bytes.
     8.1 --- a/xen/arch/x86/x86_32/mm.c	Mon Feb 27 11:02:00 2006 +0000
     8.2 +++ b/xen/arch/x86/x86_32/mm.c	Mon Feb 27 15:52:43 2006 +0100
     8.3 @@ -180,6 +180,15 @@ void subarch_init_memory(struct domain *
     8.4              page_set_owner(page, dom_xen);
     8.5          }
     8.6      }
     8.7 +
     8.8 +    if ( supervisor_mode_kernel )
     8.9 +    {
    8.10 +        /* Guest kernel runs in ring 0, not ring 1. */
    8.11 +        struct desc_struct *d;
    8.12 +        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
    8.13 +        d[0].b &= ~_SEGMENT_DPL;
    8.14 +        d[1].b &= ~_SEGMENT_DPL;
    8.15 +    }
    8.16  }
    8.17  
    8.18  long subarch_memory_op(int op, void *arg)
    8.19 @@ -223,7 +232,7 @@ long do_stack_switch(unsigned long ss, u
    8.20      int nr = smp_processor_id();
    8.21      struct tss_struct *t = &init_tss[nr];
    8.22  
    8.23 -    fixup_guest_selector(ss);
    8.24 +    fixup_guest_stack_selector(ss);
    8.25  
    8.26      current->arch.guest_context.kernel_ss = ss;
    8.27      current->arch.guest_context.kernel_sp = esp;
    8.28 @@ -240,6 +249,10 @@ int check_descriptor(struct desc_struct 
    8.29      u32 a = d->a, b = d->b;
    8.30      u16 cs;
    8.31  
    8.32 +    /* Let a ring0 guest kernel set any descriptor it wants to. */
    8.33 +    if ( supervisor_mode_kernel )
    8.34 +        return 1;
    8.35 +
    8.36      /* A not-present descriptor will always fault, so is safe. */
    8.37      if ( !(b & _SEGMENT_P) ) 
    8.38          goto good;
    8.39 @@ -273,7 +286,7 @@ int check_descriptor(struct desc_struct 
    8.40  
    8.41          /* Validate and fix up the target code selector. */
    8.42          cs = a >> 16;
    8.43 -        fixup_guest_selector(cs);
    8.44 +        fixup_guest_code_selector(cs);
    8.45          if ( !guest_gate_selector_okay(cs) )
    8.46              goto bad;
    8.47          a = d->a = (d->a & 0xffffU) | (cs << 16);
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S	Mon Feb 27 15:52:43 2006 +0100
     9.3 @@ -0,0 +1,145 @@
     9.4 +/*
     9.5 + * Handle stack fixup for guest running in RING 0.
     9.6 + *
     9.7 + * Copyright (c) 2006 Ian Campbell
     9.8 + *
     9.9 + * When a guest kernel is allowed to run in RING 0 a hypercall,
    9.10 + * interrupt or exception interrupting the guest kernel will not cause
    9.11 + * a privilege level change and therefore the stack will not be swapped
    9.12 + * to the Xen stack.
    9.13 + *
    9.14 + * To fix this we look for RING 0 activation frames with a stack
    9.15 + * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
    9.16 + * frame) and fix this up by locating the Xen stack via the TSS
    9.17 + * and moving the activation frame to the Xen stack. In the process we
    9.18 + * convert the frame into an inter-privilege frame returning to RING 1
    9.19 + * so that we can catch and reverse the process on exit.
    9.20 + */
    9.21 +
    9.22 +#include <xen/config.h>
    9.23 +#include <asm/asm_defns.h>
    9.24 +#include <public/xen.h>
    9.25 +
    9.26 +        # Upon entry the stack should be the Xen stack and contain:
    9.27 +        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
    9.28 +        # On exit the stack should be %ss:%esp (i.e. the guest stack)
    9.29 +        # and contain:
    9.30 +        #   EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
    9.31 +        ALIGN
    9.32 +ENTRY(restore_ring0_guest)
    9.33 +        # Point %gs:%esi to guest stack.
    9.34 +RRG0:   movw UREGS_ss+4(%esp),%gs
    9.35 +        movl UREGS_esp+4(%esp),%esi
    9.36 +
    9.37 +        # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
    9.38 +        movl $(UREGS_kernel_sizeof>>2)+1,%ecx
    9.39 +
    9.40 +1:      subl $4,%esi
    9.41 +        movl -4(%esp,%ecx,4),%eax
    9.42 +RRG1:   movl %eax,%gs:(%esi)
    9.43 +        loop 1b
    9.44 +
    9.45 +RRG2:   andl $~3,%gs:UREGS_cs+4(%esi)
    9.46 +
    9.47 +        movl %gs,%eax
    9.48 +
    9.49 +        # We need to do this because these registers are not present
    9.50 +        # on the guest stack so they cannot be restored by the code in
    9.51 +        # restore_all_guest.
    9.52 +RRG3:   mov  UREGS_ds+4(%esp),%ds
    9.53 +RRG4:   mov  UREGS_es+4(%esp),%es
    9.54 +RRG5:   mov  UREGS_fs+4(%esp),%fs
    9.55 +RRG6:   mov  UREGS_gs+4(%esp),%gs
    9.56 +
    9.57 +RRG7:   movl %eax,%ss
    9.58 +        movl %esi,%esp
    9.59 +
    9.60 +        ret
    9.61 +.section __ex_table,"a"
    9.62 +        .long RRG0,domain_crash_synchronous
    9.63 +        .long RRG1,domain_crash_synchronous
    9.64 +        .long RRG2,domain_crash_synchronous
    9.65 +        .long RRG3,domain_crash_synchronous
    9.66 +        .long RRG4,domain_crash_synchronous
    9.67 +        .long RRG5,domain_crash_synchronous
    9.68 +        .long RRG6,domain_crash_synchronous
    9.69 +        .long RRG7,domain_crash_synchronous
    9.70 +.previous
    9.71 +
    9.72 +        # Upon entry the stack should be a guest stack and contain:
    9.73 +        #   EFLAGS, %cs, %eip, ERROR, RETURN
    9.74 +        # On exit the stack should be the Xen stack and contain:
    9.75 +        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
    9.76 +        ALIGN
    9.77 +ENTRY(fixup_ring0_guest_stack)
    9.78 +        pushl %eax
    9.79 +        pushl %ecx
    9.80 +        pushl %ds
    9.81 +        pushl %gs
    9.82 +        pushl %esi
    9.83 +
    9.84 +        movw  $__HYPERVISOR_DS,%ax
    9.85 +        movw  %ax,%ds
    9.86 +
    9.87 +        # Point %gs:%esi to guest stack frame.
    9.88 +        movw  %ss,%ax
    9.89 +        movw  %ax,%gs
    9.90 +        movl  %esp,%esi
    9.91 +        # Account for entries on the guest stack:
    9.92 +        # * Pushed by normal exception/interrupt/hypercall mechanisms
    9.93 +        #   * EFLAGS, %cs, %eip, ERROR == 4 words.
    9.94 +        # * Pushed by the fixup routine
    9.95 +        #   * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
    9.96 +        addl $((6+4)*4),%esi
    9.97 +
    9.98 +        # %gs:%esi now points to the guest stack before the
     9.99 +        # interrupt/exception occurred.
   9.100 +
   9.101 +        /*
   9.102 +         * Reverse the __TSS macro, giving us the CPU number.
   9.103 +         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
   9.104 +         */
   9.105 +        str   %ecx
   9.106 +        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
   9.107 +        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
   9.108 +        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
   9.109 +        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
   9.110 +
   9.111 +        # Load Xen stack from TSS.
   9.112 +        movw  TSS_ss0(%ecx),%ax
   9.113 +TRP1:   movw  %ax,%ss
   9.114 +        movl  TSS_esp0(%ecx),%esp
   9.115 +
   9.116 +        pushl %gs
   9.117 +        pushl %esi
   9.118 +
   9.119 +        # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
   9.120 +        # from guest stack to Xen stack.
   9.121 +        movl  $10,%ecx
   9.122 +1:      subl  $4,%esp
   9.123 +        subl  $4,%esi
   9.124 +TRP2:   movl  %gs:(%esi),%eax
   9.125 +        movl  %eax,(%esp)
   9.126 +        loop  1b
   9.127 +
   9.128 +        # CS = CS|1 to simulate RING1 stack frame.
   9.129 +        orl   $1,32(%esp)
   9.130 +
   9.131 +        popl  %esi
   9.132 +        popl  %gs
   9.133 +        popl  %ds
   9.134 +        popl  %ecx
   9.135 +        popl  %eax
   9.136 +        ret
   9.137 +.section __ex_table,"a"
   9.138 +        .long TRP1,domain_crash_synchronous
   9.139 +        .long TRP2,domain_crash_synchronous
   9.140 +.previous
   9.141 +
   9.142 +domain_crash_synchronous_string:
   9.143 +        .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
   9.144 +
   9.145 +domain_crash_synchronous:
   9.146 +        pushl $domain_crash_synchronous_string
   9.147 +        call  printf
   9.148 +        jmp   __domain_crash_synchronous
    10.1 --- a/xen/arch/x86/x86_32/traps.c	Mon Feb 27 11:02:00 2006 +0000
    10.2 +++ b/xen/arch/x86/x86_32/traps.c	Mon Feb 27 15:52:43 2006 +0100
    10.3 @@ -256,8 +256,14 @@ void init_int80_direct_trap(struct vcpu 
    10.4       * We can't virtualise interrupt gates, as there's no way to get
    10.5       * the CPU to automatically clear the events_mask variable. Also we
    10.6       * must ensure that the CS is safe to poke into an interrupt gate.
    10.7 +     *
    10.8 +     * When running with supervisor_mode_kernel enabled a direct trap
    10.9 +     * to the guest OS cannot be used because the INT instruction will
   10.10 +     * switch to the Xen stack and we need to swap back to the guest
   10.11 +     * kernel stack before passing control to the system call entry point.
   10.12       */
   10.13 -    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
   10.14 +    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
   10.15 +         supervisor_mode_kernel )
   10.16      {
   10.17          v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
   10.18          return;
   10.19 @@ -278,8 +284,8 @@ long do_set_callbacks(unsigned long even
   10.20  {
   10.21      struct vcpu *d = current;
   10.22  
   10.23 -    fixup_guest_selector(event_selector);
   10.24 -    fixup_guest_selector(failsafe_selector);
   10.25 +    fixup_guest_code_selector(event_selector);
   10.26 +    fixup_guest_code_selector(failsafe_selector);
   10.27  
   10.28      d->arch.guest_context.event_callback_cs     = event_selector;
   10.29      d->arch.guest_context.event_callback_eip    = event_address;
   10.30 @@ -289,12 +295,51 @@ long do_set_callbacks(unsigned long even
   10.31      return 0;
   10.32  }
   10.33  
   10.34 -void hypercall_page_initialise(void *hypercall_page)
   10.35 +static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
   10.36 +{
   10.37 +    extern asmlinkage int hypercall(void);
   10.38 +    char *p;
   10.39 +    int i;
   10.40 +
   10.41 +    /* Fill in all the transfer points with template machine code. */
   10.42 +
   10.43 +    for ( i = 0; i < NR_hypercalls; i++ )
   10.44 +    {
   10.45 +        p = (char *)(hypercall_page + (i * 32));
   10.46 +
   10.47 +        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
   10.48 +        *(u8  *)(p+ 1) = 0xfa;      /* cli */
   10.49 +        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
   10.50 +        *(u32 *)(p+ 3) = i;
   10.51 +        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
   10.52 +        *(u32 *)(p+ 8) = (u32)&hypercall;
   10.53 +        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
   10.54 +        *(u8  *)(p+14) = 0xc3;      /* ret */
   10.55 +    }
   10.56 +
   10.57 +    /*
   10.58 +     * HYPERVISOR_iret is special because it doesn't return and expects a
   10.59 +     * special stack frame. Guests jump at this transfer point instead of
   10.60 +     * calling it.
   10.61 +     */
   10.62 +    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
   10.63 +    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
   10.64 +    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
   10.65 +    *(u8  *)(p+ 2) = 0xfa;      /* cli */
   10.66 +    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
   10.67 +    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
   10.68 +    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
   10.69 +    *(u32 *)(p+ 9) = (u32)&hypercall;
   10.70 +    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
   10.71 +}
   10.72 +
   10.73 +static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
   10.74  {
   10.75      char *p;
   10.76      int i;
   10.77  
   10.78      /* Fill in all the transfer points with template machine code. */
   10.79 +
   10.80      for ( i = 0; i < (PAGE_SIZE / 32); i++ )
   10.81      {
   10.82          p = (char *)(hypercall_page + (i * 32));
   10.83 @@ -316,6 +361,14 @@ void hypercall_page_initialise(void *hyp
   10.84      *(u16 *)(p+ 6) = 0x82cd;  /* int  $0x82 */
   10.85  }
   10.86  
   10.87 +void hypercall_page_initialise(void *hypercall_page)
   10.88 +{
   10.89 +    if ( supervisor_mode_kernel )
   10.90 +        hypercall_page_initialise_ring0_kernel(hypercall_page);
   10.91 +    else
   10.92 +        hypercall_page_initialise_ring1_kernel(hypercall_page);
   10.93 +}
   10.94 +
   10.95  /*
   10.96   * Local variables:
   10.97   * mode: C
    11.1 --- a/xen/arch/x86/x86_64/mm.c	Mon Feb 27 11:02:00 2006 +0000
    11.2 +++ b/xen/arch/x86/x86_64/mm.c	Mon Feb 27 15:52:43 2006 +0100
    11.3 @@ -228,7 +228,7 @@ long subarch_memory_op(int op, void *arg
    11.4  
    11.5  long do_stack_switch(unsigned long ss, unsigned long esp)
    11.6  {
    11.7 -    fixup_guest_selector(ss);
    11.8 +    fixup_guest_stack_selector(ss);
    11.9      current->arch.guest_context.kernel_ss = ss;
   11.10      current->arch.guest_context.kernel_sp = esp;
   11.11      return 0;
   11.12 @@ -315,7 +315,7 @@ int check_descriptor(struct desc_struct 
   11.13  
   11.14      /* Validate and fix up the target code selector. */
   11.15      cs = a >> 16;
   11.16 -    fixup_guest_selector(cs);
   11.17 +    fixup_guest_code_selector(cs);
   11.18      if ( !guest_gate_selector_okay(cs) )
   11.19          goto bad;
   11.20      a = d->a = (d->a & 0xffffU) | (cs << 16);
    12.1 --- a/xen/common/dom0_ops.c	Mon Feb 27 11:02:00 2006 +0000
    12.2 +++ b/xen/common/dom0_ops.c	Mon Feb 27 15:52:43 2006 +0100
    12.3 @@ -170,6 +170,13 @@ long do_dom0_op(struct dom0_op *u_dom0_o
    12.4          cpumask_t      cpu_exclude_map;
    12.5          static domid_t rover = 0;
    12.6  
    12.7 +        /*
    12.8 +         * Running the domain 0 kernel in ring 0 is not compatible
    12.9 +         * with multiple guests.
   12.10 +         */
   12.11 +        if ( supervisor_mode_kernel )
   12.12 +            return -EINVAL;
   12.13 +
   12.14          dom = op->u.createdomain.domain;
   12.15          if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
   12.16          {
    13.1 --- a/xen/common/kernel.c	Mon Feb 27 11:02:00 2006 +0000
    13.2 +++ b/xen/common/kernel.c	Mon Feb 27 15:52:43 2006 +0100
    13.3 @@ -195,6 +195,8 @@ long do_xen_version(int cmd, void *arg)
    13.4                      (1U << XENFEAT_writable_page_tables) |
    13.5                      (1U << XENFEAT_auto_translated_physmap) |
    13.6                      (1U << XENFEAT_pae_pgdir_above_4gb);
    13.7 +            if ( supervisor_mode_kernel )
    13.8 +                fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
    13.9              break;
   13.10          default:
   13.11              return -EINVAL;
    14.1 --- a/xen/include/asm-ia64/config.h	Mon Feb 27 11:02:00 2006 +0000
    14.2 +++ b/xen/include/asm-ia64/config.h	Mon Feb 27 15:52:43 2006 +0100
    14.3 @@ -40,6 +40,8 @@
    14.4  //leave SMP for a later time
    14.5  //#undef CONFIG_SMP
    14.6  
    14.7 +#define supervisor_mode_kernel (0)
    14.8 +
    14.9  #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
   14.10  
   14.11  #ifndef __ASSEMBLY__
    15.1 --- a/xen/include/asm-x86/config.h	Mon Feb 27 11:02:00 2006 +0000
    15.2 +++ b/xen/include/asm-x86/config.h	Mon Feb 27 15:52:43 2006 +0100
    15.3 @@ -37,6 +37,12 @@
    15.4  
    15.5  #define NR_CPUS 32
    15.6  
    15.7 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    15.8 +# define supervisor_mode_kernel (1)
    15.9 +#else
   15.10 +# define supervisor_mode_kernel (0)
   15.11 +#endif
   15.12 +
   15.13  /* Linkage for x86 */
   15.14  #define __ALIGN .align 16,0x90
   15.15  #define __ALIGN_STR ".align 16,0x90"
    16.1 --- a/xen/include/asm-x86/desc.h	Mon Feb 27 11:02:00 2006 +0000
    16.2 +++ b/xen/include/asm-x86/desc.h	Mon Feb 27 15:52:43 2006 +0100
    16.3 @@ -27,10 +27,23 @@
    16.4  #endif
    16.5  
    16.6  /* Fix up the RPL of a guest segment selector. */
    16.7 -#define fixup_guest_selector(sel)                               \
    16.8 +#define __fixup_guest_selector(sel)                             \
    16.9      ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) :        \
   16.10       (((sel) & ~3) | GUEST_KERNEL_RPL))
   16.11  
   16.12 +/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
   16.13 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
   16.14 +#define fixup_guest_stack_selector(ss) ((void)0)
   16.15 +#else
   16.16 +#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
   16.17 +#endif
   16.18 +
   16.19 +/*
   16.20 + * Code selectors are always fixed up. It allows the Xen exit stub to detect
   16.21 + * return to guest context, even when the guest kernel runs in ring 0.
   16.22 + */
   16.23 +#define fixup_guest_code_selector(cs)  __fixup_guest_selector(cs)
   16.24 +
   16.25  /*
   16.26   * We need this function because enforcing the correct guest kernel RPL is
    16.27   * insufficient if the selector is poked into an interrupt, trap or call gate.
    17.1 --- a/xen/include/asm-x86/x86_32/asm_defns.h	Mon Feb 27 11:02:00 2006 +0000
    17.2 +++ b/xen/include/asm-x86/x86_32/asm_defns.h	Mon Feb 27 15:52:43 2006 +0100
    17.3 @@ -48,11 +48,26 @@
    17.4  
    17.5  #ifdef PERF_COUNTERS
    17.6  #define PERFC_INCR(_name,_idx)                          \
    17.7 -    lock incl perfcounters+_name(,_idx,4)
    17.8 +        lock incl perfcounters+_name(,_idx,4)
    17.9  #else
   17.10  #define PERFC_INCR(_name,_idx)
   17.11  #endif
   17.12  
   17.13 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
   17.14 +#define FIXUP_RING0_GUEST_STACK                         \
   17.15 +        testl $2,8(%esp);                               \
   17.16 +        jnz 1f; /* rings 2 & 3 permitted */             \
   17.17 +        testl $1,8(%esp);                               \
   17.18 +        jz 2f;                                          \
   17.19 +        ud2; /* ring 1 should not be used */            \
   17.20 +        2:cmpl $(__HYPERVISOR_VIRT_START),%esp;         \
   17.21 +        jge 1f;                                         \
   17.22 +        call fixup_ring0_guest_stack;                   \
   17.23 +        1:
   17.24 +#else
   17.25 +#define FIXUP_RING0_GUEST_STACK
   17.26 +#endif
   17.27 +
   17.28  #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
   17.29  #define XBUILD_SMP_INTERRUPT(x,v)               \
   17.30  asmlinkage void x(void);                        \
   17.31 @@ -61,6 +76,7 @@ asmlinkage void x(void);                
   17.32      ".globl " STR(x) "\n\t"                     \
   17.33      STR(x) ":\n\t"                              \
   17.34      "pushl $"#v"<<16\n\t"                       \
   17.35 +    STR(FIXUP_RING0_GUEST_STACK)                \
   17.36      STR(SAVE_ALL(a))                            \
   17.37      "movl %esp,%eax\n\t"                        \
   17.38      "pushl %eax\n\t"                            \
   17.39 @@ -72,6 +88,7 @@ asmlinkage void x(void);                
   17.40  __asm__(                                        \
   17.41      "\n" __ALIGN_STR"\n"                        \
   17.42      "common_interrupt:\n\t"                     \
   17.43 +    STR(FIXUP_RING0_GUEST_STACK)                \
   17.44      STR(SAVE_ALL(a))                            \
   17.45      "movl %esp,%eax\n\t"                        \
   17.46      "pushl %eax\n\t"                            \