direct-io.hg

changeset 364:942eb9bcae13

bitkeeper revision 1.170 (3e9c936fXyHEI0NKOWQkP9tHN4sbqw)

Many files:
Finished virtualisation of x86 LDT. Xenolinux now exports this to applications (eg. for use by linuxthreads).
author kaf24@scramble.cl.cam.ac.uk
date Tue Apr 15 23:19:11 2003 +0000 (2003-04-15)
parents 2eb189eacf01
children bee5c5831c41
files xen/TODO xen/arch/i386/boot/boot.S xen/arch/i386/entry.S xen/arch/i386/mm.c xen/arch/i386/process.c xen/arch/i386/traps.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xen/common/schedule.c xen/include/asm-i386/irq.h xen/include/asm-i386/ptrace.h xen/include/asm-i386/system.h xen/include/xeno/mm.h xen/include/xeno/sched.h xen/net/dev.c xenolinux-2.4.21-pre4-sparse/arch/xeno/kernel/entry.S
line diff
     1.1 --- a/xen/TODO	Tue Apr 15 17:09:35 2003 +0000
     1.2 +++ b/xen/TODO	Tue Apr 15 23:19:11 2003 +0000
     1.3 @@ -7,31 +7,7 @@ longer-term goals.
     1.4   -- Keir (16/3/03)
     1.5  
     1.6  
     1.7 -1. ASSIGNING DOMAINS TO PROCESSORS
     1.8 -----------------------------------
     1.9 -More intelligent assignment of domains to processors. In
    1.10 -particular, we don't play well with hyperthreading: we will assign
    1.11 -domains to virtual processors on the same package, rather then
    1.12 -spreading them across processor packages.
    1.13 -
    1.14 -What we need to do is port code from Linux which stores information on
    1.15 -relationships between processors in the system (eg. which ones are
    1.16 -siblings in the same package). We then use this to balance domains
    1.17 -across packages, and across virtual processors within a package.
    1.18 -
    1.19 -2. PROPER DESTRUCTION OF DOMAINS
    1.20 ---------------------------------
    1.21 -Currently we do not free resources when destroying a domain. This is
    1.22 -because they may be tied up in subsystems, and there is no way of
    1.23 -pulling them back in a safe manner.
    1.24 -
    1.25 -The fix is probably to reference count resources and automatically
    1.26 -free them when the count reaches zero. We may get away with one count
    1.27 -per domain (for all its resources). When this reaches zero we know it
    1.28 -is safe to free everything: block-device rings, network rings, and all
    1.29 -the rest.
    1.30 -
    1.31 -3. FIX HANDLING OF NETWORK RINGS
    1.32 +1. FIX HANDLING OF NETWORK RINGS
    1.33  --------------------------------
    1.34  Handling of the transmit rings is currently very broken (for example,
    1.35  sending an inter-domain packet will wedge the hypervisor). This is
    1.36 @@ -44,7 +20,39 @@ order to requests, just as we already do
    1.37  rings. We'll need to add an opaque identifier to ring entries,
    1.38  allowing matching of requests and responses, but that's about it.
    1.39  
    1.40 -4. NETWORK CHECKSUM OFFLOAD 
    1.41 +2. ACCURATE TIMERS AND WALL-CLOCK TIME
    1.42 +--------------------------------------
    1.43 +Currently our long-term timebase free runs on CPU0, with no external
    1.44 +calibration. We should run ntpd on domain 0 and allow this to warp
    1.45 +Xen's timebase. Once this is done, we can have a timebase per CPU and
    1.46 +not worry about relative drift (since they'll all get sync'ed
    1.47 +periodically by ntp).
    1.48 +
    1.49 +3. ASSIGNING DOMAINS TO PROCESSORS
    1.50 +----------------------------------
    1.51 +More intelligent assignment of domains to processors. In
    1.52 +particular, we don't play well with hyperthreading: we will assign
    1.53 +domains to virtual processors on the same package, rather then
    1.54 +spreading them across processor packages.
    1.55 +
    1.56 +What we need to do is port code from Linux which stores information on
    1.57 +relationships between processors in the system (eg. which ones are
    1.58 +siblings in the same package). We then use this to balance domains
    1.59 +across packages, and across virtual processors within a package.
    1.60 +
    1.61 +4. PROPER DESTRUCTION OF DOMAINS
    1.62 +--------------------------------
    1.63 +Currently we do not free resources when destroying a domain. This is
    1.64 +because they may be tied up in subsystems, and there is no way of
    1.65 +pulling them back in a safe manner.
    1.66 +
    1.67 +The fix is probably to reference count resources and automatically
    1.68 +free them when the count reaches zero. We may get away with one count
    1.69 +per domain (for all its resources). When this reaches zero we know it
    1.70 +is safe to free everything: block-device rings, network rings, and all
    1.71 +the rest.
    1.72 +
    1.73 +5. NETWORK CHECKSUM OFFLOAD 
    1.74  --------------------------- 
    1.75  All the NICs that we support can checksum packets on behalf of guest
    1.76  OSes. We need to add appropriate flags to and from each domain to
    1.77 @@ -52,17 +60,6 @@ indicate, on transmit, which packets nee
    1.78  receive, which packets have been checked out as okay. We can steal
    1.79  Linux's interface, which is entirely sane given NIC limitations.
    1.80  
    1.81 -5. GDT AND LDT VIRTUALISATION 
    1.82 ------------------------------ 
    1.83 -We do not allow modification of the GDT, or any use of the LDT. This
    1.84 -is necessary for support of unmodified applications (eg. Linux uses
    1.85 -LDT in threaded applications, while Windows needs to update GDT
    1.86 -entries).
    1.87 -
    1.88 -I have some text on how to do this:
    1.89 -/usr/groups/xeno/discussion-docs/memory_management/segment_tables.txt
    1.90 -It's already half implemented, but the rest is still to do.
    1.91 -
    1.92  6. DOMAIN 0 MANAGEMENT DAEMON
    1.93  -----------------------------
    1.94  A better control daemon is required for domain 0, which keeps proper
    1.95 @@ -70,15 +67,7 @@ track of machine resources and can make 
    1.96  may require support in Xen; for example, notifications (eg. DOMn is
    1.97  killed), and requests (eg. can DOMn allocate x frames of memory?).
    1.98  
    1.99 -7. ACCURATE TIMERS AND WALL-CLOCK TIME
   1.100 ---------------------------------------
   1.101 -Currently our long-term timebase free runs on CPU0, with no external
   1.102 -calibration. We should run ntpd on domain 0 and allow this to warp
   1.103 -Xen's timebase. Once this is done, we can have a timebase per CPU and
   1.104 -not worry about relative drift (since they'll all get sync'ed
   1.105 -periodically by ntp).
   1.106 -
   1.107 -8. MODULE SUPPORT FOR XEN
   1.108 +7. MODULE SUPPORT FOR XEN
   1.109  -------------------------
   1.110  Network and blkdev drivers are bloating Xen. At some point we want to
   1.111  build drivers as modules, stick them in a cheesy ramfs, then relocate
   1.112 @@ -90,7 +79,7 @@ which drivers to load.
   1.113  Most of the hard stuff (relocating and the like) is done for us by
   1.114  Linux's module system.
   1.115  
   1.116 -9. NEW DESIGN FEATURES
   1.117 +8. NEW DESIGN FEATURES
   1.118  ----------------------
   1.119  This includes the last-chance page cache, and the unified buffer cache.
   1.120  
   1.121 @@ -99,35 +88,6 @@ This includes the last-chance page cache
   1.122  Graveyard
   1.123  *********
   1.124  
   1.125 -Following is some description how some of the above might be
   1.126 -implemented. Some of it is superceded and/or out of date, so follow
   1.127 -with caution.
   1.128 -
   1.129 -Segment descriptor tables
   1.130 --------------------------
   1.131 -We want to allow guest OSes to specify GDT and LDT tables using their
   1.132 -own pages of memory (just like with page tables). So allow the following:
   1.133 - * new_table_entry(ptr, val)
   1.134 -   [Allows insertion of a code, data, or LDT descriptor into given
   1.135 -    location. Can simply be checked then poked, with no need to look at
   1.136 -    page type.]
   1.137 - * new_GDT() -- relevent virtual pages are resolved to frames. Either
   1.138 -    (i) page not present; or (ii) page is only mapped read-only and checks
   1.139 -    out okay (then marked as special page). Old table is resolved first,
   1.140 -    and the pages are unmarked (no longer special type).
   1.141 - * new_LDT() -- same as for new_GDT(), with same special page type.
   1.142 -
   1.143 -Page table updates must be hooked, so we look for updates to virtual page
   1.144 -addresses in the GDT/LDT range. If map to not present, then old physpage
   1.145 -has type_count decremented. If map to present, ensure read-only, check the
   1.146 -page, and set special type.
   1.147 -
   1.148 -Merge set_{LDT,GDT} into update_baseptr, by passing four args:
   1.149 - update_baseptrs(mask, ptab, gdttab, ldttab);
   1.150 -Update of ptab requires update of gtab (or set to internal default).
   1.151 -Update of gtab requires update of ltab (or set to internal default).
   1.152 -
   1.153 -
   1.154  The hypervisor page cache
   1.155  -------------------------
   1.156  This will allow guest OSes to make use of spare pages in the system, but
     2.1 --- a/xen/arch/i386/boot/boot.S	Tue Apr 15 17:09:35 2003 +0000
     2.2 +++ b/xen/arch/i386/boot/boot.S	Tue Apr 15 23:19:11 2003 +0000
     2.3 @@ -87,43 +87,30 @@ continue_boot_cpu:
     2.4          xor     %eax,%eax
     2.5          rep     stosb
     2.6  
     2.7 -        /* Copy all modules (dom0 + initrd if presetn) to safety, above 48MB */
     2.8 +        /* Copy all modules (dom0 + initrd if present) out of the Xen heap */
     2.9          mov     (%esp),%eax
    2.10          cmp     $0x2BADB002,%eax
    2.11 -        jne     2f                           /* skip if magic no good */
    2.12 -
    2.13 -        sub     $__PAGE_OFFSET,%ebx	     /* turn back into a physaddr */
    2.14 -
    2.15 -	mov     0x14(%ebx),%edi              /* mbi->mods_count */
    2.16 -        dec     %edi                         /* count-- */
    2.17 -
    2.18 -	jb	2f			     /* if num modules was zero !!! */
    2.19 -
    2.20 +        jne     skip_dom0_copy
    2.21 +        sub     $__PAGE_OFFSET,%ebx          /* turn back into a phys addr */
    2.22 +        mov     0x14(%ebx),%edi              /* mb->mods_count */
    2.23 +        dec     %edi                         /* mbi->mods_count-- */
    2.24 +        jb      skip_dom0_copy               /* skip if no modules */
    2.25          mov     0x18(%ebx),%eax              /* mbi->mods_addr */
    2.26 -
    2.27 -	mov     (%eax),%ebx                  /* mod[0]->mod_start */
    2.28 -
    2.29 -        shl     $4,%edi                      /* count*16 */
    2.30 +        mov     (%eax),%ebx                  /* %ebx = mod[0]->mod_start */
    2.31 +        shl     $4,%edi                    
    2.32          add     %edi,%eax
    2.33 -        
    2.34 -        mov     0x4(%eax),%eax               /* mod[mod_count-1]->end */
    2.35 -
    2.36 +        mov     0x4(%eax),%eax               /* %eax = mod[mod_count-1]->end */
    2.37          mov     %eax,%ecx
    2.38 -        sub     %ebx,%ecx		     /* length in byte */
    2.39 -
    2.40 +        sub     %ebx,%ecx                    /* %ecx = byte len of all mods */
    2.41          mov     $(MAX_DIRECTMAP_ADDRESS), %edi
    2.42 -        add     %ecx, %edi		     /* src + length */
    2.43 -        
    2.44 -        shr     $2,%ecx                      /* ecx is length/4 */
    2.45 -
    2.46 -1:
    2.47 -        sub     $4,%eax                      /* eax = src, edi = dst */
    2.48 +        add     %ecx, %edi                   /* %edi = src + length */        
    2.49 +        shr     $2,%ecx                      /* %ecx = length/4 */
    2.50 +1:      sub     $4,%eax                      /* %eax = src, %edi = dst */
    2.51          sub     $4,%edi
    2.52          mov     (%eax),%ebx
    2.53          mov     %ebx,(%edi)
    2.54          loop 1b
    2.55 -
    2.56 -2:              
    2.57 +skip_dom0_copy:              
    2.58  
    2.59          /* Initialize low and high mappings of all memory with 4MB pages */
    2.60          mov     $idle0_pg_table-__PAGE_OFFSET,%edi
     3.1 --- a/xen/arch/i386/entry.S	Tue Apr 15 17:09:35 2003 +0000
     3.2 +++ b/xen/arch/i386/entry.S	Tue Apr 15 23:19:11 2003 +0000
     3.3 @@ -19,12 +19,14 @@
     3.4   *	18(%esp) - %eax
     3.5   *	1C(%esp) - %ds
     3.6   *	20(%esp) - %es
     3.7 - *	24(%esp) - orig_eax
     3.8 - *	28(%esp) - %eip
     3.9 - *	2C(%esp) - %cs
    3.10 - *	30(%esp) - %eflags
    3.11 - *	34(%esp) - %oldesp
    3.12 - *	38(%esp) - %oldss
    3.13 + *	24(%esp) - %fs
    3.14 + *	28(%esp) - %gs
    3.15 + *	2C(%esp) - orig_eax
    3.16 + *	30(%esp) - %eip
    3.17 + *	34(%esp) - %cs
    3.18 + *	38(%esp) - %eflags
    3.19 + *	3C(%esp) - %oldesp
    3.20 + *	40(%esp) - %oldss
    3.21   *
    3.22   * "current" is in register %ebx during any slow entries.
    3.23   */
    3.24 @@ -56,13 +58,14 @@
    3.25   * of the frame does an inter-privilege interrupt-return.
    3.26   * 
    3.27   * Note that the "failsafe callback" uses a special stackframe:
    3.28 - * { return_DS, return_ES, return_EIP, return_CS, return_EFLAGS, ... }
    3.29 - * That is, original values for DS/ES are placed on stack rather than
    3.30 - * in DS/ES themselves. Why? It saves us loading them, only to have them
    3.31 + * { return_DS, return_ES, return_FS, return_GS, return_EIP,
    3.32 + *   return_CS, return_EFLAGS[, return_ESP, return_SS] }
    3.33 + * That is, original values for DS/ES/FS/GS are placed on stack rather than
    3.34 + * in DS/ES/FS/GS themselves. Why? It saves us loading them, only to have them
    3.35   * saved/restored in guest OS. Furthermore, if we load them we may cause
    3.36   * a fault if they are invalid, which is a hassle to deal with. We avoid
    3.37   * that problem if we don't load them :-) This property allows us to use
    3.38 - * the failsafe callback as a fallback: if we ever fault on loading DS/ES
    3.39 + * the failsafe callback as a fallback: if we ever fault on loading DS/ES/FS/GS
    3.40   * on return to ring != 0, we can simply package it up as a return via
    3.41   * the failsafe callback, and let the guest OS sort it out (perhaps by
    3.42   * killing an application process). Note that we also do this for any
    3.43 @@ -90,12 +93,14 @@ EBP		= 0x14
    3.44  EAX		= 0x18
    3.45  DS		= 0x1C
    3.46  ES		= 0x20
    3.47 -ORIG_EAX	= 0x24
    3.48 -EIP		= 0x28
    3.49 -CS		= 0x2C
    3.50 -EFLAGS		= 0x30
    3.51 -OLDESP		= 0x34
    3.52 -OLDSS		= 0x38
    3.53 +FS              = 0x24
    3.54 +GS              = 0x28
    3.55 +ORIG_EAX	= 0x2C
    3.56 +EIP		= 0x30
    3.57 +CS		= 0x34
    3.58 +EFLAGS		= 0x38
    3.59 +OLDESP		= 0x3C
    3.60 +OLDSS		= 0x40
    3.61  
    3.62  /* Offsets in task_struct */
    3.63  PROCESSOR       =  0
    3.64 @@ -113,14 +118,14 @@ EVENTS          =  0
    3.65  EVENTS_ENABLE   =  4
    3.66  
    3.67  /* Offsets in guest_trap_bounce */
    3.68 -GTB_ERROR_CODE  =  0
    3.69 -GTB_CR2         =  4
    3.70 -GTB_FLAGS       =  8
    3.71 -GTB_CS          = 10
    3.72 -GTB_EIP         = 12
    3.73 -GTBF_TRAP       =  1
    3.74 -GTBF_TRAP_NOCODE = 2
    3.75 -GTBF_TRAP_CR2   = 4
    3.76 +GTB_ERROR_CODE   =  0
    3.77 +GTB_CR2          =  4
    3.78 +GTB_FLAGS        =  8
    3.79 +GTB_CS           = 10
    3.80 +GTB_EIP          = 12
    3.81 +GTBF_TRAP        =  1
    3.82 +GTBF_TRAP_NOCODE =  2
    3.83 +GTBF_TRAP_CR2    =  4
    3.84                          
    3.85  CF_MASK		= 0x00000001
    3.86  IF_MASK		= 0x00000200
    3.87 @@ -128,6 +133,8 @@ NT_MASK		= 0x00004000
    3.88  
    3.89  #define SAVE_ALL \
    3.90  	cld; \
    3.91 +	pushl %gs; \
    3.92 +	pushl %fs; \
    3.93  	pushl %es; \
    3.94  	pushl %ds; \
    3.95  	pushl %eax; \
    3.96 @@ -139,7 +146,8 @@ NT_MASK		= 0x00004000
    3.97  	pushl %ebx; \
    3.98  	movl $(__HYPERVISOR_DS),%edx; \
    3.99  	movl %edx,%ds; \
   3.100 -	movl %edx,%es;
   3.101 +	movl %edx,%es; \
   3.102 +        sti; 
   3.103  
   3.104  #define RESTORE_ALL	\
   3.105  	popl %ebx;	\
   3.106 @@ -151,13 +159,17 @@ NT_MASK		= 0x00004000
   3.107  	popl %eax;	\
   3.108  1:	popl %ds;	\
   3.109  2:	popl %es;	\
   3.110 +3:	popl %fs;	\
   3.111 +4:	popl %gs;	\
   3.112          addl $4,%esp;	\
   3.113 -3:      iret;		\
   3.114 +5:      iret;		\
   3.115  .section .fixup,"ax";	\
   3.116 -6:      subl $4,%esp;   \
   3.117 -        pushl %es;      \
   3.118 -5:      pushl %ds;      \
   3.119 -4:      pushl %eax;     \
   3.120 +10:     subl $4,%esp;   \
   3.121 +        pushl %gs;      \
   3.122 +9:      pushl %fs;      \
   3.123 +8:      pushl %es;      \
   3.124 +7:      pushl %ds;      \
   3.125 +6:      pushl %eax;     \
   3.126  	pushl %ebp;     \
   3.127  	pushl %edi;     \
   3.128  	pushl %esi;     \
   3.129 @@ -172,9 +184,11 @@ 4:      pushl %eax;     \
   3.130  .previous;                           \
   3.131  .section __ex_table,"a";             \
   3.132  	.align 4;	             \
   3.133 -	.long 1b,4b;       	     \
   3.134 -	.long 2b,5b;	             \
   3.135 -	.long 3b,6b;	             \
   3.136 +	.long 1b,6b;       	     \
   3.137 +	.long 2b,7b;	             \
   3.138 +	.long 3b,8b;	             \
   3.139 +	.long 4b,9b;	             \
   3.140 +	.long 5b,10b;	             \
   3.141  .previous
   3.142  
   3.143  #define GET_CURRENT(reg)  \
   3.144 @@ -315,7 +329,22 @@ process_hyp_events:
   3.145  
   3.146  /* No special register assumptions */
   3.147  failsafe_callback:
   3.148 -        GET_CURRENT(%ebx)
   3.149 +        # Check that we are actually returning to ring != 0 because
   3.150 +        # we may fault when returning to another ring 0 activation.
   3.151 +        # This can only occur when restoring FS and GS, which can be avoided
   3.152 +        # by zeroing those registers and trying again. The outermost ring 0
   3.153 +        # activation will do a full failsafe callback to the guest OS.
   3.154 +        # Note that the outermost activation certainly has the "bad" selector
   3.155 +        # value saved away, since interrupts are always disabled in ring 0
   3.156 +        # until all segment registers have been saved.
   3.157 +        movb CS(%esp),%al
   3.158 +        test $3,%al
   3.159 +        jnz  1f
   3.160 +        xorl %eax,%eax
   3.161 +        movl %eax,FS(%esp)
   3.162 +        movl %eax,GS(%esp)
   3.163 +        jmp  restore_all   
   3.164 +1:      GET_CURRENT(%ebx)
   3.165          mov  PROCESSOR(%ebx),%eax
   3.166          shl  $4,%eax
   3.167          lea  guest_trap_bounce(%eax),%edx
   3.168 @@ -324,11 +353,15 @@ failsafe_callback:
   3.169          movl FAILSAFE_SEL(%ebx),%eax
   3.170          movw %ax,GTB_CS(%edx)
   3.171          call create_bounce_frame
   3.172 -        subl $8,%esi                 # add DS/ES to failsafe stack frame
   3.173 +        subl $16,%esi                # add DS/ES/FS/GS to failsafe stack frame
   3.174          movl DS(%esp),%eax
   3.175  FAULT1: movl %eax,(%esi) 
   3.176          movl ES(%esp),%eax
   3.177  FAULT2: movl %eax,4(%esi)
   3.178 +        movl FS(%esp),%eax
   3.179 +FAULT3: movl %eax,8(%esi) 
   3.180 +        movl GS(%esp),%eax
   3.181 +FAULT4: movl %eax,12(%esi)
   3.182          movl %esi,OLDESP(%esp)
   3.183          popl %ebx
   3.184          popl %ecx
   3.185 @@ -337,8 +370,8 @@ FAULT2: movl %eax,4(%esi)
   3.186          popl %edi
   3.187          popl %ebp
   3.188          popl %eax
   3.189 -        addl $12,%esp
   3.190 -FAULT3: iret 
   3.191 +        addl $20,%esp                # skip DS/ES/FS/GS/ORIG_EAX
   3.192 +FAULT5: iret 
   3.193  
   3.194          
   3.195  /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK:         */
   3.196 @@ -354,25 +387,25 @@ create_bounce_frame:
   3.197          shll $8,%eax /* multiply by 256 */
   3.198          addl $init_tss + 12,%eax
   3.199          movl (%eax),%esi /* tss->esp1 */
   3.200 -FAULT4: movl 4(%eax),%ds /* tss->ss1  */
   3.201 +FAULT6: movl 4(%eax),%ds /* tss->ss1  */
   3.202          /* base of stack frame must contain ss/esp (inter-priv iret) */
   3.203          subl $8,%esi
   3.204          movl OLDESP+4(%esp),%eax
   3.205 -FAULT5: movl %eax,(%esi) 
   3.206 +FAULT7: movl %eax,(%esi) 
   3.207          movl OLDSS+4(%esp),%eax
   3.208 -FAULT6: movl %eax,4(%esi) 
   3.209 +FAULT8: movl %eax,4(%esi) 
   3.210          jmp 2f
   3.211  1:      /* obtain ss/esp from oldss/oldesp -- a ring-1 activation exists */
   3.212          movl OLDESP+4(%esp),%esi
   3.213 -FAULT7: movl OLDSS+4(%esp),%ds 
   3.214 +FAULT9: movl OLDSS+4(%esp),%ds 
   3.215  2:      /* Construct a stack frame: EFLAGS, CS/EIP */
   3.216          subl $12,%esi
   3.217          movl EIP+4(%esp),%eax
   3.218 -FAULT8: movl %eax,(%esi) 
   3.219 +FAULT10:movl %eax,(%esi) 
   3.220          movl CS+4(%esp),%eax
   3.221 -FAULT9: movl %eax,4(%esi) 
   3.222 +FAULT11:movl %eax,4(%esi) 
   3.223          movl EFLAGS+4(%esp),%eax
   3.224 -FAULT10:movl %eax,8(%esi)
   3.225 +FAULT12:movl %eax,8(%esi)
   3.226          /* Rewrite our stack frame and return to ring 1. */
   3.227          /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */
   3.228          andl $0xfffcbeff,%eax
   3.229 @@ -390,16 +423,18 @@ FAULT10:movl %eax,8(%esi)
   3.230          .align 4
   3.231          .long FAULT1, kill_domain_fixup3 # Fault writing to ring-1 stack
   3.232          .long FAULT2, kill_domain_fixup3 # Fault writing to ring-1 stack
   3.233 -        .long FAULT3, kill_domain_fixup1 # Fault executing failsafe iret
   3.234 -        .long FAULT4, kill_domain_fixup2 # Fault loading ring-1 stack selector
   3.235 -        .long FAULT5, kill_domain_fixup2 # Fault writing to ring-1 stack
   3.236 -        .long FAULT6, kill_domain_fixup2 # Fault writing to ring-1 stack
   3.237 -        .long FAULT7, kill_domain_fixup2 # Fault loading ring-1 stack selector
   3.238 +        .long FAULT3, kill_domain_fixup3 # Fault writing to ring-1 stack
   3.239 +        .long FAULT4, kill_domain_fixup3 # Fault writing to ring-1 stack
   3.240 +        .long FAULT5, kill_domain_fixup1 # Fault executing failsafe iret
   3.241 +        .long FAULT6, kill_domain_fixup2 # Fault loading ring-1 stack selector
   3.242 +        .long FAULT7, kill_domain_fixup2 # Fault writing to ring-1 stack
   3.243          .long FAULT8, kill_domain_fixup2 # Fault writing to ring-1 stack
   3.244 -        .long FAULT9, kill_domain_fixup2 # Fault writing to ring-1 stack
   3.245 +        .long FAULT9, kill_domain_fixup2 # Fault loading ring-1 stack selector
   3.246          .long FAULT10,kill_domain_fixup2 # Fault writing to ring-1 stack
   3.247 -        .long FAULT11,kill_domain_fixup3 # Fault writing to ring-1 stack
   3.248 -        .long FAULT12,kill_domain_fixup3 # Fault writing to ring-1 stack
   3.249 +        .long FAULT11,kill_domain_fixup2 # Fault writing to ring-1 stack
   3.250 +        .long FAULT12,kill_domain_fixup2 # Fault writing to ring-1 stack
   3.251 +        .long FAULT13,kill_domain_fixup3 # Fault writing to ring-1 stack
   3.252 +        .long FAULT14,kill_domain_fixup3 # Fault writing to ring-1 stack
   3.253  .previous
   3.254                 
   3.255  # This handler kills domains which experience unrecoverable faults.
   3.256 @@ -429,12 +464,12 @@ process_guest_exception_and_events:
   3.257          jnz  2f
   3.258          subl $4,%esi                    # push error_code onto guest frame
   3.259          movl %es:GTB_ERROR_CODE(%edx),%eax
   3.260 -FAULT11:movl %eax,(%esi)
   3.261 +FAULT13:movl %eax,(%esi)
   3.262          test $GTBF_TRAP_CR2,%cl
   3.263          jz   1f
   3.264          subl $4,%esi                    # push %cr2 onto guest frame
   3.265          movl %es:GTB_CR2(%edx),%eax
   3.266 -FAULT12:movl %eax,(%esi)
   3.267 +FAULT14:movl %eax,(%esi)
   3.268  1:      movl %esi,OLDESP(%esp)        
   3.269  2:      push %es                        # unclobber %ds
   3.270          pop  %ds 
   3.271 @@ -463,31 +498,36 @@ ENTRY(divide_error)
   3.272  	pushl $ SYMBOL_NAME(do_divide_error)
   3.273  	ALIGN
   3.274  error_code:
   3.275 +	pushl %fs
   3.276 +	pushl %es
   3.277  	pushl %ds
   3.278  	pushl %eax
   3.279 -	xorl %eax,%eax
   3.280 +	xorl  %eax,%eax
   3.281  	pushl %ebp
   3.282  	pushl %edi
   3.283  	pushl %esi
   3.284  	pushl %edx
   3.285 -	decl %eax			# eax = -1
   3.286 +	decl  %eax			# eax = -1
   3.287  	pushl %ecx
   3.288  	pushl %ebx
   3.289  	cld
   3.290 -	movl %es,%ecx
   3.291 -	movl ORIG_EAX(%esp), %esi	# get the error code
   3.292 -	movl ES(%esp), %edi		# get the function address
   3.293 -	movl %eax, ORIG_EAX(%esp)
   3.294 -	movl %ecx, ES(%esp)
   3.295 -	movl %esp,%edx
   3.296 +	movl  %gs,%ecx
   3.297 +	movl  ORIG_EAX(%esp), %esi	# get the error code
   3.298 +	movl  GS(%esp), %edi		# get the function address
   3.299 +	movl  %eax, ORIG_EAX(%esp)
   3.300 +	movl  %ecx, GS(%esp)
   3.301 +	movl  %esp,%edx
   3.302  	pushl %esi			# push the error code
   3.303  	pushl %edx			# push the pt_regs pointer
   3.304 -	movl $(__HYPERVISOR_DS),%edx
   3.305 -	movl %edx,%ds
   3.306 -	movl %edx,%es
   3.307 +	movl  $(__HYPERVISOR_DS),%edx
   3.308 +	movl  %edx,%ds
   3.309 +	movl  %edx,%es
   3.310  	GET_CURRENT(%ebx)
   3.311 -	call *%edi
   3.312 -	addl $8,%esp
   3.313 +	call  *%edi
   3.314 +        # NB. We reenable interrupts AFTER exception processing, as that is
   3.315 +        #     required by the page fault handler (needs to save %cr2)
   3.316 +        sti
   3.317 +        addl  $8,%esp
   3.318  	jmp ret_from_exception
   3.319  
   3.320  ENTRY(coprocessor_error)
     4.1 --- a/xen/arch/i386/mm.c	Tue Apr 15 17:09:35 2003 +0000
     4.2 +++ b/xen/arch/i386/mm.c	Tue Apr 15 23:19:11 2003 +0000
     4.3 @@ -227,6 +227,9 @@ long do_set_gdt(unsigned long *frame_lis
     4.4          current->mm.perdomain_pt[i] = mk_l1_pgentry(0);
     4.5          if ( pfn == 0 ) continue;
     4.6          page = frame_table + pfn;
     4.7 +        ASSERT((page->flags & PG_type_mask) == PGT_gdt_page);
     4.8 +        ASSERT((page->flags & PG_domain_mask) == current->domain);
     4.9 +        ASSERT((page->type_count != 0) && (page->tot_count != 0));
    4.10          put_page_type(page);
    4.11          put_page_tot(page);
    4.12      }
     5.1 --- a/xen/arch/i386/process.c	Tue Apr 15 17:09:35 2003 +0000
     5.2 +++ b/xen/arch/i386/process.c	Tue Apr 15 23:19:11 2003 +0000
     5.3 @@ -199,8 +199,9 @@ void show_regs(struct pt_regs * regs)
     5.4             regs->eax,regs->ebx,regs->ecx,regs->edx);
     5.5      printk("ESI: %08lx EDI: %08lx EBP: %08lx",
     5.6             regs->esi, regs->edi, regs->ebp);
     5.7 -    printk(" DS: %04x ES: %04x\n",
     5.8 -           0xffff & regs->xds,0xffff & regs->xes);
     5.9 +    printk(" DS: %04x ES: %04x FS: %04x GS: %04x\n",
    5.10 +           0xffff & regs->xds, 0xffff & regs->xes,
    5.11 +           0xffff & regs->xfs, 0xffff & regs->xgs);
    5.12  
    5.13      __asm__("movl %%cr0, %0": "=r" (cr0));
    5.14      __asm__("movl %%cr2, %0": "=r" (cr2));
    5.15 @@ -260,7 +261,7 @@ void new_thread(struct task_struct *p,
    5.16       *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
    5.17       */
    5.18      p->thread.fs = p->thread.gs = FLAT_RING1_DS;
    5.19 -    regs->xds = regs->xes = regs->xss = FLAT_RING1_DS;
    5.20 +    regs->xds = regs->xes = regs->xfs = regs->xgs = regs->xss = FLAT_RING1_DS;
    5.21      regs->xcs = FLAT_RING1_CS;
    5.22      regs->eip = start_pc;
    5.23      regs->esp = start_stack;
    5.24 @@ -313,8 +314,7 @@ void new_thread(struct task_struct *p,
    5.25  /* NB. prev_p passed in %eax, next_p passed in %edx */
    5.26  void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
    5.27  {
    5.28 -    struct thread_struct *prev = &prev_p->thread,
    5.29 -        *next = &next_p->thread;
    5.30 +    struct thread_struct *next = &next_p->thread;
    5.31      struct tss_struct *tss = init_tss + smp_processor_id();
    5.32  
    5.33      unlazy_fpu(prev_p);
    5.34 @@ -327,24 +327,11 @@ void __switch_to(struct task_struct *pre
    5.35      tss->esp1 = next->esp1;
    5.36      tss->ss1  = next->ss1;
    5.37  
    5.38 -    /*
    5.39 -     * Save away %fs and %gs. No need to save %es and %ds, as
    5.40 -     * those are always kernel segments while inside the kernel.
    5.41 -     */
    5.42 -    asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
    5.43 -    asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
    5.44 -
    5.45      /* Switch GDT and LDT. */
    5.46      __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
    5.47      load_LDT();
    5.48  
    5.49      /*
    5.50 -     * Restore %fs and %gs.
    5.51 -     */
    5.52 -    loadsegment(fs, next->fs);
    5.53 -    loadsegment(gs, next->gs);
    5.54 -
    5.55 -    /*
    5.56       * Now maybe reload the debug registers
    5.57       */
    5.58      if (next->debugreg[7]){
     6.1 --- a/xen/arch/i386/traps.c	Tue Apr 15 17:09:35 2003 +0000
     6.2 +++ b/xen/arch/i386/traps.c	Tue Apr 15 23:19:11 2003 +0000
     6.3 @@ -159,8 +159,9 @@ void show_registers(struct pt_regs *regs
     6.4             regs->eax, regs->ebx, regs->ecx, regs->edx);
     6.5      printk("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
     6.6             regs->esi, regs->edi, regs->ebp, esp);
     6.7 -    printk("ds: %04x   es: %04x   ss: %04x\n",
     6.8 -           regs->xds & 0xffff, regs->xes & 0xffff, ss);
     6.9 +    printk("ds: %04x   es: %04x   fs: %04x   gs: %04x   ss: %04x\n",
    6.10 +           regs->xds & 0xffff, regs->xes & 0xffff, 
    6.11 +           regs->xfs & 0xffff, regs->xgs & 0xffff, ss);
    6.12  
    6.13      show_stack(&regs->esp);
    6.14  }	
    6.15 @@ -170,10 +171,11 @@ spinlock_t die_lock = SPIN_LOCK_UNLOCKED
    6.16  
    6.17  void die(const char * str, struct pt_regs * regs, long err)
    6.18  {
    6.19 -    spin_lock_irq(&die_lock);
    6.20 +    unsigned long flags;
    6.21 +    spin_lock_irqsave(&die_lock, flags);
    6.22      printk("%s: %04lx,%04lx\n", str, err >> 16, err & 0xffff);
    6.23      show_registers(regs);
    6.24 -    spin_unlock_irq(&die_lock);
    6.25 +    spin_unlock_irqrestore(&die_lock, flags);
    6.26      panic("HYPERVISOR DEATH!!\n");
    6.27  }
    6.28  
    6.29 @@ -205,6 +207,7 @@ static void inline do_trap(int trapnr, c
    6.30      if ( (fixup = search_exception_table(regs->eip)) != 0 )
    6.31      {
    6.32          regs->eip = fixup;
    6.33 +        regs->xfs = regs->xgs = 0;
    6.34          return;
    6.35      }
    6.36  
    6.37 @@ -264,9 +267,6 @@ asmlinkage void do_page_fault(struct pt_
    6.38  
    6.39   bounce_fault:
    6.40  
    6.41 -    if ( (regs->xcs &3) == 1 )
    6.42 -        printk("Fault at %08x (%08x)\n", addr, regs->eip); /* XXX */
    6.43 -
    6.44      ti = p->thread.traps + 14;
    6.45      gtb->flags = GTBF_TRAP_CR2; /* page fault pushes %cr2 */
    6.46      gtb->cr2        = addr;
    6.47 @@ -285,7 +285,7 @@ asmlinkage void do_page_fault(struct pt_
    6.48      off  = addr - LDT_VIRT_START;
    6.49      addr = p->mm.ldt_base + off;
    6.50  
    6.51 -    spin_lock_irq(&p->page_lock);
    6.52 +    spin_lock(&p->page_lock);
    6.53  
    6.54      pl2e  = map_domain_mem(pagetable_val(p->mm.pagetable));
    6.55      l2e   = l2_pgentry_val(pl2e[l2_table_offset(addr)]);
    6.56 @@ -303,34 +303,30 @@ asmlinkage void do_page_fault(struct pt_
    6.57      if ( (page->flags & PG_type_mask) != PGT_ldt_page )
    6.58      {
    6.59          if ( page->type_count != 0 )
    6.60 -        { /* XXX */
    6.61 -            printk("BOGO TYPE %08lx %ld\n", page->flags, page->type_count);
    6.62              goto unlock_and_bounce_fault;
    6.63 -        }
    6.64 +
    6.65          /* Check all potential LDT entries in the page. */
    6.66          ldt_page = map_domain_mem(l1e & PAGE_MASK);
    6.67          for ( i = 0; i < 512; i++ )
    6.68              if ( !check_descriptor(ldt_page[i*2], ldt_page[i*2+1]) )
    6.69 -            { /* XXX */
    6.70 -                printk("Bad desc!!!!!\n");
    6.71                  goto unlock_and_bounce_fault;
    6.72 -            }
    6.73          unmap_domain_mem(ldt_page);
    6.74 +
    6.75          page->flags &= ~PG_type_mask;
    6.76          page->flags |= PGT_ldt_page;
    6.77 -        get_page_type(page);
    6.78 -        get_page_tot(page);
    6.79      }
    6.80  
    6.81 -    p->mm.perdomain_pt[l1_table_offset(off)+16] = mk_l1_pgentry(l1e);
    6.82 +    get_page_type(page);
    6.83 +    get_page_tot(page);
    6.84 +    p->mm.perdomain_pt[l1_table_offset(off)+16] = mk_l1_pgentry(l1e|_PAGE_RW);
    6.85  
    6.86 -    spin_unlock_irq(&p->page_lock);
    6.87 +    spin_unlock(&p->page_lock);
    6.88      return;
    6.89  
    6.90  
    6.91   unlock_and_bounce_fault:
    6.92  
    6.93 -    spin_unlock_irq(&p->page_lock);
    6.94 +    spin_unlock(&p->page_lock);
    6.95      goto bounce_fault;
    6.96  
    6.97  
    6.98 @@ -339,6 +335,7 @@ asmlinkage void do_page_fault(struct pt_
    6.99      if ( (fixup = search_exception_table(regs->eip)) != 0 )
   6.100      {
   6.101          regs->eip = fixup;
   6.102 +        regs->xfs = regs->xgs = 0;
   6.103          return;
   6.104      }
   6.105  
   6.106 @@ -420,8 +417,8 @@ asmlinkage void do_general_protection(st
   6.107  
   6.108      if ( (fixup = search_exception_table(regs->eip)) != 0 )
   6.109      {
   6.110 -        printk("Hmmmm %08lx -> %08lx (%04lx)\n", regs->eip, fixup, error_code);
   6.111          regs->eip = fixup;
   6.112 +        regs->xfs = regs->xgs = 0;
   6.113          return;
   6.114      }
   6.115  
   6.116 @@ -565,31 +562,14 @@ do { \
   6.117  	 "3" ((char *) (addr)),"2" (__HYPERVISOR_CS << 16)); \
   6.118  } while (0)
   6.119  
   6.120 -
   6.121 -/*
   6.122 - * This needs to use 'idt_table' rather than 'idt', and
   6.123 - * thus use the _nonmapped_ version of the IDT, as the
   6.124 - * Pentium F0 0F bugfix can have resulted in the mapped
   6.125 - * IDT being write-protected.
   6.126 - */
   6.127  void set_intr_gate(unsigned int n, void *addr)
   6.128  {
   6.129      _set_gate(idt_table+n,14,0,addr);
   6.130  }
   6.131  
   6.132 -static void __init set_trap_gate(unsigned int n, void *addr)
   6.133 -{
   6.134 -    _set_gate(idt_table+n,15,0,addr);
   6.135 -}
   6.136 -
   6.137  static void __init set_system_gate(unsigned int n, void *addr)
   6.138  {
   6.139 -    _set_gate(idt_table+n,15,3,addr);
   6.140 -}
   6.141 -
   6.142 -static void __init set_call_gate(void *a, void *addr)
   6.143 -{
   6.144 -    _set_gate(a,12,3,addr);
   6.145 +    _set_gate(idt_table+n,14,3,addr);
   6.146  }
   6.147  
   6.148  #define _set_seg_desc(gate_addr,type,dpl,base,limit) {\
   6.149 @@ -620,29 +600,37 @@ void set_tss_desc(unsigned int n, void *
   6.150  
   6.151  void __init trap_init(void)
   6.152  {
   6.153 -    set_trap_gate(0,&divide_error);
   6.154 -    set_trap_gate(1,&debug);
   6.155 +    /*
   6.156 +     * Note that interrupt gates are always used, rather than trap gates. We 
   6.157 +     * must have interrupts disabled until DS/ES/FS/GS are saved because the 
   6.158 +     * first activation must have the "bad" value(s) for these registers and 
   6.159 +     * we may lose them if another activation is installed before they are 
   6.160 +     * saved. The page-fault handler also needs interrupts disabled until %cr2 
   6.161 +     * has been read and saved on the stack.
   6.162 +     */
   6.163 +    set_intr_gate(0,&divide_error);
   6.164 +    set_intr_gate(1,&debug);
   6.165      set_intr_gate(2,&nmi);
   6.166      set_system_gate(3,&int3);     /* usable from all privilege levels */
   6.167      set_system_gate(4,&overflow); /* usable from all privilege levels */
   6.168 -    set_trap_gate(5,&bounds);
   6.169 -    set_trap_gate(6,&invalid_op);
   6.170 -    set_trap_gate(7,&device_not_available);
   6.171 -    set_trap_gate(8,&double_fault);
   6.172 -    set_trap_gate(9,&coprocessor_segment_overrun);
   6.173 -    set_trap_gate(10,&invalid_TSS);
   6.174 -    set_trap_gate(11,&segment_not_present);
   6.175 -    set_trap_gate(12,&stack_segment);
   6.176 -    set_trap_gate(13,&general_protection);
   6.177 +    set_intr_gate(5,&bounds);
   6.178 +    set_intr_gate(6,&invalid_op);
   6.179 +    set_intr_gate(7,&device_not_available);
   6.180 +    set_intr_gate(8,&double_fault);
   6.181 +    set_intr_gate(9,&coprocessor_segment_overrun);
   6.182 +    set_intr_gate(10,&invalid_TSS);
   6.183 +    set_intr_gate(11,&segment_not_present);
   6.184 +    set_intr_gate(12,&stack_segment);
   6.185 +    set_intr_gate(13,&general_protection);
   6.186      set_intr_gate(14,&page_fault);
   6.187 -    set_trap_gate(15,&spurious_interrupt_bug);
   6.188 -    set_trap_gate(16,&coprocessor_error);
   6.189 -    set_trap_gate(17,&alignment_check);
   6.190 -    set_trap_gate(18,&machine_check);
   6.191 -    set_trap_gate(19,&simd_coprocessor_error);
   6.192 +    set_intr_gate(15,&spurious_interrupt_bug);
   6.193 +    set_intr_gate(16,&coprocessor_error);
   6.194 +    set_intr_gate(17,&alignment_check);
   6.195 +    set_intr_gate(18,&machine_check);
   6.196 +    set_intr_gate(19,&simd_coprocessor_error);
   6.197  
   6.198      /* Only ring 1 can access monitor services. */
   6.199 -    _set_gate(idt_table+HYPERVISOR_CALL_VECTOR,15,1,&hypervisor_call);
   6.200 +    _set_gate(idt_table+HYPERVISOR_CALL_VECTOR,14,1,&hypervisor_call);
   6.201  
   6.202      /* CPU0 uses the master IDT. */
   6.203      idt_tables[0] = idt_table;
     7.1 --- a/xen/common/domain.c	Tue Apr 15 17:09:35 2003 +0000
     7.2 +++ b/xen/common/domain.c	Tue Apr 15 23:19:11 2003 +0000
     7.3 @@ -392,20 +392,19 @@ int setup_guestos(struct task_struct *p,
     7.4      /* Sanity! */
     7.5      if ( p->domain != 0 ) BUG();
     7.6  
     7.7 -    /* This is all a bit grim. We've moved the modules to the "safe"
     7.8 -     physical memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later
     7.9 -     in this routeine, we're going to copy it down into the region
    7.10 -     that's actually been allocated to domain 0. This is highly likely
    7.11 -     to be overlapping, so we use a forward copy.
    7.12 +    /*
    7.13 +     * This is all a bit grim. We've moved the modules to the "safe" physical 
    7.14 +     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this 
     7.15 +     * routine, we're going to copy it down into the region that's actually 
    7.16 +     * been allocated to domain 0. This is highly likely to be overlapping, so 
    7.17 +     * we use a forward copy.
    7.18 +     * 
    7.19 +     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with 
    7.20 +     * 4GB and lots of network/disk cards that allocate loads of buffers. 
     7.21 +     * We'll have to revisit this if we ever support PAE (64GB). 
    7.22 +     */
    7.23  
    7.24 -     MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine
    7.25 -     with 4GB and lots of network/disk cards that allocate loads of
    7.26 -     buffers. We'll have to revist this if we ever support PAE (64GB).
    7.27 -
    7.28 - */
    7.29 -
    7.30 -
    7.31 -    data_start = map_domain_mem( (unsigned long) phy_data_start );
    7.32 +    data_start = map_domain_mem((unsigned long)phy_data_start);
    7.33  
    7.34      if ( strncmp(data_start, "XenoGues", 8) )
    7.35      {
    7.36 @@ -480,7 +479,7 @@ int setup_guestos(struct task_struct *p,
    7.37          if ( count < p->tot_pages )
    7.38          {
    7.39              page = frame_table + (cur_address >> PAGE_SHIFT);
    7.40 -            page->flags = dom | PGT_writeable_page;
    7.41 +            page->flags = dom | PGT_writeable_page | PG_need_flush;
    7.42              page->type_count = page->tot_count = 1;
    7.43              /* Set up the MPT entry. */
    7.44              machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
    7.45 @@ -558,24 +557,21 @@ int setup_guestos(struct task_struct *p,
    7.46      __write_cr3_counted(pagetable_val(p->mm.pagetable));
    7.47  
    7.48      /* Copy the guest OS image. */    
    7.49 -    src = (char *)(phy_data_start + 12);
    7.50 -    vsrc= (char *)(data_start + 12); /* data_start invalid after first page*/
    7.51 -    dst = (char *)virt_load_address;
    7.52 +    src  = (char *)(phy_data_start + 12);
    7.53 +    vsrc = (char *)(data_start + 12); /* data_start invalid after first page*/
    7.54 +    dst  = (char *)virt_load_address;
    7.55      while ( src < (phy_data_start+data_len) )
    7.56 -      {
    7.57 +    {
    7.58  	*dst++ = *vsrc++;
    7.59  	src++;
    7.60 -
    7.61  	if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 )
    7.62 -	  {
    7.63 +        {
    7.64  	    unmap_domain_mem( vsrc-1 );
    7.65  	    vsrc = map_domain_mem( (unsigned long)src );
    7.66 -	  }
    7.67 -      }
    7.68 +        }
    7.69 +    }
    7.70      unmap_domain_mem( vsrc );
    7.71 -
    7.72 -    printk("copy done\n");
    7.73 -
    7.74 +    
    7.75      /* Set up start info area. */
    7.76      memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address));
    7.77      virt_startinfo_address->nr_pages = p->tot_pages;
    7.78 @@ -585,13 +581,13 @@ int setup_guestos(struct task_struct *p,
    7.79          ((p->tot_pages - 1) << PAGE_SHIFT); 
    7.80  
    7.81      if ( initrd_len )
    7.82 -      {
    7.83 +    {
    7.84  	virt_startinfo_address->mod_start = (unsigned long)dst-initrd_len;
    7.85  	virt_startinfo_address->mod_len   = initrd_len;
    7.86 -
    7.87 -	printk("Initrd len 0x%x, start at 0x%08x\n",
    7.88 -	       virt_startinfo_address->mod_len, virt_startinfo_address->mod_start);
    7.89 -      }
    7.90 +	printk("Initrd len 0x%lx, start at 0x%08lx\n",
    7.91 +	       virt_startinfo_address->mod_len, 
    7.92 +               virt_startinfo_address->mod_start);
    7.93 +    }
    7.94  
    7.95      /* Add virtual network interfaces and point to them in startinfo. */
    7.96      while (params->num_vifs-- > 0) {
     8.1 --- a/xen/common/kernel.c	Tue Apr 15 17:09:35 2003 +0000
     8.2 +++ b/xen/common/kernel.c	Tue Apr 15 23:19:11 2003 +0000
     8.3 @@ -192,18 +192,18 @@ void cmain (unsigned long magic, multibo
     8.4      new_dom = do_newdomain(0, 0);
     8.5      if ( new_dom == NULL ) panic("Error creating domain 0\n");
     8.6  
     8.7 -    /* We're going to setup domain0 using the module(s) that we
     8.8 -       stashed safely above our MAX_DIRECTMAP_ADDRESS in boot/Boot.S
     8.9 -
    8.10 -       The second module, if present, is an initrd ramdisk
    8.11 +    /*
    8.12 +     * We're going to setup domain0 using the module(s) that we stashed safely 
     8.13 +     * above our MAX_DIRECTMAP_ADDRESS in boot/Boot.S. The second module, if 
     8.14 +     * present, is an initrd ramdisk.
    8.15       */
    8.16 -
    8.17      if ( setup_guestos(new_dom, 
    8.18                         &dom0_params, 
    8.19 -                       MAX_DIRECTMAP_ADDRESS, 
    8.20 -                       mod[mbi->mods_count-1].mod_end - mod[0].mod_start, 		              __va(mod[0].string),
    8.21 -		       (mbi->mods_count==2)?
    8.22 -		           (mod[1].mod_end - mod[1].mod_start):0)
    8.23 +                       (char *)MAX_DIRECTMAP_ADDRESS, 
    8.24 +                       mod[mbi->mods_count-1].mod_end - mod[0].mod_start,
    8.25 +                       __va(mod[0].string),
    8.26 +		       (mbi->mods_count == 2) ?
    8.27 +                       (mod[1].mod_end - mod[1].mod_start):0)
    8.28           != 0 ) panic("Could not set up DOM0 guest OS\n");
    8.29  
    8.30      update_dom_time(new_dom->shared_info);
     9.1 --- a/xen/common/memory.c	Tue Apr 15 17:09:35 2003 +0000
     9.2 +++ b/xen/common/memory.c	Tue Apr 15 23:19:11 2003 +0000
     9.3 @@ -176,7 +176,7 @@
     9.4  #include <asm/uaccess.h>
     9.5  #include <asm/domain_page.h>
     9.6  
     9.7 -#if 1
     9.8 +#if 0
     9.9  #define MEM_LOG(_f, _a...) printk("DOM%d: (file=memory.c, line=%d) " _f "\n", current->domain, __LINE__, ## _a )
    9.10  #else
    9.11  #define MEM_LOG(_f, _a...) ((void)0)
    9.12 @@ -724,6 +724,9 @@ static int do_extended_command(unsigned 
    9.13                      if ( pfn == 0 ) continue;
    9.14                      current->mm.perdomain_pt[i] = mk_l1_pgentry(0);
    9.15                      page = frame_table + pfn;
    9.16 +                    ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
    9.17 +                    ASSERT((page->flags & PG_domain_mask) == current->domain);
    9.18 +                    ASSERT((page->type_count != 0) && (page->tot_count != 0));
    9.19                      put_page_type(page);
    9.20                      put_page_tot(page);                
    9.21                  }
    10.1 --- a/xen/common/schedule.c	Tue Apr 15 17:09:35 2003 +0000
    10.2 +++ b/xen/common/schedule.c	Tue Apr 15 23:19:11 2003 +0000
    10.3 @@ -394,7 +394,7 @@ asmlinkage void schedule(void)
    10.4  
    10.5  #ifndef NDEBUG
    10.6      if (r_time < ctx_allow) {
    10.7 -        printk("[%02d]: %lx\n", this_cpu, r_time);
    10.8 +        printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time);
    10.9          dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
   10.10      }
   10.11  #endif
    11.1 --- a/xen/include/asm-i386/irq.h	Tue Apr 15 17:09:35 2003 +0000
    11.2 +++ b/xen/include/asm-i386/irq.h	Tue Apr 15 23:19:11 2003 +0000
    11.3 @@ -102,6 +102,8 @@ extern char _stext, _etext;
    11.4  
    11.5  #define SAVE_ALL \
    11.6  	"cld\n\t" \
    11.7 +	"pushl %gs\n\t" \
    11.8 +	"pushl %fs\n\t" \
    11.9  	"pushl %es\n\t" \
   11.10  	"pushl %ds\n\t" \
   11.11  	"pushl %eax\n\t" \
    12.1 --- a/xen/include/asm-i386/ptrace.h	Tue Apr 15 17:09:35 2003 +0000
    12.2 +++ b/xen/include/asm-i386/ptrace.h	Tue Apr 15 23:19:11 2003 +0000
    12.3 @@ -1,28 +1,6 @@
    12.4  #ifndef _I386_PTRACE_H
    12.5  #define _I386_PTRACE_H
    12.6  
    12.7 -#define EBX 0
    12.8 -#define ECX 1
    12.9 -#define EDX 2
   12.10 -#define ESI 3
   12.11 -#define EDI 4
   12.12 -#define EBP 5
   12.13 -#define EAX 6
   12.14 -#define DS 7
   12.15 -#define ES 8
   12.16 -#define FS 9
   12.17 -#define GS 10
   12.18 -#define ORIG_EAX 11
   12.19 -#define EIP 12
   12.20 -#define CS  13
   12.21 -#define EFL 14
   12.22 -#define UESP 15
   12.23 -#define SS   16
   12.24 -#define FRAME_SIZE 17
   12.25 -
   12.26 -/* this struct defines the way the registers are stored on the 
   12.27 -   stack during a system call. */
   12.28 -
   12.29  struct pt_regs {
   12.30  	long ebx;
   12.31  	long ecx;
   12.32 @@ -33,6 +11,8 @@ struct pt_regs {
   12.33  	long eax;
   12.34  	int  xds;
   12.35  	int  xes;
   12.36 +	int  xfs;
   12.37 +	int  xgs;
   12.38  	long orig_eax;
   12.39  	long eip;
   12.40  	int  xcs;
   12.41 @@ -41,19 +21,6 @@ struct pt_regs {
   12.42  	int  xss;
   12.43  };
   12.44  
   12.45 -/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
   12.46 -#define PTRACE_GETREGS            12
   12.47 -#define PTRACE_SETREGS            13
   12.48 -#define PTRACE_GETFPREGS          14
   12.49 -#define PTRACE_SETFPREGS          15
   12.50 -#define PTRACE_GETFPXREGS         18
   12.51 -#define PTRACE_SETFPXREGS         19
   12.52 -
   12.53 -#define PTRACE_SETOPTIONS         21
   12.54 -
   12.55 -/* options set using PTRACE_SETOPTIONS */
   12.56 -#define PTRACE_O_TRACESYSGOOD     0x00000001
   12.57 -
   12.58  enum EFLAGS {
   12.59          EF_CF   = 0x00000001,
   12.60          EF_PF   = 0x00000004,
    13.1 --- a/xen/include/asm-i386/system.h	Tue Apr 15 17:09:35 2003 +0000
    13.2 +++ b/xen/include/asm-i386/system.h	Tue Apr 15 23:19:11 2003 +0000
    13.3 @@ -4,8 +4,9 @@
    13.4  #include <xeno/config.h>
    13.5  #include <asm/bitops.h>
    13.6  
    13.7 -struct task_struct;	/* one of the stranger aspects of C forward declarations.. */
    13.8 -extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
    13.9 +struct task_struct;
   13.10 +extern void FASTCALL(__switch_to(struct task_struct *prev, 
   13.11 +                                 struct task_struct *next));
   13.12  
   13.13  #define prepare_to_switch()	do { } while(0)
   13.14  #define switch_to(prev,next) do {					\
   13.15 @@ -33,30 +34,7 @@ extern void FASTCALL(__switch_to(struct 
   13.16                       :"memory");                                        \
   13.17  } while (0)
   13.18  
   13.19 -/*
   13.20 - * Load a segment. Fall back on loading the zero
   13.21 - * segment if something goes wrong..
   13.22 - */
   13.23 -#define loadsegment(seg,value)			\
   13.24 -	asm volatile("\n"			\
   13.25 -		"1:\t"				\
   13.26 -		"movl %0,%%" #seg "\n"		\
   13.27 -		"2:\n"				\
   13.28 -		".section .fixup,\"ax\"\n"	\
   13.29 -		"3:\t"				\
   13.30 -		"pushl $0\n\t"			\
   13.31 -		"popl %%" #seg "\n\t"		\
   13.32 -		"jmp 2b\n"			\
   13.33 -		".previous\n"			\
   13.34 -		".section __ex_table,\"a\"\n\t"	\
   13.35 -		".align 4\n\t"			\
   13.36 -		".long 1b,3b\n"			\
   13.37 -		".previous"			\
   13.38 -		: :"m" (*(unsigned int *)&(value)))
   13.39 -
   13.40 -/*
   13.41 - * Clear and set 'TS' bit respectively
   13.42 - */
   13.43 +/* Clear and set 'TS' bit respectively */
   13.44  #define clts() __asm__ __volatile__ ("clts")
   13.45  #define read_cr0() ({ \
   13.46  	unsigned int __dummy; \
   13.47 @@ -152,7 +130,7 @@ static inline void __set_64bit_var (unsi
   13.48  /*
   13.49   * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
   13.50   * Note 2: xchg has side effect, so that attribute volatile is necessary,
   13.51 - *	  but generally the primitive is invalid, *ptr is output argument. --ANK
   13.52 + *   but generally the primitive is invalid, *ptr is output argument. --ANK
   13.53   */
   13.54  static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
   13.55  {
    14.1 --- a/xen/include/xeno/mm.h	Tue Apr 15 17:09:35 2003 +0000
    14.2 +++ b/xen/include/xeno/mm.h	Tue Apr 15 23:19:11 2003 +0000
    14.3 @@ -67,12 +67,14 @@ typedef struct pfn_info {
    14.4  #define REFCNT_PIN_BIT 0x40000000UL
    14.5  
    14.6  #define get_page_tot(p)		 ((p)->tot_count++)
    14.7 -#define put_page_tot(p)		 (--(p)->tot_count)
    14.8 +#define put_page_tot(p)		 \
    14.9 +    ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; })
   14.10  #define page_tot_count(p)	 ((p)->tot_count)
   14.11  #define set_page_tot_count(p,v)  ((p)->tot_count = v)
   14.12  
   14.13  #define get_page_type(p)	 ((p)->type_count++)
   14.14 -#define put_page_type(p)	 (--(p)->type_count)
   14.15 +#define put_page_type(p)	 \
   14.16 +    ({ ASSERT((p)->type_count != 0); --(p)->type_count; })
   14.17  #define page_type_count(p)	 ((p)->type_count)
   14.18  #define set_page_type_count(p,v) ((p)->type_count = v)
   14.19  
   14.20 @@ -95,18 +97,18 @@ typedef struct pfn_info {
   14.21  #define PGT_gdt_page        (5<<24) /* using this page in a GDT? */
   14.22  #define PGT_ldt_page        (6<<24) /* using this page in an LDT? */
   14.23  #define PGT_writeable_page  (7<<24) /* has writable mappings of this page? */
   14.24 -#define PGT_net_rx_buf      (8<<24) /* this page has been pirated by the net code. */
   14.25 +#define PGT_net_rx_buf      (8<<24) /* this page taken by the net code. */
   14.26  
   14.27  /*
   14.28   * This bit indicates that the TLB must be flushed when the type count of this
   14.29   * frame drops to zero. This is needed on current x86 processors only for
   14.30 - * frames which have guestos-accessible writeable mappings. In this case we must 
   14.31 - * prevent stale TLB entries allowing the frame to be written if it used for a
   14.32 - * page table, for example.
   14.33 + * frames which have guestos-accessible writeable mappings. In this case we
   14.34 + * must prevent stale TLB entries allowing the frame to be written if it used
   14.35 + * for a page table, for example.
   14.36   * 
   14.37 - * We have this bit because the writeable type is actually also used to pin a page
   14.38 - * when it is used as a disk read buffer. This doesn't require a TLB flush because
   14.39 - * the frame never has a mapping in the TLB.
   14.40 + * We have this bit because the writeable type is actually also used to pin a
   14.41 + * page when it is used as a disk read buffer. This doesn't require a TLB flush
   14.42 + * because the frame never has a mapping in the TLB.
   14.43   */
   14.44  #define PG_need_flush       (1<<28)
   14.45  
   14.46 @@ -114,10 +116,10 @@ typedef struct pfn_info {
   14.47  #define PageSetSlab(page)	set_bit(PG_slab, &(page)->flags)
   14.48  #define PageClearSlab(page)	clear_bit(PG_slab, &(page)->flags)
   14.49  
   14.50 -#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)            \
   14.51 -    do {                                             \
   14.52 -        (_pfn)->flags = (_dom) | PGT_writeable_page; \
   14.53 -        (_pfn)->tot_count = (_pfn)->type_count = 1;  \
   14.54 +#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                            \
   14.55 +    do {                                                             \
   14.56 +        (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \
   14.57 +        (_pfn)->tot_count = (_pfn)->type_count = 2;                  \
   14.58      } while ( 0 )
   14.59  
   14.60  #define UNSHARE_PFN(_pfn) \
    15.1 --- a/xen/include/xeno/sched.h	Tue Apr 15 17:09:35 2003 +0000
    15.2 +++ b/xen/include/xeno/sched.h	Tue Apr 15 23:19:11 2003 +0000
    15.3 @@ -80,8 +80,8 @@ struct task_struct {
    15.4       * Return vectors pushed to us by guest OS.
    15.5       * The stack frame for events is exactly that of an x86 hardware interrupt.
    15.6       * The stack frame for a failsafe callback is augmented with saved values
    15.7 -     * for segment registers %ds and %es:
    15.8 -     * 	%ds, %es, %eip, %cs, %eflags [, %oldesp, %oldss]
    15.9 +     * for segment registers %ds, %es, %fs and %gs:
   15.10 +     * 	%ds, %es, %fs, %gs, %eip, %cs, %eflags [, %oldesp, %oldss]
   15.11       */
   15.12      unsigned long event_selector;    /* 20: entry CS  */
   15.13      unsigned long event_address;     /* 24: entry EIP */
    16.1 --- a/xen/net/dev.c	Tue Apr 15 17:09:35 2003 +0000
    16.2 +++ b/xen/net/dev.c	Tue Apr 15 23:19:11 2003 +0000
    16.3 @@ -521,7 +521,7 @@ void deliver_packet(struct sk_buff *skb,
    16.4      g_pfn->tot_count = g_pfn->type_count = 0;
    16.5      h_pfn->flags = g_pfn->flags & ~PG_type_mask;
    16.6          
    16.7 -    if (*g_pte & _PAGE_RW) h_pfn->flags |= PGT_writeable_page;
    16.8 +    if (*g_pte & _PAGE_RW) h_pfn->flags |= PGT_writeable_page | PG_need_flush;
    16.9      g_pfn->flags = 0;
   16.10          
   16.11      /* Point the guest at the new machine frame. */
   16.12 @@ -567,7 +567,6 @@ int netif_rx(struct sk_buff *skb)
   16.13      local_irq_save(flags);
   16.14  
   16.15      ASSERT(skb->skb_type == SKB_ZERO_COPY);
   16.16 -    ASSERT((skb->data - skb->head) == (18 + ETH_HLEN));
   16.17  
   16.18      /*
   16.19       * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes for 
    17.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/kernel/entry.S	Tue Apr 15 17:09:35 2003 +0000
    17.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/kernel/entry.S	Tue Apr 15 23:19:11 2003 +0000
    17.3 @@ -373,13 +373,19 @@ critical_fixup_table:
    17.4  ENTRY(failsafe_callback)
    17.5  1:      pop  %ds
    17.6  2:      pop  %es
    17.7 -3:      iret
    17.8 +3:      pop  %fs
    17.9 +4:      pop  %gs
   17.10 +5:      iret
   17.11  .section .fixup,"ax";	\
   17.12 -4:	movl $0,(%esp);	\
   17.13 +6:	movl $0,(%esp);	\
   17.14  	jmp 1b;		\
   17.15 -5:	movl $0,(%esp);	\
   17.16 +7:	movl $0,(%esp);	\
   17.17  	jmp 2b;		\
   17.18 -6:	pushl %ss;	\
   17.19 +8:	movl $0,(%esp);	\
   17.20 +	jmp 3b;		\
   17.21 +9:	movl $0,(%esp);	\
   17.22 +	jmp 4b;		\
   17.23 +10:	pushl %ss;	\
   17.24  	popl %ds;	\
   17.25  	pushl %ss;	\
   17.26  	popl %es;	\
   17.27 @@ -388,9 +394,11 @@ 6:	pushl %ss;	\
   17.28  .previous;		\
   17.29  .section __ex_table,"a";\
   17.30  	.align 4;	\
   17.31 -	.long 1b,4b;	\
   17.32 -	.long 2b,5b;	\
   17.33 -	.long 3b,6b;	\
   17.34 +	.long 1b,6b;	\
   17.35 +	.long 2b,7b;	\
   17.36 +	.long 3b,8b;	\
   17.37 +	.long 4b,9b;	\
   17.38 +	.long 5b,10b;	\
   17.39  .previous
   17.40          
   17.41  ENTRY(coprocessor_error)