ia64/xen-unstable

changeset 1011:06425d6a590e

bitkeeper revision 1.653.1.2 (3fe44437s9U66sHQJ1hA64CActA2uw)

Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xeno
author kaf24@scramble.cl.cam.ac.uk
date Sat Dec 20 12:44:39 2003 +0000 (2003-12-20)
parents 9be9350b3317 14aefa321a10
children b301a3d4a2a5
files .rootkeys tools/xc/lib/xc_linux_build.c tools/xc/lib/xc_linux_restore.c tools/xc/lib/xc_linux_save.c xen/GUEST_CHANGES xen/TODO xen/arch/i386/Rules.mk xen/arch/i386/apic.c xen/arch/i386/entry.S xen/arch/i386/flushtlb.c xen/arch/i386/io_apic.c xen/arch/i386/ioremap.c xen/arch/i386/irq.c xen/arch/i386/mm.c xen/arch/i386/pci-irq.c xen/arch/i386/process.c xen/arch/i386/smp.c xen/arch/i386/smpboot.c xen/arch/i386/traps.c xen/common/dom0_ops.c xen/common/dom_mem_ops.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xen/common/network.c xen/common/page_alloc.c xen/drivers/block/ll_rw_blk.c xen/drivers/block/xen_block.c xen/drivers/block/xen_vbd.c xen/drivers/net/e1000/e1000_main.c xen/include/asm-i386/atomic.h xen/include/asm-i386/flushtlb.h xen/include/asm-i386/io.h xen/include/asm-i386/page.h xen/include/asm-i386/pgalloc.h xen/include/asm-i386/smp.h xen/include/asm-i386/spinlock.h xen/include/asm-i386/system.h xen/include/hypervisor-ifs/dom0_ops.h xen/include/hypervisor-ifs/hypervisor-if.h xen/include/xeno/config.h xen/include/xeno/mm.h xen/include/xeno/perfc.h xen/include/xeno/perfc_defn.h xen/include/xeno/sched.h xen/include/xeno/vif.h xen/net/dev.c xen/net/skbuff.c xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c xenolinux-2.4.23-sparse/arch/xeno/mm/init.c xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
line diff
     1.1 --- a/.rootkeys	Sat Dec 20 11:49:50 2003 +0000
     1.2 +++ b/.rootkeys	Sat Dec 20 12:44:39 2003 +0000
     1.3 @@ -80,10 +80,8 @@ 3fbd0a42l40lM0IICw2jXbQBVZSdZg tools/xc/
     1.4  3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py
     1.5  3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py
     1.6  3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING
     1.7 -3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES
     1.8  3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile
     1.9  3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk
    1.10 -3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO
    1.11  3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile
    1.12  3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk
    1.13  3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c
    1.14 @@ -93,6 +91,7 @@ 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/
    1.15  3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c
    1.16  3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S
    1.17  3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c
    1.18 +3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c
    1.19  3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c
    1.20  3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c
    1.21  3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c
     2.1 --- a/tools/xc/lib/xc_linux_build.c	Sat Dec 20 11:49:50 2003 +0000
     2.2 +++ b/tools/xc/lib/xc_linux_build.c	Sat Dec 20 12:44:39 2003 +0000
     2.3 @@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle,
     2.4                           const char *cmdline,
     2.5                           unsigned long shared_info_frame)
     2.6  {
     2.7 -    l1_pgentry_t *vl1tab = NULL, *vl1e = NULL;
     2.8 -    l2_pgentry_t *vl2tab = NULL, *vl2e = NULL;
     2.9 +    l1_pgentry_t *vl1tab;
    2.10 +    l2_pgentry_t *vl2tab;
    2.11      unsigned long *page_array = NULL;
    2.12      mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL;
    2.13      int alloc_index, num_pt_pages;
    2.14 -    unsigned long l2tab;
    2.15 +    unsigned long l2tab, l2e, l1e=0;
    2.16      unsigned long l1tab = 0;
    2.17      unsigned long num_pgt_updates = 0;
    2.18      unsigned long count, pt_start, i, j;
    2.19 @@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle,
    2.20      if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL )
    2.21          goto error_out;
    2.22      memset(vl2tab, 0, PAGE_SIZE);
    2.23 -    vl2e = vl2tab + l2_table_offset(virt_load_addr);
    2.24 +    unmap_pfn(pm_handle, vl2tab);
    2.25 +    l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t));
    2.26      for ( count = 0; count < tot_pages; count++ )
    2.27      {    
    2.28 -        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) 
    2.29 +        if ( (l1e & (PAGE_SIZE-1)) == 0 )
    2.30          {
    2.31              l1tab = page_array[alloc_index] << PAGE_SHIFT;
    2.32              if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL )
    2.33                  goto error_out;
    2.34              memset(vl1tab, 0, PAGE_SIZE);
    2.35 +            unmap_pfn(pm_handle, vl1tab);
    2.36              alloc_index--;
    2.37  		
    2.38 -            vl1e = vl1tab + l1_table_offset(virt_load_addr + 
    2.39 -                                            (count << PAGE_SHIFT));
    2.40 +            l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))*
    2.41 +                           sizeof(l1_pgentry_t));
    2.42  
     2.43              /* make appropriate entry in the page directory */
    2.44 -            pgt_updates->ptr = (unsigned long)vl2e;
    2.45 +            pgt_updates->ptr = l2e;
    2.46              pgt_updates->val = l1tab | L2_PROT;
    2.47              pgt_updates++;
    2.48              num_pgt_updates++;
    2.49 -            vl2e++;
    2.50 +            l2e += sizeof(l2_pgentry_t);
    2.51          }
    2.52  
    2.53          if ( count < pt_start )
    2.54          {
    2.55 -            pgt_updates->ptr = (unsigned long)vl1e;
    2.56 +            pgt_updates->ptr = l1e;
    2.57              pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT;
    2.58              pgt_updates++;
    2.59              num_pgt_updates++;
    2.60 -            vl1e++;
    2.61 +            l1e += sizeof(l1_pgentry_t);
    2.62          }
    2.63          else
    2.64          {
    2.65 -            pgt_updates->ptr = (unsigned long)vl1e;
    2.66 +            pgt_updates->ptr = l1e;
    2.67              pgt_updates->val = 
    2.68                  ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW;
    2.69              pgt_updates++;
    2.70              num_pgt_updates++;
    2.71 -            vl1e++;
    2.72 +            l1e += sizeof(l1_pgentry_t);
    2.73          }
    2.74  
    2.75          pgt_updates->ptr = 
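
    After this change the build tool places the machine address of each page-table slot (table base plus index times entry size) in mmu_update_t.ptr, rather than a temporarily mapped virtual address, so the page tables no longer need to remain mapped while the update list is assembled. Below is a minimal stand-alone sketch of that address computation; the struct layout, PTE size, table-offset mask and protection value are simplified, illustrative stand-ins, not the real tool-library definitions.

    /* Illustrative sketch only: building an mmu_update request whose 'ptr'
     * field is the machine address of a PTE slot, as the rewritten loop
     * above does.  Types and constants are simplified stand-ins. */
    #include <stdio.h>

    typedef struct { unsigned long ptr; unsigned long val; } mmu_update_t;

    #define PAGE_SHIFT      12
    #define PTE_SIZE        4UL    /* sizeof(l1_pgentry_t) on non-PAE i386 */
    #define L1_PROT_EXAMPLE 0x7UL  /* assumed: present|rw|user, illustrative */

    static mmu_update_t make_l1_update(unsigned long l1tab_ma, /* machine addr of L1 table */
                                       unsigned long vaddr,    /* guest virtual address    */
                                       unsigned long mfn)      /* machine frame to map     */
    {
        mmu_update_t u;
        unsigned long idx = (vaddr >> PAGE_SHIFT) & 0x3ff;     /* l1_table_offset() */
        u.ptr = l1tab_ma + idx * PTE_SIZE;  /* machine address of the PTE itself */
        u.val = (mfn << PAGE_SHIFT) | L1_PROT_EXAMPLE;
        return u;
    }

    int main(void)
    {
        mmu_update_t u = make_l1_update(0x200000UL, 0xC0001000UL, 0x1234UL);
        printf("ptr=%#lx val=%#lx\n", u.ptr, u.val);
        return 0;
    }
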
     3.1 --- a/tools/xc/lib/xc_linux_restore.c	Sat Dec 20 11:49:50 2003 +0000
     3.2 +++ b/tools/xc/lib/xc_linux_restore.c	Sat Dec 20 12:44:39 2003 +0000
     3.3 @@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle,
     3.4                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
     3.5                  }
     3.6                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
     3.7 -                                    (unsigned long)&ppage[j], page[j]) )
     3.8 +                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)),
     3.9 +                                    page[j]) )
    3.10                      goto out;
    3.11              }
    3.12              break;
    3.13 @@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle,
    3.14                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
    3.15                  }
    3.16                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
    3.17 -                                    (unsigned long)&ppage[j], page[j]) )
    3.18 +                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)),
    3.19 +                                    page[j]) )
    3.20                      goto out;
    3.21              }
    3.22              break;
    3.23 @@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle,
    3.24              memcpy(ppage, page, PAGE_SIZE);
    3.25              break;
    3.26          }
    3.27 -        /* NB. Must flush before unmapping page, as pass VAs to Xen. */
    3.28 -        if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) )
    3.29 -            goto out;
    3.30          unmap_pfn(pm_handle, ppage);
    3.31  
    3.32          if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
     4.1 --- a/tools/xc/lib/xc_linux_save.c	Sat Dec 20 11:49:50 2003 +0000
     4.2 +++ b/tools/xc/lib/xc_linux_save.c	Sat Dec 20 12:44:39 2003 +0000
     4.3 @@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_ha
     4.4  {
     4.5      dom0_op_t op;
     4.6      op.cmd = DOM0_GETPAGEFRAMEINFO;
     4.7 -    op.u.getpageframeinfo.pfn = mfn;
     4.8 -    if ( (do_dom0_op(xc_handle, &op) < 0) || 
     4.9 -         (op.u.getpageframeinfo.domain != dom) )
    4.10 -        return 0;
    4.11 -    return 1;
    4.12 +    op.u.getpageframeinfo.pfn    = mfn;
    4.13 +    op.u.getpageframeinfo.domain = dom;
    4.14 +    return (do_dom0_op(xc_handle, &op) >= 0);
    4.15  }
    4.16  
    4.17  #define GETPFN_ERR (~0U)
    4.18 -static unsigned int get_pfn_type(int xc_handle, unsigned long mfn)
    4.19 +static unsigned int get_pfn_type(int xc_handle, 
    4.20 +                                 unsigned long mfn, 
    4.21 +                                 unsigned int dom)
    4.22  {
    4.23      dom0_op_t op;
    4.24      op.cmd = DOM0_GETPAGEFRAMEINFO;
    4.25 -    op.u.getpageframeinfo.pfn = mfn;
    4.26 +    op.u.getpageframeinfo.pfn    = mfn;
    4.27 +    op.u.getpageframeinfo.domain = dom;
    4.28      if ( do_dom0_op(xc_handle, &op) < 0 )
    4.29      {
    4.30          PERROR("Unexpected failure when getting page frame info!");
    4.31 @@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle,
    4.32          mfn_to_pfn_table[mfn] = i;
    4.33  
    4.34          /* Query page type by MFN, but store it by PFN. */
    4.35 -        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR )
    4.36 +        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) == 
    4.37 +             GETPFN_ERR )
    4.38              goto out;
    4.39      }
    4.40  
     5.1 --- a/xen/GUEST_CHANGES	Sat Dec 20 11:49:50 2003 +0000
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,26 +0,0 @@
     5.4 -
     5.5 -The interface between Xen and overlying guest OSes has changed in the
     5.6 -following ways since version 1.0:
     5.7 -
     5.8 -Modified hypercall 'pt_update'
     5.9 -------------------------------
    5.10 -Page-table updates passed to the 'pt_update' hypercall must now
    5.11 -specify a virtual address that maps the PTE to be modified. Previously
    5.12 -a physical address was used, requiring Xen to temporarily map the PTE
    5.13 -into its own private region so that it could be read and written.
    5.14 -This affects only commands of type PGREQ_NORMAL_UPDATE and
    5.15 -PGREQ_UNCHECKED_UPDATE.
    5.16 -
    5.17 -New hypercall 'update_va_mapping'
    5.18 ----------------------------------
    5.19 -A new high-speed page-table update method has been introduced, which
    5.20 -may be of particular benefit when fixing up application page faults.
    5.21 -Invoked as 'update_va_mapping(page_number, new_pte_value, flags)':
    5.22 - <page_number>: The virtual page number in the current address space 
    5.23 -                whose PTE is to be modified.
    5.24 - <new_pte_value>: The new value to write into the PTE.
    5.25 - <flags>: An ORed combination of
    5.26 -          UVMF_INVLPG: Flush stale TLB entry of the updated page mapping
    5.27 -          UVMF_FLUSH_TLB: Flush all TLB entries
    5.28 -You can see this new call in use in Xenolinux (common/memory.c).
    5.29 -
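
    The interface notes removed above describe 'update_va_mapping(page_number, new_pte_value, flags)' with the UVMF_INVLPG and UVMF_FLUSH_TLB flags. A minimal guest-side sketch of that call pattern follows; guest_update_va_mapping is a hypothetical stand-in for the real hypercall stub, and the flag encodings and PTE bits are illustrative assumptions (only the names come from the notes above).

    /* Illustrative sketch of the 'update_va_mapping' usage described above. */
    #include <stdio.h>

    #define PAGE_SHIFT     12
    #define UVMF_INVLPG    1UL   /* assumed encoding: flush just this mapping */
    #define UVMF_FLUSH_TLB 2UL   /* assumed encoding: flush the whole TLB     */

    /* Hypothetical stand-in for the real hypercall stub. */
    static int guest_update_va_mapping(unsigned long page_number,
                                       unsigned long new_pte_value,
                                       unsigned long flags)
    {
        /* A real guest would trap into Xen here; we just log the request. */
        printf("update_va_mapping: page %#lx pte %#lx flags %#lx\n",
               page_number, new_pte_value, flags);
        return 0;
    }

    /* Fix up a faulting mapping: point 'fault_va' at machine frame 'mfn'
     * with an illustrative present|rw|user PTE, flushing the stale entry. */
    static int fixup_fault(unsigned long fault_va, unsigned long mfn)
    {
        unsigned long pte = (mfn << PAGE_SHIFT) | 0x7UL;
        return guest_update_va_mapping(fault_va >> PAGE_SHIFT, pte, UVMF_INVLPG);
    }

    int main(void)
    {
        return fixup_fault(0x08048000UL, 0x1234UL);
    }
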
     6.1 --- a/xen/TODO	Sat Dec 20 11:49:50 2003 +0000
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,54 +0,0 @@
     6.4 -
     6.5 -This is stuff we probably want to implement in the near future.
     6.6 -
     6.7 - -- Keir (16/3/03)
     6.8 -
     6.9 -
    6.10 -1. DOMAIN-0 MANAGEMENT DAEMON
    6.11 ------------------------------
    6.12 -A better control daemon is required for domain 0, which keeps proper
    6.13 -track of machine resources and can make sensible policy choices. This
    6.14 -may require support in Xen; for example, notifications (eg. DOMn is
    6.15 -killed), and requests (eg. can DOMn allocate x frames of memory?).
    6.16 -
    6.17 -2. ASSIGNING DOMAINS TO PROCESSORS
    6.18 -----------------------------------
    6.19 -More intelligent assignment of domains to processors. In
    6.20 -particular, we don't play well with hyperthreading: we will assign
     6.21 -domains to virtual processors on the same package, rather than
    6.22 -spreading them across processor packages.
    6.23 -
    6.24 -What we need to do is port code from Linux which stores information on
    6.25 -relationships between processors in the system (eg. which ones are
    6.26 -siblings in the same package). We then use this to balance domains
    6.27 -across packages, and across virtual processors within a package.
    6.28 -
    6.29 -3. SANE NETWORK ROUTING
    6.30 ------------------------
    6.31 -The current virtual firewall/router is completely broken. Needs a new
    6.32 -design and implementation!
    6.33 -
    6.34 -
    6.35 -
    6.36 -Graveyard
    6.37 -*********
    6.38 -
    6.39 -The hypervisor page cache
    6.40 --------------------------
    6.41 -This will allow guest OSes to make use of spare pages in the system, but
    6.42 -allow them to be immediately used for any new domains or memory requests.
    6.43 -The idea is that, when a page is laundered and falls off Linux's clean_LRU
    6.44 -list, rather than freeing it it becomes a candidate for passing down into
    6.45 -the hypervisor. In return, xeno-linux may ask for one of its previously-
    6.46 -cached pages back:
    6.47 - (page, new_id) = cache_query(page, old_id);
    6.48 -If the requested page couldn't be kept, a blank page is returned.
    6.49 -When would Linux make the query? Whenever it wants a page back without
     6.50 -the delay of going to disc. Also, whenever a page would otherwise be
    6.51 -flushed to disc.
    6.52 -
    6.53 -To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL);
    6.54 - [NULL means "give me a blank page"].
    6.55 -To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id)
    6.56 - [we may request that x_page just be discarded, and therefore not impinge
    6.57 -  on this domain's cache quota].
     7.1 --- a/xen/arch/i386/Rules.mk	Sat Dec 20 11:49:50 2003 +0000
     7.2 +++ b/xen/arch/i386/Rules.mk	Sat Dec 20 12:44:39 2003 +0000
     7.3 @@ -8,8 +8,8 @@ MONITOR_BASE := 0xFC500000
     7.4  # Bootloader should load monitor to this real address
     7.5  LOAD_BASE    := 0x00100000
     7.6  CFLAGS  := -nostdinc -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE) 
     7.7 -CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
     7.8 -#CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
     7.9 +#CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
    7.10 +CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
    7.11  LDFLAGS := -T xeno.lds -N
    7.12  
    7.13  
     8.1 --- a/xen/arch/i386/apic.c	Sat Dec 20 11:49:50 2003 +0000
     8.2 +++ b/xen/arch/i386/apic.c	Sat Dec 20 12:44:39 2003 +0000
     8.3 @@ -47,7 +47,7 @@
     8.4  #include <asm/hardirq.h>
     8.5  #include <asm/apic.h>
     8.6  #include <xeno/mm.h>
     8.7 -
     8.8 +#include <asm/io_apic.h>
     8.9  #include <asm/timex.h>
    8.10  #include <xeno/ac_timer.h>
    8.11  #include <xeno/perfc.h>
     9.1 --- a/xen/arch/i386/entry.S	Sat Dec 20 11:49:50 2003 +0000
     9.2 +++ b/xen/arch/i386/entry.S	Sat Dec 20 12:44:39 2003 +0000
     9.3 @@ -82,7 +82,6 @@
     9.4  #include <xeno/config.h>
     9.5  #include <xeno/errno.h>
     9.6  #include <hypervisor-ifs/hypervisor-if.h>
     9.7 -#include <asm/smp.h>
     9.8  
     9.9  EBX		= 0x00
    9.10  ECX		= 0x04
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/xen/arch/i386/flushtlb.c	Sat Dec 20 12:44:39 2003 +0000
    10.3 @@ -0,0 +1,64 @@
    10.4 +/******************************************************************************
    10.5 + * flushtlb.c
    10.6 + * 
    10.7 + * TLB flushes are timestamped using a global virtual 'clock' which ticks
    10.8 + * on any TLB flush on any processor.
    10.9 + * 
   10.10 + * Copyright (c) 2003, K A Fraser
   10.11 + */
   10.12 +
   10.13 +#include <xeno/config.h>
   10.14 +#include <xeno/sched.h>
   10.15 +#include <asm/flushtlb.h>
   10.16 +
   10.17 +unsigned long tlbflush_mask;
   10.18 +unsigned long tlbflush_clock;
   10.19 +unsigned long tlbflush_time[NR_CPUS];
   10.20 +
   10.21 +static inline void tlb_clocktick(unsigned int cpu)
   10.22 +{
   10.23 +    unsigned long x, nx, y, ny;
   10.24 +    
   10.25 +    clear_bit(cpu, &tlbflush_mask);
   10.26 +
   10.27 +    /* Tick the clock. 'y' contains the current time after the tick. */
   10.28 +    ny = tlbflush_clock;
   10.29 +    do {
   10.30 +#ifdef CONFIG_SMP
   10.31 +        if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
   10.32 +        {
   10.33 +            new_tlbflush_clock_period();
   10.34 +            y = tlbflush_clock;
   10.35 +            break;
   10.36 +        }
   10.37 +#else
   10.38 +        y = ny+1;
   10.39 +#endif
   10.40 +    }
   10.41 +    while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) );
   10.42 +
   10.43 +    /* Update cpu's timestamp to current time, unless someone else beats us. */
   10.44 +    nx = tlbflush_time[cpu];
   10.45 +    do { 
   10.46 +        if ( unlikely((x = nx) >= y) )
   10.47 +            break;
   10.48 +    }
   10.49 +    while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) );
   10.50 +}
   10.51 +
   10.52 +void write_cr3_counted(unsigned long pa)
   10.53 +{
   10.54 +    __asm__ __volatile__ ( 
   10.55 +        "movl %0, %%cr3"
   10.56 +        : : "r" (pa) : "memory" );
   10.57 +    tlb_clocktick(smp_processor_id());
   10.58 +}
   10.59 +
   10.60 +void flush_tlb_counted(void)
   10.61 +{
   10.62 +    __asm__ __volatile__ ( 
   10.63 +        "movl %%cr3, %%eax; movl %%eax, %%cr3"
   10.64 +        : : : "memory", "eax" );
   10.65 +    tlb_clocktick(smp_processor_id());
   10.66 +}
   10.67 +
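
    The flush clock introduced above is consumed when deciding whether a freed page can be reused without sending IPIs (see alloc_domain_page in xen/common/domain.c later in this changeset): a CPU in the page's cpu_mask needs flushing only if it has not flushed its TLB since the page's timestamp. A minimal sketch of that check follows; the wrap-around handling of the real NEED_FLUSH macro is omitted and the comparison direction is an assumption based on the usage shown below.

    /* Illustrative sketch of consuming the flush clock when reusing a page. */
    #include <stdio.h>

    #define NR_CPUS 4

    static unsigned long tlbflush_time_demo[NR_CPUS];  /* per-CPU last-flush time */

    /* Simplified stand-in for NEED_FLUSH(cpu_stamp, page_stamp). */
    static int need_flush(unsigned long cpu_stamp, unsigned long page_stamp)
    {
        return cpu_stamp <= page_stamp;
    }

    /* Return the subset of 'mask' that must be IPI'd before a page freed at
     * 'page_stamp' can safely be handed to a new owner. */
    static unsigned long cpus_needing_flush(unsigned long mask, unsigned long page_stamp)
    {
        unsigned long need = 0;
        for (int i = 0; i < NR_CPUS; i++)
            if ((mask & (1UL << i)) && need_flush(tlbflush_time_demo[i], page_stamp))
                need |= 1UL << i;
        return need;
    }

    int main(void)
    {
        tlbflush_time_demo[0] = 10;  /* CPU0 flushed at clock time 10          */
        tlbflush_time_demo[1] = 3;   /* CPU1 has not flushed since clock time 3 */
        printf("must flush mask: %#lx\n", cpus_needing_flush(0x3, 5));  /* 0x2 */
        return 0;
    }
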
    11.1 --- a/xen/arch/i386/io_apic.c	Sat Dec 20 11:49:50 2003 +0000
    11.2 +++ b/xen/arch/i386/io_apic.c	Sat Dec 20 12:44:39 2003 +0000
    11.3 @@ -28,6 +28,8 @@
    11.4  #include <xeno/config.h>
    11.5  #include <asm/mc146818rtc.h>
    11.6  #include <asm/io.h>
    11.7 +#include <asm/mpspec.h>
    11.8 +#include <asm/io_apic.h>
    11.9  #include <asm/smp.h>
   11.10  #include <asm/desc.h>
   11.11  #include <asm/smpboot.h>
    12.1 --- a/xen/arch/i386/ioremap.c	Sat Dec 20 11:49:50 2003 +0000
    12.2 +++ b/xen/arch/i386/ioremap.c	Sat Dec 20 12:44:39 2003 +0000
    12.3 @@ -15,92 +15,50 @@
    12.4  #include <asm/pgalloc.h>
    12.5  #include <asm/page.h>
    12.6  
    12.7 -static unsigned long remap_base = 0;
    12.8 +static unsigned long remap_base = IOREMAP_VIRT_START;
    12.9  
   12.10  #define PAGE_ALIGN(addr)    (((addr)+PAGE_SIZE-1)&PAGE_MASK)
   12.11  
   12.12 -static void new_l2e(l2_pgentry_t *pl2e)
   12.13 -{
   12.14 -    l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
   12.15 -    if ( !pl1e ) BUG();
   12.16 -    clear_page(pl1e);
   12.17 -    *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR);
   12.18 -}
   12.19 -
   12.20 -
   12.21 -void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
   12.22 +void * __ioremap(unsigned long phys_addr, 
   12.23 +                 unsigned long size, 
   12.24 +                 unsigned long flags)
   12.25  {
   12.26      unsigned long vaddr;
   12.27      unsigned long offset, cur=0, last_addr;
   12.28      l2_pgentry_t *pl2e;
   12.29      l1_pgentry_t *pl1e;
   12.30  
   12.31 -    /* First time through, start allocating from far end of virtual memory. */
   12.32 -    if ( !remap_base ) remap_base = IOREMAP_VIRT_START;
   12.33 -
   12.34      /* Don't allow wraparound or zero size */
   12.35      last_addr = phys_addr + size - 1;
   12.36 -    if (!size || last_addr < phys_addr)
   12.37 +    if ( (size == 0) || (last_addr < phys_addr) )
   12.38          return NULL;
   12.39  
   12.40 -    /*
   12.41 -     * Don't remap the low PCI/ISA area, it's always mapped..
   12.42 -     */
   12.43 -    if (phys_addr >= 0xA0000 && last_addr < 0x100000)
   12.44 +    /* Don't remap the low PCI/ISA area: it's always mapped. */
   12.45 +    if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) )
   12.46          return phys_to_virt(phys_addr);
   12.47  
   12.48 -    if(remap_base + size > IOREMAP_VIRT_END-1) {
   12.49 -      printk("ioremap: going past end of reserved space!\n");
   12.50 -      return NULL;
   12.51 +    if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) )
   12.52 +    {
   12.53 +        printk("ioremap: going past end of reserved space!\n");
   12.54 +        return NULL;
   12.55      }
   12.56 -#if 0
   12.57 -    /*
   12.58 -     * Don't allow anybody to remap normal RAM that we're using..
   12.59 -     */
   12.60 -    if (phys_addr < virt_to_phys(high_memory)) {
   12.61 -        char *t_addr, *t_end;
   12.62 -        struct pfn_info *page;
   12.63  
   12.64 -        t_addr = __va(phys_addr);
   12.65 -        t_end = t_addr + (size - 1);
   12.66 -	   
   12.67 -        for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
   12.68 -            if(!PageReserved(page))
   12.69 -                return NULL;
   12.70 -    }
   12.71 -#endif
   12.72 -
   12.73 -    /*
   12.74 -     * Mappings have to be page-aligned
   12.75 -     */
   12.76 +    /* Mappings have to be page-aligned. */
   12.77      offset = phys_addr & ~PAGE_MASK;
   12.78      phys_addr &= PAGE_MASK;
   12.79      size = PAGE_ALIGN(last_addr) - phys_addr;
   12.80  
   12.81 -    /*
   12.82 -     * Ok, go for it..
   12.83 -     */
   12.84 +    /* Ok, go for it. */
   12.85      vaddr = remap_base;
   12.86      remap_base += size;
   12.87      pl2e = &idle_pg_table[l2_table_offset(vaddr)];
   12.88 -    if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
   12.89      pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr);
   12.90 -    for ( ; ; ) 
   12.91 -    {
   12.92 -        if ( !l1_pgentry_empty(*pl1e) ) BUG();
   12.93 +    do {
   12.94          *pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags);
   12.95 -        cur += PAGE_SIZE;
   12.96 -        if ( cur == size ) break;
   12.97 -        if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) )
   12.98 -        {
   12.99 -            if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
  12.100 -            pl1e = l2_pgentry_to_l1(*pl2e++);        
  12.101 -        }
  12.102      }
  12.103 +    while ( (cur += PAGE_SIZE) != size );
  12.104  
  12.105 -    flush_tlb_all();
  12.106 -
  12.107 -    return (void *) (offset + (char *)vaddr);
  12.108 +    return (void *)(offset + (char *)vaddr);
  12.109  }
  12.110  
  12.111  void iounmap(void *addr)
    13.1 --- a/xen/arch/i386/irq.c	Sat Dec 20 11:49:50 2003 +0000
    13.2 +++ b/xen/arch/i386/irq.c	Sat Dec 20 12:44:39 2003 +0000
    13.3 @@ -24,7 +24,8 @@
    13.4  #include <xeno/interrupt.h>
    13.5  #include <xeno/irq.h>
    13.6  #include <xeno/slab.h>
    13.7 -
    13.8 +#include <asm/mpspec.h>
    13.9 +#include <asm/io_apic.h>
   13.10  #include <asm/msr.h>
   13.11  #include <asm/hardirq.h>
   13.12  #include <asm/ptrace.h>
    14.1 --- a/xen/arch/i386/mm.c	Sat Dec 20 11:49:50 2003 +0000
    14.2 +++ b/xen/arch/i386/mm.c	Sat Dec 20 12:44:39 2003 +0000
    14.3 @@ -27,8 +27,8 @@
    14.4  #include <asm/fixmap.h>
    14.5  #include <asm/domain_page.h>
    14.6  
    14.7 -static inline void set_pte_phys (unsigned long vaddr,
    14.8 -                                 l1_pgentry_t entry)
    14.9 +static inline void set_pte_phys(unsigned long vaddr,
   14.10 +                                l1_pgentry_t entry)
   14.11  {
   14.12      l2_pgentry_t *l2ent;
   14.13      l1_pgentry_t *l1ent;
   14.14 @@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigne
   14.15      __flush_tlb_one(vaddr);
   14.16  }
   14.17  
   14.18 -void __set_fixmap (enum fixed_addresses idx, 
   14.19 -                   l1_pgentry_t entry)
   14.20 +
   14.21 +void __set_fixmap(enum fixed_addresses idx, 
   14.22 +                  l1_pgentry_t entry)
   14.23  {
   14.24      unsigned long address = __fix_to_virt(idx);
   14.25  
   14.26 -    if (idx >= __end_of_fixed_addresses) {
   14.27 +    if ( likely(idx < __end_of_fixed_addresses) )
   14.28 +        set_pte_phys(address, entry);
   14.29 +    else
   14.30          printk("Invalid __set_fixmap\n");
   14.31 -        return;
   14.32 -    }
   14.33 -    set_pte_phys(address, entry);
   14.34  }
   14.35  
   14.36 -static void __init fixrange_init (unsigned long start, 
   14.37 -                                  unsigned long end, l2_pgentry_t *pg_base)
   14.38 +
   14.39 +static void __init fixrange_init(unsigned long start, 
   14.40 +                                 unsigned long end, 
   14.41 +                                 l2_pgentry_t *pg_base)
   14.42  {
   14.43      l2_pgentry_t *l2e;
   14.44      int i;
   14.45 @@ -66,7 +68,8 @@ static void __init fixrange_init (unsign
   14.46  
   14.47      for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ ) 
   14.48      {
   14.49 -        if ( !l2_pgentry_empty(*l2e) ) continue;
   14.50 +        if ( !l2_pgentry_empty(*l2e) )
   14.51 +            continue;
   14.52          page = (unsigned long)get_free_page(GFP_KERNEL);
   14.53          clear_page(page);
   14.54          *l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR);
   14.55 @@ -79,11 +82,6 @@ void __init paging_init(void)
   14.56      unsigned long addr;
   14.57      void *ioremap_pt;
   14.58  
   14.59 -    /* XXX initialised in boot.S */
   14.60 -    /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/
   14.61 -    /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/
   14.62 -    /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/
   14.63 -
   14.64      /*
   14.65       * Fixed mappings, only the page table structure has to be
   14.66       * created - mappings will be set by set_fixmap():
   14.67 @@ -115,12 +113,12 @@ void __init paging_init(void)
   14.68  
   14.69  }
   14.70  
   14.71 -void __init zap_low_mappings (void)
   14.72 +void __init zap_low_mappings(void)
   14.73  {
   14.74      int i;
   14.75      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   14.76          idle_pg_table[i] = mk_l2_pgentry(0);
   14.77 -    flush_tlb_all();
   14.78 +    flush_tlb_all_pge();
   14.79  }
   14.80  
   14.81  
   14.82 @@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p,
   14.83               unsigned int entries)
   14.84  {
   14.85      /* NB. There are 512 8-byte entries per GDT page. */
   14.86 -    unsigned int i, j, nr_pages = (entries + 511) / 512;
   14.87 -    unsigned long pfn, *gdt_page;
   14.88 -    long ret = -EINVAL;
   14.89 -    struct pfn_info *page;
   14.90 +    int i, nr_pages = (entries + 511) / 512;
   14.91 +    unsigned long pfn;
   14.92      struct desc_struct *vgdt;
   14.93  
   14.94 -    spin_lock(&p->page_lock);
   14.95 -
   14.96      /* Check the new GDT. */
   14.97      for ( i = 0; i < nr_pages; i++ )
   14.98      {
   14.99 -        if ( frames[i] >= max_page ) 
  14.100 -            goto out;
  14.101 -        
  14.102 -        page = frame_table + frames[i];
  14.103 -        if ( (page->flags & PG_domain_mask) != p->domain )
  14.104 -            goto out;
  14.105 -
  14.106 -        if ( (page->flags & PG_type_mask) != PGT_gdt_page )
  14.107 -        {
  14.108 -            if ( page_type_count(page) != 0 )
  14.109 -                goto out;
  14.110 -
  14.111 -            /* Check all potential GDT entries in the page. */
  14.112 -            gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT);
  14.113 -            for ( j = 0; j < 512; j++ )
  14.114 -                if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) )
  14.115 -                    goto out;
  14.116 -            unmap_domain_mem(gdt_page);
  14.117 -        }
  14.118 -    }
  14.119 -
  14.120 -    /* Tear down the old GDT. */
  14.121 -    for ( i = 0; i < 16; i++ )
  14.122 -    {
  14.123 -        pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]);
  14.124 -        p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
  14.125 -        if ( pfn == 0 ) continue;
  14.126 -        page = frame_table + pfn;
  14.127 -        ASSERT((page->flags & PG_type_mask) == PGT_gdt_page);
  14.128 -        ASSERT((page->flags & PG_domain_mask) == p->domain);
  14.129 -        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
  14.130 -        put_page_type(page);
  14.131 -        put_page_tot(page);
  14.132 -    }
  14.133 -
  14.134 -    /* Install the new GDT. */
  14.135 -    for ( i = 0; i < nr_pages; i++ )
  14.136 -    {
  14.137 -        p->mm.perdomain_pt[i] =
  14.138 -            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  14.139 -        
  14.140 -        page = frame_table + frames[i];
  14.141 -        page->flags &= ~(PG_type_mask | PG_need_flush);
  14.142 -        page->flags |= PGT_gdt_page;
  14.143 -        get_page_type(page);
  14.144 -        get_page_tot(page);
  14.145 +        if ( unlikely(frames[i] >= max_page) ||
  14.146 +             unlikely(!get_page_and_type(&frame_table[frames[i]], 
  14.147 +                                         p, PGT_gdt_page)) )
  14.148 +            goto fail;
  14.149      }
  14.150  
  14.151      /* Copy reserved GDT entries to the new GDT. */
  14.152 -    vgdt = map_domain_mem(frames[i] << PAGE_SHIFT);
  14.153 +    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
  14.154      memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
  14.155             gdt_table + FIRST_RESERVED_GDT_ENTRY, 
  14.156             NR_RESERVED_GDT_ENTRIES*8);
  14.157      unmap_domain_mem(vgdt);
  14.158  
  14.159 +    /* Tear down the old GDT. */
  14.160 +    for ( i = 0; i < 16; i++ )
  14.161 +    {
  14.162 +        if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 )
  14.163 +            put_page_and_type(&frame_table[pfn]);
  14.164 +        p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
  14.165 +    }
  14.166 +
  14.167 +    /* Install the new GDT. */
  14.168 +    for ( i = 0; i < nr_pages; i++ )
  14.169 +        p->mm.perdomain_pt[i] =
  14.170 +            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  14.171 +
  14.172      SET_GDT_ADDRESS(p, GDT_VIRT_START);
  14.173      SET_GDT_ENTRIES(p, (entries*8)-1);
  14.174  
  14.175 -    ret = 0; /* success */
  14.176 +    return 0;
  14.177  
  14.178 - out:
  14.179 -    spin_unlock(&p->page_lock);
  14.180 -    return ret;
  14.181 + fail:
  14.182 +    while ( i-- > 0 )
  14.183 +        put_page_and_type(&frame_table[frames[i]]);
  14.184 +    return -EINVAL;
  14.185  }
  14.186  
  14.187  
  14.188  long do_set_gdt(unsigned long *frame_list, unsigned int entries)
  14.189  {
  14.190 -    unsigned int nr_pages = (entries + 511) / 512;
  14.191 +    int nr_pages = (entries + 511) / 512;
  14.192      unsigned long frames[16];
  14.193      long ret;
  14.194  
  14.195 @@ -321,14 +287,12 @@ long do_update_descriptor(
  14.196      if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
  14.197          return -EINVAL;
  14.198  
  14.199 -    spin_lock(&current->page_lock);
  14.200 -
  14.201 -    page = frame_table + pfn;
  14.202 -    if ( (page->flags & PG_domain_mask) != current->domain )
  14.203 +    page = &frame_table[pfn];
  14.204 +    if ( unlikely(!get_page(page, current)) )
  14.205          goto out;
  14.206  
  14.207      /* Check if the given frame is in use in an unsafe context. */
  14.208 -    switch ( (page->flags & PG_type_mask) )
  14.209 +    switch ( page->type_and_flags & PGT_type_mask )
  14.210      {
  14.211      case PGT_gdt_page:
  14.212          /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  14.213 @@ -336,12 +300,17 @@ long do_update_descriptor(
  14.214               (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  14.215               (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  14.216              goto out;
  14.217 +        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
  14.218 +            goto out;
  14.219 +        break;
  14.220      case PGT_ldt_page:
  14.221 -    case PGT_writeable_page:
  14.222 +        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
  14.223 +            goto out;
  14.224          break;
  14.225      default:
  14.226 -        if ( page_type_count(page) != 0 )
  14.227 +        if ( unlikely(!get_page_type(page, PGT_writeable_page)) )
  14.228              goto out;
  14.229 +        break;
  14.230      }
  14.231  
  14.232      /* All is good so make the update. */
  14.233 @@ -350,9 +319,11 @@ long do_update_descriptor(
  14.234      gdt_pent[1] = word2;
  14.235      unmap_domain_mem(gdt_pent);
  14.236  
  14.237 +    put_page_type(page);
  14.238 +
  14.239      ret = 0; /* success */
  14.240  
  14.241   out:
  14.242 -    spin_unlock(&current->page_lock);
  14.243 +    put_page(page);
  14.244      return ret;
  14.245  }
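
    The rewritten set_gdt above replaces the explicit flag checks and page_lock with the new reference-counting helpers: every frame gets a typed reference up front via get_page_and_type(), and the failure path releases exactly the references already taken. A simplified sketch of that acquire/unwind pattern follows; the refcount helpers here are stand-ins for get_page_and_type()/put_page_and_type(), not the real frame-table code.

    /* Illustrative sketch of the pin-all-or-unwind pattern used by set_gdt(). */
    #include <stdio.h>
    #include <errno.h>

    #define MAX_FRAMES 8
    static int typed_refs[MAX_FRAMES];   /* stand-in per-frame typed refcount */

    static int get_page_and_type_demo(unsigned long frame)
    {
        if (frame >= MAX_FRAMES)         /* stand-in for validation failure */
            return 0;
        typed_refs[frame]++;
        return 1;
    }

    static void put_page_and_type_demo(unsigned long frame)
    {
        typed_refs[frame]--;
    }

    /* Validate and pin all frames, or fail having released everything taken. */
    static long pin_gdt_frames(const unsigned long *frames, int nr_pages)
    {
        int i;
        for (i = 0; i < nr_pages; i++)
            if (!get_page_and_type_demo(frames[i]))
                goto fail;
        return 0;
     fail:
        while (i-- > 0)
            put_page_and_type_demo(frames[i]);
        return -EINVAL;
    }

    int main(void)
    {
        unsigned long good[2] = {1, 2}, bad[2] = {3, 99};
        long ok  = pin_gdt_frames(good, 2);  /* 0: both frames pinned              */
        long err = pin_gdt_frames(bad, 2);   /* -EINVAL: frame 99 rejected and the */
                                             /* pin taken on frame 3 is undone     */
        printf("ok=%ld err=%ld refs[1]=%d refs[3]=%d\n",
               ok, err, typed_refs[1], typed_refs[3]);
        return 0;
    }
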
    15.1 --- a/xen/arch/i386/pci-irq.c	Sat Dec 20 11:49:50 2003 +0000
    15.2 +++ b/xen/arch/i386/pci-irq.c	Sat Dec 20 12:44:39 2003 +0000
    15.3 @@ -6,16 +6,15 @@
    15.4  
    15.5  #include <linux/config.h>
    15.6  #include <linux/types.h>
    15.7 -/*#include <linux/kernel.h>*/
    15.8  #include <linux/pci.h>
    15.9  #include <linux/init.h>
   15.10  #include <linux/slab.h>
   15.11  #include <linux/interrupt.h>
   15.12  #include <linux/irq.h>
   15.13  #include <linux/sched.h>
   15.14 -
   15.15  #include <asm/io.h>
   15.16  #include <asm/smp.h>
   15.17 +#include <asm/mpspec.h>
   15.18  #include <asm/io_apic.h>
   15.19  
   15.20  #include "pci-i386.h"
    16.1 --- a/xen/arch/i386/process.c	Sat Dec 20 11:49:50 2003 +0000
    16.2 +++ b/xen/arch/i386/process.c	Sat Dec 20 12:44:39 2003 +0000
    16.3 @@ -27,6 +27,7 @@
    16.4  #include <asm/processor.h>
    16.5  #include <asm/desc.h>
    16.6  #include <asm/i387.h>
    16.7 +#include <asm/mpspec.h>
    16.8  
    16.9  #include <xeno/irq.h>
   16.10  #include <xeno/event.h>
   16.11 @@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_
   16.12      tss->ss1  = next->ss1;
   16.13  
   16.14      /* Switch page tables.  */
   16.15 -    __write_cr3_counted(pagetable_val(next_p->mm.pagetable));
   16.16 +    write_cr3_counted(pagetable_val(next_p->mm.pagetable));
   16.17  
   16.18      set_current(next_p);
   16.19  
    17.1 --- a/xen/arch/i386/smp.c	Sat Dec 20 11:49:50 2003 +0000
    17.2 +++ b/xen/arch/i386/smp.c	Sat Dec 20 12:44:39 2003 +0000
    17.3 @@ -16,6 +16,7 @@
    17.4  #include <asm/mc146818rtc.h>
    17.5  #include <asm/pgalloc.h>
    17.6  #include <asm/smpboot.h>
    17.7 +#include <asm/hardirq.h>
    17.8  
    17.9  #ifdef CONFIG_SMP
   17.10  
   17.11 @@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_L
   17.12  asmlinkage void smp_invalidate_interrupt(void)
   17.13  {
   17.14      ack_APIC_irq();
   17.15 -    if (test_and_clear_bit(smp_processor_id(), &flush_cpumask))
   17.16 -        local_flush_tlb();
   17.17 +    clear_bit(smp_processor_id(), &flush_cpumask);
   17.18 +    local_flush_tlb();
   17.19  }
   17.20  
   17.21 -void flush_tlb_others(unsigned long cpumask)
   17.22 +void flush_tlb_mask(unsigned long mask)
   17.23  {
   17.24 -    spin_lock(&tlbstate_lock);
   17.25 -    atomic_set_mask(cpumask, &flush_cpumask);
   17.26 -    send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
   17.27 -    while (flush_cpumask) continue;
   17.28 +    if ( unlikely(in_irq()) )
   17.29 +        BUG();
   17.30 +    
   17.31 +    if ( mask & (1 << smp_processor_id()) )
   17.32 +    {
   17.33 +        local_flush_tlb();
   17.34 +        mask &= ~(1 << smp_processor_id());
   17.35 +    }
   17.36 +
   17.37 +    if ( mask != 0 )
   17.38 +    {
   17.39 +        spin_lock(&tlbstate_lock);
   17.40 +        flush_cpumask = mask;
   17.41 +        send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
   17.42 +        while ( flush_cpumask != 0 )
   17.43 +        {
   17.44 +            rep_nop();
   17.45 +            barrier();
   17.46 +        }
   17.47 +        spin_unlock(&tlbstate_lock);
   17.48 +    }
   17.49 +}
   17.50 +
   17.51 +void new_tlbflush_clock_period(void)
   17.52 +{
   17.53 +    if ( unlikely(!spin_trylock(&tlbstate_lock)) )
   17.54 +        return;
   17.55 +
   17.56 +    if ( unlikely((flush_cpumask = tlbflush_mask) != 0) )
   17.57 +    {
   17.58 +        send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
   17.59 +        while ( flush_cpumask != 0 )
   17.60 +        {
   17.61 +            rep_nop();
   17.62 +            barrier();
   17.63 +        }
   17.64 +    }
   17.65 +
   17.66 +    /* No need for cmpxchg updates here: we are protected by tlbstate lock. */
   17.67 +    tlbflush_mask = (1 << smp_num_cpus) - 1;
   17.68 +    wmb(); /* Reset the mask before allowing the clock to continue ticking. */
   17.69 +    tlbflush_clock++;
   17.70 +
   17.71      spin_unlock(&tlbstate_lock);
   17.72  }
   17.73 -	
   17.74 -static inline void do_flush_tlb_all_local(void)
   17.75 +
   17.76 +static void flush_tlb_all_pge_ipi(void* info)
   17.77  {
   17.78 -    __flush_tlb_all();
   17.79 +    __flush_tlb_pge();
   17.80  }
   17.81  
   17.82 -static void flush_tlb_all_ipi(void* info)
   17.83 +void flush_tlb_all_pge(void)
   17.84  {
   17.85 -    do_flush_tlb_all_local();
   17.86 -}
   17.87 -
   17.88 -void flush_tlb_all(void)
   17.89 -{
   17.90 -    smp_call_function (flush_tlb_all_ipi,0,1,1);
   17.91 -
   17.92 -    do_flush_tlb_all_local();
   17.93 +    smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
   17.94 +    __flush_tlb_pge();
   17.95  }
   17.96  
   17.97  void smp_send_event_check_mask(unsigned long cpu_mask)
    18.1 --- a/xen/arch/i386/smpboot.c	Sat Dec 20 11:49:50 2003 +0000
    18.2 +++ b/xen/arch/i386/smpboot.c	Sat Dec 20 12:44:39 2003 +0000
    18.3 @@ -44,6 +44,8 @@
    18.4  #include <xeno/smp.h>
    18.5  #include <asm/msr.h>
    18.6  #include <asm/system.h>
    18.7 +#include <asm/mpspec.h>
    18.8 +#include <asm/io_apic.h>
    18.9  #include <xeno/sched.h>
   18.10  #include <xeno/delay.h>
   18.11  #include <xeno/lib.h>
    19.1 --- a/xen/arch/i386/traps.c	Sat Dec 20 11:49:50 2003 +0000
    19.2 +++ b/xen/arch/i386/traps.c	Sat Dec 20 12:44:39 2003 +0000
    19.3 @@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, c
    19.4  
    19.5      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
    19.6      {
    19.7 +        DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup);
    19.8          regs->eip = fixup;
    19.9          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   19.10          return;
   19.11 @@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_
   19.12  
   19.13      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   19.14      {
   19.15 +        DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
   19.16          regs->eip = fixup;
   19.17          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   19.18          return;
   19.19 @@ -411,6 +413,7 @@ asmlinkage void do_general_protection(st
   19.20  
   19.21      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   19.22      {
   19.23 +        DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup);
   19.24          regs->eip = fixup;
   19.25          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   19.26          return;
    20.1 --- a/xen/common/dom0_ops.c	Sat Dec 20 11:49:50 2003 +0000
    20.2 +++ b/xen/common/dom0_ops.c	Sat Dec 20 12:44:39 2003 +0000
    20.3 @@ -38,31 +38,6 @@ static unsigned int get_domnr(void)
    20.4      return 0;
    20.5  }
    20.6  
    20.7 -static void build_page_list(struct task_struct *p)
    20.8 -{
    20.9 -    unsigned long *list;
   20.10 -    unsigned long curr;
   20.11 -    struct list_head *list_ent;
   20.12 -
   20.13 -    curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table;
   20.14 -    list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
   20.15 -
   20.16 -    list_for_each(list_ent, &p->pg_head)
   20.17 -    {
   20.18 -        *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table;
   20.19 -
   20.20 -        if( ((unsigned long)list & ~PAGE_MASK) == 0 )
   20.21 -        {
   20.22 -            struct list_head *ent = frame_table[curr].list.next;
   20.23 -            curr = list_entry(ent, struct pfn_info, list) - frame_table;
   20.24 -            unmap_domain_mem(list-1);
   20.25 -            list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
   20.26 -        }
   20.27 -    }
   20.28 -
   20.29 -    unmap_domain_mem(list);
   20.30 -}
   20.31 -
   20.32  static int msr_cpu_mask;
   20.33  static unsigned long msr_addr;
   20.34  static unsigned long msr_lo;
   20.35 @@ -163,8 +138,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   20.36              goto exit_create;
   20.37          }
   20.38  
   20.39 -        build_page_list(p);
   20.40 -        
   20.41          ret = p->domain;
   20.42          
   20.43          op.u.createdomain.domain = ret;
   20.44 @@ -246,7 +219,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   20.45      case DOM0_GETMEMLIST:
   20.46      {
   20.47          int i;
   20.48 -        struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain);
   20.49 +        struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain);
   20.50          unsigned long max_pfns = op.u.getmemlist.max_pfns;
   20.51          unsigned long pfn;
   20.52          unsigned long *buffer = op.u.getmemlist.buffer;
   20.53 @@ -255,28 +228,27 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   20.54          ret = -EINVAL;
   20.55          if ( p != NULL )
   20.56          {
   20.57 -            list_ent = p->pg_head.next;
   20.58 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
   20.59 -            
   20.60 -            for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ )
   20.61 +            ret = 0;
   20.62 +
   20.63 +            spin_lock(&p->page_list_lock);
   20.64 +            list_ent = p->page_list.next;
   20.65 +            for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ )
   20.66              {
   20.67 +                pfn = list_entry(list_ent, struct pfn_info, list) - 
   20.68 +                    frame_table;
   20.69                  if ( put_user(pfn, buffer) )
   20.70                  {
   20.71                      ret = -EFAULT;
   20.72 -                    goto out_getmemlist;
   20.73 +                    break;
   20.74                  }
   20.75                  buffer++;
   20.76                  list_ent = frame_table[pfn].list.next;
   20.77 -                pfn = list_entry(list_ent, struct pfn_info, list) - 
   20.78 -                    frame_table;
   20.79              }
   20.80 +            spin_unlock(&p->page_list_lock);
   20.81  
   20.82              op.u.getmemlist.num_pfns = i;
   20.83              copy_to_user(u_dom0_op, &op, sizeof(op));
   20.84 -
   20.85 -            ret = 0;
   20.86 -
   20.87 -        out_getmemlist:
   20.88 +            
   20.89              put_task_struct(p);
   20.90          }
   20.91      }
   20.92 @@ -369,21 +341,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   20.93      {
   20.94          struct pfn_info *page;
   20.95          unsigned long pfn = op.u.getpageframeinfo.pfn;
   20.96 -        
   20.97 -        if ( pfn >= max_page )
   20.98 -        {
   20.99 -            ret = -EINVAL;
  20.100 -        }
  20.101 -        else
  20.102 +        unsigned int dom = op.u.getpageframeinfo.domain;
  20.103 +        struct task_struct *p;
  20.104 +
  20.105 +        ret = -EINVAL;
  20.106 +
  20.107 +        if ( unlikely(pfn >= max_page) || 
  20.108 +             unlikely((p = find_domain_by_id(dom)) == NULL) )
  20.109 +            break;
  20.110 +
  20.111 +        page = &frame_table[pfn];
  20.112 +
  20.113 +        if ( likely(get_page(page, p)) )
  20.114          {
  20.115 -            page = frame_table + pfn;
  20.116 -            
  20.117 -            op.u.getpageframeinfo.domain = page->flags & PG_domain_mask;
  20.118 -            op.u.getpageframeinfo.type   = NONE;
  20.119 +            op.u.getpageframeinfo.type = NONE;
  20.120  
  20.121 -            if ( page_type_count(page) != 0 )
  20.122 +            if ( (page->type_and_flags & PGT_count_mask) != 0 )
  20.123              {
  20.124 -                switch ( page->flags & PG_type_mask )
  20.125 +                switch ( page->type_and_flags & PGT_type_mask )
  20.126                  {
  20.127                  case PGT_l1_page_table:
  20.128                      op.u.getpageframeinfo.type = L1TAB;
  20.129 @@ -393,9 +368,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
  20.130                      break;
  20.131                  }
  20.132              }
  20.133 +            
  20.134 +            put_page(page);
  20.135 +        }
  20.136  
  20.137 -            copy_to_user(u_dom0_op, &op, sizeof(op));
  20.138 -        }
  20.139 +        put_task_struct(p);
  20.140 +
  20.141 +        copy_to_user(u_dom0_op, &op, sizeof(op));
  20.142      }
  20.143      break;
  20.144  
    21.1 --- a/xen/common/dom_mem_ops.c	Sat Dec 20 11:49:50 2003 +0000
    21.2 +++ b/xen/common/dom_mem_ops.c	Sat Dec 20 12:44:39 2003 +0000
    21.3 @@ -16,58 +16,26 @@
    21.4  #include <xeno/event.h>
    21.5  #include <asm/domain_page.h>
    21.6  
    21.7 -#if 0
    21.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
    21.9 -#else
   21.10 -#define DPRINTK(_f, _a...) ((void)0)
   21.11 -#endif
   21.12 -
   21.13  static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
   21.14  {
   21.15 -    struct list_head *temp;
   21.16 -    struct pfn_info  *pf;     /* pfn_info of current page */
   21.17 +    struct pfn_info  *page;
   21.18      unsigned long     mpfn;   /* machine frame number of current page */
   21.19      void             *va;     /* Xen-usable mapping of current page */
   21.20      unsigned long     i;
   21.21 -    unsigned long     flags;
   21.22  
   21.23 -    /*
   21.24 -     * POLICY DECISION: Each domain has a page limit.
   21.25 -     * NB. The first part of test is because op.size could be so big that
   21.26 -     * tot_pages + op.size overflows a u_long.
   21.27 -     */
   21.28 -    if( (op.size > p->max_pages) ||
   21.29 -        ((p->tot_pages + op.size) > p->max_pages) )
   21.30 -        return -ENOMEM;
   21.31 -
   21.32 -    spin_lock_irqsave(&free_list_lock, flags);
   21.33 -
   21.34 -    if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >> 
   21.35 -                                  (PAGE_SHIFT-10))) ) 
   21.36 -    {
   21.37 -        spin_unlock_irqrestore(&free_list_lock, flags);
   21.38 -        return -ENOMEM;
   21.39 -    }
   21.40 -
   21.41 -    spin_lock(&p->page_lock);
   21.42 -    
   21.43 -    temp = free_list.next;
   21.44      for ( i = 0; i < op.size; i++ )
   21.45      {
   21.46 -        /* Get a free page and add it to the domain's page list. */
   21.47 -        pf = list_entry(temp, struct pfn_info, list);
   21.48 -        pf->flags |= p->domain;
   21.49 -        set_page_type_count(pf, 0);
   21.50 -        set_page_tot_count(pf, 0);
   21.51 -        temp = temp->next;
   21.52 -        list_del(&pf->list);
   21.53 -        list_add_tail(&pf->list, &p->pg_head);
   21.54 -        free_pfns--;
   21.55 +        /* Leave some slack pages; e.g., for the network. */
   21.56 +        if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
   21.57 +                                   (PAGE_SHIFT-10))) ) 
   21.58 +            break;
   21.59  
   21.60 -        p->tot_pages++;
   21.61 -
   21.62 +        /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
   21.63 +        if ( unlikely((page = alloc_domain_page(p)) == NULL) )
   21.64 +            break;
   21.65 +        
   21.66          /* Inform the domain of the new page's machine address. */ 
   21.67 -        mpfn = (unsigned long)(pf - frame_table);
   21.68 +        mpfn = (unsigned long)(page - frame_table);
   21.69          copy_to_user(op.pages, &mpfn, sizeof(mpfn));
   21.70          op.pages++; 
   21.71  
   21.72 @@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_st
   21.73          unmap_domain_mem(va);
   21.74      }
   21.75  
   21.76 -    spin_unlock(&p->page_lock);
   21.77 -    spin_unlock_irqrestore(&free_list_lock, flags);
   21.78 -    
   21.79 -    return op.size;
   21.80 +    return i;
   21.81  }
   21.82      
   21.83  static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
   21.84  {
   21.85 -    struct list_head *temp;
   21.86 -    struct pfn_info  *pf;     /* pfn_info of current page */
   21.87 +    struct pfn_info  *page;
   21.88      unsigned long     mpfn;   /* machine frame number of current page */
   21.89      unsigned long     i;
   21.90 -    unsigned long     flags;
   21.91      long              rc = 0;
   21.92      int               need_flush = 0;
   21.93  
   21.94 -    spin_lock_irqsave(&free_list_lock, flags);
   21.95 -    spin_lock(&p->page_lock);
   21.96 -
   21.97 -    temp = free_list.next;
   21.98      for ( i = 0; i < op.size; i++ )
   21.99      {
  21.100          copy_from_user(&mpfn, op.pages, sizeof(mpfn));
  21.101 @@ -109,37 +68,28 @@ static long free_dom_mem(struct task_str
  21.102              goto out;
  21.103          }
  21.104  
  21.105 -        pf = &frame_table[mpfn];
  21.106 -        if ( (page_type_count(pf) != 0) || 
  21.107 -             (page_tot_count(pf) != 0) ||
  21.108 -             ((pf->flags & PG_domain_mask) != p->domain) )
  21.109 +        page = &frame_table[mpfn];
  21.110 +        if ( unlikely(!get_page(page, p)) )
  21.111          {
  21.112 -            DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
  21.113 -                    p->domain, page_type_count(pf), 
  21.114 -                    page_tot_count(pf), pf->flags);
  21.115 +            DPRINTK("Bad page free for domain %d\n", p->domain);
  21.116              rc = -EINVAL;
  21.117              goto out;
  21.118          }
  21.119  
  21.120 -        need_flush |= pf->flags & PG_need_flush;
  21.121 -
  21.122 -        pf->flags = 0;
  21.123 +        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
  21.124 +            put_page_and_type(page);
  21.125  
  21.126 -        list_del(&pf->list);
  21.127 -        list_add(&pf->list, &free_list);
  21.128 -        free_pfns++;
  21.129 +        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
  21.130 +            put_page(page);
  21.131  
  21.132 -        p->tot_pages--;
  21.133 +        put_page(page);
  21.134      }
  21.135  
  21.136   out:
  21.137 -    spin_unlock(&p->page_lock);
  21.138 -    spin_unlock_irqrestore(&free_list_lock, flags);
  21.139 -    
  21.140      if ( need_flush )
  21.141      {
  21.142          __flush_tlb();
  21.143 -        perfc_incrc(need_flush_tlb_flush);
  21.144 +        perfc_incr(need_flush_tlb_flush);
  21.145      }
  21.146  
  21.147      return rc ? rc : op.size;
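
    The revised alloc_dom_mem above no longer pre-checks the whole request against the domain's limit; it allocates page by page through alloc_domain_page() (which enforces the per-domain limit), stops at the first failure, and returns how many pages were actually granted. A minimal sketch of that allocate-what-you-can policy follows; try_alloc_page is a hypothetical stand-in for alloc_domain_page().

    /* Illustrative sketch of the partial-success allocation policy above. */
    #include <stdio.h>

    static unsigned long free_pages_demo = 5;   /* pages left in the allocator */

    static int try_alloc_page(void)
    {
        if (free_pages_demo == 0)
            return 0;
        free_pages_demo--;
        return 1;
    }

    /* Returns the number of pages actually allocated, which may be fewer than
     * requested; the requesting domain must cope with a short grant. */
    static unsigned long alloc_dom_mem_demo(unsigned long requested)
    {
        unsigned long i;
        for (i = 0; i < requested; i++)
            if (!try_alloc_page())
                break;
        return i;
    }

    int main(void)
    {
        printf("granted %lu of 8 requested pages\n", alloc_dom_mem_demo(8)); /* 5 */
        return 0;
    }
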
    22.1 --- a/xen/common/domain.c	Sat Dec 20 11:49:50 2003 +0000
    22.2 +++ b/xen/common/domain.c	Sat Dec 20 12:44:39 2003 +0000
    22.3 @@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsi
    22.4      sprintf(p->name, "Domain-%d", dom_id);
    22.5  
    22.6      spin_lock_init(&p->blk_ring_lock);
    22.7 -    spin_lock_init(&p->page_lock);
    22.8      spin_lock_init(&p->event_channel_lock);
    22.9  
   22.10      p->shared_info = (void *)get_free_page(GFP_KERNEL);
   22.11      memset(p->shared_info, 0, PAGE_SIZE);
   22.12 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id);
   22.13 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
   22.14  
   22.15      p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
   22.16      memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
   22.17 @@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsi
   22.18  
   22.19      sched_add_domain(p);
   22.20  
   22.21 -    INIT_LIST_HEAD(&p->pg_head);
   22.22 +    spin_lock_init(&p->page_list_lock);
   22.23 +    INIT_LIST_HEAD(&p->page_list);
   22.24      p->max_pages = p->tot_pages = 0;
   22.25 +
   22.26      write_lock_irqsave(&tasklist_lock, flags);
   22.27      SET_LINKS(p);
   22.28      p->next_hash = task_hash[TASK_HASH(dom_id)];
   22.29 @@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom)
   22.30      return 0;
   22.31  }
   22.32  
   22.33 -unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
   22.34 +struct pfn_info *alloc_domain_page(struct task_struct *p)
   22.35  {
   22.36 -    struct list_head *temp;
   22.37 -    struct pfn_info *pf;
   22.38 -    unsigned int alloc_pfns;
   22.39 -    unsigned int req_pages;
   22.40 -    unsigned long flags;
   22.41 -
   22.42 -    /* how many pages do we need to alloc? */
   22.43 -    req_pages = kbytes >> (PAGE_SHIFT - 10);
   22.44 +    struct pfn_info *page = NULL;
   22.45 +    unsigned long flags, mask, pfn_stamp, cpu_stamp;
   22.46 +    int i;
   22.47  
   22.48      spin_lock_irqsave(&free_list_lock, flags);
   22.49 -    
   22.50 -    /* is there enough mem to serve the request? */   
   22.51 -    if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) >
   22.52 -         free_pfns )
   22.53 +    if ( likely(!list_empty(&free_list)) )
   22.54 +    {
   22.55 +        page = list_entry(free_list.next, struct pfn_info, list);
   22.56 +        list_del(&page->list);
   22.57 +        free_pfns--;
   22.58 +    }
   22.59 +    spin_unlock_irqrestore(&free_list_lock, flags);
   22.60 +
   22.61 +    if ( unlikely(page == NULL) )
   22.62 +        return NULL;
   22.63 +
   22.64 +    if ( unlikely((mask = page->u.cpu_mask) != 0) )
   22.65      {
   22.66 -        spin_unlock_irqrestore(&free_list_lock, flags);
   22.67 -        return -1;
   22.68 +        pfn_stamp = page->tlbflush_timestamp;
   22.69 +        for ( i = 0; mask != 0; i++ )
   22.70 +        {
   22.71 +            if ( unlikely(mask & (1<<i)) )
   22.72 +            {
   22.73 +                cpu_stamp = tlbflush_time[i];
   22.74 +                if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
   22.75 +                    mask &= ~(1<<i);
   22.76 +            }
   22.77 +        }
   22.78 +
   22.79 +        if ( unlikely(mask != 0) )
   22.80 +        {
   22.81 +            if ( unlikely(in_irq()) )
   22.82 +            {
   22.83 +                DPRINTK("Returning NULL from alloc_domain_page: in_irq\n");
   22.84 +                goto free_and_exit;
   22.85 +            }
   22.86 +            perfc_incrc(need_flush_tlb_flush);
   22.87 +            flush_tlb_mask(mask);
   22.88 +        }
   22.89 +    }
   22.90 +
   22.91 +    page->u.domain = p;
   22.92 +    page->type_and_flags = 0;
   22.93 +    if ( p != NULL )
   22.94 +    {
   22.95 +        if ( unlikely(in_irq()) )
   22.96 +            BUG();
   22.97 +        wmb(); /* Domain pointer must be visible before updating refcnt. */
   22.98 +        spin_lock(&p->page_list_lock);
   22.99 +        if ( unlikely(p->tot_pages >= p->max_pages) )
  22.100 +        {
  22.101 +            spin_unlock(&p->page_list_lock);
  22.102 +            goto free_and_exit;
  22.103 +        }
  22.104 +        list_add_tail(&page->list, &p->page_list);
  22.105 +        p->tot_pages++;
  22.106 +        page->count_and_flags = PGC_allocated | 1;
  22.107 +        spin_unlock(&p->page_list_lock);
  22.108      }
  22.109  
  22.110 -    /* allocate pages and build a thread through frame_table */
  22.111 -    temp = free_list.next;
  22.112 -    for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ )
  22.113 +    return page;
  22.114 +
  22.115 + free_and_exit:
  22.116 +    spin_lock_irqsave(&free_list_lock, flags);
  22.117 +    list_add(&page->list, &free_list);
  22.118 +    free_pfns++;
  22.119 +    spin_unlock_irqrestore(&free_list_lock, flags);
  22.120 +    return NULL;
  22.121 +}
  22.122 +
  22.123 +void free_domain_page(struct pfn_info *page)
  22.124 +{
  22.125 +    unsigned long flags;
  22.126 +    struct task_struct *p = page->u.domain;
  22.127 +
  22.128 +    if ( unlikely(in_irq()) )
  22.129 +        BUG();
  22.130 +
  22.131 +    if ( likely(!IS_XEN_HEAP_FRAME(page)) )
  22.132 +    {
  22.133 +        /*
  22.134 +         * No race with setting of zombie bit. If it wasn't set before the
  22.135 +         * last reference was dropped, then it can't be set now.
  22.136 +         */
  22.137 +        page->u.cpu_mask = 0;
  22.138 +        if ( !(page->count_and_flags & PGC_zombie) )
  22.139 +        {
  22.140 +            page->tlbflush_timestamp = tlbflush_clock;
  22.141 +            page->u.cpu_mask = 1 << p->processor;
  22.142 +
  22.143 +            spin_lock(&p->page_list_lock);
  22.144 +            list_del(&page->list);
  22.145 +            p->tot_pages--;
  22.146 +            spin_unlock(&p->page_list_lock);
  22.147 +        }
  22.148 +
  22.149 +        page->count_and_flags = 0;
  22.150 +
  22.151 +        spin_lock_irqsave(&free_list_lock, flags);
  22.152 +        list_add(&page->list, &free_list);
  22.153 +        free_pfns++;
  22.154 +        spin_unlock_irqrestore(&free_list_lock, flags);
  22.155 +    }
  22.156 +    else
  22.157      {
  22.158 -        pf = list_entry(temp, struct pfn_info, list);
  22.159 -        pf->flags = p->domain;
  22.160 -        set_page_type_count(pf, 0);
  22.161 -        set_page_tot_count(pf, 0);
  22.162 -        temp = temp->next;
  22.163 -        list_del(&pf->list);
  22.164 -        list_add_tail(&pf->list, &p->pg_head);
  22.165 -        free_pfns--;
  22.166 -        ASSERT(free_pfns != 0);
  22.167 +        /*
  22.168 +         * No need for a TLB flush. Non-domain pages are always co-held by Xen,
  22.169 +         * and the Xen reference is not dropped until the domain is dead.
  22.170 +         * DOM0 may hold references, but it's trusted so no need to flush.
  22.171 +         */
  22.172 +        page->u.cpu_mask = 0;
  22.173 +        page->count_and_flags = 0;
  22.174 +        free_page((unsigned long)page_to_virt(page));
  22.175      }
  22.176 -   
  22.177 -    spin_unlock_irqrestore(&free_list_lock, flags);
  22.178 -    
  22.179 -    p->tot_pages = req_pages;
  22.180 +}
  22.181 +
  22.182 +
  22.183 +void free_all_dom_mem(struct task_struct *p)
  22.184 +{
  22.185 +    struct list_head *ent, zombies;
  22.186 +    struct pfn_info *page;
  22.187 +
  22.188 +    INIT_LIST_HEAD(&zombies);
  22.189 +
  22.190 +    spin_lock(&p->page_list_lock);
  22.191 +    while ( (ent = p->page_list.next) != &p->page_list )
  22.192 +    {
  22.193 +        page = list_entry(ent, struct pfn_info, list);
  22.194 +
  22.195 +        if ( unlikely(!get_page(page, p)) )
  22.196 +        {
  22.197 +            /*
  22.198 +             * Another CPU has dropped the last reference and is responsible 
   22.199 +             * for removing the page from this list. Wait for it to do so.
  22.200 +             */
  22.201 +            spin_unlock(&p->page_list_lock);
  22.202 +            while ( p->page_list.next == ent )
  22.203 +                barrier();
  22.204 +            spin_lock(&p->page_list_lock);
  22.205 +            continue;
  22.206 +        }
  22.207 +
  22.208 +        set_bit(_PGC_zombie, &page->count_and_flags);
  22.209 +
  22.210 +        list_del(&page->list);
  22.211 +        p->tot_pages--;
  22.212 +
  22.213 +        list_add(&page->list, &zombies);
  22.214 +    }
  22.215 +    spin_unlock(&p->page_list_lock);
  22.216 +
  22.217 +    /* We do the potentially complex 'put' operations with no lock held. */
  22.218 +    while ( (ent = zombies.next) != &zombies )
  22.219 +    {
  22.220 +        page = list_entry(ent, struct pfn_info, list);
  22.221 +
  22.222 +        list_del(&page->list);
  22.223 +        
  22.224 +        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
  22.225 +            put_page_and_type(page);
  22.226 +
  22.227 +        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
  22.228 +            put_page(page);
  22.229 +
  22.230 +        put_page(page);
  22.231 +    }
  22.232 +}
  22.233 +
  22.234 +
  22.235 +unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
  22.236 +{
  22.237 +    unsigned int alloc_pfns, nr_pages;
  22.238 +
  22.239 +    nr_pages = kbytes >> (PAGE_SHIFT - 10);
  22.240  
  22.241      /* TEMPORARY: max_pages should be explicitly specified. */
  22.242 -    p->max_pages = p->tot_pages;
  22.243 +    p->max_pages = nr_pages;
  22.244 +
  22.245 +    for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ )
  22.246 +    {
  22.247 +        if ( unlikely(alloc_domain_page(p) == NULL) ||
  22.248 +             unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
  22.249 +                                   (PAGE_SHIFT-10))) )
  22.250 +        {
  22.251 +            free_all_dom_mem(p);
  22.252 +            return -1;
  22.253 +        }
  22.254 +    }
  22.255 +
  22.256 +    p->tot_pages = nr_pages;
  22.257  
  22.258      return 0;
  22.259  }
  22.260   
  22.261  
  22.262 -void free_all_dom_mem(struct task_struct *p)
  22.263 -{
  22.264 -    struct list_head *ent;
  22.265 -    unsigned long flags;
  22.266 -
  22.267 -    spin_lock_irqsave(&free_list_lock, flags);
  22.268 -    while ( (ent = p->pg_head.next) != &p->pg_head )
  22.269 -    {
  22.270 -        struct pfn_info *pf = list_entry(ent, struct pfn_info, list);
  22.271 -        set_page_type_count(pf, 0);
  22.272 -        set_page_tot_count(pf, 0);
  22.273 -        pf->flags = 0;
  22.274 -        ASSERT(ent->next->prev == ent);
  22.275 -        ASSERT(ent->prev->next == ent);
  22.276 -        list_del(ent);
  22.277 -        list_add(ent, &free_list);
  22.278 -        free_pfns++;
  22.279 -    }
  22.280 -    spin_unlock_irqrestore(&free_list_lock, flags);
  22.281 -
  22.282 -    p->tot_pages = 0;
  22.283 -}
  22.284 -
  22.285 -
  22.286  /* Release resources belonging to task @p. */
  22.287  void release_task(struct task_struct *p)
  22.288  {
  22.289 @@ -309,7 +436,6 @@ void release_task(struct task_struct *p)
  22.290      destroy_event_channels(p);
  22.291      free_page((unsigned long)p->mm.perdomain_pt);
  22.292      UNSHARE_PFN(virt_to_page(p->shared_info));
  22.293 -    free_page((unsigned long)p->shared_info);
  22.294      free_all_dom_mem(p);
  22.295  
  22.296      kmem_cache_free(task_struct_cachep, p);
  22.297 @@ -360,11 +486,10 @@ int final_setup_guestos(struct task_stru
  22.298      p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
  22.299      p->failsafe_address  = builddomain->ctxt.failsafe_callback_eip;
  22.300      
  22.301 -    /* NB. Page base must already be pinned! */
  22.302      phys_l2tab = builddomain->ctxt.pt_base;
  22.303      p->mm.pagetable = mk_pagetable(phys_l2tab);
  22.304 -    get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]);
  22.305 -    get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]);
  22.306 +    get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, 
  22.307 +                      PGT_l2_page_table);
  22.308  
  22.309      /* Set up the shared info structure. */
  22.310      update_dom_time(p->shared_info);
  22.311 @@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p,
  22.312          return -ENOMEM;
  22.313      }
  22.314  
  22.315 -    alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) -
  22.316 +    alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
  22.317          frame_table;
  22.318      alloc_address <<= PAGE_SHIFT;
  22.319      alloc_index = p->tot_pages;
  22.320 @@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p,
  22.321      p->mm.pagetable = mk_pagetable(phys_l2tab);
  22.322  
  22.323      l2tab += l2_table_offset(virt_load_address);
  22.324 -    cur_address = list_entry(p->pg_head.next, struct pfn_info, list) -
  22.325 +    cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
  22.326          frame_table;
  22.327      cur_address <<= PAGE_SHIFT;
  22.328      for ( count = 0; count < p->tot_pages; count++ )
  22.329 @@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p,
  22.330          }
  22.331          *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
  22.332          
  22.333 -        page = frame_table + (cur_address >> PAGE_SHIFT);
  22.334 -        page->flags = dom | PGT_writeable_page | PG_need_flush;
  22.335 -        set_page_type_count(page, 1);
  22.336 -        set_page_tot_count(page, 1);
  22.337 +        page = &frame_table[cur_address >> PAGE_SHIFT];
  22.338 +        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
  22.339 +        if ( !get_page_and_type(page, p, PGT_writeable_page) )
  22.340 +            BUG();
  22.341          /* Set up the MPT entry. */
  22.342          machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
  22.343  
  22.344 @@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p,
  22.345      {
  22.346          *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
  22.347          page = frame_table + l1_pgentry_to_pagenr(*l1tab);
  22.348 -        page->flags = dom | PGT_l1_page_table;
  22.349 -        get_page_tot(page);
  22.350 +        page->type_and_flags &= ~PGT_type_mask;
  22.351 +        page->type_and_flags |= PGT_l1_page_table;
  22.352 +        get_page(page, p); /* an extra ref because of readable mapping */
  22.353          l1tab++;
  22.354          if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
  22.355          {
  22.356 @@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p,
  22.357              l2tab++;
  22.358          }
  22.359      }
  22.360 -    get_page_type(page); /* guest_pinned */
  22.361 -    get_page_tot(page);  /* guest_pinned */
  22.362 -    page->flags = dom | PG_guest_pinned | PGT_l2_page_table;
  22.363 +    /* Rewrite last L1 page to be a L2 page. */
  22.364 +    page->type_and_flags &= ~PGT_type_mask;
  22.365 +    page->type_and_flags |= PGT_l2_page_table;
  22.366 +    /* Get another ref to L2 page so that it can be pinned. */
  22.367 +    if ( !get_page_and_type(page, p, PGT_l2_page_table) )
  22.368 +        BUG();
  22.369 +    set_bit(_PGC_guest_pinned, &page->count_and_flags);
  22.370      unmap_domain_mem(l1start);
  22.371  
  22.372      /* Set up shared info area. */
  22.373 @@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p,
  22.374  
  22.375      /* Install the new page tables. */
  22.376      __cli();
  22.377 -    __write_cr3_counted(pagetable_val(p->mm.pagetable));
  22.378 +    write_cr3_counted(pagetable_val(p->mm.pagetable));
  22.379  
  22.380      /* Copy the guest OS image. */    
  22.381      src  = (char *)(phy_data_start + 12);
  22.382 @@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p,
  22.383  
  22.384  
  22.385      /* Reinstate the caller's page tables. */
  22.386 -    __write_cr3_counted(pagetable_val(current->mm.pagetable));
  22.387 +    write_cr3_counted(pagetable_val(current->mm.pagetable));
  22.388      __sti();
  22.389  
  22.390      p->flags |= PF_CONSTRUCTED;
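
The alloc_domain_page() path above decides whether a just-freed page might still be visible in some CPU's TLB by comparing the page's tlbflush_timestamp against each CPU's tlbflush_time. A minimal standalone sketch of that pruning step follows; it is not part of the changeset, NR_CPUS, need_flush() and prune_flush_mask() are illustrative stand-ins, and need_flush() only approximates the real NEED_FLUSH() macro (the version in flushtlb.h also has to cope with clock wrap).

    #include <stdint.h>

    #define NR_CPUS 8                         /* illustrative */

    /* Global clock tick of each CPU's most recent TLB flush. */
    static uint32_t tlbflush_time[NR_CPUS];

    /* Assumed semantics: a CPU still needs to flush if it has not flushed
     * since the page was freed. */
    static int need_flush(uint32_t cpu_stamp, uint32_t page_stamp)
    {
        return cpu_stamp <= page_stamp;
    }

    /* Return the subset of 'mask' (CPUs that had the page mapped) that must
     * still flush before the page stamped 'page_stamp' can safely be reused. */
    static unsigned long prune_flush_mask(unsigned long mask, uint32_t page_stamp)
    {
        int i;
        for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ )
            if ( (mask & (1UL << i)) && !need_flush(tlbflush_time[i], page_stamp) )
                mask &= ~(1UL << i);
        return mask;
    }

If the pruned mask is still non-zero, alloc_domain_page() above either issues flush_tlb_mask(), or, when called in interrupt context, gives up and puts the page back on the free list.
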
    23.1 --- a/xen/common/kernel.c	Sat Dec 20 11:49:50 2003 +0000
    23.2 +++ b/xen/common/kernel.c	Sat Dec 20 12:44:39 2003 +0000
    23.3 @@ -181,6 +181,13 @@ void cmain (unsigned long magic, multibo
    23.4          for ( ; ; ) ;
    23.5      }
    23.6  
    23.7 +    /* The array of pfn_info structures must fit into the reserved area. */
    23.8 +    if ( sizeof(struct pfn_info) > 24 )
    23.9 +    {
   23.10 +        printk("'struct pfn_info' too large to fit in Xen address space!\n");
   23.11 +        for ( ; ; ) ;
   23.12 +    }
   23.13 +
   23.14      set_current(&idle0_task);
   23.15  
   23.16      max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10);
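
The new sizeof(struct pfn_info) check in cmain() above catches an oversized frame-table entry at boot. The same bound could instead be enforced when the hypervisor is compiled; a hedged sketch of the usual negative-array-size idiom (the macro name and the example condition are illustrative, not part of the changeset):

    /* Compiles to nothing when 'cond' holds; fails to compile (array of
     * size -1) when it does not. */
    #define SIZE_ASSERT(name, cond)  typedef char name[(cond) ? 1 : -1]

    /* It would sit next to the struct definition, e.g.:
     *   SIZE_ASSERT(pfn_info_fits_reserved_area, sizeof(struct pfn_info) <= 24);
     */
    SIZE_ASSERT(demo_long_is_at_least_4_bytes, sizeof(long) >= 4);
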
    24.1 --- a/xen/common/memory.c	Sat Dec 20 11:49:50 2003 +0000
    24.2 +++ b/xen/common/memory.c	Sat Dec 20 12:44:39 2003 +0000
    24.3 @@ -139,34 +139,28 @@
    24.4  #include <asm/uaccess.h>
    24.5  #include <asm/domain_page.h>
    24.6  
    24.7 -#if 0
    24.8 -#define MEM_LOG(_f, _a...) 
    24.9 +#ifndef NDEBUG
   24.10 +#define MEM_LOG(_f, _a...)                           \
   24.11    printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \
   24.12           current->domain, __LINE__, ## _a )
   24.13  #else
   24.14  #define MEM_LOG(_f, _a...) ((void)0)
   24.15  #endif
   24.16  
   24.17 -/* Domain 0 is allowed to submit requests on behalf of others. */
   24.18 -#define DOMAIN_OKAY(_f) \
   24.19 -    ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
   24.20 +static int alloc_l2_table(struct pfn_info *page);
   24.21 +static int alloc_l1_table(struct pfn_info *page);
   24.22 +static int get_page_from_pagenr(unsigned long page_nr);
   24.23 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
   24.24 +                                         unsigned int type);
   24.25  
   24.26 -/* 'get' checks parameter for validity before inc'ing refcnt. */
   24.27 -static int get_l2_table(unsigned long page_nr);
   24.28 -static int get_l1_table(unsigned long page_nr);
   24.29 -static int get_page(unsigned long page_nr, int writeable);
   24.30 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
   24.31 -/* 'put' does no checking because if refcnt not zero, entity must be valid. */
   24.32 -static void put_l2_table(unsigned long page_nr);
   24.33 -static void put_l1_table(unsigned long page_nr);
   24.34 -static void put_page(unsigned long page_nr, int writeable);
   24.35 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
   24.36 +static void free_l2_table(struct pfn_info *page);
   24.37 +static void free_l1_table(struct pfn_info *page);
   24.38  
   24.39 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
   24.40 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
   24.41  static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
   24.42  
   24.43  /* frame table size and its size in pages */
   24.44 -frame_table_t * frame_table;
   24.45 +struct pfn_info *frame_table;
   24.46  unsigned long frame_table_size;
   24.47  unsigned long max_page;
   24.48  
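
MEM_LOG() above is now gated on NDEBUG instead of being permanently compiled out with '#if 0'. Below is a self-contained sketch of the same pattern; it uses __FILE__ where the Xen macro prints the current domain ID, and DEMO_LOG is an illustrative name. The GNU-style named variadic parameter '_a...' and the comma-swallowing '##' are the same extensions the Xen macro relies on.

    #include <stdio.h>

    #ifndef NDEBUG
    #define DEMO_LOG(_f, _a...) \
        printf("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a)
    #else
    #define DEMO_LOG(_f, _a...) ((void)0)
    #endif

    int main(void)
    {
        DEMO_LOG("value=%d", 42);   /* with arguments */
        DEMO_LOG("no arguments");   /* '##' removes the trailing comma */
        return 0;
    }
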
   24.49 @@ -176,8 +170,11 @@ unsigned int free_pfns;
   24.50  
   24.51  /* Used to defer flushing of memory structures. */
   24.52  static struct {
   24.53 -    int flush_tlb;
   24.54 -    int refresh_ldt;
   24.55 +#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
   24.56 +#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
   24.57 +#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0.         */
   24.58 +    unsigned long flags;
   24.59 +    unsigned long cr0;
   24.60  } deferred_op[NR_CPUS] __cacheline_aligned;
   24.61  
   24.62  /*
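
The deferred_op[] change above folds the separate flush_tlb/refresh_ldt fields into a single per-CPU flags word: handlers cheaply OR in bits while a batch of page-table requests is processed, and the expensive actions run at most once afterwards. A minimal sketch of that pattern (all names and the CPU count are illustrative stand-ins):

    #define SK_FLUSH_TLB   (1UL << 0)
    #define SK_RELOAD_LDT  (1UL << 1)
    #define SK_NR_CPUS     4

    static unsigned long deferred_flags[SK_NR_CPUS];

    /* Cheap; may be called many times while processing a batch. */
    static void defer(int cpu, unsigned long bits)
    {
        deferred_flags[cpu] |= bits;
    }

    /* Called once when the batch commits: each action runs at most once. */
    static void commit_deferred(int cpu)
    {
        unsigned long bits = deferred_flags[cpu];
        deferred_flags[cpu] = 0;
        if ( bits & SK_FLUSH_TLB )
            ;   /* local_flush_tlb() would go here */
        if ( bits & SK_RELOAD_LDT )
            ;   /* load_LDT(current) would go here */
    }
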
   24.63 @@ -196,7 +193,7 @@ void __init init_frametable(unsigned lon
   24.64      max_page = nr_pages;
   24.65      frame_table_size = nr_pages * sizeof(struct pfn_info);
   24.66      frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
   24.67 -    frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
   24.68 +    frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
   24.69      memset(frame_table, 0, frame_table_size);
   24.70  
   24.71      free_pfns = 0;
   24.72 @@ -218,7 +215,7 @@ void __init init_frametable(unsigned lon
   24.73  
   24.74  static void __invalidate_shadow_ldt(struct task_struct *p)
   24.75  {
   24.76 -    int i, cpu = p->processor;
   24.77 +    int i;
   24.78      unsigned long pfn;
   24.79      struct pfn_info *page;
   24.80      
   24.81 @@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(stru
   24.82          if ( pfn == 0 ) continue;
   24.83          p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
   24.84          page = frame_table + pfn;
   24.85 -        ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
   24.86 -        ASSERT((page->flags & PG_domain_mask) == p->domain);
   24.87 -        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
   24.88 -        put_page_type(page);
   24.89 -        put_page_tot(page);                
   24.90 +        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
   24.91 +        ASSERT_PAGE_IS_DOMAIN(page, p);
   24.92 +        put_page_and_type(page);
   24.93      }
   24.94  
   24.95      /* Dispose of the (now possibly invalid) mappings from the TLB.  */
   24.96 -    deferred_op[cpu].flush_tlb   = 1;
   24.97 -    deferred_op[cpu].refresh_ldt = 1;
   24.98 +    deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
   24.99  }
  24.100  
  24.101  
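
__invalidate_shadow_ldt() above now drops both of a shadow LDT page's references with a single put_page_and_type() call. Throughout the changeset a frame carries a general reference count plus a typed reference count, and the composite get/put helpers acquire and release the pair in a fixed order. A simplified, single-threaded sketch of that discipline (all names are stand-ins; the real counts live in count_and_flags and type_and_flags, guarded by PGC_count_mask and PGT_count_mask, and are updated atomically):

    struct page_sketch {
        unsigned int count;       /* general references; 0 == free frame     */
        unsigned int type;        /* current use: page table, LDT, writeable */
        unsigned int type_count;  /* references that depend on that use      */
    };

    static int sk_get_page(struct page_sketch *pg)
    {
        if ( pg->count == 0 )              /* never resurrect a free frame */
            return 0;
        pg->count++;
        return 1;
    }

    static void sk_put_page(struct page_sketch *pg) { pg->count--; }

    static int sk_get_page_type(struct page_sketch *pg, unsigned int type)
    {
        if ( pg->type_count == 0 )
            pg->type = type;               /* first typed user decides the use */
        else if ( pg->type != type )
            return 0;                      /* conflicting use is refused       */
        pg->type_count++;
        return 1;
    }

    static void sk_put_page_type(struct page_sketch *pg) { pg->type_count--; }

    static int sk_get_page_and_type(struct page_sketch *pg, unsigned int type)
    {
        if ( !sk_get_page(pg) )
            return 0;
        if ( !sk_get_page_type(pg, type) )
        {
            sk_put_page(pg);               /* undo the general ref on failure */
            return 0;
        }
        return 1;
    }

    static void sk_put_page_and_type(struct page_sketch *pg)
    {
        sk_put_page_type(pg);
        sk_put_page(pg);
    }
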
  24.102 @@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt
  24.103  }
  24.104  
  24.105  
  24.106 +int alloc_segdesc_page(struct pfn_info *page)
  24.107 +{
  24.108 +    unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  24.109 +    int i;
  24.110 +
  24.111 +    for ( i = 0; i < 512; i++ )
  24.112 +        if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) )
  24.113 +            goto fail;
  24.114 +
  24.115 +    unmap_domain_mem(descs);
  24.116 +    return 1;
  24.117 +
  24.118 + fail:
  24.119 +    unmap_domain_mem(descs);
  24.120 +    return 0;
  24.121 +}
  24.122 +
  24.123 +
  24.124  /* Map shadow page at offset @off. Returns 0 on success. */
  24.125  int map_ldt_shadow_page(unsigned int off)
  24.126  {
  24.127      struct task_struct *p = current;
  24.128 -    unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
  24.129 -    unsigned long l1e, *ldt_page;
  24.130 -    struct pfn_info *page;
  24.131 -    int i, ret = -1;
  24.132 +    unsigned long l1e;
  24.133  
  24.134 -    /* We cannot take a page_lock in interrupt context. */
  24.135 -    if ( in_interrupt() )
  24.136 +    if ( unlikely(in_interrupt()) )
  24.137          BUG();
  24.138  
  24.139 -    spin_lock(&p->page_lock);
  24.140 -
  24.141 -    __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
  24.142 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
  24.143 -        goto out;
  24.144 -
  24.145 -    page = frame_table + (l1e >> PAGE_SHIFT);
  24.146 -    if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) )
  24.147 -    {
  24.148 -        if ( unlikely(page_type_count(page) != 0) )
  24.149 -            goto out;
  24.150 +    __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >> 
  24.151 +                                                       PAGE_SHIFT) + off]);
  24.152  
  24.153 -        /* Check all potential LDT entries in the page. */
  24.154 -        ldt_page = (unsigned long *)addr;
  24.155 -        for ( i = 0; i < 512; i++ )
  24.156 -            if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) )
  24.157 -                goto out;
  24.158 -        if ( unlikely(page->flags & PG_need_flush) )
  24.159 -        {
  24.160 -            perfc_incrc(need_flush_tlb_flush);
  24.161 -            __write_cr3_counted(pagetable_val(p->mm.pagetable));
  24.162 -            page->flags &= ~PG_need_flush;
  24.163 -        }
  24.164 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  24.165 +         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  24.166 +                                     p, PGT_ldt_page)) )
  24.167 +        return 0;
  24.168  
  24.169 -        page->flags &= ~PG_type_mask;
  24.170 -        page->flags |= PGT_ldt_page;
  24.171 -    }
  24.172 -
  24.173 -    /* Success! */
  24.174 -    get_page_type(page);
  24.175 -    get_page_tot(page);
  24.176 -    p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW);
  24.177 +    p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  24.178      p->mm.shadow_ldt_mapcnt++;
  24.179  
  24.180 -    ret = 0;
  24.181 -
  24.182 - out:
  24.183 -    spin_unlock(&p->page_lock);
  24.184 -    return ret;
  24.185 +    return 1;
  24.186  }
  24.187  
  24.188  
  24.189 -/* Return original refcnt, or -1 on error. */
  24.190 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
  24.191 +/* Domain 0 is allowed to build page tables on others' behalf. */
  24.192 +static inline int dom0_get_page(struct pfn_info *page)
  24.193  {
  24.194 -    struct pfn_info *page;
  24.195 -    unsigned long flags;
  24.196 +    unsigned long x, nx, y = page->count_and_flags;
  24.197 +
  24.198 +    do {
  24.199 +        x  = y;
  24.200 +        nx = x + 1;
  24.201 +        if ( unlikely((x & PGC_count_mask) == 0) ||
  24.202 +             unlikely((nx & PGC_count_mask) == 0) )
  24.203 +            return 0;
  24.204 +    }
  24.205 +    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
  24.206 +
  24.207 +    return 1;
  24.208 +}
  24.209 +
  24.210 +
  24.211 +static int get_page_from_pagenr(unsigned long page_nr)
  24.212 +{
  24.213 +    struct pfn_info *page = &frame_table[page_nr];
  24.214  
  24.215      if ( unlikely(page_nr >= max_page) )
  24.216      {
  24.217          MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  24.218 -        return -1;
  24.219 +        return 0;
  24.220      }
  24.221 -    page = frame_table + page_nr;
  24.222 -    flags = page->flags;
  24.223 -    if ( unlikely(!DOMAIN_OKAY(flags)) )
  24.224 -    {
  24.225 -        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
  24.226 -        return -1;
  24.227 -    }
  24.228 -    if ( (flags & PG_type_mask) != type )
  24.229 +
  24.230 +    if ( unlikely(!get_page(page, current)) &&
  24.231 +         ((current->domain != 0) || !dom0_get_page(page)) )
  24.232      {
  24.233 -        if ( page_type_count(page) != 0 )
  24.234 -        {
  24.235 -            MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
  24.236 -                    page_nr << PAGE_SHIFT,
  24.237 -                    flags & PG_type_mask, type, page_type_count(page));
  24.238 -            return -1;
  24.239 -        }
  24.240 -
  24.241 -        if ( unlikely(flags & PG_need_flush) )
  24.242 -        {
  24.243 -            deferred_op[smp_processor_id()].flush_tlb = 1;
  24.244 -            page->flags &= ~PG_need_flush;
  24.245 -            perfc_incrc(need_flush_tlb_flush);
  24.246 -        }
  24.247 -
  24.248 -        page->flags &= ~PG_type_mask;
  24.249 -        page->flags |= type;
  24.250 +        MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr);
  24.251 +        return 0;
  24.252      }
  24.253  
  24.254 -    get_page_tot(page);
  24.255 -    return get_page_type(page);
  24.256 +    return 1;
  24.257 +}
  24.258 +
  24.259 +
  24.260 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  24.261 +                                         unsigned int type)
  24.262 +{
  24.263 +    struct pfn_info *page = &frame_table[page_nr];
  24.264 +
  24.265 +    if ( unlikely(!get_page_from_pagenr(page_nr)) )
  24.266 +        return 0;
  24.267 +
  24.268 +    if ( unlikely(!get_page_type(page, type)) )
  24.269 +    {
  24.270 +        MEM_LOG("Bad page type for pfn %08lx (%08lx)", 
  24.271 +                page_nr, page->type_and_flags);
  24.272 +        put_page(page);
  24.273 +        return 0;
  24.274 +    }
  24.275 +
  24.276 +    return 1;
  24.277  }
  24.278  
  24.279  
  24.280 -/* Return new refcnt, or -1 on error. */
  24.281 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
  24.282 +/*
  24.283 + * We allow an L2 table to map itself, to achieve a linear p.t. Note that this
  24.284 + * does not raise any reference counts.
  24.285 + */
  24.286 +static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn)
  24.287 +{
  24.288 +    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  24.289 +    {
  24.290 +        MEM_LOG("Attempt to create linear p.t. with write perms");
  24.291 +        return 0;
  24.292 +    }
  24.293 +
  24.294 +    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  24.295 +    {
  24.296 +        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
  24.297 +        return 0;
  24.298 +    }
  24.299 +
  24.300 +    return 1;
  24.301 +}
  24.302 +
  24.303 +
  24.304 +static int get_page_from_l1e(l1_pgentry_t l1e)
  24.305 +{
  24.306 +    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
  24.307 +
  24.308 +    if ( unlikely((l1_pgentry_val(l1e) &
  24.309 +                   (_PAGE_GLOBAL|_PAGE_PAT))) )
  24.310 +    {
  24.311 +        MEM_LOG("Bad L1 page type settings %04lx",
  24.312 +                l1_pgentry_val(l1e) &
  24.313 +                (_PAGE_GLOBAL|_PAGE_PAT));
  24.314 +        return 0;
  24.315 +    }
  24.316 +
  24.317 +    if ( l1_pgentry_val(l1e) & _PAGE_RW )
  24.318 +    {
  24.319 +        if ( unlikely(!get_page_and_type_from_pagenr(
  24.320 +            l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) )
  24.321 +            return 0;
  24.322 +        set_bit(_PGC_tlb_flush_on_type_change, 
  24.323 +                &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags);
  24.324 +    }
  24.325 +    else
  24.326 +    {
  24.327 +        if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) )
  24.328 +            return 0;
  24.329 +    }
  24.330 +
  24.331 +    return 1;
  24.332 +}
  24.333 +
  24.334 +
  24.335 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  24.336 +static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  24.337 +{
  24.338 +    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
  24.339 +
  24.340 +    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  24.341 +    {
  24.342 +        MEM_LOG("Bad L2 page type settings %04lx",
  24.343 +                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  24.344 +        return 0;
  24.345 +    }
  24.346 +
  24.347 +    if ( unlikely(!get_page_and_type_from_pagenr(
  24.348 +        l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) &&
  24.349 +         unlikely(!check_linear_pagetable(l2e, pfn)) )
  24.350 +        return 0;
  24.351 +
  24.352 +    return 1;
  24.353 +}
  24.354 +
  24.355 +
  24.356 +static void put_page_from_l1e(l1_pgentry_t l1e)
  24.357  {
  24.358      struct pfn_info *page;
  24.359  
  24.360 -    if ( unlikely(page_nr >= max_page) )
  24.361 +    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
  24.362 +
  24.363 +    page = &frame_table[l1_pgentry_to_pagenr(l1e)];
  24.364 +
  24.365 +    if ( l1_pgentry_val(l1e) & _PAGE_RW )
  24.366      {
  24.367 -        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  24.368 -        return -1;
  24.369 +        put_page_and_type(page);
  24.370      }
  24.371 -    page = frame_table + page_nr;
  24.372 -    if ( unlikely(!DOMAIN_OKAY(page->flags)) || 
  24.373 -         unlikely(((page->flags & PG_type_mask) != type)) ) 
  24.374 +    else
  24.375      {
  24.376 -        MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
  24.377 -                page->flags & PG_domain_mask, page->flags & PG_type_mask,
  24.378 -                type);
  24.379 -        return -1;
  24.380 +        /* We expect this is rare so we blow the entire shadow LDT. */
  24.381 +        if ( unlikely(((page->type_and_flags & PGT_type_mask) == 
  24.382 +                       PGT_ldt_page)) &&
  24.383 +             unlikely(((page->type_and_flags & PGT_count_mask) != 0)) )
  24.384 +            invalidate_shadow_ldt();
  24.385 +        put_page(page);
  24.386      }
  24.387 -    ASSERT(page_type_count(page) != 0);
  24.388 -    put_page_tot(page);
  24.389 -    return put_page_type(page);
  24.390 +}
  24.391 +
  24.392 +
  24.393 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  24.394 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  24.395 +{
  24.396 +    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
  24.397 +
  24.398 +    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  24.399 +         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  24.400 +        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  24.401  }
  24.402  
  24.403  
  24.404 -/* We allow a L2 table to map itself, to achieve a linear pagetable. */
  24.405 -/* NB. There's no need for a put_twisted_l2_table() function!! */
  24.406 -static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
  24.407 +static int alloc_l2_table(struct pfn_info *page)
  24.408  {
  24.409 -    unsigned long l2v = l2_pgentry_val(l2e);
  24.410 +    unsigned long page_nr = page - frame_table;
  24.411 +    l2_pgentry_t *pl2e, l2e;
  24.412 +    int i;
  24.413 +   
  24.414 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  24.415 +
  24.416 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  24.417 +    {
  24.418 +        l2e = pl2e[i];
  24.419 +
  24.420 +        if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 
  24.421 +            continue;
  24.422  
  24.423 -    /* Clearly the mapping must be read-only :-) */
  24.424 -    if ( (l2v & _PAGE_RW) )
  24.425 +        if ( unlikely(!get_page_from_l2e(l2e, page_nr)) )
  24.426 +            goto fail;
  24.427 +    }
  24.428 +    
  24.429 +    /* Now we add our private high mappings. */
  24.430 +    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  24.431 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  24.432 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  24.433 +    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  24.434 +        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  24.435 +    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  24.436 +        mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) | 
  24.437 +                      __PAGE_HYPERVISOR);
  24.438 +
  24.439 +    unmap_domain_mem(pl2e);
  24.440 +    return 1;
  24.441 +
  24.442 + fail:
  24.443 +    while ( i-- > 0 )
  24.444      {
  24.445 -        MEM_LOG("Attempt to install twisted L2 entry with write permissions");
  24.446 -        return -1;
  24.447 +        l2e = pl2e[i];
  24.448 +        if ( l2_pgentry_val(l2e) & _PAGE_PRESENT )
  24.449 +            put_page_from_l2e(l2e, page_nr);
  24.450      }
  24.451  
  24.452 -    /* This is a sufficient final check. */
  24.453 -    if ( (l2v >> PAGE_SHIFT) != entry_pfn )
  24.454 -    {
  24.455 -        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
  24.456 -        return -1;
  24.457 -    }
  24.458 -    
  24.459 -    /* We don't bump the reference counts. */
  24.460 +    unmap_domain_mem(pl2e);
  24.461      return 0;
  24.462  }
  24.463  
  24.464  
  24.465 -static int get_l2_table(unsigned long page_nr)
  24.466 +static int alloc_l1_table(struct pfn_info *page)
  24.467  {
  24.468 -    struct pfn_info *page;
  24.469 -    struct task_struct *p;
  24.470 -    l2_pgentry_t *p_l2_entry, l2_entry;
  24.471 -    int i, ret=0;
  24.472 -   
  24.473 -    ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
  24.474 -    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
  24.475 -    
  24.476 -    /* NEW level-2 page table! Deal with every PDE in the table. */
  24.477 -    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  24.478 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  24.479 +    unsigned long page_nr = page - frame_table;
  24.480 +    l1_pgentry_t *pl1e, l1e;
  24.481 +    int i;
  24.482 +
  24.483 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  24.484 +
  24.485 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  24.486      {
  24.487 -        l2_entry = *p_l2_entry++;
  24.488 -        if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
  24.489 -        if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  24.490 -        {
  24.491 -            MEM_LOG("Bad L2 page type settings %04lx",
  24.492 -                    l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
  24.493 -            ret = -1;
  24.494 +        l1e = pl1e[i];
  24.495 +
  24.496 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
  24.497 +            continue;
  24.498 +
  24.499 +        if ( unlikely(!get_page_from_l1e(l1e)) )
  24.500              goto fail;
  24.501 -        }
  24.502 -        /* Assume we're mapping an L1 table, falling back to twisted L2. */
  24.503 -        ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
  24.504 -        if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry);
  24.505 -        if ( unlikely(ret) ) goto fail;
  24.506 -    }
  24.507 -    
  24.508 -    /* Now we simply slap in our high mapping. */
  24.509 -    memcpy(p_l2_entry, 
  24.510 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  24.511 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  24.512 -    p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
  24.513 -              DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
  24.514 -        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  24.515 -
  24.516 -    /*
  24.517 -     * The per-domain PGD is slightly tricky, as we may not be executing
  24.518 -     * in the context of the correct domain (DOM0 builds pt's for others).
  24.519 -     */
  24.520 -    page = frame_table + page_nr;
  24.521 -    if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL )
  24.522 -    {
  24.523 -        p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
  24.524 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
  24.525 -            mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
  24.526 -        put_task_struct(p);
  24.527      }
  24.528  
  24.529 - out:
  24.530 -    unmap_domain_mem(p_l2_entry);
  24.531 -    return ret;
  24.532 +    /* Make sure we unmap the right page! */
  24.533 +    unmap_domain_mem(pl1e);
  24.534 +    return 1;
  24.535  
  24.536   fail:
  24.537 -    p_l2_entry--;
  24.538      while ( i-- > 0 )
  24.539      {
  24.540 -        l2_entry = *--p_l2_entry;
  24.541 -        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
  24.542 -            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
  24.543 +        l1e = pl1e[i];
  24.544 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
  24.545 +            continue;
  24.546 +        put_page_from_l1e(l1e);
  24.547      }
  24.548 -    if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 )
  24.549 -        BUG();
  24.550 -    goto out;
  24.551 +
  24.552 +    unmap_domain_mem(pl1e);
  24.553 +    return 0;
  24.554  }
  24.555  
  24.556  
  24.557 -static int get_l1_table(unsigned long page_nr)
  24.558 +static void free_l2_table(struct pfn_info *page)
  24.559  {
  24.560 -    l1_pgentry_t *p_l1_entry, l1_entry;
  24.561 -    int i, ret;
  24.562 +    unsigned long page_nr = page - frame_table;
  24.563 +    l2_pgentry_t *pl2e, l2e;
  24.564 +    int i;
  24.565 +
  24.566 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  24.567  
  24.568 -    /* Update ref count for page pointed at by PDE. */
  24.569 -    ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
  24.570 -    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
  24.571 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  24.572 +    {
  24.573 +        l2e = pl2e[i];
  24.574 +        if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
  24.575 +             unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) )
  24.576 +            put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  24.577 +    }
  24.578  
  24.579 -    /* NEW level-1 page table! Deal with every PTE in the table. */
  24.580 -    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  24.581 +    unmap_domain_mem(pl2e);
  24.582 +}
  24.583 +
  24.584 +
  24.585 +static void free_l1_table(struct pfn_info *page)
  24.586 +{
  24.587 +    unsigned long page_nr = page - frame_table;
  24.588 +    l1_pgentry_t *pl1e, l1e;
  24.589 +    int i;
  24.590 +
  24.591 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  24.592 +
  24.593      for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  24.594      {
  24.595 -        l1_entry = *p_l1_entry++;
  24.596 -        if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
  24.597 -        if ( unlikely((l1_pgentry_val(l1_entry) &
  24.598 -                       (_PAGE_GLOBAL|_PAGE_PAT))) )
  24.599 -        {
  24.600 -            MEM_LOG("Bad L1 page type settings %04lx",
  24.601 -                    l1_pgentry_val(l1_entry) &
  24.602 -                    (_PAGE_GLOBAL|_PAGE_PAT));
  24.603 -            ret = -1;
  24.604 -            goto fail;
  24.605 -        }
  24.606 -        ret = get_page(l1_pgentry_to_pagenr(l1_entry),
  24.607 -                       l1_pgentry_val(l1_entry) & _PAGE_RW);
  24.608 -        if ( unlikely(ret) ) goto fail;
  24.609 +        l1e = pl1e[i];
  24.610 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
  24.611 +            continue;
  24.612 +        put_page_from_l1e(l1e);
  24.613      }
  24.614  
  24.615 -    /* Make sure we unmap the right page! */
  24.616 -    unmap_domain_mem(p_l1_entry-1);
  24.617 -    return ret;
  24.618 +    unmap_domain_mem(pl1e);
  24.619 +}
  24.620 +
  24.621  
  24.622 - fail:
  24.623 -    p_l1_entry--;
  24.624 -    while ( i-- > 0 )
  24.625 -    {
  24.626 -        l1_entry = *--p_l1_entry;
  24.627 -        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
  24.628 -            put_page(l1_pgentry_to_pagenr(l1_entry), 
  24.629 -                     l1_pgentry_val(l1_entry) & _PAGE_RW);
  24.630 -    }
  24.631 -    if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 )
  24.632 -        BUG();
  24.633 -    unmap_domain_mem(p_l1_entry);
  24.634 -    return ret;
  24.635 +static inline int update_l2e(l2_pgentry_t *pl2e, 
  24.636 +                             l2_pgentry_t  ol2e, 
  24.637 +                             l2_pgentry_t  nl2e)
  24.638 +{
  24.639 +    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  24.640 +                              l2_pgentry_val(ol2e), 
  24.641 +                              l2_pgentry_val(nl2e));
  24.642 +    if ( o != l2_pgentry_val(ol2e) )
  24.643 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  24.644 +                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  24.645 +    return (o == l2_pgentry_val(ol2e));
  24.646  }
  24.647  
  24.648  
  24.649 -static int get_page(unsigned long page_nr, int writeable)
  24.650 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  24.651 +static int mod_l2_entry(l2_pgentry_t *pl2e, 
  24.652 +                        l2_pgentry_t nl2e, 
  24.653 +                        unsigned long pfn)
  24.654  {
  24.655 -    struct pfn_info *page;
  24.656 -    unsigned long flags;
  24.657 +    l2_pgentry_t ol2e;
  24.658 +    unsigned long _ol2e;
  24.659  
  24.660 -    /* Update ref count for page pointed at by PTE. */
  24.661 -    if ( unlikely(page_nr >= max_page) )
  24.662 -    {
  24.663 -        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  24.664 -        return(-1);
  24.665 -    }
  24.666 -    page = frame_table + page_nr;
  24.667 -    flags = page->flags;
  24.668 -    if ( unlikely(!DOMAIN_OKAY(flags)) )
  24.669 +    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  24.670 +                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  24.671      {
  24.672 -        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
  24.673 -        return(-1);
  24.674 -    }
  24.675 -
  24.676 -    if ( writeable )
  24.677 -    {
  24.678 -        if ( (flags & PG_type_mask) != PGT_writeable_page )
  24.679 -        {
  24.680 -            if ( page_type_count(page) != 0 )
  24.681 -            {
  24.682 -                MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
  24.683 -                        flags & PG_type_mask, PGT_writeable_page,
  24.684 -                        page_type_count(page));
  24.685 -                return(-1);
  24.686 -            }
  24.687 -            page->flags &= ~PG_type_mask;
  24.688 -            page->flags |= PGT_writeable_page;
  24.689 -        }
  24.690 -        page->flags |= PG_need_flush;
  24.691 -        get_page_type(page);
  24.692 +        MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e);
  24.693 +        return 0;
  24.694      }
  24.695  
  24.696 -    get_page_tot(page);
  24.697 -    
  24.698 -    return(0);
  24.699 -}
  24.700 -
  24.701 -
  24.702 -static void put_l2_table(unsigned long page_nr)
  24.703 -{
  24.704 -    l2_pgentry_t *p_l2_entry, l2_entry;
  24.705 -    int i;
  24.706 +    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  24.707 +        return 0;
  24.708 +    ol2e = mk_l2_pgentry(_ol2e);
  24.709  
  24.710 -    if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return;
  24.711 +    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  24.712 +    {
  24.713 +        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  24.714 +        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 )
  24.715 +        {
  24.716 +            if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
  24.717 +                return 0;
  24.718  
  24.719 -    /* We had last reference to level-2 page table. Free the PDEs. */
  24.720 -    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  24.721 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  24.722 -    {
  24.723 -        l2_entry = *p_l2_entry++;
  24.724 -        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
  24.725 -            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
  24.726 -    }
  24.727 +            if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  24.728 +            {
  24.729 +                put_page_from_l2e(nl2e, pfn);
  24.730 +                return 0;
  24.731 +            }
  24.732  
  24.733 -    unmap_domain_mem(p_l2_entry);
  24.734 -}
  24.735 -
  24.736 -
  24.737 -static void put_l1_table(unsigned long page_nr)
  24.738 -{
  24.739 -    l1_pgentry_t *p_l1_entry, l1_entry;
  24.740 -    int i;
  24.741 -
  24.742 -    if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return;
  24.743 +            if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
  24.744 +                put_page_from_l2e(ol2e, pfn);
  24.745 +        }
  24.746 +        else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  24.747 +        {
  24.748 +            return 0;
  24.749 +        }
  24.750 +    }
  24.751 +    else
  24.752 +    {
  24.753 +        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  24.754 +            return 0;
  24.755  
  24.756 -    /* We had last reference to level-1 page table. Free the PTEs. */
  24.757 -    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  24.758 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  24.759 -    {
  24.760 -        l1_entry = *p_l1_entry++;
  24.761 -        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
  24.762 -            put_page(l1_pgentry_to_pagenr(l1_entry), 
  24.763 -                     l1_pgentry_val(l1_entry) & _PAGE_RW);
  24.764 +        if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
  24.765 +            put_page_from_l2e(ol2e, pfn);
  24.766      }
  24.767 -
  24.768 -    /* Make sure we unmap the right page! */
  24.769 -    unmap_domain_mem(p_l1_entry-1);
  24.770 +    
  24.771 +    return 1;
  24.772  }
  24.773  
  24.774  
  24.775 -static void put_page(unsigned long page_nr, int writeable)
  24.776 +static inline int update_l1e(l1_pgentry_t *pl1e, 
  24.777 +                             l1_pgentry_t  ol1e, 
  24.778 +                             l1_pgentry_t  nl1e)
  24.779  {
  24.780 -    struct pfn_info *page;
  24.781 -    ASSERT(page_nr < max_page);
  24.782 -    page = frame_table + page_nr;
  24.783 -    ASSERT(DOMAIN_OKAY(page->flags));
  24.784 -    ASSERT((!writeable) || 
  24.785 -           ((page_type_count(page) != 0) && 
  24.786 -            ((page->flags & PG_type_mask) == PGT_writeable_page) &&
  24.787 -            ((page->flags & PG_need_flush) == PG_need_flush)));
  24.788 -    if ( writeable )
  24.789 +    unsigned long o = l1_pgentry_val(ol1e);
  24.790 +    unsigned long n = l1_pgentry_val(nl1e);
  24.791 +
  24.792 +    while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
  24.793      {
  24.794 -        put_page_type(page);
  24.795 +        unsigned int cpu = smp_processor_id();
  24.796 +        /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */
  24.797 +        if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 )
  24.798 +        {
  24.799 +            MEM_LOG("cmpxchg fault despite WP bit cleared\n");
  24.800 +            return 0;
  24.801 +        }
  24.802 +        deferred_op[cpu].cr0 = read_cr0();
  24.803 +        write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP);
  24.804 +        deferred_op[cpu].flags |= DOP_RESTORE_CR0;
  24.805      }
  24.806 -    else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
  24.807 -                       (page_type_count(page) != 0)) )
  24.808 -    {
  24.809 -        /* We expect this is rare so we just blow the entire shadow LDT. */
  24.810 -        invalidate_shadow_ldt();
  24.811 -    }
  24.812 -    put_page_tot(page);
  24.813 +
   24.814 +    if ( o != l1_pgentry_val(ol1e) )
   24.815 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
   24.816 +                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
   24.817 +
   24.818 +    /* The swap succeeded iff the old value we saw equals ol1e. */
  24.819 +    return (o == l1_pgentry_val(ol1e));
  24.820  }
  24.821  
  24.822  
  24.823 -static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
  24.824 +/* Update the L1 entry at pl1e to new value nl1e. */
  24.825 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  24.826  {
  24.827 -    l2_pgentry_t old_l2_entry = *p_l2_entry;
  24.828 +    l1_pgentry_t ol1e;
  24.829 +    unsigned long _ol1e;
  24.830  
  24.831 -    if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
  24.832 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  24.833 -    {
  24.834 -        MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
  24.835 -                p_l2_entry);
  24.836 -        goto fail;
  24.837 -    }
  24.838 -
  24.839 -    if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
  24.840 +    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  24.841      {
  24.842 -        if ( unlikely((l2_pgentry_val(new_l2_entry) & 
  24.843 -                       (_PAGE_GLOBAL|_PAGE_PSE))) )
  24.844 -        {
  24.845 -            MEM_LOG("Bad L2 entry val %04lx",
  24.846 -                    l2_pgentry_val(new_l2_entry) & 
  24.847 -                    (_PAGE_GLOBAL|_PAGE_PSE));
  24.848 -            goto fail;
  24.849 -        }
  24.850 -        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  24.851 -        if ( ((l2_pgentry_val(old_l2_entry) ^ 
  24.852 -               l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
  24.853 -        {
  24.854 -            /* Assume we're mapping an L1 table, falling back to twisted L2. */
  24.855 -            if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
  24.856 -            {
  24.857 -                /* NB. No need to sanity-check the VA: done already. */
  24.858 -                unsigned long l1e = l1_pgentry_val(
  24.859 -                    linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
  24.860 -                if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
  24.861 -                    goto fail;
  24.862 -            }
  24.863 -
  24.864 -            if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) 
  24.865 -                put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));            
  24.866 -        } 
  24.867 -    }
  24.868 -    else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
  24.869 -    {
  24.870 -        put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
  24.871 +        MEM_LOG("Bad get_user\n");
  24.872 +        return 0;
  24.873      }
  24.874      
  24.875 -    *p_l2_entry = new_l2_entry;
  24.876 -    return 0;
  24.877 -
  24.878 - fail:
  24.879 -    return -1;
  24.880 -}
  24.881 -
  24.882 +    ol1e = mk_l1_pgentry(_ol1e);
  24.883  
  24.884 -static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
  24.885 -{
  24.886 -    l1_pgentry_t old_l1_entry = *p_l1_entry;
  24.887 -
  24.888 -    if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
  24.889 +    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  24.890      {
  24.891 -        if ( unlikely((l1_pgentry_val(new_l1_entry) &
  24.892 -                       (_PAGE_GLOBAL|_PAGE_PAT))) ) 
  24.893 -        {
  24.894 -            MEM_LOG("Bad L1 entry val %04lx",
  24.895 -                    l1_pgentry_val(new_l1_entry) & 
  24.896 -                    (_PAGE_GLOBAL|_PAGE_PAT));
  24.897 -            goto fail;
  24.898 -        }
  24.899          /*
  24.900           * Differ in mapping (bits 12-31), writeable (bit 1), or
  24.901           * presence (bit 0)?
  24.902           */
  24.903 -        if ( ((l1_pgentry_val(old_l1_entry) ^
  24.904 -               l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
  24.905 +        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 )
  24.906          {
  24.907 -            if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
  24.908 -                          l1_pgentry_val(new_l1_entry) & _PAGE_RW) )
  24.909 -                goto fail;
  24.910 +            if ( unlikely(!get_page_from_l1e(nl1e)) )
  24.911 +                return 0;
  24.912 +
  24.913 +            if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  24.914 +            {
  24.915 +                put_page_from_l1e(nl1e);
  24.916 +                return 0;
  24.917 +            }
  24.918  
  24.919 -            if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) 
  24.920 -                put_page(l1_pgentry_to_pagenr(old_l1_entry),
  24.921 -                         l1_pgentry_val(old_l1_entry) & _PAGE_RW);
  24.922 -        } 
  24.923 +            if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
  24.924 +                put_page_from_l1e(ol1e);
  24.925 +        }
  24.926 +        else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  24.927 +        {
  24.928 +            return 0;
  24.929 +        }
  24.930      }
  24.931 -    else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
  24.932 +    else 
  24.933      {
  24.934 -        put_page(l1_pgentry_to_pagenr(old_l1_entry),
  24.935 -                 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
  24.936 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  24.937 +            return 0;
  24.938 +
  24.939 +        if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
  24.940 +            put_page_from_l1e(ol1e);
  24.941      }
  24.942  
  24.943 -    *p_l1_entry = new_l1_entry;
  24.944 -    return 0;
  24.945 +    return 1;
  24.946 +}
  24.947 +
  24.948 +
  24.949 +int alloc_page_type(struct pfn_info *page, unsigned int type)
  24.950 +{
  24.951 +    if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 
  24.952 +                                     &page->count_and_flags)) )
  24.953 +    {
  24.954 +        struct task_struct *p = page->u.domain;
  24.955 +        mb(); /* Check zombie status before using domain ptr. */
  24.956 +        /*
   24.957 +         * NB. 'p' may no longer be valid by the time we dereference it, so
  24.958 +         * p->processor might be garbage. We clamp it, just in case.
  24.959 +         */
  24.960 +        if ( !test_bit(_PGC_zombie, &page->count_and_flags) &&
  24.961 +             unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)], 
  24.962 +                                 page->tlbflush_timestamp)) )
  24.963 +        {
  24.964 +            perfc_incr(need_flush_tlb_flush);
  24.965 +            flush_tlb_cpu(p->processor);
  24.966 +        }
  24.967 +    }
  24.968  
  24.969 - fail:
  24.970 -    return -1;
  24.971 +    switch ( type )
  24.972 +    {
  24.973 +    case PGT_l1_page_table:
  24.974 +        return alloc_l1_table(page);
  24.975 +    case PGT_l2_page_table:
  24.976 +        return alloc_l2_table(page);
  24.977 +    case PGT_gdt_page:
  24.978 +    case PGT_ldt_page:
  24.979 +        return alloc_segdesc_page(page);
  24.980 +    default:
  24.981 +        BUG();
  24.982 +    }
  24.983 +
  24.984 +    return 0;
  24.985 +}
  24.986 +
  24.987 +
  24.988 +void free_page_type(struct pfn_info *page, unsigned int type)
  24.989 +{
  24.990 +    switch ( type )
  24.991 +    {
  24.992 +    case PGT_l1_page_table:
  24.993 +        return free_l1_table(page);
  24.994 +    case PGT_l2_page_table:
  24.995 +        return free_l2_table(page);
  24.996 +    default:
  24.997 +        BUG();
  24.998 +    }
  24.999  }
 24.1000  
 24.1001  
 24.1002  static int do_extended_command(unsigned long ptr, unsigned long val)
 24.1003  {
 24.1004 -    int err = 0, cpu = smp_processor_id();
 24.1005 +    int okay = 1, cpu = smp_processor_id();
 24.1006      unsigned int cmd = val & MMUEXT_CMD_MASK;
 24.1007      unsigned long pfn = ptr >> PAGE_SHIFT;
 24.1008 -    struct pfn_info *page = frame_table + pfn;
 24.1009 +    struct pfn_info *page = &frame_table[pfn];
 24.1010  
 24.1011      /* 'ptr' must be in range except where it isn't a machine address. */
 24.1012      if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) )
 24.1013 +    {
 24.1014 +        MEM_LOG("Ptr out of range for extended MMU command");
 24.1015          return 1;
 24.1016 +    }
 24.1017  
 24.1018      switch ( cmd )
 24.1019      {
 24.1020      case MMUEXT_PIN_L1_TABLE:
 24.1021 -        if ( unlikely(page->flags & PG_guest_pinned) )
 24.1022 -        {
 24.1023 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 24.1024 -            err = 1;
 24.1025 -            break;
 24.1026 -        }
 24.1027 -        err = get_l1_table(pfn);
 24.1028 -        goto mark_as_pinned;
 24.1029 -
 24.1030      case MMUEXT_PIN_L2_TABLE:
 24.1031 -        if ( unlikely(page->flags & PG_guest_pinned) )
 24.1032 -        {
 24.1033 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 24.1034 -            err = 1;
 24.1035 -            break;
 24.1036 -        }
 24.1037 -        err = get_l2_table(pfn);
 24.1038 -
 24.1039 -    mark_as_pinned:
 24.1040 -        if ( unlikely(err) )
 24.1041 +        okay = get_page_and_type_from_pagenr(pfn, 
 24.1042 +                                             (cmd == MMUEXT_PIN_L2_TABLE) ? 
 24.1043 +                                             PGT_l2_page_table : 
 24.1044 +                                             PGT_l1_page_table);
 24.1045 +        if ( unlikely(!okay) )
 24.1046          {
 24.1047              MEM_LOG("Error while pinning pfn %08lx", pfn);
 24.1048              break;
 24.1049          }
 24.1050 -        page->flags |= PG_guest_pinned;
 24.1051 +
 24.1052 +        if ( unlikely(test_and_set_bit(_PGC_guest_pinned, 
 24.1053 +                                       &page->count_and_flags)) )
 24.1054 +        {
 24.1055 +            MEM_LOG("Pfn %08lx already pinned", pfn);
 24.1056 +            put_page_and_type(page);
 24.1057 +            okay = 0;
 24.1058 +            break;
 24.1059 +        }
 24.1060 +
 24.1061          break;
 24.1062  
 24.1063      case MMUEXT_UNPIN_TABLE:
 24.1064 -        if ( unlikely(!DOMAIN_OKAY(page->flags)) )
 24.1065 +        if ( unlikely(!(okay = get_page_from_pagenr(pfn))) )
 24.1066          {
 24.1067 -            err = 1;
 24.1068 -            MEM_LOG("Page %08lx bad domain (dom=%ld)",
 24.1069 -                    ptr, page->flags & PG_domain_mask);
 24.1070 +            MEM_LOG("Page %08lx bad domain (dom=%p)",
 24.1071 +                    ptr, page->u.domain);
 24.1072          }
 24.1073 -        else if ( likely(page->flags & PG_guest_pinned) )
 24.1074 +        else if ( likely(test_and_clear_bit(_PGC_guest_pinned, 
 24.1075 +                                            &page->count_and_flags)) )
 24.1076          {
 24.1077 -            page->flags &= ~PG_guest_pinned;
 24.1078 -            ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
 24.1079 -                put_l1_table(pfn) : put_l2_table(pfn);
 24.1080 +            put_page_and_type(page);
 24.1081          }
 24.1082          else
 24.1083          {
 24.1084 -            err = 1;
 24.1085 +            okay = 0;
 24.1086              MEM_LOG("Pfn %08lx not pinned", pfn);
 24.1087          }
 24.1088          break;
 24.1089  
 24.1090      case MMUEXT_NEW_BASEPTR:
 24.1091 -        err = get_l2_table(pfn);
 24.1092 -        if ( !err )
 24.1093 +        okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table);
 24.1094 +        if ( likely(okay) )
 24.1095          {
 24.1096 -            put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
 24.1097 +            put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable)
 24.1098 +                                          >> PAGE_SHIFT]);
 24.1099              current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
 24.1100              invalidate_shadow_ldt();
 24.1101 -            deferred_op[cpu].flush_tlb = 1;
 24.1102 +            deferred_op[cpu].flags |= DOP_FLUSH_TLB;
 24.1103          }
 24.1104          else
 24.1105          {
 24.1106 -            MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
 24.1107 +            MEM_LOG("Error while installing new baseptr %08lx", ptr);
 24.1108          }
 24.1109          break;
 24.1110          
 24.1111      case MMUEXT_TLB_FLUSH:
 24.1112 -        deferred_op[cpu].flush_tlb = 1;
 24.1113 +        deferred_op[cpu].flags |= DOP_FLUSH_TLB;
 24.1114          break;
 24.1115      
 24.1116      case MMUEXT_INVLPG:
 24.1117 @@ -815,7 +867,7 @@ static int do_extended_command(unsigned 
 24.1118               ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 24.1119               ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 24.1120          {
 24.1121 -            err = 1;
 24.1122 +            okay = 0;
 24.1123              MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 24.1124          }
 24.1125          else if ( (current->mm.ldt_ents != ents) || 
 24.1126 @@ -825,37 +877,39 @@ static int do_extended_command(unsigned 
 24.1127              current->mm.ldt_base = ptr;
 24.1128              current->mm.ldt_ents = ents;
 24.1129              load_LDT(current);
 24.1130 -            deferred_op[cpu].refresh_ldt = (ents != 0);
 24.1131 +            deferred_op[cpu].flags &= ~DOP_RELOAD_LDT;
 24.1132 +            if ( ents != 0 )
 24.1133 +                deferred_op[cpu].flags |= DOP_RELOAD_LDT;
 24.1134          }
 24.1135          break;
 24.1136      }
 24.1137  
 24.1138      default:
 24.1139          MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 24.1140 -        err = 1;
 24.1141 +        okay = 0;
 24.1142          break;
 24.1143      }
 24.1144  
 24.1145 -    return err;
 24.1146 +    return okay;
 24.1147  }
 24.1148  
 24.1149  
 24.1150  int do_mmu_update(mmu_update_t *ureqs, int count)
 24.1151  {
 24.1152      mmu_update_t req;
 24.1153 -    unsigned long flags, pfn, l1e;
 24.1154 +    unsigned long va = 0, flags, pfn, prev_pfn = 0;
 24.1155      struct pfn_info *page;
 24.1156 -    int rc = 0, err = 0, i, cpu = smp_processor_id();
 24.1157 +    int rc = 0, okay = 1, i, cpu = smp_processor_id();
 24.1158      unsigned int cmd;
 24.1159 -    unsigned long cr0 = 0;
 24.1160  
 24.1161 -    perfc_incrc( calls_to_mmu_update ); 
 24.1162 -    perfc_addc( num_page_updates, count );
 24.1163 +    perfc_incrc(calls_to_mmu_update); 
 24.1164 +    perfc_addc(num_page_updates, count);
 24.1165  
 24.1166      for ( i = 0; i < count; i++ )
 24.1167      {
 24.1168          if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 24.1169          {
 24.1170 +            MEM_LOG("Bad copy_from_user");
 24.1171              rc = -EFAULT;
 24.1172              break;
 24.1173          }
 24.1174 @@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1175          cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 24.1176          pfn = req.ptr >> PAGE_SHIFT;
 24.1177  
 24.1178 -        err = 1;
 24.1179 -
 24.1180 -        spin_lock(&current->page_lock);
 24.1181 +        okay = 0;
 24.1182  
 24.1183 -        /* Get the page-frame number that a non-extended command references. */
 24.1184 -        if ( (cmd == MMU_NORMAL_PT_UPDATE) || 
 24.1185 -             (cmd == MMU_UNCHECKED_PT_UPDATE) )
 24.1186 -        {
 24.1187 -            if ( cr0 == 0 )
 24.1188 -            {
 24.1189 -                cr0 = read_cr0();
 24.1190 -                write_cr0(cr0 & ~X86_CR0_WP);
 24.1191 -            }
 24.1192 -            /* Need to use 'get_user' since the VA's PGD may be absent. */
 24.1193 -            __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
 24.1194 -            /* Now check that the VA's PTE isn't absent. */
 24.1195 -            if ( unlikely(!(l1e & _PAGE_PRESENT)) )
 24.1196 -            {
 24.1197 -                MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
 24.1198 -                goto unlock;
 24.1199 -            }
 24.1200 -            /* Finally, get the underlying machine address. */
 24.1201 -            pfn = l1e >> PAGE_SHIFT;
 24.1202 -        }
 24.1203 -
 24.1204 -        /* Least significant bits of 'ptr' demux the operation type. */
 24.1205          switch ( cmd )
 24.1206          {
 24.1207              /*
 24.1208               * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 24.1209               */
 24.1210          case MMU_NORMAL_PT_UPDATE:
 24.1211 -            page  = frame_table + pfn;
 24.1212 -            flags = page->flags;
 24.1213 +            page = &frame_table[pfn];
 24.1214  
 24.1215 -            if ( likely(DOMAIN_OKAY(flags)) )
 24.1216 +            if ( unlikely(!get_page(page, current)) &&
 24.1217 +                 ((current->domain != 0) || !dom0_get_page(page)) )
 24.1218              {
 24.1219 -                switch ( (flags & PG_type_mask) )
 24.1220 -                {
 24.1221 -                case PGT_l1_page_table: 
 24.1222 -                    err = mod_l1_entry((l1_pgentry_t *)req.ptr, 
 24.1223 -                                       mk_l1_pgentry(req.val)); 
 24.1224 -                    break;
 24.1225 -                case PGT_l2_page_table: 
 24.1226 -                    err = mod_l2_entry((l2_pgentry_t *)req.ptr, 
 24.1227 -                                       mk_l2_pgentry(req.val)); 
 24.1228 -                    break;                    
 24.1229 -                default:
 24.1230 -                    if ( page_type_count(page) == 0 )
 24.1231 -                    {
 24.1232 -                        *(unsigned long *)req.ptr = req.val;
 24.1233 -                        err = 0;
 24.1234 -                    }
 24.1235 -                    else
 24.1236 -                        MEM_LOG("Update to bad page %08lx", req.ptr);
 24.1237 -                    break;
 24.1238 -                }
 24.1239 +                MEM_LOG("Could not get page for normal update");
 24.1240 +                break;
 24.1241 +            }
 24.1242 +
 24.1243 +            if ( likely(prev_pfn == pfn) )
 24.1244 +            {
 24.1245 +                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 24.1246              }
 24.1247              else
 24.1248              {
 24.1249 -                MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
 24.1250 -                        current->domain, pfn);
 24.1251 +                if ( prev_pfn != 0 )
 24.1252 +                    unmap_domain_mem((void *)va);
 24.1253 +                va = (unsigned long)map_domain_mem(req.ptr);
 24.1254 +                prev_pfn = pfn;
 24.1255              }
 24.1256 +
 24.1257 +            switch ( (page->type_and_flags & PGT_type_mask) )
 24.1258 +            {
 24.1259 +            case PGT_l1_page_table: 
 24.1260 +                if ( likely(get_page_type(page, PGT_l1_page_table)) )
 24.1261 +                {
 24.1262 +                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 24.1263 +                                        mk_l1_pgentry(req.val)); 
 24.1264 +                    put_page_type(page);
 24.1265 +                }
 24.1266 +                break;
 24.1267 +            case PGT_l2_page_table:
 24.1268 +                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 24.1269 +                {
 24.1270 +                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 24.1271 +                                        mk_l2_pgentry(req.val),
 24.1272 +                                        pfn); 
 24.1273 +                    put_page_type(page);
 24.1274 +                }
 24.1275 +                break;
 24.1276 +            default:
 24.1277 +                if ( likely(get_page_type(page, PGT_writeable_page)) )
 24.1278 +                {
 24.1279 +                    *(unsigned long *)va = req.val;
 24.1280 +                    okay = 1;
 24.1281 +                    put_page_type(page);
 24.1282 +                }
 24.1283 +                break;
 24.1284 +            }
 24.1285 +            
 24.1286 +            put_page(page);
 24.1287 +
 24.1288              break;
 24.1289  
 24.1290          case MMU_UNCHECKED_PT_UPDATE:
 24.1291              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 24.1292              if ( likely(IS_PRIV(current)) )
 24.1293              {
 24.1294 -                *(unsigned long *)req.ptr = req.val;
 24.1295 -                err = 0;
 24.1296 +                if ( likely(prev_pfn == pfn) )
 24.1297 +                {
 24.1298 +                    va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 24.1299 +                }
 24.1300 +                else
 24.1301 +                {
 24.1302 +                    if ( prev_pfn != 0 )
 24.1303 +                        unmap_domain_mem((void *)va);
 24.1304 +                    va = (unsigned long)map_domain_mem(req.ptr);
 24.1305 +                    prev_pfn = pfn;
 24.1306 +                }
 24.1307 +                *(unsigned long *)va = req.val;
 24.1308 +                okay = 1;
 24.1309              }
 24.1310              else
 24.1311              {
 24.1312 @@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1313              break;
 24.1314              
 24.1315          case MMU_MACHPHYS_UPDATE:
 24.1316 -            page = frame_table + pfn;
 24.1317 +            page = &frame_table[pfn];
 24.1318              if ( unlikely(pfn >= max_page) )
 24.1319              {
 24.1320                  MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
 24.1321              }
 24.1322 -            else if ( likely(DOMAIN_OKAY(page->flags)) )
 24.1323 +            else if ( likely(get_page(page, current)) ||
 24.1324 +                      ((current->domain == 0) && dom0_get_page(page)) )
 24.1325              {
 24.1326                  machine_to_phys_mapping[pfn] = req.val;
 24.1327 -                err = 0;
 24.1328 +                okay = 1;
 24.1329 +                put_page(page);
 24.1330              }
 24.1331 -            else
 24.1332 -            {
 24.1333 -                MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
 24.1334 -                        current->domain, pfn);
 24.1335 -            }            
 24.1336              break;
 24.1337  
 24.1338              /*
 24.1339 @@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1340               */
 24.1341          case MMU_EXTENDED_COMMAND:
 24.1342              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 24.1343 -            err = do_extended_command(req.ptr, req.val);
 24.1344 +            okay = do_extended_command(req.ptr, req.val);
 24.1345              break;
 24.1346  
 24.1347          default:
 24.1348 @@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1349              break;
 24.1350          }
 24.1351  
 24.1352 -    unlock:
 24.1353 -        spin_unlock(&current->page_lock);
 24.1354 -
 24.1355 -        if ( unlikely(err) )
 24.1356 +        if ( unlikely(!okay) )
 24.1357          {
 24.1358              rc = -EINVAL;
 24.1359              break;
 24.1360 @@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1361          ureqs++;
 24.1362      }
 24.1363  
 24.1364 -    if ( deferred_op[cpu].flush_tlb )
 24.1365 -    {
 24.1366 -        deferred_op[cpu].flush_tlb = 0;
 24.1367 -        __write_cr3_counted(pagetable_val(current->mm.pagetable));
 24.1368 -    }
 24.1369 +    if ( prev_pfn != 0 )
 24.1370 +        unmap_domain_mem((void *)va);
 24.1371 +
 24.1372 +    flags = deferred_op[cpu].flags;
 24.1373 +    deferred_op[cpu].flags = 0;
 24.1374  
 24.1375 -    if ( deferred_op[cpu].refresh_ldt )
 24.1376 -    {
 24.1377 -        deferred_op[cpu].refresh_ldt = 0;
 24.1378 +    if ( flags & DOP_FLUSH_TLB )
 24.1379 +        write_cr3_counted(pagetable_val(current->mm.pagetable));
 24.1380 +
 24.1381 +    if ( flags & DOP_RELOAD_LDT )
 24.1382          (void)map_ldt_shadow_page(0);
 24.1383 -    }
 24.1384  
 24.1385 -    if ( cr0 != 0 )
 24.1386 -        write_cr0(cr0);
 24.1387 +    if ( unlikely(flags & DOP_RESTORE_CR0) )
 24.1388 +        write_cr0(deferred_op[cpu].cr0);
 24.1389  
 24.1390      return rc;
 24.1391  }
 24.1392 @@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, i
 24.1393  
 24.1394  int do_update_va_mapping(unsigned long page_nr, 
 24.1395                           unsigned long val, 
 24.1396 -                         unsigned long flags)
 24.1397 +                         unsigned long caller_flags)
 24.1398  {
 24.1399 -    unsigned long _x, cr0 = 0;
 24.1400      struct task_struct *p = current;
 24.1401 -    int err = -EINVAL;
 24.1402 +    int err = 0;
 24.1403 +    unsigned int cpu = p->processor;
 24.1404 +    unsigned long defer_flags;
 24.1405  
 24.1406      if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
 24.1407 -        goto out;
 24.1408 -
 24.1409 -    spin_lock(&p->page_lock);
 24.1410 +        return -EINVAL;
 24.1411  
 24.1412 -    /* Check that the VA's page-directory entry is present.. */
 24.1413 -    if ( unlikely((err = __get_user(_x, (unsigned long *)
 24.1414 -                                    (&linear_pg_table[page_nr]))) != 0) )
 24.1415 -        goto unlock_and_out;
 24.1416 +    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
 24.1417 +                                mk_l1_pgentry(val))) )
 24.1418 +        err = -EINVAL;
 24.1419  
 24.1420 -    /* If the VA's page-directory entry is read-only, we frob the WP bit. */
 24.1421 -    if ( unlikely(__put_user(_x, (unsigned long *)
 24.1422 -                             (&linear_pg_table[page_nr]))) )
 24.1423 -    {
 24.1424 -        cr0 = read_cr0();
 24.1425 -        write_cr0(cr0 & ~X86_CR0_WP);        
 24.1426 -    }
 24.1427 +    defer_flags = deferred_op[cpu].flags;
 24.1428 +    deferred_op[cpu].flags = 0;
 24.1429  
 24.1430 -    if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr], 
 24.1431 -                               mk_l1_pgentry(val)) != 0) )
 24.1432 -    {
 24.1433 -        err = -EINVAL;
 24.1434 -        goto check_cr0_unlock_and_out;
 24.1435 -    }
 24.1436 -
 24.1437 -    if ( unlikely(flags & UVMF_INVLPG) )
 24.1438 +    if ( unlikely(defer_flags & DOP_FLUSH_TLB) || 
 24.1439 +         unlikely(caller_flags & UVMF_FLUSH_TLB) )
 24.1440 +        write_cr3_counted(pagetable_val(p->mm.pagetable));
 24.1441 +    else if ( unlikely(caller_flags & UVMF_INVLPG) )
 24.1442          __flush_tlb_one(page_nr << PAGE_SHIFT);
 24.1443  
 24.1444 -    if ( unlikely(flags & UVMF_FLUSH_TLB) )
 24.1445 -        __write_cr3_counted(pagetable_val(p->mm.pagetable));
 24.1446 +    if ( unlikely(defer_flags & DOP_RELOAD_LDT) )
 24.1447 +        (void)map_ldt_shadow_page(0);
 24.1448  
 24.1449 - check_cr0_unlock_and_out:
 24.1450 -    if ( unlikely(cr0 != 0) )
 24.1451 -        write_cr0(cr0);
 24.1452 - unlock_and_out:
 24.1453 -    spin_unlock(&p->page_lock);
 24.1454 - out:
 24.1455 +    if ( unlikely(defer_flags & DOP_RESTORE_CR0) )
 24.1456 +        write_cr0(deferred_op[cpu].cr0);
 24.1457 +
 24.1458      return err;
 24.1459  }
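
The rewritten do_mmu_update() above consumes an array of mmu_update_t requests: the low two bits of 'ptr' select the command, 'val' carries the new value, and 'ptr' is now interpreted as a machine address. A minimal guest-side sketch of batching several page-table writes into a single call (HYPERVISOR_mmu_update() and virt_to_machine() are assumed guest helpers, not defined in this changeset):

    /* Sketch only: batching normal PT updates for do_mmu_update(). */
    #define UPDATE_BATCH 16

    static mmu_update_t update_queue[UPDATE_BATCH];
    static int queue_idx;

    static void flush_update_queue(void)
    {
        if ( queue_idx != 0 )
            (void)HYPERVISOR_mmu_update(update_queue, queue_idx); /* assumed hypercall wrapper */
        queue_idx = 0;
    }

    static void queue_l1_update(l1_pgentry_t *pte, unsigned long val)
    {
        /* Machine address of the PTE; low bits 00 select MMU_NORMAL_PT_UPDATE. */
        update_queue[queue_idx].ptr = virt_to_machine(pte) | MMU_NORMAL_PT_UPDATE;
        update_queue[queue_idx].val = val;
        if ( ++queue_idx == UPDATE_BATCH )
            flush_update_queue();
    }
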
    25.1 --- a/xen/common/network.c	Sat Dec 20 11:49:50 2003 +0000
    25.2 +++ b/xen/common/network.c	Sat Dec 20 12:44:39 2003 +0000
    25.3 @@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain)
    25.4      if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG();
    25.5      new_ring = (net_ring_t *)get_free_page(GFP_KERNEL);
    25.6      clear_page(new_ring);
    25.7 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain);
    25.8 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
    25.9  
   25.10      /*
   25.11       * Fill in the new vif struct. Note that, while the vif's refcnt is
    26.1 --- a/xen/common/page_alloc.c	Sat Dec 20 11:49:50 2003 +0000
    26.2 +++ b/xen/common/page_alloc.c	Sat Dec 20 12:44:39 2003 +0000
    26.3 @@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned
    26.4  /* Release a PHYSICAL address range to the allocator. */
    26.5  void release_bytes_to_allocator(unsigned long min, unsigned long max)
    26.6  {
    26.7 -    min = round_pgup  (min) + PAGE_OFFSET;
    26.8 -    max = round_pgdown(max) + PAGE_OFFSET;
    26.9 +    min = round_pgup  (min);
   26.10 +    max = round_pgdown(max);
   26.11  
   26.12      while ( min < max )
   26.13      {
   26.14 -        __free_pages(min, 0);
   26.15 +        __free_pages(min+PAGE_OFFSET, 0);
   26.16          min += PAGE_SIZE;
   26.17      }
   26.18  }
   26.19 @@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask,
   26.20  retry:
   26.21      spin_lock_irqsave(&alloc_lock, flags);
   26.22  
   26.23 -
   26.24      /* Find smallest order which can satisfy the request. */
   26.25      for ( i = order; i < FREELIST_SIZE; i++ ) {
   26.26  	if ( !FREELIST_EMPTY(free_head[i]) ) 
    27.1 --- a/xen/drivers/block/ll_rw_blk.c	Sat Dec 20 11:49:50 2003 +0000
    27.2 +++ b/xen/drivers/block/ll_rw_blk.c	Sat Dec 20 12:44:39 2003 +0000
    27.3 @@ -14,31 +14,15 @@
    27.4  #include <xeno/types.h>
    27.5  #include <xeno/lib.h>
    27.6  #include <xeno/sched.h>
    27.7 -/*#include <xeno/kernel_stat.h>*/
    27.8  #include <xeno/errno.h>
    27.9 -/*#include <xeno/locks.h>*/
   27.10  #include <xeno/mm.h>
   27.11 -/*#include <xeno/swap.h>*/
   27.12  #include <xeno/init.h>
   27.13 -/*#include <xeno/smp_lock.h>*/
   27.14 -/*#include <xeno/completion.h>*/
   27.15 -
   27.16  #include <asm/system.h>
   27.17  #include <asm/io.h>
   27.18  #include <xeno/blk.h>
   27.19 -/*#include <xeno/highmem.h>*/
   27.20  #include <xeno/slab.h>
   27.21  #include <xeno/module.h>
   27.22  
   27.23 -/*
   27.24 - * KAF: We can turn off noise relating to barking guest-OS requests.
   27.25 - */
   27.26 -#if 0
   27.27 -#define DPRINTK(_f, _a...) printk(_f , ## _a)
   27.28 -#else
   27.29 -#define DPRINTK(_f, _a...) ((void)0)
   27.30 -#endif
   27.31 -
   27.32  /* This will die as all synchronous stuff is coming to an end */
   27.33  #if 0 
   27.34  #define complete(_r) panic("completion.h stuff may be needed...")
   27.35 @@ -47,8 +31,6 @@
   27.36  #define complete(_r) (*(int *)(_r) = 0)
   27.37  #endif
   27.38  
   27.39 -
   27.40 -
   27.41  /*
   27.42   * MAC Floppy IWM hooks
   27.43   */
    28.1 --- a/xen/drivers/block/xen_block.c	Sat Dec 20 11:49:50 2003 +0000
    28.2 +++ b/xen/drivers/block/xen_block.c	Sat Dec 20 12:44:39 2003 +0000
    28.3 @@ -20,12 +20,6 @@
    28.4  #include <xeno/vbd.h>
    28.5  #include <xeno/slab.h>
    28.6  
    28.7 -#if 0
    28.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
    28.9 -#else
   28.10 -#define DPRINTK(_f, _a...) ((void)0)
   28.11 -#endif
   28.12 -
   28.13  /*
   28.14   * These are rather arbitrary. They are fairly large because adjacent
   28.15   * requests pulled from a communication ring are quite likely to end
   28.16 @@ -60,15 +54,11 @@ static atomic_t nr_pending;
   28.17  
   28.18  static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
   28.19  
   28.20 -static int __buffer_is_valid(struct task_struct *p, 
   28.21 -                             unsigned long buffer, 
   28.22 -                             unsigned short size,
   28.23 -                             int writeable_buffer);
   28.24 -static void __lock_buffer(unsigned long buffer,
   28.25 -                          unsigned short size,
   28.26 -                          int writeable_buffer);
   28.27 -static void unlock_buffer(struct task_struct *p,
   28.28 -                          unsigned long buffer,
   28.29 +static int lock_buffer(struct task_struct *p,
   28.30 +                       unsigned long buffer,
   28.31 +                       unsigned short size,
   28.32 +                       int writeable_buffer);
   28.33 +static void unlock_buffer(unsigned long buffer,
   28.34                            unsigned short size,
   28.35                            int writeable_buffer);
   28.36  
   28.37 @@ -185,8 +175,7 @@ static void end_block_io_op_softirq(stru
   28.38      {
   28.39          pending_req = bh->pending_req;
   28.40          
   28.41 -        unlock_buffer(pending_req->domain, 
   28.42 -                      virt_to_phys(bh->b_data), 
   28.43 +        unlock_buffer(virt_to_phys(bh->b_data), 
   28.44                        bh->b_size, 
   28.45                        (pending_req->operation==READ));
   28.46          
   28.47 @@ -321,55 +310,10 @@ long do_block_io_op(block_io_op_t *u_blo
   28.48   * DOWNWARD CALLS -- These interface with the block-device layer proper.
   28.49   */
   28.50  
   28.51 -static int __buffer_is_valid(struct task_struct *p, 
   28.52 -                             unsigned long buffer, 
   28.53 -                             unsigned short size,
   28.54 -                             int writeable_buffer)
   28.55 -{
   28.56 -    unsigned long    pfn;
   28.57 -    struct pfn_info *page;
   28.58 -    int rc = 0;
   28.59 -
   28.60 -    /* A request may span multiple page frames. Each must be checked. */
   28.61 -    for ( pfn = buffer >> PAGE_SHIFT; 
   28.62 -          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   28.63 -          pfn++ )
   28.64 -    {
   28.65 -        /* Each frame must be within bounds of machine memory. */
   28.66 -        if ( pfn >= max_page )
   28.67 -        {
   28.68 -            DPRINTK("pfn out of range: %08lx\n", pfn);
   28.69 -            goto out;
   28.70 -        }
   28.71 -
   28.72 -        page = frame_table + pfn;
   28.73 -
   28.74 -        /* Each frame must belong to the requesting domain. */
   28.75 -        if ( (page->flags & PG_domain_mask) != p->domain )
   28.76 -        {
   28.77 -            DPRINTK("bad domain: expected %d, got %ld\n", 
   28.78 -                    p->domain, page->flags & PG_domain_mask);
   28.79 -            goto out;
   28.80 -        }
   28.81 -
   28.82 -        /* If reading into the frame, the frame must be writeable. */
   28.83 -        if ( writeable_buffer &&
   28.84 -             ((page->flags & PG_type_mask) != PGT_writeable_page) &&
   28.85 -             (page_type_count(page) != 0) )
   28.86 -        {
   28.87 -            DPRINTK("non-writeable page passed for block read\n");
   28.88 -            goto out;
   28.89 -        }
   28.90 -    }    
   28.91 -
   28.92 -    rc = 1;
   28.93 - out:
   28.94 -    return rc;
   28.95 -}
   28.96 -
   28.97 -static void __lock_buffer(unsigned long buffer,
   28.98 -                          unsigned short size,
   28.99 -                          int writeable_buffer)
  28.100 +static int lock_buffer(struct task_struct *p,
  28.101 +                       unsigned long buffer,
  28.102 +                       unsigned short size,
  28.103 +                       int writeable_buffer)
  28.104  {
  28.105      unsigned long    pfn;
  28.106      struct pfn_info *page;
  28.107 @@ -378,40 +322,48 @@ static void __lock_buffer(unsigned long 
  28.108            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
  28.109            pfn++ )
  28.110      {
  28.111 -        page = frame_table + pfn;
  28.112 -        if ( writeable_buffer )
  28.113 +        if ( unlikely(pfn >= max_page) )
  28.114 +            goto fail;
  28.115 +
  28.116 +        page = &frame_table[pfn];
  28.117 +
  28.118 +        if ( unlikely(!get_page(page, p)) )
  28.119 +            goto fail;
  28.120 +
  28.121 +        if ( writeable_buffer && 
  28.122 +             unlikely(!get_page_type(page, PGT_writeable_page)) )
  28.123          {
  28.124 -            if ( page_type_count(page) == 0 )
  28.125 -            {
  28.126 -                page->flags &= ~PG_type_mask;
  28.127 -                /* No need for PG_need_flush here. */
  28.128 -                page->flags |= PGT_writeable_page;
  28.129 -            }
  28.130 -            get_page_type(page);
  28.131 +            put_page(page);
  28.132 +            goto fail;
  28.133          }
  28.134 -        get_page_tot(page);
  28.135      }
  28.136 +
  28.137 +    return 1;
  28.138 +
  28.139 + fail:
  28.140 +    while ( pfn-- > (buffer >> PAGE_SHIFT) )
  28.141 +    {        
  28.142 +        if ( writeable_buffer )
  28.143 +            put_page_type(&frame_table[pfn]);
  28.144 +        put_page(&frame_table[pfn]);
  28.145 +    }
  28.146 +    return 0;
  28.147  }
  28.148  
  28.149 -static void unlock_buffer(struct task_struct *p,
  28.150 -                          unsigned long buffer,
  28.151 +static void unlock_buffer(unsigned long buffer,
  28.152                            unsigned short size,
  28.153                            int writeable_buffer)
  28.154  {
  28.155 -    unsigned long    pfn;
  28.156 -    struct pfn_info *page;
  28.157 +    unsigned long pfn;
  28.158  
  28.159 -    spin_lock(&p->page_lock);
  28.160      for ( pfn = buffer >> PAGE_SHIFT; 
  28.161            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
  28.162            pfn++ )
  28.163      {
  28.164 -        page = frame_table + pfn;
  28.165          if ( writeable_buffer )
  28.166 -            put_page_type(page);
  28.167 -        put_page_tot(page);
  28.168 +            put_page_type(&frame_table[pfn]);
  28.169 +        put_page(&frame_table[pfn]);
  28.170      }
  28.171 -    spin_unlock(&p->page_lock);
  28.172  }
  28.173  
  28.174  static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
  28.175 @@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct 
  28.176      int new_segs, nr_psegs = 0;
  28.177      phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
  28.178  
  28.179 -    spin_lock(&p->page_lock);
  28.180 -
  28.181      /* Check that number of segments is sane. */
  28.182      if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
  28.183      {
  28.184 @@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct 
  28.185              goto bad_descriptor;
  28.186          }
  28.187  
  28.188 -        if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
  28.189 +        if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) )
  28.190  	{
  28.191              DPRINTK("invalid buffer\n");
  28.192              goto bad_descriptor;
  28.193 @@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct 
  28.194                          req->sector_number + tot_sects, 
  28.195                          req->sector_number + tot_sects + nr_sects, 
  28.196                          req->device); 
  28.197 +                unlock_buffer(buffer, nr_sects<<9, (operation==READ));
  28.198                  goto bad_descriptor;
  28.199              }
  28.200  
  28.201 @@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct 
  28.202          if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
  28.203      }
  28.204  
  28.205 -    /* Lock pages associated with each buffer head. */
  28.206 -    for ( i = 0; i < nr_psegs; i++ )
  28.207 -        __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 
  28.208 -                      (operation==READ));
  28.209 -    spin_unlock(&p->page_lock);
  28.210 -
  28.211      atomic_inc(&nr_pending);
  28.212      pending_req = pending_reqs + pending_ring[pending_cons];
  28.213      PENDREQ_IDX_INC(pending_cons);
  28.214 @@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct 
  28.215      return;
  28.216  
  28.217   bad_descriptor:
  28.218 -    spin_unlock(&p->page_lock);
  28.219      make_response(p, req->id, req->operation, 1);
  28.220  } 
  28.221  
  28.222 @@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct
  28.223      if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
  28.224      p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
  28.225      clear_page(p->blk_ring_base);
  28.226 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
  28.227 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
  28.228      p->blkdev_list.next = NULL;
  28.229      spin_lock_init(&p->vbd_lock);
  28.230  }
  28.231 @@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_str
  28.232  {
  28.233      ASSERT(!__on_blkdev_list(p));
  28.234      UNSHARE_PFN(virt_to_page(p->blk_ring_base));
  28.235 -    free_page((unsigned long)p->blk_ring_base);
  28.236      destroy_all_vbds(p);
  28.237  }
  28.238  
    29.1 --- a/xen/drivers/block/xen_vbd.c	Sat Dec 20 11:49:50 2003 +0000
    29.2 +++ b/xen/drivers/block/xen_vbd.c	Sat Dec 20 12:44:39 2003 +0000
    29.3 @@ -23,13 +23,6 @@
    29.4  extern int ide_probe_devices(xen_disk_info_t *xdi);
    29.5  extern int scsi_probe_devices(xen_disk_info_t *xdi);
    29.6  
    29.7 -
    29.8 -#if 0
    29.9 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
   29.10 -#else
   29.11 -#define DPRINTK(_f, _a...) ((void)0)
   29.12 -#endif
   29.13 -
   29.14  /* XXX SMH: crappy 'hash function' .. fix when care. */
   29.15  #define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1))
   29.16  
   29.17 @@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe)
   29.18      if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) )
   29.19      { 
   29.20          /* Privileged domains always get access to the 'real' devices. */
   29.21 -        if ( (ret = ide_probe_devices(&probe->xdi)) != 0 ) 
   29.22 -        {
   29.23 -            DPRINTK("vbd_probe: error %d in probing ide devices\n", ret); 
   29.24 +        if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) ||
   29.25 +             ((ret = scsi_probe_devices(&probe->xdi)) != 0) )
   29.26              goto out; 
   29.27 -        }
   29.28 -        if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 )
   29.29 -        { 
   29.30 -            DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret); 
   29.31 -            goto out; 
   29.32 -        }
   29.33      } 
   29.34  
   29.35      if ( probe->domain == VBD_PROBE_ALL )
   29.36 @@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe)
   29.37              { 
   29.38                  if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
   29.39                  { 
   29.40 -                    DPRINTK("vbd_probe: error %d in probing virtual devices\n",
   29.41 -                            ret); 
   29.42                      read_unlock_irqrestore(&tasklist_lock, flags);
   29.43                      goto out; 
   29.44                  }
   29.45 @@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe)
   29.46          }
   29.47          read_unlock_irqrestore(&tasklist_lock, flags);
   29.48      } 
   29.49 -    else 
   29.50 -    { 
   29.51 -        if ( (ret = vbd_probe_devices(&probe->xdi, p)) )
   29.52 -        { 
   29.53 -            DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret); 
   29.54 -            goto out; 
   29.55 -        }
   29.56 -
   29.57 -    }
   29.58 +    else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
   29.59 +        goto out; 
   29.60  
   29.61   out: 
   29.62 +    if ( ret != 0 )
   29.63 +        DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 
   29.64      if ( p != NULL )
   29.65          put_task_struct(p); 
   29.66      return ret; 
    30.1 --- a/xen/drivers/net/e1000/e1000_main.c	Sat Dec 20 11:49:50 2003 +0000
    30.2 +++ b/xen/drivers/net/e1000/e1000_main.c	Sat Dec 20 12:44:39 2003 +0000
    30.3 @@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, st
    30.4  static void
    30.5  e1000_tx_timeout(struct net_device *netdev)
    30.6  {
    30.7 +#if 0
    30.8  	struct e1000_adapter *adapter = netdev->priv;
    30.9  
   30.10  	/* Do the reset outside of interrupt context */
   30.11 -	//schedule_work(&adapter->tx_timeout_task);
   30.12 +	schedule_work(&adapter->tx_timeout_task);
   30.13 +#endif
   30.14  	e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN
   30.15  }
   30.16  
    31.1 --- a/xen/include/asm-i386/atomic.h	Sat Dec 20 11:49:50 2003 +0000
    31.2 +++ b/xen/include/asm-i386/atomic.h	Sat Dec 20 12:44:39 2003 +0000
    31.3 @@ -186,15 +186,6 @@ static __inline__ int atomic_add_negativ
    31.4  	return c;
    31.5  }
    31.6  
    31.7 -/* These are x86-specific, used by some header files */
    31.8 -#define atomic_clear_mask(mask, addr) \
    31.9 -__asm__ __volatile__(LOCK "andl %0,%1" \
   31.10 -: : "r" (~(mask)),"m" (*addr) : "memory")
   31.11 -
   31.12 -#define atomic_set_mask(mask, addr) \
   31.13 -__asm__ __volatile__(LOCK "orl %0,%1" \
   31.14 -: : "r" (mask),"m" (*addr) : "memory")
   31.15 -
   31.16  /* Atomic operations are already serializing on x86 */
   31.17  #define smp_mb__before_atomic_dec()	barrier()
   31.18  #define smp_mb__after_atomic_dec()	barrier()
    32.1 --- a/xen/include/asm-i386/flushtlb.h	Sat Dec 20 11:49:50 2003 +0000
    32.2 +++ b/xen/include/asm-i386/flushtlb.h	Sat Dec 20 12:44:39 2003 +0000
    32.3 @@ -1,40 +1,39 @@
    32.4  /******************************************************************************
    32.5   * flushtlb.h
    32.6   * 
    32.7 - * TLB flush macros that count flushes.  Counting is used to enforce 
    32.8 - * zero-copy safety, particularily for the network code.
    32.9 - *
   32.10 - * akw - Jan 21, 2003
   32.11 + * TLB flushes are timestamped using a global virtual 'clock' which ticks
   32.12 + * on any TLB flush on any processor.
   32.13 + * 
   32.14 + * Copyright (c) 2003, K A Fraser
   32.15   */
   32.16  
   32.17 -#ifndef __FLUSHTLB_H
   32.18 -#define __FLUSHTLB_H
   32.19 +#ifndef __FLUSHTLB_H__
   32.20 +#define __FLUSHTLB_H__
   32.21  
   32.22  #include <xeno/smp.h>
   32.23 -#include <asm/atomic.h>
   32.24  
   32.25 -atomic_t tlb_flush_count[NR_CPUS];
   32.26 -
   32.27 -#define __write_cr3_counted(__pa)                                       \
   32.28 -    do {                                                                \
   32.29 -                __asm__ __volatile__ (                                  \
   32.30 -                        "movl %0, %%cr3;"                               \
   32.31 -                        :: "r" (__pa)                                   \
   32.32 -                        : "memory");                                    \
   32.33 -                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
   32.34 -    } while (0)
   32.35 +/*
   32.36 + * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the
   32.37 + * system is guaranteed to have been flushed.
   32.38 + */
   32.39 +#define GLOBAL_FLUSH_PERIOD (1<<16)
   32.40  
   32.41 -#define __flush_tlb_counted()                                           \
   32.42 -        do {                                                            \
   32.43 -                unsigned int tmpreg;                                    \
   32.44 -                                                                        \
   32.45 -                __asm__ __volatile__(                                   \
   32.46 -                        "movl %%cr3, %0;  # flush TLB \n"               \
   32.47 -                        "movl %0, %%cr3;                "               \
   32.48 -                        : "=r" (tmpreg)                                 \
   32.49 -                        :: "memory");                                   \
   32.50 -                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
   32.51 -        } while (0)
   32.52 +/*
   32.53 + * '_cpu_stamp' is the current timestamp for the CPU we are testing.
   32.54 + * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last 
   32.55 + * used for a purpose that may have caused the CPU's TLB to become tainted.
   32.56 + */
   32.57 +#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \
   32.58 + (((_cpu_stamp) > (_lastuse_stamp)) ||         \
   32.59 +  (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD)))
   32.60  
   32.61 -#endif
   32.62 -                           
   32.63 +extern unsigned long tlbflush_mask;
   32.64 +extern unsigned long tlbflush_clock;
   32.65 +extern unsigned long tlbflush_time[NR_CPUS];
   32.66 +
   32.67 +extern void new_tlbflush_clock_period(void);
   32.68 +
   32.69 +extern void write_cr3_counted(unsigned long pa);
   32.70 +extern void flush_tlb_counted(void);
   32.71 +
   32.72 +#endif /* __FLUSHTLB_H__ */
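
A minimal sketch of how the new timestamping is meant to be consumed, assuming tlbflush_time[] holds each CPU's last-flush stamp and the per-frame tlbflush_timestamp (added to pfn_info in xeno/mm.h below) records when a frame was last used in a TLB-tainting way; the flush policy shown is illustrative only:

    /* Sketch only: decide whether this CPU must flush before reusing 'page'. */
    static inline void flush_before_reuse(struct pfn_info *page)
    {
        int cpu = smp_processor_id();

        if ( NEED_FLUSH(tlbflush_time[cpu], page->tlbflush_timestamp) )
            flush_tlb_counted(); /* assumed to record a fresh stamp for this CPU */
    }
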
    33.1 --- a/xen/include/asm-i386/io.h	Sat Dec 20 11:49:50 2003 +0000
    33.2 +++ b/xen/include/asm-i386/io.h	Sat Dec 20 12:44:39 2003 +0000
    33.3 @@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsign
    33.4  	return __va(address);
    33.5  }
    33.6  
    33.7 -/*
    33.8 - * Change "struct page" to physical address.
    33.9 - */
   33.10 -#define page_to_phys(page)	((page - frame_table) << PAGE_SHIFT)
   33.11 +#define page_to_pfn(_page)  ((unsigned long)((_page) - frame_table))
   33.12 +#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT)
   33.13 +#define page_to_virt(_page) phys_to_virt(page_to_phys(_page))
   33.14  
   33.15  extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
   33.16  
    34.1 --- a/xen/include/asm-i386/page.h	Sat Dec 20 11:49:50 2003 +0000
    34.2 +++ b/xen/include/asm-i386/page.h	Sat Dec 20 12:44:39 2003 +0000
    34.3 @@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } 
    34.4  extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE];
    34.5  extern void paging_init(void);
    34.6  
    34.7 -#define __flush_tlb() __flush_tlb_counted()
    34.8 +#define __flush_tlb() flush_tlb_counted()
    34.9  
   34.10  /* Flush global pages as well. */
   34.11  
   34.12 @@ -111,10 +111,10 @@ extern void paging_init(void);
   34.13          } while (0)
   34.14  
   34.15  
   34.16 -#define __flush_tlb_all()						\
   34.17 +#define __flush_tlb_pge()						\
   34.18  	do {								\
   34.19                  __pge_off();                                            \
   34.20 -		__flush_tlb_counted();					\
   34.21 +		flush_tlb_counted();					\
   34.22                  __pge_on();                                             \
   34.23  	} while (0)
   34.24  
    35.1 --- a/xen/include/asm-i386/pgalloc.h	Sat Dec 20 11:49:50 2003 +0000
    35.2 +++ b/xen/include/asm-i386/pgalloc.h	Sat Dec 20 12:44:39 2003 +0000
    35.3 @@ -47,28 +47,24 @@
    35.4  
    35.5  #ifndef CONFIG_SMP
    35.6  
    35.7 -#define flush_tlb()         __flush_tlb()
    35.8 -#define flush_tlb_all()     __flush_tlb_all()
    35.9 -#define local_flush_tlb()   __flush_tlb()
   35.10 -#define flush_tlb_cpu(_cpu) __flush_tlb()
   35.11 +#define flush_tlb()           __flush_tlb()
   35.12 +#define flush_tlb_all()       __flush_tlb()
   35.13 +#define flush_tlb_all_pge()   __flush_tlb_pge()
   35.14 +#define local_flush_tlb()     __flush_tlb()
   35.15 +#define flush_tlb_cpu(_cpu)   __flush_tlb()
   35.16 +#define flush_tlb_mask(_mask) __flush_tlb()
   35.17  
   35.18  #else
   35.19  
   35.20  #include <xeno/smp.h>
   35.21  
   35.22 -#define flush_tlb()	    __flush_tlb()
   35.23 -#define local_flush_tlb()   __flush_tlb()
   35.24 -
   35.25 -extern void flush_tlb_all(void);
   35.26 +extern void flush_tlb_mask(unsigned long mask);
   35.27 +extern void flush_tlb_all_pge(void);
   35.28  
   35.29 -extern void flush_tlb_others(unsigned long cpumask);
   35.30 -static inline void flush_tlb_cpu(unsigned int cpu)
   35.31 -{
   35.32 -    if ( cpu == smp_processor_id() )
   35.33 -        __flush_tlb();
   35.34 -    else
   35.35 -        flush_tlb_others(1<<cpu);
   35.36 -}
   35.37 +#define flush_tlb()	    __flush_tlb()
   35.38 +#define flush_tlb_all()     flush_tlb_mask((1 << smp_num_cpus) - 1)
   35.39 +#define local_flush_tlb()   __flush_tlb()
   35.40 +#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu))
   35.41  
   35.42  #endif
   35.43  
    36.1 --- a/xen/include/asm-i386/smp.h	Sat Dec 20 11:49:50 2003 +0000
    36.2 +++ b/xen/include/asm-i386/smp.h	Sat Dec 20 12:44:39 2003 +0000
    36.3 @@ -1,15 +1,8 @@
    36.4  #ifndef __ASM_SMP_H
    36.5  #define __ASM_SMP_H
    36.6  
    36.7 -#ifndef __ASSEMBLY__
    36.8  #include <xeno/config.h>
    36.9  #include <asm/ptrace.h>
   36.10 -#include <asm/fixmap.h>
   36.11 -#include <asm/bitops.h>
   36.12 -#include <asm/mpspec.h>
   36.13 -#include <asm/io_apic.h>
   36.14 -#include <asm/apic.h>
   36.15 -#endif
   36.16  
   36.17  #ifdef CONFIG_SMP
   36.18  #define TARGET_CPUS cpu_online_map
   36.19 @@ -18,8 +11,6 @@
   36.20  #endif
   36.21  
   36.22  #ifdef CONFIG_SMP
   36.23 -#ifndef __ASSEMBLY__
   36.24 -
   36.25  /*
   36.26   * Private routines/data
   36.27   */
   36.28 @@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id);	
   36.29  
   36.30  #define smp_processor_id() (current->processor)
   36.31  
   36.32 +#include <asm/fixmap.h>
   36.33 +#include <asm/apic.h>
   36.34 +
   36.35  static __inline int hard_smp_processor_id(void)
   36.36  {
   36.37  	/* we don't want to mark this access volatile - bad code generation */
   36.38 @@ -86,7 +80,5 @@ static __inline int logical_smp_processo
   36.39  	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
   36.40  }
   36.41  
   36.42 -#endif /* !__ASSEMBLY__ */
   36.43 -
   36.44  #endif
   36.45  #endif
    37.1 --- a/xen/include/asm-i386/spinlock.h	Sat Dec 20 11:49:50 2003 +0000
    37.2 +++ b/xen/include/asm-i386/spinlock.h	Sat Dec 20 12:44:39 2003 +0000
    37.3 @@ -1,11 +1,10 @@
    37.4  #ifndef __ASM_SPINLOCK_H
    37.5  #define __ASM_SPINLOCK_H
    37.6  
    37.7 +#include <xeno/config.h>
    37.8 +#include <xeno/lib.h>
    37.9  #include <asm/atomic.h>
   37.10  #include <asm/rwlock.h>
   37.11 -#include <asm/page.h>
   37.12 -#include <xeno/config.h>
   37.13 -#include <xeno/lib.h>
   37.14  
   37.15  #if 0
   37.16  #define SPINLOCK_DEBUG	1
    38.1 --- a/xen/include/asm-i386/system.h	Sat Dec 20 11:49:50 2003 +0000
    38.2 +++ b/xen/include/asm-i386/system.h	Sat Dec 20 12:44:39 2003 +0000
    38.3 @@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(vo
    38.4  #define cmpxchg(ptr,o,n)\
    38.5  	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
    38.6  					(unsigned long)(n),sizeof(*(ptr))))
    38.7 -    
    38.8 +
    38.9 +
   38.10 +/*
    38.11 + * This macro attempts to atomically change the longword at _p from _o to _n.
   38.12 + * If this access causes a fault then we return 1, otherwise we return 0.
    38.13 + * If no fault occurs then _o is updated to the value we saw at _p. If this
   38.14 + * is the same as the initial value of _o then _n is written to location _p.
   38.15 + */
   38.16 +#define cmpxchg_user(_p,_o,_n)                                          \
   38.17 +({                                                                      \
   38.18 +    int _rc;                                                            \
   38.19 +    __asm__ __volatile__ (                                              \
   38.20 +        "1: " LOCK_PREFIX "cmpxchgl %2,%3\n"                            \
   38.21 +        "2:\n"                                                          \
   38.22 +        ".section .fixup,\"ax\"\n"                                      \
   38.23 +        "3:     movl $1,%1\n"                                           \
   38.24 +        "       jmp 2b\n"                                               \
   38.25 +        ".previous\n"                                                   \
   38.26 +        ".section __ex_table,\"a\"\n"                                   \
   38.27 +        "       .align 4\n"                                             \
   38.28 +        "       .long 1b,3b\n"                                          \
   38.29 +        ".previous"                                                     \
   38.30 +        : "=a" (_o), "=r" (_rc)                                         \
   38.31 +        : "q" (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \
   38.32 +        : "memory");                                                    \
   38.33 +    _rc;                                                                \
   38.34 +})
   38.35 +
   38.36  /*
   38.37   * Force strict CPU ordering.
   38.38   * And yes, this is required on UP too when we're talking
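
cmpxchg_user() lets the hypervisor perform a compare-and-exchange on a guest-accessible longword while tolerating a fault on the access. A sketch of the calling pattern (the helper and its arguments are illustrative only):

    /* Sketch only: attempt to swap 'old' for 'new' at a guest-visible slot. */
    static int try_update_slot(unsigned long *slot, unsigned long old, unsigned long new)
    {
        unsigned long seen = old;

        if ( cmpxchg_user(slot, seen, new) )
            return -EFAULT;                  /* the access faulted */

        /* 'seen' now holds the value observed at *slot; the write of 'new'
         * happened only if it matched 'old'. */
        return (seen == old) ? 0 : -EAGAIN;
    }
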
    39.1 --- a/xen/include/hypervisor-ifs/dom0_ops.h	Sat Dec 20 11:49:50 2003 +0000
    39.2 +++ b/xen/include/hypervisor-ifs/dom0_ops.h	Sat Dec 20 12:44:39 2003 +0000
    39.3 @@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st
    39.4  {
    39.5      /* IN variables. */
    39.6      unsigned long pfn;          /* Machine page frame number to query.       */
    39.7 +    unsigned int domain;        /* To which domain does the frame belong?    */
    39.8      /* OUT variables. */
    39.9 -    unsigned int domain;        /* To which domain does the frame belong?    */
   39.10      enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type?       */
   39.11  } dom0_getpageframeinfo_t;
   39.12  
    40.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h	Sat Dec 20 11:49:50 2003 +0000
    40.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h	Sat Dec 20 12:44:39 2003 +0000
    40.3 @@ -125,9 +125,9 @@
    40.4   *  which shifts the least bits out.
    40.5   */
    40.6  /* A normal page-table update request. */
    40.7 -#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is VA.      */
    40.8 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.      */
    40.9  /* DOM0 can make entirely unchecked updates which do not affect refcnts. */
   40.10 -#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is VA.    */
   40.11 +#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is MA.    */
   40.12  /* Update an entry in the machine->physical mapping table. */
   40.13  #define MMU_MACHPHYS_UPDATE      2 /* ptr = MA of frame to modify entry for */
   40.14  /* An extended command. */
    41.1 --- a/xen/include/xeno/config.h	Sat Dec 20 11:49:50 2003 +0000
    41.2 +++ b/xen/include/xeno/config.h	Sat Dec 20 12:44:39 2003 +0000
    41.3 @@ -145,6 +145,13 @@
    41.4  
    41.5  #define capable(_c) 0
    41.6  
    41.7 +#ifndef NDEBUG
    41.8 +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
    41.9 +                           __FILE__, __LINE__, ## _a)
   41.10 +#else
   41.11 +#define DPRINTK(_f, _a...) ((void)0)
   41.12 +#endif
   41.13 +
   41.14  #ifndef __ASSEMBLY__
   41.15  
   41.16  #include <xeno/compiler.h>
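
This central DPRINTK replaces the per-file copies deleted from ll_rw_blk.c, xen_block.c and xen_vbd.c above, and additionally tags each message with its source location. Usage sketch (message and arguments are illustrative):

    /* Sketch only: expands to a printk() prefixed with file and line unless
     * NDEBUG is defined, in which case it compiles away to ((void)0). */
    DPRINTK("bad buffer: pfn=%08lx, size=%u\n", pfn, size);
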
    42.1 --- a/xen/include/xeno/mm.h	Sat Dec 20 11:49:50 2003 +0000
    42.2 +++ b/xen/include/xeno/mm.h	Sat Dec 20 12:44:39 2003 +0000
    42.3 @@ -3,34 +3,35 @@
    42.4  #define __XENO_MM_H__
    42.5  
    42.6  #include <xeno/config.h>
    42.7 +#include <xeno/list.h>
    42.8 +#include <xeno/spinlock.h>
    42.9 +#include <xeno/perfc.h>
   42.10 +#include <xeno/sched.h>
   42.11 +
   42.12 +#include <asm/pgalloc.h>
   42.13  #include <asm/atomic.h>
   42.14  #include <asm/desc.h>
   42.15 -#include <xeno/list.h>
   42.16 +#include <asm/flushtlb.h>
   42.17 +#include <asm/io.h>
   42.18 +
   42.19  #include <hypervisor-ifs/hypervisor-if.h>
   42.20 -#include <xeno/spinlock.h>
   42.21  
   42.22 -/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */
   42.23 +/*
   42.24 + * These are for compatibility with calls to the Linux memory allocators.
   42.25 + */
   42.26  
   42.27 -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */
   42.28  #define __GFP_DMA       0x01
   42.29 -
   42.30 -/* Action modifiers - doesn't change the zoning */
   42.31 +#define GFP_DMA         __GFP_DMA
   42.32  #define __GFP_WAIT      0x10    /* Can wait and reschedule? */
   42.33  #define __GFP_HIGH      0x20    /* Should access emergency pools? */
   42.34  #define __GFP_IO        0x40    /* Can start low memory physical IO? */
   42.35  #define __GFP_HIGHIO    0x80    /* Can start high mem physical IO? */
   42.36  #define __GFP_FS        0x100   /* Can call down to low-level FS? */
   42.37 -
   42.38  #define GFP_ATOMIC      (__GFP_HIGH)
   42.39 -#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
   42.40 +#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \
   42.41 +                         __GFP_HIGHIO | __GFP_FS)
   42.42  
   42.43 -/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
   42.44 -   platforms, used as appropriate on others */
   42.45 -
   42.46 -#define GFP_DMA         __GFP_DMA
   42.47 -
   42.48 -
   42.49 -/******************************************************************************
   42.50 +/*
   42.51   * The following is for page_alloc.c.
   42.52   */
   42.53  
   42.54 @@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int o
   42.55  #define free_page(_p) (__free_pages(_p,0))
   42.56  
   42.57  
   42.58 -/******************************************************************************
   42.59 - * The following is the array of page info. One entry per page owned
   42.60 - * by the hypervisor, indexed from `mem_map', just like Linux.
   42.61 - *
   42.62 - * 12.11.02. We no longer use struct page or mem_map, these are replaced
   42.63 - * with struct pfn_info and frame_table respectively. Boris Dragovic
   42.64 +/*
   42.65 + * Per-page-frame information.
   42.66   */
   42.67  
   42.68 -typedef struct pfn_info {
   42.69 -    struct list_head list;      /* ->mapping has some page lists. */
   42.70 -    unsigned long flags;        /* atomic flags. */
   42.71 -    unsigned long tot_count;    /* Total domain usage count. */
   42.72 -    unsigned long type_count;   /* pagetable/dir, or domain-writeable refs. */
   42.73 -} frame_table_t;
   42.74 -
   42.75 -#define get_page_tot(p)		 ((p)->tot_count++)
   42.76 -#define put_page_tot(p)		 \
   42.77 -    ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; })
   42.78 -#define page_tot_count(p)	 ((p)->tot_count)
   42.79 -#define set_page_tot_count(p,v)  ((p)->tot_count = v)
   42.80 -
   42.81 -#define get_page_type(p)	 ((p)->type_count++)
   42.82 -#define put_page_type(p)	 \
   42.83 -    ({ ASSERT((p)->type_count != 0); --(p)->type_count; })
   42.84 -#define page_type_count(p)	 ((p)->type_count)
   42.85 -#define set_page_type_count(p,v) ((p)->type_count = v)
   42.86 +struct pfn_info
   42.87 +{
   42.88 +    /* Each frame can be threaded onto a doubly-linked list. */
   42.89 +    struct list_head list;
   42.90 +    /* The following possible uses are context-dependent. */
   42.91 +    union {
   42.92 +        /* Page is in use and not a zombie: we keep a pointer to its owner. */
   42.93 +        struct task_struct *domain;
   42.94 +        /* Page is not currently allocated: mask of possibly-tainted TLBs. */
   42.95 +        unsigned long cpu_mask;
   42.96 +        /* Page is a zombie: this word currently has no use. */
   42.97 +        unsigned long _unused;
   42.98 +    } u;
   42.99 +    /* Reference count and various PGC_xxx flags and fields. */
  42.100 +    unsigned long       count_and_flags;
  42.101 +    /* Type reference count and various PGT_xxx flags and fields. */
  42.102 +    unsigned long       type_and_flags;
  42.103 +    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
  42.104 +    unsigned long       tlbflush_timestamp;
  42.105 +};
  42.106  
  42.107 -#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */
  42.108 -/* hypervisor flags (domain == 0) */
  42.109 -#define PG_slab	       24
  42.110 -/* domain flags (domain != 0) */
  42.111 -/*
  42.112 - * NB. The following page types are MUTUALLY EXCLUSIVE.
  42.113 - * At most one can be true at any point, and 'type_count' counts how many
  42.114 - * references exist of the current type. A change in type can only occur
  42.115 - * when type_count == 0.
  42.116 - */
  42.117 -#define PG_type_mask        (15<<24) /* bits 24-27 */
  42.118 -#define PGT_none            (0<<24) /* no special uses of this page */
  42.119 -#define PGT_l1_page_table   (1<<24) /* using this page as an L1 page table? */
  42.120 -#define PGT_l2_page_table   (2<<24) /* using this page as an L2 page table? */
  42.121 -#define PGT_l3_page_table   (3<<24) /* using this page as an L3 page table? */
  42.122 -#define PGT_l4_page_table   (4<<24) /* using this page as an L4 page table? */
  42.123 -#define PGT_gdt_page        (5<<24) /* using this page in a GDT? */
  42.124 -#define PGT_ldt_page        (6<<24) /* using this page in an LDT? */
  42.125 -#define PGT_writeable_page  (7<<24) /* has writable mappings of this page? */
  42.126 + /* The following page types are MUTUALLY EXCLUSIVE. */
  42.127 +#define PGT_none            (0<<29) /* no special uses of this page */
  42.128 +#define PGT_l1_page_table   (1<<29) /* using this page as an L1 page table? */
  42.129 +#define PGT_l2_page_table   (2<<29) /* using this page as an L2 page table? */
  42.130 +#define PGT_l3_page_table   (3<<29) /* using this page as an L3 page table? */
  42.131 +#define PGT_l4_page_table   (4<<29) /* using this page as an L4 page table? */
  42.132 +#define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
  42.133 +#define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
  42.134 +#define PGT_writeable_page  (7<<29) /* has writable mappings of this page? */
  42.135 +#define PGT_type_mask       (7<<29) /* Bits 29-31. */
  42.136 + /* Has this page been validated for use as its current type? */
  42.137 +#define _PGT_validated      28
  42.138 +#define PGT_validated       (1<<_PGT_validated)
  42.139 + /* 28-bit count of uses of this frame as its current type. */
  42.140 +#define PGT_count_mask      ((1<<28)-1)
  42.141  
  42.142 -/*
  42.143 - * This bit indicates that the TLB must be flushed when the type count of this
  42.144 - * frame drops to zero. This is needed on current x86 processors only for
  42.145 - * frames which have guestos-accessible writeable mappings. In this case we
  42.146 - * must prevent stale TLB entries allowing the frame to be written if it used
  42.147 - * for a page table, for example.
  42.148 - * 
  42.149 - * We have this bit because the writeable type is actually also used to pin a
  42.150 - * page when it is used as a disk read buffer. This doesn't require a TLB flush
  42.151 - * because the frame never has a mapping in the TLB.
  42.152 - */
  42.153 -#define PG_need_flush       (1<<28)
  42.154 + /* The owner of this page is dead: 'u.domain' is no longer valid. */
  42.155 +#define _PGC_zombie                   31
  42.156 +#define PGC_zombie                    (1<<_PGC_zombie)
  42.157 + /* For safety, force a TLB flush when this page's type changes. */
  42.158 +#define _PGC_tlb_flush_on_type_change 30
  42.159 +#define PGC_tlb_flush_on_type_change  (1<<_PGC_tlb_flush_on_type_change)
  42.160 + /* Owning guest has pinned this page to its current type? */
  42.161 +#define _PGC_guest_pinned             29
  42.162 +#define PGC_guest_pinned              (1<<_PGC_guest_pinned)
  42.163 + /* Cleared when the owning guest 'frees' this page. */
  42.164 +#define _PGC_allocated                28
  42.165 +#define PGC_allocated                 (1<<_PGC_allocated)
  42.166 + /* 28-bit count of references to this frame. */
  42.167 +#define PGC_count_mask                ((1<<28)-1)
  42.168  
  42.169 -/*
  42.170 - * This bit indicates that the guest OS has pinned the page to its current
  42.171 - * type. For page tables this can avoid the frame scanning and reference-count
  42.172 - * updates that occur when the type count falls to zero.
  42.173 - */
  42.174 -#define PG_guest_pinned     (1<<29)
  42.175 +/* We trust the slab allocator in slab.c, and our use of it. */
  42.176 +#define PageSlab(page)		(1)
  42.177 +#define PageSetSlab(page)	((void)0)
  42.178 +#define PageClearSlab(page)	((void)0)
  42.179 +
  42.180 +#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS)
  42.181  
  42.182 -#define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
  42.183 -#define PageSetSlab(page)	set_bit(PG_slab, &(page)->flags)
  42.184 -#define PageClearSlab(page)	clear_bit(PG_slab, &(page)->flags)
  42.185 -
  42.186 -#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                            \
  42.187 -    do {                                                             \
  42.188 -        (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \
  42.189 -        set_page_tot_count((_pfn), 2);                               \
  42.190 -        set_page_type_count((_pfn), 2);                              \
  42.191 +#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                  \
  42.192 +    do {                                                                   \
  42.193 +        (_pfn)->u.domain = (_dom);                                         \
  42.194 +        wmb(); /* install valid domain ptr before updating refcnt. */      \
  42.195 +        (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \
  42.196 +        (_pfn)->type_and_flags  = PGT_writeable_page | PGT_validated | 1;  \
  42.197      } while ( 0 )
  42.198  
  42.199 -#define UNSHARE_PFN(_pfn)                                            \
  42.200 -    do {                                                             \
  42.201 -        (_pfn)->flags = 0;                                           \
  42.202 -        set_page_tot_count((_pfn), 0);                               \
  42.203 -        set_page_type_count((_pfn), 0);                              \
  42.204 -    } while ( 0 )
  42.205 +#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn)
  42.206  
  42.207 -/* The array of struct pfn_info,  
  42.208 - * free pfn list and number of free pfns in the free list
  42.209 - */
  42.210 -extern frame_table_t * frame_table;
  42.211 +extern struct pfn_info *frame_table;
  42.212  extern unsigned long frame_table_size;
  42.213  extern struct list_head free_list;
  42.214  extern spinlock_t free_list_lock;
  42.215 @@ -140,6 +126,180 @@ extern unsigned int free_pfns;
  42.216  extern unsigned long max_page;
  42.217  void init_frametable(unsigned long nr_pages);
  42.218  
  42.219 +struct pfn_info *alloc_domain_page(struct task_struct *p);
  42.220 +void free_domain_page(struct pfn_info *page);
  42.221 +
  42.222 +int alloc_page_type(struct pfn_info *page, unsigned int type);
  42.223 +void free_page_type(struct pfn_info *page, unsigned int type);
  42.224 +
  42.225 +static inline void put_page(struct pfn_info *page)
  42.226 +{
  42.227 +    unsigned long nx, x, y = page->count_and_flags;
  42.228 +
  42.229 +    do {
  42.230 +        x  = y;
  42.231 +        nx = x - 1;
  42.232 +    }
  42.233 +    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
  42.234 +
  42.235 +    if ( unlikely((nx & PGC_count_mask) == 0) )
  42.236 +        free_domain_page(page);
  42.237 +}
  42.238 +
  42.239 +
  42.240 +static inline int get_page(struct pfn_info *page,
  42.241 +                           struct task_struct *domain)
  42.242 +{
  42.243 +    unsigned long x, nx, y = page->count_and_flags;
  42.244 +    struct task_struct *p, *np = page->u.domain;
  42.245 +
  42.246 +    do {
  42.247 +        x  = y;
  42.248 +        nx = x + 1;
  42.249 +        p  = np;
  42.250 +        if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
  42.251 +             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
  42.252 +             unlikely(x & PGC_zombie) ||             /* Zombie? */
  42.253 +             unlikely(p != domain) )                 /* Wrong owner? */
  42.254 +        {
  42.255 +            DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n",
  42.256 +                    page_to_pfn(page), domain, p, x);
  42.257 +            return 0;
  42.258 +        }
  42.259 +        __asm__ __volatile__(
  42.260 +            LOCK_PREFIX "cmpxchg8b %3"
  42.261 +            : "=a" (np), "=d" (y), "=b" (p),
  42.262 +              "=m" (*(volatile unsigned long long *)(&page->u.domain))
  42.263 +            : "0" (p), "1" (x), "b" (p), "c" (nx) );
  42.264 +    }
  42.265 +    while ( unlikely(np != p) || unlikely(y != x) );
  42.266 +
  42.267 +    return 1;
  42.268 +}
  42.269 +
  42.270 +
  42.271 +static inline void put_page_type(struct pfn_info *page)
  42.272 +{
  42.273 +    unsigned long nx, x, y = page->type_and_flags;
  42.274 +
  42.275 + again:
  42.276 +    do {
  42.277 +        x  = y;
  42.278 +        nx = x - 1;
  42.279 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  42.280 +        {
  42.281 +            page->tlbflush_timestamp = tlbflush_clock;
  42.282 +            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  42.283 +                 likely(nx & PGT_validated) )
  42.284 +            {
  42.285 +                /*
  42.286 +                 * Page-table pages must be unvalidated when count is zero. The
  42.287 +                 * 'free' is safe because the refcnt is non-zero and the
  42.288 +                 * validated bit is clear => other ops will spin or fail.
  42.289 +                 */
  42.290 +                if ( unlikely((y = cmpxchg(&page->type_and_flags, x, 
  42.291 +                                           x & ~PGT_validated)) != x) )
  42.292 +                    goto again;
  42.293 +                /* We cleared the 'valid bit' so we must do the clean up. */
  42.294 +                free_page_type(page, x & PGT_type_mask);
  42.295 +                /* Carry on as we were, but with the 'valid bit' now clear. */
  42.296 +                x  &= ~PGT_validated;
  42.297 +                nx &= ~PGT_validated;
  42.298 +            }
  42.299 +        }
  42.300 +    }
  42.301 +    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
  42.302 +}
  42.303 +
  42.304 +
  42.305 +static inline int get_page_type(struct pfn_info *page, unsigned long type)
  42.306 +{
  42.307 +    unsigned long nx, x, y = page->type_and_flags;
  42.308 + again:
  42.309 +    do {
  42.310 +        x  = y;
  42.311 +        nx = x + 1;
  42.312 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  42.313 +        {
  42.314 +            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  42.315 +            return 0;
  42.316 +        }
  42.317 +        else if ( unlikely((x & PGT_count_mask) == 0) )
  42.318 +        {
  42.319 +            if ( (x & PGT_type_mask) != type )
  42.320 +            {
  42.321 +                nx &= ~(PGT_type_mask | PGT_validated);
  42.322 +                nx |= type;
  42.323 +                /* No extra validation needed for writeable pages. */
  42.324 +                if ( type == PGT_writeable_page )
  42.325 +                    nx |= PGT_validated;
  42.326 +            }
  42.327 +        }
  42.328 +        else if ( unlikely((x & PGT_type_mask) != type) )
  42.329 +        {
  42.330 +            DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n",
  42.331 +                    x & PGT_type_mask, type, page_to_pfn(page));
  42.332 +            return 0;
  42.333 +        }
  42.334 +        else if ( unlikely(!(x & PGT_validated)) )
  42.335 +        {
  42.336 +            /* Someone else is updating validation of this page. Wait... */
  42.337 +            while ( (y = page->type_and_flags) != x )
  42.338 +            {
  42.339 +                rep_nop();
  42.340 +                barrier();
  42.341 +            }
  42.342 +            goto again;
  42.343 +        }
  42.344 +    }
  42.345 +    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
  42.346 +
  42.347 +    if ( unlikely(!(nx & PGT_validated)) )
  42.348 +    {
  42.349 +        /* Try to validate page type; drop the new reference on failure. */
  42.350 +        if ( unlikely(!alloc_page_type(page, type)) )
  42.351 +        {
  42.352 +            DPRINTK("Error while validating pfn %08lx for type %08lx\n",
  42.353 +                    page_to_pfn(page), type);
  42.354 +            put_page_type(page);
  42.355 +            return 0;
  42.356 +        }
  42.357 +        set_bit(_PGT_validated, &page->type_and_flags);
  42.358 +    }
  42.359 +
  42.360 +    return 1;
  42.361 +}
  42.362 +
  42.363 +
  42.364 +static inline void put_page_and_type(struct pfn_info *page)
  42.365 +{
  42.366 +    put_page_type(page);
  42.367 +    put_page(page);
  42.368 +}
  42.369 +
  42.370 +
  42.371 +static inline int get_page_and_type(struct pfn_info *page,
  42.372 +                                    struct task_struct *domain,
  42.373 +                                    unsigned int type)
  42.374 +{
  42.375 +    int rc = get_page(page, domain);
  42.376 +
  42.377 +    if ( likely(rc) && unlikely(!get_page_type(page, type)) )
  42.378 +    {
  42.379 +        put_page(page);
  42.380 +        rc = 0;
  42.381 +    }
  42.382 +
  42.383 +    return rc;
  42.384 +}
  42.385 +
  42.386 +#define ASSERT_PAGE_IS_TYPE(_p, _t)                \
  42.387 +    ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t));  \
  42.388 +    ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0)
  42.389 +#define ASSERT_PAGE_IS_DOMAIN(_p, _d)              \
  42.390 +    ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0);  \
  42.391 +    ASSERT((_p)->u.domain == (_d))
  42.392 +
  42.393  int check_descriptor(unsigned long a, unsigned long b);
  42.394  
  42.395  /*
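
The hunk above replaces the old flags/count scheme with paired general and type reference counts. A minimal sketch of the intended calling pattern, using only the helpers declared in this header; the wrapper function itself is illustrative and not part of the changeset:

    /* Illustrative only: pin frame 'pfn', owned by domain 'p', as an L1 page table. */
    static int pin_as_l1_table(struct task_struct *p, unsigned long pfn)
    {
        struct pfn_info *page = &frame_table[pfn];

        /* Takes one general reference and one type reference; the first user of a
           type triggers alloc_page_type() validation inside get_page_type(). */
        if ( !get_page_and_type(page, p, PGT_l1_page_table) )
            return 0; /* wrong owner, conflicting type, or validation failed */

        /* ... the frame may now safely be used as a page table ... */

        /* Dropping the last type reference unvalidates the page (free_page_type);
           dropping the last general reference frees it via free_domain_page(). */
        put_page_and_type(page);
        return 1;
    }
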
    43.1 --- a/xen/include/xeno/perfc.h	Sat Dec 20 11:49:50 2003 +0000
    43.2 +++ b/xen/include/xeno/perfc.h	Sat Dec 20 12:44:39 2003 +0000
    43.3 @@ -1,6 +1,6 @@
    43.4 -/*
    43.5 - * xen performance counters
    43.6 - */
    43.7 +
    43.8 +#ifndef __XENO_PERFC_H__
    43.9 +#define __XENO_PERFC_H__
   43.10  
   43.11  #include <asm/atomic.h>
   43.12  
   43.13 @@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters
   43.14  #define perfc_addc(x,y)   atomic_add((y), &perfcounters.x[smp_processor_id()])
   43.15  #define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y])
   43.16  
   43.17 +#endif /* __XENO_PERFC_H__ */
    44.1 --- a/xen/include/xeno/perfc_defn.h	Sat Dec 20 11:49:50 2003 +0000
    44.2 +++ b/xen/include/xeno/perfc_defn.h	Sat Dec 20 12:44:39 2003 +0000
    44.3 @@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hy
    44.4  PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" )
    44.5  PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" )
    44.6  PERFCOUNTER( net_rx_delivered, "net rx delivered" )
    44.7 -PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" )
    44.8  PERFCOUNTER( net_tx_transmitted, "net tx transmitted" )
    44.9  
   44.10  PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" )
    45.1 --- a/xen/include/xeno/sched.h	Sat Dec 20 11:49:50 2003 +0000
    45.2 +++ b/xen/include/xeno/sched.h	Sat Dec 20 12:44:39 2003 +0000
    45.3 @@ -4,7 +4,6 @@
    45.4  #include <xeno/config.h>
    45.5  #include <xeno/types.h>
    45.6  #include <xeno/spinlock.h>
    45.7 -#include <asm/page.h>
    45.8  #include <asm/ptrace.h>
    45.9  #include <xeno/smp.h>
   45.10  #include <asm/processor.h>
   45.11 @@ -16,7 +15,6 @@
   45.12  #include <xeno/time.h>
   45.13  #include <xeno/ac_timer.h>
   45.14  #include <xeno/delay.h>
   45.15 -#include <xeno/slab.h>
   45.16  
   45.17  #define MAX_DOMAIN_NAME 16
   45.18  
   45.19 @@ -94,9 +92,10 @@ struct task_struct
   45.20      
   45.21      unsigned int domain;        /* domain id */
   45.22  
   45.23 -    struct list_head pg_head;
   45.24 -    unsigned int tot_pages;     /* number of pages currently possesed */
   45.25 -    unsigned int max_pages;     /* max number of pages that can be possesed */
   45.26 +    spinlock_t       page_list_lock;
   45.27 +    struct list_head page_list;
   45.28 +    unsigned int     tot_pages; /* number of pages currently possessed */
   45.29 +    unsigned int     max_pages; /* max number of pages that can be possessed */
   45.30  
   45.31      /* scheduling */
   45.32      struct list_head run_list;
   45.33 @@ -132,8 +131,6 @@ struct task_struct
   45.34  
   45.35      /* VM */
   45.36      struct mm_struct mm;
   45.37 -    /* We need this lock to check page types and frob reference counts. */
   45.38 -    spinlock_t page_lock;
   45.39  
   45.40      mm_segment_t addr_limit;
   45.41  
   45.42 @@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_
   45.43  
   45.44  #define STACK_SIZE PAGE_SIZE
   45.45  
   45.46 +#include <xeno/slab.h>
   45.47 +
   45.48  extern kmem_cache_t *task_struct_cachep;
   45.49  #define alloc_task_struct()  \
   45.50    ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
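
The per-domain 'page_lock' is gone; frames owned by a domain now hang off 'page_list' under the new 'page_list_lock', and per-frame state lives in the count_and_flags/type_and_flags words. A minimal sketch of the attach pattern, mirroring what xen/net/dev.c does below; the helper name is illustrative:

    /* Illustrative only: hand an anonymous frame to domain 'p' as an allocated,
       twice-referenced page, as the receive path does. */
    static void attach_page_to_domain(struct task_struct *p, struct pfn_info *page)
    {
        page->u.domain = p;
        wmb(); /* owner pointer must be visible before the non-zero refcnt */
        spin_lock(&p->page_list_lock);
        list_add(&page->list, &p->page_list);
        page->count_and_flags = PGC_allocated | 2;
        spin_unlock(&p->page_list_lock);
    }
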
    46.1 --- a/xen/include/xeno/vif.h	Sat Dec 20 11:49:50 2003 +0000
    46.2 +++ b/xen/include/xeno/vif.h	Sat Dec 20 12:44:39 2003 +0000
    46.3 @@ -34,7 +34,7 @@ extern struct net_device *the_dev;
    46.4  typedef struct rx_shadow_entry_st 
    46.5  {
    46.6      unsigned short id;
    46.7 -    unsigned short flush_count; /* 16 bits should be enough */
    46.8 +    unsigned short _pad;
    46.9      unsigned long  pte_ptr;
   46.10      unsigned long  buf_pfn;
   46.11  } rx_shadow_entry_t;
    47.1 --- a/xen/net/dev.c	Sat Dec 20 11:49:50 2003 +0000
    47.2 +++ b/xen/net/dev.c	Sat Dec 20 12:44:39 2003 +0000
    47.3 @@ -39,12 +39,6 @@
    47.4  #define rtnl_lock() ((void)0)
    47.5  #define rtnl_unlock() ((void)0)
    47.6  
    47.7 -#if 0
    47.8 -#define DPRINTK(_f, _a...) printk(_f , ## _a)
    47.9 -#else 
   47.10 -#define DPRINTK(_f, _a...) ((void)0)
   47.11 -#endif
   47.12 -
   47.13  #define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
   47.14  #define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
   47.15  #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
   47.16 @@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[N
   47.17  
   47.18  static int get_tx_bufs(net_vif_t *vif);
   47.19  
   47.20 -static void __make_tx_response(net_vif_t *vif, 
   47.21 -                               unsigned short id, 
   47.22 -                               unsigned char  st);
   47.23 +static void make_tx_response(net_vif_t     *vif, 
   47.24 +                             unsigned short id, 
   47.25 +                             unsigned char  st);
   47.26  static void make_rx_response(net_vif_t     *vif, 
   47.27                               unsigned short id, 
   47.28                               unsigned short size,
   47.29 @@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_
   47.30  void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
   47.31  {
   47.32      rx_shadow_entry_t *rx;
   47.33 -    unsigned long *ptep; 
   47.34 +    unsigned long *ptep, pte; 
   47.35      struct pfn_info *old_page, *new_page, *pte_page;
   47.36      unsigned int i; 
   47.37      unsigned short size;
   47.38      unsigned char  offset, status = RING_STATUS_OK;
   47.39 +    struct task_struct *p = vif->domain;
   47.40  
   47.41      memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
   47.42      if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
   47.43          memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
   47.44  
   47.45 -    /*
   47.46 -     * Slightly gross: we need the page_lock so that we can do PTE checking.
   47.47 -     * However, we take it slightly early so that it can protect the update
   47.48 -     * of rx_cons. This saves us from grabbing two locks.
   47.49 -     */
   47.50 -    spin_lock(&vif->domain->page_lock);
   47.51 +    spin_lock(&vif->rx_lock);
   47.52  
   47.53      if ( (i = vif->rx_cons) == vif->rx_prod )
   47.54      {
   47.55 -        spin_unlock(&vif->domain->page_lock);
   47.56 +        spin_unlock(&vif->rx_lock);
   47.57          perfc_incr(net_rx_capacity_drop);
   47.58          return;
   47.59      }
   47.60 -    rx = vif->rx_shadow_ring + i;
   47.61 +    rx = &vif->rx_shadow_ring[i];
   47.62      vif->rx_cons = RX_RING_INC(i);
   47.63  
   47.64      size   = (unsigned short)skb->len;
   47.65      offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
   47.66  
   47.67 -    /* Release the page-table page. */
   47.68 -    pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
   47.69 -    put_page_type(pte_page);
   47.70 -    put_page_tot(pte_page);
   47.71 -
   47.72 -    old_page = frame_table + rx->buf_pfn;
   47.73 +    pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT];
   47.74 +    old_page = &frame_table[rx->buf_pfn];
   47.75      new_page = skb->pf;
   47.76      
   47.77      ptep = map_domain_mem(rx->pte_ptr);
   47.78  
   47.79 -    if ( (*ptep & _PAGE_PRESENT) )
   47.80 +    new_page->u.domain = p;
   47.81 +    wmb(); /* make dom ptr visible before updating refcnt. */
   47.82 +    spin_lock(&p->page_list_lock);
   47.83 +    list_add(&new_page->list, &p->page_list);
   47.84 +    new_page->count_and_flags = PGC_allocated | 2;
   47.85 +    spin_unlock(&p->page_list_lock);
   47.86 +    get_page_type(new_page, PGT_writeable_page);
   47.87 +    set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags);
   47.88 +    wmb(); /* Get type count and set flush bit before updating PTE. */
   47.89 +
   47.90 +    pte = *ptep;
   47.91 +    if ( unlikely(pte & _PAGE_PRESENT) || 
   47.92 +         unlikely(cmpxchg(ptep, pte, 
   47.93 +                          (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
   47.94 +                          ((new_page - frame_table) << PAGE_SHIFT)) != pte) )
   47.95      {
   47.96 -        /* Bail out if the PTE has been reused under our feet. */
   47.97 -        list_add(&old_page->list, &vif->domain->pg_head);
   47.98 -        old_page->flags = vif->domain->domain;
   47.99          unmap_domain_mem(ptep);
  47.100 -        spin_unlock(&vif->domain->page_lock);
  47.101          status = RING_STATUS_BAD_PAGE;
  47.102          goto out;
  47.103      }
  47.104  
  47.105 -    /* Give the new page to the domain, marking it writeable. */
  47.106 -    set_page_type_count(new_page, 1);
  47.107 -    set_page_tot_count(new_page, 1);
  47.108 -    new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
  47.109 -    list_add(&new_page->list, &vif->domain->pg_head);
  47.110 -    
  47.111 -    /* Patch the PTE to map the new page as writeable. */
  47.112      machine_to_phys_mapping[new_page - frame_table] 
  47.113 -        = machine_to_phys_mapping[old_page - frame_table];        
  47.114 -    *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
  47.115 -        (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
  47.116 +        = machine_to_phys_mapping[old_page - frame_table];
  47.117      
  47.118      unmap_domain_mem(ptep);
  47.119  
  47.120 -    spin_unlock(&vif->domain->page_lock);
  47.121 -    
  47.122      /* Our skbuff now points at the guest's old frame. */
  47.123      skb->pf = old_page;
  47.124  
  47.125      /* Updates must happen before releasing the descriptor. */
  47.126      smp_wmb();
  47.127  
  47.128 -    /*
  47.129 -     * NB. The remote flush here should be safe, as we hold no locks. The 
  47.130 -     * network driver that called us should also have no nasty locks.
  47.131 -     */
  47.132 -    if ( rx->flush_count == (unsigned short)
  47.133 -         atomic_read(&tlb_flush_count[vif->domain->processor]) )
  47.134 -    {
  47.135 -        perfc_incr(net_rx_tlbflush);
  47.136 -        flush_tlb_cpu(vif->domain->processor);
  47.137 -    }
  47.138 -
  47.139      perfc_incr(net_rx_delivered);
  47.140  
  47.141      /* record this so they can be billed */
  47.142 @@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb,
  47.143      vif->total_bytes_received += size;
  47.144  
  47.145   out:
  47.146 +    put_page_and_type(pte_page);
  47.147      make_rx_response(vif, rx->id, size, status, offset);
  47.148 +    spin_unlock(&vif->rx_lock);
  47.149  }
  47.150  
  47.151  /**
  47.152 @@ -785,8 +761,8 @@ static void net_tx_action(unsigned long 
  47.153          skb->mac.raw  = skb->data; 
  47.154          skb->guest_id = tx->id;
  47.155          
  47.156 -        skb_shinfo(skb)->frags[0].page        = frame_table +
  47.157 -            (tx->payload >> PAGE_SHIFT);
  47.158 +        skb_shinfo(skb)->frags[0].page        = 
  47.159 +            &frame_table[tx->payload >> PAGE_SHIFT];
  47.160          skb_shinfo(skb)->frags[0].size        = tx->size - PKT_PROT_LEN;
  47.161          skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
  47.162          skb_shinfo(skb)->nr_frags = 1;
  47.163 @@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buf
  47.164  
  47.165      vif = skb->src_vif;
  47.166      
  47.167 -    spin_lock(&vif->domain->page_lock);
  47.168      for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
  47.169 -        put_page_tot(skb_shinfo(skb)->frags[i].page);
  47.170 -    spin_unlock(&vif->domain->page_lock);
  47.171 +        put_page(skb_shinfo(skb)->frags[i].page);
  47.172      
  47.173      if ( skb->skb_type == SKB_NODATA )
  47.174          kmem_cache_free(net_header_cachep, skb->head);
  47.175 @@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buf
  47.176      skb_shinfo(skb)->nr_frags = 0; 
  47.177      
  47.178      spin_lock(&vif->tx_lock);
  47.179 -    __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
  47.180 +    make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
  47.181      spin_unlock(&vif->tx_lock);
  47.182      
  47.183      /*
  47.184 @@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif)
  47.185          if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
  47.186          {
  47.187              DPRINTK("Bad packet size: %d\n", tx.size);
  47.188 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.189 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.190              continue; 
  47.191          }
  47.192  
  47.193 @@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif)
  47.194          vif->remaining_credit -= tx.size;
  47.195  
  47.196          /* No crossing a page boundary as the payload mustn't fragment. */
  47.197 -        if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 
  47.198 +        if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) ) 
  47.199          {
  47.200              DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 
  47.201                      tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
  47.202 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.203 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.204              continue;
  47.205          }
  47.206  
  47.207          buf_pfn  = tx.addr >> PAGE_SHIFT;
  47.208          buf_page = frame_table + buf_pfn;
  47.209 -        spin_lock(&p->page_lock);
  47.210 -        if ( (buf_pfn >= max_page) || 
  47.211 -             ((buf_page->flags & PG_domain_mask) != p->domain) ) 
  47.212 +        if ( unlikely(buf_pfn >= max_page) || 
  47.213 +             unlikely(!get_page(buf_page, p)) )
  47.214          {
  47.215              DPRINTK("Bad page frame\n");
  47.216 -            spin_unlock(&p->page_lock);
  47.217 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.218 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.219              continue;
  47.220          }
  47.221              
  47.222 @@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif)
  47.223              init_tx_header(vif, g_data, tx.size, the_dev));
  47.224          if ( protocol == 0 )
  47.225          {
  47.226 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.227 -            goto tx_unmap_and_continue;
  47.228 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.229 +            goto cleanup_and_continue;
  47.230          }
  47.231  
  47.232          target = net_get_target_vif(g_data, tx.size, vif);
  47.233 @@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif)
  47.234              /* Local delivery */
  47.235              if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
  47.236              {
  47.237 -                __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.238 +                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  47.239                  put_vif(target);
  47.240 -                goto tx_unmap_and_continue;
  47.241 +                goto cleanup_and_continue;
  47.242              }
  47.243  
  47.244              skb->src_vif = vif;
  47.245 @@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif)
  47.246              if ( netif_rx(skb) == NET_RX_DROP )
  47.247                  kfree_skb(skb);
  47.248  
  47.249 -            __make_tx_response(vif, tx.id, RING_STATUS_OK);
  47.250 +            make_tx_response(vif, tx.id, RING_STATUS_OK);
  47.251          }
  47.252          else if ( (target == VIF_PHYS) || IS_PRIV(p) )
  47.253          {
  47.254 @@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif)
  47.255                  kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
  47.256              if ( vif->tx_shadow_ring[j].header == NULL )
  47.257              { 
  47.258 -                __make_tx_response(vif, tx.id, RING_STATUS_OK);
  47.259 -                goto tx_unmap_and_continue;
  47.260 +                make_tx_response(vif, tx.id, RING_STATUS_OK);
  47.261 +                goto cleanup_and_continue;
  47.262              }
  47.263  
  47.264              memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
  47.265              vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
  47.266 -            get_page_tot(buf_page);
  47.267 +            buf_page = NULL; /* hand off our page reference */
  47.268              j = TX_RING_INC(j);
  47.269          }
  47.270          else
  47.271          {
  47.272 -            __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
  47.273 +            make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
  47.274          }
  47.275  
  47.276 -    tx_unmap_and_continue:
  47.277 +    cleanup_and_continue:
  47.278 +        if ( buf_page != NULL )
  47.279 +            put_page(buf_page);
  47.280          unmap_domain_mem(g_data);
  47.281 -        spin_unlock(&p->page_lock);
  47.282      }
  47.283  
  47.284      /*
  47.285 @@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif)
  47.286  }
  47.287  
  47.288  
  47.289 -static long get_bufs_from_vif(net_vif_t *vif)
  47.290 +static void get_rx_bufs(net_vif_t *vif)
  47.291  {
  47.292 -    net_ring_t *shared_rings;
  47.293 -    net_idx_t *shared_idxs;
  47.294 +    struct task_struct *p = vif->domain;
  47.295 +    net_ring_t *shared_rings = vif->shared_rings;
  47.296 +    net_idx_t *shared_idxs = vif->shared_idxs;
  47.297      unsigned int i, j;
  47.298      rx_req_entry_t rx;
  47.299      unsigned long  pte_pfn, buf_pfn;
  47.300      struct pfn_info *pte_page, *buf_page;
  47.301 -    struct task_struct *p = vif->domain;
  47.302 -    unsigned long *ptep;    
  47.303 -
  47.304 -    shared_idxs  = vif->shared_idxs;
  47.305 -    shared_rings = vif->shared_rings;
  47.306 -        
  47.307 -    /*
  47.308 -     * PHASE 1 -- TRANSMIT RING
  47.309 -     */
  47.310 +    unsigned long *ptep, pte;
  47.311  
  47.312 -    if ( get_tx_bufs(vif) )
  47.313 -    {
  47.314 -        add_to_net_schedule_list_tail(vif);
  47.315 -        maybe_schedule_tx_action();
  47.316 -    }
  47.317 -
  47.318 -    /*
  47.319 -     * PHASE 2 -- RECEIVE RING
  47.320 -     */
  47.321 +    spin_lock(&vif->rx_lock);
  47.322  
  47.323      /*
  47.324       * Collect up new receive buffers. We collect up to the guest OS's new
  47.325 @@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t 
  47.326      {
  47.327          rx = shared_rings->rx_ring[i].req;
  47.328  
  47.329 -        pte_pfn = rx.addr >> PAGE_SHIFT;
  47.330 -        pte_page = frame_table + pte_pfn;
  47.331 +        pte_pfn  = rx.addr >> PAGE_SHIFT;
  47.332 +        pte_page = &frame_table[pte_pfn];
  47.333              
  47.334 -        spin_lock(&p->page_lock);
  47.335 -        if ( (pte_pfn >= max_page) || 
  47.336 -             ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != 
  47.337 -              (PGT_l1_page_table | p->domain)) ) 
  47.338 +        /* The address passed down must be to a valid PTE. */
  47.339 +        if ( unlikely(pte_pfn >= max_page) ||
  47.340 +             unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) )
  47.341          {
  47.342              DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
  47.343 -                    p->domain, pte_pfn, max_page, pte_page->flags);
  47.344 -            spin_unlock(&p->page_lock);
  47.345 +                    p->domain, pte_pfn, max_page, pte_page->type_and_flags);
  47.346              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  47.347              continue;
  47.348          }
  47.349 -            
  47.350 +        
  47.351          ptep = map_domain_mem(rx.addr);
  47.352 -            
  47.353 -        if ( !(*ptep & _PAGE_PRESENT) )
  47.354 +        pte  = *ptep;
  47.355 +        
  47.356 +        /* We must be passed a valid writeable mapping to swizzle. */
  47.357 +        if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != 
  47.358 +                      (_PAGE_PRESENT|_PAGE_RW)) ||
  47.359 +             unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) )
  47.360          {
  47.361 -            DPRINTK("Invalid PTE passed down (not present)\n");
  47.362 +            DPRINTK("Invalid PTE passed down (not present or changing)\n");
  47.363 +            put_page_and_type(pte_page);
  47.364 +            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  47.365 +            goto rx_unmap_and_continue;
  47.366 +        }
  47.367 +        
  47.368 +        buf_pfn  = pte >> PAGE_SHIFT;
  47.369 +        buf_page = &frame_table[buf_pfn];
  47.370 +
  47.371 +        /*
  47.372 +         * The page must belong to the correct domain, and must be mapped
  47.373 +         * just once as a writeable page.
  47.374 +         */
  47.375 +        if ( unlikely(buf_page->u.domain != p) ||
  47.376 +             unlikely(!test_and_clear_bit(_PGC_allocated, 
  47.377 +                                          &buf_page->count_and_flags)) ||
  47.378 +             unlikely(cmpxchg(&buf_page->type_and_flags, 
  47.379 +                              PGT_writeable_page|PGT_validated|1,
  47.380 +                              0) != (PGT_writeable_page|PGT_validated|1)) )
  47.381 +        {
  47.382 +            DPRINTK("Bad domain or page mapped writeable more than once.\n");
  47.383 +            if ( buf_page->u.domain == p )
  47.384 +                set_bit(_PGC_allocated, &buf_page->count_and_flags);
  47.385 +            if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
  47.386 +                          (pte & ~_PAGE_PRESENT)) )
  47.387 +                put_page_and_type(buf_page);
  47.388 +            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  47.389 +            goto rx_unmap_and_continue;
  47.390 +        }
  47.391 +
  47.392 +        /*
  47.393 +         * Now ensure that we can take the last references to this page.
  47.394 +         * The final count should be 2, because of PGC_allocated.
  47.395 +         */
  47.396 +        if ( unlikely(cmpxchg(&buf_page->count_and_flags, 
  47.397 +                              PGC_tlb_flush_on_type_change | 2, 0) != 
  47.398 +                      (PGC_tlb_flush_on_type_change | 2)) )
  47.399 +        {
  47.400 +            DPRINTK("Page held more than once\n");
  47.401 +            /* Leave the page unmapped at 'ptep'. Stoopid domain! */
  47.402              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  47.403              goto rx_unmap_and_continue;
  47.404          }
  47.405              
  47.406 -        buf_pfn  = *ptep >> PAGE_SHIFT;
  47.407 -        buf_page = frame_table + buf_pfn;
  47.408 +        /* Remove from the domain's allocation list. */
  47.409 +        spin_lock(&p->page_list_lock);
  47.410 +        list_del(&buf_page->list);
  47.411 +        spin_unlock(&p->page_list_lock);
  47.412  
  47.413 -        if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
  47.414 -              (PGT_writeable_page | p->domain)) || 
  47.415 -             (page_tot_count(buf_page) != 1) )
  47.416 -        {
  47.417 -            DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
  47.418 -                    page_type_count(buf_page), page_tot_count(buf_page), 
  47.419 -                    buf_page->flags);
  47.420 -            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  47.421 -            goto rx_unmap_and_continue;
  47.422 -        }
  47.423 -            
  47.424 -        /*
  47.425 -         * The pte they passed was good, so take it away from them. We also
  47.426 -         * lock down the page-table page, so it doesn't go away.
  47.427 -         */
  47.428 -        get_page_type(pte_page);
  47.429 -        get_page_tot(pte_page);
  47.430 -        *ptep &= ~_PAGE_PRESENT;
  47.431 -        buf_page->flags = 0;
  47.432 -        set_page_type_count(buf_page, 0);
  47.433 -        set_page_tot_count(buf_page, 0);
  47.434 -        list_del(&buf_page->list);
  47.435 -
  47.436 -        vif->rx_shadow_ring[j].id          = rx.id;
  47.437 -        vif->rx_shadow_ring[j].pte_ptr     = rx.addr;
  47.438 -        vif->rx_shadow_ring[j].buf_pfn     = buf_pfn;
  47.439 -        vif->rx_shadow_ring[j].flush_count = (unsigned short) 
  47.440 -            atomic_read(&tlb_flush_count[smp_processor_id()]);
  47.441 +        vif->rx_shadow_ring[j].id      = rx.id;
  47.442 +        vif->rx_shadow_ring[j].pte_ptr = rx.addr;
  47.443 +        vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
  47.444          j = RX_RING_INC(j);
  47.445              
  47.446      rx_unmap_and_continue:
  47.447          unmap_domain_mem(ptep);
  47.448 -        spin_unlock(&p->page_lock);
  47.449      }
  47.450  
  47.451      vif->rx_req_cons = i;
  47.452 @@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t 
  47.453          vif->rx_prod = j;
  47.454      }
  47.455  
  47.456 +    spin_unlock(&vif->rx_lock);
  47.457 +}
  47.458 +
  47.459 +
  47.460 +static long get_bufs_from_vif(net_vif_t *vif)
  47.461 +{
  47.462 +    if ( get_tx_bufs(vif) )
  47.463 +    {
  47.464 +        add_to_net_schedule_list_tail(vif);
  47.465 +        maybe_schedule_tx_action();
  47.466 +    }
  47.467 +
  47.468 +    get_rx_bufs(vif);
  47.469 +
  47.470      return 0;
  47.471  }
  47.472  
  47.473 @@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t 
  47.474  long flush_bufs_for_vif(net_vif_t *vif)
  47.475  {
  47.476      int i;
  47.477 -    unsigned long *pte;
  47.478 +    unsigned long *ptep, pte;
  47.479      struct pfn_info *page;
  47.480      struct task_struct *p = vif->domain;
  47.481      rx_shadow_entry_t *rx;
  47.482 @@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
  47.483      net_idx_t *shared_idxs = vif->shared_idxs;
  47.484  
  47.485      /* Return any outstanding receive buffers to the guest OS. */
  47.486 -    spin_lock(&p->page_lock);
  47.487 +    spin_lock(&vif->rx_lock);
  47.488      for ( i = vif->rx_req_cons; 
  47.489            (i != shared_idxs->rx_req_prod) && 
  47.490                (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 
  47.491 @@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif)
  47.492      {
  47.493          rx = &vif->rx_shadow_ring[i];
  47.494  
  47.495 -        /* Release the page-table page. */
  47.496 -        page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
  47.497 -        put_page_type(page);
  47.498 -        put_page_tot(page);
  47.499 -
  47.500          /* Give the buffer page back to the domain. */
  47.501 -        page = frame_table + rx->buf_pfn;
  47.502 -        list_add(&page->list, &p->pg_head);
  47.503 -        page->flags = vif->domain->domain;
  47.504 +        page = &frame_table[rx->buf_pfn];
  47.505 +        spin_lock(&p->page_list_lock);
  47.506 +        list_add(&page->list, &p->page_list);
  47.507 +        page->count_and_flags = PGC_allocated | 2;
  47.508 +        spin_unlock(&p->page_list_lock);
  47.509 +        get_page_type(page, PGT_writeable_page);
  47.510 +        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
  47.511 +        wmb();
  47.512  
  47.513          /* Patch up the PTE if it hasn't changed under our feet. */
  47.514 -        pte = map_domain_mem(rx->pte_ptr);
  47.515 -        if ( !(*pte & _PAGE_PRESENT) )
  47.516 -        {
  47.517 -            *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) | 
  47.518 -                _PAGE_RW | _PAGE_PRESENT;
  47.519 -            page->flags |= PGT_writeable_page | PG_need_flush;
  47.520 -            set_page_type_count(page, 1);
  47.521 -            set_page_tot_count(page, 1);
  47.522 -        }
  47.523 -        unmap_domain_mem(pte);
  47.524 +        ptep = map_domain_mem(rx->pte_ptr);
  47.525 +        pte  = *ptep;
  47.526 +        if ( unlikely(pte & _PAGE_PRESENT) ||
  47.527 +             unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) | 
  47.528 +                              (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT)
  47.529 +                      != pte) )
  47.530 +            put_page_and_type(page);
  47.531 +        unmap_domain_mem(ptep);
  47.532 +
  47.533 +        put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
  47.534  
  47.535          make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
  47.536      }
  47.537      vif->rx_cons = i;
  47.538 -    spin_unlock(&p->page_lock);
  47.539 +    spin_unlock(&vif->rx_lock);
  47.540  
  47.541      /*
  47.542       * Flush pending transmit buffers. The guest may still have to wait for
  47.543 @@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
  47.544                (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 
  47.545            i = TX_RING_INC(i) )
  47.546      {
  47.547 -        __make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
  47.548 +        make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
  47.549                             RING_STATUS_DROPPED);
  47.550      }
  47.551      vif->tx_req_cons = i;
  47.552 @@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop)
  47.553  }
  47.554  
  47.555  
  47.556 -static void __make_tx_response(net_vif_t     *vif, 
  47.557 -                               unsigned short id, 
  47.558 -                               unsigned char  st)
  47.559 +static void make_tx_response(net_vif_t     *vif, 
  47.560 +                             unsigned short id, 
  47.561 +                             unsigned char  st)
  47.562  {
  47.563      unsigned int pos;
  47.564      tx_resp_entry_t *resp;
  47.565 @@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t  
  47.566      rx_resp_entry_t *resp;
  47.567  
  47.568      /* Place on the response ring for the relevant domain. */ 
  47.569 -    spin_lock(&vif->rx_lock);
  47.570      pos  = vif->rx_resp_prod;
  47.571      resp = &vif->shared_rings->rx_ring[pos].resp;
  47.572      resp->id     = id;
  47.573 @@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t  
  47.574          unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
  47.575          guest_event_notify(cpu_mask);    
  47.576      }
  47.577 -    spin_unlock(&vif->rx_lock);
  47.578  }
  47.579  
  47.580  
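
With the domain-wide page_lock removed, both deliver_packet() and flush_bufs_for_vif() install the replacement buffer frame by atomically swizzling the guest PTE with cmpxchg and backing out if it changed underneath them. A minimal sketch of that pattern, distilled from the code above; the helper and its return convention are illustrative:

    /* Illustrative only: map machine frame 'new_pfn' at the PTE found at machine
       address 'pte_ma', but only if the slot is still not-present and unchanged. */
    static int swizzle_pte(unsigned long pte_ma, unsigned long new_pfn)
    {
        unsigned long *ptep = map_domain_mem(pte_ma);
        unsigned long  pte  = *ptep;
        int ok = 0;

        if ( !(pte & _PAGE_PRESENT) &&
             (cmpxchg(ptep, pte,
                      (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
                      (new_pfn << PAGE_SHIFT)) == pte) )
            ok = 1; /* installed; otherwise the guest raced us and we back off */

        unmap_domain_mem(ptep);
        return ok;
    }
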
    48.1 --- a/xen/net/skbuff.c	Sat Dec 20 11:49:50 2003 +0000
    48.2 +++ b/xen/net/skbuff.c	Sat Dec 20 12:44:39 2003 +0000
    48.3 @@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool(
    48.4  
    48.5  static inline u8 *alloc_skb_data_page(struct sk_buff *skb)
    48.6  {
    48.7 -    struct list_head *list_ptr;
    48.8 -    struct pfn_info  *pf;
    48.9 -    unsigned long flags;
   48.10 -        
   48.11 -    spin_lock_irqsave(&free_list_lock, flags);
   48.12 -
   48.13 -    if (!free_pfns) return NULL;
   48.14 -
   48.15 -    list_ptr = free_list.next;
   48.16 -    pf = list_entry(list_ptr, struct pfn_info, list);
   48.17 -    pf->flags = 0;
   48.18 -    list_del(&pf->list);
   48.19 -    free_pfns--;
   48.20 -
   48.21 -    spin_unlock_irqrestore(&free_list_lock, flags);
   48.22 -
   48.23 +    struct pfn_info *pf;
   48.24 +    if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) )
   48.25 +        return NULL;
   48.26      skb->pf = pf;
   48.27      return (u8 *)((pf - frame_table) << PAGE_SHIFT);
   48.28  }
   48.29  
   48.30  static inline void dealloc_skb_data_page(struct sk_buff *skb)
   48.31  {
   48.32 -    struct pfn_info  *pf;
   48.33 +    struct pfn_info *pf = skb->pf;
   48.34      unsigned long flags;
   48.35 -
   48.36 -    pf = skb->pf;
   48.37 -
   48.38      spin_lock_irqsave(&free_list_lock, flags);
   48.39 -        
   48.40 -    pf->flags = 0;
   48.41 -    set_page_type_count(pf, 0);
   48.42 -    set_page_tot_count(pf, 0);
   48.43      list_add(&pf->list, &free_list);
   48.44      free_pfns++;
   48.45 -
   48.46      spin_unlock_irqrestore(&free_list_lock, flags);
   48.47  
   48.48  }
    49.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c	Sat Dec 20 11:49:50 2003 +0000
    49.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c	Sat Dec 20 12:44:39 2003 +0000
    49.3 @@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void)
    49.4          pte = update_debug_queue[i].ptep;
    49.5          if ( pte == NULL ) continue;
    49.6          update_debug_queue[i].ptep = NULL;
    49.7 -        update.ptr = pte;
    49.8 +        update.ptr = virt_to_machine(pte);
    49.9          update.val = update_debug_queue[i].pteval;
   49.10          HYPERVISOR_mmu_update(&update, 1);
   49.11      }
   49.12 @@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsig
   49.13      pgd = pgd_offset_k(va);
   49.14      pmd = pmd_offset(pgd, va);
   49.15      pte = pte_offset(pmd, va);
   49.16 -    update.ptr = pte;
   49.17 +    update.ptr = virt_to_machine(pte);
   49.18      pteval = *(unsigned long *)pte;
   49.19      update.val = pteval & ~_PAGE_PRESENT;
   49.20      HYPERVISOR_mmu_update(&update, 1);
   49.21 @@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(v
   49.22  #if MMU_UPDATE_DEBUG > 0
   49.23          DEBUG_allow_pt_reads();
   49.24  #endif
   49.25 -        queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx);
   49.26 +        queue_multicall2(__HYPERVISOR_mmu_update, 
   49.27 +                         (unsigned long)update_queue, 
   49.28 +                         idx);
   49.29          idx = 0;
   49.30      }
   49.31      spin_unlock_irqrestore(&update_lock, flags);
   49.32 @@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, u
   49.33  #if MMU_UPDATE_DEBUG > 0
   49.34      DEBUG_disallow_pt_read((unsigned long)ptr);
   49.35  #endif
   49.36 -    update_queue[idx].ptr = (unsigned long)ptr;
   49.37 +    update_queue[idx].ptr = virt_to_machine(ptr);
   49.38      update_queue[idx].val = val;
   49.39      increment_index();
   49.40      spin_unlock_irqrestore(&update_lock, flags);
   49.41 @@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, u
   49.42  {
   49.43      unsigned long flags;
   49.44      spin_lock_irqsave(&update_lock, flags);
   49.45 -    update_queue[idx].ptr = (unsigned long)ptr;
   49.46 +    update_queue[idx].ptr = virt_to_machine(ptr);
   49.47      update_queue[idx].val = val;
   49.48      increment_index();
   49.49      spin_unlock_irqrestore(&update_lock, flags);
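
After this change the MMU update queue carries the machine address of each PTE: queue_l1_entry_update() and queue_l2_entry_update() translate the pointer with virt_to_machine() before storing it, matching what the debug path above already does when it calls HYPERVISOR_mmu_update() directly. A minimal sketch of a direct, unqueued update in the same style; the record type name mmu_update_t is an assumption here, as the hunk only shows the .ptr/.val fields:

    /* Illustrative only: rewrite one PTE immediately via the hypervisor,
       passing the PTE's machine address as the debug path above does. */
    static void update_one_pte(pte_t *ptep, unsigned long new_val)
    {
        mmu_update_t update;                 /* type name assumed; fields as used above */
        update.ptr = virt_to_machine(ptep);  /* machine address of the PTE */
        update.val = new_val;
        HYPERVISOR_mmu_update(&update, 1);
    }
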
    50.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c	Sat Dec 20 11:49:50 2003 +0000
    50.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c	Sat Dec 20 12:44:39 2003 +0000
    50.3 @@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigne
    50.4      }
    50.5      pte = pte_offset(pmd, vaddr);
    50.6  
    50.7 -#if 0 /* Not in Xen, since this breaks clear_fixmap. */
    50.8 -    if (pte_val(*pte))
    50.9 -        pte_ERROR(*pte);
   50.10 -#endif
   50.11 -
   50.12 -    /* We queue directly, avoiding hidden phys->machine translation. */
   50.13 -    queue_l1_entry_update(pte, phys | pgprot_val(prot));
   50.14 +    if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) )
   50.15 +        queue_unchecked_mmu_update(pte, phys | pgprot_val(prot));
   50.16 +    else
   50.17 +        queue_l1_entry_update(pte, phys | pgprot_val(prot));
   50.18  
   50.19      /*
   50.20       * It's enough to flush this one mapping.
   50.21 @@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses i
   50.22          printk("Invalid __set_fixmap\n");
   50.23          return;
   50.24      }
   50.25 -    set_pte_phys(address, phys, 
   50.26 -                 __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags)));
   50.27 +    set_pte_phys(address, phys, flags);
   50.28  }
   50.29  
   50.30  void clear_fixmap(enum fixed_addresses idx)
    51.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c	Sat Dec 20 11:49:50 2003 +0000
    51.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c	Sat Dec 20 12:44:39 2003 +0000
    51.3 @@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long ma
    51.4           */
    51.5          nrpages = size >> PAGE_SHIFT;
    51.6          if (nrpages > NR_FIX_BTMAPS)
    51.7 -                return NULL;
    51.8 +            return NULL;
    51.9  
   51.10          /*
   51.11           * Ok, go for it..
   51.12           */
   51.13          idx = FIX_BTMAP_BEGIN;
   51.14          while (nrpages > 0) {
   51.15 -                set_fixmap(idx, machine_addr);
   51.16 +                __set_fixmap(idx, machine_addr, 
   51.17 +                             __pgprot(__PAGE_KERNEL|_PAGE_IO));
   51.18                  machine_addr += PAGE_SIZE;
   51.19                  --idx;
   51.20                  --nrpages;