ia64/xen-unstable

changeset 1015:dce3446ac01e

bitkeeper revision 1.656 (3fe4de1f1IOfUVzwLIqE8EHIf7xJoA)

Merge nidd.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into nidd.cl.cam.ac.uk:/auto/anfs/scratch/labyrinth/iap10/xeno-clone/xeno.bk
author iap10@nidd.cl.cam.ac.uk
date Sat Dec 20 23:41:19 2003 +0000 (2003-12-20)
parents 4bfd14bb15cd b301a3d4a2a5
children 94cd24f6b95e
files .rootkeys tools/xc/lib/xc_linux_build.c tools/xc/lib/xc_linux_restore.c tools/xc/lib/xc_linux_save.c xen/GUEST_CHANGES xen/Makefile xen/TODO xen/arch/i386/Rules.mk xen/arch/i386/apic.c xen/arch/i386/entry.S xen/arch/i386/flushtlb.c xen/arch/i386/io_apic.c xen/arch/i386/ioremap.c xen/arch/i386/irq.c xen/arch/i386/mm.c xen/arch/i386/pci-irq.c xen/arch/i386/process.c xen/arch/i386/smp.c xen/arch/i386/smpboot.c xen/arch/i386/traps.c xen/common/dom0_ops.c xen/common/dom_mem_ops.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xen/common/network.c xen/common/page_alloc.c xen/drivers/block/ll_rw_blk.c xen/drivers/block/xen_block.c xen/drivers/block/xen_vbd.c xen/drivers/net/e1000/e1000_main.c xen/include/asm-i386/atomic.h xen/include/asm-i386/flushtlb.h xen/include/asm-i386/io.h xen/include/asm-i386/page.h xen/include/asm-i386/pgalloc.h xen/include/asm-i386/smp.h xen/include/asm-i386/spinlock.h xen/include/asm-i386/system.h xen/include/hypervisor-ifs/dom0_ops.h xen/include/hypervisor-ifs/hypervisor-if.h xen/include/xeno/config.h xen/include/xeno/mm.h xen/include/xeno/perfc.h xen/include/xeno/perfc_defn.h xen/include/xeno/sched.h xen/include/xeno/vif.h xen/net/dev.c xen/net/skbuff.c xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c xenolinux-2.4.23-sparse/arch/xeno/mm/init.c xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
line diff
     1.1 --- a/.rootkeys	Sat Dec 20 23:39:49 2003 +0000
     1.2 +++ b/.rootkeys	Sat Dec 20 23:41:19 2003 +0000
     1.3 @@ -79,10 +79,8 @@ 3fbd0a42l40lM0IICw2jXbQBVZSdZg tools/xc/
     1.4  3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py
     1.5  3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py
     1.6  3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING
     1.7 -3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES
     1.8  3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile
     1.9  3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk
    1.10 -3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO
    1.11  3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile
    1.12  3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk
    1.13  3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c
    1.14 @@ -92,6 +90,7 @@ 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/
    1.15  3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c
    1.16  3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S
    1.17  3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c
    1.18 +3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c
    1.19  3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c
    1.20  3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c
    1.21  3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c
     2.1 --- a/tools/xc/lib/xc_linux_build.c	Sat Dec 20 23:39:49 2003 +0000
     2.2 +++ b/tools/xc/lib/xc_linux_build.c	Sat Dec 20 23:41:19 2003 +0000
     2.3 @@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle,
     2.4                           const char *cmdline,
     2.5                           unsigned long shared_info_frame)
     2.6  {
     2.7 -    l1_pgentry_t *vl1tab = NULL, *vl1e = NULL;
     2.8 -    l2_pgentry_t *vl2tab = NULL, *vl2e = NULL;
     2.9 +    l1_pgentry_t *vl1tab;
    2.10 +    l2_pgentry_t *vl2tab;
    2.11      unsigned long *page_array = NULL;
    2.12      mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL;
    2.13      int alloc_index, num_pt_pages;
    2.14 -    unsigned long l2tab;
    2.15 +    unsigned long l2tab, l2e, l1e=0;
    2.16      unsigned long l1tab = 0;
    2.17      unsigned long num_pgt_updates = 0;
    2.18      unsigned long count, pt_start, i, j;
    2.19 @@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle,
    2.20      if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL )
    2.21          goto error_out;
    2.22      memset(vl2tab, 0, PAGE_SIZE);
    2.23 -    vl2e = vl2tab + l2_table_offset(virt_load_addr);
    2.24 +    unmap_pfn(pm_handle, vl2tab);
    2.25 +    l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t));
    2.26      for ( count = 0; count < tot_pages; count++ )
    2.27      {    
    2.28 -        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) 
    2.29 +        if ( (l1e & (PAGE_SIZE-1)) == 0 )
    2.30          {
    2.31              l1tab = page_array[alloc_index] << PAGE_SHIFT;
    2.32              if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL )
    2.33                  goto error_out;
    2.34              memset(vl1tab, 0, PAGE_SIZE);
    2.35 +            unmap_pfn(pm_handle, vl1tab);
    2.36              alloc_index--;
    2.37  		
    2.38 -            vl1e = vl1tab + l1_table_offset(virt_load_addr + 
    2.39 -                                            (count << PAGE_SHIFT));
    2.40 +            l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))*
    2.41 +                           sizeof(l1_pgentry_t));
    2.42  
    2.43              /* make apropriate entry in the page directory */
    2.44 -            pgt_updates->ptr = (unsigned long)vl2e;
    2.45 +            pgt_updates->ptr = l2e;
    2.46              pgt_updates->val = l1tab | L2_PROT;
    2.47              pgt_updates++;
    2.48              num_pgt_updates++;
    2.49 -            vl2e++;
    2.50 +            l2e += sizeof(l2_pgentry_t);
    2.51          }
    2.52  
    2.53          if ( count < pt_start )
    2.54          {
    2.55 -            pgt_updates->ptr = (unsigned long)vl1e;
    2.56 +            pgt_updates->ptr = l1e;
    2.57              pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT;
    2.58              pgt_updates++;
    2.59              num_pgt_updates++;
    2.60 -            vl1e++;
    2.61 +            l1e += sizeof(l1_pgentry_t);
    2.62          }
    2.63          else
    2.64          {
    2.65 -            pgt_updates->ptr = (unsigned long)vl1e;
    2.66 +            pgt_updates->ptr = l1e;
    2.67              pgt_updates->val = 
    2.68                  ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW;
    2.69              pgt_updates++;
    2.70              num_pgt_updates++;
    2.71 -            vl1e++;
    2.72 +            l1e += sizeof(l1_pgentry_t);
    2.73          }
    2.74  
    2.75          pgt_updates->ptr = 
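
In the setup_guestos() hunk above, the page-table entries handed to the mmu_update interface are now named by machine address (table base plus entry offset) rather than by a pointer into a map_pfn() mapping, and each table is unmapped as soon as it has been zeroed. A minimal sketch of the new address arithmetic, assuming the same headers as xc_linux_build.c (the helper name is hypothetical):

    /* Illustrative sketch, not part of the patch: given the machine address of
     * an L1 page table and a guest virtual address, compute the machine
     * address of the entry that maps it.  This is the quantity the rewritten
     * loop places in pgt_updates->ptr, where the old code passed a pointer
     * into a map_pfn() mapping of the table. */
    static unsigned long l1_entry_maddr(unsigned long l1tab, unsigned long va)
    {
        return l1tab + l1_table_offset(va) * sizeof(l1_pgentry_t);
    }
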
     3.1 --- a/tools/xc/lib/xc_linux_restore.c	Sat Dec 20 23:39:49 2003 +0000
     3.2 +++ b/tools/xc/lib/xc_linux_restore.c	Sat Dec 20 23:41:19 2003 +0000
     3.3 @@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle,
     3.4                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
     3.5                  }
     3.6                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
     3.7 -                                    (unsigned long)&ppage[j], page[j]) )
     3.8 +                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)),
     3.9 +                                    page[j]) )
    3.10                      goto out;
    3.11              }
    3.12              break;
    3.13 @@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle,
    3.14                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
    3.15                  }
    3.16                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
    3.17 -                                    (unsigned long)&ppage[j], page[j]) )
    3.18 +                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)),
    3.19 +                                    page[j]) )
    3.20                      goto out;
    3.21              }
    3.22              break;
    3.23 @@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle,
    3.24              memcpy(ppage, page, PAGE_SIZE);
    3.25              break;
    3.26          }
    3.27 -        /* NB. Must flush before unmapping page, as pass VAs to Xen. */
    3.28 -        if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) )
    3.29 -            goto out;
    3.30          unmap_pfn(pm_handle, ppage);
    3.31  
    3.32          if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
     4.1 --- a/tools/xc/lib/xc_linux_save.c	Sat Dec 20 23:39:49 2003 +0000
     4.2 +++ b/tools/xc/lib/xc_linux_save.c	Sat Dec 20 23:41:19 2003 +0000
     4.3 @@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_ha
     4.4  {
     4.5      dom0_op_t op;
     4.6      op.cmd = DOM0_GETPAGEFRAMEINFO;
     4.7 -    op.u.getpageframeinfo.pfn = mfn;
     4.8 -    if ( (do_dom0_op(xc_handle, &op) < 0) || 
     4.9 -         (op.u.getpageframeinfo.domain != dom) )
    4.10 -        return 0;
    4.11 -    return 1;
    4.12 +    op.u.getpageframeinfo.pfn    = mfn;
    4.13 +    op.u.getpageframeinfo.domain = dom;
    4.14 +    return (do_dom0_op(xc_handle, &op) >= 0);
    4.15  }
    4.16  
    4.17  #define GETPFN_ERR (~0U)
    4.18 -static unsigned int get_pfn_type(int xc_handle, unsigned long mfn)
    4.19 +static unsigned int get_pfn_type(int xc_handle, 
    4.20 +                                 unsigned long mfn, 
    4.21 +                                 unsigned int dom)
    4.22  {
    4.23      dom0_op_t op;
    4.24      op.cmd = DOM0_GETPAGEFRAMEINFO;
    4.25 -    op.u.getpageframeinfo.pfn = mfn;
    4.26 +    op.u.getpageframeinfo.pfn    = mfn;
    4.27 +    op.u.getpageframeinfo.domain = dom;
    4.28      if ( do_dom0_op(xc_handle, &op) < 0 )
    4.29      {
    4.30          PERROR("Unexpected failure when getting page frame info!");
    4.31 @@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle,
    4.32          mfn_to_pfn_table[mfn] = i;
    4.33  
    4.34          /* Query page type by MFN, but store it by PFN. */
    4.35 -        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR )
    4.36 +        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) == 
    4.37 +             GETPFN_ERR )
    4.38              goto out;
    4.39      }
    4.40  
     5.1 --- a/xen/GUEST_CHANGES	Sat Dec 20 23:39:49 2003 +0000
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,26 +0,0 @@
     5.4 -
     5.5 -The interface between Xen and overlying guest OSes has changed in the
     5.6 -following ways since version 1.0:
     5.7 -
     5.8 -Modified hypercall 'pt_update'
     5.9 -------------------------------
    5.10 -Page-table updates passed to the 'pt_update' hypercall must now
    5.11 -specify a virtual address that maps the PTE to be modified. Previously
    5.12 -a physical address was used, requiring Xen to temporarily map the PTE
    5.13 -into its own private region so that it could be read and written.
    5.14 -This affects only commands of type PGREQ_NORMAL_UPDATE and
    5.15 -PGREQ_UNCHECKED_UPDATE.
    5.16 -
    5.17 -New hypercall 'update_va_mapping'
    5.18 ----------------------------------
    5.19 -A new high-speed page-table update method has been introduced, which
    5.20 -may be of particular benefit when fixing up application page faults.
    5.21 -Invoked as 'update_va_mapping(page_number, new_pte_value, flags)':
    5.22 - <page_number>: The virtual page number in the current address space 
    5.23 -                whose PTE is to be modified.
    5.24 - <new_pte_value>: The new value to write into the PTE.
    5.25 - <flags>: An ORed combination of
    5.26 -          UVMF_INVLPG: Flush stale TLB entry of the updated page mapping
    5.27 -          UVMF_FLUSH_TLB: Flush all TLB entries
    5.28 -You can see this new call in use in Xenolinux (common/memory.c).
    5.29 -
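
The deleted GUEST_CHANGES text above documents the 'update_va_mapping' hypercall by its argument order and flags. A hedged sketch of a call following that description (the wrapper name and the page-flag constants are assumptions; only the arguments and the UVMF_* flags come from the text above):

    /* Sketch only: point one virtual page at a new machine frame and flush
     * just that page's stale TLB entry, per the deleted description. */
    static void remap_one_page(void)
    {
        unsigned long va      = 0xC0100000UL;  /* hypothetical guest virtual address */
        unsigned long mfn     = 0x1234UL;      /* hypothetical machine frame number  */
        unsigned long new_pte = (mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

        /* <page_number>, <new_pte_value>, <flags> as documented above. */
        HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte, UVMF_INVLPG);
    }
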
     6.1 --- a/xen/Makefile	Sat Dec 20 23:39:49 2003 +0000
     6.2 +++ b/xen/Makefile	Sat Dec 20 23:41:19 2003 +0000
     6.3 @@ -2,7 +2,7 @@
     6.4  # This is the correct place to edit the build version.
     6.5  # All other places this is stored (eg. compile.h) should be autogenerated.
     6.6  export XEN_VERSION       = 1
     6.7 -export XEN_SUBVERSION    = 2
     6.8 +export XEN_SUBVERSION    = 3
     6.9  export XEN_EXTRAVERSION  = "-rc"
    6.10  
    6.11  export BASEDIR          := $(shell pwd)
     7.1 --- a/xen/TODO	Sat Dec 20 23:39:49 2003 +0000
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,54 +0,0 @@
     7.4 -
     7.5 -This is stuff we probably want to implement in the near future.
     7.6 -
     7.7 - -- Keir (16/3/03)
     7.8 -
     7.9 -
    7.10 -1. DOMAIN-0 MANAGEMENT DAEMON
    7.11 ------------------------------
    7.12 -A better control daemon is required for domain 0, which keeps proper
    7.13 -track of machine resources and can make sensible policy choices. This
    7.14 -may require support in Xen; for example, notifications (eg. DOMn is
    7.15 -killed), and requests (eg. can DOMn allocate x frames of memory?).
    7.16 -
    7.17 -2. ASSIGNING DOMAINS TO PROCESSORS
    7.18 -----------------------------------
    7.19 -More intelligent assignment of domains to processors. In
    7.20 -particular, we don't play well with hyperthreading: we will assign
    7.21 -domains to virtual processors on the same package, rather then
    7.22 -spreading them across processor packages.
    7.23 -
    7.24 -What we need to do is port code from Linux which stores information on
    7.25 -relationships between processors in the system (eg. which ones are
    7.26 -siblings in the same package). We then use this to balance domains
    7.27 -across packages, and across virtual processors within a package.
    7.28 -
    7.29 -3. SANE NETWORK ROUTING
    7.30 ------------------------
    7.31 -The current virtual firewall/router is completely broken. Needs a new
    7.32 -design and implementation!
    7.33 -
    7.34 -
    7.35 -
    7.36 -Graveyard
    7.37 -*********
    7.38 -
    7.39 -The hypervisor page cache
    7.40 --------------------------
    7.41 -This will allow guest OSes to make use of spare pages in the system, but
    7.42 -allow them to be immediately used for any new domains or memory requests.
    7.43 -The idea is that, when a page is laundered and falls off Linux's clean_LRU
    7.44 -list, rather than freeing it it becomes a candidate for passing down into
    7.45 -the hypervisor. In return, xeno-linux may ask for one of its previously-
    7.46 -cached pages back:
    7.47 - (page, new_id) = cache_query(page, old_id);
    7.48 -If the requested page couldn't be kept, a blank page is returned.
    7.49 -When would Linux make the query? Whenever it wants a page back without
    7.50 -the delay or going to disc. Also, whenever a page would otherwise be
    7.51 -flushed to disc.
    7.52 -
    7.53 -To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL);
    7.54 - [NULL means "give me a blank page"].
    7.55 -To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id)
    7.56 - [we may request that x_page just be discarded, and therefore not impinge
    7.57 -  on this domain's cache quota].
     9.1 --- a/xen/arch/i386/apic.c	Sat Dec 20 23:39:49 2003 +0000
     9.2 +++ b/xen/arch/i386/apic.c	Sat Dec 20 23:41:19 2003 +0000
     9.3 @@ -47,7 +47,7 @@
     9.4  #include <asm/hardirq.h>
     9.5  #include <asm/apic.h>
     9.6  #include <xeno/mm.h>
     9.7 -
     9.8 +#include <asm/io_apic.h>
     9.9  #include <asm/timex.h>
    9.10  #include <xeno/ac_timer.h>
    9.11  #include <xeno/perfc.h>
    10.1 --- a/xen/arch/i386/entry.S	Sat Dec 20 23:39:49 2003 +0000
    10.2 +++ b/xen/arch/i386/entry.S	Sat Dec 20 23:41:19 2003 +0000
    10.3 @@ -82,7 +82,6 @@
    10.4  #include <xeno/config.h>
    10.5  #include <xeno/errno.h>
    10.6  #include <hypervisor-ifs/hypervisor-if.h>
    10.7 -#include <asm/smp.h>
    10.8  
    10.9  EBX		= 0x00
   10.10  ECX		= 0x04
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/xen/arch/i386/flushtlb.c	Sat Dec 20 23:41:19 2003 +0000
    11.3 @@ -0,0 +1,64 @@
    11.4 +/******************************************************************************
    11.5 + * flushtlb.c
    11.6 + * 
    11.7 + * TLB flushes are timestamped using a global virtual 'clock' which ticks
    11.8 + * on any TLB flush on any processor.
    11.9 + * 
   11.10 + * Copyright (c) 2003, K A Fraser
   11.11 + */
   11.12 +
   11.13 +#include <xeno/config.h>
   11.14 +#include <xeno/sched.h>
   11.15 +#include <asm/flushtlb.h>
   11.16 +
   11.17 +unsigned long tlbflush_mask;
   11.18 +unsigned long tlbflush_clock;
   11.19 +unsigned long tlbflush_time[NR_CPUS];
   11.20 +
   11.21 +static inline void tlb_clocktick(unsigned int cpu)
   11.22 +{
   11.23 +    unsigned long x, nx, y, ny;
   11.24 +    
   11.25 +    clear_bit(cpu, &tlbflush_mask);
   11.26 +
   11.27 +    /* Tick the clock. 'y' contains the current time after the tick. */
   11.28 +    ny = tlbflush_clock;
   11.29 +    do {
   11.30 +#ifdef CONFIG_SMP
   11.31 +        if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
   11.32 +        {
   11.33 +            new_tlbflush_clock_period();
   11.34 +            y = tlbflush_clock;
   11.35 +            break;
   11.36 +        }
   11.37 +#else
   11.38 +        y = ny+1;
   11.39 +#endif
   11.40 +    }
   11.41 +    while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) );
   11.42 +
   11.43 +    /* Update cpu's timestamp to current time, unless someone else beats us. */
   11.44 +    nx = tlbflush_time[cpu];
   11.45 +    do { 
   11.46 +        if ( unlikely((x = nx) >= y) )
   11.47 +            break;
   11.48 +    }
   11.49 +    while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) );
   11.50 +}
   11.51 +
   11.52 +void write_cr3_counted(unsigned long pa)
   11.53 +{
   11.54 +    __asm__ __volatile__ ( 
   11.55 +        "movl %0, %%cr3"
   11.56 +        : : "r" (pa) : "memory" );
   11.57 +    tlb_clocktick(smp_processor_id());
   11.58 +}
   11.59 +
   11.60 +void flush_tlb_counted(void)
   11.61 +{
   11.62 +    __asm__ __volatile__ ( 
   11.63 +        "movl %%cr3, %%eax; movl %%eax, %%cr3"
   11.64 +        : : : "memory", "eax" );
   11.65 +    tlb_clocktick(smp_processor_id());
   11.66 +}
   11.67 +
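
The new flushtlb.c above maintains a global virtual clock: every counted CR3 load or full flush ticks tlbflush_clock and records the post-tick value in the flushing CPU's tlbflush_time[] slot, with new_tlbflush_clock_period() forcing a global flush whenever the clock crosses a GLOBAL_FLUSH_PERIOD boundary. A freed page can then be stamped with the clock value at free time, and only CPUs whose own timestamp is not newer than that stamp can still hold a stale mapping. A rough sketch of that comparison (the real test is the NEED_FLUSH() macro in asm-i386/flushtlb.h, whose definition is not shown in this changeset; this version ignores clock wraparound, which the patch appears to bound via the periodic global flush):

    /* Illustrative only: decide whether 'cpu' might still hold a stale TLB
     * entry for a page freed when the flush clock read 'page_stamp'. */
    static int may_need_flush(unsigned int cpu, unsigned long page_stamp)
    {
        /* tlbflush_time[cpu] holds the clock value after that CPU's last
         * flush, so a value no newer than the page's stamp means no flush
         * has happened since the page was freed. */
        return tlbflush_time[cpu] <= page_stamp;
    }
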
    12.1 --- a/xen/arch/i386/io_apic.c	Sat Dec 20 23:39:49 2003 +0000
    12.2 +++ b/xen/arch/i386/io_apic.c	Sat Dec 20 23:41:19 2003 +0000
    12.3 @@ -28,6 +28,8 @@
    12.4  #include <xeno/config.h>
    12.5  #include <asm/mc146818rtc.h>
    12.6  #include <asm/io.h>
    12.7 +#include <asm/mpspec.h>
    12.8 +#include <asm/io_apic.h>
    12.9  #include <asm/smp.h>
   12.10  #include <asm/desc.h>
   12.11  #include <asm/smpboot.h>
    13.1 --- a/xen/arch/i386/ioremap.c	Sat Dec 20 23:39:49 2003 +0000
    13.2 +++ b/xen/arch/i386/ioremap.c	Sat Dec 20 23:41:19 2003 +0000
    13.3 @@ -15,92 +15,50 @@
    13.4  #include <asm/pgalloc.h>
    13.5  #include <asm/page.h>
    13.6  
    13.7 -static unsigned long remap_base = 0;
    13.8 +static unsigned long remap_base = IOREMAP_VIRT_START;
    13.9  
   13.10  #define PAGE_ALIGN(addr)    (((addr)+PAGE_SIZE-1)&PAGE_MASK)
   13.11  
   13.12 -static void new_l2e(l2_pgentry_t *pl2e)
   13.13 -{
   13.14 -    l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
   13.15 -    if ( !pl1e ) BUG();
   13.16 -    clear_page(pl1e);
   13.17 -    *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR);
   13.18 -}
   13.19 -
   13.20 -
   13.21 -void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
   13.22 +void * __ioremap(unsigned long phys_addr, 
   13.23 +                 unsigned long size, 
   13.24 +                 unsigned long flags)
   13.25  {
   13.26      unsigned long vaddr;
   13.27      unsigned long offset, cur=0, last_addr;
   13.28      l2_pgentry_t *pl2e;
   13.29      l1_pgentry_t *pl1e;
   13.30  
   13.31 -    /* First time through, start allocating from far end of virtual memory. */
   13.32 -    if ( !remap_base ) remap_base = IOREMAP_VIRT_START;
   13.33 -
   13.34      /* Don't allow wraparound or zero size */
   13.35      last_addr = phys_addr + size - 1;
   13.36 -    if (!size || last_addr < phys_addr)
   13.37 +    if ( (size == 0) || (last_addr < phys_addr) )
   13.38          return NULL;
   13.39  
   13.40 -    /*
   13.41 -     * Don't remap the low PCI/ISA area, it's always mapped..
   13.42 -     */
   13.43 -    if (phys_addr >= 0xA0000 && last_addr < 0x100000)
   13.44 +    /* Don't remap the low PCI/ISA area: it's always mapped. */
   13.45 +    if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) )
   13.46          return phys_to_virt(phys_addr);
   13.47  
   13.48 -    if(remap_base + size > IOREMAP_VIRT_END-1) {
   13.49 -      printk("ioremap: going past end of reserved space!\n");
   13.50 -      return NULL;
   13.51 +    if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) )
   13.52 +    {
   13.53 +        printk("ioremap: going past end of reserved space!\n");
   13.54 +        return NULL;
   13.55      }
   13.56 -#if 0
   13.57 -    /*
   13.58 -     * Don't allow anybody to remap normal RAM that we're using..
   13.59 -     */
   13.60 -    if (phys_addr < virt_to_phys(high_memory)) {
   13.61 -        char *t_addr, *t_end;
   13.62 -        struct pfn_info *page;
   13.63  
   13.64 -        t_addr = __va(phys_addr);
   13.65 -        t_end = t_addr + (size - 1);
   13.66 -	   
   13.67 -        for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
   13.68 -            if(!PageReserved(page))
   13.69 -                return NULL;
   13.70 -    }
   13.71 -#endif
   13.72 -
   13.73 -    /*
   13.74 -     * Mappings have to be page-aligned
   13.75 -     */
   13.76 +    /* Mappings have to be page-aligned. */
   13.77      offset = phys_addr & ~PAGE_MASK;
   13.78      phys_addr &= PAGE_MASK;
   13.79      size = PAGE_ALIGN(last_addr) - phys_addr;
   13.80  
   13.81 -    /*
   13.82 -     * Ok, go for it..
   13.83 -     */
   13.84 +    /* Ok, go for it. */
   13.85      vaddr = remap_base;
   13.86      remap_base += size;
   13.87      pl2e = &idle_pg_table[l2_table_offset(vaddr)];
   13.88 -    if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
   13.89      pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr);
   13.90 -    for ( ; ; ) 
   13.91 -    {
   13.92 -        if ( !l1_pgentry_empty(*pl1e) ) BUG();
   13.93 +    do {
   13.94          *pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags);
   13.95 -        cur += PAGE_SIZE;
   13.96 -        if ( cur == size ) break;
   13.97 -        if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) )
   13.98 -        {
   13.99 -            if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
  13.100 -            pl1e = l2_pgentry_to_l1(*pl2e++);        
  13.101 -        }
  13.102      }
  13.103 +    while ( (cur += PAGE_SIZE) != size );
  13.104  
  13.105 -    flush_tlb_all();
  13.106 -
  13.107 -    return (void *) (offset + (char *)vaddr);
  13.108 +    return (void *)(offset + (char *)vaddr);
  13.109  }
  13.110  
  13.111  void iounmap(void *addr)
    14.1 --- a/xen/arch/i386/irq.c	Sat Dec 20 23:39:49 2003 +0000
    14.2 +++ b/xen/arch/i386/irq.c	Sat Dec 20 23:41:19 2003 +0000
    14.3 @@ -24,7 +24,8 @@
    14.4  #include <xeno/interrupt.h>
    14.5  #include <xeno/irq.h>
    14.6  #include <xeno/slab.h>
    14.7 -
    14.8 +#include <asm/mpspec.h>
    14.9 +#include <asm/io_apic.h>
   14.10  #include <asm/msr.h>
   14.11  #include <asm/hardirq.h>
   14.12  #include <asm/ptrace.h>
    15.1 --- a/xen/arch/i386/mm.c	Sat Dec 20 23:39:49 2003 +0000
    15.2 +++ b/xen/arch/i386/mm.c	Sat Dec 20 23:41:19 2003 +0000
    15.3 @@ -27,8 +27,8 @@
    15.4  #include <asm/fixmap.h>
    15.5  #include <asm/domain_page.h>
    15.6  
    15.7 -static inline void set_pte_phys (unsigned long vaddr,
    15.8 -                                 l1_pgentry_t entry)
    15.9 +static inline void set_pte_phys(unsigned long vaddr,
   15.10 +                                l1_pgentry_t entry)
   15.11  {
   15.12      l2_pgentry_t *l2ent;
   15.13      l1_pgentry_t *l1ent;
   15.14 @@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigne
   15.15      __flush_tlb_one(vaddr);
   15.16  }
   15.17  
   15.18 -void __set_fixmap (enum fixed_addresses idx, 
   15.19 -                   l1_pgentry_t entry)
   15.20 +
   15.21 +void __set_fixmap(enum fixed_addresses idx, 
   15.22 +                  l1_pgentry_t entry)
   15.23  {
   15.24      unsigned long address = __fix_to_virt(idx);
   15.25  
   15.26 -    if (idx >= __end_of_fixed_addresses) {
   15.27 +    if ( likely(idx < __end_of_fixed_addresses) )
   15.28 +        set_pte_phys(address, entry);
   15.29 +    else
   15.30          printk("Invalid __set_fixmap\n");
   15.31 -        return;
   15.32 -    }
   15.33 -    set_pte_phys(address, entry);
   15.34  }
   15.35  
   15.36 -static void __init fixrange_init (unsigned long start, 
   15.37 -                                  unsigned long end, l2_pgentry_t *pg_base)
   15.38 +
   15.39 +static void __init fixrange_init(unsigned long start, 
   15.40 +                                 unsigned long end, 
   15.41 +                                 l2_pgentry_t *pg_base)
   15.42  {
   15.43      l2_pgentry_t *l2e;
   15.44      int i;
   15.45 @@ -66,7 +68,8 @@ static void __init fixrange_init (unsign
   15.46  
   15.47      for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ ) 
   15.48      {
   15.49 -        if ( !l2_pgentry_empty(*l2e) ) continue;
   15.50 +        if ( !l2_pgentry_empty(*l2e) )
   15.51 +            continue;
   15.52          page = (unsigned long)get_free_page(GFP_KERNEL);
   15.53          clear_page(page);
   15.54          *l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR);
   15.55 @@ -79,11 +82,6 @@ void __init paging_init(void)
   15.56      unsigned long addr;
   15.57      void *ioremap_pt;
   15.58  
   15.59 -    /* XXX initialised in boot.S */
   15.60 -    /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/
   15.61 -    /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/
   15.62 -    /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/
   15.63 -
   15.64      /*
   15.65       * Fixed mappings, only the page table structure has to be
   15.66       * created - mappings will be set by set_fixmap():
   15.67 @@ -115,12 +113,12 @@ void __init paging_init(void)
   15.68  
   15.69  }
   15.70  
   15.71 -void __init zap_low_mappings (void)
   15.72 +void __init zap_low_mappings(void)
   15.73  {
   15.74      int i;
   15.75      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   15.76          idle_pg_table[i] = mk_l2_pgentry(0);
   15.77 -    flush_tlb_all();
   15.78 +    flush_tlb_all_pge();
   15.79  }
   15.80  
   15.81  
   15.82 @@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p,
   15.83               unsigned int entries)
   15.84  {
   15.85      /* NB. There are 512 8-byte entries per GDT page. */
   15.86 -    unsigned int i, j, nr_pages = (entries + 511) / 512;
   15.87 -    unsigned long pfn, *gdt_page;
   15.88 -    long ret = -EINVAL;
   15.89 -    struct pfn_info *page;
   15.90 +    int i, nr_pages = (entries + 511) / 512;
   15.91 +    unsigned long pfn;
   15.92      struct desc_struct *vgdt;
   15.93  
   15.94 -    spin_lock(&p->page_lock);
   15.95 -
   15.96      /* Check the new GDT. */
   15.97      for ( i = 0; i < nr_pages; i++ )
   15.98      {
   15.99 -        if ( frames[i] >= max_page ) 
  15.100 -            goto out;
  15.101 -        
  15.102 -        page = frame_table + frames[i];
  15.103 -        if ( (page->flags & PG_domain_mask) != p->domain )
  15.104 -            goto out;
  15.105 -
  15.106 -        if ( (page->flags & PG_type_mask) != PGT_gdt_page )
  15.107 -        {
  15.108 -            if ( page_type_count(page) != 0 )
  15.109 -                goto out;
  15.110 -
  15.111 -            /* Check all potential GDT entries in the page. */
  15.112 -            gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT);
  15.113 -            for ( j = 0; j < 512; j++ )
  15.114 -                if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) )
  15.115 -                    goto out;
  15.116 -            unmap_domain_mem(gdt_page);
  15.117 -        }
  15.118 -    }
  15.119 -
  15.120 -    /* Tear down the old GDT. */
  15.121 -    for ( i = 0; i < 16; i++ )
  15.122 -    {
  15.123 -        pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]);
  15.124 -        p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
  15.125 -        if ( pfn == 0 ) continue;
  15.126 -        page = frame_table + pfn;
  15.127 -        ASSERT((page->flags & PG_type_mask) == PGT_gdt_page);
  15.128 -        ASSERT((page->flags & PG_domain_mask) == p->domain);
  15.129 -        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
  15.130 -        put_page_type(page);
  15.131 -        put_page_tot(page);
  15.132 -    }
  15.133 -
  15.134 -    /* Install the new GDT. */
  15.135 -    for ( i = 0; i < nr_pages; i++ )
  15.136 -    {
  15.137 -        p->mm.perdomain_pt[i] =
  15.138 -            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  15.139 -        
  15.140 -        page = frame_table + frames[i];
  15.141 -        page->flags &= ~(PG_type_mask | PG_need_flush);
  15.142 -        page->flags |= PGT_gdt_page;
  15.143 -        get_page_type(page);
  15.144 -        get_page_tot(page);
  15.145 +        if ( unlikely(frames[i] >= max_page) ||
  15.146 +             unlikely(!get_page_and_type(&frame_table[frames[i]], 
  15.147 +                                         p, PGT_gdt_page)) )
  15.148 +            goto fail;
  15.149      }
  15.150  
  15.151      /* Copy reserved GDT entries to the new GDT. */
  15.152 -    vgdt = map_domain_mem(frames[i] << PAGE_SHIFT);
  15.153 +    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
  15.154      memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
  15.155             gdt_table + FIRST_RESERVED_GDT_ENTRY, 
  15.156             NR_RESERVED_GDT_ENTRIES*8);
  15.157      unmap_domain_mem(vgdt);
  15.158  
  15.159 +    /* Tear down the old GDT. */
  15.160 +    for ( i = 0; i < 16; i++ )
  15.161 +    {
  15.162 +        if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 )
  15.163 +            put_page_and_type(&frame_table[pfn]);
  15.164 +        p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
  15.165 +    }
  15.166 +
  15.167 +    /* Install the new GDT. */
  15.168 +    for ( i = 0; i < nr_pages; i++ )
  15.169 +        p->mm.perdomain_pt[i] =
  15.170 +            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  15.171 +
  15.172      SET_GDT_ADDRESS(p, GDT_VIRT_START);
  15.173      SET_GDT_ENTRIES(p, (entries*8)-1);
  15.174  
  15.175 -    ret = 0; /* success */
  15.176 +    return 0;
  15.177  
  15.178 - out:
  15.179 -    spin_unlock(&p->page_lock);
  15.180 -    return ret;
  15.181 + fail:
  15.182 +    while ( i-- > 0 )
  15.183 +        put_page_and_type(&frame_table[frames[i]]);
  15.184 +    return -EINVAL;
  15.185  }
  15.186  
  15.187  
  15.188  long do_set_gdt(unsigned long *frame_list, unsigned int entries)
  15.189  {
  15.190 -    unsigned int nr_pages = (entries + 511) / 512;
  15.191 +    int nr_pages = (entries + 511) / 512;
  15.192      unsigned long frames[16];
  15.193      long ret;
  15.194  
  15.195 @@ -321,14 +287,12 @@ long do_update_descriptor(
  15.196      if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
  15.197          return -EINVAL;
  15.198  
  15.199 -    spin_lock(&current->page_lock);
  15.200 -
  15.201 -    page = frame_table + pfn;
  15.202 -    if ( (page->flags & PG_domain_mask) != current->domain )
  15.203 +    page = &frame_table[pfn];
  15.204 +    if ( unlikely(!get_page(page, current)) )
  15.205          goto out;
  15.206  
  15.207      /* Check if the given frame is in use in an unsafe context. */
  15.208 -    switch ( (page->flags & PG_type_mask) )
  15.209 +    switch ( page->type_and_flags & PGT_type_mask )
  15.210      {
  15.211      case PGT_gdt_page:
  15.212          /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  15.213 @@ -336,12 +300,17 @@ long do_update_descriptor(
  15.214               (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  15.215               (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  15.216              goto out;
  15.217 +        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
  15.218 +            goto out;
  15.219 +        break;
  15.220      case PGT_ldt_page:
  15.221 -    case PGT_writeable_page:
  15.222 +        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
  15.223 +            goto out;
  15.224          break;
  15.225      default:
  15.226 -        if ( page_type_count(page) != 0 )
  15.227 +        if ( unlikely(!get_page_type(page, PGT_writeable_page)) )
  15.228              goto out;
  15.229 +        break;
  15.230      }
  15.231  
  15.232      /* All is good so make the update. */
  15.233 @@ -350,9 +319,11 @@ long do_update_descriptor(
  15.234      gdt_pent[1] = word2;
  15.235      unmap_domain_mem(gdt_pent);
  15.236  
  15.237 +    put_page_type(page);
  15.238 +
  15.239      ret = 0; /* success */
  15.240  
  15.241   out:
  15.242 -    spin_unlock(&current->page_lock);
  15.243 +    put_page(page);
  15.244      return ret;
  15.245  }
    16.1 --- a/xen/arch/i386/pci-irq.c	Sat Dec 20 23:39:49 2003 +0000
    16.2 +++ b/xen/arch/i386/pci-irq.c	Sat Dec 20 23:41:19 2003 +0000
    16.3 @@ -6,16 +6,15 @@
    16.4  
    16.5  #include <linux/config.h>
    16.6  #include <linux/types.h>
    16.7 -/*#include <linux/kernel.h>*/
    16.8  #include <linux/pci.h>
    16.9  #include <linux/init.h>
   16.10  #include <linux/slab.h>
   16.11  #include <linux/interrupt.h>
   16.12  #include <linux/irq.h>
   16.13  #include <linux/sched.h>
   16.14 -
   16.15  #include <asm/io.h>
   16.16  #include <asm/smp.h>
   16.17 +#include <asm/mpspec.h>
   16.18  #include <asm/io_apic.h>
   16.19  
   16.20  #include "pci-i386.h"
    17.1 --- a/xen/arch/i386/process.c	Sat Dec 20 23:39:49 2003 +0000
    17.2 +++ b/xen/arch/i386/process.c	Sat Dec 20 23:41:19 2003 +0000
    17.3 @@ -27,6 +27,7 @@
    17.4  #include <asm/processor.h>
    17.5  #include <asm/desc.h>
    17.6  #include <asm/i387.h>
    17.7 +#include <asm/mpspec.h>
    17.8  
    17.9  #include <xeno/irq.h>
   17.10  #include <xeno/event.h>
   17.11 @@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_
   17.12      tss->ss1  = next->ss1;
   17.13  
   17.14      /* Switch page tables.  */
   17.15 -    __write_cr3_counted(pagetable_val(next_p->mm.pagetable));
   17.16 +    write_cr3_counted(pagetable_val(next_p->mm.pagetable));
   17.17  
   17.18      set_current(next_p);
   17.19  
    18.1 --- a/xen/arch/i386/smp.c	Sat Dec 20 23:39:49 2003 +0000
    18.2 +++ b/xen/arch/i386/smp.c	Sat Dec 20 23:41:19 2003 +0000
    18.3 @@ -16,6 +16,7 @@
    18.4  #include <asm/mc146818rtc.h>
    18.5  #include <asm/pgalloc.h>
    18.6  #include <asm/smpboot.h>
    18.7 +#include <asm/hardirq.h>
    18.8  
    18.9  #ifdef CONFIG_SMP
   18.10  
   18.11 @@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_L
   18.12  asmlinkage void smp_invalidate_interrupt(void)
   18.13  {
   18.14      ack_APIC_irq();
   18.15 -    if (test_and_clear_bit(smp_processor_id(), &flush_cpumask))
   18.16 -        local_flush_tlb();
   18.17 +    clear_bit(smp_processor_id(), &flush_cpumask);
   18.18 +    local_flush_tlb();
   18.19  }
   18.20  
   18.21 -void flush_tlb_others(unsigned long cpumask)
   18.22 +void flush_tlb_mask(unsigned long mask)
   18.23  {
   18.24 -    spin_lock(&tlbstate_lock);
   18.25 -    atomic_set_mask(cpumask, &flush_cpumask);
   18.26 -    send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
   18.27 -    while (flush_cpumask) continue;
   18.28 +    if ( unlikely(in_irq()) )
   18.29 +        BUG();
   18.30 +    
   18.31 +    if ( mask & (1 << smp_processor_id()) )
   18.32 +    {
   18.33 +        local_flush_tlb();
   18.34 +        mask &= ~(1 << smp_processor_id());
   18.35 +    }
   18.36 +
   18.37 +    if ( mask != 0 )
   18.38 +    {
   18.39 +        spin_lock(&tlbstate_lock);
   18.40 +        flush_cpumask = mask;
   18.41 +        send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
   18.42 +        while ( flush_cpumask != 0 )
   18.43 +        {
   18.44 +            rep_nop();
   18.45 +            barrier();
   18.46 +        }
   18.47 +        spin_unlock(&tlbstate_lock);
   18.48 +    }
   18.49 +}
   18.50 +
   18.51 +void new_tlbflush_clock_period(void)
   18.52 +{
   18.53 +    if ( unlikely(!spin_trylock(&tlbstate_lock)) )
   18.54 +        return;
   18.55 +
   18.56 +    if ( unlikely((flush_cpumask = tlbflush_mask) != 0) )
   18.57 +    {
   18.58 +        send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
   18.59 +        while ( flush_cpumask != 0 )
   18.60 +        {
   18.61 +            rep_nop();
   18.62 +            barrier();
   18.63 +        }
   18.64 +    }
   18.65 +
   18.66 +    /* No need for cmpxchg updates here: we are protected by tlbstate lock. */
   18.67 +    tlbflush_mask = (1 << smp_num_cpus) - 1;
   18.68 +    wmb(); /* Reset the mask before allowing the clock to continue ticking. */
   18.69 +    tlbflush_clock++;
   18.70 +
   18.71      spin_unlock(&tlbstate_lock);
   18.72  }
   18.73 -	
   18.74 -static inline void do_flush_tlb_all_local(void)
   18.75 +
   18.76 +static void flush_tlb_all_pge_ipi(void* info)
   18.77  {
   18.78 -    __flush_tlb_all();
   18.79 +    __flush_tlb_pge();
   18.80  }
   18.81  
   18.82 -static void flush_tlb_all_ipi(void* info)
   18.83 +void flush_tlb_all_pge(void)
   18.84  {
   18.85 -    do_flush_tlb_all_local();
   18.86 -}
   18.87 -
   18.88 -void flush_tlb_all(void)
   18.89 -{
   18.90 -    smp_call_function (flush_tlb_all_ipi,0,1,1);
   18.91 -
   18.92 -    do_flush_tlb_all_local();
   18.93 +    smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
   18.94 +    __flush_tlb_pge();
   18.95  }
   18.96  
   18.97  void smp_send_event_check_mask(unsigned long cpu_mask)
    19.1 --- a/xen/arch/i386/smpboot.c	Sat Dec 20 23:39:49 2003 +0000
    19.2 +++ b/xen/arch/i386/smpboot.c	Sat Dec 20 23:41:19 2003 +0000
    19.3 @@ -44,6 +44,8 @@
    19.4  #include <xeno/smp.h>
    19.5  #include <asm/msr.h>
    19.6  #include <asm/system.h>
    19.7 +#include <asm/mpspec.h>
    19.8 +#include <asm/io_apic.h>
    19.9  #include <xeno/sched.h>
   19.10  #include <xeno/delay.h>
   19.11  #include <xeno/lib.h>
    20.1 --- a/xen/arch/i386/traps.c	Sat Dec 20 23:39:49 2003 +0000
    20.2 +++ b/xen/arch/i386/traps.c	Sat Dec 20 23:41:19 2003 +0000
    20.3 @@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, c
    20.4  
    20.5      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
    20.6      {
    20.7 +        DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup);
    20.8          regs->eip = fixup;
    20.9          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   20.10          return;
   20.11 @@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_
   20.12  
   20.13      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   20.14      {
   20.15 +        DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
   20.16          regs->eip = fixup;
   20.17          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   20.18          return;
   20.19 @@ -411,6 +413,7 @@ asmlinkage void do_general_protection(st
   20.20  
   20.21      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   20.22      {
   20.23 +        DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup);
   20.24          regs->eip = fixup;
   20.25          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
   20.26          return;
    21.1 --- a/xen/common/dom0_ops.c	Sat Dec 20 23:39:49 2003 +0000
    21.2 +++ b/xen/common/dom0_ops.c	Sat Dec 20 23:41:19 2003 +0000
    21.3 @@ -38,31 +38,6 @@ static unsigned int get_domnr(void)
    21.4      return 0;
    21.5  }
    21.6  
    21.7 -static void build_page_list(struct task_struct *p)
    21.8 -{
    21.9 -    unsigned long *list;
   21.10 -    unsigned long curr;
   21.11 -    struct list_head *list_ent;
   21.12 -
   21.13 -    curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table;
   21.14 -    list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
   21.15 -
   21.16 -    list_for_each(list_ent, &p->pg_head)
   21.17 -    {
   21.18 -        *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table;
   21.19 -
   21.20 -        if( ((unsigned long)list & ~PAGE_MASK) == 0 )
   21.21 -        {
   21.22 -            struct list_head *ent = frame_table[curr].list.next;
   21.23 -            curr = list_entry(ent, struct pfn_info, list) - frame_table;
   21.24 -            unmap_domain_mem(list-1);
   21.25 -            list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
   21.26 -        }
   21.27 -    }
   21.28 -
   21.29 -    unmap_domain_mem(list);
   21.30 -}
   21.31 -
   21.32  static int msr_cpu_mask;
   21.33  static unsigned long msr_addr;
   21.34  static unsigned long msr_lo;
   21.35 @@ -163,8 +138,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   21.36              goto exit_create;
   21.37          }
   21.38  
   21.39 -        build_page_list(p);
   21.40 -        
   21.41          ret = p->domain;
   21.42          
   21.43          op.u.createdomain.domain = ret;
   21.44 @@ -246,7 +219,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   21.45      case DOM0_GETMEMLIST:
   21.46      {
   21.47          int i;
   21.48 -        struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain);
   21.49 +        struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain);
   21.50          unsigned long max_pfns = op.u.getmemlist.max_pfns;
   21.51          unsigned long pfn;
   21.52          unsigned long *buffer = op.u.getmemlist.buffer;
   21.53 @@ -255,28 +228,27 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   21.54          ret = -EINVAL;
   21.55          if ( p != NULL )
   21.56          {
   21.57 -            list_ent = p->pg_head.next;
   21.58 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
   21.59 -            
   21.60 -            for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ )
   21.61 +            ret = 0;
   21.62 +
   21.63 +            spin_lock(&p->page_list_lock);
   21.64 +            list_ent = p->page_list.next;
   21.65 +            for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ )
   21.66              {
   21.67 +                pfn = list_entry(list_ent, struct pfn_info, list) - 
   21.68 +                    frame_table;
   21.69                  if ( put_user(pfn, buffer) )
   21.70                  {
   21.71                      ret = -EFAULT;
   21.72 -                    goto out_getmemlist;
   21.73 +                    break;
   21.74                  }
   21.75                  buffer++;
   21.76                  list_ent = frame_table[pfn].list.next;
   21.77 -                pfn = list_entry(list_ent, struct pfn_info, list) - 
   21.78 -                    frame_table;
   21.79              }
   21.80 +            spin_unlock(&p->page_list_lock);
   21.81  
   21.82              op.u.getmemlist.num_pfns = i;
   21.83              copy_to_user(u_dom0_op, &op, sizeof(op));
   21.84 -
   21.85 -            ret = 0;
   21.86 -
   21.87 -        out_getmemlist:
   21.88 +            
   21.89              put_task_struct(p);
   21.90          }
   21.91      }
   21.92 @@ -369,21 +341,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   21.93      {
   21.94          struct pfn_info *page;
   21.95          unsigned long pfn = op.u.getpageframeinfo.pfn;
   21.96 -        
   21.97 -        if ( pfn >= max_page )
   21.98 -        {
   21.99 -            ret = -EINVAL;
  21.100 -        }
  21.101 -        else
  21.102 +        unsigned int dom = op.u.getpageframeinfo.domain;
  21.103 +        struct task_struct *p;
  21.104 +
  21.105 +        ret = -EINVAL;
  21.106 +
  21.107 +        if ( unlikely(pfn >= max_page) || 
  21.108 +             unlikely((p = find_domain_by_id(dom)) == NULL) )
  21.109 +            break;
  21.110 +
  21.111 +        page = &frame_table[pfn];
  21.112 +
  21.113 +        if ( likely(get_page(page, p)) )
  21.114          {
  21.115 -            page = frame_table + pfn;
  21.116 -            
  21.117 -            op.u.getpageframeinfo.domain = page->flags & PG_domain_mask;
  21.118 -            op.u.getpageframeinfo.type   = NONE;
  21.119 +            op.u.getpageframeinfo.type = NONE;
  21.120  
  21.121 -            if ( page_type_count(page) != 0 )
  21.122 +            if ( (page->type_and_flags & PGT_count_mask) != 0 )
  21.123              {
  21.124 -                switch ( page->flags & PG_type_mask )
  21.125 +                switch ( page->type_and_flags & PGT_type_mask )
  21.126                  {
  21.127                  case PGT_l1_page_table:
  21.128                      op.u.getpageframeinfo.type = L1TAB;
  21.129 @@ -393,9 +368,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
  21.130                      break;
  21.131                  }
  21.132              }
  21.133 +            
  21.134 +            put_page(page);
  21.135 +        }
  21.136  
  21.137 -            copy_to_user(u_dom0_op, &op, sizeof(op));
  21.138 -        }
  21.139 +        put_task_struct(p);
  21.140 +
  21.141 +        copy_to_user(u_dom0_op, &op, sizeof(op));
  21.142      }
  21.143      break;
  21.144  
    22.1 --- a/xen/common/dom_mem_ops.c	Sat Dec 20 23:39:49 2003 +0000
    22.2 +++ b/xen/common/dom_mem_ops.c	Sat Dec 20 23:41:19 2003 +0000
    22.3 @@ -16,58 +16,26 @@
    22.4  #include <xeno/event.h>
    22.5  #include <asm/domain_page.h>
    22.6  
    22.7 -#if 0
    22.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
    22.9 -#else
   22.10 -#define DPRINTK(_f, _a...) ((void)0)
   22.11 -#endif
   22.12 -
   22.13  static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
   22.14  {
   22.15 -    struct list_head *temp;
   22.16 -    struct pfn_info  *pf;     /* pfn_info of current page */
   22.17 +    struct pfn_info  *page;
   22.18      unsigned long     mpfn;   /* machine frame number of current page */
   22.19      void             *va;     /* Xen-usable mapping of current page */
   22.20      unsigned long     i;
   22.21 -    unsigned long     flags;
   22.22  
   22.23 -    /*
   22.24 -     * POLICY DECISION: Each domain has a page limit.
   22.25 -     * NB. The first part of test is because op.size could be so big that
   22.26 -     * tot_pages + op.size overflows a u_long.
   22.27 -     */
   22.28 -    if( (op.size > p->max_pages) ||
   22.29 -        ((p->tot_pages + op.size) > p->max_pages) )
   22.30 -        return -ENOMEM;
   22.31 -
   22.32 -    spin_lock_irqsave(&free_list_lock, flags);
   22.33 -
   22.34 -    if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >> 
   22.35 -                                  (PAGE_SHIFT-10))) ) 
   22.36 -    {
   22.37 -        spin_unlock_irqrestore(&free_list_lock, flags);
   22.38 -        return -ENOMEM;
   22.39 -    }
   22.40 -
   22.41 -    spin_lock(&p->page_lock);
   22.42 -    
   22.43 -    temp = free_list.next;
   22.44      for ( i = 0; i < op.size; i++ )
   22.45      {
   22.46 -        /* Get a free page and add it to the domain's page list. */
   22.47 -        pf = list_entry(temp, struct pfn_info, list);
   22.48 -        pf->flags |= p->domain;
   22.49 -        set_page_type_count(pf, 0);
   22.50 -        set_page_tot_count(pf, 0);
   22.51 -        temp = temp->next;
   22.52 -        list_del(&pf->list);
   22.53 -        list_add_tail(&pf->list, &p->pg_head);
   22.54 -        free_pfns--;
   22.55 +        /* Leave some slack pages; e.g., for the network. */
   22.56 +        if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
   22.57 +                                   (PAGE_SHIFT-10))) ) 
   22.58 +            break;
   22.59  
   22.60 -        p->tot_pages++;
   22.61 -
   22.62 +        /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
   22.63 +        if ( unlikely((page = alloc_domain_page(p)) == NULL) )
   22.64 +            break;
   22.65 +        
   22.66          /* Inform the domain of the new page's machine address. */ 
   22.67 -        mpfn = (unsigned long)(pf - frame_table);
   22.68 +        mpfn = (unsigned long)(page - frame_table);
   22.69          copy_to_user(op.pages, &mpfn, sizeof(mpfn));
   22.70          op.pages++; 
   22.71  
   22.72 @@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_st
   22.73          unmap_domain_mem(va);
   22.74      }
   22.75  
   22.76 -    spin_unlock(&p->page_lock);
   22.77 -    spin_unlock_irqrestore(&free_list_lock, flags);
   22.78 -    
   22.79 -    return op.size;
   22.80 +    return i;
   22.81  }
   22.82      
   22.83  static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
   22.84  {
   22.85 -    struct list_head *temp;
   22.86 -    struct pfn_info  *pf;     /* pfn_info of current page */
   22.87 +    struct pfn_info  *page;
   22.88      unsigned long     mpfn;   /* machine frame number of current page */
   22.89      unsigned long     i;
   22.90 -    unsigned long     flags;
   22.91      long              rc = 0;
   22.92      int               need_flush = 0;
   22.93  
   22.94 -    spin_lock_irqsave(&free_list_lock, flags);
   22.95 -    spin_lock(&p->page_lock);
   22.96 -
   22.97 -    temp = free_list.next;
   22.98      for ( i = 0; i < op.size; i++ )
   22.99      {
  22.100          copy_from_user(&mpfn, op.pages, sizeof(mpfn));
  22.101 @@ -109,37 +68,28 @@ static long free_dom_mem(struct task_str
  22.102              goto out;
  22.103          }
  22.104  
  22.105 -        pf = &frame_table[mpfn];
  22.106 -        if ( (page_type_count(pf) != 0) || 
  22.107 -             (page_tot_count(pf) != 0) ||
  22.108 -             ((pf->flags & PG_domain_mask) != p->domain) )
  22.109 +        page = &frame_table[mpfn];
  22.110 +        if ( unlikely(!get_page(page, p)) )
  22.111          {
  22.112 -            DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
  22.113 -                    p->domain, page_type_count(pf), 
  22.114 -                    page_tot_count(pf), pf->flags);
  22.115 +            DPRINTK("Bad page free for domain %d\n", p->domain);
  22.116              rc = -EINVAL;
  22.117              goto out;
  22.118          }
  22.119  
  22.120 -        need_flush |= pf->flags & PG_need_flush;
  22.121 -
  22.122 -        pf->flags = 0;
  22.123 +        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
  22.124 +            put_page_and_type(page);
  22.125  
  22.126 -        list_del(&pf->list);
  22.127 -        list_add(&pf->list, &free_list);
  22.128 -        free_pfns++;
  22.129 +        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
  22.130 +            put_page(page);
  22.131  
  22.132 -        p->tot_pages--;
  22.133 +        put_page(page);
  22.134      }
  22.135  
  22.136   out:
  22.137 -    spin_unlock(&p->page_lock);
  22.138 -    spin_unlock_irqrestore(&free_list_lock, flags);
  22.139 -    
  22.140      if ( need_flush )
  22.141      {
  22.142          __flush_tlb();
  22.143 -        perfc_incrc(need_flush_tlb_flush);
  22.144 +        perfc_incr(need_flush_tlb_flush);
  22.145      }
  22.146  
  22.147      return rc ? rc : op.size;
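
The rewritten free_dom_mem() above no longer unlinks pages and returns them to the free list by hand; it validates ownership with get_page() and then drops up to three references per page, so the frame reaches the free list only when its last reference disappears (presumably via free_domain_page(), shown in the domain.c hunks below). A condensed sketch of that drop sequence (the helper name is hypothetical; the flag and function names are the ones used in the hunk):

    /* Sketch of the per-page reference drops performed in free_dom_mem(). */
    static void release_guest_page(struct pfn_info *page)
    {
        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
            put_page_and_type(page);   /* drop the pin's type + general refs     */

        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
            put_page(page);            /* drop the reference taken at allocation */

        put_page(page);                /* drop the caller's temporary get_page() */
    }
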
    23.1 --- a/xen/common/domain.c	Sat Dec 20 23:39:49 2003 +0000
    23.2 +++ b/xen/common/domain.c	Sat Dec 20 23:41:19 2003 +0000
    23.3 @@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsi
    23.4      sprintf(p->name, "Domain-%d", dom_id);
    23.5  
    23.6      spin_lock_init(&p->blk_ring_lock);
    23.7 -    spin_lock_init(&p->page_lock);
    23.8      spin_lock_init(&p->event_channel_lock);
    23.9  
   23.10      p->shared_info = (void *)get_free_page(GFP_KERNEL);
   23.11      memset(p->shared_info, 0, PAGE_SIZE);
   23.12 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id);
   23.13 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
   23.14  
   23.15      p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
   23.16      memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
   23.17 @@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsi
   23.18  
   23.19      sched_add_domain(p);
   23.20  
   23.21 -    INIT_LIST_HEAD(&p->pg_head);
   23.22 +    spin_lock_init(&p->page_list_lock);
   23.23 +    INIT_LIST_HEAD(&p->page_list);
   23.24      p->max_pages = p->tot_pages = 0;
   23.25 +
   23.26      write_lock_irqsave(&tasklist_lock, flags);
   23.27      SET_LINKS(p);
   23.28      p->next_hash = task_hash[TASK_HASH(dom_id)];
   23.29 @@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom)
   23.30      return 0;
   23.31  }
   23.32  
   23.33 -unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
   23.34 +struct pfn_info *alloc_domain_page(struct task_struct *p)
   23.35  {
   23.36 -    struct list_head *temp;
   23.37 -    struct pfn_info *pf;
   23.38 -    unsigned int alloc_pfns;
   23.39 -    unsigned int req_pages;
   23.40 -    unsigned long flags;
   23.41 -
   23.42 -    /* how many pages do we need to alloc? */
   23.43 -    req_pages = kbytes >> (PAGE_SHIFT - 10);
   23.44 +    struct pfn_info *page = NULL;
   23.45 +    unsigned long flags, mask, pfn_stamp, cpu_stamp;
   23.46 +    int i;
   23.47  
   23.48      spin_lock_irqsave(&free_list_lock, flags);
   23.49 -    
   23.50 -    /* is there enough mem to serve the request? */   
   23.51 -    if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) >
   23.52 -         free_pfns )
   23.53 +    if ( likely(!list_empty(&free_list)) )
   23.54 +    {
   23.55 +        page = list_entry(free_list.next, struct pfn_info, list);
   23.56 +        list_del(&page->list);
   23.57 +        free_pfns--;
   23.58 +    }
   23.59 +    spin_unlock_irqrestore(&free_list_lock, flags);
   23.60 +
   23.61 +    if ( unlikely(page == NULL) )
   23.62 +        return NULL;
   23.63 +
   23.64 +    if ( unlikely((mask = page->u.cpu_mask) != 0) )
   23.65      {
   23.66 -        spin_unlock_irqrestore(&free_list_lock, flags);
   23.67 -        return -1;
   23.68 +        pfn_stamp = page->tlbflush_timestamp;
   23.69 +        for ( i = 0; mask != 0; i++ )
   23.70 +        {
   23.71 +            if ( unlikely(mask & (1<<i)) )
   23.72 +            {
   23.73 +                cpu_stamp = tlbflush_time[i];
   23.74 +                if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
   23.75 +                    mask &= ~(1<<i);
   23.76 +            }
   23.77 +        }
   23.78 +
   23.79 +        if ( unlikely(mask != 0) )
   23.80 +        {
   23.81 +            if ( unlikely(in_irq()) )
   23.82 +            {
   23.83 +                DPRINTK("Returning NULL from alloc_domain_page: in_irq\n");
   23.84 +                goto free_and_exit;
   23.85 +            }
   23.86 +            perfc_incrc(need_flush_tlb_flush);
   23.87 +            flush_tlb_mask(mask);
   23.88 +        }
   23.89 +    }
   23.90 +
   23.91 +    page->u.domain = p;
   23.92 +    page->type_and_flags = 0;
   23.93 +    if ( p != NULL )
   23.94 +    {
   23.95 +        if ( unlikely(in_irq()) )
   23.96 +            BUG();
   23.97 +        wmb(); /* Domain pointer must be visible before updating refcnt. */
   23.98 +        spin_lock(&p->page_list_lock);
   23.99 +        if ( unlikely(p->tot_pages >= p->max_pages) )
  23.100 +        {
  23.101 +            spin_unlock(&p->page_list_lock);
  23.102 +            goto free_and_exit;
  23.103 +        }
  23.104 +        list_add_tail(&page->list, &p->page_list);
  23.105 +        p->tot_pages++;
  23.106 +        page->count_and_flags = PGC_allocated | 1;
  23.107 +        spin_unlock(&p->page_list_lock);
  23.108      }
  23.109  
  23.110 -    /* allocate pages and build a thread through frame_table */
  23.111 -    temp = free_list.next;
  23.112 -    for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ )
  23.113 +    return page;
  23.114 +
  23.115 + free_and_exit:
  23.116 +    spin_lock_irqsave(&free_list_lock, flags);
  23.117 +    list_add(&page->list, &free_list);
  23.118 +    free_pfns++;
  23.119 +    spin_unlock_irqrestore(&free_list_lock, flags);
  23.120 +    return NULL;
  23.121 +}
  23.122 +
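
The new allocator above only forces a cross-CPU TLB flush when some CPU in the page's cpu_mask has not flushed since the page's tlbflush_timestamp. The standalone sketch below mimics that stamp comparison; NR_CPUS, the array name and the plain '<' test are stand-ins for Xen's NEED_FLUSH machinery (which also copes with clock wrap), not the real definitions.

#include <stdio.h>

#define NR_CPUS 4

/* Per-CPU flush clock; stand-in for Xen's tlbflush_time[] (assumed shape). */
static unsigned long tlbflush_time[NR_CPUS];

/* A CPU still needs flushing if it has not flushed since the page's stamp.
 * The real NEED_FLUSH() also handles clock wrap; this sketch does not.     */
static int need_flush(unsigned long cpu_stamp, unsigned long pfn_stamp)
{
    return cpu_stamp < pfn_stamp;
}

/* Return the subset of 'mask' that must still be flushed before the page
 * can safely be handed to a new owner.                                     */
static unsigned long cpus_still_to_flush(unsigned long mask,
                                         unsigned long pfn_stamp)
{
    int i;
    for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ )
        if ( (mask & (1UL << i)) && !need_flush(tlbflush_time[i], pfn_stamp) )
            mask &= ~(1UL << i);
    return mask;
}

int main(void)
{
    tlbflush_time[0] = 10;   /* CPU0 flushed at clock 10 */
    tlbflush_time[1] = 3;    /* CPU1 flushed at clock 3  */
    /* Page freed at clock 5: only CPU1 still holds stale mappings. */
    printf("remaining mask = %#lx\n", cpus_still_to_flush(0x3, 5));
    return 0;
}
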
  23.123 +void free_domain_page(struct pfn_info *page)
  23.124 +{
  23.125 +    unsigned long flags;
  23.126 +    struct task_struct *p = page->u.domain;
  23.127 +
  23.128 +    if ( unlikely(in_irq()) )
  23.129 +        BUG();
  23.130 +
  23.131 +    if ( likely(!IS_XEN_HEAP_FRAME(page)) )
  23.132 +    {
  23.133 +        /*
  23.134 +         * No race with setting of zombie bit. If it wasn't set before the
  23.135 +         * last reference was dropped, then it can't be set now.
  23.136 +         */
  23.137 +        page->u.cpu_mask = 0;
  23.138 +        if ( !(page->count_and_flags & PGC_zombie) )
  23.139 +        {
  23.140 +            page->tlbflush_timestamp = tlbflush_clock;
  23.141 +            page->u.cpu_mask = 1 << p->processor;
  23.142 +
  23.143 +            spin_lock(&p->page_list_lock);
  23.144 +            list_del(&page->list);
  23.145 +            p->tot_pages--;
  23.146 +            spin_unlock(&p->page_list_lock);
  23.147 +        }
  23.148 +
  23.149 +        page->count_and_flags = 0;
  23.150 +
  23.151 +        spin_lock_irqsave(&free_list_lock, flags);
  23.152 +        list_add(&page->list, &free_list);
  23.153 +        free_pfns++;
  23.154 +        spin_unlock_irqrestore(&free_list_lock, flags);
  23.155 +    }
  23.156 +    else
  23.157      {
  23.158 -        pf = list_entry(temp, struct pfn_info, list);
  23.159 -        pf->flags = p->domain;
  23.160 -        set_page_type_count(pf, 0);
  23.161 -        set_page_tot_count(pf, 0);
  23.162 -        temp = temp->next;
  23.163 -        list_del(&pf->list);
  23.164 -        list_add_tail(&pf->list, &p->pg_head);
  23.165 -        free_pfns--;
  23.166 -        ASSERT(free_pfns != 0);
  23.167 +        /*
  23.168 +         * No need for a TLB flush. Non-domain pages are always co-held by Xen,
  23.169 +         * and the Xen reference is not dropped until the domain is dead.
  23.170 +         * DOM0 may hold references, but it's trusted so no need to flush.
  23.171 +         */
  23.172 +        page->u.cpu_mask = 0;
  23.173 +        page->count_and_flags = 0;
  23.174 +        free_page((unsigned long)page_to_virt(page));
  23.175      }
  23.176 -   
  23.177 -    spin_unlock_irqrestore(&free_list_lock, flags);
  23.178 -    
  23.179 -    p->tot_pages = req_pages;
  23.180 +}
  23.181 +
  23.182 +
  23.183 +void free_all_dom_mem(struct task_struct *p)
  23.184 +{
  23.185 +    struct list_head *ent, zombies;
  23.186 +    struct pfn_info *page;
  23.187 +
  23.188 +    INIT_LIST_HEAD(&zombies);
  23.189 +
  23.190 +    spin_lock(&p->page_list_lock);
  23.191 +    while ( (ent = p->page_list.next) != &p->page_list )
  23.192 +    {
  23.193 +        page = list_entry(ent, struct pfn_info, list);
  23.194 +
  23.195 +        if ( unlikely(!get_page(page, p)) )
  23.196 +        {
  23.197 +            /*
   23.198 +             * Another CPU has dropped the last reference and is responsible
   23.199 +             * for removing the page from this list. Wait for it to do so.
  23.200 +             */
  23.201 +            spin_unlock(&p->page_list_lock);
  23.202 +            while ( p->page_list.next == ent )
  23.203 +                barrier();
  23.204 +            spin_lock(&p->page_list_lock);
  23.205 +            continue;
  23.206 +        }
  23.207 +
  23.208 +        set_bit(_PGC_zombie, &page->count_and_flags);
  23.209 +
  23.210 +        list_del(&page->list);
  23.211 +        p->tot_pages--;
  23.212 +
  23.213 +        list_add(&page->list, &zombies);
  23.214 +    }
  23.215 +    spin_unlock(&p->page_list_lock);
  23.216 +
  23.217 +    /* We do the potentially complex 'put' operations with no lock held. */
  23.218 +    while ( (ent = zombies.next) != &zombies )
  23.219 +    {
  23.220 +        page = list_entry(ent, struct pfn_info, list);
  23.221 +
  23.222 +        list_del(&page->list);
  23.223 +        
  23.224 +        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
  23.225 +            put_page_and_type(page);
  23.226 +
  23.227 +        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
  23.228 +            put_page(page);
  23.229 +
  23.230 +        put_page(page);
  23.231 +    }
  23.232 +}
  23.233 +
  23.234 +
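
free_all_dom_mem drains the domain's page list in two phases: while holding page_list_lock it takes a reference, sets PGC_zombie and unlinks each page onto a private 'zombies' list, then it performs the heavyweight put operations with no lock held. The toy program below illustrates only the two-phase shape of that drain with a hand-rolled singly linked list; the struct, the field names and the absence of locking are simplifications, not the patch's data structures.

#include <stdio.h>

/* A toy page with an embedded singly linked list, standing in for
 * Xen's pfn_info + list_head (names here are illustrative only).   */
struct toy_page {
    int id;
    struct toy_page *next;
};

/* Phase 1 (would run under the domain's page_list_lock): unlink every
 * page from the domain list onto a private 'zombies' list.           */
static struct toy_page *drain_to_zombies(struct toy_page **dom_list)
{
    struct toy_page *zombies = NULL, *p;
    while ( (p = *dom_list) != NULL )
    {
        *dom_list = p->next;      /* list_del() from the domain list   */
        p->next = zombies;        /* list_add() to the private list    */
        zombies = p;
    }
    return zombies;
}

int main(void)
{
    struct toy_page a = {1, NULL}, b = {2, &a}, c = {3, &b};
    struct toy_page *dom_list = &c, *z;

    /* Phase 2 (lock no longer held): the expensive puts, one per page. */
    for ( z = drain_to_zombies(&dom_list); z != NULL; z = z->next )
        printf("put_page(page %d)\n", z->id);
    return 0;
}
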
  23.235 +unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
  23.236 +{
  23.237 +    unsigned int alloc_pfns, nr_pages;
  23.238 +
  23.239 +    nr_pages = kbytes >> (PAGE_SHIFT - 10);
  23.240  
  23.241      /* TEMPORARY: max_pages should be explicitly specified. */
  23.242 -    p->max_pages = p->tot_pages;
  23.243 +    p->max_pages = nr_pages;
  23.244 +
  23.245 +    for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ )
  23.246 +    {
  23.247 +        if ( unlikely(alloc_domain_page(p) == NULL) ||
  23.248 +             unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
  23.249 +                                   (PAGE_SHIFT-10))) )
  23.250 +        {
  23.251 +            free_all_dom_mem(p);
  23.252 +            return -1;
  23.253 +        }
  23.254 +    }
  23.255 +
  23.256 +    p->tot_pages = nr_pages;
  23.257  
  23.258      return 0;
  23.259  }
  23.260   
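
alloc_new_dom_mem converts the requested kilobytes into pages with kbytes >> (PAGE_SHIFT - 10); with 4 KB pages that is simply kbytes / 4. A two-line check of the arithmetic (the 16 MB figure is just an example):

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KB pages on i386 */

int main(void)
{
    unsigned int kbytes   = 16384;                        /* 16 MB domain     */
    unsigned int nr_pages = kbytes >> (PAGE_SHIFT - 10);  /* = kbytes / 4     */
    printf("%u KB -> %u pages\n", kbytes, nr_pages);      /* 16384 KB -> 4096 */
    return 0;
}
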
  23.261  
  23.262 -void free_all_dom_mem(struct task_struct *p)
  23.263 -{
  23.264 -    struct list_head *ent;
  23.265 -    unsigned long flags;
  23.266 -
  23.267 -    spin_lock_irqsave(&free_list_lock, flags);
  23.268 -    while ( (ent = p->pg_head.next) != &p->pg_head )
  23.269 -    {
  23.270 -        struct pfn_info *pf = list_entry(ent, struct pfn_info, list);
  23.271 -        set_page_type_count(pf, 0);
  23.272 -        set_page_tot_count(pf, 0);
  23.273 -        pf->flags = 0;
  23.274 -        ASSERT(ent->next->prev == ent);
  23.275 -        ASSERT(ent->prev->next == ent);
  23.276 -        list_del(ent);
  23.277 -        list_add(ent, &free_list);
  23.278 -        free_pfns++;
  23.279 -    }
  23.280 -    spin_unlock_irqrestore(&free_list_lock, flags);
  23.281 -
  23.282 -    p->tot_pages = 0;
  23.283 -}
  23.284 -
  23.285 -
  23.286  /* Release resources belonging to task @p. */
  23.287  void release_task(struct task_struct *p)
  23.288  {
  23.289 @@ -309,7 +436,6 @@ void release_task(struct task_struct *p)
  23.290      destroy_event_channels(p);
  23.291      free_page((unsigned long)p->mm.perdomain_pt);
  23.292      UNSHARE_PFN(virt_to_page(p->shared_info));
  23.293 -    free_page((unsigned long)p->shared_info);
  23.294      free_all_dom_mem(p);
  23.295  
  23.296      kmem_cache_free(task_struct_cachep, p);
  23.297 @@ -360,11 +486,10 @@ int final_setup_guestos(struct task_stru
  23.298      p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
  23.299      p->failsafe_address  = builddomain->ctxt.failsafe_callback_eip;
  23.300      
  23.301 -    /* NB. Page base must already be pinned! */
  23.302      phys_l2tab = builddomain->ctxt.pt_base;
  23.303      p->mm.pagetable = mk_pagetable(phys_l2tab);
  23.304 -    get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]);
  23.305 -    get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]);
  23.306 +    get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, 
  23.307 +                      PGT_l2_page_table);
  23.308  
  23.309      /* Set up the shared info structure. */
  23.310      update_dom_time(p->shared_info);
  23.311 @@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p,
  23.312          return -ENOMEM;
  23.313      }
  23.314  
  23.315 -    alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) -
  23.316 +    alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
  23.317          frame_table;
  23.318      alloc_address <<= PAGE_SHIFT;
  23.319      alloc_index = p->tot_pages;
  23.320 @@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p,
  23.321      p->mm.pagetable = mk_pagetable(phys_l2tab);
  23.322  
  23.323      l2tab += l2_table_offset(virt_load_address);
  23.324 -    cur_address = list_entry(p->pg_head.next, struct pfn_info, list) -
  23.325 +    cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
  23.326          frame_table;
  23.327      cur_address <<= PAGE_SHIFT;
  23.328      for ( count = 0; count < p->tot_pages; count++ )
  23.329 @@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p,
  23.330          }
  23.331          *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
  23.332          
  23.333 -        page = frame_table + (cur_address >> PAGE_SHIFT);
  23.334 -        page->flags = dom | PGT_writeable_page | PG_need_flush;
  23.335 -        set_page_type_count(page, 1);
  23.336 -        set_page_tot_count(page, 1);
  23.337 +        page = &frame_table[cur_address >> PAGE_SHIFT];
  23.338 +        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
  23.339 +        if ( !get_page_and_type(page, p, PGT_writeable_page) )
  23.340 +            BUG();
  23.341          /* Set up the MPT entry. */
  23.342          machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
  23.343  
  23.344 @@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p,
  23.345      {
  23.346          *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
  23.347          page = frame_table + l1_pgentry_to_pagenr(*l1tab);
  23.348 -        page->flags = dom | PGT_l1_page_table;
  23.349 -        get_page_tot(page);
  23.350 +        page->type_and_flags &= ~PGT_type_mask;
  23.351 +        page->type_and_flags |= PGT_l1_page_table;
  23.352 +        get_page(page, p); /* an extra ref because of readable mapping */
  23.353          l1tab++;
  23.354          if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
  23.355          {
  23.356 @@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p,
  23.357              l2tab++;
  23.358          }
  23.359      }
  23.360 -    get_page_type(page); /* guest_pinned */
  23.361 -    get_page_tot(page);  /* guest_pinned */
  23.362 -    page->flags = dom | PG_guest_pinned | PGT_l2_page_table;
   23.363 +    /* Rewrite the last L1 page to be an L2 page. */
  23.364 +    page->type_and_flags &= ~PGT_type_mask;
  23.365 +    page->type_and_flags |= PGT_l2_page_table;
  23.366 +    /* Get another ref to L2 page so that it can be pinned. */
  23.367 +    if ( !get_page_and_type(page, p, PGT_l2_page_table) )
  23.368 +        BUG();
  23.369 +    set_bit(_PGC_guest_pinned, &page->count_and_flags);
  23.370      unmap_domain_mem(l1start);
  23.371  
  23.372      /* Set up shared info area. */
  23.373 @@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p,
  23.374  
  23.375      /* Install the new page tables. */
  23.376      __cli();
  23.377 -    __write_cr3_counted(pagetable_val(p->mm.pagetable));
  23.378 +    write_cr3_counted(pagetable_val(p->mm.pagetable));
  23.379  
  23.380      /* Copy the guest OS image. */    
  23.381      src  = (char *)(phy_data_start + 12);
  23.382 @@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p,
  23.383  
  23.384  
  23.385      /* Reinstate the caller's page tables. */
  23.386 -    __write_cr3_counted(pagetable_val(current->mm.pagetable));
  23.387 +    write_cr3_counted(pagetable_val(current->mm.pagetable));
  23.388      __sti();
  23.389  
  23.390      p->flags |= PF_CONSTRUCTED;
    24.1 --- a/xen/common/kernel.c	Sat Dec 20 23:39:49 2003 +0000
    24.2 +++ b/xen/common/kernel.c	Sat Dec 20 23:41:19 2003 +0000
    24.3 @@ -181,6 +181,13 @@ void cmain (unsigned long magic, multibo
    24.4          for ( ; ; ) ;
    24.5      }
    24.6  
    24.7 +    /* The array of pfn_info structures must fit into the reserved area. */
    24.8 +    if ( sizeof(struct pfn_info) > 24 )
    24.9 +    {
   24.10 +        printk("'struct pfn_info' too large to fit in Xen address space!\n");
   24.11 +        for ( ; ; ) ;
   24.12 +    }
   24.13 +
   24.14      set_current(&idle0_task);
   24.15  
   24.16      max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10);
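
The new boot-time check above halts if struct pfn_info has grown beyond the 24 bytes per frame that the frame-table virtual region can accommodate. The same invariant could be caught at compile time with the negative-array-size idiom sketched below; BUILD_TIME_ASSERT and toy_pfn_info are illustrative inventions, not part of this changeset, and the field layout only approximates the real structure.

#include <stdint.h>
#include <stdio.h>

/* Build-time assertion idiom: the array size becomes negative (a compile
 * error) whenever the condition is false. Hypothetical helper only.      */
#define BUILD_TIME_ASSERT(cond) \
    typedef char build_assert_failed[(cond) ? 1 : -1]

/* A 32-bit-sized stand-in for the frame-table entry (illustrative). */
struct toy_pfn_info {
    uint32_t list_next, list_prev;   /* stand-in for struct list_head */
    uint32_t count_and_flags;
    uint32_t type_and_flags;
    uint32_t domain_or_cpu_mask;     /* stand-in for the union 'u'    */
    uint32_t tlbflush_timestamp;
};

BUILD_TIME_ASSERT(sizeof(struct toy_pfn_info) <= 24);

int main(void)
{
    printf("sizeof(struct toy_pfn_info) = %zu bytes\n",
           sizeof(struct toy_pfn_info));
    return 0;
}
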
    25.1 --- a/xen/common/memory.c	Sat Dec 20 23:39:49 2003 +0000
    25.2 +++ b/xen/common/memory.c	Sat Dec 20 23:41:19 2003 +0000
    25.3 @@ -139,34 +139,28 @@
    25.4  #include <asm/uaccess.h>
    25.5  #include <asm/domain_page.h>
    25.6  
    25.7 -#if 0
    25.8 -#define MEM_LOG(_f, _a...) 
    25.9 +#ifndef NDEBUG
   25.10 +#define MEM_LOG(_f, _a...)                           \
   25.11    printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \
   25.12           current->domain, __LINE__, ## _a )
   25.13  #else
   25.14  #define MEM_LOG(_f, _a...) ((void)0)
   25.15  #endif
   25.16  
   25.17 -/* Domain 0 is allowed to submit requests on behalf of others. */
   25.18 -#define DOMAIN_OKAY(_f) \
   25.19 -    ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
   25.20 +static int alloc_l2_table(struct pfn_info *page);
   25.21 +static int alloc_l1_table(struct pfn_info *page);
   25.22 +static int get_page_from_pagenr(unsigned long page_nr);
   25.23 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
   25.24 +                                         unsigned int type);
   25.25  
   25.26 -/* 'get' checks parameter for validity before inc'ing refcnt. */
   25.27 -static int get_l2_table(unsigned long page_nr);
   25.28 -static int get_l1_table(unsigned long page_nr);
   25.29 -static int get_page(unsigned long page_nr, int writeable);
   25.30 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
   25.31 -/* 'put' does no checking because if refcnt not zero, entity must be valid. */
   25.32 -static void put_l2_table(unsigned long page_nr);
   25.33 -static void put_l1_table(unsigned long page_nr);
   25.34 -static void put_page(unsigned long page_nr, int writeable);
   25.35 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
   25.36 +static void free_l2_table(struct pfn_info *page);
   25.37 +static void free_l1_table(struct pfn_info *page);
   25.38  
   25.39 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
   25.40 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
   25.41  static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
   25.42  
   25.43  /* frame table size and its size in pages */
   25.44 -frame_table_t * frame_table;
   25.45 +struct pfn_info *frame_table;
   25.46  unsigned long frame_table_size;
   25.47  unsigned long max_page;
   25.48  
   25.49 @@ -176,8 +170,11 @@ unsigned int free_pfns;
   25.50  
   25.51  /* Used to defer flushing of memory structures. */
   25.52  static struct {
   25.53 -    int flush_tlb;
   25.54 -    int refresh_ldt;
   25.55 +#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
   25.56 +#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
   25.57 +#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0.         */
   25.58 +    unsigned long flags;
   25.59 +    unsigned long cr0;
   25.60  } deferred_op[NR_CPUS] __cacheline_aligned;
   25.61  
   25.62  /*
   25.63 @@ -196,7 +193,7 @@ void __init init_frametable(unsigned lon
   25.64      max_page = nr_pages;
   25.65      frame_table_size = nr_pages * sizeof(struct pfn_info);
   25.66      frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
   25.67 -    frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
   25.68 +    frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
   25.69      memset(frame_table, 0, frame_table_size);
   25.70  
   25.71      free_pfns = 0;
   25.72 @@ -218,7 +215,7 @@ void __init init_frametable(unsigned lon
   25.73  
   25.74  static void __invalidate_shadow_ldt(struct task_struct *p)
   25.75  {
   25.76 -    int i, cpu = p->processor;
   25.77 +    int i;
   25.78      unsigned long pfn;
   25.79      struct pfn_info *page;
   25.80      
   25.81 @@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(stru
   25.82          if ( pfn == 0 ) continue;
   25.83          p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
   25.84          page = frame_table + pfn;
   25.85 -        ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
   25.86 -        ASSERT((page->flags & PG_domain_mask) == p->domain);
   25.87 -        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
   25.88 -        put_page_type(page);
   25.89 -        put_page_tot(page);                
   25.90 +        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
   25.91 +        ASSERT_PAGE_IS_DOMAIN(page, p);
   25.92 +        put_page_and_type(page);
   25.93      }
   25.94  
   25.95      /* Dispose of the (now possibly invalid) mappings from the TLB.  */
   25.96 -    deferred_op[cpu].flush_tlb   = 1;
   25.97 -    deferred_op[cpu].refresh_ldt = 1;
   25.98 +    deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
   25.99  }
  25.100  
  25.101  
  25.102 @@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt
  25.103  }
  25.104  
  25.105  
  25.106 +int alloc_segdesc_page(struct pfn_info *page)
  25.107 +{
  25.108 +    unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  25.109 +    int i;
  25.110 +
  25.111 +    for ( i = 0; i < 512; i++ )
  25.112 +        if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) )
  25.113 +            goto fail;
  25.114 +
  25.115 +    unmap_domain_mem(descs);
  25.116 +    return 1;
  25.117 +
  25.118 + fail:
  25.119 +    unmap_domain_mem(descs);
  25.120 +    return 0;
  25.121 +}
  25.122 +
  25.123 +
  25.124  /* Map shadow page at offset @off. Returns 0 on success. */
  25.125  int map_ldt_shadow_page(unsigned int off)
  25.126  {
  25.127      struct task_struct *p = current;
  25.128 -    unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
  25.129 -    unsigned long l1e, *ldt_page;
  25.130 -    struct pfn_info *page;
  25.131 -    int i, ret = -1;
  25.132 +    unsigned long l1e;
  25.133  
  25.134 -    /* We cannot take a page_lock in interrupt context. */
  25.135 -    if ( in_interrupt() )
  25.136 +    if ( unlikely(in_interrupt()) )
  25.137          BUG();
  25.138  
  25.139 -    spin_lock(&p->page_lock);
  25.140 -
  25.141 -    __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
  25.142 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
  25.143 -        goto out;
  25.144 -
  25.145 -    page = frame_table + (l1e >> PAGE_SHIFT);
  25.146 -    if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) )
  25.147 -    {
  25.148 -        if ( unlikely(page_type_count(page) != 0) )
  25.149 -            goto out;
  25.150 +    __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >> 
  25.151 +                                                       PAGE_SHIFT) + off]);
  25.152  
  25.153 -        /* Check all potential LDT entries in the page. */
  25.154 -        ldt_page = (unsigned long *)addr;
  25.155 -        for ( i = 0; i < 512; i++ )
  25.156 -            if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) )
  25.157 -                goto out;
  25.158 -        if ( unlikely(page->flags & PG_need_flush) )
  25.159 -        {
  25.160 -            perfc_incrc(need_flush_tlb_flush);
  25.161 -            __write_cr3_counted(pagetable_val(p->mm.pagetable));
  25.162 -            page->flags &= ~PG_need_flush;
  25.163 -        }
  25.164 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  25.165 +         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  25.166 +                                     p, PGT_ldt_page)) )
  25.167 +        return 0;
  25.168  
  25.169 -        page->flags &= ~PG_type_mask;
  25.170 -        page->flags |= PGT_ldt_page;
  25.171 -    }
  25.172 -
  25.173 -    /* Success! */
  25.174 -    get_page_type(page);
  25.175 -    get_page_tot(page);
  25.176 -    p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW);
  25.177 +    p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  25.178      p->mm.shadow_ldt_mapcnt++;
  25.179  
  25.180 -    ret = 0;
  25.181 -
  25.182 - out:
  25.183 -    spin_unlock(&p->page_lock);
  25.184 -    return ret;
  25.185 +    return 1;
  25.186  }
  25.187  
  25.188  
  25.189 -/* Return original refcnt, or -1 on error. */
  25.190 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
  25.191 +/* Domain 0 is allowed to build page tables on others' behalf. */
  25.192 +static inline int dom0_get_page(struct pfn_info *page)
  25.193  {
  25.194 -    struct pfn_info *page;
  25.195 -    unsigned long flags;
  25.196 +    unsigned long x, nx, y = page->count_and_flags;
  25.197 +
  25.198 +    do {
  25.199 +        x  = y;
  25.200 +        nx = x + 1;
  25.201 +        if ( unlikely((x & PGC_count_mask) == 0) ||
  25.202 +             unlikely((nx & PGC_count_mask) == 0) )
  25.203 +            return 0;
  25.204 +    }
  25.205 +    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
  25.206 +
  25.207 +    return 1;
  25.208 +}
  25.209 +
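
dom0_get_page takes a reference with a compare-and-exchange loop so that it never resurrects a frame whose count has already reached zero and never lets the count wrap. The user-space sketch below reproduces that loop with GCC's __sync_val_compare_and_swap builtin; the PGC_count_mask value and the packing of flag bits into the same word are assumptions for illustration, not the header's real constants.

#include <stdio.h>

#define PGC_count_mask 0x0fffffffUL   /* low bits hold the refcount (assumed) */

/* Try to take a reference; refuse if the count is zero or would overflow. */
static int try_get_ref(unsigned long *count_and_flags)
{
    unsigned long x, nx, y = *count_and_flags;

    do {
        x  = y;
        nx = x + 1;
        if ( (x & PGC_count_mask) == 0 ||     /* already free: do not revive  */
             (nx & PGC_count_mask) == 0 )     /* count would wrap             */
            return 0;
        y = __sync_val_compare_and_swap(count_and_flags, x, nx);
    } while ( y != x );                       /* lost a race: retry with new y */

    return 1;
}

int main(void)
{
    unsigned long live = 0x10000003UL, dead = 0x10000000UL;  /* flags | count */
    printf("live page: %d (count now %lu)\n",
           try_get_ref(&live), live & PGC_count_mask);
    printf("dead page: %d\n", try_get_ref(&dead));
    return 0;
}
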
  25.210 +
  25.211 +static int get_page_from_pagenr(unsigned long page_nr)
  25.212 +{
  25.213 +    struct pfn_info *page = &frame_table[page_nr];
  25.214  
  25.215      if ( unlikely(page_nr >= max_page) )
  25.216      {
  25.217          MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  25.218 -        return -1;
  25.219 +        return 0;
  25.220      }
  25.221 -    page = frame_table + page_nr;
  25.222 -    flags = page->flags;
  25.223 -    if ( unlikely(!DOMAIN_OKAY(flags)) )
  25.224 -    {
  25.225 -        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
  25.226 -        return -1;
  25.227 -    }
  25.228 -    if ( (flags & PG_type_mask) != type )
  25.229 +
  25.230 +    if ( unlikely(!get_page(page, current)) &&
  25.231 +         ((current->domain != 0) || !dom0_get_page(page)) )
  25.232      {
  25.233 -        if ( page_type_count(page) != 0 )
  25.234 -        {
  25.235 -            MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
  25.236 -                    page_nr << PAGE_SHIFT,
  25.237 -                    flags & PG_type_mask, type, page_type_count(page));
  25.238 -            return -1;
  25.239 -        }
  25.240 -
  25.241 -        if ( unlikely(flags & PG_need_flush) )
  25.242 -        {
  25.243 -            deferred_op[smp_processor_id()].flush_tlb = 1;
  25.244 -            page->flags &= ~PG_need_flush;
  25.245 -            perfc_incrc(need_flush_tlb_flush);
  25.246 -        }
  25.247 -
  25.248 -        page->flags &= ~PG_type_mask;
  25.249 -        page->flags |= type;
  25.250 +        MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr);
  25.251 +        return 0;
  25.252      }
  25.253  
  25.254 -    get_page_tot(page);
  25.255 -    return get_page_type(page);
  25.256 +    return 1;
  25.257 +}
  25.258 +
  25.259 +
  25.260 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  25.261 +                                         unsigned int type)
  25.262 +{
  25.263 +    struct pfn_info *page = &frame_table[page_nr];
  25.264 +
  25.265 +    if ( unlikely(!get_page_from_pagenr(page_nr)) )
  25.266 +        return 0;
  25.267 +
  25.268 +    if ( unlikely(!get_page_type(page, type)) )
  25.269 +    {
  25.270 +        MEM_LOG("Bad page type for pfn %08lx (%08lx)", 
  25.271 +                page_nr, page->type_and_flags);
  25.272 +        put_page(page);
  25.273 +        return 0;
  25.274 +    }
  25.275 +
  25.276 +    return 1;
  25.277  }
  25.278  
  25.279  
  25.280 -/* Return new refcnt, or -1 on error. */
  25.281 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
  25.282 +/*
  25.283 + * We allow an L2 table to map itself, to achieve a linear p.t. Note that this
  25.284 + * does not raise any reference counts.
  25.285 + */
  25.286 +static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn)
  25.287 +{
  25.288 +    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  25.289 +    {
  25.290 +        MEM_LOG("Attempt to create linear p.t. with write perms");
  25.291 +        return 0;
  25.292 +    }
  25.293 +
  25.294 +    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  25.295 +    {
  25.296 +        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
  25.297 +        return 0;
  25.298 +    }
  25.299 +
  25.300 +    return 1;
  25.301 +}
  25.302 +
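
check_linear_pagetable permits exactly one self-referencing, read-only L2 slot. Pointing an L2 entry back at its own frame makes every L1 entry readable through a fixed virtual window (linear_pg_table), so code such as map_ldt_shadow_page can fetch a PTE with a plain memory read. The sketch below shows the resulting address arithmetic; the window base address is an example value, not the constant defined in Xen's headers.

#include <stdio.h>

#define PAGE_SHIFT           12
#define LINEAR_PT_VIRT_START 0xFC400000UL   /* example window base; assumption */

/*
 * With the self-mapping slot installed, the L1 entry covering virtual
 * address 'va' is itself visible at a virtual address inside the
 * linear-page-table window: index the window by va's page number
 * (4-byte entries on non-PAE i386).
 */
static unsigned long linear_pte_addr(unsigned long va)
{
    return LINEAR_PT_VIRT_START + ((va >> PAGE_SHIFT) * 4);
}

int main(void)
{
    unsigned long va = 0x08048000UL;   /* a typical guest text address */
    printf("PTE for %#lx is readable at %#lx\n", va, linear_pte_addr(va));
    return 0;
}
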
  25.303 +
  25.304 +static int get_page_from_l1e(l1_pgentry_t l1e)
  25.305 +{
  25.306 +    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
  25.307 +
  25.308 +    if ( unlikely((l1_pgentry_val(l1e) &
  25.309 +                   (_PAGE_GLOBAL|_PAGE_PAT))) )
  25.310 +    {
  25.311 +        MEM_LOG("Bad L1 page type settings %04lx",
  25.312 +                l1_pgentry_val(l1e) &
  25.313 +                (_PAGE_GLOBAL|_PAGE_PAT));
  25.314 +        return 0;
  25.315 +    }
  25.316 +
  25.317 +    if ( l1_pgentry_val(l1e) & _PAGE_RW )
  25.318 +    {
  25.319 +        if ( unlikely(!get_page_and_type_from_pagenr(
  25.320 +            l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) )
  25.321 +            return 0;
  25.322 +        set_bit(_PGC_tlb_flush_on_type_change, 
  25.323 +                &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags);
  25.324 +    }
  25.325 +    else
  25.326 +    {
  25.327 +        if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) )
  25.328 +            return 0;
  25.329 +    }
  25.330 +
  25.331 +    return 1;
  25.332 +}
  25.333 +
  25.334 +
  25.335 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  25.336 +static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  25.337 +{
  25.338 +    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
  25.339 +
  25.340 +    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  25.341 +    {
  25.342 +        MEM_LOG("Bad L2 page type settings %04lx",
  25.343 +                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  25.344 +        return 0;
  25.345 +    }
  25.346 +
  25.347 +    if ( unlikely(!get_page_and_type_from_pagenr(
  25.348 +        l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) &&
  25.349 +         unlikely(!check_linear_pagetable(l2e, pfn)) )
  25.350 +        return 0;
  25.351 +
  25.352 +    return 1;
  25.353 +}
  25.354 +
  25.355 +
  25.356 +static void put_page_from_l1e(l1_pgentry_t l1e)
  25.357  {
  25.358      struct pfn_info *page;
  25.359  
  25.360 -    if ( unlikely(page_nr >= max_page) )
  25.361 +    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
  25.362 +
  25.363 +    page = &frame_table[l1_pgentry_to_pagenr(l1e)];
  25.364 +
  25.365 +    if ( l1_pgentry_val(l1e) & _PAGE_RW )
  25.366      {
  25.367 -        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  25.368 -        return -1;
  25.369 +        put_page_and_type(page);
  25.370      }
  25.371 -    page = frame_table + page_nr;
  25.372 -    if ( unlikely(!DOMAIN_OKAY(page->flags)) || 
  25.373 -         unlikely(((page->flags & PG_type_mask) != type)) ) 
  25.374 +    else
  25.375      {
  25.376 -        MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
  25.377 -                page->flags & PG_domain_mask, page->flags & PG_type_mask,
  25.378 -                type);
  25.379 -        return -1;
   25.380 +        /* We expect this to be rare, so we blow away the entire shadow LDT. */
  25.381 +        if ( unlikely(((page->type_and_flags & PGT_type_mask) == 
  25.382 +                       PGT_ldt_page)) &&
  25.383 +             unlikely(((page->type_and_flags & PGT_count_mask) != 0)) )
  25.384 +            invalidate_shadow_ldt();
  25.385 +        put_page(page);
  25.386      }
  25.387 -    ASSERT(page_type_count(page) != 0);
  25.388 -    put_page_tot(page);
  25.389 -    return put_page_type(page);
  25.390 +}
  25.391 +
  25.392 +
  25.393 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  25.394 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  25.395 +{
  25.396 +    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
  25.397 +
  25.398 +    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  25.399 +         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  25.400 +        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  25.401  }
  25.402  
  25.403  
  25.404 -/* We allow a L2 table to map itself, to achieve a linear pagetable. */
  25.405 -/* NB. There's no need for a put_twisted_l2_table() function!! */
  25.406 -static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
  25.407 +static int alloc_l2_table(struct pfn_info *page)
  25.408  {
  25.409 -    unsigned long l2v = l2_pgentry_val(l2e);
  25.410 +    unsigned long page_nr = page - frame_table;
  25.411 +    l2_pgentry_t *pl2e, l2e;
  25.412 +    int i;
  25.413 +   
  25.414 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  25.415 +
  25.416 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  25.417 +    {
  25.418 +        l2e = pl2e[i];
  25.419 +
  25.420 +        if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 
  25.421 +            continue;
  25.422  
  25.423 -    /* Clearly the mapping must be read-only :-) */
  25.424 -    if ( (l2v & _PAGE_RW) )
  25.425 +        if ( unlikely(!get_page_from_l2e(l2e, page_nr)) )
  25.426 +            goto fail;
  25.427 +    }
  25.428 +    
  25.429 +    /* Now we add our private high mappings. */
  25.430 +    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  25.431 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  25.432 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  25.433 +    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  25.434 +        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  25.435 +    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  25.436 +        mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) | 
  25.437 +                      __PAGE_HYPERVISOR);
  25.438 +
  25.439 +    unmap_domain_mem(pl2e);
  25.440 +    return 1;
  25.441 +
  25.442 + fail:
  25.443 +    while ( i-- > 0 )
  25.444      {
  25.445 -        MEM_LOG("Attempt to install twisted L2 entry with write permissions");
  25.446 -        return -1;
  25.447 +        l2e = pl2e[i];
  25.448 +        if ( l2_pgentry_val(l2e) & _PAGE_PRESENT )
  25.449 +            put_page_from_l2e(l2e, page_nr);
  25.450      }
  25.451  
  25.452 -    /* This is a sufficient final check. */
  25.453 -    if ( (l2v >> PAGE_SHIFT) != entry_pfn )
  25.454 -    {
  25.455 -        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
  25.456 -        return -1;
  25.457 -    }
  25.458 -    
  25.459 -    /* We don't bump the reference counts. */
  25.460 +    unmap_domain_mem(pl2e);
  25.461      return 0;
  25.462  }
  25.463  
  25.464  
  25.465 -static int get_l2_table(unsigned long page_nr)
  25.466 +static int alloc_l1_table(struct pfn_info *page)
  25.467  {
  25.468 -    struct pfn_info *page;
  25.469 -    struct task_struct *p;
  25.470 -    l2_pgentry_t *p_l2_entry, l2_entry;
  25.471 -    int i, ret=0;
  25.472 -   
  25.473 -    ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
  25.474 -    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
  25.475 -    
  25.476 -    /* NEW level-2 page table! Deal with every PDE in the table. */
  25.477 -    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  25.478 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  25.479 +    unsigned long page_nr = page - frame_table;
  25.480 +    l1_pgentry_t *pl1e, l1e;
  25.481 +    int i;
  25.482 +
  25.483 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  25.484 +
  25.485 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  25.486      {
  25.487 -        l2_entry = *p_l2_entry++;
  25.488 -        if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
  25.489 -        if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  25.490 -        {
  25.491 -            MEM_LOG("Bad L2 page type settings %04lx",
  25.492 -                    l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
  25.493 -            ret = -1;
  25.494 +        l1e = pl1e[i];
  25.495 +
  25.496 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
  25.497 +            continue;
  25.498 +
  25.499 +        if ( unlikely(!get_page_from_l1e(l1e)) )
  25.500              goto fail;
  25.501 -        }
  25.502 -        /* Assume we're mapping an L1 table, falling back to twisted L2. */
  25.503 -        ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
  25.504 -        if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry);
  25.505 -        if ( unlikely(ret) ) goto fail;
  25.506 -    }
  25.507 -    
  25.508 -    /* Now we simply slap in our high mapping. */
  25.509 -    memcpy(p_l2_entry, 
  25.510 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  25.511 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  25.512 -    p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
  25.513 -              DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
  25.514 -        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  25.515 -
  25.516 -    /*
  25.517 -     * The per-domain PGD is slightly tricky, as we may not be executing
  25.518 -     * in the context of the correct domain (DOM0 builds pt's for others).
  25.519 -     */
  25.520 -    page = frame_table + page_nr;
  25.521 -    if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL )
  25.522 -    {
  25.523 -        p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
  25.524 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
  25.525 -            mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
  25.526 -        put_task_struct(p);
  25.527      }
  25.528  
  25.529 - out:
  25.530 -    unmap_domain_mem(p_l2_entry);
  25.531 -    return ret;
  25.532 +    /* Make sure we unmap the right page! */
  25.533 +    unmap_domain_mem(pl1e);
  25.534 +    return 1;
  25.535  
  25.536   fail:
  25.537 -    p_l2_entry--;
  25.538      while ( i-- > 0 )
  25.539      {
  25.540 -        l2_entry = *--p_l2_entry;
  25.541 -        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
  25.542 -            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
  25.543 +        l1e = pl1e[i];
  25.544 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
  25.545 +            continue;
  25.546 +        put_page_from_l1e(l1e);
  25.547      }
  25.548 -    if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 )
  25.549 -        BUG();
  25.550 -    goto out;
  25.551 +
  25.552 +    unmap_domain_mem(pl1e);
  25.553 +    return 0;
  25.554  }
  25.555  
  25.556  
  25.557 -static int get_l1_table(unsigned long page_nr)
  25.558 +static void free_l2_table(struct pfn_info *page)
  25.559  {
  25.560 -    l1_pgentry_t *p_l1_entry, l1_entry;
  25.561 -    int i, ret;
  25.562 +    unsigned long page_nr = page - frame_table;
  25.563 +    l2_pgentry_t *pl2e, l2e;
  25.564 +    int i;
  25.565 +
  25.566 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  25.567  
  25.568 -    /* Update ref count for page pointed at by PDE. */
  25.569 -    ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
  25.570 -    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
  25.571 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  25.572 +    {
  25.573 +        l2e = pl2e[i];
  25.574 +        if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
  25.575 +             unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) )
  25.576 +            put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  25.577 +    }
  25.578  
  25.579 -    /* NEW level-1 page table! Deal with every PTE in the table. */
  25.580 -    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  25.581 +    unmap_domain_mem(pl2e);
  25.582 +}
  25.583 +
  25.584 +
  25.585 +static void free_l1_table(struct pfn_info *page)
  25.586 +{
  25.587 +    unsigned long page_nr = page - frame_table;
  25.588 +    l1_pgentry_t *pl1e, l1e;
  25.589 +    int i;
  25.590 +
  25.591 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  25.592 +
  25.593      for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  25.594      {
  25.595 -        l1_entry = *p_l1_entry++;
  25.596 -        if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
  25.597 -        if ( unlikely((l1_pgentry_val(l1_entry) &
  25.598 -                       (_PAGE_GLOBAL|_PAGE_PAT))) )
  25.599 -        {
  25.600 -            MEM_LOG("Bad L1 page type settings %04lx",
  25.601 -                    l1_pgentry_val(l1_entry) &
  25.602 -                    (_PAGE_GLOBAL|_PAGE_PAT));
  25.603 -            ret = -1;
  25.604 -            goto fail;
  25.605 -        }
  25.606 -        ret = get_page(l1_pgentry_to_pagenr(l1_entry),
  25.607 -                       l1_pgentry_val(l1_entry) & _PAGE_RW);
  25.608 -        if ( unlikely(ret) ) goto fail;
  25.609 +        l1e = pl1e[i];
  25.610 +        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
  25.611 +            continue;
  25.612 +        put_page_from_l1e(l1e);
  25.613      }
  25.614  
  25.615 -    /* Make sure we unmap the right page! */
  25.616 -    unmap_domain_mem(p_l1_entry-1);
  25.617 -    return ret;
  25.618 +    unmap_domain_mem(pl1e);
  25.619 +}
  25.620 +
  25.621  
  25.622 - fail:
  25.623 -    p_l1_entry--;
  25.624 -    while ( i-- > 0 )
  25.625 -    {
  25.626 -        l1_entry = *--p_l1_entry;
  25.627 -        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
  25.628 -            put_page(l1_pgentry_to_pagenr(l1_entry), 
  25.629 -                     l1_pgentry_val(l1_entry) & _PAGE_RW);
  25.630 -    }
  25.631 -    if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 )
  25.632 -        BUG();
  25.633 -    unmap_domain_mem(p_l1_entry);
  25.634 -    return ret;
  25.635 +static inline int update_l2e(l2_pgentry_t *pl2e, 
  25.636 +                             l2_pgentry_t  ol2e, 
  25.637 +                             l2_pgentry_t  nl2e)
  25.638 +{
  25.639 +    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  25.640 +                              l2_pgentry_val(ol2e), 
  25.641 +                              l2_pgentry_val(nl2e));
  25.642 +    if ( o != l2_pgentry_val(ol2e) )
  25.643 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  25.644 +                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  25.645 +    return (o == l2_pgentry_val(ol2e));
  25.646  }
  25.647  
  25.648  
  25.649 -static int get_page(unsigned long page_nr, int writeable)
  25.650 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  25.651 +static int mod_l2_entry(l2_pgentry_t *pl2e, 
  25.652 +                        l2_pgentry_t nl2e, 
  25.653 +                        unsigned long pfn)
  25.654  {
  25.655 -    struct pfn_info *page;
  25.656 -    unsigned long flags;
  25.657 +    l2_pgentry_t ol2e;
  25.658 +    unsigned long _ol2e;
  25.659  
  25.660 -    /* Update ref count for page pointed at by PTE. */
  25.661 -    if ( unlikely(page_nr >= max_page) )
  25.662 -    {
  25.663 -        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
  25.664 -        return(-1);
  25.665 -    }
  25.666 -    page = frame_table + page_nr;
  25.667 -    flags = page->flags;
  25.668 -    if ( unlikely(!DOMAIN_OKAY(flags)) )
  25.669 +    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  25.670 +                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  25.671      {
  25.672 -        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
  25.673 -        return(-1);
  25.674 -    }
  25.675 -
  25.676 -    if ( writeable )
  25.677 -    {
  25.678 -        if ( (flags & PG_type_mask) != PGT_writeable_page )
  25.679 -        {
  25.680 -            if ( page_type_count(page) != 0 )
  25.681 -            {
  25.682 -                MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
  25.683 -                        flags & PG_type_mask, PGT_writeable_page,
  25.684 -                        page_type_count(page));
  25.685 -                return(-1);
  25.686 -            }
  25.687 -            page->flags &= ~PG_type_mask;
  25.688 -            page->flags |= PGT_writeable_page;
  25.689 -        }
  25.690 -        page->flags |= PG_need_flush;
  25.691 -        get_page_type(page);
  25.692 +        MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e);
  25.693 +        return 0;
  25.694      }
  25.695  
  25.696 -    get_page_tot(page);
  25.697 -    
  25.698 -    return(0);
  25.699 -}
  25.700 -
  25.701 -
  25.702 -static void put_l2_table(unsigned long page_nr)
  25.703 -{
  25.704 -    l2_pgentry_t *p_l2_entry, l2_entry;
  25.705 -    int i;
  25.706 +    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  25.707 +        return 0;
  25.708 +    ol2e = mk_l2_pgentry(_ol2e);
  25.709  
  25.710 -    if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return;
  25.711 +    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  25.712 +    {
  25.713 +        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  25.714 +        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 )
  25.715 +        {
  25.716 +            if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
  25.717 +                return 0;
  25.718  
  25.719 -    /* We had last reference to level-2 page table. Free the PDEs. */
  25.720 -    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  25.721 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  25.722 -    {
  25.723 -        l2_entry = *p_l2_entry++;
  25.724 -        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
  25.725 -            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
  25.726 -    }
  25.727 +            if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  25.728 +            {
  25.729 +                put_page_from_l2e(nl2e, pfn);
  25.730 +                return 0;
  25.731 +            }
  25.732  
  25.733 -    unmap_domain_mem(p_l2_entry);
  25.734 -}
  25.735 -
  25.736 -
  25.737 -static void put_l1_table(unsigned long page_nr)
  25.738 -{
  25.739 -    l1_pgentry_t *p_l1_entry, l1_entry;
  25.740 -    int i;
  25.741 -
  25.742 -    if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return;
  25.743 +            if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
  25.744 +                put_page_from_l2e(ol2e, pfn);
  25.745 +        }
  25.746 +        else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  25.747 +        {
  25.748 +            return 0;
  25.749 +        }
  25.750 +    }
  25.751 +    else
  25.752 +    {
  25.753 +        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  25.754 +            return 0;
  25.755  
  25.756 -    /* We had last reference to level-1 page table. Free the PTEs. */
  25.757 -    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
  25.758 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  25.759 -    {
  25.760 -        l1_entry = *p_l1_entry++;
  25.761 -        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
  25.762 -            put_page(l1_pgentry_to_pagenr(l1_entry), 
  25.763 -                     l1_pgentry_val(l1_entry) & _PAGE_RW);
  25.764 +        if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
  25.765 +            put_page_from_l2e(ol2e, pfn);
  25.766      }
  25.767 -
  25.768 -    /* Make sure we unmap the right page! */
  25.769 -    unmap_domain_mem(p_l1_entry-1);
  25.770 +    
  25.771 +    return 1;
  25.772  }
  25.773  
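
mod_l2_entry (and mod_l1_entry below) decides whether any reference counts must change by XOR-ing the old and new entries and masking out the bits that do not matter: ~0xffe keeps the frame number (bits 12-31) and the present bit, while ~0xffc additionally keeps the writeable bit. On 32-bit entries these are exactly the 0xfffff001 and 0xfffff003 masks the removed code spelled out. A tiny check of that equivalence, assuming 32-bit page-table entries:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t l2_mask = ~(uint32_t)0xffe;   /* frame number + present        */
    uint32_t l1_mask = ~(uint32_t)0xffc;   /* frame number + RW + present   */

    printf("L2 mask = %#010x (old spelling 0xfffff001)\n", l2_mask);
    printf("L1 mask = %#010x (old spelling 0xfffff003)\n", l1_mask);

    /* Example: toggling only the accessed/dirty bits never trips the mask. */
    uint32_t old_e = 0x00123067, new_e = 0x00123007;   /* A/D bits differ   */
    printf("refcount work needed? %s\n",
           ((old_e ^ new_e) & l1_mask) ? "yes" : "no");
    return 0;
}
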
  25.774  
  25.775 -static void put_page(unsigned long page_nr, int writeable)
  25.776 +static inline int update_l1e(l1_pgentry_t *pl1e, 
  25.777 +                             l1_pgentry_t  ol1e, 
  25.778 +                             l1_pgentry_t  nl1e)
  25.779  {
  25.780 -    struct pfn_info *page;
  25.781 -    ASSERT(page_nr < max_page);
  25.782 -    page = frame_table + page_nr;
  25.783 -    ASSERT(DOMAIN_OKAY(page->flags));
  25.784 -    ASSERT((!writeable) || 
  25.785 -           ((page_type_count(page) != 0) && 
  25.786 -            ((page->flags & PG_type_mask) == PGT_writeable_page) &&
  25.787 -            ((page->flags & PG_need_flush) == PG_need_flush)));
  25.788 -    if ( writeable )
  25.789 +    unsigned long o = l1_pgentry_val(ol1e);
  25.790 +    unsigned long n = l1_pgentry_val(nl1e);
  25.791 +
  25.792 +    while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
  25.793      {
  25.794 -        put_page_type(page);
  25.795 +        unsigned int cpu = smp_processor_id();
  25.796 +        /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */
  25.797 +        if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 )
  25.798 +        {
  25.799 +            MEM_LOG("cmpxchg fault despite WP bit cleared\n");
  25.800 +            return 0;
  25.801 +        }
  25.802 +        deferred_op[cpu].cr0 = read_cr0();
  25.803 +        write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP);
  25.804 +        deferred_op[cpu].flags |= DOP_RESTORE_CR0;
  25.805      }
  25.806 -    else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
  25.807 -                       (page_type_count(page) != 0)) )
  25.808 -    {
  25.809 -        /* We expect this is rare so we just blow the entire shadow LDT. */
  25.810 -        invalidate_shadow_ldt();
  25.811 -    }
  25.812 -    put_page_tot(page);
  25.813 +
  25.814 +    if ( o != l1_pgentry_val(ol1e))
  25.815 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  25.816 +                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
  25.817 +
  25.818 +    /* The swap was successful if the old value we saw is equal to ol1e. */
  25.819 +    return (o == l1_pgentry_val(ol1e));
  25.820  }
  25.821  
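
update_l1e writes the new entry through the guest-visible page-table mapping with cmpxchg_user; that mapping may be read-only, so a faulting write clears CR0.WP (letting ring-0 writes ignore write protection), records DOP_RESTORE_CR0 so the deferred-op path puts CR0 back later, and retries. Because CR0 is only accessible in ring 0, the sketch below simulates the accessors; fake_cr0, cmpxchg_user_sim and the single-retry shape are illustrative stand-ins, not the hypervisor's real helpers.

#include <stdio.h>

#define X86_CR0_WP 0x00010000UL

/* Stubs standing in for the privileged CR0 accessors (simulation only). */
static unsigned long fake_cr0 = 0x80010033UL;       /* PG | WP | ...      */
static unsigned long read_cr0(void)        { return fake_cr0; }
static void write_cr0(unsigned long v)     { fake_cr0 = v; }

/* Pretend the first attempt faults because WP is still set. */
static int cmpxchg_user_sim(unsigned long *p, unsigned long o, unsigned long n)
{
    if ( fake_cr0 & X86_CR0_WP )
        return -1;                                  /* simulated #PF      */
    if ( *p == o )
        *p = n;
    return 0;
}

int main(void)
{
    unsigned long pte = 0x1000, saved_cr0 = 0;
    int restore_cr0 = 0;

    while ( cmpxchg_user_sim(&pte, 0x1000, 0x2003) != 0 )
    {
        if ( restore_cr0 )                          /* already tried: give up   */
            return 1;
        saved_cr0 = read_cr0();                     /* remember the WP state    */
        write_cr0(saved_cr0 & ~X86_CR0_WP);         /* let ring-0 writes through */
        restore_cr0 = 1;                            /* defer restoring CR0      */
    }

    printf("pte = %#lx, cr0 restored to %#lx\n", pte, saved_cr0);
    write_cr0(saved_cr0);                           /* what the deferred op does */
    return 0;
}
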
  25.822  
  25.823 -static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
  25.824 +/* Update the L1 entry at pl1e to new value nl1e. */
  25.825 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  25.826  {
  25.827 -    l2_pgentry_t old_l2_entry = *p_l2_entry;
  25.828 +    l1_pgentry_t ol1e;
  25.829 +    unsigned long _ol1e;
  25.830  
  25.831 -    if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
  25.832 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  25.833 -    {
  25.834 -        MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
  25.835 -                p_l2_entry);
  25.836 -        goto fail;
  25.837 -    }
  25.838 -
  25.839 -    if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
  25.840 +    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  25.841      {
  25.842 -        if ( unlikely((l2_pgentry_val(new_l2_entry) & 
  25.843 -                       (_PAGE_GLOBAL|_PAGE_PSE))) )
  25.844 -        {
  25.845 -            MEM_LOG("Bad L2 entry val %04lx",
  25.846 -                    l2_pgentry_val(new_l2_entry) & 
  25.847 -                    (_PAGE_GLOBAL|_PAGE_PSE));
  25.848 -            goto fail;
  25.849 -        }
  25.850 -        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  25.851 -        if ( ((l2_pgentry_val(old_l2_entry) ^ 
  25.852 -               l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
  25.853 -        {
  25.854 -            /* Assume we're mapping an L1 table, falling back to twisted L2. */
  25.855 -            if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
  25.856 -            {
  25.857 -                /* NB. No need to sanity-check the VA: done already. */
  25.858 -                unsigned long l1e = l1_pgentry_val(
  25.859 -                    linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
  25.860 -                if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
  25.861 -                    goto fail;
  25.862 -            }
  25.863 -
  25.864 -            if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) 
  25.865 -                put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));            
  25.866 -        } 
  25.867 -    }
  25.868 -    else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
  25.869 -    {
  25.870 -        put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
  25.871 +        MEM_LOG("Bad get_user\n");
  25.872 +        return 0;
  25.873      }
  25.874      
  25.875 -    *p_l2_entry = new_l2_entry;
  25.876 -    return 0;
  25.877 -
  25.878 - fail:
  25.879 -    return -1;
  25.880 -}
  25.881 -
  25.882 +    ol1e = mk_l1_pgentry(_ol1e);
  25.883  
  25.884 -static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
  25.885 -{
  25.886 -    l1_pgentry_t old_l1_entry = *p_l1_entry;
  25.887 -
  25.888 -    if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
  25.889 +    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  25.890      {
  25.891 -        if ( unlikely((l1_pgentry_val(new_l1_entry) &
  25.892 -                       (_PAGE_GLOBAL|_PAGE_PAT))) ) 
  25.893 -        {
  25.894 -            MEM_LOG("Bad L1 entry val %04lx",
  25.895 -                    l1_pgentry_val(new_l1_entry) & 
  25.896 -                    (_PAGE_GLOBAL|_PAGE_PAT));
  25.897 -            goto fail;
  25.898 -        }
  25.899          /*
  25.900           * Differ in mapping (bits 12-31), writeable (bit 1), or
  25.901           * presence (bit 0)?
  25.902           */
  25.903 -        if ( ((l1_pgentry_val(old_l1_entry) ^
  25.904 -               l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
  25.905 +        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 )
  25.906          {
  25.907 -            if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
  25.908 -                          l1_pgentry_val(new_l1_entry) & _PAGE_RW) )
  25.909 -                goto fail;
  25.910 +            if ( unlikely(!get_page_from_l1e(nl1e)) )
  25.911 +                return 0;
  25.912 +
  25.913 +            if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  25.914 +            {
  25.915 +                put_page_from_l1e(nl1e);
  25.916 +                return 0;
  25.917 +            }
  25.918  
  25.919 -            if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) 
  25.920 -                put_page(l1_pgentry_to_pagenr(old_l1_entry),
  25.921 -                         l1_pgentry_val(old_l1_entry) & _PAGE_RW);
  25.922 -        } 
  25.923 +            if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
  25.924 +                put_page_from_l1e(ol1e);
  25.925 +        }
  25.926 +        else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  25.927 +        {
  25.928 +            return 0;
  25.929 +        }
  25.930      }
  25.931 -    else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
  25.932 +    else 
  25.933      {
  25.934 -        put_page(l1_pgentry_to_pagenr(old_l1_entry),
  25.935 -                 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
  25.936 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  25.937 +            return 0;
  25.938 +
  25.939 +        if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
  25.940 +            put_page_from_l1e(ol1e);
  25.941      }
  25.942  
  25.943 -    *p_l1_entry = new_l1_entry;
  25.944 -    return 0;
  25.945 +    return 1;
  25.946 +}
  25.947 +
  25.948 +
  25.949 +int alloc_page_type(struct pfn_info *page, unsigned int type)
  25.950 +{
  25.951 +    if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 
  25.952 +                                     &page->count_and_flags)) )
  25.953 +    {
  25.954 +        struct task_struct *p = page->u.domain;
  25.955 +        mb(); /* Check zombie status before using domain ptr. */
  25.956 +        /*
  25.957 +         * NB. 'p' may no longer be valid by time we dereference it, so
   25.958 +         * NB. 'p' may no longer be valid by the time we dereference it, so
  25.959 +         */
  25.960 +        if ( !test_bit(_PGC_zombie, &page->count_and_flags) &&
  25.961 +             unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)], 
  25.962 +                                 page->tlbflush_timestamp)) )
  25.963 +        {
  25.964 +            perfc_incr(need_flush_tlb_flush);
  25.965 +            flush_tlb_cpu(p->processor);
  25.966 +        }
  25.967 +    }
  25.968  
  25.969 - fail:
  25.970 -    return -1;
  25.971 +    switch ( type )
  25.972 +    {
  25.973 +    case PGT_l1_page_table:
  25.974 +        return alloc_l1_table(page);
  25.975 +    case PGT_l2_page_table:
  25.976 +        return alloc_l2_table(page);
  25.977 +    case PGT_gdt_page:
  25.978 +    case PGT_ldt_page:
  25.979 +        return alloc_segdesc_page(page);
  25.980 +    default:
  25.981 +        BUG();
  25.982 +    }
  25.983 +
  25.984 +    return 0;
  25.985 +}
  25.986 +
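
alloc_page_type clamps p->processor with NR_CPUS-1 before indexing tlbflush_time because the domain pointer may be going away underneath it; the mask guarantees an in-bounds read even from a garbage value, and only works when NR_CPUS is a power of two. A trivial illustration of that defensive indexing (the garbage value is invented):

#include <stdio.h>

#define NR_CPUS 8                       /* must be a power of two for the mask */

static unsigned long tlbflush_time[NR_CPUS];

int main(void)
{
    unsigned int garbage = 0xdeadbeef;  /* a possibly-stale processor field    */
    unsigned int idx = garbage & (NR_CPUS - 1);

    /* Always a valid index, even if the source value was nonsense. */
    printf("clamped index = %u, stamp = %lu\n", idx, tlbflush_time[idx]);
    return 0;
}
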
  25.987 +
  25.988 +void free_page_type(struct pfn_info *page, unsigned int type)
  25.989 +{
  25.990 +    switch ( type )
  25.991 +    {
  25.992 +    case PGT_l1_page_table:
  25.993 +        return free_l1_table(page);
  25.994 +    case PGT_l2_page_table:
  25.995 +        return free_l2_table(page);
  25.996 +    default:
  25.997 +        BUG();
  25.998 +    }
  25.999  }
 25.1000  
 25.1001  
 25.1002  static int do_extended_command(unsigned long ptr, unsigned long val)
 25.1003  {
 25.1004 -    int err = 0, cpu = smp_processor_id();
 25.1005 +    int okay = 1, cpu = smp_processor_id();
 25.1006      unsigned int cmd = val & MMUEXT_CMD_MASK;
 25.1007      unsigned long pfn = ptr >> PAGE_SHIFT;
 25.1008 -    struct pfn_info *page = frame_table + pfn;
 25.1009 +    struct pfn_info *page = &frame_table[pfn];
 25.1010  
 25.1011      /* 'ptr' must be in range except where it isn't a machine address. */
 25.1012      if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) )
 25.1013 +    {
 25.1014 +        MEM_LOG("Ptr out of range for extended MMU command");
 25.1015          return 1;
 25.1016 +    }
 25.1017  
 25.1018      switch ( cmd )
 25.1019      {
 25.1020      case MMUEXT_PIN_L1_TABLE:
 25.1021 -        if ( unlikely(page->flags & PG_guest_pinned) )
 25.1022 -        {
 25.1023 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 25.1024 -            err = 1;
 25.1025 -            break;
 25.1026 -        }
 25.1027 -        err = get_l1_table(pfn);
 25.1028 -        goto mark_as_pinned;
 25.1029 -
 25.1030      case MMUEXT_PIN_L2_TABLE:
 25.1031 -        if ( unlikely(page->flags & PG_guest_pinned) )
 25.1032 -        {
 25.1033 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 25.1034 -            err = 1;
 25.1035 -            break;
 25.1036 -        }
 25.1037 -        err = get_l2_table(pfn);
 25.1038 -
 25.1039 -    mark_as_pinned:
 25.1040 -        if ( unlikely(err) )
 25.1041 +        okay = get_page_and_type_from_pagenr(pfn, 
 25.1042 +                                             (cmd == MMUEXT_PIN_L2_TABLE) ? 
 25.1043 +                                             PGT_l2_page_table : 
 25.1044 +                                             PGT_l1_page_table);
 25.1045 +        if ( unlikely(!okay) )
 25.1046          {
 25.1047              MEM_LOG("Error while pinning pfn %08lx", pfn);
 25.1048              break;
 25.1049          }
 25.1050 -        page->flags |= PG_guest_pinned;
 25.1051 +
 25.1052 +        if ( unlikely(test_and_set_bit(_PGC_guest_pinned, 
 25.1053 +                                       &page->count_and_flags)) )
 25.1054 +        {
 25.1055 +            MEM_LOG("Pfn %08lx already pinned", pfn);
 25.1056 +            put_page_and_type(page);
 25.1057 +            okay = 0;
 25.1058 +            break;
 25.1059 +        }
 25.1060 +
 25.1061          break;
 25.1062  
 25.1063      case MMUEXT_UNPIN_TABLE:
 25.1064 -        if ( unlikely(!DOMAIN_OKAY(page->flags)) )
 25.1065 +        if ( unlikely(!(okay = get_page_from_pagenr(pfn))) )
 25.1066          {
 25.1067 -            err = 1;
 25.1068 -            MEM_LOG("Page %08lx bad domain (dom=%ld)",
 25.1069 -                    ptr, page->flags & PG_domain_mask);
 25.1070 +            MEM_LOG("Page %08lx bad domain (dom=%p)",
 25.1071 +                    ptr, page->u.domain);
 25.1072          }
 25.1073 -        else if ( likely(page->flags & PG_guest_pinned) )
 25.1074 +        else if ( likely(test_and_clear_bit(_PGC_guest_pinned, 
 25.1075 +                                            &page->count_and_flags)) )
 25.1076          {
 25.1077 -            page->flags &= ~PG_guest_pinned;
 25.1078 -            ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
 25.1079 -                put_l1_table(pfn) : put_l2_table(pfn);
 25.1080 +            put_page_and_type(page);
 25.1081          }
 25.1082          else
 25.1083          {
 25.1084 -            err = 1;
 25.1085 +            okay = 0;
 25.1086              MEM_LOG("Pfn %08lx not pinned", pfn);
 25.1087          }
 25.1088          break;
 25.1089  
 25.1090      case MMUEXT_NEW_BASEPTR:
 25.1091 -        err = get_l2_table(pfn);
 25.1092 -        if ( !err )
 25.1093 +        okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table);
 25.1094 +        if ( likely(okay) )
 25.1095          {
 25.1096 -            put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
 25.1097 +            put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable)
 25.1098 +                                          >> PAGE_SHIFT]);
 25.1099              current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
 25.1100              invalidate_shadow_ldt();
 25.1101 -            deferred_op[cpu].flush_tlb = 1;
 25.1102 +            deferred_op[cpu].flags |= DOP_FLUSH_TLB;
 25.1103          }
 25.1104          else
 25.1105          {
 25.1106 -            MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
 25.1107 +            MEM_LOG("Error while installing new baseptr %08lx", ptr);
 25.1108          }
 25.1109          break;
 25.1110          
 25.1111      case MMUEXT_TLB_FLUSH:
 25.1112 -        deferred_op[cpu].flush_tlb = 1;
 25.1113 +        deferred_op[cpu].flags |= DOP_FLUSH_TLB;
 25.1114          break;
 25.1115      
 25.1116      case MMUEXT_INVLPG:
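
get_page_and_type_from_pagenr() itself is not shown in these hunks. A plausible composition from the primitives that do appear in this changeset (get_page_from_pagenr, get_page_type, put_page) is sketched below; it illustrates the reference-then-type ordering and the rollback on failure, and is not the committed body:

    static int get_page_and_type_from_pagenr(unsigned long page_nr,
                                             unsigned int type)
    {
        struct pfn_info *page = &frame_table[page_nr];

        /* Take a general reference first; fails for out-of-range or zombie frames. */
        if ( unlikely(!get_page_from_pagenr(page_nr)) )
            return 0;

        /* Then take a type reference (the frame is validated on the 0 -> 1 step). */
        if ( unlikely(!get_page_type(page, type)) )
        {
            put_page(page);    /* roll back the general reference */
            return 0;
        }

        return 1;
    }
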
 25.1117 @@ -815,7 +867,7 @@ static int do_extended_command(unsigned 
 25.1118               ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 25.1119               ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 25.1120          {
 25.1121 -            err = 1;
 25.1122 +            okay = 0;
 25.1123              MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 25.1124          }
 25.1125          else if ( (current->mm.ldt_ents != ents) || 
 25.1126 @@ -825,37 +877,39 @@ static int do_extended_command(unsigned 
 25.1127              current->mm.ldt_base = ptr;
 25.1128              current->mm.ldt_ents = ents;
 25.1129              load_LDT(current);
 25.1130 -            deferred_op[cpu].refresh_ldt = (ents != 0);
 25.1131 +            deferred_op[cpu].flags &= ~DOP_RELOAD_LDT;
 25.1132 +            if ( ents != 0 )
 25.1133 +                deferred_op[cpu].flags |= DOP_RELOAD_LDT;
 25.1134          }
 25.1135          break;
 25.1136      }
 25.1137  
 25.1138      default:
 25.1139          MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 25.1140 -        err = 1;
 25.1141 +        okay = 0;
 25.1142          break;
 25.1143      }
 25.1144  
 25.1145 -    return err;
 25.1146 +    return okay;
 25.1147  }
 25.1148  
 25.1149  
 25.1150  int do_mmu_update(mmu_update_t *ureqs, int count)
 25.1151  {
 25.1152      mmu_update_t req;
 25.1153 -    unsigned long flags, pfn, l1e;
 25.1154 +    unsigned long va = 0, flags, pfn, prev_pfn = 0;
 25.1155      struct pfn_info *page;
 25.1156 -    int rc = 0, err = 0, i, cpu = smp_processor_id();
 25.1157 +    int rc = 0, okay = 1, i, cpu = smp_processor_id();
 25.1158      unsigned int cmd;
 25.1159 -    unsigned long cr0 = 0;
 25.1160  
 25.1161 -    perfc_incrc( calls_to_mmu_update ); 
 25.1162 -    perfc_addc( num_page_updates, count );
 25.1163 +    perfc_incrc(calls_to_mmu_update); 
 25.1164 +    perfc_addc(num_page_updates, count);
 25.1165  
 25.1166      for ( i = 0; i < count; i++ )
 25.1167      {
 25.1168          if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 25.1169          {
 25.1170 +            MEM_LOG("Bad copy_from_user");
 25.1171              rc = -EFAULT;
 25.1172              break;
 25.1173          }
 25.1174 @@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1175          cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 25.1176          pfn = req.ptr >> PAGE_SHIFT;
 25.1177  
 25.1178 -        err = 1;
 25.1179 -
 25.1180 -        spin_lock(&current->page_lock);
 25.1181 +        okay = 0;
 25.1182  
 25.1183 -        /* Get the page-frame number that a non-extended command references. */
 25.1184 -        if ( (cmd == MMU_NORMAL_PT_UPDATE) || 
 25.1185 -             (cmd == MMU_UNCHECKED_PT_UPDATE) )
 25.1186 -        {
 25.1187 -            if ( cr0 == 0 )
 25.1188 -            {
 25.1189 -                cr0 = read_cr0();
 25.1190 -                write_cr0(cr0 & ~X86_CR0_WP);
 25.1191 -            }
 25.1192 -            /* Need to use 'get_user' since the VA's PGD may be absent. */
 25.1193 -            __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
 25.1194 -            /* Now check that the VA's PTE isn't absent. */
 25.1195 -            if ( unlikely(!(l1e & _PAGE_PRESENT)) )
 25.1196 -            {
 25.1197 -                MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
 25.1198 -                goto unlock;
 25.1199 -            }
 25.1200 -            /* Finally, get the underlying machine address. */
 25.1201 -            pfn = l1e >> PAGE_SHIFT;
 25.1202 -        }
 25.1203 -
 25.1204 -        /* Least significant bits of 'ptr' demux the operation type. */
 25.1205          switch ( cmd )
 25.1206          {
 25.1207              /*
 25.1208               * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 25.1209               */
 25.1210          case MMU_NORMAL_PT_UPDATE:
 25.1211 -            page  = frame_table + pfn;
 25.1212 -            flags = page->flags;
 25.1213 +            page = &frame_table[pfn];
 25.1214  
 25.1215 -            if ( likely(DOMAIN_OKAY(flags)) )
 25.1216 +            if ( unlikely(!get_page(page, current)) &&
 25.1217 +                 ((current->domain != 0) || !dom0_get_page(page)) )
 25.1218              {
 25.1219 -                switch ( (flags & PG_type_mask) )
 25.1220 -                {
 25.1221 -                case PGT_l1_page_table: 
 25.1222 -                    err = mod_l1_entry((l1_pgentry_t *)req.ptr, 
 25.1223 -                                       mk_l1_pgentry(req.val)); 
 25.1224 -                    break;
 25.1225 -                case PGT_l2_page_table: 
 25.1226 -                    err = mod_l2_entry((l2_pgentry_t *)req.ptr, 
 25.1227 -                                       mk_l2_pgentry(req.val)); 
 25.1228 -                    break;                    
 25.1229 -                default:
 25.1230 -                    if ( page_type_count(page) == 0 )
 25.1231 -                    {
 25.1232 -                        *(unsigned long *)req.ptr = req.val;
 25.1233 -                        err = 0;
 25.1234 -                    }
 25.1235 -                    else
 25.1236 -                        MEM_LOG("Update to bad page %08lx", req.ptr);
 25.1237 -                    break;
 25.1238 -                }
 25.1239 +                MEM_LOG("Could not get page for normal update");
 25.1240 +                break;
 25.1241 +            }
 25.1242 +
 25.1243 +            if ( likely(prev_pfn == pfn) )
 25.1244 +            {
 25.1245 +                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 25.1246              }
 25.1247              else
 25.1248              {
 25.1249 -                MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
 25.1250 -                        current->domain, pfn);
 25.1251 +                if ( prev_pfn != 0 )
 25.1252 +                    unmap_domain_mem((void *)va);
 25.1253 +                va = (unsigned long)map_domain_mem(req.ptr);
 25.1254 +                prev_pfn = pfn;
 25.1255              }
 25.1256 +
 25.1257 +            switch ( (page->type_and_flags & PGT_type_mask) )
 25.1258 +            {
 25.1259 +            case PGT_l1_page_table: 
 25.1260 +                if ( likely(get_page_type(page, PGT_l1_page_table)) )
 25.1261 +                {
 25.1262 +                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 25.1263 +                                        mk_l1_pgentry(req.val)); 
 25.1264 +                    put_page_type(page);
 25.1265 +                }
 25.1266 +                break;
 25.1267 +            case PGT_l2_page_table:
 25.1268 +                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 25.1269 +                {
 25.1270 +                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 25.1271 +                                        mk_l2_pgentry(req.val),
 25.1272 +                                        pfn); 
 25.1273 +                    put_page_type(page);
 25.1274 +                }
 25.1275 +                break;
 25.1276 +            default:
 25.1277 +                if ( likely(get_page_type(page, PGT_writeable_page)) )
 25.1278 +                {
 25.1279 +                    *(unsigned long *)va = req.val;
 25.1280 +                    okay = 1;
 25.1281 +                    put_page_type(page);
 25.1282 +                }
 25.1283 +                break;
 25.1284 +            }
 25.1285 +            
 25.1286 +            put_page(page);
 25.1287 +
 25.1288              break;
 25.1289  
 25.1290          case MMU_UNCHECKED_PT_UPDATE:
 25.1291              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 25.1292              if ( likely(IS_PRIV(current)) )
 25.1293              {
 25.1294 -                *(unsigned long *)req.ptr = req.val;
 25.1295 -                err = 0;
 25.1296 +                if ( likely(prev_pfn == pfn) )
 25.1297 +                {
 25.1298 +                    va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 25.1299 +                }
 25.1300 +                else
 25.1301 +                {
 25.1302 +                    if ( prev_pfn != 0 )
 25.1303 +                        unmap_domain_mem((void *)va);
 25.1304 +                    va = (unsigned long)map_domain_mem(req.ptr);
 25.1305 +                    prev_pfn = pfn;
 25.1306 +                }
 25.1307 +                *(unsigned long *)va = req.val;
 25.1308 +                okay = 1;
 25.1309              }
 25.1310              else
 25.1311              {
 25.1312 @@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1313              break;
 25.1314              
 25.1315          case MMU_MACHPHYS_UPDATE:
 25.1316 -            page = frame_table + pfn;
 25.1317 +            page = &frame_table[pfn];
 25.1318              if ( unlikely(pfn >= max_page) )
 25.1319              {
 25.1320                  MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
 25.1321              }
 25.1322 -            else if ( likely(DOMAIN_OKAY(page->flags)) )
 25.1323 +            else if ( likely(get_page(page, current)) ||
 25.1324 +                      ((current->domain == 0) && dom0_get_page(page)) )
 25.1325              {
 25.1326                  machine_to_phys_mapping[pfn] = req.val;
 25.1327 -                err = 0;
 25.1328 +                okay = 1;
 25.1329 +                put_page(page);
 25.1330              }
 25.1331 -            else
 25.1332 -            {
 25.1333 -                MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
 25.1334 -                        current->domain, pfn);
 25.1335 -            }            
 25.1336              break;
 25.1337  
 25.1338              /*
 25.1339 @@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1340               */
 25.1341          case MMU_EXTENDED_COMMAND:
 25.1342              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 25.1343 -            err = do_extended_command(req.ptr, req.val);
 25.1344 +            okay = do_extended_command(req.ptr, req.val);
 25.1345              break;
 25.1346  
 25.1347          default:
 25.1348 @@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1349              break;
 25.1350          }
 25.1351  
 25.1352 -    unlock:
 25.1353 -        spin_unlock(&current->page_lock);
 25.1354 -
 25.1355 -        if ( unlikely(err) )
 25.1356 +        if ( unlikely(!okay) )
 25.1357          {
 25.1358              rc = -EINVAL;
 25.1359              break;
 25.1360 @@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1361          ureqs++;
 25.1362      }
 25.1363  
 25.1364 -    if ( deferred_op[cpu].flush_tlb )
 25.1365 -    {
 25.1366 -        deferred_op[cpu].flush_tlb = 0;
 25.1367 -        __write_cr3_counted(pagetable_val(current->mm.pagetable));
 25.1368 -    }
 25.1369 +    if ( prev_pfn != 0 )
 25.1370 +        unmap_domain_mem((void *)va);
 25.1371 +
 25.1372 +    flags = deferred_op[cpu].flags;
 25.1373 +    deferred_op[cpu].flags = 0;
 25.1374  
 25.1375 -    if ( deferred_op[cpu].refresh_ldt )
 25.1376 -    {
 25.1377 -        deferred_op[cpu].refresh_ldt = 0;
 25.1378 +    if ( flags & DOP_FLUSH_TLB )
 25.1379 +        write_cr3_counted(pagetable_val(current->mm.pagetable));
 25.1380 +
 25.1381 +    if ( flags & DOP_RELOAD_LDT )
 25.1382          (void)map_ldt_shadow_page(0);
 25.1383 -    }
 25.1384  
 25.1385 -    if ( cr0 != 0 )
 25.1386 -        write_cr0(cr0);
 25.1387 +    if ( unlikely(flags & DOP_RESTORE_CR0) )
 25.1388 +        write_cr0(deferred_op[cpu].cr0);
 25.1389  
 25.1390      return rc;
 25.1391  }
 25.1392 @@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, i
 25.1393  
 25.1394  int do_update_va_mapping(unsigned long page_nr, 
 25.1395                           unsigned long val, 
 25.1396 -                         unsigned long flags)
 25.1397 +                         unsigned long caller_flags)
 25.1398  {
 25.1399 -    unsigned long _x, cr0 = 0;
 25.1400      struct task_struct *p = current;
 25.1401 -    int err = -EINVAL;
 25.1402 +    int err = 0;
 25.1403 +    unsigned int cpu = p->processor;
 25.1404 +    unsigned long defer_flags;
 25.1405  
 25.1406      if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
 25.1407 -        goto out;
 25.1408 -
 25.1409 -    spin_lock(&p->page_lock);
 25.1410 +        return -EINVAL;
 25.1411  
 25.1412 -    /* Check that the VA's page-directory entry is present.. */
 25.1413 -    if ( unlikely((err = __get_user(_x, (unsigned long *)
 25.1414 -                                    (&linear_pg_table[page_nr]))) != 0) )
 25.1415 -        goto unlock_and_out;
 25.1416 +    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
 25.1417 +                                mk_l1_pgentry(val))) )
 25.1418 +        err = -EINVAL;
 25.1419  
 25.1420 -    /* If the VA's page-directory entry is read-only, we frob the WP bit. */
 25.1421 -    if ( unlikely(__put_user(_x, (unsigned long *)
 25.1422 -                             (&linear_pg_table[page_nr]))) )
 25.1423 -    {
 25.1424 -        cr0 = read_cr0();
 25.1425 -        write_cr0(cr0 & ~X86_CR0_WP);        
 25.1426 -    }
 25.1427 +    defer_flags = deferred_op[cpu].flags;
 25.1428 +    deferred_op[cpu].flags = 0;
 25.1429  
 25.1430 -    if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr], 
 25.1431 -                               mk_l1_pgentry(val)) != 0) )
 25.1432 -    {
 25.1433 -        err = -EINVAL;
 25.1434 -        goto check_cr0_unlock_and_out;
 25.1435 -    }
 25.1436 -
 25.1437 -    if ( unlikely(flags & UVMF_INVLPG) )
 25.1438 +    if ( unlikely(defer_flags & DOP_FLUSH_TLB) || 
 25.1439 +         unlikely(caller_flags & UVMF_FLUSH_TLB) )
 25.1440 +        write_cr3_counted(pagetable_val(p->mm.pagetable));
 25.1441 +    else if ( unlikely(caller_flags & UVMF_INVLPG) )
 25.1442          __flush_tlb_one(page_nr << PAGE_SHIFT);
 25.1443  
 25.1444 -    if ( unlikely(flags & UVMF_FLUSH_TLB) )
 25.1445 -        __write_cr3_counted(pagetable_val(p->mm.pagetable));
 25.1446 +    if ( unlikely(defer_flags & DOP_RELOAD_LDT) )
 25.1447 +        (void)map_ldt_shadow_page(0);
 25.1448  
 25.1449 - check_cr0_unlock_and_out:
 25.1450 -    if ( unlikely(cr0 != 0) )
 25.1451 -        write_cr0(cr0);
 25.1452 - unlock_and_out:
 25.1453 -    spin_unlock(&p->page_lock);
 25.1454 - out:
 25.1455 +    if ( unlikely(defer_flags & DOP_RESTORE_CR0) )
 25.1456 +        write_cr0(deferred_op[cpu].cr0);
 25.1457 +
 25.1458      return err;
 25.1459  }
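
With do_mmu_update() as reworked above, 'ptr' carries a machine address whose low bits select the per-request command, and MMU_EXTENDED_COMMAND requests carry the sub-command in the MMUEXT_CMD_MASK bits of 'val'. A caller-side sketch of a two-request batch follows; the hypercall stub name and the helper are assumptions for illustration, only the mmu_update_t fields and command encodings come from the hunks above:

    /* Assumed guest-side hypercall stub; the real wrapper name may differ. */
    extern int HYPERVISOR_mmu_update(mmu_update_t *req, int count);

    static int update_pte_and_pin_l2(unsigned long pte_ma, /* MA of the PTE slot */
                                     unsigned long new_l1, /* new PTE contents   */
                                     unsigned long l2_ma)  /* MA of the L2 table */
    {
        mmu_update_t req[2];

        /* Normal update: low bits of ptr select the command, the rest is the MA. */
        req[0].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
        req[0].val = new_l1;

        /* Extended command: ptr names the frame, val carries the sub-command. */
        req[1].ptr = l2_ma | MMU_EXTENDED_COMMAND;
        req[1].val = MMUEXT_PIN_L2_TABLE;

        return HYPERVISOR_mmu_update(req, 2);   /* 0 on success, negative on failure */
    }
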
    26.1 --- a/xen/common/network.c	Sat Dec 20 23:39:49 2003 +0000
    26.2 +++ b/xen/common/network.c	Sat Dec 20 23:41:19 2003 +0000
    26.3 @@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain)
    26.4      if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG();
    26.5      new_ring = (net_ring_t *)get_free_page(GFP_KERNEL);
    26.6      clear_page(new_ring);
    26.7 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain);
    26.8 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
    26.9  
   26.10      /*
   26.11       * Fill in the new vif struct. Note that, while the vif's refcnt is
    27.1 --- a/xen/common/page_alloc.c	Sat Dec 20 23:39:49 2003 +0000
    27.2 +++ b/xen/common/page_alloc.c	Sat Dec 20 23:41:19 2003 +0000
    27.3 @@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned
    27.4  /* Release a PHYSICAL address range to the allocator. */
    27.5  void release_bytes_to_allocator(unsigned long min, unsigned long max)
    27.6  {
    27.7 -    min = round_pgup  (min) + PAGE_OFFSET;
    27.8 -    max = round_pgdown(max) + PAGE_OFFSET;
    27.9 +    min = round_pgup  (min);
   27.10 +    max = round_pgdown(max);
   27.11  
   27.12      while ( min < max )
   27.13      {
   27.14 -        __free_pages(min, 0);
   27.15 +        __free_pages(min+PAGE_OFFSET, 0);
   27.16          min += PAGE_SIZE;
   27.17      }
   27.18  }
   27.19 @@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask,
   27.20  retry:
   27.21      spin_lock_irqsave(&alloc_lock, flags);
   27.22  
   27.23 -
   27.24      /* Find smallest order which can satisfy the request. */
   27.25      for ( i = order; i < FREELIST_SIZE; i++ ) {
   27.26  	if ( !FREELIST_EMPTY(free_head[i]) ) 
    28.1 --- a/xen/drivers/block/ll_rw_blk.c	Sat Dec 20 23:39:49 2003 +0000
    28.2 +++ b/xen/drivers/block/ll_rw_blk.c	Sat Dec 20 23:41:19 2003 +0000
    28.3 @@ -14,31 +14,15 @@
    28.4  #include <xeno/types.h>
    28.5  #include <xeno/lib.h>
    28.6  #include <xeno/sched.h>
    28.7 -/*#include <xeno/kernel_stat.h>*/
    28.8  #include <xeno/errno.h>
    28.9 -/*#include <xeno/locks.h>*/
   28.10  #include <xeno/mm.h>
   28.11 -/*#include <xeno/swap.h>*/
   28.12  #include <xeno/init.h>
   28.13 -/*#include <xeno/smp_lock.h>*/
   28.14 -/*#include <xeno/completion.h>*/
   28.15 -
   28.16  #include <asm/system.h>
   28.17  #include <asm/io.h>
   28.18  #include <xeno/blk.h>
   28.19 -/*#include <xeno/highmem.h>*/
   28.20  #include <xeno/slab.h>
   28.21  #include <xeno/module.h>
   28.22  
   28.23 -/*
   28.24 - * KAF: We can turn off noise relating to barking guest-OS requests.
   28.25 - */
   28.26 -#if 0
   28.27 -#define DPRINTK(_f, _a...) printk(_f , ## _a)
   28.28 -#else
   28.29 -#define DPRINTK(_f, _a...) ((void)0)
   28.30 -#endif
   28.31 -
   28.32  /* This will die as all synchronous stuff is coming to an end */
   28.33  #if 0 
   28.34  #define complete(_r) panic("completion.h stuff may be needed...")
   28.35 @@ -47,8 +31,6 @@
   28.36  #define complete(_r) (*(int *)(_r) = 0)
   28.37  #endif
   28.38  
   28.39 -
   28.40 -
   28.41  /*
   28.42   * MAC Floppy IWM hooks
   28.43   */
    29.1 --- a/xen/drivers/block/xen_block.c	Sat Dec 20 23:39:49 2003 +0000
    29.2 +++ b/xen/drivers/block/xen_block.c	Sat Dec 20 23:41:19 2003 +0000
    29.3 @@ -20,12 +20,6 @@
    29.4  #include <xeno/vbd.h>
    29.5  #include <xeno/slab.h>
    29.6  
    29.7 -#if 0
    29.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
    29.9 -#else
   29.10 -#define DPRINTK(_f, _a...) ((void)0)
   29.11 -#endif
   29.12 -
   29.13  /*
   29.14   * These are rather arbitrary. They are fairly large because adjacent
   29.15   * requests pulled from a communication ring are quite likely to end
   29.16 @@ -60,15 +54,11 @@ static atomic_t nr_pending;
   29.17  
   29.18  static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
   29.19  
   29.20 -static int __buffer_is_valid(struct task_struct *p, 
   29.21 -                             unsigned long buffer, 
   29.22 -                             unsigned short size,
   29.23 -                             int writeable_buffer);
   29.24 -static void __lock_buffer(unsigned long buffer,
   29.25 -                          unsigned short size,
   29.26 -                          int writeable_buffer);
   29.27 -static void unlock_buffer(struct task_struct *p,
   29.28 -                          unsigned long buffer,
   29.29 +static int lock_buffer(struct task_struct *p,
   29.30 +                       unsigned long buffer,
   29.31 +                       unsigned short size,
   29.32 +                       int writeable_buffer);
   29.33 +static void unlock_buffer(unsigned long buffer,
   29.34                            unsigned short size,
   29.35                            int writeable_buffer);
   29.36  
   29.37 @@ -185,8 +175,7 @@ static void end_block_io_op_softirq(stru
   29.38      {
   29.39          pending_req = bh->pending_req;
   29.40          
   29.41 -        unlock_buffer(pending_req->domain, 
   29.42 -                      virt_to_phys(bh->b_data), 
   29.43 +        unlock_buffer(virt_to_phys(bh->b_data), 
   29.44                        bh->b_size, 
   29.45                        (pending_req->operation==READ));
   29.46          
   29.47 @@ -321,55 +310,10 @@ long do_block_io_op(block_io_op_t *u_blo
   29.48   * DOWNWARD CALLS -- These interface with the block-device layer proper.
   29.49   */
   29.50  
   29.51 -static int __buffer_is_valid(struct task_struct *p, 
   29.52 -                             unsigned long buffer, 
   29.53 -                             unsigned short size,
   29.54 -                             int writeable_buffer)
   29.55 -{
   29.56 -    unsigned long    pfn;
   29.57 -    struct pfn_info *page;
   29.58 -    int rc = 0;
   29.59 -
   29.60 -    /* A request may span multiple page frames. Each must be checked. */
   29.61 -    for ( pfn = buffer >> PAGE_SHIFT; 
   29.62 -          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   29.63 -          pfn++ )
   29.64 -    {
   29.65 -        /* Each frame must be within bounds of machine memory. */
   29.66 -        if ( pfn >= max_page )
   29.67 -        {
   29.68 -            DPRINTK("pfn out of range: %08lx\n", pfn);
   29.69 -            goto out;
   29.70 -        }
   29.71 -
   29.72 -        page = frame_table + pfn;
   29.73 -
   29.74 -        /* Each frame must belong to the requesting domain. */
   29.75 -        if ( (page->flags & PG_domain_mask) != p->domain )
   29.76 -        {
   29.77 -            DPRINTK("bad domain: expected %d, got %ld\n", 
   29.78 -                    p->domain, page->flags & PG_domain_mask);
   29.79 -            goto out;
   29.80 -        }
   29.81 -
   29.82 -        /* If reading into the frame, the frame must be writeable. */
   29.83 -        if ( writeable_buffer &&
   29.84 -             ((page->flags & PG_type_mask) != PGT_writeable_page) &&
   29.85 -             (page_type_count(page) != 0) )
   29.86 -        {
   29.87 -            DPRINTK("non-writeable page passed for block read\n");
   29.88 -            goto out;
   29.89 -        }
   29.90 -    }    
   29.91 -
   29.92 -    rc = 1;
   29.93 - out:
   29.94 -    return rc;
   29.95 -}
   29.96 -
   29.97 -static void __lock_buffer(unsigned long buffer,
   29.98 -                          unsigned short size,
   29.99 -                          int writeable_buffer)
  29.100 +static int lock_buffer(struct task_struct *p,
  29.101 +                       unsigned long buffer,
  29.102 +                       unsigned short size,
  29.103 +                       int writeable_buffer)
  29.104  {
  29.105      unsigned long    pfn;
  29.106      struct pfn_info *page;
  29.107 @@ -378,40 +322,48 @@ static void __lock_buffer(unsigned long 
  29.108            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
  29.109            pfn++ )
  29.110      {
  29.111 -        page = frame_table + pfn;
  29.112 -        if ( writeable_buffer )
  29.113 +        if ( unlikely(pfn >= max_page) )
  29.114 +            goto fail;
  29.115 +
  29.116 +        page = &frame_table[pfn];
  29.117 +
  29.118 +        if ( unlikely(!get_page(page, p)) )
  29.119 +            goto fail;
  29.120 +
  29.121 +        if ( writeable_buffer && 
  29.122 +             unlikely(!get_page_type(page, PGT_writeable_page)) )
  29.123          {
  29.124 -            if ( page_type_count(page) == 0 )
  29.125 -            {
  29.126 -                page->flags &= ~PG_type_mask;
  29.127 -                /* No need for PG_need_flush here. */
  29.128 -                page->flags |= PGT_writeable_page;
  29.129 -            }
  29.130 -            get_page_type(page);
  29.131 +            put_page(page);
  29.132 +            goto fail;
  29.133          }
  29.134 -        get_page_tot(page);
  29.135      }
  29.136 +
  29.137 +    return 1;
  29.138 +
  29.139 + fail:
  29.140 +    while ( pfn-- > (buffer >> PAGE_SHIFT) )
  29.141 +    {        
  29.142 +        if ( writeable_buffer )
  29.143 +            put_page_type(&frame_table[pfn]);
  29.144 +        put_page(&frame_table[pfn]);
  29.145 +    }
  29.146 +    return 0;
  29.147  }
  29.148  
  29.149 -static void unlock_buffer(struct task_struct *p,
  29.150 -                          unsigned long buffer,
  29.151 +static void unlock_buffer(unsigned long buffer,
  29.152                            unsigned short size,
  29.153                            int writeable_buffer)
  29.154  {
  29.155 -    unsigned long    pfn;
  29.156 -    struct pfn_info *page;
  29.157 +    unsigned long pfn;
  29.158  
  29.159 -    spin_lock(&p->page_lock);
  29.160      for ( pfn = buffer >> PAGE_SHIFT; 
  29.161            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
  29.162            pfn++ )
  29.163      {
  29.164 -        page = frame_table + pfn;
  29.165          if ( writeable_buffer )
  29.166 -            put_page_type(page);
  29.167 -        put_page_tot(page);
  29.168 +            put_page_type(&frame_table[pfn]);
  29.169 +        put_page(&frame_table[pfn]);
  29.170      }
  29.171 -    spin_unlock(&p->page_lock);
  29.172  }
  29.173  
  29.174  static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
  29.175 @@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct 
  29.176      int new_segs, nr_psegs = 0;
  29.177      phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
  29.178  
  29.179 -    spin_lock(&p->page_lock);
  29.180 -
  29.181      /* Check that number of segments is sane. */
  29.182      if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
  29.183      {
  29.184 @@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct 
  29.185              goto bad_descriptor;
  29.186          }
  29.187  
  29.188 -        if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
  29.189 +        if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) )
  29.190  	{
  29.191              DPRINTK("invalid buffer\n");
  29.192              goto bad_descriptor;
  29.193 @@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct 
  29.194                          req->sector_number + tot_sects, 
  29.195                          req->sector_number + tot_sects + nr_sects, 
  29.196                          req->device); 
  29.197 +                unlock_buffer(buffer, nr_sects<<9, (operation==READ));
  29.198                  goto bad_descriptor;
  29.199              }
  29.200  
  29.201 @@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct 
  29.202          if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
  29.203      }
  29.204  
  29.205 -    /* Lock pages associated with each buffer head. */
  29.206 -    for ( i = 0; i < nr_psegs; i++ )
  29.207 -        __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 
  29.208 -                      (operation==READ));
  29.209 -    spin_unlock(&p->page_lock);
  29.210 -
  29.211      atomic_inc(&nr_pending);
  29.212      pending_req = pending_reqs + pending_ring[pending_cons];
  29.213      PENDREQ_IDX_INC(pending_cons);
  29.214 @@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct 
  29.215      return;
  29.216  
  29.217   bad_descriptor:
  29.218 -    spin_unlock(&p->page_lock);
  29.219      make_response(p, req->id, req->operation, 1);
  29.220  } 
  29.221  
  29.222 @@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct
  29.223      if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
  29.224      p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
  29.225      clear_page(p->blk_ring_base);
  29.226 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
  29.227 +    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
  29.228      p->blkdev_list.next = NULL;
  29.229      spin_lock_init(&p->vbd_lock);
  29.230  }
  29.231 @@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_str
  29.232  {
  29.233      ASSERT(!__on_blkdev_list(p));
  29.234      UNSHARE_PFN(virt_to_page(p->blk_ring_base));
  29.235 -    free_page((unsigned long)p->blk_ring_base);
  29.236      destroy_all_vbds(p);
  29.237  }
  29.238  
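
The lock_buffer()/unlock_buffer() pair above replaces the old validate-then-lock split: references are taken per frame up front, and are dropped either on lock_buffer()'s own failure path or when the I/O completes. A condensed sketch of how the two calls pair up around one request (variable names are those used in dispatch_rw_block_io above; the completion-side call really runs from end_block_io_op_softirq):

    /* Submission: take a ref (plus a writeable-type ref for reads) on every frame. */
    if ( !lock_buffer(p, buffer, nr_sects << 9, (operation == READ)) )
        goto bad_descriptor;    /* nothing to undo: lock_buffer rolled itself back */

    /* ... queue the request to the block layer ... */

    /* Completion: drop exactly what lock_buffer took, frame by frame. */
    unlock_buffer(buffer, nr_sects << 9, (operation == READ));
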
    30.1 --- a/xen/drivers/block/xen_vbd.c	Sat Dec 20 23:39:49 2003 +0000
    30.2 +++ b/xen/drivers/block/xen_vbd.c	Sat Dec 20 23:41:19 2003 +0000
    30.3 @@ -23,13 +23,6 @@
    30.4  extern int ide_probe_devices(xen_disk_info_t *xdi);
    30.5  extern int scsi_probe_devices(xen_disk_info_t *xdi);
    30.6  
    30.7 -
    30.8 -#if 0
    30.9 -#define DPRINTK(_f, _a...) printk( _f , ## _a )
   30.10 -#else
   30.11 -#define DPRINTK(_f, _a...) ((void)0)
   30.12 -#endif
   30.13 -
   30.14  /* XXX SMH: crappy 'hash function' .. fix when care. */
   30.15  #define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1))
   30.16  
   30.17 @@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe)
   30.18      if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) )
   30.19      { 
   30.20          /* Privileged domains always get access to the 'real' devices. */
   30.21 -        if ( (ret = ide_probe_devices(&probe->xdi)) != 0 ) 
   30.22 -        {
   30.23 -            DPRINTK("vbd_probe: error %d in probing ide devices\n", ret); 
   30.24 +        if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) ||
   30.25 +             ((ret = scsi_probe_devices(&probe->xdi)) != 0) )
   30.26              goto out; 
   30.27 -        }
   30.28 -        if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 )
   30.29 -        { 
   30.30 -            DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret); 
   30.31 -            goto out; 
   30.32 -        }
   30.33      } 
   30.34  
   30.35      if ( probe->domain == VBD_PROBE_ALL )
   30.36 @@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe)
   30.37              { 
   30.38                  if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
   30.39                  { 
   30.40 -                    DPRINTK("vbd_probe: error %d in probing virtual devices\n",
   30.41 -                            ret); 
   30.42                      read_unlock_irqrestore(&tasklist_lock, flags);
   30.43                      goto out; 
   30.44                  }
   30.45 @@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe)
   30.46          }
   30.47          read_unlock_irqrestore(&tasklist_lock, flags);
   30.48      } 
   30.49 -    else 
   30.50 -    { 
   30.51 -        if ( (ret = vbd_probe_devices(&probe->xdi, p)) )
   30.52 -        { 
   30.53 -            DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret); 
   30.54 -            goto out; 
   30.55 -        }
   30.56 -
   30.57 -    }
   30.58 +    else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
   30.59 +        goto out; 
   30.60  
   30.61   out: 
   30.62 +    if ( ret != 0 )
   30.63 +        DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 
   30.64      if ( p != NULL )
   30.65          put_task_struct(p); 
   30.66      return ret; 
    31.1 --- a/xen/drivers/net/e1000/e1000_main.c	Sat Dec 20 23:39:49 2003 +0000
    31.2 +++ b/xen/drivers/net/e1000/e1000_main.c	Sat Dec 20 23:41:19 2003 +0000
    31.3 @@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, st
    31.4  static void
    31.5  e1000_tx_timeout(struct net_device *netdev)
    31.6  {
    31.7 +#if 0
    31.8  	struct e1000_adapter *adapter = netdev->priv;
    31.9  
   31.10  	/* Do the reset outside of interrupt context */
   31.11 -	//schedule_work(&adapter->tx_timeout_task);
   31.12 +	schedule_work(&adapter->tx_timeout_task);
   31.13 +#endif
   31.14  	e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN
   31.15  }
   31.16  
    32.1 --- a/xen/include/asm-i386/atomic.h	Sat Dec 20 23:39:49 2003 +0000
    32.2 +++ b/xen/include/asm-i386/atomic.h	Sat Dec 20 23:41:19 2003 +0000
    32.3 @@ -186,15 +186,6 @@ static __inline__ int atomic_add_negativ
    32.4  	return c;
    32.5  }
    32.6  
    32.7 -/* These are x86-specific, used by some header files */
    32.8 -#define atomic_clear_mask(mask, addr) \
    32.9 -__asm__ __volatile__(LOCK "andl %0,%1" \
   32.10 -: : "r" (~(mask)),"m" (*addr) : "memory")
   32.11 -
   32.12 -#define atomic_set_mask(mask, addr) \
   32.13 -__asm__ __volatile__(LOCK "orl %0,%1" \
   32.14 -: : "r" (mask),"m" (*addr) : "memory")
   32.15 -
   32.16  /* Atomic operations are already serializing on x86 */
   32.17  #define smp_mb__before_atomic_dec()	barrier()
   32.18  #define smp_mb__after_atomic_dec()	barrier()
    33.1 --- a/xen/include/asm-i386/flushtlb.h	Sat Dec 20 23:39:49 2003 +0000
    33.2 +++ b/xen/include/asm-i386/flushtlb.h	Sat Dec 20 23:41:19 2003 +0000
    33.3 @@ -1,40 +1,39 @@
    33.4  /******************************************************************************
    33.5   * flushtlb.h
    33.6   * 
    33.7 - * TLB flush macros that count flushes.  Counting is used to enforce 
    33.8 - * zero-copy safety, particularily for the network code.
    33.9 - *
   33.10 - * akw - Jan 21, 2003
   33.11 + * TLB flushes are timestamped using a global virtual 'clock' which ticks
   33.12 + * on any TLB flush on any processor.
   33.13 + * 
   33.14 + * Copyright (c) 2003, K A Fraser
   33.15   */
   33.16  
   33.17 -#ifndef __FLUSHTLB_H
   33.18 -#define __FLUSHTLB_H
   33.19 +#ifndef __FLUSHTLB_H__
   33.20 +#define __FLUSHTLB_H__
   33.21  
   33.22  #include <xeno/smp.h>
   33.23 -#include <asm/atomic.h>
   33.24  
   33.25 -atomic_t tlb_flush_count[NR_CPUS];
   33.26 -
   33.27 -#define __write_cr3_counted(__pa)                                       \
   33.28 -    do {                                                                \
   33.29 -                __asm__ __volatile__ (                                  \
   33.30 -                        "movl %0, %%cr3;"                               \
   33.31 -                        :: "r" (__pa)                                   \
   33.32 -                        : "memory");                                    \
   33.33 -                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
   33.34 -    } while (0)
   33.35 +/*
   33.36 + * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the
   33.37 + * system is guaranteed to have been flushed.
   33.38 + */
   33.39 +#define GLOBAL_FLUSH_PERIOD (1<<16)
   33.40  
   33.41 -#define __flush_tlb_counted()                                           \
   33.42 -        do {                                                            \
   33.43 -                unsigned int tmpreg;                                    \
   33.44 -                                                                        \
   33.45 -                __asm__ __volatile__(                                   \
   33.46 -                        "movl %%cr3, %0;  # flush TLB \n"               \
   33.47 -                        "movl %0, %%cr3;                "               \
   33.48 -                        : "=r" (tmpreg)                                 \
   33.49 -                        :: "memory");                                   \
   33.50 -                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
   33.51 -        } while (0)
   33.52 +/*
   33.53 + * '_cpu_stamp' is the current timestamp for the CPU we are testing.
   33.54 + * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last 
   33.55 + * used for a purpose that may have caused the CPU's TLB to become tainted.
   33.56 + */
   33.57 +#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \
   33.58 + (((_cpu_stamp) > (_lastuse_stamp)) ||         \
   33.59 +  (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD)))
   33.60  
   33.61 -#endif
   33.62 -                           
   33.63 +extern unsigned long tlbflush_mask;
   33.64 +extern unsigned long tlbflush_clock;
   33.65 +extern unsigned long tlbflush_time[NR_CPUS];
   33.66 +
   33.67 +extern void new_tlbflush_clock_period(void);
   33.68 +
   33.69 +extern void write_cr3_counted(unsigned long pa);
   33.70 +extern void flush_tlb_counted(void);
   33.71 +
   33.72 +#endif /* __FLUSHTLB_H__ */
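
write_cr3_counted() and flush_tlb_counted() are now out-of-line (in the new flushtlb.c) and their bodies are not part of this header. As a rough illustration of the bookkeeping the header implies (tick the global clock, flush, record the CPU's new timestamp), a counted CR3 write might look like the sketch below; the atomicity details and the point at which new_tlbflush_clock_period() fires are guesses, not the committed code:

    void write_cr3_counted(unsigned long pa)
    {
        /* A real implementation must advance the clock atomically across CPUs. */
        unsigned long t = ++tlbflush_clock;

        __asm__ __volatile__ ( "movl %0, %%cr3" : : "r" (pa) : "memory" );

        /* This CPU's TLB is clean as of time 't'. */
        tlbflush_time[smp_processor_id()] = t;

        /* Periodically force every TLB in the system to be flushed. */
        if ( unlikely((t & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
            new_tlbflush_clock_period();
    }
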
    34.1 --- a/xen/include/asm-i386/io.h	Sat Dec 20 23:39:49 2003 +0000
    34.2 +++ b/xen/include/asm-i386/io.h	Sat Dec 20 23:41:19 2003 +0000
    34.3 @@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsign
    34.4  	return __va(address);
    34.5  }
    34.6  
    34.7 -/*
    34.8 - * Change "struct page" to physical address.
    34.9 - */
   34.10 -#define page_to_phys(page)	((page - frame_table) << PAGE_SHIFT)
   34.11 +#define page_to_pfn(_page)  ((unsigned long)((_page) - frame_table))
   34.12 +#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT)
   34.13 +#define page_to_virt(_page) phys_to_virt(page_to_phys(_page))
   34.14  
   34.15  extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
   34.16  
    35.1 --- a/xen/include/asm-i386/page.h	Sat Dec 20 23:39:49 2003 +0000
    35.2 +++ b/xen/include/asm-i386/page.h	Sat Dec 20 23:41:19 2003 +0000
    35.3 @@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } 
    35.4  extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE];
    35.5  extern void paging_init(void);
    35.6  
    35.7 -#define __flush_tlb() __flush_tlb_counted()
    35.8 +#define __flush_tlb() flush_tlb_counted()
    35.9  
   35.10  /* Flush global pages as well. */
   35.11  
   35.12 @@ -111,10 +111,10 @@ extern void paging_init(void);
   35.13          } while (0)
   35.14  
   35.15  
   35.16 -#define __flush_tlb_all()						\
   35.17 +#define __flush_tlb_pge()						\
   35.18  	do {								\
   35.19                  __pge_off();                                            \
   35.20 -		__flush_tlb_counted();					\
   35.21 +		flush_tlb_counted();					\
   35.22                  __pge_on();                                             \
   35.23  	} while (0)
   35.24  
    36.1 --- a/xen/include/asm-i386/pgalloc.h	Sat Dec 20 23:39:49 2003 +0000
    36.2 +++ b/xen/include/asm-i386/pgalloc.h	Sat Dec 20 23:41:19 2003 +0000
    36.3 @@ -47,28 +47,24 @@
    36.4  
    36.5  #ifndef CONFIG_SMP
    36.6  
    36.7 -#define flush_tlb()         __flush_tlb()
    36.8 -#define flush_tlb_all()     __flush_tlb_all()
    36.9 -#define local_flush_tlb()   __flush_tlb()
   36.10 -#define flush_tlb_cpu(_cpu) __flush_tlb()
   36.11 +#define flush_tlb()           __flush_tlb()
   36.12 +#define flush_tlb_all()       __flush_tlb()
   36.13 +#define flush_tlb_all_pge()   __flush_tlb_pge()
   36.14 +#define local_flush_tlb()     __flush_tlb()
   36.15 +#define flush_tlb_cpu(_cpu)   __flush_tlb()
   36.16 +#define flush_tlb_mask(_mask) __flush_tlb()
   36.17  
   36.18  #else
   36.19  
   36.20  #include <xeno/smp.h>
   36.21  
   36.22 -#define flush_tlb()	    __flush_tlb()
   36.23 -#define local_flush_tlb()   __flush_tlb()
   36.24 -
   36.25 -extern void flush_tlb_all(void);
   36.26 +extern void flush_tlb_mask(unsigned long mask);
   36.27 +extern void flush_tlb_all_pge(void);
   36.28  
   36.29 -extern void flush_tlb_others(unsigned long cpumask);
   36.30 -static inline void flush_tlb_cpu(unsigned int cpu)
   36.31 -{
   36.32 -    if ( cpu == smp_processor_id() )
   36.33 -        __flush_tlb();
   36.34 -    else
   36.35 -        flush_tlb_others(1<<cpu);
   36.36 -}
   36.37 +#define flush_tlb()	    __flush_tlb()
   36.38 +#define flush_tlb_all()     flush_tlb_mask((1 << smp_num_cpus) - 1)
   36.39 +#define local_flush_tlb()   __flush_tlb()
   36.40 +#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu))
   36.41  
   36.42  #endif
   36.43  
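
flush_tlb_mask() is the primitive the other flushes are now built on: flush_tlb_all() and flush_tlb_cpu() above are just particular masks. One intended style of use, suggested by the u.cpu_mask field added to struct pfn_info later in this changeset, is flushing only the CPUs whose TLBs might still hold stale mappings of a frame before it is handed to a new owner. The call site below is illustrative, not code from this changeset:

    /* Before reusing a freed frame, flush any CPUs that may still cache it. */
    unsigned long mask = page->u.cpu_mask;   /* valid only while the frame is free */

    if ( mask & (1 << smp_processor_id()) )
    {
        local_flush_tlb();
        mask &= ~(1 << smp_processor_id());
    }

    if ( mask != 0 )
        flush_tlb_mask(mask);
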
    37.1 --- a/xen/include/asm-i386/smp.h	Sat Dec 20 23:39:49 2003 +0000
    37.2 +++ b/xen/include/asm-i386/smp.h	Sat Dec 20 23:41:19 2003 +0000
    37.3 @@ -1,15 +1,8 @@
    37.4  #ifndef __ASM_SMP_H
    37.5  #define __ASM_SMP_H
    37.6  
    37.7 -#ifndef __ASSEMBLY__
    37.8  #include <xeno/config.h>
    37.9  #include <asm/ptrace.h>
   37.10 -#include <asm/fixmap.h>
   37.11 -#include <asm/bitops.h>
   37.12 -#include <asm/mpspec.h>
   37.13 -#include <asm/io_apic.h>
   37.14 -#include <asm/apic.h>
   37.15 -#endif
   37.16  
   37.17  #ifdef CONFIG_SMP
   37.18  #define TARGET_CPUS cpu_online_map
   37.19 @@ -18,8 +11,6 @@
   37.20  #endif
   37.21  
   37.22  #ifdef CONFIG_SMP
   37.23 -#ifndef __ASSEMBLY__
   37.24 -
   37.25  /*
   37.26   * Private routines/data
   37.27   */
   37.28 @@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id);	
   37.29  
   37.30  #define smp_processor_id() (current->processor)
   37.31  
   37.32 +#include <asm/fixmap.h>
   37.33 +#include <asm/apic.h>
   37.34 +
   37.35  static __inline int hard_smp_processor_id(void)
   37.36  {
   37.37  	/* we don't want to mark this access volatile - bad code generation */
   37.38 @@ -86,7 +80,5 @@ static __inline int logical_smp_processo
   37.39  	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
   37.40  }
   37.41  
   37.42 -#endif /* !__ASSEMBLY__ */
   37.43 -
   37.44  #endif
   37.45  #endif
    38.1 --- a/xen/include/asm-i386/spinlock.h	Sat Dec 20 23:39:49 2003 +0000
    38.2 +++ b/xen/include/asm-i386/spinlock.h	Sat Dec 20 23:41:19 2003 +0000
    38.3 @@ -1,11 +1,10 @@
    38.4  #ifndef __ASM_SPINLOCK_H
    38.5  #define __ASM_SPINLOCK_H
    38.6  
    38.7 +#include <xeno/config.h>
    38.8 +#include <xeno/lib.h>
    38.9  #include <asm/atomic.h>
   38.10  #include <asm/rwlock.h>
   38.11 -#include <asm/page.h>
   38.12 -#include <xeno/config.h>
   38.13 -#include <xeno/lib.h>
   38.14  
   38.15  #if 0
   38.16  #define SPINLOCK_DEBUG	1
    39.1 --- a/xen/include/asm-i386/system.h	Sat Dec 20 23:39:49 2003 +0000
    39.2 +++ b/xen/include/asm-i386/system.h	Sat Dec 20 23:41:19 2003 +0000
    39.3 @@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(vo
    39.4  #define cmpxchg(ptr,o,n)\
    39.5  	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
    39.6  					(unsigned long)(n),sizeof(*(ptr))))
    39.7 -    
    39.8 +
    39.9 +
   39.10 +/*
   39.11 + * This macro atomically changes the longword at location _p from _o to _n.
   39.12 + * If this access causes a fault then we return 1, otherwise we return 0.
   39.13 + * If no fault occurs then _o is updated to the value we saw at _p. If this
   39.14 + * is the same as the initial value of _o then _n is written to location _p.
   39.15 + */
   39.16 +#define cmpxchg_user(_p,_o,_n)                                          \
   39.17 +({                                                                      \
   39.18 +    int _rc;                                                            \
   39.19 +    __asm__ __volatile__ (                                              \
   39.20 +        "1: " LOCK_PREFIX "cmpxchgl %2,%3\n"                            \
   39.21 +        "2:\n"                                                          \
   39.22 +        ".section .fixup,\"ax\"\n"                                      \
   39.23 +        "3:     movl $1,%1\n"                                           \
   39.24 +        "       jmp 2b\n"                                               \
   39.25 +        ".previous\n"                                                   \
   39.26 +        ".section __ex_table,\"a\"\n"                                   \
   39.27 +        "       .align 4\n"                                             \
   39.28 +        "       .long 1b,3b\n"                                          \
   39.29 +        ".previous"                                                     \
   39.30 +        : "=a" (_o), "=r" (_rc)                                         \
   39.31 +        : "q" (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \
   39.32 +        : "memory");                                                    \
   39.33 +    _rc;                                                                \
   39.34 +})
   39.35 +
   39.36  /*
   39.37   * Force strict CPU ordering.
   39.38   * And yes, this is required on UP too when we're talking
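
cmpxchg_user() above is the fault-tolerant compare-and-exchange described in its comment: the macro's value says whether the access faulted, and _o is rewritten with the value actually observed, so a caller can tell apart "faulted", "exchanged" and "lost the race". A minimal usage sketch, with placeholder pointer and values:

    static int try_update(unsigned long *pte, unsigned long old_val,
                          unsigned long new_val)
    {
        unsigned long seen = old_val;          /* value we expect to find at *pte */

        if ( unlikely(cmpxchg_user(pte, seen, new_val)) )
            return -EFAULT;                    /* the access itself faulted */

        if ( seen != old_val )
            return -EAGAIN;                    /* raced: 'seen' holds what was there */

        return 0;                              /* exchange happened: *pte == new_val */
    }
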
    40.1 --- a/xen/include/hypervisor-ifs/dom0_ops.h	Sat Dec 20 23:39:49 2003 +0000
    40.2 +++ b/xen/include/hypervisor-ifs/dom0_ops.h	Sat Dec 20 23:41:19 2003 +0000
    40.3 @@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st
    40.4  {
    40.5      /* IN variables. */
    40.6      unsigned long pfn;          /* Machine page frame number to query.       */
    40.7 +    unsigned int domain;        /* To which domain does the frame belong?    */
    40.8      /* OUT variables. */
    40.9 -    unsigned int domain;        /* To which domain does the frame belong?    */
   40.10      enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type?       */
   40.11  } dom0_getpageframeinfo_t;
   40.12  
    41.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h	Sat Dec 20 23:39:49 2003 +0000
    41.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h	Sat Dec 20 23:41:19 2003 +0000
    41.3 @@ -125,9 +125,9 @@
    41.4   *  which shifts the least bits out.
    41.5   */
    41.6  /* A normal page-table update request. */
    41.7 -#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is VA.      */
    41.8 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.      */
    41.9  /* DOM0 can make entirely unchecked updates which do not affect refcnts. */
   41.10 -#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is VA.    */
   41.11 +#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is MA.    */
   41.12  /* Update an entry in the machine->physical mapping table. */
   41.13  #define MMU_MACHPHYS_UPDATE      2 /* ptr = MA of frame to modify entry for */
   41.14  /* An extended command. */
    42.1 --- a/xen/include/xeno/config.h	Sat Dec 20 23:39:49 2003 +0000
    42.2 +++ b/xen/include/xeno/config.h	Sat Dec 20 23:41:19 2003 +0000
    42.3 @@ -145,6 +145,13 @@
    42.4  
    42.5  #define capable(_c) 0
    42.6  
    42.7 +#ifndef NDEBUG
    42.8 +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
    42.9 +                           __FILE__, __LINE__, ## _a)
   42.10 +#else
   42.11 +#define DPRINTK(_f, _a...) ((void)0)
   42.12 +#endif
   42.13 +
   42.14  #ifndef __ASSEMBLY__
   42.15  
   42.16  #include <xeno/compiler.h>
    43.1 --- a/xen/include/xeno/mm.h	Sat Dec 20 23:39:49 2003 +0000
    43.2 +++ b/xen/include/xeno/mm.h	Sat Dec 20 23:41:19 2003 +0000
    43.3 @@ -3,34 +3,35 @@
    43.4  #define __XENO_MM_H__
    43.5  
    43.6  #include <xeno/config.h>
    43.7 +#include <xeno/list.h>
    43.8 +#include <xeno/spinlock.h>
    43.9 +#include <xeno/perfc.h>
   43.10 +#include <xeno/sched.h>
   43.11 +
   43.12 +#include <asm/pgalloc.h>
   43.13  #include <asm/atomic.h>
   43.14  #include <asm/desc.h>
   43.15 -#include <xeno/list.h>
   43.16 +#include <asm/flushtlb.h>
   43.17 +#include <asm/io.h>
   43.18 +
   43.19  #include <hypervisor-ifs/hypervisor-if.h>
   43.20 -#include <xeno/spinlock.h>
   43.21  
   43.22 -/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */
   43.23 +/*
   43.24 + * These are for compatibility with calls to the Linux memory allocators.
   43.25 + */
   43.26  
   43.27 -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */
   43.28  #define __GFP_DMA       0x01
   43.29 -
   43.30 -/* Action modifiers - doesn't change the zoning */
   43.31 +#define GFP_DMA         __GFP_DMA
   43.32  #define __GFP_WAIT      0x10    /* Can wait and reschedule? */
   43.33  #define __GFP_HIGH      0x20    /* Should access emergency pools? */
   43.34  #define __GFP_IO        0x40    /* Can start low memory physical IO? */
   43.35  #define __GFP_HIGHIO    0x80    /* Can start high mem physical IO? */
   43.36  #define __GFP_FS        0x100   /* Can call down to low-level FS? */
   43.37 -
   43.38  #define GFP_ATOMIC      (__GFP_HIGH)
   43.39 -#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
   43.40 +#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \
   43.41 +                         __GFP_HIGHIO | __GFP_FS)
   43.42  
   43.43 -/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
   43.44 -   platforms, used as appropriate on others */
   43.45 -
   43.46 -#define GFP_DMA         __GFP_DMA
   43.47 -
   43.48 -
   43.49 -/******************************************************************************
   43.50 +/*
   43.51   * The following is for page_alloc.c.
   43.52   */
   43.53  
   43.54 @@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int o
   43.55  #define free_page(_p) (__free_pages(_p,0))
   43.56  
   43.57  
   43.58 -/******************************************************************************
   43.59 - * The following is the array of page info. One entry per page owned
   43.60 - * by the hypervisor, indexed from `mem_map', just like Linux.
   43.61 - *
   43.62 - * 12.11.02. We no longer use struct page or mem_map, these are replaced
   43.63 - * with struct pfn_info and frame_table respectively. Boris Dragovic
   43.64 +/*
   43.65 + * Per-page-frame information.
   43.66   */
   43.67  
   43.68 -typedef struct pfn_info {
   43.69 -    struct list_head list;      /* ->mapping has some page lists. */
   43.70 -    unsigned long flags;        /* atomic flags. */
   43.71 -    unsigned long tot_count;    /* Total domain usage count. */
   43.72 -    unsigned long type_count;   /* pagetable/dir, or domain-writeable refs. */
   43.73 -} frame_table_t;
   43.74 -
   43.75 -#define get_page_tot(p)		 ((p)->tot_count++)
   43.76 -#define put_page_tot(p)		 \
   43.77 -    ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; })
   43.78 -#define page_tot_count(p)	 ((p)->tot_count)
   43.79 -#define set_page_tot_count(p,v)  ((p)->tot_count = v)
   43.80 -
   43.81 -#define get_page_type(p)	 ((p)->type_count++)
   43.82 -#define put_page_type(p)	 \
   43.83 -    ({ ASSERT((p)->type_count != 0); --(p)->type_count; })
   43.84 -#define page_type_count(p)	 ((p)->type_count)
   43.85 -#define set_page_type_count(p,v) ((p)->type_count = v)
   43.86 +struct pfn_info
   43.87 +{
   43.88 +    /* Each frame can be threaded onto a doubly-linked list. */
   43.89 +    struct list_head list;
   43.90 +    /* The following possible uses are context-dependent. */
   43.91 +    union {
   43.92 +        /* Page is in use and not a zombie: we keep a pointer to its owner. */
   43.93 +        struct task_struct *domain;
   43.94 +        /* Page is not currently allocated: mask of possibly-tainted TLBs. */
   43.95 +        unsigned long cpu_mask;
   43.96 +        /* Page is a zombie: this word currently has no use. */
   43.97 +        unsigned long _unused;
   43.98 +    } u;
   43.99 +    /* Reference count and various PGC_xxx flags and fields. */
  43.100 +    unsigned long       count_and_flags;
  43.101 +    /* Type reference count and various PGT_xxx flags and fields. */
  43.102 +    unsigned long       type_and_flags;
  43.103 +    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
  43.104 +    unsigned long       tlbflush_timestamp;
  43.105 +};
  43.106  
  43.107 -#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */
  43.108 -/* hypervisor flags (domain == 0) */
  43.109 -#define PG_slab	       24
  43.110 -/* domain flags (domain != 0) */
  43.111 -/*
  43.112 - * NB. The following page types are MUTUALLY EXCLUSIVE.
  43.113 - * At most one can be true at any point, and 'type_count' counts how many
  43.114 - * references exist of the current type. A change in type can only occur
  43.115 - * when type_count == 0.
  43.116 - */
  43.117 -#define PG_type_mask        (15<<24) /* bits 24-27 */
  43.118 -#define PGT_none            (0<<24) /* no special uses of this page */
  43.119 -#define PGT_l1_page_table   (1<<24) /* using this page as an L1 page table? */
  43.120 -#define PGT_l2_page_table   (2<<24) /* using this page as an L2 page table? */
  43.121 -#define PGT_l3_page_table   (3<<24) /* using this page as an L3 page table? */
  43.122 -#define PGT_l4_page_table   (4<<24) /* using this page as an L4 page table? */
  43.123 -#define PGT_gdt_page        (5<<24) /* using this page in a GDT? */
  43.124 -#define PGT_ldt_page        (6<<24) /* using this page in an LDT? */
  43.125 -#define PGT_writeable_page  (7<<24) /* has writable mappings of this page? */
  43.126 + /* The following page types are MUTUALLY EXCLUSIVE. */
  43.127 +#define PGT_none            (0<<29) /* no special uses of this page */
  43.128 +#define PGT_l1_page_table   (1<<29) /* using this page as an L1 page table? */
  43.129 +#define PGT_l2_page_table   (2<<29) /* using this page as an L2 page table? */
  43.130 +#define PGT_l3_page_table   (3<<29) /* using this page as an L3 page table? */
  43.131 +#define PGT_l4_page_table   (4<<29) /* using this page as an L4 page table? */
  43.132 +#define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
  43.133 +#define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
  43.134 +#define PGT_writeable_page  (7<<29) /* has writable mappings of this page? */
  43.135 +#define PGT_type_mask       (7<<29) /* Bits 29-31. */
  43.136 + /* Has this page been validated for use as its current type? */
  43.137 +#define _PGT_validated      28
  43.138 +#define PGT_validated       (1<<_PGT_validated)
  43.139 + /* 28-bit count of uses of this frame as its current type. */
  43.140 +#define PGT_count_mask      ((1<<28)-1)
  43.141  
  43.142 -/*
  43.143 - * This bit indicates that the TLB must be flushed when the type count of this
  43.144 - * frame drops to zero. This is needed on current x86 processors only for
  43.145 - * frames which have guestos-accessible writeable mappings. In this case we
  43.146 - * must prevent stale TLB entries allowing the frame to be written if it used
  43.147 - * for a page table, for example.
  43.148 - * 
  43.149 - * We have this bit because the writeable type is actually also used to pin a
  43.150 - * page when it is used as a disk read buffer. This doesn't require a TLB flush
  43.151 - * because the frame never has a mapping in the TLB.
  43.152 - */
  43.153 -#define PG_need_flush       (1<<28)
  43.154 + /* The owner of this page is dead: 'u.domain' is no longer valid. */
  43.155 +#define _PGC_zombie                   31
  43.156 +#define PGC_zombie                    (1<<_PGC_zombie)
  43.157 + /* For safety, force a TLB flush when this page's type changes. */
  43.158 +#define _PGC_tlb_flush_on_type_change 30
  43.159 +#define PGC_tlb_flush_on_type_change  (1<<_PGC_tlb_flush_on_type_change)
  43.160 + /* Owning guest has pinned this page to its current type? */
  43.161 +#define _PGC_guest_pinned             29
  43.162 +#define PGC_guest_pinned              (1<<_PGC_guest_pinned)
  43.163 + /* Cleared when the owning guest 'frees' this page. */
  43.164 +#define _PGC_allocated                28
  43.165 +#define PGC_allocated                 (1<<_PGC_allocated)
  43.166 + /* 28-bit count of references to this frame. */
  43.167 +#define PGC_count_mask                ((1<<28)-1)
  43.168  
  43.169 -/*
  43.170 - * This bit indicates that the guest OS has pinned the page to its current
  43.171 - * type. For page tables this can avoid the frame scanning and reference-count
  43.172 - * updates that occur when the type count falls to zero.
  43.173 - */
  43.174 -#define PG_guest_pinned     (1<<29)
  43.175 +/* We trust the slab allocator in slab.c, and our use of it. */
  43.176 +#define PageSlab(page)		(1)
  43.177 +#define PageSetSlab(page)	((void)0)
  43.178 +#define PageClearSlab(page)	((void)0)
  43.179 +
  43.180 +#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS)
  43.181  
  43.182 -#define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
  43.183 -#define PageSetSlab(page)	set_bit(PG_slab, &(page)->flags)
  43.184 -#define PageClearSlab(page)	clear_bit(PG_slab, &(page)->flags)
  43.185 -
  43.186 -#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                            \
  43.187 -    do {                                                             \
  43.188 -        (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \
  43.189 -        set_page_tot_count((_pfn), 2);                               \
  43.190 -        set_page_type_count((_pfn), 2);                              \
  43.191 +#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                  \
  43.192 +    do {                                                                   \
  43.193 +        (_pfn)->u.domain = (_dom);                                         \
  43.194 +        wmb(); /* install valid domain ptr before updating refcnt. */      \
  43.195 +        (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \
  43.196 +        (_pfn)->type_and_flags  = PGT_writeable_page | PGT_validated | 1;  \
  43.197      } while ( 0 )
  43.198  
  43.199 -#define UNSHARE_PFN(_pfn)                                            \
  43.200 -    do {                                                             \
  43.201 -        (_pfn)->flags = 0;                                           \
  43.202 -        set_page_tot_count((_pfn), 0);                               \
  43.203 -        set_page_type_count((_pfn), 0);                              \
  43.204 -    } while ( 0 )
  43.205 +#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn)
  43.206  
  43.207 -/* The array of struct pfn_info,  
  43.208 - * free pfn list and number of free pfns in the free list
  43.209 - */
  43.210 -extern frame_table_t * frame_table;
  43.211 +extern struct pfn_info *frame_table;
  43.212  extern unsigned long frame_table_size;
  43.213  extern struct list_head free_list;
  43.214  extern spinlock_t free_list_lock;
  43.215 @@ -140,6 +126,180 @@ extern unsigned int free_pfns;
  43.216  extern unsigned long max_page;
  43.217  void init_frametable(unsigned long nr_pages);
  43.218  
  43.219 +struct pfn_info *alloc_domain_page(struct task_struct *p);
  43.220 +void free_domain_page(struct pfn_info *page);
  43.221 +
  43.222 +int alloc_page_type(struct pfn_info *page, unsigned int type);
  43.223 +void free_page_type(struct pfn_info *page, unsigned int type);
  43.224 +
  43.225 +static inline void put_page(struct pfn_info *page)
  43.226 +{
  43.227 +    unsigned long nx, x, y = page->count_and_flags;
  43.228 +
  43.229 +    do {
  43.230 +        x  = y;
  43.231 +        nx = x - 1;
  43.232 +    }
  43.233 +    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
  43.234 +
  43.235 +    if ( unlikely((nx & PGC_count_mask) == 0) )
  43.236 +        free_domain_page(page);
  43.237 +}
  43.238 +
  43.239 +
  43.240 +static inline int get_page(struct pfn_info *page,
  43.241 +                           struct task_struct *domain)
  43.242 +{
  43.243 +    unsigned long x, nx, y = page->count_and_flags;
  43.244 +    struct task_struct *p, *np = page->u.domain;
  43.245 +
  43.246 +    do {
  43.247 +        x  = y;
  43.248 +        nx = x + 1;
  43.249 +        p  = np;
  43.250 +        if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
  43.251 +             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
  43.252 +             unlikely(x & PGC_zombie) ||             /* Zombie? */
  43.253 +             unlikely(p != domain) )                 /* Wrong owner? */
  43.254 +        {
  43.255 +            DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n",
  43.256 +                    page_to_pfn(page), domain, p, x);
  43.257 +            return 0;
  43.258 +        }
  43.259 +        __asm__ __volatile__(
  43.260 +            LOCK_PREFIX "cmpxchg8b %3"
  43.261 +            : "=a" (np), "=d" (y), "=b" (p),
  43.262 +              "=m" (*(volatile unsigned long long *)(&page->u.domain))
  43.263 +            : "0" (p), "1" (x), "b" (p), "c" (nx) );
  43.264 +    }
  43.265 +    while ( unlikely(np != p) || unlikely(y != x) );
  43.266 +
  43.267 +    return 1;
  43.268 +}
  43.269 +
  43.270 +
  43.271 +static inline void put_page_type(struct pfn_info *page)
  43.272 +{
  43.273 +    unsigned long nx, x, y = page->type_and_flags;
  43.274 +
  43.275 + again:
  43.276 +    do {
  43.277 +        x  = y;
  43.278 +        nx = x - 1;
  43.279 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  43.280 +        {
  43.281 +            page->tlbflush_timestamp = tlbflush_clock;
  43.282 +            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  43.283 +                 likely(nx & PGT_validated) )
  43.284 +            {
  43.285 +                /*
  43.286 +                 * Page-table pages must be unvalidated when count is zero. The
  43.287 +                 * 'free' is safe because the refcnt is non-zero and the
  43.288 +                 * validated bit is clear => other ops will spin or fail.
  43.289 +                 */
  43.290 +                if ( unlikely((y = cmpxchg(&page->type_and_flags, x, 
  43.291 +                                           x & ~PGT_validated)) != x) )
  43.292 +                    goto again;
  43.293 +                /* We cleared the 'valid bit' so we must do the clear up. */
  43.294 +                free_page_type(page, x & PGT_type_mask);
  43.295 +                /* Carry on as we were, but with the 'valid bit' now clear. */
  43.296 +                x  &= ~PGT_validated;
  43.297 +                nx &= ~PGT_validated;
  43.298 +            }
  43.299 +        }
  43.300 +    }
  43.301 +    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
  43.302 +}
  43.303 +
  43.304 +
  43.305 +static inline int get_page_type(struct pfn_info *page, unsigned long type)
  43.306 +{
  43.307 +    unsigned long nx, x, y = page->type_and_flags;
  43.308 + again:
  43.309 +    do {
  43.310 +        x  = y;
  43.311 +        nx = x + 1;
  43.312 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  43.313 +        {
  43.314 +            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  43.315 +            return 0;
  43.316 +        }
  43.317 +        else if ( unlikely((x & PGT_count_mask) == 0) )
  43.318 +        {
  43.319 +            if ( (x & PGT_type_mask) != type )
  43.320 +            {
  43.321 +                nx &= ~(PGT_type_mask | PGT_validated);
  43.322 +                nx |= type;
  43.323 +                /* No extra validation needed for writeable pages. */
  43.324 +                if ( type == PGT_writeable_page )
  43.325 +                    nx |= PGT_validated;
  43.326 +            }
  43.327 +        }
  43.328 +        else if ( unlikely((x & PGT_type_mask) != type) )
  43.329 +        {
  43.330 +            DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n",
  43.331 +                    x & PGT_type_mask, type, page_to_pfn(page));
  43.332 +            return 0;
  43.333 +        }
  43.334 +        else if ( unlikely(!(x & PGT_validated)) )
  43.335 +        {
  43.336 +            /* Someone else is updating validation of this page. Wait... */
  43.337 +            while ( (y = page->type_and_flags) != x )
  43.338 +            {
  43.339 +                rep_nop();
  43.340 +                barrier();
  43.341 +            }
  43.342 +            goto again;
  43.343 +        }
  43.344 +    }
  43.345 +    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
  43.346 +
  43.347 +    if ( unlikely(!(nx & PGT_validated)) )
  43.348 +    {
  43.349 +        /* Try to validate page type; drop the new reference on failure. */
  43.350 +        if ( unlikely(!alloc_page_type(page, type)) )
  43.351 +        {
  43.352 +            DPRINTK("Error while validating pfn %08lx for type %08lx\n",
  43.353 +                    page_to_pfn(page), type);
  43.354 +            put_page_type(page);
  43.355 +            return 0;
  43.356 +        }
  43.357 +        set_bit(_PGT_validated, &page->type_and_flags);
  43.358 +    }
  43.359 +
  43.360 +    return 1;
  43.361 +}
  43.362 +
  43.363 +
  43.364 +static inline void put_page_and_type(struct pfn_info *page)
  43.365 +{
  43.366 +    put_page_type(page);
  43.367 +    put_page(page);
  43.368 +}
  43.369 +
  43.370 +
  43.371 +static inline int get_page_and_type(struct pfn_info *page,
  43.372 +                                    struct task_struct *domain,
  43.373 +                                    unsigned int type)
  43.374 +{
  43.375 +    int rc = get_page(page, domain);
  43.376 +
  43.377 +    if ( likely(rc) && unlikely(!get_page_type(page, type)) )
  43.378 +    {
  43.379 +        put_page(page);
  43.380 +        rc = 0;
  43.381 +    }
  43.382 +
  43.383 +    return rc;
  43.384 +}
  43.385 +
  43.386 +#define ASSERT_PAGE_IS_TYPE(_p, _t)                \
  43.387 +    ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t));  \
  43.388 +    ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0)
  43.389 +#define ASSERT_PAGE_IS_DOMAIN(_p, _d)              \
  43.390 +    ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0);  \
  43.391 +    ASSERT((_p)->u.domain == (_d))
  43.392 +
  43.393  int check_descriptor(unsigned long a, unsigned long b);
  43.394  
  43.395  /*
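
The mm.h changes above are the heart of this changeset: the old single 'flags' word plus separate tot/type counts are replaced by two per-frame words, count_and_flags (general reference count plus the PGC_* bits) and type_and_flags (type-use count plus the PGT_* bits), both manipulated with lock-free cmpxchg loops instead of under the per-domain page_lock. The sketch below is a minimal user-space model of the count_and_flags protocol only; it assumes GCC's __sync_val_compare_and_swap in place of Xen's cmpxchg, checks the owner non-atomically (the real get_page() folds that check into a cmpxchg8b over the adjacent u.domain word), and uses stand-in names throughout.

    /* Minimal user-space model of the new count_and_flags protocol.
     * Stand-ins: __sync_val_compare_and_swap for Xen's cmpxchg, a plain
     * 'owner' pointer for u.domain, and a stub free_domain_page(). */
    #include <stdio.h>

    #define PGC_zombie     (1UL << 31)
    #define PGC_allocated  (1UL << 28)
    #define PGC_count_mask ((1UL << 28) - 1)

    struct pfn_info_model {
        unsigned long count_and_flags;
        void *owner;                       /* stands in for u.domain */
    };

    static void free_domain_page_model(struct pfn_info_model *pg)
    {
        printf("frame %p: last reference dropped, freeing\n", (void *)pg);
    }

    /* get_page(): take a reference unless the frame is free, about to
     * overflow, a zombie, or owned by another domain.  (The real code
     * checks the owner and the count in one cmpxchg8b; this model checks
     * the owner separately for simplicity.) */
    static int get_page_model(struct pfn_info_model *pg, void *domain)
    {
        unsigned long x, nx, y = pg->count_and_flags;
        do {
            x  = y;
            nx = x + 1;
            if ( (x & PGC_count_mask) == 0 ||    /* not allocated? */
                 (nx & PGC_count_mask) == 0 ||   /* count overflow? */
                 (x & PGC_zombie) ||             /* owner is dead? */
                 pg->owner != domain )           /* wrong owner? */
                return 0;
            y = __sync_val_compare_and_swap(&pg->count_and_flags, x, nx);
        } while ( y != x );
        return 1;
    }

    /* put_page(): drop a reference; free the frame when the count hits 0. */
    static void put_page_model(struct pfn_info_model *pg)
    {
        unsigned long x, nx, y = pg->count_and_flags;
        do {
            x  = y;
            nx = x - 1;
            y = __sync_val_compare_and_swap(&pg->count_and_flags, x, nx);
        } while ( y != x );
        if ( (nx & PGC_count_mask) == 0 )
            free_domain_page_model(pg);
    }

    int main(void)
    {
        void *dom = (void *)0x1;
        struct pfn_info_model pg = { PGC_allocated | 1, dom };
        if ( get_page_model(&pg, dom) )   /* count 1 -> 2 */
            put_page_model(&pg);          /* count 2 -> 1 */
        put_page_model(&pg);              /* count 1 -> 0: frame is freed */
        return 0;
    }

The pay-off, visible in the sched.h and dev.c hunks below, is that the per-domain page_lock disappears: every ownership or type transition is a single atomic update on the frame itself, so only list membership still needs a spinlock.
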
    44.1 --- a/xen/include/xeno/perfc.h	Sat Dec 20 23:39:49 2003 +0000
    44.2 +++ b/xen/include/xeno/perfc.h	Sat Dec 20 23:41:19 2003 +0000
    44.3 @@ -1,6 +1,6 @@
    44.4 -/*
    44.5 - * xen performance counters
    44.6 - */
    44.7 +
    44.8 +#ifndef __XENO_PERFC_H__
    44.9 +#define __XENO_PERFC_H__
   44.10  
   44.11  #include <asm/atomic.h>
   44.12  
   44.13 @@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters
   44.14  #define perfc_addc(x,y)   atomic_add((y), &perfcounters.x[smp_processor_id()])
   44.15  #define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y])
   44.16  
   44.17 +#endif /* __XENO_PERFC_H__ */
    45.1 --- a/xen/include/xeno/perfc_defn.h	Sat Dec 20 23:39:49 2003 +0000
    45.2 +++ b/xen/include/xeno/perfc_defn.h	Sat Dec 20 23:41:19 2003 +0000
    45.3 @@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hy
    45.4  PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" )
    45.5  PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" )
    45.6  PERFCOUNTER( net_rx_delivered, "net rx delivered" )
    45.7 -PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" )
    45.8  PERFCOUNTER( net_tx_transmitted, "net tx transmitted" )
    45.9  
   45.10  PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" )
    46.1 --- a/xen/include/xeno/sched.h	Sat Dec 20 23:39:49 2003 +0000
    46.2 +++ b/xen/include/xeno/sched.h	Sat Dec 20 23:41:19 2003 +0000
    46.3 @@ -4,7 +4,6 @@
    46.4  #include <xeno/config.h>
    46.5  #include <xeno/types.h>
    46.6  #include <xeno/spinlock.h>
    46.7 -#include <asm/page.h>
    46.8  #include <asm/ptrace.h>
    46.9  #include <xeno/smp.h>
   46.10  #include <asm/processor.h>
   46.11 @@ -16,7 +15,6 @@
   46.12  #include <xeno/time.h>
   46.13  #include <xeno/ac_timer.h>
   46.14  #include <xeno/delay.h>
   46.15 -#include <xeno/slab.h>
   46.16  
   46.17  #define MAX_DOMAIN_NAME 16
   46.18  
   46.19 @@ -94,9 +92,10 @@ struct task_struct
   46.20      
   46.21      unsigned int domain;        /* domain id */
   46.22  
   46.23 -    struct list_head pg_head;
   46.24 -    unsigned int tot_pages;     /* number of pages currently possesed */
   46.25 -    unsigned int max_pages;     /* max number of pages that can be possesed */
   46.26 +    spinlock_t       page_list_lock;
   46.27 +    struct list_head page_list;
   46.28 +    spinlock_t       page_list_lock;
   46.28 +    unsigned int     tot_pages; /* number of pages currently possessed */
   46.29 +    unsigned int     max_pages; /* max number of pages that can be possessed */
   46.30  
   46.31      /* scheduling */
   46.32      struct list_head run_list;
   46.33 @@ -132,8 +131,6 @@ struct task_struct
   46.34  
   46.35      /* VM */
   46.36      struct mm_struct mm;
   46.37 -    /* We need this lock to check page types and frob reference counts. */
   46.38 -    spinlock_t page_lock;
   46.39  
   46.40      mm_segment_t addr_limit;
   46.41  
   46.42 @@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_
   46.43  
   46.44  #define STACK_SIZE PAGE_SIZE
   46.45  
   46.46 +#include <xeno/slab.h>
   46.47 +
   46.48  extern kmem_cache_t *task_struct_cachep;
   46.49  #define alloc_task_struct()  \
   46.50    ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
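
With the per-domain page_lock gone, the only lock left in task_struct for memory bookkeeping is page_list_lock, and it guards nothing but membership of page_list (and the tot_pages tally); per-frame state lives entirely in the atomic count_and_flags / type_and_flags words. Below is a minimal user-space sketch of that split, using a pthread mutex as a stand-in for spinlock_t and simplified list helpers; the names are illustrative only.

    /* Sketch: list membership under page_list_lock, nothing else.
     * pthread_mutex_t stands in for Xen's spinlock_t. */
    #include <pthread.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_add_model(struct list_head *n, struct list_head *h)
    {
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
    }

    struct pfn_info_model    { struct list_head list; unsigned long count_and_flags; };
    struct task_struct_model {
        pthread_mutex_t  page_list_lock;   /* guards page_list + tot_pages only */
        struct list_head page_list;
        unsigned int     tot_pages;
    };

    /* Hand a frame to a domain: the reference counts are set up by the
     * atomic helpers (see the mm.h sketch above); only the list needs
     * the lock. */
    static void give_page_model(struct task_struct_model *p,
                                struct pfn_info_model *pg)
    {
        pthread_mutex_lock(&p->page_list_lock);
        list_add_model(&pg->list, &p->page_list);
        p->tot_pages++;
        pthread_mutex_unlock(&p->page_list_lock);
    }

    int main(void)
    {
        struct task_struct_model d = { PTHREAD_MUTEX_INITIALIZER,
                                       { &d.page_list, &d.page_list }, 0 };
        struct pfn_info_model pg = { { 0, 0 }, 0 };
        give_page_model(&d, &pg);
        printf("tot_pages = %u\n", d.tot_pages);
        return 0;
    }

This mirrors how deliver_packet() and flush_bufs_for_vif() in the dev.c hunks below take page_list_lock only around list_add()/list_del().
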
    47.1 --- a/xen/include/xeno/vif.h	Sat Dec 20 23:39:49 2003 +0000
    47.2 +++ b/xen/include/xeno/vif.h	Sat Dec 20 23:41:19 2003 +0000
    47.3 @@ -34,7 +34,7 @@ extern struct net_device *the_dev;
    47.4  typedef struct rx_shadow_entry_st 
    47.5  {
    47.6      unsigned short id;
    47.7 -    unsigned short flush_count; /* 16 bits should be enough */
    47.8 +    unsigned short _pad;
    47.9      unsigned long  pte_ptr;
   47.10      unsigned long  buf_pfn;
   47.11  } rx_shadow_entry_t;
    48.1 --- a/xen/net/dev.c	Sat Dec 20 23:39:49 2003 +0000
    48.2 +++ b/xen/net/dev.c	Sat Dec 20 23:41:19 2003 +0000
    48.3 @@ -39,12 +39,6 @@
    48.4  #define rtnl_lock() ((void)0)
    48.5  #define rtnl_unlock() ((void)0)
    48.6  
    48.7 -#if 0
    48.8 -#define DPRINTK(_f, _a...) printk(_f , ## _a)
    48.9 -#else 
   48.10 -#define DPRINTK(_f, _a...) ((void)0)
   48.11 -#endif
   48.12 -
   48.13  #define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
   48.14  #define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
   48.15  #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
   48.16 @@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[N
   48.17  
   48.18  static int get_tx_bufs(net_vif_t *vif);
   48.19  
   48.20 -static void __make_tx_response(net_vif_t *vif, 
   48.21 -                               unsigned short id, 
   48.22 -                               unsigned char  st);
   48.23 +static void make_tx_response(net_vif_t     *vif, 
   48.24 +                             unsigned short id, 
   48.25 +                             unsigned char  st);
   48.26  static void make_rx_response(net_vif_t     *vif, 
   48.27                               unsigned short id, 
   48.28                               unsigned short size,
   48.29 @@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_
   48.30  void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
   48.31  {
   48.32      rx_shadow_entry_t *rx;
   48.33 -    unsigned long *ptep; 
   48.34 +    unsigned long *ptep, pte; 
   48.35      struct pfn_info *old_page, *new_page, *pte_page;
   48.36      unsigned int i; 
   48.37      unsigned short size;
   48.38      unsigned char  offset, status = RING_STATUS_OK;
   48.39 +    struct task_struct *p = vif->domain;
   48.40  
   48.41      memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
   48.42      if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
   48.43          memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
   48.44  
   48.45 -    /*
   48.46 -     * Slightly gross: we need the page_lock so that we can do PTE checking.
   48.47 -     * However, we take it slightly early so that it can protect the update
   48.48 -     * of rx_cons. This saves us from grabbing two locks.
   48.49 -     */
   48.50 -    spin_lock(&vif->domain->page_lock);
   48.51 +    spin_lock(&vif->rx_lock);
   48.52  
   48.53      if ( (i = vif->rx_cons) == vif->rx_prod )
   48.54      {
   48.55 -        spin_unlock(&vif->domain->page_lock);
   48.56 +        spin_unlock(&vif->rx_lock);
   48.57          perfc_incr(net_rx_capacity_drop);
   48.58          return;
   48.59      }
   48.60 -    rx = vif->rx_shadow_ring + i;
   48.61 +    rx = &vif->rx_shadow_ring[i];
   48.62      vif->rx_cons = RX_RING_INC(i);
   48.63  
   48.64      size   = (unsigned short)skb->len;
   48.65      offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
   48.66  
   48.67 -    /* Release the page-table page. */
   48.68 -    pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
   48.69 -    put_page_type(pte_page);
   48.70 -    put_page_tot(pte_page);
   48.71 -
   48.72 -    old_page = frame_table + rx->buf_pfn;
   48.73 +    pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT];
   48.74 +    old_page = &frame_table[rx->buf_pfn];
   48.75      new_page = skb->pf;
   48.76      
   48.77      ptep = map_domain_mem(rx->pte_ptr);
   48.78  
   48.79 -    if ( (*ptep & _PAGE_PRESENT) )
   48.80 +    new_page->u.domain = p;
   48.81 +    wmb(); /* make dom ptr visible before updating refcnt. */
   48.82 +    spin_lock(&p->page_list_lock);
   48.83 +    list_add(&new_page->list, &p->page_list);
   48.84 +    new_page->count_and_flags = PGC_allocated | 2;
   48.85 +    spin_unlock(&p->page_list_lock);
   48.86 +    get_page_type(new_page, PGT_writeable_page);
   48.87 +    set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags);
   48.88 +    wmb(); /* Get type count and set flush bit before updating PTE. */
   48.89 +
   48.90 +    pte = *ptep;
   48.91 +    if ( unlikely(pte & _PAGE_PRESENT) || 
   48.92 +         unlikely(cmpxchg(ptep, pte, 
   48.93 +                          (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
   48.94 +                          ((new_page - frame_table) << PAGE_SHIFT))) != pte )
   48.95      {
   48.96 -        /* Bail out if the PTE has been reused under our feet. */
   48.97 -        list_add(&old_page->list, &vif->domain->pg_head);
   48.98 -        old_page->flags = vif->domain->domain;
   48.99          unmap_domain_mem(ptep);
  48.100 -        spin_unlock(&vif->domain->page_lock);
  48.101          status = RING_STATUS_BAD_PAGE;
  48.102          goto out;
  48.103      }
  48.104  
  48.105 -    /* Give the new page to the domain, marking it writeable. */
  48.106 -    set_page_type_count(new_page, 1);
  48.107 -    set_page_tot_count(new_page, 1);
  48.108 -    new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
  48.109 -    list_add(&new_page->list, &vif->domain->pg_head);
  48.110 -    
  48.111 -    /* Patch the PTE to map the new page as writeable. */
  48.112      machine_to_phys_mapping[new_page - frame_table] 
  48.113 -        = machine_to_phys_mapping[old_page - frame_table];        
  48.114 -    *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
  48.115 -        (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
  48.116 +        = machine_to_phys_mapping[old_page - frame_table];
  48.117      
  48.118      unmap_domain_mem(ptep);
  48.119  
  48.120 -    spin_unlock(&vif->domain->page_lock);
  48.121 -    
  48.122      /* Our skbuff now points at the guest's old frame. */
  48.123      skb->pf = old_page;
  48.124  
  48.125      /* Updates must happen before releasing the descriptor. */
  48.126      smp_wmb();
  48.127  
  48.128 -    /*
  48.129 -     * NB. The remote flush here should be safe, as we hold no locks. The 
  48.130 -     * network driver that called us should also have no nasty locks.
  48.131 -     */
  48.132 -    if ( rx->flush_count == (unsigned short)
  48.133 -         atomic_read(&tlb_flush_count[vif->domain->processor]) )
  48.134 -    {
  48.135 -        perfc_incr(net_rx_tlbflush);
  48.136 -        flush_tlb_cpu(vif->domain->processor);
  48.137 -    }
  48.138 -
  48.139      perfc_incr(net_rx_delivered);
  48.140  
  48.141      /* record this so they can be billed */
  48.142 @@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb,
  48.143      vif->total_bytes_received += size;
  48.144  
  48.145   out:
  48.146 +    put_page_and_type(pte_page);
  48.147      make_rx_response(vif, rx->id, size, status, offset);
  48.148 +    spin_unlock(&vif->rx_lock);
  48.149  }
  48.150  
  48.151  /**
  48.152 @@ -785,8 +761,8 @@ static void net_tx_action(unsigned long 
  48.153          skb->mac.raw  = skb->data; 
  48.154          skb->guest_id = tx->id;
  48.155          
  48.156 -        skb_shinfo(skb)->frags[0].page        = frame_table +
  48.157 -            (tx->payload >> PAGE_SHIFT);
  48.158 +        skb_shinfo(skb)->frags[0].page        = 
  48.159 +            &frame_table[tx->payload >> PAGE_SHIFT];
  48.160          skb_shinfo(skb)->frags[0].size        = tx->size - PKT_PROT_LEN;
  48.161          skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
  48.162          skb_shinfo(skb)->nr_frags = 1;
  48.163 @@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buf
  48.164  
  48.165      vif = skb->src_vif;
  48.166      
  48.167 -    spin_lock(&vif->domain->page_lock);
  48.168      for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
  48.169 -        put_page_tot(skb_shinfo(skb)->frags[i].page);
  48.170 -    spin_unlock(&vif->domain->page_lock);
  48.171 +        put_page(skb_shinfo(skb)->frags[i].page);
  48.172      
  48.173      if ( skb->skb_type == SKB_NODATA )
  48.174          kmem_cache_free(net_header_cachep, skb->head);
  48.175 @@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buf
  48.176      skb_shinfo(skb)->nr_frags = 0; 
  48.177      
  48.178      spin_lock(&vif->tx_lock);
  48.179 -    __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
  48.180 +    make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
  48.181      spin_unlock(&vif->tx_lock);
  48.182      
  48.183      /*
  48.184 @@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif)
  48.185          if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
  48.186          {
  48.187              DPRINTK("Bad packet size: %d\n", tx.size);
  48.188 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.189 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.190              continue; 
  48.191          }
  48.192  
  48.193 @@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif)
  48.194          vif->remaining_credit -= tx.size;
  48.195  
  48.196          /* No crossing a page boundary as the payload mustn't fragment. */
  48.197 -        if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 
  48.198 +        if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) ) 
  48.199          {
  48.200              DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 
  48.201                      tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
  48.202 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.203 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.204              continue;
  48.205          }
  48.206  
  48.207          buf_pfn  = tx.addr >> PAGE_SHIFT;
  48.208          buf_page = frame_table + buf_pfn;
  48.209 -        spin_lock(&p->page_lock);
  48.210 -        if ( (buf_pfn >= max_page) || 
  48.211 -             ((buf_page->flags & PG_domain_mask) != p->domain) ) 
  48.212 +        if ( unlikely(buf_pfn >= max_page) || 
  48.213 +             unlikely(!get_page(buf_page, p)) )
  48.214          {
  48.215              DPRINTK("Bad page frame\n");
  48.216 -            spin_unlock(&p->page_lock);
  48.217 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.218 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.219              continue;
  48.220          }
  48.221              
  48.222 @@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif)
  48.223              init_tx_header(vif, g_data, tx.size, the_dev));
  48.224          if ( protocol == 0 )
  48.225          {
  48.226 -            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.227 -            goto tx_unmap_and_continue;
  48.228 +            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.229 +            goto cleanup_and_continue;
  48.230          }
  48.231  
  48.232          target = net_get_target_vif(g_data, tx.size, vif);
  48.233 @@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif)
  48.234              /* Local delivery */
  48.235              if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
  48.236              {
  48.237 -                __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.238 +                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
  48.239                  put_vif(target);
  48.240 -                goto tx_unmap_and_continue;
  48.241 +                goto cleanup_and_continue;
  48.242              }
  48.243  
  48.244              skb->src_vif = vif;
  48.245 @@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif)
  48.246              if ( netif_rx(skb) == NET_RX_DROP )
  48.247                  kfree_skb(skb);
  48.248  
  48.249 -            __make_tx_response(vif, tx.id, RING_STATUS_OK);
  48.250 +            make_tx_response(vif, tx.id, RING_STATUS_OK);
  48.251          }
  48.252          else if ( (target == VIF_PHYS) || IS_PRIV(p) )
  48.253          {
  48.254 @@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif)
  48.255                  kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
  48.256              if ( vif->tx_shadow_ring[j].header == NULL )
  48.257              { 
  48.258 -                __make_tx_response(vif, tx.id, RING_STATUS_OK);
  48.259 -                goto tx_unmap_and_continue;
  48.260 +                make_tx_response(vif, tx.id, RING_STATUS_OK);
  48.261 +                goto cleanup_and_continue;
  48.262              }
  48.263  
  48.264              memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
  48.265              vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
  48.266 -            get_page_tot(buf_page);
  48.267 +            buf_page = NULL; /* hand off our page reference */
  48.268              j = TX_RING_INC(j);
  48.269          }
  48.270          else
  48.271          {
  48.272 -            __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
  48.273 +            make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
  48.274          }
  48.275  
  48.276 -    tx_unmap_and_continue:
  48.277 +    cleanup_and_continue:
  48.278 +        if ( buf_page != NULL )
  48.279 +            put_page(buf_page);
  48.280          unmap_domain_mem(g_data);
  48.281 -        spin_unlock(&p->page_lock);
  48.282      }
  48.283  
  48.284      /*
  48.285 @@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif)
  48.286  }
  48.287  
  48.288  
  48.289 -static long get_bufs_from_vif(net_vif_t *vif)
  48.290 +static void get_rx_bufs(net_vif_t *vif)
  48.291  {
  48.292 -    net_ring_t *shared_rings;
  48.293 -    net_idx_t *shared_idxs;
  48.294 +    struct task_struct *p = vif->domain;
  48.295 +    net_ring_t *shared_rings = vif->shared_rings;
  48.296 +    net_idx_t *shared_idxs = vif->shared_idxs;
  48.297      unsigned int i, j;
  48.298      rx_req_entry_t rx;
  48.299      unsigned long  pte_pfn, buf_pfn;
  48.300      struct pfn_info *pte_page, *buf_page;
  48.301 -    struct task_struct *p = vif->domain;
  48.302 -    unsigned long *ptep;    
  48.303 -
  48.304 -    shared_idxs  = vif->shared_idxs;
  48.305 -    shared_rings = vif->shared_rings;
  48.306 -        
  48.307 -    /*
  48.308 -     * PHASE 1 -- TRANSMIT RING
  48.309 -     */
  48.310 +    unsigned long *ptep, pte;
  48.311  
  48.312 -    if ( get_tx_bufs(vif) )
  48.313 -    {
  48.314 -        add_to_net_schedule_list_tail(vif);
  48.315 -        maybe_schedule_tx_action();
  48.316 -    }
  48.317 -
  48.318 -    /*
  48.319 -     * PHASE 2 -- RECEIVE RING
  48.320 -     */
  48.321 +    spin_lock(&vif->rx_lock);
  48.322  
  48.323      /*
  48.324       * Collect up new receive buffers. We collect up to the guest OS's new
  48.325 @@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t 
  48.326      {
  48.327          rx = shared_rings->rx_ring[i].req;
  48.328  
  48.329 -        pte_pfn = rx.addr >> PAGE_SHIFT;
  48.330 -        pte_page = frame_table + pte_pfn;
  48.331 +        pte_pfn  = rx.addr >> PAGE_SHIFT;
  48.332 +        pte_page = &frame_table[pte_pfn];
  48.333              
  48.334 -        spin_lock(&p->page_lock);
  48.335 -        if ( (pte_pfn >= max_page) || 
  48.336 -             ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != 
  48.337 -              (PGT_l1_page_table | p->domain)) ) 
  48.338 +        /* The address passed down must be to a valid PTE. */
  48.339 +        if ( unlikely(pte_pfn >= max_page) ||
  48.340 +             unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) )
  48.341          {
  48.342              DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
  48.343 -                    p->domain, pte_pfn, max_page, pte_page->flags);
  48.344 -            spin_unlock(&p->page_lock);
  48.345 +                    p->domain, pte_pfn, max_page, pte_page->type_and_flags);
  48.346              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  48.347              continue;
  48.348          }
  48.349 -            
  48.350 +        
  48.351          ptep = map_domain_mem(rx.addr);
  48.352 -            
  48.353 -        if ( !(*ptep & _PAGE_PRESENT) )
  48.354 +        pte  = *ptep;
  48.355 +        
  48.356 +        /* We must be passed a valid writeable mapping to swizzle. */
  48.357 +        if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != 
  48.358 +                      (_PAGE_PRESENT|_PAGE_RW)) ||
  48.359 +             unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) )
  48.360          {
  48.361 -            DPRINTK("Invalid PTE passed down (not present)\n");
  48.362 +            DPRINTK("Invalid PTE passed down (not present or changing)\n");
  48.363 +            put_page_and_type(pte_page);
  48.364 +            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  48.365 +            goto rx_unmap_and_continue;
  48.366 +        }
  48.367 +        
  48.368 +        buf_pfn  = pte >> PAGE_SHIFT;
  48.369 +        buf_page = &frame_table[buf_pfn];
  48.370 +
  48.371 +        /*
  48.372 +         * The page must belong to the correct domain, and must be mapped
  48.373 +         * just once as a writeable page.
  48.374 +         */
  48.375 +        if ( unlikely(buf_page->u.domain != p) ||
  48.376 +             unlikely(!test_and_clear_bit(_PGC_allocated, 
  48.377 +                                          &buf_page->count_and_flags)) ||
  48.378 +             unlikely(cmpxchg(&buf_page->type_and_flags, 
  48.379 +                              PGT_writeable_page|PGT_validated|1,
  48.380 +                              0) != (PGT_writeable_page|PGT_validated|1)) )
  48.381 +        {
  48.382 +            DPRINTK("Bad domain or page mapped writeable more than once.\n");
  48.383 +            if ( buf_page->u.domain == p )
  48.384 +                set_bit(_PGC_allocated, &buf_page->count_and_flags);
  48.385 +            if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
  48.386 +                          (pte & ~_PAGE_PRESENT)) )
  48.387 +                put_page_and_type(buf_page);
  48.388 +            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  48.389 +            goto rx_unmap_and_continue;
  48.390 +        }
  48.391 +
  48.392 +        /*
  48.393 +         * Now ensure that we can take the last references to this page.
  48.394 +         * The final count should be 2, because of PGC_allocated.
  48.395 +         */
  48.396 +        if ( unlikely(cmpxchg(&buf_page->count_and_flags, 
  48.397 +                              PGC_tlb_flush_on_type_change | 2, 0) != 
  48.398 +                      (PGC_tlb_flush_on_type_change | 2)) )
  48.399 +        {
  48.400 +            DPRINTK("Page held more than once\n");
  48.401 +            /* Leave the page unmapped at 'ptep'. Stoopid domain! */
  48.402              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  48.403              goto rx_unmap_and_continue;
  48.404          }
  48.405              
  48.406 -        buf_pfn  = *ptep >> PAGE_SHIFT;
  48.407 -        buf_page = frame_table + buf_pfn;
  48.408 +        /* Remove from the domain's allocation list. */
  48.409 +        spin_lock(&p->page_list_lock);
  48.410 +        list_del(&buf_page->list);
  48.411 +        spin_unlock(&p->page_list_lock);
  48.412  
  48.413 -        if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
  48.414 -              (PGT_writeable_page | p->domain)) || 
  48.415 -             (page_tot_count(buf_page) != 1) )
  48.416 -        {
  48.417 -            DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
  48.418 -                    page_type_count(buf_page), page_tot_count(buf_page), 
  48.419 -                    buf_page->flags);
  48.420 -            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
  48.421 -            goto rx_unmap_and_continue;
  48.422 -        }
  48.423 -            
  48.424 -        /*
  48.425 -         * The pte they passed was good, so take it away from them. We also
  48.426 -         * lock down the page-table page, so it doesn't go away.
  48.427 -         */
  48.428 -        get_page_type(pte_page);
  48.429 -        get_page_tot(pte_page);
  48.430 -        *ptep &= ~_PAGE_PRESENT;
  48.431 -        buf_page->flags = 0;
  48.432 -        set_page_type_count(buf_page, 0);
  48.433 -        set_page_tot_count(buf_page, 0);
  48.434 -        list_del(&buf_page->list);
  48.435 -
  48.436 -        vif->rx_shadow_ring[j].id          = rx.id;
  48.437 -        vif->rx_shadow_ring[j].pte_ptr     = rx.addr;
  48.438 -        vif->rx_shadow_ring[j].buf_pfn     = buf_pfn;
  48.439 -        vif->rx_shadow_ring[j].flush_count = (unsigned short) 
  48.440 -            atomic_read(&tlb_flush_count[smp_processor_id()]);
  48.441 +        vif->rx_shadow_ring[j].id      = rx.id;
  48.442 +        vif->rx_shadow_ring[j].pte_ptr = rx.addr;
  48.443 +        vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
  48.444          j = RX_RING_INC(j);
  48.445              
  48.446      rx_unmap_and_continue:
  48.447          unmap_domain_mem(ptep);
  48.448 -        spin_unlock(&p->page_lock);
  48.449      }
  48.450  
  48.451      vif->rx_req_cons = i;
  48.452 @@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t 
  48.453          vif->rx_prod = j;
  48.454      }
  48.455  
  48.456 +    spin_unlock(&vif->rx_lock);
  48.457 +}
  48.458 +
  48.459 +
  48.460 +static long get_bufs_from_vif(net_vif_t *vif)
  48.461 +{
  48.462 +    if ( get_tx_bufs(vif) )
  48.463 +    {
  48.464 +        add_to_net_schedule_list_tail(vif);
  48.465 +        maybe_schedule_tx_action();
  48.466 +    }
  48.467 +
  48.468 +    get_rx_bufs(vif);
  48.469 +
  48.470      return 0;
  48.471  }
  48.472  
  48.473 @@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t 
  48.474  long flush_bufs_for_vif(net_vif_t *vif)
  48.475  {
  48.476      int i;
  48.477 -    unsigned long *pte;
  48.478 +    unsigned long *ptep, pte;
  48.479      struct pfn_info *page;
  48.480      struct task_struct *p = vif->domain;
  48.481      rx_shadow_entry_t *rx;
  48.482 @@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
  48.483      net_idx_t *shared_idxs = vif->shared_idxs;
  48.484  
  48.485      /* Return any outstanding receive buffers to the guest OS. */
  48.486 -    spin_lock(&p->page_lock);
  48.487 +    spin_lock(&vif->rx_lock);
  48.488      for ( i = vif->rx_req_cons; 
  48.489            (i != shared_idxs->rx_req_prod) && 
  48.490                (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 
  48.491 @@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif)
  48.492      {
  48.493          rx = &vif->rx_shadow_ring[i];
  48.494  
  48.495 -        /* Release the page-table page. */
  48.496 -        page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
  48.497 -        put_page_type(page);
  48.498 -        put_page_tot(page);
  48.499 -
  48.500          /* Give the buffer page back to the domain. */
  48.501 -        page = frame_table + rx->buf_pfn;
  48.502 -        list_add(&page->list, &p->pg_head);
  48.503 -        page->flags = vif->domain->domain;
  48.504 +        page = &frame_table[rx->buf_pfn];
  48.505 +        spin_lock(&p->page_list_lock);
  48.506 +        list_add(&page->list, &p->page_list);
  48.507 +        page->count_and_flags = PGC_allocated | 2;
  48.508 +        spin_unlock(&p->page_list_lock);
  48.509 +        get_page_type(page, PGT_writeable_page);
  48.510 +        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
  48.511 +        wmb();
  48.512  
  48.513          /* Patch up the PTE if it hasn't changed under our feet. */
  48.514 -        pte = map_domain_mem(rx->pte_ptr);
  48.515 -        if ( !(*pte & _PAGE_PRESENT) )
  48.516 -        {
  48.517 -            *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) | 
  48.518 -                _PAGE_RW | _PAGE_PRESENT;
  48.519 -            page->flags |= PGT_writeable_page | PG_need_flush;
  48.520 -            set_page_type_count(page, 1);
  48.521 -            set_page_tot_count(page, 1);
  48.522 -        }
  48.523 -        unmap_domain_mem(pte);
  48.524 +        ptep = map_domain_mem(rx->pte_ptr);
  48.525 +        pte  = *ptep;
  48.526 +        if ( unlikely(pte & _PAGE_PRESENT) ||
  48.527 +             unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) | 
  48.528 +                              (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT)
  48.529 +                      != pte) )
  48.530 +            put_page_and_type(page);
  48.531 +        unmap_domain_mem(ptep);
  48.532 +
  48.533 +        put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
  48.534  
  48.535          make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
  48.536      }
  48.537      vif->rx_cons = i;
  48.538 -    spin_unlock(&p->page_lock);
  48.539 +    spin_unlock(&vif->rx_lock);
  48.540  
  48.541      /*
  48.542       * Flush pending transmit buffers. The guest may still have to wait for
  48.543 @@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
  48.544                (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 
  48.545            i = TX_RING_INC(i) )
  48.546      {
  48.547 -        __make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
  48.548 +        make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
  48.549                             RING_STATUS_DROPPED);
  48.550      }
  48.551      vif->tx_req_cons = i;
  48.552 @@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop)
  48.553  }
  48.554  
  48.555  
  48.556 -static void __make_tx_response(net_vif_t     *vif, 
  48.557 -                               unsigned short id, 
  48.558 -                               unsigned char  st)
  48.559 +static void make_tx_response(net_vif_t     *vif, 
  48.560 +                             unsigned short id, 
  48.561 +                             unsigned char  st)
  48.562  {
  48.563      unsigned int pos;
  48.564      tx_resp_entry_t *resp;
  48.565 @@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t  
  48.566      rx_resp_entry_t *resp;
  48.567  
  48.568      /* Place on the response ring for the relevant domain. */ 
  48.569 -    spin_lock(&vif->rx_lock);
  48.570      pos  = vif->rx_resp_prod;
  48.571      resp = &vif->shared_rings->rx_ring[pos].resp;
  48.572      resp->id     = id;
  48.573 @@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t  
  48.574          unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
  48.575          guest_event_notify(cpu_mask);    
  48.576      }
  48.577 -    spin_unlock(&vif->rx_lock);
  48.578  }
  48.579  
  48.580  
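
The driver changes above replace the old page_lock discipline with two ideas: take per-frame references (get_page / get_page_and_type) while a buffer is in flight, and treat the guest PTE itself as the commit point, installing or restoring it with cmpxchg and backing the reference counts out if the PTE changed underneath (the RING_STATUS_BAD_PAGE paths). Below is a compressed user-space model of just the PTE-swizzle step used when a received packet's frame is flipped into the guest; the names are stand-ins and the reference-count hand-off is reduced to comments.

    /* User-space model of the receive-path PTE swizzle: install the new
     * mapping only if the slot is still the not-present value left when
     * the buffer was posted.  __sync_val_compare_and_swap stands in for
     * Xen's cmpxchg. */
    #include <stdio.h>

    #define _PAGE_PRESENT 0x001UL
    #define _PAGE_RW      0x002UL
    #define PAGE_SHIFT    12
    #define PAGE_MASK     (~((1UL << PAGE_SHIFT) - 1))

    /* Try to map new_pfn at slot 'ptep', preserving the low flag bits.
     * Returns 1 on success; 0 if the guest raced and reused the slot. */
    static int swizzle_pte(unsigned long *ptep, unsigned long new_pfn)
    {
        unsigned long pte = *ptep;

        if ( pte & _PAGE_PRESENT )      /* guest already remapped the slot */
            return 0;

        /* In dev.c the caller has already given new_pfn's frame to the
         * guest (count_and_flags, type_and_flags, page_list) before this
         * point, and must undo that work if the swap below fails. */
        if ( __sync_val_compare_and_swap(
                 ptep, pte,
                 (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
                 (new_pfn << PAGE_SHIFT)) != pte )
            return 0;

        return 1;
    }

    int main(void)
    {
        unsigned long pte = 0x004;      /* not-present entry left at posting time */
        if ( swizzle_pte(&pte, 0x1234) )
            printf("pte now %#lx\n", pte);
        return 0;
    }

On the posting side, get_rx_bufs() does the mirror image: it clears _PAGE_PRESENT with a cmpxchg and then uses two further cmpxchg operations on type_and_flags and count_and_flags to prove the buffer frame was a mapped-once, writeable page before taking it away from the domain.
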
    49.1 --- a/xen/net/skbuff.c	Sat Dec 20 23:39:49 2003 +0000
    49.2 +++ b/xen/net/skbuff.c	Sat Dec 20 23:41:19 2003 +0000
    49.3 @@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool(
    49.4  
    49.5  static inline u8 *alloc_skb_data_page(struct sk_buff *skb)
    49.6  {
    49.7 -    struct list_head *list_ptr;
    49.8 -    struct pfn_info  *pf;
    49.9 -    unsigned long flags;
   49.10 -        
   49.11 -    spin_lock_irqsave(&free_list_lock, flags);
   49.12 -
   49.13 -    if (!free_pfns) return NULL;
   49.14 -
   49.15 -    list_ptr = free_list.next;
   49.16 -    pf = list_entry(list_ptr, struct pfn_info, list);
   49.17 -    pf->flags = 0;
   49.18 -    list_del(&pf->list);
   49.19 -    free_pfns--;
   49.20 -
   49.21 -    spin_unlock_irqrestore(&free_list_lock, flags);
   49.22 -
   49.23 +    struct pfn_info *pf;
   49.24 +    if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) )
   49.25 +        return NULL;
   49.26      skb->pf = pf;
   49.27      return (u8 *)((pf - frame_table) << PAGE_SHIFT);
   49.28  }
   49.29  
   49.30  static inline void dealloc_skb_data_page(struct sk_buff *skb)
   49.31  {
   49.32 -    struct pfn_info  *pf;
   49.33 +    struct pfn_info *pf = skb->pf;
   49.34      unsigned long flags;
   49.35 -
   49.36 -    pf = skb->pf;
   49.37 -
   49.38      spin_lock_irqsave(&free_list_lock, flags);
   49.39 -        
   49.40 -    pf->flags = 0;
   49.41 -    set_page_type_count(pf, 0);
   49.42 -    set_page_tot_count(pf, 0);
   49.43      list_add(&pf->list, &free_list);
   49.44      free_pfns++;
   49.45 -
   49.46      spin_unlock_irqrestore(&free_list_lock, flags);
   49.47  
   49.48  }
    50.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c	Sat Dec 20 23:39:49 2003 +0000
    50.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c	Sat Dec 20 23:41:19 2003 +0000
    50.3 @@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void)
    50.4          pte = update_debug_queue[i].ptep;
    50.5          if ( pte == NULL ) continue;
    50.6          update_debug_queue[i].ptep = NULL;
    50.7 -        update.ptr = pte;
    50.8 +        update.ptr = virt_to_machine(pte);
    50.9          update.val = update_debug_queue[i].pteval;
   50.10          HYPERVISOR_mmu_update(&update, 1);
   50.11      }
   50.12 @@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsig
   50.13      pgd = pgd_offset_k(va);
   50.14      pmd = pmd_offset(pgd, va);
   50.15      pte = pte_offset(pmd, va);
   50.16 -    update.ptr = pte;
   50.17 +    update.ptr = virt_to_machine(pte);
   50.18      pteval = *(unsigned long *)pte;
   50.19      update.val = pteval & ~_PAGE_PRESENT;
   50.20      HYPERVISOR_mmu_update(&update, 1);
   50.21 @@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(v
   50.22  #if MMU_UPDATE_DEBUG > 0
   50.23          DEBUG_allow_pt_reads();
   50.24  #endif
   50.25 -        queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx);
   50.26 +        queue_multicall2(__HYPERVISOR_mmu_update, 
   50.27 +                         (unsigned long)update_queue, 
   50.28 +                         idx);
   50.29          idx = 0;
   50.30      }
   50.31      spin_unlock_irqrestore(&update_lock, flags);
   50.32 @@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, u
   50.33  #if MMU_UPDATE_DEBUG > 0
   50.34      DEBUG_disallow_pt_read((unsigned long)ptr);
   50.35  #endif
   50.36 -    update_queue[idx].ptr = (unsigned long)ptr;
   50.37 +    update_queue[idx].ptr = virt_to_machine(ptr);
   50.38      update_queue[idx].val = val;
   50.39      increment_index();
   50.40      spin_unlock_irqrestore(&update_lock, flags);
   50.41 @@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, u
   50.42  {
   50.43      unsigned long flags;
   50.44      spin_lock_irqsave(&update_lock, flags);
   50.45 -    update_queue[idx].ptr = (unsigned long)ptr;
   50.46 +    update_queue[idx].ptr = virt_to_machine(ptr);
   50.47      update_queue[idx].val = val;
   50.48      increment_index();
   50.49      spin_unlock_irqrestore(&update_lock, flags);
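
The xenolinux side of the interface change is mechanical: every request pushed onto the MMU update queue now carries the machine address of the PTE slot, obtained with virt_to_machine(), rather than the kernel-virtual pointer, because the hypervisor's new validation code indexes frame_table by machine frame. A small user-space model of the queueing step follows; the fake p2m table, pseudo_phys_base and the mmu_update layout are simplified stand-ins, not xenolinux's real definitions.

    /* User-space model of queue_l1_entry_update(): translate the PTE
     * slot's kernel-virtual address to a machine address before queueing
     * it for the hypervisor. */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define QUEUE_LEN  8

    typedef struct { unsigned long ptr, val; } mmu_update_model_t;

    static unsigned long pseudo_phys_base;                      /* base of "RAM" */
    static unsigned long phys_to_machine[4] = { 7, 3, 9, 1 };   /* fake p2m table */

    static unsigned long virt_to_machine_model(void *v)
    {
        unsigned long phys = (unsigned long)v - pseudo_phys_base;
        return (phys_to_machine[phys >> PAGE_SHIFT] << PAGE_SHIFT)
               | (phys & (PAGE_SIZE - 1));
    }

    static mmu_update_model_t update_queue[QUEUE_LEN];
    static int idx;

    static void queue_l1_entry_update_model(unsigned long *ptep, unsigned long val)
    {
        update_queue[idx].ptr = virt_to_machine_model(ptep); /* machine addr of slot */
        update_queue[idx].val = val;                         /* new PTE contents */
        idx++;                                               /* queue flush omitted */
    }

    int main(void)
    {
        static unsigned long fake_ram[4 * PAGE_SIZE / sizeof(unsigned long)];
        pseudo_phys_base = (unsigned long)fake_ram;

        unsigned long *ptep = &fake_ram[5];    /* some PTE slot in page 0 */
        queue_l1_entry_update_model(ptep, 0x1234007);
        printf("queued ptr=%#lx val=%#lx\n",
               update_queue[0].ptr, update_queue[0].val);
        return 0;
    }
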
    51.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c	Sat Dec 20 23:39:49 2003 +0000
    51.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c	Sat Dec 20 23:41:19 2003 +0000
    51.3 @@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigne
    51.4      }
    51.5      pte = pte_offset(pmd, vaddr);
    51.6  
    51.7 -#if 0 /* Not in Xen, since this breaks clear_fixmap. */
    51.8 -    if (pte_val(*pte))
    51.9 -        pte_ERROR(*pte);
   51.10 -#endif
   51.11 -
   51.12 -    /* We queue directly, avoiding hidden phys->machine translation. */
   51.13 -    queue_l1_entry_update(pte, phys | pgprot_val(prot));
   51.14 +    if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) )
   51.15 +        queue_unchecked_mmu_update(pte, phys | pgprot_val(prot));
   51.16 +    else
   51.17 +        queue_l1_entry_update(pte, phys | pgprot_val(prot));
   51.18  
   51.19      /*
   51.20       * It's enough to flush this one mapping.
   51.21 @@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses i
   51.22          printk("Invalid __set_fixmap\n");
   51.23          return;
   51.24      }
   51.25 -    set_pte_phys(address, phys, 
   51.26 -                 __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags)));
   51.27 +    set_pte_phys(address, phys, flags);
   51.28  }
   51.29  
   51.30  void clear_fixmap(enum fixed_addresses idx)
    52.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c	Sat Dec 20 23:39:49 2003 +0000
    52.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c	Sat Dec 20 23:41:19 2003 +0000
    52.3 @@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long ma
    52.4           */
    52.5          nrpages = size >> PAGE_SHIFT;
    52.6          if (nrpages > NR_FIX_BTMAPS)
    52.7 -                return NULL;
    52.8 +            return NULL;
    52.9  
   52.10          /*
   52.11           * Ok, go for it..
   52.12           */
   52.13          idx = FIX_BTMAP_BEGIN;
   52.14          while (nrpages > 0) {
   52.15 -                set_fixmap(idx, machine_addr);
   52.16 +                __set_fixmap(idx, machine_addr, 
   52.17 +                             __pgprot(__PAGE_KERNEL|_PAGE_IO));
   52.18                  machine_addr += PAGE_SIZE;
   52.19                  --idx;
   52.20                  --nrpages;
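
Finally, the init.c and ioremap.c hunks route device mappings down a separate path: set_pte_phys() sends anything marked _PAGE_IO (or replacing an existing IO mapping) through queue_unchecked_mmu_update(), since machine frames outside the domain's allocation cannot pass the hypervisor's normal PTE validation, while ordinary RAM mappings keep using the checked queue_l1_entry_update() path, and bt_ioremap() now requests _PAGE_IO explicitly via __set_fixmap(). A tiny sketch of that dispatch, with stand-in queue functions and an assumed value for _PAGE_IO:

    /* User-space sketch of the checked/unchecked dispatch added to
     * set_pte_phys(): I/O (device) mappings skip PTE validation, RAM
     * mappings stay on the validated path.  _PAGE_IO, the pte layout
     * and both queue_* functions are stand-ins. */
    #include <stdio.h>

    #define _PAGE_PRESENT 0x001UL
    #define _PAGE_IO      0x200UL   /* assumed software bit for device mappings */

    static void queue_l1_entry_update_model(unsigned long *ptep, unsigned long val)
    {
        printf("checked   update: *%p <- %#lx\n", (void *)ptep, val);
        *ptep = val;
    }

    static void queue_unchecked_mmu_update_model(unsigned long *ptep, unsigned long val)
    {
        printf("unchecked update: *%p <- %#lx\n", (void *)ptep, val);
        *ptep = val;
    }

    static void set_pte_phys_model(unsigned long *ptep, unsigned long phys,
                                   unsigned long prot)
    {
        /* Existing IO mapping, or a new one being requested? Skip validation. */
        if ( (*ptep & _PAGE_IO) || (prot & _PAGE_IO) )
            queue_unchecked_mmu_update_model(ptep, phys | prot);
        else
            queue_l1_entry_update_model(ptep, phys | prot);
    }

    int main(void)
    {
        unsigned long ram_pte = 0, dev_pte = 0;
        set_pte_phys_model(&ram_pte, 0x00100000UL, _PAGE_PRESENT);            /* RAM */
        set_pte_phys_model(&dev_pte, 0xfee00000UL, _PAGE_PRESENT | _PAGE_IO); /* device */
        return 0;
    }
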