ia64/xen-unstable

changeset 84:af2f305e6020

bitkeeper revision 1.13 (3e15d543UbLg8zdxnspCHQplKUlnzQ)

Many files:
Clean up page-table update interface. BIG MODIFICATIONS HERE.
mremap.c, swapfile.c, memory.c, exec.c:
new files
author kaf24@labyrinth.cl.cam.ac.uk
date Fri Jan 03 18:24:03 2003 +0000 (2003-01-03)
parents fe3bada5b25b
children c3e6a52cd801
files .rootkeys xen-2.4.16/arch/i386/Rules.mk xen-2.4.16/arch/i386/entry.S xen-2.4.16/common/memory.c xen-2.4.16/common/page_alloc.c xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h xen-2.4.16/include/xeno/mm.h xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c xenolinux-2.4.16-sparse/arch/xeno/mm/init.c xenolinux-2.4.16-sparse/fs/exec.c xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h xenolinux-2.4.16-sparse/mm/memory.c xenolinux-2.4.16-sparse/mm/mremap.c xenolinux-2.4.16-sparse/mm/swapfile.c
line diff
     1.1 --- a/.rootkeys	Wed Dec 18 18:07:22 2002 +0000
     1.2 +++ b/.rootkeys	Fri Jan 03 18:24:03 2003 +0000
     1.3 @@ -290,6 +290,7 @@ 3ddb79b7aKdTkbr3u6aze8tVwGh_TQ xenolinux
     1.4  3ddb79bbx682YH6vR2zbVOXwg73ULg xenolinux-2.4.16-sparse/drivers/block/ll_rw_blk.c
     1.5  3ddb79bcJfHdwrPsjqgI33_OsGdVCg xenolinux-2.4.16-sparse/drivers/block/rd.c
     1.6  3ddb79bcpVu-IbnqwQqpRqsEbLpsuw xenolinux-2.4.16-sparse/drivers/char/tty_io.c
     1.7 +3e15d5273gfR2fbcYe05kqBSAvCX_w xenolinux-2.4.16-sparse/fs/exec.c
     1.8  3ddb79bba_zKpuurHVeWfgDkyPoq8A xenolinux-2.4.16-sparse/fs/nfs/nfsroot.c
     1.9  3ddb79b8VFtfWSCrXKPN2K21zd_vtw xenolinux-2.4.16-sparse/include/asm-xeno/a.out.h
    1.10  3ddb79b8Zzi13p3OAPV25QgiC3THAQ xenolinux-2.4.16-sparse/include/asm-xeno/apic.h
    1.11 @@ -400,3 +401,6 @@ 3ddb79bbA52x94o6uwDYsbzrH2hjzA xenolinux
    1.12  3ddb79bb_7YG4U75ZmEic9YXWTW7Vw xenolinux-2.4.16-sparse/include/linux/sunrpc/debug.h
    1.13  3ddb79bcxkVPfWlZ1PQKvDrfArzOVw xenolinux-2.4.16-sparse/kernel/panic.c
    1.14  3ddb79bbP31im-mx2NbfthSeqty1Dg xenolinux-2.4.16-sparse/mk
    1.15 +3e15d52e0_j129JPvo7xfYGndVFpwQ xenolinux-2.4.16-sparse/mm/memory.c
    1.16 +3e15d535DLvpzTrLRUIerB69LpJD1g xenolinux-2.4.16-sparse/mm/mremap.c
    1.17 +3e15d531m1Y1_W8ki64AFOU_ua4C4w xenolinux-2.4.16-sparse/mm/swapfile.c
     2.1 --- a/xen-2.4.16/arch/i386/Rules.mk	Wed Dec 18 18:07:22 2002 +0000
     2.2 +++ b/xen-2.4.16/arch/i386/Rules.mk	Fri Jan 03 18:24:03 2003 +0000
     2.3 @@ -8,7 +8,7 @@ MONITOR_BASE := 0xE0100000
     2.4  # Bootloader should load monitor to this real address
     2.5  LOAD_BASE    := 0x00100000
     2.6  CFLAGS  := -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE) 
     2.7 -CFLAGS  += -I$(BASEDIR)/include -D__KERNEL__
     2.8 +CFLAGS  += -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
     2.9  LDFLAGS := -T xeno.lds -N
    2.10  
    2.11  
     3.1 --- a/xen-2.4.16/arch/i386/entry.S	Wed Dec 18 18:07:22 2002 +0000
     3.2 +++ b/xen-2.4.16/arch/i386/entry.S	Fri Jan 03 18:24:03 2003 +0000
     3.3 @@ -516,7 +516,7 @@ ENTRY(hypervisor_call_table)
     3.4          .long SYMBOL_NAME(do_set_trap_table)
     3.5          .long SYMBOL_NAME(do_process_page_updates)
     3.6          .long SYMBOL_NAME(do_console_write)
     3.7 -        .long SYMBOL_NAME(do_set_pagetable)
     3.8 +        .long SYMBOL_NAME(sys_ni_syscall)
     3.9          .long SYMBOL_NAME(do_set_guest_stack)
    3.10          .long SYMBOL_NAME(do_net_update)
    3.11          .long SYMBOL_NAME(do_fpu_taskswitch)
     4.1 --- a/xen-2.4.16/common/memory.c	Wed Dec 18 18:07:22 2002 +0000
     4.2 +++ b/xen-2.4.16/common/memory.c	Fri Jan 03 18:24:03 2003 +0000
     4.3 @@ -7,8 +7,7 @@
     4.4   * 
     4.5   * Domains trap to process_page_updates with a list of update requests.
     4.6   * This is a list of (ptr, val) pairs, where the requested operation
     4.7 - * is *ptr = val. The exceptions are when ptr is PGREQ_ADD_BASEPTR, or
     4.8 - * PGREQ_REMOVE_BASEPTR.
     4.9 + * is *ptr = val.
    4.10   * 
    4.11   * Reference counting of pages:
    4.12   * ----------------------------
    4.13 @@ -28,6 +27,15 @@
    4.14   * referred to in its current incarnation. Therefore, a page can only
    4.15   * change its type when its type count is zero.
    4.16   * 
    4.17 + * Pinning the page type:
    4.18 + * ----------------------
    4.19 + * The type of a page can be pinned/unpinned with the commands
    4.20 + * PGEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
    4.21 + * pinning is not reference counted, so it can't be nested).
    4.22 + * This is useful to prevent a page's type count falling to zero, at which
    4.23 + * point safety checks would need to be carried out next time the count
    4.24 + * is increased again.
    4.25 + * 
    4.26   * A further note on writeable page mappings:
    4.27   * ------------------------------------------
    4.28   * For simplicity, the count of writeable mappings for a page may not
    4.29 @@ -194,6 +202,7 @@ unsigned long max_page;
    4.30  struct list_head free_list;
    4.31  unsigned int free_pfns;
    4.32  
    4.33 +static int tlb_flush[NR_CPUS];
    4.34  
    4.35  /*
    4.36   * init_frametable:
    4.37 @@ -208,6 +217,8 @@ unsigned long __init init_frametable(uns
    4.38      struct pfn_info *pf;
    4.39      unsigned long page_index;
    4.40  
    4.41 +    memset(tlb_flush, 0, sizeof(tlb_flush));
    4.42 +
    4.43      max_page = nr_pages;
    4.44      frame_table_size = nr_pages * sizeof(struct pfn_info);
    4.45      frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
    4.46 @@ -440,13 +451,14 @@ static void put_page(unsigned long page_
    4.47      ASSERT(page_nr < max_page);
    4.48      page = frame_table + page_nr;
    4.49      ASSERT((page->flags & PG_domain_mask) == current->domain);
    4.50 -    ASSERT((((page->flags & PG_type_mask) == PGT_writeable_page) &&
    4.51 -            (page_type_count(page) != 0)) ||
    4.52 -           (((page->flags & PG_type_mask) == PGT_none) &&
    4.53 -            (page_type_count(page) == 0)));
    4.54 -    ASSERT((!writeable) || (page_type_count(page) != 0));
    4.55 +    ASSERT((!writeable) || 
    4.56 +           ((page_type_count(page) != 0) && 
    4.57 +            ((page->flags & PG_type_mask) == PGT_writeable_page)));
    4.58      if ( writeable && (put_page_type(page) == 0) )
    4.59 +    {
    4.60 +        tlb_flush[smp_processor_id()] = 1;
    4.61          page->flags &= ~PG_type_mask;
    4.62 +    }
    4.63      put_page_tot(page);
    4.64  }
    4.65  
    4.66 @@ -458,7 +470,7 @@ static int mod_l2_entry(l2_pgentry_t *p_
    4.67      if ( (((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
    4.68           DOMAIN_ENTRIES_PER_L2_PAGETABLE )
    4.69      {
    4.70 -        MEM_LOG("Illegal L2 update attempt in hypervisor area %p\n",
    4.71 +        MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
    4.72                  p_l2_entry);
    4.73          goto fail;
    4.74      }
    4.75 @@ -544,6 +556,95 @@ static int mod_l1_entry(l1_pgentry_t *p_
    4.76  }
    4.77  
    4.78  
    4.79 +static int do_extended_command(unsigned long ptr, unsigned long val)
    4.80 +{
    4.81 +    int err = 0;
    4.82 +    unsigned long pfn = ptr >> PAGE_SHIFT;
    4.83 +    struct pfn_info *page = frame_table + pfn;
    4.84 +
    4.85 +    switch ( (val & PGEXT_CMD_MASK) )
    4.86 +    {
    4.87 +    case PGEXT_PIN_L1_TABLE:
    4.88 +        err = get_l1_table(pfn);
    4.89 +        goto mark_as_pinned;
    4.90 +    case PGEXT_PIN_L2_TABLE:
    4.91 +        err = get_l2_table(pfn);
    4.92 +    mark_as_pinned:
    4.93 +        if ( err )
    4.94 +        {
    4.95 +            MEM_LOG("Error while pinning pfn %08lx", pfn);
    4.96 +            break;
    4.97 +        }
    4.98 +        put_page_type(page);
    4.99 +        put_page_tot(page);
   4.100 +        if ( !(page->type_count & REFCNT_PIN_BIT) )
   4.101 +        {
   4.102 +            page->type_count |= REFCNT_PIN_BIT;
   4.103 +            page->tot_count  |= REFCNT_PIN_BIT;
   4.104 +        }
   4.105 +        else
   4.106 +        {
   4.107 +            MEM_LOG("Pfn %08lx already pinned", pfn);
   4.108 +            err = 1;
   4.109 +        }
   4.110 +        break;
   4.111 +
   4.112 +    case PGEXT_UNPIN_TABLE:
   4.113 +        if ( (page->flags & PG_domain_mask) != current->domain )
   4.114 +        {
   4.115 +            err = 1;
   4.116 +            MEM_LOG("Page %08lx bad domain (dom=%ld)",
   4.117 +                    ptr, page->flags & PG_domain_mask);
   4.118 +        }
   4.119 +        else if ( (page->type_count & REFCNT_PIN_BIT) )
   4.120 +        {
   4.121 +            page->type_count &= ~REFCNT_PIN_BIT;
   4.122 +            page->tot_count  &= ~REFCNT_PIN_BIT;
   4.123 +            get_page_type(page);
   4.124 +            get_page_tot(page);
   4.125 +            ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
   4.126 +                put_l1_table(pfn) : put_l2_table(pfn);
   4.127 +        }
   4.128 +        else
   4.129 +        {
   4.130 +            err = 1;
   4.131 +            MEM_LOG("Pfn %08lx not pinned", pfn);
   4.132 +        }
   4.133 +        break;
   4.134 +
   4.135 +    case PGEXT_NEW_BASEPTR:
   4.136 +        err = get_l2_table(pfn);
   4.137 +        if ( !err )
   4.138 +        {
   4.139 +            put_l2_table(__pa(pagetable_ptr(current->mm.pagetable)) 
   4.140 +                         >> PAGE_SHIFT);
   4.141 +            current->mm.pagetable = 
   4.142 +                mk_pagetable((unsigned long)__va(pfn<<PAGE_SHIFT));
   4.143 +        }
   4.144 +        else
   4.145 +        {
   4.146 +            MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
   4.147 +        }
   4.148 +        /* fall through */
   4.149 +        
   4.150 +    case PGEXT_TLB_FLUSH:
   4.151 +        tlb_flush[smp_processor_id()] = 1;
   4.152 +        break;
   4.153 +    
   4.154 +    case PGEXT_INVLPG:
   4.155 +        __asm__ __volatile__ ("invlpg %0" : : 
   4.156 +                              "m" (*(char*)(val & ~PGEXT_CMD_MASK)));
   4.157 +        break;
   4.158 +
   4.159 +    default:
   4.160 +        MEM_LOG("Invalid extended pt command 0x%08lx", val & PGEXT_CMD_MASK);
   4.161 +        err = 1;
   4.162 +        break;
   4.163 +    }
   4.164 +
   4.165 +    return err;
   4.166 +}
   4.167 +
   4.168  /* Apply updates to page table @pagetable_id within the current domain. */
   4.169  int do_process_page_updates(page_update_request_t *updates, int count)
   4.170  {
   4.171 @@ -559,39 +660,23 @@ int do_process_page_updates(page_update_
   4.172              kill_domain_with_errmsg("Cannot read page update request");
   4.173          }
   4.174  
   4.175 +        pfn = cur.ptr >> PAGE_SHIFT;
   4.176 +        if ( pfn >= max_page )
   4.177 +        {
   4.178 +            MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
   4.179 +            kill_domain_with_errmsg("Page update request out of range");
   4.180 +        }
   4.181 +
   4.182          err = 1;
   4.183  
   4.184 -        pfn = cur.ptr >> PAGE_SHIFT;
   4.185 -        if ( !pfn )
   4.186 +        /* Least significant bits of 'ptr' demux the operation type. */
   4.187 +        switch ( cur.ptr & (sizeof(l1_pgentry_t)-1) )
   4.188          {
   4.189 -            switch ( cur.ptr )
   4.190 -            {
   4.191 -            case PGREQ_ADD_BASEPTR:
   4.192 -                err = get_l2_table(cur.val >> PAGE_SHIFT);
   4.193 -                break;
   4.194 -            case PGREQ_REMOVE_BASEPTR:
   4.195 -                if ( cur.val == __pa(pagetable_ptr(current->mm.pagetable)) )
   4.196 -                {
   4.197 -                    MEM_LOG("Attempt to remove current baseptr! %08lx",
   4.198 -                            cur.val);
   4.199 -                }
   4.200 -                else
   4.201 -                {
   4.202 -                    err = put_l2_table(cur.val >> PAGE_SHIFT);
   4.203 -                }
   4.204 -                break;
   4.205 -            default:
   4.206 -                MEM_LOG("Invalid page update command %08lx", cur.ptr);
   4.207 -                break;
   4.208 -            }
   4.209 -        }
   4.210 -        else if ( (cur.ptr & (sizeof(l1_pgentry_t)-1)) || (pfn >= max_page) )
   4.211 -        {
   4.212 -            MEM_LOG("Page out of range (%08lx>%08lx) or misalign %08lx",
   4.213 -                    pfn, max_page, cur.ptr);
   4.214 -        }
   4.215 -        else
   4.216 -        {
   4.217 +
   4.218 +            /*
   4.219 +             * PGREQ_NORMAL: Normal update to any level of page table.
   4.220 +             */
   4.221 +        case PGREQ_NORMAL:
   4.222              page = frame_table + pfn;
   4.223              flags = page->flags;
   4.224              if ( (flags & PG_domain_mask) == current->domain )
   4.225 @@ -607,20 +692,47 @@ int do_process_page_updates(page_update_
   4.226                                         mk_l2_pgentry(cur.val)); 
   4.227                      break;
   4.228                  default:
   4.229 -                    /*
   4.230 -                     * This might occur if a page-table update is
   4.231 -                     * requested before we've inferred the type
   4.232 -                     * of the containing page. It shouldn't happen
   4.233 -                     * if page tables are built strictly top-down, so
   4.234 -                     * we have a MEM_LOG warning message.
   4.235 -                     */
   4.236 -                    MEM_LOG("Unnecessary update to non-pt page %08lx",
   4.237 -                            cur.ptr);
   4.238 -                    *(unsigned long *)__va(cur.ptr) = cur.val;
   4.239 -                    err = 0;
   4.240 +                    MEM_LOG("Update to non-pt page %08lx", cur.ptr);
   4.241                      break;
   4.242                  }
   4.243              }
   4.244 +            break;
   4.245 +
   4.246 +            /*
   4.247 +             * PGREQ_UNCHECKED_UPDATE: Make an unchecked update to a
   4.248 +             * bottom-level page-table entry.
   4.249 +             * Restrictions apply:
   4.250 +             *  1. Update only allowed by domain 0.
   4.251 +             *  2. Update must be to a level-1 pte belonging to dom0.
   4.252 +             */
   4.253 +        case PGREQ_UNCHECKED_UPDATE:
   4.254 +            cur.ptr &= ~(sizeof(l1_pgentry_t) - 1);
   4.255 +            page = frame_table + pfn;
   4.256 +            flags = page->flags;
   4.257 +            if ( (flags | current->domain) == PGT_l1_page_table )
   4.258 +            {
   4.259 +                *(unsigned long *)__va(cur.ptr) = cur.val;
   4.260 +                err = 0;
   4.261 +            }
   4.262 +            else
   4.263 +            {
   4.264 +                MEM_LOG("UNCHECKED_UPDATE: Bad domain %d, or"
   4.265 +                        " bad pte type %08lx", current->domain, flags);
   4.266 +            }
   4.267 +            break;
   4.268 +
   4.269 +            /*
   4.270 +             * PGREQ_EXTENDED_COMMAND: Extended command is specified
   4.271 +             * in the least-significant bits of the 'value' field.
   4.272 +             */
   4.273 +        case PGREQ_EXTENDED_COMMAND:
   4.274 +            cur.ptr &= ~(sizeof(l1_pgentry_t) - 1);
   4.275 +            err = do_extended_command(cur.ptr, cur.val);
   4.276 +            break;
   4.277 +
   4.278 +        default:
   4.279 +            MEM_LOG("Invalid page update command %08lx", cur.ptr);
   4.280 +            break;
   4.281          }
   4.282  
   4.283          if ( err )
   4.284 @@ -631,40 +743,14 @@ int do_process_page_updates(page_update_
   4.285          updates++;
   4.286      }
   4.287  
   4.288 -    __asm__ __volatile__ ("movl %%eax,%%cr3" : : 
   4.289 -                          "a" (__pa(pagetable_ptr(current->mm.pagetable))));
   4.290 +    if ( tlb_flush[smp_processor_id()] )
   4.291 +    {
   4.292 +        tlb_flush[smp_processor_id()] = 0;
   4.293 +        __asm__ __volatile__ (
   4.294 +            "movl %%eax,%%cr3" : : 
   4.295 +            "a" (__pa(pagetable_ptr(current->mm.pagetable))));
   4.296 +    }
   4.297 +
   4.298      return(0);
   4.299  }
   4.300  
   4.301 -
   4.302 -int do_set_pagetable(unsigned long ptr)
   4.303 -{
   4.304 -    struct pfn_info *page;
   4.305 -    unsigned long pfn, flags;
   4.306 -
   4.307 -    if ( (ptr & ~PAGE_MASK) ) 
   4.308 -    {
   4.309 -        MEM_LOG("Misaligned new baseptr %08lx", ptr);
   4.310 -        return -1;
   4.311 -    }
   4.312 -    pfn = ptr >> PAGE_SHIFT;
   4.313 -    if ( pfn >= max_page )
   4.314 -    {
   4.315 -        MEM_LOG("Page out of range (%08lx>%08lx)", pfn, max_page);
   4.316 -        return -1;
   4.317 -    }
   4.318 -    page = frame_table + (ptr >> PAGE_SHIFT);
   4.319 -    flags = page->flags;
   4.320 -    if ( (flags & (PG_domain_mask|PG_type_mask)) != 
   4.321 -         (current->domain|PGT_l2_page_table) )
   4.322 -    {
   4.323 -        MEM_LOG("Page %08lx bad type/domain (dom=%ld) "
   4.324 -                "(type %08lx != expected %08x)",
   4.325 -                ptr, flags & PG_domain_mask, flags & PG_type_mask,
   4.326 -                PGT_l2_page_table);
   4.327 -        return -1;
   4.328 -    }
   4.329 -    current->mm.pagetable = mk_pagetable((unsigned long)__va(ptr));
   4.330 -    __asm__ __volatile__ ("movl %%eax,%%cr3" : : "a" (ptr));
   4.331 -    return 0;
   4.332 -}
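
The pin/unpin accounting in do_extended_command() above is easier to see in
isolation.  The following stand-alone sketch models only the reference-count
manipulation: validation (get_l2_table() and friends) is stubbed out and the
pfn_info structure is reduced to its two counts, so this is illustrative C,
not hypervisor code.  It shows why a pinned page table can never see its type
count reach zero until an explicit PGEXT_UNPIN_TABLE.

    /*
     * Simplified, stand-alone model of the PGEXT_PIN/UNPIN accounting.
     * Real validation is stubbed out; only the count manipulation from
     * do_extended_command() is reproduced.  Build with: cc -o pin pin.c
     */
    #include <assert.h>
    #include <stdio.h>

    #define REFCNT_PIN_BIT 0x40000000UL     /* as defined in xeno/mm.h */

    struct pfn_info_model {
        unsigned long type_count;
        unsigned long tot_count;
    };

    /* Stand-ins for get_l2_table()/put_l2_table(): validate, count refs. */
    static void get_table(struct pfn_info_model *p) { p->type_count++; p->tot_count++; }
    static void put_table(struct pfn_info_model *p) { p->type_count--; p->tot_count--; }

    static int pin(struct pfn_info_model *p)
    {
        get_table(p);                        /* validate + take references  */
        put_table(p);                        /* ...then hand them over...   */
        if ( p->type_count & REFCNT_PIN_BIT )
            return 1;                        /* already pinned              */
        p->type_count |= REFCNT_PIN_BIT;     /* ...to the pin bit           */
        p->tot_count  |= REFCNT_PIN_BIT;
        return 0;
    }

    static int unpin(struct pfn_info_model *p)
    {
        if ( !(p->type_count & REFCNT_PIN_BIT) )
            return 1;                        /* not pinned                  */
        p->type_count &= ~REFCNT_PIN_BIT;
        p->tot_count  &= ~REFCNT_PIN_BIT;
        get_table(p);                        /* convert pin back to a ref   */
        put_table(p);                        /* ...and release it           */
        return 0;
    }

    int main(void)
    {
        struct pfn_info_model pt = { 0, 0 };
        pin(&pt);
        assert(pt.type_count != 0);          /* count cannot reach zero now  */
        unpin(&pt);
        assert(pt.type_count == 0);          /* back to a free, untyped page */
        printf("pin/unpin model OK\n");
        return 0;
    }

The point of the scheme is that the expensive safety checks are only needed
when a page's type count rises from zero; holding the pin bit keeps the count
nonzero across base-pointer switches, so repeated re-validation is avoided.
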
     5.1 --- a/xen-2.4.16/common/page_alloc.c	Wed Dec 18 18:07:22 2002 +0000
     5.2 +++ b/xen-2.4.16/common/page_alloc.c	Fri Jan 03 18:24:03 2003 +0000
     5.3 @@ -188,7 +188,7 @@ unsigned long __get_free_pages(int mask,
     5.4      if ( i == FREELIST_SIZE )
     5.5      {
     5.6          printk("Cannot handle page request order %d!\n", order);
     5.7 -	return NULL; 
     5.8 +	return 0;
     5.9      }
    5.10   
    5.11      /* Unlink a chunk. */
     6.1 --- a/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h	Wed Dec 18 18:07:22 2002 +0000
     6.2 +++ b/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h	Fri Jan 03 18:24:03 2003 +0000
     6.3 @@ -21,9 +21,30 @@ typedef struct trap_info_st
     6.4  
     6.5  typedef struct
     6.6  {
     6.7 -#define PGREQ_ADD_BASEPTR    0
     6.8 -#define PGREQ_REMOVE_BASEPTR 1
     6.9 +/*
    6.10 + * PGREQ_XXX: specified in least-significant bits of 'ptr' field.
    6.11 + * All requests specify relevant PTE or PT address in 'ptr'.
    6.12 + * Normal requests specify update value in 'value'.
    6.13 + * Extended requests specify command in the least-significant 8 bits of 'value'.
    6.14 + */
    6.15 +/* A normal page-table update request. */
    6.16 +#define PGREQ_NORMAL           0
    6.17 +/* Make an unchecked update to a base-level pte. */
    6.18 +#define PGREQ_UNCHECKED_UPDATE 1
    6.19 +/* An extended command. */
    6.20 +#define PGREQ_EXTENDED_COMMAND 2
    6.21      unsigned long ptr, val; /* *ptr = val */
    6.22 +/* Extended command codes, specified in the least-significant 8 bits of 'val'. */
    6.23 +#define PGEXT_PIN_L1_TABLE      0
    6.24 +#define PGEXT_PIN_L2_TABLE      1
    6.25 +#define PGEXT_PIN_L3_TABLE      2
    6.26 +#define PGEXT_PIN_L4_TABLE      3
    6.27 +#define PGEXT_UNPIN_TABLE       4
    6.28 +#define PGEXT_NEW_BASEPTR       5
    6.29 +#define PGEXT_TLB_FLUSH         6
    6.30 +#define PGEXT_INVLPG            7
    6.31 +#define PGEXT_CMD_MASK        255
    6.32 +#define PGEXT_CMD_SHIFT         8
    6.33  } page_update_request_t;
    6.34  
    6.35  
    6.36 @@ -32,7 +53,7 @@ typedef struct
    6.37  #define __HYPERVISOR_set_trap_table  0
    6.38  #define __HYPERVISOR_pt_update       1
    6.39  #define __HYPERVISOR_console_write   2
    6.40 -#define __HYPERVISOR_set_pagetable   3
    6.41 +/* vector 3 unused */
    6.42  #define __HYPERVISOR_set_guest_stack 4
    6.43  #define __HYPERVISOR_net_update      5
    6.44  #define __HYPERVISOR_fpu_taskswitch  6
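
The (ptr, val) encoding declared above packs the request type into the
least-significant bits of 'ptr' and, for extended requests, the command into
the low 8 bits of 'val'.  The fragment below is a minimal, stand-alone
illustration of that encoding: the constants and structure layout are copied
from the header, but the machine addresses are invented and no hypercall is
issued, so it builds and runs as an ordinary C program.

    /*
     * Stand-alone illustration of the page-update request encoding.
     * Constants mirror hypervisor-if.h; addresses are made up and the
     * actual __HYPERVISOR_pt_update call is omitted.
     */
    #include <assert.h>
    #include <stdio.h>

    #define PGREQ_NORMAL           0
    #define PGREQ_UNCHECKED_UPDATE 1
    #define PGREQ_EXTENDED_COMMAND 2

    #define PGEXT_PIN_L2_TABLE     1
    #define PGEXT_NEW_BASEPTR      5
    #define PGEXT_CMD_MASK       255

    typedef struct {
        unsigned long ptr, val;   /* *ptr = val, demuxed on the low bits */
    } page_update_request_t;

    int main(void)
    {
        /* Illustrative machine addresses (at least word aligned). */
        unsigned long pte_maddr = 0x00345678UL;
        unsigned long l2_maddr  = 0x00400000UL;
        page_update_request_t req[3];

        /* 1. Normal update: write a new PTE into a validated table.   */
        req[0].ptr = pte_maddr | PGREQ_NORMAL;
        req[0].val = 0x00123067UL;

        /* 2. Extended command: pin the L2 table before using it.      */
        req[1].ptr = l2_maddr | PGREQ_EXTENDED_COMMAND;
        req[1].val = PGEXT_PIN_L2_TABLE;

        /* 3. Extended command: switch to the new base pointer.        */
        req[2].ptr = l2_maddr | PGREQ_EXTENDED_COMMAND;
        req[2].val = PGEXT_NEW_BASEPTR;

        /* The hypervisor recovers command and address from the low bits. */
        assert((req[1].ptr & 3) == PGREQ_EXTENDED_COMMAND);
        assert((req[1].ptr & ~3UL) == l2_maddr);
        assert((req[1].val & PGEXT_CMD_MASK) == PGEXT_PIN_L2_TABLE);

        printf("encoded %d requests\n", 3);
        return 0;
    }

Because page-table entries are at least 4-byte aligned, the two low bits of
'ptr' are always free to carry the request type; do_process_page_updates()
masks them off again with (sizeof(l1_pgentry_t)-1).
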
     7.1 --- a/xen-2.4.16/include/xeno/mm.h	Wed Dec 18 18:07:22 2002 +0000
     7.2 +++ b/xen-2.4.16/include/xeno/mm.h	Fri Jan 03 18:24:03 2003 +0000
     7.3 @@ -62,6 +62,13 @@ typedef struct pfn_info {
     7.4      unsigned long type_count;   /* pagetable/dir, or domain-writeable refs. */
     7.5  } frame_table_t;
     7.6  
     7.7 +/*
     7.8 + * We use a high bit to indicate that a page is pinned.
     7.9 + * We do not use the top bit as that would mean that we'd get confused with
    7.10 + * -ve error numbers in some places in common/memory.c.
    7.11 + */
    7.12 +#define REFCNT_PIN_BIT 0x40000000UL
    7.13 +
    7.14  #define get_page_tot(p)		 ((p)->tot_count++)
    7.15  #define put_page_tot(p)		 (--(p)->tot_count)
    7.16  #define page_tot_count(p)	 ((p)->tot_count)
     8.1 --- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S	Wed Dec 18 18:07:22 2002 +0000
     8.2 +++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S	Fri Jan 03 18:24:03 2003 +0000
     8.3 @@ -57,5 +57,11 @@ ENTRY(stack_start)
     8.4  ENTRY(empty_zero_page)
     8.5  
     8.6  .org 0x2000
     8.7 +ENTRY(cpu0_pte_quicklist)
     8.8 +
     8.9 +.org 0x2400
    8.10 +ENTRY(cpu0_pgd_quicklist)
    8.11 +        
    8.12 +.org 0x2800
    8.13  ENTRY(stext)
    8.14  ENTRY(_stext)
     9.1 --- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c	Wed Dec 18 18:07:22 2002 +0000
     9.2 +++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c	Fri Jan 03 18:24:03 2003 +0000
     9.3 @@ -145,14 +145,20 @@ void __init setup_arch(char **cmdline_p)
     9.4      unsigned long bootmap_size;
     9.5      char str[256]; int strcnt;
     9.6  
     9.7 -    void hypervisor_callback(void);
     9.8 -    void failsafe_callback(void);
     9.9 +    extern void hypervisor_callback(void);
    9.10 +    extern void failsafe_callback(void);
    9.11 +
    9.12 +    extern unsigned long cpu0_pte_quicklist[];
    9.13 +    extern unsigned long cpu0_pgd_quicklist[];
    9.14  
    9.15      HYPERVISOR_shared_info->event_address    = 
    9.16          (unsigned long)hypervisor_callback;
    9.17      HYPERVISOR_shared_info->failsafe_address =
    9.18          (unsigned long)failsafe_callback;
    9.19  
    9.20 +    boot_cpu_data.pgd_quick = cpu0_pgd_quicklist;
    9.21 +    boot_cpu_data.pte_quick = cpu0_pte_quicklist;
    9.22 +
    9.23      ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
    9.24      memset(&drive_info, 0, sizeof(drive_info));
    9.25      memset(&screen_info, 0, sizeof(screen_info));
    10.1 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c	Wed Dec 18 18:07:22 2002 +0000
    10.2 +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c	Fri Jan 03 18:24:03 2003 +0000
    10.3 @@ -155,9 +155,32 @@ asmlinkage void do_page_fault(struct pt_
    10.4  	siginfo_t info;
    10.5  
    10.6          /* Set the "privileged fault" bit to something sane. */
    10.7 -        error_code &= ~4;
    10.8 +        error_code &= 3;
    10.9          error_code |= (regs->xcs & 2) << 1;
   10.10  
   10.11 +#if PT_UPDATE_DEBUG > 0
   10.12 +        if ( (error_code == 0) && (address >= TASK_SIZE) )
   10.13 +        {
   10.14 +            unsigned long paddr = __pa(address);
   10.15 +            int i;
   10.16 +            for ( i = 0; i < pt_update_queue_idx; i++ )
   10.17 +            {
   10.18 +                if ( update_debug_queue[i].ptr == paddr )
   10.19 +                {
   10.20 +                    printk("XXX now(EIP=%08lx:ptr=%08lx) "
   10.21 +                           "then(%s/%d:p/v=%08lx/%08lx)\n",
   10.22 +                           regs->eip, address,
   10.23 +                           update_debug_queue[i].file,
   10.24 +                           update_debug_queue[i].line,
   10.25 +                           update_debug_queue[i].ptr,
   10.26 +                           update_debug_queue[i].val);
   10.27 +                }
   10.28 +            }
   10.29 +        }
   10.30 +#endif
   10.31 +
   10.32 +        if ( flush_page_update_queue() != 0 ) return;
   10.33 +
   10.34  	/*
   10.35  	 * We fault-in kernel-space virtual memory on-demand. The
   10.36  	 * 'reference' page table is init_mm.pgd.
   10.37 @@ -291,12 +314,14 @@ no_context:
   10.38  	printk(" printing eip:\n");
   10.39  	printk("%08lx\n", regs->eip);
   10.40          page = ((unsigned long *) cur_pgd)[address >> 22];
   10.41 -        printk(KERN_ALERT "*pde = %08lx\n", page);
   10.42 +        printk(KERN_ALERT "*pde = %08lx(%08lx)\n", page, page - start_info.phys_base);
   10.43          if (page & 1) {
   10.44                  page &= PAGE_MASK;
   10.45                  address &= 0x003ff000;
   10.46 +                page -= start_info.phys_base;
   10.47                  page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
   10.48 -                printk(KERN_ALERT "*pte = %08lx\n", page);
   10.49 +                printk(KERN_ALERT "*pte = %08lx(%08lx)\n", page, 
   10.50 +                       page - start_info.phys_base);
   10.51          }
   10.52   	die("Oops", regs, error_code);
   10.53  	bust_spinlocks(0);
   10.54 @@ -366,6 +391,7 @@ vmalloc_fault:
   10.55  		if (!pmd_present(*pmd_k))
   10.56  			goto no_context;
   10.57  		set_pmd(pmd, *pmd_k);
   10.58 +                XENO_flush_page_update_queue(); /* flush PMD update */
   10.59  
   10.60  		pte_k = pte_offset(pmd_k, address);
   10.61  		if (!pte_present(*pte_k))
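
The new flush_page_update_queue() call in do_page_fault() above is needed
because page-table writes are now queued rather than applied immediately (see
the hypervisor.c changes below): code that queues a PTE update and then
touches the mapping can fault, or can read a stale entry, until the queue has
been flushed.  The stand-alone model below illustrates that hazard with the
page table reduced to a plain array and the hypercall reduced to a loop;
names and sizes are illustrative only.

    /*
     * Stand-alone model of the visibility hazard created by queueing
     * page-table updates.  The "page table" is an ordinary array and the
     * "hypercall" just applies the queued writes.
     */
    #include <assert.h>
    #include <stdio.h>

    #define QUEUE_SIZE 4

    static unsigned long page_table[16];                /* hypervisor's view */
    static struct { int index; unsigned long val; } queue[QUEUE_SIZE];
    static int idx;

    static void queue_pte_update(int index, unsigned long val)
    {
        queue[idx].index = index;                       /* auto-flush when  */
        queue[idx].val   = val;                         /* full is omitted  */
        idx++;
    }

    static void flush_queue(void)                       /* the "hypercall"  */
    {
        int i;
        for ( i = 0; i < idx; i++ )
            page_table[queue[i].index] = queue[i].val;
        idx = 0;
    }

    int main(void)
    {
        queue_pte_update(3, 0x1067);
        assert(page_table[3] == 0);       /* stale: update is still queued  */
        flush_queue();
        assert(page_table[3] == 0x1067);  /* visible only after the flush   */
        printf("queued updates become visible only after a flush\n");
        return 0;
    }

The PT_UPDATE_DEBUG machinery takes this further: DEBUG_disallow_pt_read()
clears _PAGE_PRESENT on the kernel mapping of a page-table page that has an
outstanding queued update, so a premature read of it traps and is reported by
the code added to do_page_fault() above.
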
    11.1 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c	Wed Dec 18 18:07:22 2002 +0000
    11.2 +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c	Fri Jan 03 18:24:03 2003 +0000
    11.3 @@ -7,52 +7,162 @@
    11.4   */
    11.5  
    11.6  #include <linux/config.h>
    11.7 +#include <linux/sched.h>
    11.8  #include <asm/hypervisor.h>
    11.9  #include <asm/page.h>
   11.10  #include <asm/pgtable.h>
   11.11  
   11.12 -#define QUEUE_SIZE 1
   11.13 +#define QUEUE_SIZE 2048
   11.14  static page_update_request_t update_queue[QUEUE_SIZE];
   11.15 +unsigned int pt_update_queue_idx = 0;
   11.16 +#define idx pt_update_queue_idx
   11.17 +
   11.18 +#if PT_UPDATE_DEBUG > 0
   11.19 +page_update_debug_t update_debug_queue[QUEUE_SIZE] = {{0}};
   11.20 +#undef queue_l1_entry_update
   11.21 +#undef queue_l2_entry_update
   11.22 +static void DEBUG_allow_pt_reads(void)
   11.23 +{
   11.24 +    pte_t *pte;
   11.25 +    page_update_request_t update;
   11.26 +    int i;
   11.27 +    for ( i = idx-1; i >= 0; i-- )
   11.28 +    {
   11.29 +        pte = update_debug_queue[i].ptep;
   11.30 +        if ( pte == NULL ) continue;
   11.31 +        update_debug_queue[i].ptep = NULL;
   11.32 +        update.ptr = __pa(pte) + start_info.phys_base;
   11.33 +        update.val = update_debug_queue[i].pteval;
   11.34 +        HYPERVISOR_pt_update(&update, 1);
   11.35 +    }
   11.36 +}
   11.37 +static void DEBUG_disallow_pt_read(unsigned long pa)
   11.38 +{
   11.39 +    pte_t *pte;
   11.40 +    pmd_t *pmd;
   11.41 +    pgd_t *pgd;
   11.42 +    unsigned long pteval;
   11.43 +    /*
   11.44 +     * We may fault because of an already outstanding update.
   11.45 +     * That's okay -- it'll get fixed up in the fault handler.
   11.46 +     */
   11.47 +    page_update_request_t update;
   11.48 +    unsigned long va = (unsigned long)__va(pa);
   11.49 +    pgd = pgd_offset_k(va);
   11.50 +    pmd = pmd_offset(pgd, va);
   11.51 +    pte = pte_offset(pmd, va);
   11.52 +    update.ptr = __pa(pte) + start_info.phys_base;
   11.53 +    pteval = *(unsigned long *)pte;
   11.54 +    update.val = pteval & ~_PAGE_PRESENT;
   11.55 +    HYPERVISOR_pt_update(&update, 1);
   11.56 +    update_debug_queue[idx].ptep = pte;
   11.57 +    update_debug_queue[idx].pteval = pteval;
   11.58 +}
   11.59 +#endif
   11.60 +
   11.61 +#if PT_UPDATE_DEBUG > 1
   11.62 +#undef queue_pt_switch
   11.63 +#undef queue_tlb_flush
   11.64 +#undef queue_invlpg
   11.65 +#undef queue_pgd_pin
   11.66 +#undef queue_pgd_unpin
   11.67 +#undef queue_pte_pin
   11.68 +#undef queue_pte_unpin
   11.69 +#endif
   11.70 +
   11.71 +
   11.72 +/*
   11.73 + * This is the current pagetable base pointer, which is updated
   11.74 + * on context switch.
   11.75 + */
   11.76 +unsigned long pt_baseptr;
   11.77 +
   11.78 +void _flush_page_update_queue(void)
   11.79 +{
   11.80 +    if ( idx == 0 ) return;
   11.81 +#if PT_UPDATE_DEBUG > 1
   11.82 +    printk("Flushing %d entries from pt update queue\n", idx);
   11.83 +#endif
   11.84 +#if PT_UPDATE_DEBUG > 0
   11.85 +    DEBUG_allow_pt_reads();
   11.86 +#endif
   11.87 +    HYPERVISOR_pt_update(update_queue, idx);
   11.88 +    idx = 0;
   11.89 +}
   11.90 +
   11.91 +static void increment_index(void)
   11.92 +{
   11.93 +    if ( ++idx == QUEUE_SIZE ) _flush_page_update_queue();
   11.94 +}
   11.95  
   11.96  void queue_l1_entry_update(unsigned long ptr, unsigned long val)
   11.97  {
   11.98 -    update_queue[0].ptr = ptr + start_info.phys_base;
   11.99 -    update_queue[0].val = val;
  11.100 -    flush_page_update_queue();
  11.101 +#if PT_UPDATE_DEBUG > 0
  11.102 +    DEBUG_disallow_pt_read(ptr);
  11.103 +#endif
  11.104 +    update_queue[idx].ptr = ptr + start_info.phys_base;
  11.105 +    update_queue[idx].val = val;
  11.106 +    increment_index();
  11.107  }
  11.108  
  11.109  void queue_l2_entry_update(unsigned long ptr, unsigned long val)
  11.110  {
  11.111 -    update_queue[0].ptr = ptr + start_info.phys_base;
  11.112 -    update_queue[0].val = val;
  11.113 -    flush_page_update_queue();
  11.114 +    update_queue[idx].ptr = ptr + start_info.phys_base;
  11.115 +    update_queue[idx].val = val;
  11.116 +    increment_index();
  11.117  }
  11.118  
  11.119 -void queue_baseptr_create(unsigned long ptr)
  11.120 +void queue_pt_switch(unsigned long ptr)
  11.121  {
  11.122 -    update_queue[0].ptr = PGREQ_ADD_BASEPTR;
  11.123 -    update_queue[0].val = ptr + start_info.phys_base;
  11.124 -    flush_page_update_queue();
  11.125 -}
  11.126 -
  11.127 -void queue_baseptr_remove(unsigned long ptr)
  11.128 -{
  11.129 -    update_queue[0].ptr = PGREQ_REMOVE_BASEPTR;
  11.130 -    update_queue[0].val = ptr + start_info.phys_base;
  11.131 -    flush_page_update_queue();
  11.132 +    update_queue[idx].ptr  = ptr + start_info.phys_base;
  11.133 +    update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND;
  11.134 +    update_queue[idx].val  = PGEXT_NEW_BASEPTR;
  11.135 +    increment_index();
  11.136  }
  11.137  
  11.138  void queue_tlb_flush(void)
  11.139  {
  11.140 -    /* nothing */
  11.141 +    update_queue[idx].ptr  = PGREQ_EXTENDED_COMMAND;
  11.142 +    update_queue[idx].val  = PGEXT_TLB_FLUSH;
  11.143 +    increment_index();
  11.144 +}
  11.145 +
  11.146 +void queue_invlpg(unsigned long ptr)
  11.147 +{
  11.148 +    update_queue[idx].ptr  = PGREQ_EXTENDED_COMMAND;
  11.149 +    update_queue[idx].val  = ptr & PAGE_MASK;
  11.150 +    update_queue[idx].val |= PGEXT_INVLPG;
  11.151 +    increment_index();
  11.152 +}
  11.153 +
  11.154 +void queue_pgd_pin(unsigned long ptr)
  11.155 +{
  11.156 +    update_queue[idx].ptr  = ptr + start_info.phys_base;
  11.157 +    update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND;
  11.158 +    update_queue[idx].val  = PGEXT_PIN_L2_TABLE;
  11.159 +    increment_index();
  11.160  }
  11.161  
  11.162 -void queue_tlb_flush_one(unsigned long ptr)
  11.163 +void queue_pgd_unpin(unsigned long ptr)
  11.164  {
  11.165 -    /* nothing */
  11.166 +    update_queue[idx].ptr  = ptr + start_info.phys_base;
  11.167 +    update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND;
  11.168 +    update_queue[idx].val  = PGEXT_UNPIN_TABLE;
  11.169 +    increment_index();
  11.170  }
  11.171  
  11.172 -void flush_page_update_queue(void)
  11.173 +void queue_pte_pin(unsigned long ptr)
  11.174  {
  11.175 -    HYPERVISOR_pt_update(update_queue, 1);
  11.176 +    update_queue[idx].ptr  = ptr + start_info.phys_base;
  11.177 +    update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND;
  11.178 +    update_queue[idx].val  = PGEXT_PIN_L1_TABLE;
  11.179 +    increment_index();
  11.180  }
  11.181 +
  11.182 +void queue_pte_unpin(unsigned long ptr)
  11.183 +{
  11.184 +    update_queue[idx].ptr  = ptr + start_info.phys_base;
  11.185 +    update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND;
  11.186 +    update_queue[idx].val  = PGEXT_UNPIN_TABLE;
  11.187 +    increment_index();
  11.188 +}
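
The driver above replaces the old single-entry queue (QUEUE_SIZE was 1, with
a flush after every update) with a 2048-entry queue that is flushed either
when it fills or at explicit flush points.  The stand-alone sketch below
reproduces that batching pattern with HYPERVISOR_pt_update() stubbed out as a
counter and the queue shrunk so the automatic flush is easy to see; the
addresses and values are made up.

    /*
     * Stand-alone sketch of the update-queue batching pattern from
     * hypervisor.c.  The hypercall is a stub that only counts batches.
     */
    #include <stdio.h>

    #define QUEUE_SIZE 8                     /* 2048 in the real driver */

    typedef struct { unsigned long ptr, val; } page_update_request_t;

    static page_update_request_t update_queue[QUEUE_SIZE];
    static unsigned int idx;
    static unsigned int hypercalls;          /* batches "sent" so far   */

    /* Stub for the real hypercall: just count and report the batch.   */
    static void HYPERVISOR_pt_update_stub(page_update_request_t *req, int n)
    {
        (void)req;
        printf("batch of %d updates\n", n);
        hypercalls++;
    }

    static void flush_page_update_queue(void)
    {
        if ( idx == 0 ) return;
        HYPERVISOR_pt_update_stub(update_queue, idx);
        idx = 0;
    }

    static void increment_index(void)
    {
        if ( ++idx == QUEUE_SIZE ) flush_page_update_queue();
    }

    static void queue_l1_entry_update(unsigned long ptr, unsigned long val)
    {
        update_queue[idx].ptr = ptr;   /* + start_info.phys_base in XenoLinux */
        update_queue[idx].val = val;
        increment_index();
    }

    int main(void)
    {
        int i;

        /* e.g. populating a new address space: many PTE writes in a row. */
        for ( i = 0; i < 20; i++ )
            queue_l1_entry_update(0x00100000UL + i * 4, (0x1000UL * i) | 0x67);

        /* Explicit flush point, as at a context switch or page fault.    */
        flush_page_update_queue();

        printf("20 PTE updates -> %u hypercalls (20 with the old queue)\n",
               hypercalls);
        return 0;
    }

Each flush is a call into the hypervisor, so amortising many PTE writes
(fork, exec, munmap) over one call is a large part of this changeset's
interface rework, together with the explicit PGEXT_TLB_FLUSH/PGEXT_INVLPG
commands that let the guest say when the TLB actually needs flushing.
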
    12.1 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c	Wed Dec 18 18:07:22 2002 +0000
    12.2 +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c	Fri Jan 03 18:24:03 2003 +0000
    12.3 @@ -46,15 +46,11 @@ int do_check_pgt_cache(int low, int high
    12.4      int freed = 0;
    12.5      if(pgtable_cache_size > high) {
    12.6          do {
    12.7 -            if (pgd_quicklist) {
    12.8 +            if (!QUICKLIST_EMPTY(pgd_quicklist)) {
    12.9                  free_pgd_slow(get_pgd_fast());
   12.10                  freed++;
   12.11              }
   12.12 -            if (pmd_quicklist) {
   12.13 -                pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
   12.14 -                freed++;
   12.15 -            }
   12.16 -            if (pte_quicklist) {
   12.17 +            if (!QUICKLIST_EMPTY(pte_quicklist)) {
   12.18                  pte_free_slow(pte_alloc_one_fast(NULL, 0));
   12.19                  freed++;
   12.20              }
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/xenolinux-2.4.16-sparse/fs/exec.c	Fri Jan 03 18:24:03 2003 +0000
    13.3 @@ -0,0 +1,986 @@
    13.4 +/*
    13.5 + *  linux/fs/exec.c
    13.6 + *
    13.7 + *  Copyright (C) 1991, 1992  Linus Torvalds
    13.8 + */
    13.9 +
   13.10 +/*
   13.11 + * #!-checking implemented by tytso.
   13.12 + */
   13.13 +/*
   13.14 + * Demand-loading implemented 01.12.91 - no need to read anything but
   13.15 + * the header into memory. The inode of the executable is put into
   13.16 + * "current->executable", and page faults do the actual loading. Clean.
   13.17 + *
   13.18 + * Once more I can proudly say that linux stood up to being changed: it
   13.19 + * was less than 2 hours work to get demand-loading completely implemented.
   13.20 + *
   13.21 + * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
   13.22 + * current->executable is only used by the procfs.  This allows a dispatch
   13.23 + * table to check for several different types  of binary formats.  We keep
   13.24 + * trying until we recognize the file or we run out of supported binary
   13.25 + * formats. 
   13.26 + */
   13.27 +
   13.28 +#include <linux/config.h>
   13.29 +#include <linux/slab.h>
   13.30 +#include <linux/file.h>
   13.31 +#include <linux/mman.h>
   13.32 +#include <linux/a.out.h>
   13.33 +#include <linux/stat.h>
   13.34 +#include <linux/fcntl.h>
   13.35 +#include <linux/smp_lock.h>
   13.36 +#include <linux/init.h>
   13.37 +#include <linux/pagemap.h>
   13.38 +#include <linux/highmem.h>
   13.39 +#include <linux/spinlock.h>
   13.40 +#include <linux/personality.h>
   13.41 +#define __NO_VERSION__
   13.42 +#include <linux/module.h>
   13.43 +
   13.44 +#include <asm/uaccess.h>
   13.45 +#include <asm/pgalloc.h>
   13.46 +#include <asm/mmu_context.h>
   13.47 +
   13.48 +#ifdef CONFIG_KMOD
   13.49 +#include <linux/kmod.h>
   13.50 +#endif
   13.51 +
   13.52 +int core_uses_pid;
   13.53 +
   13.54 +static struct linux_binfmt *formats;
   13.55 +static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
   13.56 +
   13.57 +int register_binfmt(struct linux_binfmt * fmt)
   13.58 +{
   13.59 +	struct linux_binfmt ** tmp = &formats;
   13.60 +
   13.61 +	if (!fmt)
   13.62 +		return -EINVAL;
   13.63 +	if (fmt->next)
   13.64 +		return -EBUSY;
   13.65 +	write_lock(&binfmt_lock);
   13.66 +	while (*tmp) {
   13.67 +		if (fmt == *tmp) {
   13.68 +			write_unlock(&binfmt_lock);
   13.69 +			return -EBUSY;
   13.70 +		}
   13.71 +		tmp = &(*tmp)->next;
   13.72 +	}
   13.73 +	fmt->next = formats;
   13.74 +	formats = fmt;
   13.75 +	write_unlock(&binfmt_lock);
   13.76 +	return 0;	
   13.77 +}
   13.78 +
   13.79 +int unregister_binfmt(struct linux_binfmt * fmt)
   13.80 +{
   13.81 +	struct linux_binfmt ** tmp = &formats;
   13.82 +
   13.83 +	write_lock(&binfmt_lock);
   13.84 +	while (*tmp) {
   13.85 +		if (fmt == *tmp) {
   13.86 +			*tmp = fmt->next;
   13.87 +			write_unlock(&binfmt_lock);
   13.88 +			return 0;
   13.89 +		}
   13.90 +		tmp = &(*tmp)->next;
   13.91 +	}
   13.92 +	write_unlock(&binfmt_lock);
   13.93 +	return -EINVAL;
   13.94 +}
   13.95 +
   13.96 +static inline void put_binfmt(struct linux_binfmt * fmt)
   13.97 +{
   13.98 +	if (fmt->module)
   13.99 +		__MOD_DEC_USE_COUNT(fmt->module);
  13.100 +}
  13.101 +
  13.102 +/*
  13.103 + * Note that a shared library must be both readable and executable due to
  13.104 + * security reasons.
  13.105 + *
  13.106 + * Also note that we take the address to load from from the file itself.
  13.107 + */
  13.108 +asmlinkage long sys_uselib(const char * library)
  13.109 +{
  13.110 +	struct file * file;
  13.111 +	struct nameidata nd;
  13.112 +	int error;
  13.113 +
  13.114 +	error = user_path_walk(library, &nd);
  13.115 +	if (error)
  13.116 +		goto out;
  13.117 +
  13.118 +	error = -EINVAL;
  13.119 +	if (!S_ISREG(nd.dentry->d_inode->i_mode))
  13.120 +		goto exit;
  13.121 +
  13.122 +	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
  13.123 +	if (error)
  13.124 +		goto exit;
  13.125 +
  13.126 +	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
  13.127 +	error = PTR_ERR(file);
  13.128 +	if (IS_ERR(file))
  13.129 +		goto out;
  13.130 +
  13.131 +	error = -ENOEXEC;
  13.132 +	if(file->f_op && file->f_op->read) {
  13.133 +		struct linux_binfmt * fmt;
  13.134 +
  13.135 +		read_lock(&binfmt_lock);
  13.136 +		for (fmt = formats ; fmt ; fmt = fmt->next) {
  13.137 +			if (!fmt->load_shlib)
  13.138 +				continue;
  13.139 +			if (!try_inc_mod_count(fmt->module))
  13.140 +				continue;
  13.141 +			read_unlock(&binfmt_lock);
  13.142 +			error = fmt->load_shlib(file);
  13.143 +			read_lock(&binfmt_lock);
  13.144 +			put_binfmt(fmt);
  13.145 +			if (error != -ENOEXEC)
  13.146 +				break;
  13.147 +		}
  13.148 +		read_unlock(&binfmt_lock);
  13.149 +	}
  13.150 +	fput(file);
  13.151 +out:
  13.152 +  	return error;
  13.153 +exit:
  13.154 +	path_release(&nd);
  13.155 +	goto out;
  13.156 +}
  13.157 +
  13.158 +/*
  13.159 + * count() counts the number of arguments/envelopes
  13.160 + */
  13.161 +static int count(char ** argv, int max)
  13.162 +{
  13.163 +	int i = 0;
  13.164 +
  13.165 +	if (argv != NULL) {
  13.166 +		for (;;) {
  13.167 +			char * p;
  13.168 +
  13.169 +			if (get_user(p, argv))
  13.170 +				return -EFAULT;
  13.171 +			if (!p)
  13.172 +				break;
  13.173 +			argv++;
  13.174 +			if(++i > max)
  13.175 +				return -E2BIG;
  13.176 +		}
  13.177 +	}
  13.178 +	return i;
  13.179 +}
  13.180 +
  13.181 +/*
  13.182 + * 'copy_strings()' copies argument/envelope strings from user
  13.183 + * memory to free pages in kernel mem. These are in a format ready
  13.184 + * to be put directly into the top of new user memory.
  13.185 + */
  13.186 +int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) 
  13.187 +{
  13.188 +	while (argc-- > 0) {
  13.189 +		char *str;
  13.190 +		int len;
  13.191 +		unsigned long pos;
  13.192 +
  13.193 +		if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p)))
  13.194 +			return -EFAULT;
  13.195 +		if (bprm->p < len) 
  13.196 +			return -E2BIG; 
  13.197 +
  13.198 +		bprm->p -= len;
  13.199 +		/* XXX: add architecture specific overflow check here. */ 
  13.200 +
  13.201 +		pos = bprm->p;
  13.202 +		while (len > 0) {
  13.203 +			char *kaddr;
  13.204 +			int i, new, err;
  13.205 +			struct page *page;
  13.206 +			int offset, bytes_to_copy;
  13.207 +
  13.208 +			offset = pos % PAGE_SIZE;
  13.209 +			i = pos/PAGE_SIZE;
  13.210 +			page = bprm->page[i];
  13.211 +			new = 0;
  13.212 +			if (!page) {
  13.213 +				page = alloc_page(GFP_HIGHUSER);
  13.214 +				bprm->page[i] = page;
  13.215 +				if (!page)
  13.216 +					return -ENOMEM;
  13.217 +				new = 1;
  13.218 +			}
  13.219 +			kaddr = kmap(page);
  13.220 +
  13.221 +			if (new && offset)
  13.222 +				memset(kaddr, 0, offset);
  13.223 +			bytes_to_copy = PAGE_SIZE - offset;
  13.224 +			if (bytes_to_copy > len) {
  13.225 +				bytes_to_copy = len;
  13.226 +				if (new)
  13.227 +					memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len);
  13.228 +			}
  13.229 +			err = copy_from_user(kaddr + offset, str, bytes_to_copy);
  13.230 +			kunmap(page);
  13.231 +
  13.232 +			if (err)
  13.233 +				return -EFAULT; 
  13.234 +
  13.235 +			pos += bytes_to_copy;
  13.236 +			str += bytes_to_copy;
  13.237 +			len -= bytes_to_copy;
  13.238 +		}
  13.239 +	}
  13.240 +	return 0;
  13.241 +}
  13.242 +
  13.243 +/*
  13.244 + * Like copy_strings, but get argv and its values from kernel memory.
  13.245 + */
  13.246 +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
  13.247 +{
  13.248 +	int r;
  13.249 +	mm_segment_t oldfs = get_fs();
  13.250 +	set_fs(KERNEL_DS); 
  13.251 +	r = copy_strings(argc, argv, bprm);
  13.252 +	set_fs(oldfs);
  13.253 +	return r; 
  13.254 +}
  13.255 +
  13.256 +/*
  13.257 + * This routine is used to map in a page into an address space: needed by
  13.258 + * execve() for the initial stack and environment pages.
  13.259 + *
  13.260 + * tsk->mmap_sem is held for writing.
  13.261 + */
  13.262 +void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
  13.263 +{
  13.264 +	pgd_t * pgd;
  13.265 +	pmd_t * pmd;
  13.266 +	pte_t * pte;
  13.267 +
  13.268 +	if (page_count(page) != 1)
  13.269 +		printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
  13.270 +	pgd = pgd_offset(tsk->mm, address);
  13.271 +
  13.272 +	spin_lock(&tsk->mm->page_table_lock);
  13.273 +	pmd = pmd_alloc(tsk->mm, pgd, address);
  13.274 +	if (!pmd)
  13.275 +		goto out;
  13.276 +	pte = pte_alloc(tsk->mm, pmd, address);
  13.277 +	if (!pte)
  13.278 +		goto out;
  13.279 +	if (!pte_none(*pte))
  13.280 +		goto out;
  13.281 +	lru_cache_add(page);
  13.282 +	flush_dcache_page(page);
  13.283 +	flush_page_to_ram(page);
  13.284 +	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
  13.285 +	XENO_flush_page_update_queue();
  13.286 +	tsk->mm->rss++;
  13.287 +	spin_unlock(&tsk->mm->page_table_lock);
  13.288 +
  13.289 +	/* no need for flush_tlb */
  13.290 +	return;
  13.291 +out:
  13.292 +	spin_unlock(&tsk->mm->page_table_lock);
  13.293 +	__free_page(page);
  13.294 +	force_sig(SIGKILL, tsk);
  13.295 +	return;
  13.296 +}
  13.297 +
  13.298 +int setup_arg_pages(struct linux_binprm *bprm)
  13.299 +{
  13.300 +	unsigned long stack_base;
  13.301 +	struct vm_area_struct *mpnt;
  13.302 +	int i;
  13.303 +
  13.304 +	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
  13.305 +
  13.306 +	bprm->p += stack_base;
  13.307 +	if (bprm->loader)
  13.308 +		bprm->loader += stack_base;
  13.309 +	bprm->exec += stack_base;
  13.310 +
  13.311 +	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  13.312 +	if (!mpnt) 
  13.313 +		return -ENOMEM; 
  13.314 +	
  13.315 +	down_write(&current->mm->mmap_sem);
  13.316 +	{
  13.317 +		mpnt->vm_mm = current->mm;
  13.318 +		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
  13.319 +		mpnt->vm_end = STACK_TOP;
  13.320 +		mpnt->vm_page_prot = PAGE_COPY;
  13.321 +		mpnt->vm_flags = VM_STACK_FLAGS;
  13.322 +		mpnt->vm_ops = NULL;
  13.323 +		mpnt->vm_pgoff = 0;
  13.324 +		mpnt->vm_file = NULL;
  13.325 +		mpnt->vm_private_data = (void *) 0;
  13.326 +		insert_vm_struct(current->mm, mpnt);
  13.327 +		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
  13.328 +	} 
  13.329 +
  13.330 +	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
  13.331 +		struct page *page = bprm->page[i];
  13.332 +		if (page) {
  13.333 +			bprm->page[i] = NULL;
  13.334 +			put_dirty_page(current,page,stack_base);
  13.335 +		}
  13.336 +		stack_base += PAGE_SIZE;
  13.337 +	}
  13.338 +	up_write(&current->mm->mmap_sem);
  13.339 +	
  13.340 +	return 0;
  13.341 +}
  13.342 +
  13.343 +struct file *open_exec(const char *name)
  13.344 +{
  13.345 +	struct nameidata nd;
  13.346 +	struct inode *inode;
  13.347 +	struct file *file;
  13.348 +	int err = 0;
  13.349 +
  13.350 +	if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
  13.351 +		err = path_walk(name, &nd);
  13.352 +	file = ERR_PTR(err);
  13.353 +	if (!err) {
  13.354 +		inode = nd.dentry->d_inode;
  13.355 +		file = ERR_PTR(-EACCES);
  13.356 +		if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
  13.357 +		    S_ISREG(inode->i_mode)) {
  13.358 +			int err = permission(inode, MAY_EXEC);
  13.359 +			if (!err && !(inode->i_mode & 0111))
  13.360 +				err = -EACCES;
  13.361 +			file = ERR_PTR(err);
  13.362 +			if (!err) {
  13.363 +				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
  13.364 +				if (!IS_ERR(file)) {
  13.365 +					err = deny_write_access(file);
  13.366 +					if (err) {
  13.367 +						fput(file);
  13.368 +						file = ERR_PTR(err);
  13.369 +					}
  13.370 +				}
  13.371 +out:
  13.372 +				return file;
  13.373 +			}
  13.374 +		}
  13.375 +		path_release(&nd);
  13.376 +	}
  13.377 +	goto out;
  13.378 +}
  13.379 +
  13.380 +int kernel_read(struct file *file, unsigned long offset,
  13.381 +	char * addr, unsigned long count)
  13.382 +{
  13.383 +	mm_segment_t old_fs;
  13.384 +	loff_t pos = offset;
  13.385 +	int result = -ENOSYS;
  13.386 +
  13.387 +	if (!file->f_op->read)
  13.388 +		goto fail;
  13.389 +	old_fs = get_fs();
  13.390 +	set_fs(get_ds());
  13.391 +	result = file->f_op->read(file, addr, count, &pos);
  13.392 +	set_fs(old_fs);
  13.393 +fail:
  13.394 +	return result;
  13.395 +}
  13.396 +
  13.397 +static int exec_mmap(void)
  13.398 +{
  13.399 +	struct mm_struct * mm, * old_mm;
  13.400 +
  13.401 +	old_mm = current->mm;
  13.402 +	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
  13.403 +		mm_release();
  13.404 +		exit_mmap(old_mm);
  13.405 +		return 0;
  13.406 +	}
  13.407 +
  13.408 +	mm = mm_alloc();
  13.409 +	if (mm) {
  13.410 +		struct mm_struct *active_mm;
  13.411 +
  13.412 +		if (init_new_context(current, mm)) {
  13.413 +			mmdrop(mm);
  13.414 +			return -ENOMEM;
  13.415 +		}
  13.416 +
  13.417 +		/* Add it to the list of mm's */
  13.418 +		spin_lock(&mmlist_lock);
  13.419 +		list_add(&mm->mmlist, &init_mm.mmlist);
  13.420 +		mmlist_nr++;
  13.421 +		spin_unlock(&mmlist_lock);
  13.422 +
  13.423 +		task_lock(current);
  13.424 +		active_mm = current->active_mm;
  13.425 +		current->mm = mm;
  13.426 +		current->active_mm = mm;
  13.427 +		task_unlock(current);
  13.428 +		activate_mm(active_mm, mm);
  13.429 +		mm_release();
  13.430 +		if (old_mm) {
  13.431 +			if (active_mm != old_mm) BUG();
  13.432 +			mmput(old_mm);
  13.433 +			return 0;
  13.434 +		}
  13.435 +		mmdrop(active_mm);
  13.436 +		return 0;
  13.437 +	}
  13.438 +	return -ENOMEM;
  13.439 +}
  13.440 +
  13.441 +/*
  13.442 + * This function makes sure the current process has its own signal table,
  13.443 + * so that flush_signal_handlers can later reset the handlers without
  13.444 + * disturbing other processes.  (Other processes might share the signal
  13.445 + * table via the CLONE_SIGNAL option to clone().)
  13.446 + */
  13.447 + 
  13.448 +static inline int make_private_signals(void)
  13.449 +{
  13.450 +	struct signal_struct * newsig;
  13.451 +
  13.452 +	if (atomic_read(&current->sig->count) <= 1)
  13.453 +		return 0;
  13.454 +	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
  13.455 +	if (newsig == NULL)
  13.456 +		return -ENOMEM;
  13.457 +	spin_lock_init(&newsig->siglock);
  13.458 +	atomic_set(&newsig->count, 1);
  13.459 +	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
  13.460 +	spin_lock_irq(&current->sigmask_lock);
  13.461 +	current->sig = newsig;
  13.462 +	spin_unlock_irq(&current->sigmask_lock);
  13.463 +	return 0;
  13.464 +}
  13.465 +	
  13.466 +/*
  13.467 + * If make_private_signals() made a copy of the signal table, decrement the
  13.468 + * refcount of the original table, and free it if necessary.
  13.469 + * We don't do that in make_private_signals() so that we can back off
  13.470 + * in flush_old_exec() if an error occurs after calling make_private_signals().
  13.471 + */
  13.472 +
  13.473 +static inline void release_old_signals(struct signal_struct * oldsig)
  13.474 +{
  13.475 +	if (current->sig == oldsig)
  13.476 +		return;
  13.477 +	if (atomic_dec_and_test(&oldsig->count))
  13.478 +		kmem_cache_free(sigact_cachep, oldsig);
  13.479 +}
  13.480 +
  13.481 +/*
  13.482 + * These functions flush out all traces of the currently running executable
  13.483 + * so that a new one can be started
  13.484 + */
  13.485 +
  13.486 +static inline void flush_old_files(struct files_struct * files)
  13.487 +{
  13.488 +	long j = -1;
  13.489 +
  13.490 +	write_lock(&files->file_lock);
  13.491 +	for (;;) {
  13.492 +		unsigned long set, i;
  13.493 +
  13.494 +		j++;
  13.495 +		i = j * __NFDBITS;
  13.496 +		if (i >= files->max_fds || i >= files->max_fdset)
  13.497 +			break;
  13.498 +		set = files->close_on_exec->fds_bits[j];
  13.499 +		if (!set)
  13.500 +			continue;
  13.501 +		files->close_on_exec->fds_bits[j] = 0;
  13.502 +		write_unlock(&files->file_lock);
  13.503 +		for ( ; set ; i++,set >>= 1) {
  13.504 +			if (set & 1) {
  13.505 +				sys_close(i);
  13.506 +			}
  13.507 +		}
  13.508 +		write_lock(&files->file_lock);
  13.509 +
  13.510 +	}
  13.511 +	write_unlock(&files->file_lock);
  13.512 +}
  13.513 +
  13.514 +/*
  13.515 + * An execve() will automatically "de-thread" the process.
  13.516 + * Note: we don't have to hold the tasklist_lock to test
  13.517 + * whether we might need to do this. If we're not part of
  13.518 + * a thread group, there is no way we can become one
  13.519 + * dynamically. And if we are, we only need to protect the
  13.520 + * unlink - even if we race with the last other thread exit,
  13.521 + * at worst the list_del_init() might end up being a no-op.
  13.522 + */
  13.523 +static inline void de_thread(struct task_struct *tsk)
  13.524 +{
  13.525 +	if (!list_empty(&tsk->thread_group)) {
  13.526 +		write_lock_irq(&tasklist_lock);
  13.527 +		list_del_init(&tsk->thread_group);
  13.528 +		write_unlock_irq(&tasklist_lock);
  13.529 +	}
  13.530 +
  13.531 +	/* Minor oddity: this might stay the same. */
  13.532 +	tsk->tgid = tsk->pid;
  13.533 +}
  13.534 +
  13.535 +int flush_old_exec(struct linux_binprm * bprm)
  13.536 +{
  13.537 +	char * name;
  13.538 +	int i, ch, retval;
  13.539 +	struct signal_struct * oldsig;
  13.540 +
  13.541 +	/*
  13.542 +	 * Make sure we have a private signal table
  13.543 +	 */
  13.544 +	oldsig = current->sig;
  13.545 +	retval = make_private_signals();
  13.546 +	if (retval) goto flush_failed;
  13.547 +
  13.548 +	/* 
  13.549 +	 * Release all of the old mmap stuff
  13.550 +	 */
  13.551 +	retval = exec_mmap();
  13.552 +	if (retval) goto mmap_failed;
  13.553 +
  13.554 +	/* This is the point of no return */
  13.555 +	release_old_signals(oldsig);
  13.556 +
  13.557 +	current->sas_ss_sp = current->sas_ss_size = 0;
  13.558 +
  13.559 +	if (current->euid == current->uid && current->egid == current->gid)
  13.560 +		current->mm->dumpable = 1;
  13.561 +	name = bprm->filename;
  13.562 +	for (i=0; (ch = *(name++)) != '\0';) {
  13.563 +		if (ch == '/')
  13.564 +			i = 0;
  13.565 +		else
  13.566 +			if (i < 15)
  13.567 +				current->comm[i++] = ch;
  13.568 +	}
  13.569 +	current->comm[i] = '\0';
  13.570 +
  13.571 +	flush_thread();
  13.572 +
  13.573 +	de_thread(current);
  13.574 +
  13.575 +	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
  13.576 +	    permission(bprm->file->f_dentry->d_inode,MAY_READ))
  13.577 +		current->mm->dumpable = 0;
  13.578 +
  13.579 +	/* An exec changes our domain. We are no longer part of the thread
  13.580 +	   group */
  13.581 +	   
  13.582 +	current->self_exec_id++;
  13.583 +			
  13.584 +	flush_signal_handlers(current);
  13.585 +	flush_old_files(current->files);
  13.586 +
  13.587 +	return 0;
  13.588 +
  13.589 +mmap_failed:
  13.590 +flush_failed:
  13.591 +	spin_lock_irq(&current->sigmask_lock);
  13.592 +	if (current->sig != oldsig) {
  13.593 +		kfree(current->sig);
  13.594 +		current->sig = oldsig;
  13.595 +	}
  13.596 +	spin_unlock_irq(&current->sigmask_lock);
  13.597 +	return retval;
  13.598 +}
  13.599 +
  13.600 +/*
  13.601 + * We mustn't allow tracing of suid binaries, unless
  13.602 + * the tracer has the capability to trace anything..
  13.603 + */
  13.604 +static inline int must_not_trace_exec(struct task_struct * p)
  13.605 +{
  13.606 +	return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP);
  13.607 +}
  13.608 +
  13.609 +/* 
  13.610 + * Fill the binprm structure from the inode. 
  13.611 + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
  13.612 + */
  13.613 +int prepare_binprm(struct linux_binprm *bprm)
  13.614 +{
  13.615 +	int mode;
  13.616 +	struct inode * inode = bprm->file->f_dentry->d_inode;
  13.617 +
  13.618 +	mode = inode->i_mode;
  13.619 +	/*
  13.620 +	 * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
  13.621 +	 * vfs_permission lets a non-executable through
  13.622 +	 */
  13.623 +	if (!(mode & 0111))	/* with at least _one_ execute bit set */
  13.624 +		return -EACCES;
  13.625 +	if (bprm->file->f_op == NULL)
  13.626 +		return -EACCES;
  13.627 +
  13.628 +	bprm->e_uid = current->euid;
  13.629 +	bprm->e_gid = current->egid;
  13.630 +
  13.631 +	if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
  13.632 +		/* Set-uid? */
  13.633 +		if (mode & S_ISUID)
  13.634 +			bprm->e_uid = inode->i_uid;
  13.635 +
  13.636 +		/* Set-gid? */
  13.637 +		/*
  13.638 +		 * If setgid is set but no group execute bit then this
  13.639 +		 * is a candidate for mandatory locking, not a setgid
  13.640 +		 * executable.
  13.641 +		 */
  13.642 +		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
  13.643 +			bprm->e_gid = inode->i_gid;
  13.644 +	}
  13.645 +
  13.646 +	/* We don't have VFS support for capabilities yet */
  13.647 +	cap_clear(bprm->cap_inheritable);
  13.648 +	cap_clear(bprm->cap_permitted);
  13.649 +	cap_clear(bprm->cap_effective);
  13.650 +
  13.651 +	/*  To support inheritance of root-permissions and suid-root
  13.652 +         *  executables under compatibility mode, we raise all three
  13.653 +         *  capability sets for the file.
  13.654 +         *
  13.655 +         *  If only the real uid is 0, we only raise the inheritable
  13.656 +         *  and permitted sets of the executable file.
  13.657 +         */
  13.658 +
  13.659 +	if (!issecure(SECURE_NOROOT)) {
  13.660 +		if (bprm->e_uid == 0 || current->uid == 0) {
  13.661 +			cap_set_full(bprm->cap_inheritable);
  13.662 +			cap_set_full(bprm->cap_permitted);
  13.663 +		}
  13.664 +		if (bprm->e_uid == 0) 
  13.665 +			cap_set_full(bprm->cap_effective);
  13.666 +	}
  13.667 +
  13.668 +	memset(bprm->buf,0,BINPRM_BUF_SIZE);
  13.669 +	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
  13.670 +}
  13.671 +
  13.672 +/*
  13.673 + * This function is used to produce the new IDs and capabilities
  13.674 + * from the old ones and the file's capabilities.
  13.675 + *
  13.676 + * The formula used for evolving capabilities is:
  13.677 + *
  13.678 + *       pI' = pI
  13.679 + * (***) pP' = (fP & X) | (fI & pI)
  13.680 + *       pE' = pP' & fE          [NB. fE is 0 or ~0]
  13.681 + *
  13.682 + * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
  13.683 + * ' indicates post-exec(), and X is the global 'cap_bset'.
  13.684 + *
  13.685 + */
  13.686 +
  13.687 +void compute_creds(struct linux_binprm *bprm) 
  13.688 +{
  13.689 +	kernel_cap_t new_permitted, working;
  13.690 +	int do_unlock = 0;
  13.691 +
  13.692 +	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
  13.693 +	working = cap_intersect(bprm->cap_inheritable,
  13.694 +				current->cap_inheritable);
  13.695 +	new_permitted = cap_combine(new_permitted, working);
  13.696 +
  13.697 +	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
  13.698 +	    !cap_issubset(new_permitted, current->cap_permitted)) {
  13.699 +                current->mm->dumpable = 0;
  13.700 +		
  13.701 +		lock_kernel();
  13.702 +		if (must_not_trace_exec(current)
  13.703 +		    || atomic_read(&current->fs->count) > 1
  13.704 +		    || atomic_read(&current->files->count) > 1
  13.705 +		    || atomic_read(&current->sig->count) > 1) {
  13.706 +			if(!capable(CAP_SETUID)) {
  13.707 +				bprm->e_uid = current->uid;
  13.708 +				bprm->e_gid = current->gid;
  13.709 +			}
  13.710 +			if(!capable(CAP_SETPCAP)) {
  13.711 +				new_permitted = cap_intersect(new_permitted,
  13.712 +							current->cap_permitted);
  13.713 +			}
  13.714 +		}
  13.715 +		do_unlock = 1;
  13.716 +	}
  13.717 +
  13.718 +
  13.719 +	/* For init, we want to retain the capabilities set
  13.720 +         * in the init_task struct. Thus we skip the usual
  13.721 +         * capability rules */
  13.722 +	if (current->pid != 1) {
  13.723 +		current->cap_permitted = new_permitted;
  13.724 +		current->cap_effective =
  13.725 +			cap_intersect(new_permitted, bprm->cap_effective);
  13.726 +	}
  13.727 +	
  13.728 +        /* AUD: Audit candidate if current->cap_effective is set */
  13.729 +
  13.730 +        current->suid = current->euid = current->fsuid = bprm->e_uid;
  13.731 +        current->sgid = current->egid = current->fsgid = bprm->e_gid;
  13.732 +
  13.733 +	if(do_unlock)
  13.734 +		unlock_kernel();
  13.735 +	current->keep_capabilities = 0;
  13.736 +}
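/*
 * Illustrative userspace sketch of the capability formula above,
 *
 *      pP' = (fP & X) | (fI & pI)
 *      pE' = pP' & fE
 *
 * worked through with plain 32-bit masks.  The names and values are
 * hypothetical stand-ins for kernel_cap_t and the cap_intersect()/
 * cap_combine()/cap_issubset() helpers used by compute_creds().
 */
#include <stdio.h>

int main(void)
{
    unsigned int pI = 0x05;        /* process inheritable set          */
    unsigned int pP = 0x0f;        /* process permitted set            */
    unsigned int fI = 0x04;        /* file inheritable set             */
    unsigned int fP = 0xf0;        /* file permitted set               */
    unsigned int fE = ~0u;         /* file effective: all ones or zero */
    unsigned int X  = 0xff;        /* global cap_bset                  */

    unsigned int new_pP = (fP & X) | (fI & pI);   /* pP' = 0xf4 */
    unsigned int new_pE = new_pP & fE;            /* pE' = 0xf4 */
    int raised = (new_pP & ~pP) != 0;             /* analogue of !cap_issubset() */

    printf("pP'=%#x pE'=%#x pI'=%#x raised=%d\n", new_pP, new_pE, pI, raised);
    return 0;
}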
  13.737 +
  13.738 +
  13.739 +void remove_arg_zero(struct linux_binprm *bprm)
  13.740 +{
  13.741 +	if (bprm->argc) {
  13.742 +		unsigned long offset;
  13.743 +		char * kaddr;
  13.744 +		struct page *page;
  13.745 +
  13.746 +		offset = bprm->p % PAGE_SIZE;
  13.747 +		goto inside;
  13.748 +
  13.749 +		while (bprm->p++, *(kaddr+offset++)) {
  13.750 +			if (offset != PAGE_SIZE)
  13.751 +				continue;
  13.752 +			offset = 0;
  13.753 +			kunmap(page);
  13.754 +inside:
  13.755 +			page = bprm->page[bprm->p/PAGE_SIZE];
  13.756 +			kaddr = kmap(page);
  13.757 +		}
  13.758 +		kunmap(page);
  13.759 +		bprm->argc--;
  13.760 +	}
  13.761 +}
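/*
 * A simplified, flat-buffer sketch of what remove_arg_zero() above does:
 * advance bprm->p past argv[0] (including its trailing NUL) and drop one
 * from argc.  The kernel version has the same effect but must kmap()/kunmap()
 * each argument page as the scan crosses page boundaries, hence the goto
 * into the loop.  Names below are hypothetical.
 */
#include <stdio.h>

static void remove_arg_zero_flat(const char *args, unsigned long *p, int *argc)
{
    if (*argc) {
        while (args[(*p)++] != '\0')
            ;                        /* skip argv[0] and its NUL */
        (*argc)--;
    }
}

int main(void)
{
    const char args[] = "ls\0-l";    /* argv[0]="ls", argv[1]="-l" */
    unsigned long p = 0;
    int argc = 2;

    remove_arg_zero_flat(args, &p, &argc);
    printf("p=%lu argc=%d next arg=\"%s\"\n", p, argc, &args[p]);  /* 3 1 -l */
    return 0;
}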
  13.762 +
  13.763 +/*
   13.764 + * cycle through the list of binary format handlers until one recognizes the image
  13.765 + */
  13.766 +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
  13.767 +{
  13.768 +	int try,retval=0;
  13.769 +	struct linux_binfmt *fmt;
  13.770 +#ifdef __alpha__
  13.771 +	/* handle /sbin/loader.. */
  13.772 +	{
  13.773 +	    struct exec * eh = (struct exec *) bprm->buf;
  13.774 +
  13.775 +	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
  13.776 +		(eh->fh.f_flags & 0x3000) == 0x3000)
  13.777 +	    {
  13.778 +		struct file * file;
  13.779 +		unsigned long loader;
  13.780 +
  13.781 +		allow_write_access(bprm->file);
  13.782 +		fput(bprm->file);
  13.783 +		bprm->file = NULL;
  13.784 +
  13.785 +	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
  13.786 +
  13.787 +		file = open_exec("/sbin/loader");
  13.788 +		retval = PTR_ERR(file);
  13.789 +		if (IS_ERR(file))
  13.790 +			return retval;
  13.791 +
  13.792 +		/* Remember if the application is TASO.  */
  13.793 +		bprm->sh_bang = eh->ah.entry < 0x100000000;
  13.794 +
  13.795 +		bprm->file = file;
  13.796 +		bprm->loader = loader;
  13.797 +		retval = prepare_binprm(bprm);
  13.798 +		if (retval<0)
  13.799 +			return retval;
  13.800 +		/* should call search_binary_handler recursively here,
  13.801 +		   but it does not matter */
  13.802 +	    }
  13.803 +	}
  13.804 +#endif
  13.805 +	/* kernel module loader fixup */
   13.806 +	/* so we don't try to run modprobe in kernel space. */
  13.807 +	set_fs(USER_DS);
  13.808 +	for (try=0; try<2; try++) {
  13.809 +		read_lock(&binfmt_lock);
  13.810 +		for (fmt = formats ; fmt ; fmt = fmt->next) {
  13.811 +			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
  13.812 +			if (!fn)
  13.813 +				continue;
  13.814 +			if (!try_inc_mod_count(fmt->module))
  13.815 +				continue;
  13.816 +			read_unlock(&binfmt_lock);
  13.817 +			retval = fn(bprm, regs);
  13.818 +			if (retval >= 0) {
  13.819 +				put_binfmt(fmt);
  13.820 +				allow_write_access(bprm->file);
  13.821 +				if (bprm->file)
  13.822 +					fput(bprm->file);
  13.823 +				bprm->file = NULL;
  13.824 +				current->did_exec = 1;
  13.825 +				return retval;
  13.826 +			}
  13.827 +			read_lock(&binfmt_lock);
  13.828 +			put_binfmt(fmt);
  13.829 +			if (retval != -ENOEXEC)
  13.830 +				break;
  13.831 +			if (!bprm->file) {
  13.832 +				read_unlock(&binfmt_lock);
  13.833 +				return retval;
  13.834 +			}
  13.835 +		}
  13.836 +		read_unlock(&binfmt_lock);
  13.837 +		if (retval != -ENOEXEC) {
  13.838 +			break;
  13.839 +#ifdef CONFIG_KMOD
  13.840 +		}else{
  13.841 +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
  13.842 +			char modname[20];
  13.843 +			if (printable(bprm->buf[0]) &&
  13.844 +			    printable(bprm->buf[1]) &&
  13.845 +			    printable(bprm->buf[2]) &&
  13.846 +			    printable(bprm->buf[3]))
  13.847 +				break; /* -ENOEXEC */
  13.848 +			sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
  13.849 +			request_module(modname);
  13.850 +#endif
  13.851 +		}
  13.852 +	}
  13.853 +	return retval;
  13.854 +}
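/*
 * A small sketch of the CONFIG_KMOD fallback above: when no handler accepts
 * the image and its header is not printable text, a module named
 * "binfmt-%04x" is requested, where the number is the 16-bit value at
 * buf[2].  For an ELF header (0x7f 'E' 'L' 'F'), bytes 2 and 3 are 'L','F',
 * which little-endian x86 reads as 0x464c, so the request is "binfmt-464c".
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char buf[4] = { 0x7f, 'E', 'L', 'F' };
    unsigned short code;
    char modname[20];

    memcpy(&code, &buf[2], sizeof(code));     /* avoids an unaligned cast */
    sprintf(modname, "binfmt-%04x", code);
    printf("%s\n", modname);                  /* "binfmt-464c" on x86 */
    return 0;
}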
  13.855 +
  13.856 +
  13.857 +/*
  13.858 + * sys_execve() executes a new program.
  13.859 + */
  13.860 +int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
  13.861 +{
  13.862 +	struct linux_binprm bprm;
  13.863 +	struct file *file;
  13.864 +	int retval;
  13.865 +	int i;
  13.866 +
  13.867 +	file = open_exec(filename);
  13.868 +
  13.869 +	retval = PTR_ERR(file);
  13.870 +	if (IS_ERR(file))
  13.871 +		return retval;
  13.872 +
  13.873 +	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
  13.874 +	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); 
  13.875 +
  13.876 +	bprm.file = file;
  13.877 +	bprm.filename = filename;
  13.878 +	bprm.sh_bang = 0;
  13.879 +	bprm.loader = 0;
  13.880 +	bprm.exec = 0;
  13.881 +	if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
  13.882 +		allow_write_access(file);
  13.883 +		fput(file);
  13.884 +		return bprm.argc;
  13.885 +	}
  13.886 +
  13.887 +	if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
  13.888 +		allow_write_access(file);
  13.889 +		fput(file);
  13.890 +		return bprm.envc;
  13.891 +	}
  13.892 +
  13.893 +	retval = prepare_binprm(&bprm);
  13.894 +	if (retval < 0) 
  13.895 +		goto out; 
  13.896 +
  13.897 +	retval = copy_strings_kernel(1, &bprm.filename, &bprm);
  13.898 +	if (retval < 0) 
  13.899 +		goto out; 
  13.900 +
  13.901 +	bprm.exec = bprm.p;
  13.902 +	retval = copy_strings(bprm.envc, envp, &bprm);
  13.903 +	if (retval < 0) 
  13.904 +		goto out; 
  13.905 +
  13.906 +	retval = copy_strings(bprm.argc, argv, &bprm);
  13.907 +	if (retval < 0) 
  13.908 +		goto out; 
  13.909 +
  13.910 +	retval = search_binary_handler(&bprm,regs);
  13.911 +	if (retval >= 0)
  13.912 +		/* execve success */
  13.913 +		return retval;
  13.914 +
  13.915 +out:
  13.916 +	/* Something went wrong, return the inode and free the argument pages*/
  13.917 +	allow_write_access(bprm.file);
  13.918 +	if (bprm.file)
  13.919 +		fput(bprm.file);
  13.920 +
  13.921 +	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
  13.922 +		struct page * page = bprm.page[i];
  13.923 +		if (page)
  13.924 +			__free_page(page);
  13.925 +	}
  13.926 +
  13.927 +	return retval;
  13.928 +}
  13.929 +
  13.930 +void set_binfmt(struct linux_binfmt *new)
  13.931 +{
  13.932 +	struct linux_binfmt *old = current->binfmt;
  13.933 +	if (new && new->module)
  13.934 +		__MOD_INC_USE_COUNT(new->module);
  13.935 +	current->binfmt = new;
  13.936 +	if (old && old->module)
  13.937 +		__MOD_DEC_USE_COUNT(old->module);
  13.938 +}
  13.939 +
  13.940 +int do_coredump(long signr, struct pt_regs * regs)
  13.941 +{
  13.942 +	struct linux_binfmt * binfmt;
  13.943 +	char corename[6+sizeof(current->comm)+10];
  13.944 +	struct file * file;
  13.945 +	struct inode * inode;
  13.946 +	int retval = 0;
  13.947 +
  13.948 +	lock_kernel();
  13.949 +	binfmt = current->binfmt;
  13.950 +	if (!binfmt || !binfmt->core_dump)
  13.951 +		goto fail;
  13.952 +	if (!current->mm->dumpable)
  13.953 +		goto fail;
  13.954 +	current->mm->dumpable = 0;
  13.955 +	if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
  13.956 +		goto fail;
  13.957 +
  13.958 +	memcpy(corename,"core.", 5);
  13.959 +	corename[4] = '\0';
  13.960 + 	if (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)
  13.961 + 		sprintf(&corename[4], ".%d", current->pid);
  13.962 +	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600);
  13.963 +	if (IS_ERR(file))
  13.964 +		goto fail;
  13.965 +	inode = file->f_dentry->d_inode;
  13.966 +	if (inode->i_nlink > 1)
  13.967 +		goto close_fail;	/* multiple links - don't dump */
  13.968 +	if (d_unhashed(file->f_dentry))
  13.969 +		goto close_fail;
  13.970 +
  13.971 +	if (!S_ISREG(inode->i_mode))
  13.972 +		goto close_fail;
  13.973 +	if (!file->f_op)
  13.974 +		goto close_fail;
  13.975 +	if (!file->f_op->write)
  13.976 +		goto close_fail;
  13.977 +	if (do_truncate(file->f_dentry, 0) != 0)
  13.978 +		goto close_fail;
  13.979 +
  13.980 +	down_read(&current->mm->mmap_sem);
  13.981 +	retval = binfmt->core_dump(signr, regs, file);
  13.982 +	up_read(&current->mm->mmap_sem);
  13.983 +
  13.984 +close_fail:
  13.985 +	filp_close(file, NULL);
  13.986 +fail:
  13.987 +	unlock_kernel();
  13.988 +	return retval;
  13.989 +}
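/*
 * A quick sketch of the corename construction in do_coredump() above.  The
 * memcpy() copies "core." but the '\0' written at corename[4] truncates the
 * name to "core"; only when core_uses_pid is set (or the mm is shared) is
 * ".<pid>" appended at that same offset, giving e.g. "core.1234".
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char corename[32];
    int core_uses_pid = 1, pid = 1234;

    memcpy(corename, "core.", 5);
    corename[4] = '\0';                       /* -> "core"      */
    if (core_uses_pid)
        sprintf(&corename[4], ".%d", pid);    /* -> "core.1234" */
    printf("%s\n", corename);
    return 0;
}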
    14.1 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h	Wed Dec 18 18:07:22 2002 +0000
    14.2 +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h	Fri Jan 03 18:24:03 2003 +0000
    14.3 @@ -24,18 +24,111 @@ extern union start_info_union start_info
    14.4  /* arch/xeno/kernel/hypervisor.c */
    14.5  void do_hypervisor_callback(struct pt_regs *regs);
    14.6  
    14.7 +
    14.8  /* arch/xeno/mm/hypervisor.c */
    14.9  /*
   14.10 - * NB. ptr values should be fake-physical. 'vals' should be alread
   14.11 + * NB. ptr values should be fake-physical. 'vals' should be already
   14.12   * fully adjusted (ie. for start_info.phys_base).
   14.13   */
   14.14 +
   14.15 +extern unsigned int pt_update_queue_idx;
   14.16 +
   14.17  void queue_l1_entry_update(unsigned long ptr, unsigned long val);
   14.18  void queue_l2_entry_update(unsigned long ptr, unsigned long val);
   14.19 -void queue_baseptr_create(unsigned long ptr);
   14.20 -void queue_baseptr_remove(unsigned long ptr);
   14.21 +void queue_pt_switch(unsigned long ptr);
   14.22  void queue_tlb_flush(void);
   14.23 -void queue_tlb_flush_one(unsigned long ptr);
   14.24 -void flush_page_update_queue(void);
   14.25 +void queue_invlpg(unsigned long ptr);
   14.26 +void queue_pgd_pin(unsigned long ptr);
   14.27 +void queue_pgd_unpin(unsigned long ptr);
   14.28 +void queue_pte_pin(unsigned long ptr);
   14.29 +void queue_pte_unpin(unsigned long ptr);
   14.30 +
   14.31 +#define PT_UPDATE_DEBUG 0
   14.32 +
   14.33 +#if PT_UPDATE_DEBUG > 0
   14.34 +typedef struct {
   14.35 +    unsigned long ptr, val, pteval;
   14.36 +    void *ptep;
   14.37 +    int line; char *file;
   14.38 +} page_update_debug_t;
   14.39 +extern page_update_debug_t update_debug_queue[];
   14.40 +#define queue_l1_entry_update(_p,_v) ({                           \
   14.41 + update_debug_queue[pt_update_queue_idx].ptr  = (_p);             \
   14.42 + update_debug_queue[pt_update_queue_idx].val  = (_v);             \
   14.43 + update_debug_queue[pt_update_queue_idx].line = __LINE__;         \
   14.44 + update_debug_queue[pt_update_queue_idx].file = __FILE__;         \
   14.45 + queue_l1_entry_update((_p),(_v));                                \
   14.46 +})
   14.47 +#define queue_l2_entry_update(_p,_v) ({                           \
   14.48 + update_debug_queue[pt_update_queue_idx].ptr  = (_p);             \
   14.49 + update_debug_queue[pt_update_queue_idx].val  = (_v);             \
   14.50 + update_debug_queue[pt_update_queue_idx].line = __LINE__;         \
   14.51 + update_debug_queue[pt_update_queue_idx].file = __FILE__;         \
   14.52 + queue_l2_entry_update((_p),(_v));                                \
   14.53 +})
   14.54 +#endif
   14.55 +
   14.56 +#if PT_UPDATE_DEBUG > 1
   14.57 +#undef queue_l1_entry_update
   14.58 +#undef queue_l2_entry_update
   14.59 +#define queue_l1_entry_update(_p,_v) ({                           \
   14.60 + update_debug_queue[pt_update_queue_idx].ptr  = (_p);             \
   14.61 + update_debug_queue[pt_update_queue_idx].val  = (_v);             \
   14.62 + update_debug_queue[pt_update_queue_idx].line = __LINE__;         \
   14.63 + update_debug_queue[pt_update_queue_idx].file = __FILE__;         \
   14.64 + printk("L1 %s %d: %08lx (%08lx -> %08lx)\n", __FILE__, __LINE__, \
   14.65 +        (_p)+start_info.phys_base, *(unsigned long *)__va(_p),    \
   14.66 +        (unsigned long)(_v));                                     \
   14.67 + queue_l1_entry_update((_p),(_v));                                \
   14.68 +})
   14.69 +#define queue_l2_entry_update(_p,_v) ({                           \
   14.70 + update_debug_queue[pt_update_queue_idx].ptr  = (_p);             \
   14.71 + update_debug_queue[pt_update_queue_idx].val  = (_v);             \
   14.72 + update_debug_queue[pt_update_queue_idx].line = __LINE__;         \
   14.73 + update_debug_queue[pt_update_queue_idx].file = __FILE__;         \
   14.74 + printk("L2 %s %d: %08lx (%08lx -> %08lx)\n", __FILE__, __LINE__, \
   14.75 +        (_p)+start_info.phys_base, *(unsigned long *)__va(_p),    \
   14.76 +        (unsigned long)(_v));                                     \
   14.77 + queue_l2_entry_update((_p),(_v));                                \
   14.78 +})
   14.79 +#define queue_pt_switch(_p) ({                                    \
   14.80 + printk("PTSWITCH %s %d: %08lx\n", __FILE__, __LINE__, (_p));     \
   14.81 + queue_pt_switch(_p);                                             \
   14.82 +})   
   14.83 +#define queue_tlb_flush() ({                                      \
   14.84 + printk("TLB FLUSH %s %d\n", __FILE__, __LINE__);                 \
   14.85 + queue_tlb_flush();                                               \
   14.86 +})   
   14.87 +#define queue_invlpg(_p) ({                                       \
   14.88 + printk("INVLPG %s %d: %08lx\n", __FILE__, __LINE__, (_p));       \
   14.89 + queue_invlpg(_p);                                                \
   14.90 +})   
   14.91 +#define queue_pgd_pin(_p) ({                                      \
   14.92 + printk("PGD PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p));      \
   14.93 + queue_pgd_pin(_p);                                               \
   14.94 +})   
   14.95 +#define queue_pgd_unpin(_p) ({                                    \
   14.96 + printk("PGD UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p));    \
   14.97 + queue_pgd_unpin(_p);                                             \
   14.98 +})   
   14.99 +#define queue_pte_pin(_p) ({                                      \
  14.100 + printk("PTE PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p));      \
  14.101 + queue_pte_pin(_p);                                               \
  14.102 +})   
  14.103 +#define queue_pte_unpin(_p) ({                                    \
  14.104 + printk("PTE UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p));    \
  14.105 + queue_pte_unpin(_p);                                             \
  14.106 +})   
  14.107 +#endif
  14.108 +
  14.109 +void _flush_page_update_queue(void);
  14.110 +static inline int flush_page_update_queue(void)
  14.111 +{
  14.112 +    unsigned int idx = pt_update_queue_idx;
  14.113 +    if ( idx != 0 ) _flush_page_update_queue();
  14.114 +    return idx;
  14.115 +}
  14.116 +#define XENO_flush_page_update_queue() (_flush_page_update_queue())
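/*
 * A minimal userspace simulation of the batching idea behind the queue_*()
 * and flush_page_update_queue() interface above: updates accumulate in a
 * fixed-size queue and are handed to the hypervisor in one batch, so a
 * flush costs nothing when the queue is empty.  The queue size, names and
 * the printf() standing in for the hypercall are all hypothetical.
 */
#include <stdio.h>

#define QUEUE_SIZE 8

struct pt_update { unsigned long ptr, val; };

static struct pt_update update_queue[QUEUE_SIZE];
static unsigned int queue_idx;

static void flush_queue(void)
{
    if (queue_idx == 0)
        return;                           /* empty: no hypercall issued */
    printf("hypercall: apply %u queued updates\n", queue_idx);
    queue_idx = 0;
}

static void queue_update(unsigned long ptr, unsigned long val)
{
    update_queue[queue_idx].ptr = ptr;
    update_queue[queue_idx].val = val;
    if (++queue_idx == QUEUE_SIZE)
        flush_queue();                    /* full: flush eagerly */
}

int main(void)
{
    unsigned long i;

    /* e.g. update a run of PTEs, then make them visible with one flush */
    for (i = 0; i < 5; i++)
        queue_update(0x100000 + i * 4, i);
    flush_queue();
    return 0;
}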
  14.117  
  14.118  
  14.119  /*
  14.120 @@ -78,17 +171,6 @@ static inline int HYPERVISOR_console_wri
  14.121      return ret;
  14.122  }
  14.123  
  14.124 -static inline int HYPERVISOR_set_pagetable(unsigned long ptr)
  14.125 -{
  14.126 -    int ret;
  14.127 -    __asm__ __volatile__ (
  14.128 -        TRAP_INSTR
  14.129 -        : "=a" (ret) : "0" (__HYPERVISOR_set_pagetable),
  14.130 -        "b" (ptr) );
  14.131 -
  14.132 -    return ret;
  14.133 -}
  14.134 -
  14.135  static inline int HYPERVISOR_set_guest_stack(
  14.136      unsigned long ss, unsigned long esp)
  14.137  {
    15.1 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h	Wed Dec 18 18:07:22 2002 +0000
    15.2 +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h	Fri Jan 03 18:24:03 2003 +0000
    15.3 @@ -45,7 +45,8 @@ static inline void switch_mm(struct mm_s
    15.4  		set_bit(cpu, &next->context.cpuvalid);
    15.5  		/* Re-load page tables */
    15.6                  cur_pgd = next->pgd;
    15.7 -                HYPERVISOR_set_pagetable(__pa(cur_pgd) + start_info.phys_base);
    15.8 +                queue_pt_switch(__pa(cur_pgd));
    15.9 +                XENO_flush_page_update_queue();
   15.10  	}
   15.11  #ifdef CONFIG_SMP
   15.12  	else {
    16.1 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h	Wed Dec 18 18:07:22 2002 +0000
    16.2 +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h	Fri Jan 03 18:24:03 2003 +0000
    16.3 @@ -7,13 +7,23 @@
    16.4  #include <asm/fixmap.h>
    16.5  #include <linux/threads.h>
    16.6  
    16.7 +/*
    16.8 + * Quick lists are aligned so that least significant bits of array pointer
    16.9 + * are all zero when list is empty, and all one when list is full.
   16.10 + */
   16.11 +#define QUICKLIST_ENTRIES 256
   16.12 +#define QUICKLIST_EMPTY(_l) !((unsigned long)(_l) & ((QUICKLIST_ENTRIES*4)-1))
   16.13 +#define QUICKLIST_FULL(_l)  QUICKLIST_EMPTY((_l)+1)
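/*
 * A userspace demonstration of the alignment trick behind the
 * QUICKLIST_EMPTY/QUICKLIST_FULL macros above.  The quicklist is a
 * 1024-byte-aligned array of 256 4-byte slots and the list pointer points
 * at the next free slot, so its low ten bits are zero exactly when the list
 * is empty; FULL tests the slot after the current one, which leaves the
 * final slot unused so that a full list never aliases an empty one.
 * (unsigned int is used for the slots to keep them 4 bytes wide on any
 * host, matching the i386 assumption baked into the macros.)
 */
#include <stdio.h>

#define QUICKLIST_ENTRIES 256
#define QUICKLIST_EMPTY(_l) (!((unsigned long)(_l) & ((QUICKLIST_ENTRIES*4)-1)))
#define QUICKLIST_FULL(_l)  QUICKLIST_EMPTY((_l)+1)

static unsigned int slots[QUICKLIST_ENTRIES] __attribute__((aligned(1024)));

int main(void)
{
    unsigned int *list = slots;
    int pushed = 0;

    while (!QUICKLIST_FULL(list)) {
        *(list++) = 0xdeadbeef;           /* push a free page's address */
        pushed++;
    }
    printf("empty at base: %d, entries when full: %d\n",
           QUICKLIST_EMPTY(slots), pushed);   /* prints 1, 255 */
    return 0;
}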
   16.14  #define pgd_quicklist (current_cpu_data.pgd_quick)
   16.15  #define pmd_quicklist (current_cpu_data.pmd_quick)
   16.16  #define pte_quicklist (current_cpu_data.pte_quick)
   16.17  #define pgtable_cache_size (current_cpu_data.pgtable_cache_sz)
   16.18  
   16.19 -#define pmd_populate(mm, pmd, pte) \
   16.20 - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
   16.21 +#define pmd_populate(mm, pmd, pte)                \
   16.22 + do {                                             \
   16.23 +  set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));   \
   16.24 +  XENO_flush_page_update_queue();                 \
   16.25 + } while ( 0 )
   16.26  
   16.27  static __inline__ pgd_t *get_pgd_slow(void)
   16.28  {
   16.29 @@ -31,43 +41,18 @@ static __inline__ pgd_t *get_pgd_slow(vo
   16.30          kpmd = pmd_offset(kpgd, (unsigned long)pgd);
   16.31          kpte = pte_offset(kpmd, (unsigned long)pgd);
   16.32          queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)&~_PAGE_RW);
   16.33 -        queue_baseptr_create(__pa(pgd));
   16.34 +        queue_pgd_pin(__pa(pgd));
   16.35      }
   16.36  
   16.37      return pgd;
   16.38  }
   16.39  
   16.40 -#if 0
   16.41 -static __inline__ pgd_t *get_pgd_fast(void)
   16.42 -{
   16.43 -    unsigned long *ret;
   16.44 -
   16.45 -    if ((ret = pgd_quicklist) != NULL) {
   16.46 -        pgd_quicklist = (unsigned long *)(*ret);
   16.47 -        ret[0] = 0;
   16.48 -        pgtable_cache_size--;
   16.49 -    } else
   16.50 -        ret = (unsigned long *)get_pgd_slow();
   16.51 -    return (pgd_t *)ret;
   16.52 -}
   16.53 -
   16.54 -static __inline__ void free_pgd_fast(pgd_t *pgd)
   16.55 -{
   16.56 -    *(unsigned long *)pgd = (unsigned long) pgd_quicklist;
   16.57 -    pgd_quicklist = (unsigned long *) pgd;
   16.58 -    pgtable_cache_size++;
   16.59 -}
   16.60 -#else
   16.61 -#define get_pgd_fast get_pgd_slow
   16.62 -#define free_pgd_fast free_pgd_slow
   16.63 -#endif
   16.64 -
   16.65  static __inline__ void free_pgd_slow(pgd_t *pgd)
   16.66  {
   16.67      pgd_t *kpgd;
   16.68      pmd_t *kpmd;
   16.69      pte_t *kpte;
   16.70 -    queue_baseptr_remove(__pa(pgd));
   16.71 +    queue_pgd_unpin(__pa(pgd));
   16.72      kpgd = pgd_offset_k((unsigned long)pgd);
   16.73      kpmd = pmd_offset(kpgd, (unsigned long)pgd);
   16.74      kpte = pte_offset(kpmd, (unsigned long)pgd);
   16.75 @@ -75,6 +60,27 @@ static __inline__ void free_pgd_slow(pgd
   16.76      free_page((unsigned long)pgd);
   16.77  }
   16.78  
   16.79 +static __inline__ pgd_t *get_pgd_fast(void)
   16.80 +{
   16.81 +    unsigned long ret;
   16.82 +
   16.83 +    if ( !QUICKLIST_EMPTY(pgd_quicklist) ) {
   16.84 +        ret = *(--pgd_quicklist);
   16.85 +        pgtable_cache_size--;
   16.86 +    } else
   16.87 +        ret = (unsigned long)get_pgd_slow();
   16.88 +    return (pgd_t *)ret;
   16.89 +}
   16.90 +
   16.91 +static __inline__ void free_pgd_fast(pgd_t *pgd)
   16.92 +{
   16.93 +    if ( !QUICKLIST_FULL(pgd_quicklist) ) {
   16.94 +        *(pgd_quicklist++) = (unsigned long)pgd;
   16.95 +        pgtable_cache_size++;
   16.96 +    } else
   16.97 +        free_pgd_slow(pgd);
   16.98 +}
   16.99 +
  16.100  static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  16.101  {
  16.102      pte_t *pte;
  16.103 @@ -90,18 +96,29 @@ static inline pte_t *pte_alloc_one(struc
  16.104          kpmd = pmd_offset(kpgd, (unsigned long)pte);
  16.105          kpte = pte_offset(kpmd, (unsigned long)pte);
  16.106          queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)&~_PAGE_RW);
  16.107 +        queue_pte_pin(__pa(pte));
  16.108      }
  16.109      return pte;
  16.110  }
  16.111  
  16.112 -#if 0
  16.113 +static __inline__ void pte_free_slow(pte_t *pte)
  16.114 +{
  16.115 +    pgd_t *kpgd;
  16.116 +    pmd_t *kpmd;
  16.117 +    pte_t *kpte;
  16.118 +    queue_pte_unpin(__pa(pte));
  16.119 +    kpgd = pgd_offset_k((unsigned long)pte);
  16.120 +    kpmd = pmd_offset(kpgd, (unsigned long)pte);
  16.121 +    kpte = pte_offset(kpmd, (unsigned long)pte);
  16.122 +    queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)|_PAGE_RW);
  16.123 +    free_page((unsigned long)pte);
  16.124 +}
  16.125 +
  16.126  static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address)
  16.127  {
  16.128 -    unsigned long *ret;
  16.129 -
  16.130 -    if ((ret = (unsigned long *)pte_quicklist) != NULL) {
  16.131 -        pte_quicklist = (unsigned long *)(*ret);
  16.132 -        ret[0] = ret[1];
  16.133 +    unsigned long ret = 0;
  16.134 +    if ( !QUICKLIST_EMPTY(pte_quicklist) ) {
  16.135 +        ret = *(--pte_quicklist);
  16.136          pgtable_cache_size--;
  16.137      }
  16.138      return (pte_t *)ret;
  16.139 @@ -109,25 +126,11 @@ static inline pte_t *pte_alloc_one_fast(
  16.140  
  16.141  static __inline__ void pte_free_fast(pte_t *pte)
  16.142  {
  16.143 -    *(unsigned long *)pte = (unsigned long) pte_quicklist;
  16.144 -    pte_quicklist = (unsigned long *) pte;
  16.145 -    pgtable_cache_size++;
  16.146 -}
  16.147 -#else
  16.148 -#define pte_alloc_one_fast pte_alloc_one
  16.149 -#define pte_free_fast pte_free_slow
  16.150 -#endif
  16.151 -
  16.152 -static __inline__ void pte_free_slow(pte_t *pte)
  16.153 -{
  16.154 -    pgd_t *kpgd;
  16.155 -    pmd_t *kpmd;
  16.156 -    pte_t *kpte;
  16.157 -    kpgd = pgd_offset_k((unsigned long)pte);
  16.158 -    kpmd = pmd_offset(kpgd, (unsigned long)pte);
  16.159 -    kpte = pte_offset(kpmd, (unsigned long)pte);
  16.160 -    queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)|_PAGE_RW);
  16.161 -    free_page((unsigned long)pte);
  16.162 +    if ( !QUICKLIST_FULL(pte_quicklist) ) {
  16.163 +        *(pte_quicklist++) = (unsigned long)pte;
  16.164 +        pgtable_cache_size++;
  16.165 +    } else
  16.166 +        pte_free_slow(pte);
  16.167  }
  16.168  
  16.169  #define pte_free(pte)		pte_free_fast(pte)
  16.170 @@ -158,28 +161,29 @@ extern int do_check_pgt_cache(int, int);
  16.171  
  16.172  static inline void flush_tlb_mm(struct mm_struct *mm)
  16.173  {
  16.174 -    if (mm == current->active_mm)
  16.175 -        __flush_tlb();
  16.176 +    if ( mm == current->active_mm ) queue_tlb_flush();
  16.177 +    XENO_flush_page_update_queue();
  16.178  }
  16.179  
  16.180  static inline void flush_tlb_page(struct vm_area_struct *vma,
  16.181                                    unsigned long addr)
  16.182  {
  16.183 -    if (vma->vm_mm == current->active_mm)
  16.184 -        __flush_tlb_one(addr);
  16.185 +    if ( vma->vm_mm == current->active_mm ) queue_invlpg(addr);
  16.186 +    XENO_flush_page_update_queue();
  16.187  }
  16.188  
  16.189  static inline void flush_tlb_range(struct mm_struct *mm,
  16.190                                     unsigned long start, unsigned long end)
  16.191  {
  16.192 -    if (mm == current->active_mm)
  16.193 -        __flush_tlb();
  16.194 +    if ( mm == current->active_mm ) queue_tlb_flush();
  16.195 +    XENO_flush_page_update_queue();
  16.196  }
  16.197  
  16.198  static inline void flush_tlb_pgtables(struct mm_struct *mm,
  16.199  				      unsigned long start, unsigned long end)
  16.200  {
  16.201      /* i386 does not keep any page table caches in TLB */
  16.202 +    XENO_flush_page_update_queue();
  16.203  }
  16.204  
  16.205  #endif /* _I386_PGALLOC_H */
    17.1 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h	Wed Dec 18 18:07:22 2002 +0000
    17.2 +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h	Fri Jan 03 18:24:03 2003 +0000
    17.3 @@ -37,10 +37,10 @@ extern void paging_init(void);
    17.4  
    17.5  extern unsigned long pgkern_mask;
    17.6  
    17.7 -#define __flush_tlb() queue_tlb_flush()
    17.8 +#define __flush_tlb() ({ queue_tlb_flush(); XENO_flush_page_update_queue(); })
    17.9  #define __flush_tlb_global() __flush_tlb()
   17.10  #define __flush_tlb_all() __flush_tlb_global()
   17.11 -#define __flush_tlb_one(addr) queue_tlb_flush_one(addr)
   17.12 +#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XENO_flush_page_update_queue(); })
   17.13  
   17.14  /*
   17.15   * ZERO_PAGE is a global shared page that is always zero: used
   17.16 @@ -281,7 +281,7 @@ static inline pte_t pte_modify(pte_t pte
   17.17  
   17.18  /* Find an entry in the third-level page table.. */
   17.19  #define __pte_offset(address) \
   17.20 -		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
   17.21 +                ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
   17.22  #define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \
   17.23  			__pte_offset(address))
   17.24  
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/xenolinux-2.4.16-sparse/mm/memory.c	Fri Jan 03 18:24:03 2003 +0000
    18.3 @@ -0,0 +1,1442 @@
    18.4 +/*
    18.5 + *  linux/mm/memory.c
    18.6 + *
    18.7 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    18.8 + */
    18.9 +
   18.10 +/*
   18.11 + * demand-loading started 01.12.91 - seems it is high on the list of
   18.12 + * things wanted, and it should be easy to implement. - Linus
   18.13 + */
   18.14 +
   18.15 +/*
    18.16 + * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
   18.17 + * pages started 02.12.91, seems to work. - Linus.
   18.18 + *
   18.19 + * Tested sharing by executing about 30 /bin/sh: under the old kernel it
   18.20 + * would have taken more than the 6M I have free, but it worked well as
   18.21 + * far as I could see.
   18.22 + *
   18.23 + * Also corrected some "invalidate()"s - I wasn't doing enough of them.
   18.24 + */
   18.25 +
   18.26 +/*
   18.27 + * Real VM (paging to/from disk) started 18.12.91. Much more work and
   18.28 + * thought has to go into this. Oh, well..
   18.29 + * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
   18.30 + *		Found it. Everything seems to work now.
   18.31 + * 20.12.91  -  Ok, making the swap-device changeable like the root.
   18.32 + */
   18.33 +
   18.34 +/*
   18.35 + * 05.04.94  -  Multi-page memory management added for v1.1.
   18.36 + * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
   18.37 + *
   18.38 + * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
   18.39 + *		(Gerhard.Wichert@pdb.siemens.de)
   18.40 + */
   18.41 +
   18.42 +#include <linux/mm.h>
   18.43 +#include <linux/mman.h>
   18.44 +#include <linux/swap.h>
   18.45 +#include <linux/smp_lock.h>
   18.46 +#include <linux/swapctl.h>
   18.47 +#include <linux/iobuf.h>
   18.48 +#include <linux/highmem.h>
   18.49 +#include <linux/pagemap.h>
   18.50 +
   18.51 +#include <asm/pgalloc.h>
   18.52 +#include <asm/uaccess.h>
   18.53 +#include <asm/tlb.h>
   18.54 +
   18.55 +unsigned long max_mapnr;
   18.56 +unsigned long num_physpages;
   18.57 +void * high_memory;
   18.58 +struct page *highmem_start_page;
   18.59 +
   18.60 +/*
   18.61 + * We special-case the C-O-W ZERO_PAGE, because it's such
   18.62 + * a common occurrence (no need to read the page to know
   18.63 + * that it's zero - better for the cache and memory subsystem).
   18.64 + */
   18.65 +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
   18.66 +{
   18.67 +	if (from == ZERO_PAGE(address)) {
   18.68 +		clear_user_highpage(to, address);
   18.69 +		return;
   18.70 +	}
   18.71 +	copy_user_highpage(to, from, address);
   18.72 +}
   18.73 +
   18.74 +mem_map_t * mem_map;
   18.75 +
   18.76 +/*
   18.77 + * Called by TLB shootdown 
   18.78 + */
   18.79 +void __free_pte(pte_t pte)
   18.80 +{
   18.81 +	struct page *page = pte_page(pte);
   18.82 +	if ((!VALID_PAGE(page)) || PageReserved(page))
   18.83 +		return;
   18.84 +	if (pte_dirty(pte))
   18.85 +		set_page_dirty(page);		
   18.86 +	free_page_and_swap_cache(page);
   18.87 +}
   18.88 +
   18.89 +
   18.90 +/*
   18.91 + * Note: this doesn't free the actual pages themselves. That
   18.92 + * has been handled earlier when unmapping all the memory regions.
   18.93 + */
   18.94 +static inline void free_one_pmd(pmd_t * dir)
   18.95 +{
   18.96 +	pte_t * pte;
   18.97 +
   18.98 +	if (pmd_none(*dir))
   18.99 +		return;
  18.100 +	if (pmd_bad(*dir)) {
  18.101 +		pmd_ERROR(*dir);
  18.102 +		pmd_clear(dir);
  18.103 +		return;
  18.104 +	}
  18.105 +	pte = pte_offset(dir, 0);
  18.106 +	pmd_clear(dir);
  18.107 +	pte_free(pte);
  18.108 +}
  18.109 +
  18.110 +static inline void free_one_pgd(pgd_t * dir)
  18.111 +{
  18.112 +	int j;
  18.113 +	pmd_t * pmd;
  18.114 +
  18.115 +	if (pgd_none(*dir))
  18.116 +		return;
  18.117 +	if (pgd_bad(*dir)) {
  18.118 +		pgd_ERROR(*dir);
  18.119 +		pgd_clear(dir);
  18.120 +		return;
  18.121 +	}
  18.122 +	pmd = pmd_offset(dir, 0);
  18.123 +	pgd_clear(dir);
  18.124 +	for (j = 0; j < PTRS_PER_PMD ; j++) {
  18.125 +		prefetchw(pmd+j+(PREFETCH_STRIDE/16));
  18.126 +		free_one_pmd(pmd+j);
  18.127 +	}
  18.128 +	pmd_free(pmd);
  18.129 +}
  18.130 +
  18.131 +/* Low and high watermarks for page table cache.
  18.132 +   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
  18.133 + */
  18.134 +int pgt_cache_water[2] = { 25, 50 };
  18.135 +
  18.136 +/* Returns the number of pages freed */
  18.137 +int check_pgt_cache(void)
  18.138 +{
  18.139 +	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
  18.140 +}
  18.141 +
  18.142 +
  18.143 +/*
  18.144 + * This function clears all user-level page tables of a process - this
  18.145 + * is needed by execve(), so that old pages aren't in the way.
  18.146 + */
  18.147 +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
  18.148 +{
  18.149 +	pgd_t * page_dir = mm->pgd;
  18.150 +
  18.151 +	spin_lock(&mm->page_table_lock);
  18.152 +	page_dir += first;
  18.153 +	do {
  18.154 +		free_one_pgd(page_dir);
  18.155 +		page_dir++;
  18.156 +	} while (--nr);
  18.157 +	XENO_flush_page_update_queue();
  18.158 +	spin_unlock(&mm->page_table_lock);
  18.159 +
  18.160 +	/* keep the page table cache within bounds */
  18.161 +	check_pgt_cache();
  18.162 +}
  18.163 +
  18.164 +#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
  18.165 +#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
  18.166 +
  18.167 +/*
  18.168 + * copy one vm_area from one task to the other. Assumes the page tables
  18.169 + * already present in the new task to be cleared in the whole range
  18.170 + * covered by this vma.
  18.171 + *
  18.172 + * 08Jan98 Merged into one routine from several inline routines to reduce
  18.173 + *         variable count and make things faster. -jj
  18.174 + *
  18.175 + * dst->page_table_lock is held on entry and exit,
  18.176 + * but may be dropped within pmd_alloc() and pte_alloc().
  18.177 + */
  18.178 +int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
  18.179 +			struct vm_area_struct *vma)
  18.180 +{
  18.181 +	pgd_t * src_pgd, * dst_pgd;
  18.182 +	unsigned long address = vma->vm_start;
  18.183 +	unsigned long end = vma->vm_end;
  18.184 +	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
  18.185 +
  18.186 +	src_pgd = pgd_offset(src, address)-1;
  18.187 +	dst_pgd = pgd_offset(dst, address)-1;
  18.188 +
  18.189 +	for (;;) {
  18.190 +		pmd_t * src_pmd, * dst_pmd;
  18.191 +
  18.192 +		src_pgd++; dst_pgd++;
  18.193 +		
  18.194 +		/* copy_pmd_range */
  18.195 +		
  18.196 +		if (pgd_none(*src_pgd))
  18.197 +			goto skip_copy_pmd_range;
  18.198 +		if (pgd_bad(*src_pgd)) {
  18.199 +			pgd_ERROR(*src_pgd);
  18.200 +			pgd_clear(src_pgd);
  18.201 +skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
  18.202 +			if (!address || (address >= end))
  18.203 +				goto out;
  18.204 +			continue;
  18.205 +		}
  18.206 +
  18.207 +		src_pmd = pmd_offset(src_pgd, address);
  18.208 +		dst_pmd = pmd_alloc(dst, dst_pgd, address);
  18.209 +		if (!dst_pmd)
  18.210 +			goto nomem;
  18.211 +
  18.212 +		do {
  18.213 +			pte_t * src_pte, * dst_pte;
  18.214 +		
  18.215 +			/* copy_pte_range */
  18.216 +		
  18.217 +			if (pmd_none(*src_pmd))
  18.218 +				goto skip_copy_pte_range;
  18.219 +			if (pmd_bad(*src_pmd)) {
  18.220 +				pmd_ERROR(*src_pmd);
  18.221 +				pmd_clear(src_pmd);
  18.222 +skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
  18.223 +				if (address >= end)
  18.224 +					goto out;
  18.225 +				goto cont_copy_pmd_range;
  18.226 +			}
  18.227 +
  18.228 +			src_pte = pte_offset(src_pmd, address);
  18.229 +			dst_pte = pte_alloc(dst, dst_pmd, address);
  18.230 +			if (!dst_pte)
  18.231 +				goto nomem;
  18.232 +
  18.233 +			spin_lock(&src->page_table_lock);			
  18.234 +			do {
  18.235 +				pte_t pte = *src_pte;
  18.236 +				struct page *ptepage;
  18.237 +				
  18.238 +				/* copy_one_pte */
  18.239 +
  18.240 +				if (pte_none(pte))
  18.241 +					goto cont_copy_pte_range_noset;
  18.242 +				if (!pte_present(pte)) {
  18.243 +					swap_duplicate(pte_to_swp_entry(pte));
  18.244 +					goto cont_copy_pte_range;
  18.245 +				}
  18.246 +				ptepage = pte_page(pte);
  18.247 +				if ((!VALID_PAGE(ptepage)) || 
  18.248 +				    PageReserved(ptepage))
  18.249 +					goto cont_copy_pte_range;
  18.250 +
  18.251 +				/* If it's a COW mapping, write protect it both in the parent and the child */
  18.252 +				if (cow) {
  18.253 +					/* XENO modification: modified ordering here to avoid RaW hazard. */
  18.254 +					pte = *src_pte;
  18.255 +					pte = pte_wrprotect(pte);
  18.256 +					ptep_set_wrprotect(src_pte);
  18.257 +				}
  18.258 +
  18.259 +				/* If it's a shared mapping, mark it clean in the child */
  18.260 +				if (vma->vm_flags & VM_SHARED)
  18.261 +					pte = pte_mkclean(pte);
  18.262 +				pte = pte_mkold(pte);
  18.263 +				get_page(ptepage);
  18.264 +				dst->rss++;
  18.265 +
  18.266 +cont_copy_pte_range:		set_pte(dst_pte, pte);
  18.267 +cont_copy_pte_range_noset:	address += PAGE_SIZE;
  18.268 +				if (address >= end)
  18.269 +					goto out_unlock;
  18.270 +				src_pte++;
  18.271 +				dst_pte++;
  18.272 +			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
  18.273 +			spin_unlock(&src->page_table_lock);
  18.274 +		
  18.275 +cont_copy_pmd_range:	src_pmd++;
  18.276 +			dst_pmd++;
  18.277 +		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
  18.278 +	}
  18.279 +out_unlock:
  18.280 +	spin_unlock(&src->page_table_lock);
  18.281 +out:
  18.282 +	return 0;
  18.283 +nomem:
  18.284 +	return -ENOMEM;
  18.285 +}
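/*
 * On the "XENO modification" in the COW path of copy_page_range() above (a
 * reading of the change, stated cautiously): the stock 2.4 code
 * write-protects the source PTE first and then re-reads it to obtain the
 * value to install in the child.  Under Xen, ptep_set_wrprotect() only
 * queues the update for the hypervisor, so a re-read could still see the
 * old, writable PTE (the read-after-write hazard the comment refers to) and
 * the child could be handed a writable mapping of a COW page.  Reading the
 * PTE once, write-protecting the local copy, and then queueing the update
 * avoids relying on the queued write having already been applied.
 */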
  18.286 +
  18.287 +/*
  18.288 + * Return indicates whether a page was freed so caller can adjust rss
  18.289 + */
  18.290 +static inline void forget_pte(pte_t page)
  18.291 +{
  18.292 +	if (!pte_none(page)) {
  18.293 +		printk("forget_pte: old mapping existed!\n");
  18.294 +		BUG();
  18.295 +	}
  18.296 +}
  18.297 +
  18.298 +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
  18.299 +{
  18.300 +	unsigned long offset;
  18.301 +	pte_t * ptep;
  18.302 +	int freed = 0;
  18.303 +
  18.304 +	if (pmd_none(*pmd))
  18.305 +		return 0;
  18.306 +	if (pmd_bad(*pmd)) {
  18.307 +		pmd_ERROR(*pmd);
  18.308 +		pmd_clear(pmd);
  18.309 +		return 0;
  18.310 +	}
  18.311 +	ptep = pte_offset(pmd, address);
  18.312 +	offset = address & ~PMD_MASK;
  18.313 +	if (offset + size > PMD_SIZE)
  18.314 +		size = PMD_SIZE - offset;
  18.315 +	size &= PAGE_MASK;
  18.316 +	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
  18.317 +		pte_t pte = *ptep;
  18.318 +		if (pte_none(pte))
  18.319 +			continue;
  18.320 +		if (pte_present(pte)) {
  18.321 +			struct page *page = pte_page(pte);
  18.322 +			if (VALID_PAGE(page) && !PageReserved(page))
  18.323 +				freed ++;
  18.324 +			/* This will eventually call __free_pte on the pte. */
  18.325 +			tlb_remove_page(tlb, ptep, address + offset);
  18.326 +		} else {
  18.327 +			free_swap_and_cache(pte_to_swp_entry(pte));
  18.328 +			pte_clear(ptep);
  18.329 +		}
  18.330 +	}
  18.331 +
  18.332 +	return freed;
  18.333 +}
  18.334 +
  18.335 +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size)
  18.336 +{
  18.337 +	pmd_t * pmd;
  18.338 +	unsigned long end;
  18.339 +	int freed;
  18.340 +
  18.341 +	if (pgd_none(*dir))
  18.342 +		return 0;
  18.343 +	if (pgd_bad(*dir)) {
  18.344 +		pgd_ERROR(*dir);
  18.345 +		pgd_clear(dir);
  18.346 +		return 0;
  18.347 +	}
  18.348 +	pmd = pmd_offset(dir, address);
  18.349 +	end = address + size;
  18.350 +	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
  18.351 +		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
  18.352 +	freed = 0;
  18.353 +	do {
  18.354 +		freed += zap_pte_range(tlb, pmd, address, end - address);
  18.355 +		address = (address + PMD_SIZE) & PMD_MASK; 
  18.356 +		pmd++;
  18.357 +	} while (address < end);
  18.358 +	return freed;
  18.359 +}
  18.360 +
  18.361 +/*
  18.362 + * remove user pages in a given range.
  18.363 + */
  18.364 +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
  18.365 +{
  18.366 +	mmu_gather_t *tlb;
  18.367 +	pgd_t * dir;
  18.368 +	unsigned long start = address, end = address + size;
  18.369 +	int freed = 0;
  18.370 +
  18.371 +	dir = pgd_offset(mm, address);
  18.372 +
  18.373 +	/*
  18.374 +	 * This is a long-lived spinlock. That's fine.
  18.375 +	 * There's no contention, because the page table
  18.376 +	 * lock only protects against kswapd anyway, and
  18.377 +	 * even if kswapd happened to be looking at this
  18.378 +	 * process we _want_ it to get stuck.
  18.379 +	 */
  18.380 +	if (address >= end)
  18.381 +		BUG();
  18.382 +	spin_lock(&mm->page_table_lock);
  18.383 +	flush_cache_range(mm, address, end);
  18.384 +	tlb = tlb_gather_mmu(mm);
  18.385 +
  18.386 +	do {
  18.387 +		freed += zap_pmd_range(tlb, dir, address, end - address);
  18.388 +		address = (address + PGDIR_SIZE) & PGDIR_MASK;
  18.389 +		dir++;
  18.390 +	} while (address && (address < end));
  18.391 +
  18.392 +	/* this will flush any remaining tlb entries */
  18.393 +	tlb_finish_mmu(tlb, start, end);
  18.394 +
  18.395 +	/*
  18.396 +	 * Update rss for the mm_struct (not necessarily current->mm)
  18.397 +	 * Notice that rss is an unsigned long.
  18.398 +	 */
  18.399 +	if (mm->rss > freed)
  18.400 +		mm->rss -= freed;
  18.401 +	else
  18.402 +		mm->rss = 0;
  18.403 +	spin_unlock(&mm->page_table_lock);
  18.404 +}
  18.405 +
  18.406 +
  18.407 +/*
  18.408 + * Do a quick page-table lookup for a single page. 
  18.409 + */
  18.410 +static struct page * follow_page(unsigned long address, int write) 
  18.411 +{
  18.412 +	pgd_t *pgd;
  18.413 +	pmd_t *pmd;
  18.414 +	pte_t *ptep, pte;
  18.415 +
  18.416 +	pgd = pgd_offset(current->mm, address);
  18.417 +	if (pgd_none(*pgd) || pgd_bad(*pgd))
  18.418 +		goto out;
  18.419 +
  18.420 +	pmd = pmd_offset(pgd, address);
  18.421 +	if (pmd_none(*pmd) || pmd_bad(*pmd))
  18.422 +		goto out;
  18.423 +
  18.424 +	ptep = pte_offset(pmd, address);
  18.425 +	if (!ptep)
  18.426 +		goto out;
  18.427 +
  18.428 +	pte = *ptep;
  18.429 +	if (pte_present(pte)) {
  18.430 +		if (!write ||
  18.431 +		    (pte_write(pte) && pte_dirty(pte)))
  18.432 +			return pte_page(pte);
  18.433 +	}
  18.434 +
  18.435 +out:
  18.436 +	return 0;
  18.437 +}
  18.438 +
  18.439 +/* 
  18.440 + * Given a physical address, is there a useful struct page pointing to
  18.441 + * it?  This may become more complex in the future if we start dealing
  18.442 + * with IO-aperture pages in kiobufs.
  18.443 + */
  18.444 +
  18.445 +static inline struct page * get_page_map(struct page *page)
  18.446 +{
  18.447 +	if (!VALID_PAGE(page))
  18.448 +		return 0;
  18.449 +	return page;
  18.450 +}
  18.451 +
  18.452 +/*
  18.453 + * Force in an entire range of pages from the current process's user VA,
  18.454 + * and pin them in physical memory.  
  18.455 + */
  18.456 +
  18.457 +#define dprintk(x...)
  18.458 +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
  18.459 +{
  18.460 +	unsigned long		ptr, end;
  18.461 +	int			err;
  18.462 +	struct mm_struct *	mm;
  18.463 +	struct vm_area_struct *	vma = 0;
  18.464 +	struct page *		map;
  18.465 +	int			i;
  18.466 +	int			datain = (rw == READ);
  18.467 +	
  18.468 +	/* Make sure the iobuf is not already mapped somewhere. */
  18.469 +	if (iobuf->nr_pages)
  18.470 +		return -EINVAL;
  18.471 +
  18.472 +	mm = current->mm;
  18.473 +	dprintk ("map_user_kiobuf: begin\n");
  18.474 +	
  18.475 +	ptr = va & PAGE_MASK;
  18.476 +	end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
  18.477 +	err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
  18.478 +	if (err)
  18.479 +		return err;
  18.480 +
  18.481 +	down_read(&mm->mmap_sem);
  18.482 +
  18.483 +	err = -EFAULT;
  18.484 +	iobuf->locked = 0;
  18.485 +	iobuf->offset = va & ~PAGE_MASK;
  18.486 +	iobuf->length = len;
  18.487 +	
  18.488 +	i = 0;
  18.489 +	
  18.490 +	/* 
  18.491 +	 * First of all, try to fault in all of the necessary pages
  18.492 +	 */
  18.493 +	while (ptr < end) {
  18.494 +		if (!vma || ptr >= vma->vm_end) {
  18.495 +			vma = find_vma(current->mm, ptr);
  18.496 +			if (!vma) 
  18.497 +				goto out_unlock;
  18.498 +			if (vma->vm_start > ptr) {
  18.499 +				if (!(vma->vm_flags & VM_GROWSDOWN))
  18.500 +					goto out_unlock;
  18.501 +				if (expand_stack(vma, ptr))
  18.502 +					goto out_unlock;
  18.503 +			}
  18.504 +			if (((datain) && (!(vma->vm_flags & VM_WRITE))) ||
  18.505 +					(!(vma->vm_flags & VM_READ))) {
  18.506 +				err = -EACCES;
  18.507 +				goto out_unlock;
  18.508 +			}
  18.509 +		}
  18.510 +		spin_lock(&mm->page_table_lock);
  18.511 +		while (!(map = follow_page(ptr, datain))) {
  18.512 +			int ret;
  18.513 +
  18.514 +			spin_unlock(&mm->page_table_lock);
  18.515 +			ret = handle_mm_fault(current->mm, vma, ptr, datain);
  18.516 +			if (ret <= 0) {
  18.517 +				if (!ret)
  18.518 +					goto out_unlock;
  18.519 +				else {
  18.520 +					err = -ENOMEM;
  18.521 +					goto out_unlock;
  18.522 +				}
  18.523 +			}
  18.524 +			spin_lock(&mm->page_table_lock);
  18.525 +		}			
  18.526 +		map = get_page_map(map);
  18.527 +		if (map) {
  18.528 +			flush_dcache_page(map);
  18.529 +			page_cache_get(map);
  18.530 +		} else
  18.531 +			printk (KERN_INFO "Mapped page missing [%d]\n", i);
  18.532 +		spin_unlock(&mm->page_table_lock);
  18.533 +		iobuf->maplist[i] = map;
  18.534 +		iobuf->nr_pages = ++i;
  18.535 +		
  18.536 +		ptr += PAGE_SIZE;
  18.537 +	}
  18.538 +
  18.539 +	up_read(&mm->mmap_sem);
  18.540 +	dprintk ("map_user_kiobuf: end OK\n");
  18.541 +	return 0;
  18.542 +
  18.543 + out_unlock:
  18.544 +	up_read(&mm->mmap_sem);
  18.545 +	unmap_kiobuf(iobuf);
  18.546 +	dprintk ("map_user_kiobuf: end %d\n", err);
  18.547 +	return err;
  18.548 +}
  18.549 +
  18.550 +/*
  18.551 + * Mark all of the pages in a kiobuf as dirty 
  18.552 + *
  18.553 + * We need to be able to deal with short reads from disk: if an IO error
  18.554 + * occurs, the number of bytes read into memory may be less than the
  18.555 + * size of the kiobuf, so we have to stop marking pages dirty once the
  18.556 + * requested byte count has been reached.
  18.557 + */
  18.558 +
  18.559 +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
  18.560 +{
  18.561 +	int index, offset, remaining;
  18.562 +	struct page *page;
  18.563 +	
  18.564 +	index = iobuf->offset >> PAGE_SHIFT;
  18.565 +	offset = iobuf->offset & ~PAGE_MASK;
  18.566 +	remaining = bytes;
  18.567 +	if (remaining > iobuf->length)
  18.568 +		remaining = iobuf->length;
  18.569 +	
  18.570 +	while (remaining > 0 && index < iobuf->nr_pages) {
  18.571 +		page = iobuf->maplist[index];
  18.572 +		
  18.573 +		if (!PageReserved(page))
  18.574 +			SetPageDirty(page);
  18.575 +
  18.576 +		remaining -= (PAGE_SIZE - offset);
  18.577 +		offset = 0;
  18.578 +		index++;
  18.579 +	}
  18.580 +}
  18.581 +
  18.582 +/*
  18.583 + * Unmap all of the pages referenced by a kiobuf.  We release the pages,
  18.584 + * and unlock them if they were locked. 
  18.585 + */
  18.586 +
  18.587 +void unmap_kiobuf (struct kiobuf *iobuf) 
  18.588 +{
  18.589 +	int i;
  18.590 +	struct page *map;
  18.591 +	
  18.592 +	for (i = 0; i < iobuf->nr_pages; i++) {
  18.593 +		map = iobuf->maplist[i];
  18.594 +		if (map) {
  18.595 +			if (iobuf->locked)
  18.596 +				UnlockPage(map);
  18.597 +			page_cache_release(map);
  18.598 +		}
  18.599 +	}
  18.600 +	
  18.601 +	iobuf->nr_pages = 0;
  18.602 +	iobuf->locked = 0;
  18.603 +}
  18.604 +
  18.605 +
  18.606 +/*
  18.607 + * Lock down all of the pages of a kiovec for IO.
  18.608 + *
  18.609 + * If any page is mapped twice in the kiovec, we return the error -EINVAL.
  18.610 + *
  18.611 + * The optional wait parameter causes the lock call to block until all
  18.612 + * pages can be locked if set.  If wait==0, the lock operation is
  18.613 + * aborted if any locked pages are found and -EAGAIN is returned.
  18.614 + */
  18.615 +
  18.616 +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
  18.617 +{
  18.618 +	struct kiobuf *iobuf;
  18.619 +	int i, j;
  18.620 +	struct page *page, **ppage;
  18.621 +	int doublepage = 0;
  18.622 +	int repeat = 0;
  18.623 +	
  18.624 + repeat:
  18.625 +	
  18.626 +	for (i = 0; i < nr; i++) {
  18.627 +		iobuf = iovec[i];
  18.628 +
  18.629 +		if (iobuf->locked)
  18.630 +			continue;
  18.631 +
  18.632 +		ppage = iobuf->maplist;
  18.633 +		for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
  18.634 +			page = *ppage;
  18.635 +			if (!page)
  18.636 +				continue;
  18.637 +			
  18.638 +			if (TryLockPage(page)) {
  18.639 +				while (j--) {
  18.640 +					struct page *tmp = *--ppage;
  18.641 +					if (tmp)
  18.642 +						UnlockPage(tmp);
  18.643 +				}
  18.644 +				goto retry;
  18.645 +			}
  18.646 +		}
  18.647 +		iobuf->locked = 1;
  18.648 +	}
  18.649 +
  18.650 +	return 0;
  18.651 +	
  18.652 + retry:
  18.653 +	
  18.654 +	/* 
  18.655 +	 * We couldn't lock one of the pages.  Undo the locking so far,
  18.656 +	 * wait on the page we got to, and try again.  
  18.657 +	 */
  18.658 +	
  18.659 +	unlock_kiovec(nr, iovec);
  18.660 +	if (!wait)
  18.661 +		return -EAGAIN;
  18.662 +	
  18.663 +	/* 
  18.664 +	 * Did the release also unlock the page we got stuck on?
  18.665 +	 */
  18.666 +	if (!PageLocked(page)) {
  18.667 +		/* 
  18.668 +		 * If so, we may well have the page mapped twice
  18.669 +		 * in the IO address range.  Bad news.  Of
  18.670 +		 * course, it _might_ just be a coincidence,
  18.671 +		 * but if it happens more than once, chances
  18.672 +		 * are we have a double-mapped page. 
  18.673 +		 */
  18.674 +		if (++doublepage >= 3) 
  18.675 +			return -EINVAL;
  18.676 +		
  18.677 +		/* Try again...  */
  18.678 +		wait_on_page(page);
  18.679 +	}
  18.680 +	
  18.681 +	if (++repeat < 16)
  18.682 +		goto repeat;
  18.683 +	return -EAGAIN;
  18.684 +}
  18.685 +
  18.686 +/*
  18.687 + * Unlock all of the pages of a kiovec after IO.
  18.688 + */
  18.689 +
  18.690 +int unlock_kiovec(int nr, struct kiobuf *iovec[])
  18.691 +{
  18.692 +	struct kiobuf *iobuf;
  18.693 +	int i, j;
  18.694 +	struct page *page, **ppage;
  18.695 +	
  18.696 +	for (i = 0; i < nr; i++) {
  18.697 +		iobuf = iovec[i];
  18.698 +
  18.699 +		if (!iobuf->locked)
  18.700 +			continue;
  18.701 +		iobuf->locked = 0;
  18.702 +		
  18.703 +		ppage = iobuf->maplist;
  18.704 +		for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
  18.705 +			page = *ppage;
  18.706 +			if (!page)
  18.707 +				continue;
  18.708 +			UnlockPage(page);
  18.709 +		}
  18.710 +	}
  18.711 +	return 0;
  18.712 +}
  18.713 +
  18.714 +static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
  18.715 +                                     unsigned long size, pgprot_t prot)
  18.716 +{
  18.717 +	unsigned long end;
  18.718 +
  18.719 +	address &= ~PMD_MASK;
  18.720 +	end = address + size;
  18.721 +	if (end > PMD_SIZE)
  18.722 +		end = PMD_SIZE;
  18.723 +	do {
  18.724 +		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
  18.725 +		pte_t oldpage = ptep_get_and_clear(pte);
  18.726 +		set_pte(pte, zero_pte);
  18.727 +		forget_pte(oldpage);
  18.728 +		address += PAGE_SIZE;
  18.729 +		pte++;
  18.730 +	} while (address && (address < end));
  18.731 +}
  18.732 +
  18.733 +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
  18.734 +                                    unsigned long size, pgprot_t prot)
  18.735 +{
  18.736 +	unsigned long end;
  18.737 +
  18.738 +	address &= ~PGDIR_MASK;
  18.739 +	end = address + size;
  18.740 +	if (end > PGDIR_SIZE)
  18.741 +		end = PGDIR_SIZE;
  18.742 +	do {
  18.743 +		pte_t * pte = pte_alloc(mm, pmd, address);
  18.744 +		if (!pte)
  18.745 +			return -ENOMEM;
  18.746 +		zeromap_pte_range(pte, address, end - address, prot);
  18.747 +		address = (address + PMD_SIZE) & PMD_MASK;
  18.748 +		pmd++;
  18.749 +	} while (address && (address < end));
  18.750 +	return 0;
  18.751 +}
  18.752 +
  18.753 +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
  18.754 +{
  18.755 +	int error = 0;
  18.756 +	pgd_t * dir;
  18.757 +	unsigned long beg = address;
  18.758 +	unsigned long end = address + size;
  18.759 +	struct mm_struct *mm = current->mm;
  18.760 +
  18.761 +	dir = pgd_offset(mm, address);
  18.762 +	flush_cache_range(mm, beg, end);
  18.763 +	if (address >= end)
  18.764 +		BUG();
  18.765 +
  18.766 +	spin_lock(&mm->page_table_lock);
  18.767 +	do {
  18.768 +		pmd_t *pmd = pmd_alloc(mm, dir, address);
  18.769 +		error = -ENOMEM;
  18.770 +		if (!pmd)
  18.771 +			break;
  18.772 +		error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
  18.773 +		if (error)
  18.774 +			break;
  18.775 +		address = (address + PGDIR_SIZE) & PGDIR_MASK;
  18.776 +		dir++;
  18.777 +	} while (address && (address < end));
  18.778 +	spin_unlock(&mm->page_table_lock);
  18.779 +	flush_tlb_range(mm, beg, end);
  18.780 +	return error;
  18.781 +}
  18.782 +
  18.783 +/*
  18.784 + * maps a range of physical memory into the requested pages. the old
   18.785 + * mappings are removed. any references to nonexistent pages result
  18.786 + * in null mappings (currently treated as "copy-on-access")
  18.787 + */
  18.788 +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
  18.789 +	unsigned long phys_addr, pgprot_t prot)
  18.790 +{
  18.791 +	unsigned long end;
  18.792 +
  18.793 +	address &= ~PMD_MASK;
  18.794 +	end = address + size;
  18.795 +	if (end > PMD_SIZE)
  18.796 +		end = PMD_SIZE;
  18.797 +	do {
  18.798 +		struct page *page;
  18.799 +		pte_t oldpage;
  18.800 +		oldpage = ptep_get_and_clear(pte);
  18.801 +
  18.802 +		page = virt_to_page(__va(phys_addr));
  18.803 +		if ((!VALID_PAGE(page)) || PageReserved(page))
  18.804 + 			set_pte(pte, mk_pte_phys(phys_addr, prot));
  18.805 +		forget_pte(oldpage);
  18.806 +		address += PAGE_SIZE;
  18.807 +		phys_addr += PAGE_SIZE;
  18.808 +		pte++;
  18.809 +	} while (address && (address < end));
  18.810 +}
  18.811 +
  18.812 +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
  18.813 +	unsigned long phys_addr, pgprot_t prot)
  18.814 +{
  18.815 +	unsigned long end;
  18.816 +
  18.817 +	address &= ~PGDIR_MASK;
  18.818 +	end = address + size;
  18.819 +	if (end > PGDIR_SIZE)
  18.820 +		end = PGDIR_SIZE;
  18.821 +	phys_addr -= address;
  18.822 +	do {
  18.823 +		pte_t * pte = pte_alloc(mm, pmd, address);
  18.824 +		if (!pte)
  18.825 +			return -ENOMEM;
  18.826 +		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
  18.827 +		address = (address + PMD_SIZE) & PMD_MASK;
  18.828 +		pmd++;
  18.829 +	} while (address && (address < end));
  18.830 +	return 0;
  18.831 +}
  18.832 +
  18.833 +/*  Note: this is only safe if the mm semaphore is held when called. */
  18.834 +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
  18.835 +{
  18.836 +	int error = 0;
  18.837 +	pgd_t * dir;
  18.838 +	unsigned long beg = from;
  18.839 +	unsigned long end = from + size;
  18.840 +	struct mm_struct *mm = current->mm;
  18.841 +
  18.842 +	phys_addr -= from;
  18.843 +	dir = pgd_offset(mm, from);
  18.844 +	flush_cache_range(mm, beg, end);
  18.845 +	if (from >= end)
  18.846 +		BUG();
  18.847 +
  18.848 +	spin_lock(&mm->page_table_lock);
  18.849 +	do {
  18.850 +		pmd_t *pmd = pmd_alloc(mm, dir, from);
  18.851 +		error = -ENOMEM;
  18.852 +		if (!pmd)
  18.853 +			break;
  18.854 +		error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
  18.855 +		if (error)
  18.856 +			break;
  18.857 +		from = (from + PGDIR_SIZE) & PGDIR_MASK;
  18.858 +		dir++;
  18.859 +	} while (from && (from < end));
  18.860 +	spin_unlock(&mm->page_table_lock);
  18.861 +	flush_tlb_range(mm, beg, end);
  18.862 +	return error;
  18.863 +}
  18.864 +
  18.865 +/*
  18.866 + * Establish a new mapping:
  18.867 + *  - flush the old one
  18.868 + *  - update the page tables
  18.869 + *  - inform the TLB about the new one
  18.870 + *
  18.871 + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  18.872 + */
  18.873 +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
  18.874 +{
  18.875 +	set_pte(page_table, entry);
  18.876 +	flush_tlb_page(vma, address);
  18.877 +	update_mmu_cache(vma, address, entry);
  18.878 +}
  18.879 +
  18.880 +/*
  18.881 + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  18.882 + */
  18.883 +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
  18.884 +		pte_t *page_table)
  18.885 +{
  18.886 +	flush_page_to_ram(new_page);
  18.887 +	flush_cache_page(vma, address);
  18.888 +	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
  18.889 +}
  18.890 +
  18.891 +/*
  18.892 + * This routine handles present pages, when users try to write
  18.893 + * to a shared page. It is done by copying the page to a new address
  18.894 + * and decrementing the shared-page counter for the old page.
  18.895 + *
  18.896 + * Goto-purists beware: the only reason for goto's here is that it results
  18.897 + * in better assembly code.. The "default" path will see no jumps at all.
  18.898 + *
  18.899 + * Note that this routine assumes that the protection checks have been
  18.900 + * done by the caller (the low-level page fault routine in most cases).
  18.901 + * Thus we can safely just mark it writable once we've done any necessary
  18.902 + * COW.
  18.903 + *
  18.904 + * We also mark the page dirty at this point even though the page will
  18.905 + * change only once the write actually happens. This avoids a few races,
  18.906 + * and potentially makes it more efficient.
  18.907 + *
  18.908 + * We hold the mm semaphore and the page_table_lock on entry and exit
  18.909 + * with the page_table_lock released.
  18.910 + */
  18.911 +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
  18.912 +	unsigned long address, pte_t *page_table, pte_t pte)
  18.913 +{
  18.914 +	struct page *old_page, *new_page;
  18.915 +
  18.916 +	old_page = pte_page(pte);
  18.917 +	if (!VALID_PAGE(old_page))
  18.918 +		goto bad_wp_page;
  18.919 +
  18.920 +	if (!TryLockPage(old_page)) {
  18.921 +		int reuse = can_share_swap_page(old_page);
  18.922 +		unlock_page(old_page);
  18.923 +		if (reuse) {
  18.924 +			flush_cache_page(vma, address);
  18.925 +			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
  18.926 +			spin_unlock(&mm->page_table_lock);
  18.927 +			return 1;	/* Minor fault */
  18.928 +		}
  18.929 +	}
  18.930 +
  18.931 +	/*
  18.932 +	 * Ok, we need to copy. Oh, well..
  18.933 +	 */
  18.934 +	page_cache_get(old_page);
  18.935 +	spin_unlock(&mm->page_table_lock);
  18.936 +
  18.937 +	new_page = alloc_page(GFP_HIGHUSER);
  18.938 +	if (!new_page)
  18.939 +		goto no_mem;
  18.940 +	copy_cow_page(old_page,new_page,address);
  18.941 +
  18.942 +	/*
  18.943 +	 * Re-check the pte - we dropped the lock
  18.944 +	 */
  18.945 +	spin_lock(&mm->page_table_lock);
  18.946 +	if (pte_same(*page_table, pte)) {
  18.947 +		if (PageReserved(old_page))
  18.948 +			++mm->rss;
  18.949 +		break_cow(vma, new_page, address, page_table);
  18.950 +		lru_cache_add(new_page);
  18.951 +
  18.952 +		/* Free the old page.. */
  18.953 +		new_page = old_page;
  18.954 +	}
  18.955 +	spin_unlock(&mm->page_table_lock);
  18.956 +	page_cache_release(new_page);
  18.957 +	page_cache_release(old_page);
  18.958 +	return 1;	/* Minor fault */
  18.959 +
  18.960 +bad_wp_page:
  18.961 +	spin_unlock(&mm->page_table_lock);
  18.962 +	printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
  18.963 +	return -1;
  18.964 +no_mem:
  18.965 +	page_cache_release(old_page);
  18.966 +	return -1;
  18.967 +}
  18.968 +
  18.969 +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff)
  18.970 +{
  18.971 +	do {
  18.972 +		struct mm_struct *mm = mpnt->vm_mm;
  18.973 +		unsigned long start = mpnt->vm_start;
  18.974 +		unsigned long end = mpnt->vm_end;
  18.975 +		unsigned long len = end - start;
  18.976 +		unsigned long diff;
  18.977 +
  18.978 +		/* mapping wholly truncated? */
  18.979 +		if (mpnt->vm_pgoff >= pgoff) {
  18.980 +			zap_page_range(mm, start, len);
  18.981 +			continue;
  18.982 +		}
  18.983 +
  18.984 +		/* mapping wholly unaffected? */
  18.985 +		len = len >> PAGE_SHIFT;
  18.986 +		diff = pgoff - mpnt->vm_pgoff;
  18.987 +		if (diff >= len)
  18.988 +			continue;
  18.989 +
  18.990 +		/* Ok, partially affected.. */
  18.991 +		start += diff << PAGE_SHIFT;
  18.992 +		len = (len - diff) << PAGE_SHIFT;
  18.993 +		zap_page_range(mm, start, len);
  18.994 +	} while ((mpnt = mpnt->vm_next_share) != NULL);
  18.995 +}
  18.996 +
  18.997 +/*
  18.998 + * Handle all mappings that got truncated by a "truncate()"
  18.999 + * system call.
 18.1000 + *
 18.1001 + * NOTE! We have to be ready to update the memory sharing
 18.1002 + * between the file and the memory map for a potential last
 18.1003 + * incomplete page.  Ugly, but necessary.
 18.1004 + */
 18.1005 +int vmtruncate(struct inode * inode, loff_t offset)
 18.1006 +{
 18.1007 +	unsigned long pgoff;
 18.1008 +	struct address_space *mapping = inode->i_mapping;
 18.1009 +	unsigned long limit;
 18.1010 +
 18.1011 +	if (inode->i_size < offset)
 18.1012 +		goto do_expand;
 18.1013 +	inode->i_size = offset;
 18.1014 +	spin_lock(&mapping->i_shared_lock);
 18.1015 +	if (!mapping->i_mmap && !mapping->i_mmap_shared)
 18.1016 +		goto out_unlock;
 18.1017 +
 18.1018 +	pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 18.1019 +	if (mapping->i_mmap != NULL)
 18.1020 +		vmtruncate_list(mapping->i_mmap, pgoff);
 18.1021 +	if (mapping->i_mmap_shared != NULL)
 18.1022 +		vmtruncate_list(mapping->i_mmap_shared, pgoff);
 18.1023 +
 18.1024 +out_unlock:
 18.1025 +	spin_unlock(&mapping->i_shared_lock);
 18.1026 +	truncate_inode_pages(mapping, offset);
 18.1027 +	goto out_truncate;
 18.1028 +
 18.1029 +do_expand:
 18.1030 +	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 18.1031 +	if (limit != RLIM_INFINITY) {
 18.1032 +		if (inode->i_size >= limit) {
 18.1033 +			send_sig(SIGXFSZ, current, 0);
 18.1034 +			goto out;
 18.1035 +		}
 18.1036 +		if (offset > limit) {
 18.1037 +			send_sig(SIGXFSZ, current, 0);
 18.1038 +			offset = limit;
 18.1039 +		}
 18.1040 +	}
 18.1041 +	inode->i_size = offset;
 18.1042 +
 18.1043 +out_truncate:
 18.1044 +	if (inode->i_op && inode->i_op->truncate) {
 18.1045 +		lock_kernel();
 18.1046 +		inode->i_op->truncate(inode);
 18.1047 +		unlock_kernel();
 18.1048 +	}
 18.1049 +out:
 18.1050 +	return 0;
 18.1051 +}
 18.1052 +
 18.1053 +/* 
 18.1054 + * Primitive swap readahead code. We simply read an aligned block of
 18.1055 + * (1 << page_cluster) entries in the swap area. This method is chosen
 18.1056 + * because it doesn't cost us any seek time.  We also make sure to queue
 18.1057 + * the 'original' request together with the readahead ones...  
 18.1058 + */
 18.1059 +void swapin_readahead(swp_entry_t entry)
 18.1060 +{
 18.1061 +	int i, num;
 18.1062 +	struct page *new_page;
 18.1063 +	unsigned long offset;
 18.1064 +
 18.1065 +	/*
 18.1066 +	 * Get the number of swap handles we should do readahead I/O for.
 18.1067 +	 */
 18.1068 +	num = valid_swaphandles(entry, &offset);
 18.1069 +	for (i = 0; i < num; offset++, i++) {
 18.1070 +		/* Ok, do the async read-ahead now */
 18.1071 +		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
 18.1072 +		if (!new_page)
 18.1073 +			break;
 18.1074 +		page_cache_release(new_page);
 18.1075 +	}
 18.1076 +	return;
 18.1077 +}
 18.1078 +
 18.1079 +/*
 18.1080 + * We hold the mm semaphore and the page_table_lock on entry and
 18.1081 + * should release the pagetable lock on exit..
 18.1082 + */
 18.1083 +static int do_swap_page(struct mm_struct * mm,
 18.1084 +	struct vm_area_struct * vma, unsigned long address,
 18.1085 +	pte_t * page_table, pte_t orig_pte, int write_access)
 18.1086 +{
 18.1087 +	struct page *page;
 18.1088 +	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 18.1089 +	pte_t pte;
 18.1090 +	int ret = 1;
 18.1091 +
 18.1092 +	spin_unlock(&mm->page_table_lock);
 18.1093 +	page = lookup_swap_cache(entry);
 18.1094 +	if (!page) {
 18.1095 +		swapin_readahead(entry);
 18.1096 +		page = read_swap_cache_async(entry);
 18.1097 +		if (!page) {
 18.1098 +			/*
 18.1099 +			 * Back out if somebody else faulted in this pte while
 18.1100 +			 * we released the page table lock.
 18.1101 +			 */
 18.1102 +			int retval;
 18.1103 +			spin_lock(&mm->page_table_lock);
 18.1104 +			retval = pte_same(*page_table, orig_pte) ? -1 : 1;
 18.1105 +			spin_unlock(&mm->page_table_lock);
 18.1106 +			return retval;
 18.1107 +		}
 18.1108 +
 18.1109 +		/* Had to read the page from swap area: Major fault */
 18.1110 +		ret = 2;
 18.1111 +	}
 18.1112 +
 18.1113 +	lock_page(page);
 18.1114 +
 18.1115 +	/*
 18.1116 +	 * Back out if somebody else faulted in this pte while we
 18.1117 +	 * released the page table lock.
 18.1118 +	 */
 18.1119 +	spin_lock(&mm->page_table_lock);
 18.1120 +	if (!pte_same(*page_table, orig_pte)) {
 18.1121 +		spin_unlock(&mm->page_table_lock);
 18.1122 +		unlock_page(page);
 18.1123 +		page_cache_release(page);
 18.1124 +		return 1;
 18.1125 +	}
 18.1126 +
 18.1127 +	/* The page isn't present yet, go ahead with the fault. */
 18.1128 +		
 18.1129 +	swap_free(entry);
 18.1130 +	if (vm_swap_full())
 18.1131 +		remove_exclusive_swap_page(page);
 18.1132 +
 18.1133 +	mm->rss++;
 18.1134 +	pte = mk_pte(page, vma->vm_page_prot);
 18.1135 +	if (write_access && can_share_swap_page(page))
 18.1136 +		pte = pte_mkdirty(pte_mkwrite(pte));
 18.1137 +	unlock_page(page);
 18.1138 +
 18.1139 +	flush_page_to_ram(page);
 18.1140 +	flush_icache_page(vma, page);
 18.1141 +	set_pte(page_table, pte);
 18.1142 +
 18.1143 +	/* No need to invalidate - it was non-present before */
 18.1144 +	update_mmu_cache(vma, address, pte);
 18.1145 +	XENO_flush_page_update_queue();
 18.1146 +	spin_unlock(&mm->page_table_lock);
 18.1147 +	return ret;
 18.1148 +}
 18.1149 +
 18.1150 +/*
 18.1151 + * We are called with the MM semaphore and page_table_lock
 18.1152 + * spinlock held to protect against concurrent faults in
 18.1153 + * multithreaded programs. 
 18.1154 + */
 18.1155 +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 18.1156 +{
 18.1157 +	pte_t entry;
 18.1158 +
 18.1159 +	/* Read-only mapping of ZERO_PAGE. */
 18.1160 +	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 18.1161 +
 18.1162 +	/* ..except if it's a write access */
 18.1163 +	if (write_access) {
 18.1164 +		struct page *page;
 18.1165 +
 18.1166 +		/* Allocate our own private page. */
 18.1167 +		spin_unlock(&mm->page_table_lock);
 18.1168 +
 18.1169 +		page = alloc_page(GFP_HIGHUSER);
 18.1170 +		if (!page)
 18.1171 +			goto no_mem;
 18.1172 +		clear_user_highpage(page, addr);
 18.1173 +
 18.1174 +		spin_lock(&mm->page_table_lock);
 18.1175 +		if (!pte_none(*page_table)) {
 18.1176 +			page_cache_release(page);
 18.1177 +			spin_unlock(&mm->page_table_lock);
 18.1178 +			return 1;
 18.1179 +		}
 18.1180 +		mm->rss++;
 18.1181 +		flush_page_to_ram(page);
 18.1182 +		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 18.1183 +		lru_cache_add(page);
 18.1184 +	}
 18.1185 +
 18.1186 +	set_pte(page_table, entry);
 18.1187 +
 18.1188 +	/* No need to invalidate - it was non-present before */
 18.1189 +	update_mmu_cache(vma, addr, entry);
 18.1190 +	XENO_flush_page_update_queue();
 18.1191 +	spin_unlock(&mm->page_table_lock);
 18.1192 +	return 1;	/* Minor fault */
 18.1193 +
 18.1194 +no_mem:
 18.1195 +	return -1;
 18.1196 +}
 18.1197 +
 18.1198 +/*
 18.1199 + * do_no_page() tries to create a new page mapping. It aggressively
 18.1200 + * tries to share with existing pages, but makes a separate copy if
 18.1201 + * the "write_access" parameter is true in order to avoid the next
 18.1202 + * page fault.
 18.1203 + *
 18.1204 + * As this is called only for pages that do not currently exist, we
 18.1205 + * do not need to flush old virtual caches or the TLB.
 18.1206 + *
 18.1207 + * This is called with the MM semaphore held and the page table
 18.1208 + * spinlock held. Exit with the spinlock released.
 18.1209 + */
 18.1210 +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 18.1211 +	unsigned long address, int write_access, pte_t *page_table)
 18.1212 +{
 18.1213 +	struct page * new_page;
 18.1214 +	pte_t entry;
 18.1215 +
 18.1216 +	if (!vma->vm_ops || !vma->vm_ops->nopage)
 18.1217 +		return do_anonymous_page(mm, vma, page_table, write_access, address);
 18.1218 +	spin_unlock(&mm->page_table_lock);
 18.1219 +
 18.1220 +	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 18.1221 +
 18.1222 +	if (new_page == NULL)	/* no page was available -- SIGBUS */
 18.1223 +		return 0;
 18.1224 +	if (new_page == NOPAGE_OOM)
 18.1225 +		return -1;
 18.1226 +
 18.1227 +	/*
 18.1228 +	 * Should we do an early C-O-W break?
 18.1229 +	 */
 18.1230 +	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 18.1231 +		struct page * page = alloc_page(GFP_HIGHUSER);
 18.1232 +		if (!page)
 18.1233 +			return -1;
 18.1234 +		copy_highpage(page, new_page);
 18.1235 +		page_cache_release(new_page);
 18.1236 +		lru_cache_add(page);
 18.1237 +		new_page = page;
 18.1238 +	}
 18.1239 +
 18.1240 +	spin_lock(&mm->page_table_lock);
 18.1241 +	/*
 18.1242 +	 * This silly early PAGE_DIRTY setting removes a race
 18.1243 +	 * due to the bad i386 page protection. But it's valid
 18.1244 +	 * for other architectures too.
 18.1245 +	 *
 18.1246 +	 * Note that if write_access is true, we either now have
 18.1247 +	 * an exclusive copy of the page, or this is a shared mapping,
 18.1248 +	 * so we can make it writable and dirty to avoid having to
 18.1249 +	 * handle that later.
 18.1250 +	 */
 18.1251 +	/* Only go through if we didn't race with anybody else... */
 18.1252 +	if (pte_none(*page_table)) {
 18.1253 +		++mm->rss;
 18.1254 +		flush_page_to_ram(new_page);
 18.1255 +		flush_icache_page(vma, new_page);
 18.1256 +		entry = mk_pte(new_page, vma->vm_page_prot);
 18.1257 +		if (write_access)
 18.1258 +			entry = pte_mkwrite(pte_mkdirty(entry));
 18.1259 +		set_pte(page_table, entry);
 18.1260 +	} else {
 18.1261 +		/* One of our sibling threads was faster, back out. */
 18.1262 +		page_cache_release(new_page);
 18.1263 +		spin_unlock(&mm->page_table_lock);
 18.1264 +		return 1;
 18.1265 +	}
 18.1266 +
 18.1267 +	/* no need to invalidate: a not-present page shouldn't be cached */
 18.1268 +	update_mmu_cache(vma, address, entry);
 18.1269 +	XENO_flush_page_update_queue();
 18.1270 +	spin_unlock(&mm->page_table_lock);
 18.1271 +	return 2;	/* Major fault */
 18.1272 +}
 18.1273 +
 18.1274 +/*
 18.1275 + * These routines also need to handle stuff like marking pages dirty
 18.1276 + * and/or accessed for architectures that don't do it in hardware (most
 18.1277 + * RISC architectures).  The early dirtying is also good on the i386.
 18.1278 + *
 18.1279 + * There is also a hook called "update_mmu_cache()" that architectures
 18.1280 + * with external mmu caches can use to update those (ie the Sparc or
 18.1281 + * PowerPC hashed page tables that act as extended TLBs).
 18.1282 + *
 18.1283 + * Note the "page_table_lock". It is to protect against kswapd removing
 18.1284 + * pages from under us. Note that kswapd only ever _removes_ pages, never
 18.1285 + * adds them. As such, once we have noticed that the page is not present,
 18.1286 + * we can drop the lock early.
 18.1287 + *
 18.1288 + * The adding of pages is protected by the MM semaphore (which we hold),
 18.1289 + * so we don't need to worry about a page suddenly being added into
 18.1290 + * our VM.
 18.1291 + *
 18.1292 + * We enter with the pagetable spinlock held, we are supposed to
 18.1293 + * release it when done.
 18.1294 + */
 18.1295 +static inline int handle_pte_fault(struct mm_struct *mm,
 18.1296 +	struct vm_area_struct * vma, unsigned long address,
 18.1297 +	int write_access, pte_t * pte)
 18.1298 +{
 18.1299 +	pte_t entry;
 18.1300 +
 18.1301 +	entry = *pte;
 18.1302 +	if (!pte_present(entry)) {
 18.1303 +		/*
 18.1304 +		 * If it truly wasn't present, we know that kswapd
 18.1305 +		 * and the PTE updates will not touch it later. So
 18.1306 +		 * drop the lock.
 18.1307 +		 */
 18.1308 +		if (pte_none(entry))
 18.1309 +			return do_no_page(mm, vma, address, write_access, pte);
 18.1310 +		return do_swap_page(mm, vma, address, pte, entry, write_access);
 18.1311 +	}
 18.1312 +
 18.1313 +	if (write_access) {
 18.1314 +		if (!pte_write(entry))
 18.1315 +			return do_wp_page(mm, vma, address, pte, entry);
 18.1316 +
 18.1317 +		entry = pte_mkdirty(entry);
 18.1318 +	}
 18.1319 +	entry = pte_mkyoung(entry);
 18.1320 +	establish_pte(vma, address, pte, entry);
 18.1321 +	XENO_flush_page_update_queue();
 18.1322 +	spin_unlock(&mm->page_table_lock);
 18.1323 +	return 1;
 18.1324 +}
 18.1325 +
 18.1326 +/*
 18.1327 + * By the time we get here, we already hold the mm semaphore
 18.1328 + */
 18.1329 +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 18.1330 +	unsigned long address, int write_access)
 18.1331 +{
 18.1332 +	pgd_t *pgd;
 18.1333 +	pmd_t *pmd;
 18.1334 +
 18.1335 +	current->state = TASK_RUNNING;
 18.1336 +	pgd = pgd_offset(mm, address);
 18.1337 +
 18.1338 +	/*
 18.1339 +	 * We need the page table lock to synchronize with kswapd
 18.1340 +	 * and the SMP-safe atomic PTE updates.
 18.1341 +	 */
 18.1342 +	spin_lock(&mm->page_table_lock);
 18.1343 +	pmd = pmd_alloc(mm, pgd, address);
 18.1344 +
 18.1345 +	if (pmd) {
 18.1346 +		pte_t * pte = pte_alloc(mm, pmd, address);
 18.1347 +		if (pte)
 18.1348 +			return handle_pte_fault(mm, vma, address, write_access, pte);
 18.1349 +	}
 18.1350 +	spin_unlock(&mm->page_table_lock);
 18.1351 +	return -1;
 18.1352 +}
 18.1353 +
 18.1354 +/*
 18.1355 + * Allocate page middle directory.
 18.1356 + *
 18.1357 + * We've already handled the fast-path in-line, and we own the
 18.1358 + * page table lock.
 18.1359 + *
 18.1360 + * On a two-level page table, this ends up actually being entirely
 18.1361 + * optimized away.
 18.1362 + */
 18.1363 +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 18.1364 +{
 18.1365 +	pmd_t *new;
 18.1366 +
 18.1367 +	/* "fast" allocation can happen without dropping the lock.. */
 18.1368 +	new = pmd_alloc_one_fast(mm, address);
 18.1369 +	if (!new) {
 18.1370 +		spin_unlock(&mm->page_table_lock);
 18.1371 +		new = pmd_alloc_one(mm, address);
 18.1372 +		spin_lock(&mm->page_table_lock);
 18.1373 +		if (!new)
 18.1374 +			return NULL;
 18.1375 +
 18.1376 +		/*
 18.1377 +		 * Because we dropped the lock, we should re-check the
 18.1378 +		 * entry, as somebody else could have populated it..
 18.1379 +		 */
 18.1380 +		if (!pgd_none(*pgd)) {
 18.1381 +			pmd_free(new);
 18.1382 +			goto out;
 18.1383 +		}
 18.1384 +	}
 18.1385 +	pgd_populate(mm, pgd, new);
 18.1386 +out:
 18.1387 +	return pmd_offset(pgd, address);
 18.1388 +}
 18.1389 +
 18.1390 +/*
 18.1391 + * Allocate the page table directory.
 18.1392 + *
 18.1393 + * We've already handled the fast-path in-line, and we own the
 18.1394 + * page table lock.
 18.1395 + */
 18.1396 +pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 18.1397 +{
 18.1398 +	if (pmd_none(*pmd)) {
 18.1399 +		pte_t *new;
 18.1400 +
 18.1401 +		/* "fast" allocation can happen without dropping the lock.. */
 18.1402 +		new = pte_alloc_one_fast(mm, address);
 18.1403 +		if (!new) {
 18.1404 +			XENO_flush_page_update_queue();
 18.1405 +			spin_unlock(&mm->page_table_lock);
 18.1406 +			new = pte_alloc_one(mm, address);
 18.1407 +			spin_lock(&mm->page_table_lock);
 18.1408 +			if (!new)
 18.1409 +				return NULL;
 18.1410 +
 18.1411 +			/*
 18.1412 +			 * Because we dropped the lock, we should re-check the
 18.1413 +			 * entry, as somebody else could have populated it..
 18.1414 +			 */
 18.1415 +			if (!pmd_none(*pmd)) {
 18.1416 +				pte_free(new);
 18.1417 +				goto out;
 18.1418 +			}
 18.1419 +		}
 18.1420 +		pmd_populate(mm, pmd, new);
 18.1421 +	}
 18.1422 +out:
 18.1423 +	return pte_offset(pmd, address);
 18.1424 +}
 18.1425 +
 18.1426 +/*
 18.1427 + * Simplistic page force-in..
 18.1428 + */
 18.1429 +int make_pages_present(unsigned long addr, unsigned long end)
 18.1430 +{
 18.1431 +	int write;
 18.1432 +	struct mm_struct *mm = current->mm;
 18.1433 +	struct vm_area_struct * vma;
 18.1434 +
 18.1435 +	vma = find_vma(mm, addr);
 18.1436 +	write = (vma->vm_flags & VM_WRITE) != 0;
 18.1437 +	if (addr >= end)
 18.1438 +		BUG();
 18.1439 +	do {
 18.1440 +		if (handle_mm_fault(mm, vma, addr, write) < 0)
 18.1441 +			return -1;
 18.1442 +		addr += PAGE_SIZE;
 18.1443 +	} while (addr < end);
 18.1444 +	return 0;
 18.1445 +}
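
The common thread in the memory.c hunk above is that every path which writes a PTE does so inside mm->page_table_lock and calls XENO_flush_page_update_queue() before releasing it, so any page-table updates queued for the hypervisor are issued while the lock is still held. A minimal sketch of that convention, for reference only (do_example_pte_update() is a hypothetical name, and it assumes that set_pte() queues the update for the hypervisor rather than applying it directly, as the surrounding calls suggest):

static void do_example_pte_update(struct mm_struct *mm, pte_t *ptep, pte_t entry)
{
	spin_lock(&mm->page_table_lock);
	set_pte(ptep, entry);            /* assumed to queue a hypervisor page-table update */
	XENO_flush_page_update_queue();  /* drain the queue before dropping the lock */
	spin_unlock(&mm->page_table_lock);
}
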
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/xenolinux-2.4.16-sparse/mm/mremap.c	Fri Jan 03 18:24:03 2003 +0000
    19.3 @@ -0,0 +1,354 @@
    19.4 +/*
    19.5 + *	linux/mm/remap.c
    19.6 + *
    19.7 + *	(C) Copyright 1996 Linus Torvalds
    19.8 + */
    19.9 +
   19.10 +#include <linux/slab.h>
   19.11 +#include <linux/smp_lock.h>
   19.12 +#include <linux/shm.h>
   19.13 +#include <linux/mman.h>
   19.14 +#include <linux/swap.h>
   19.15 +
   19.16 +#include <asm/uaccess.h>
   19.17 +#include <asm/pgalloc.h>
   19.18 +
   19.19 +extern int vm_enough_memory(long pages);
   19.20 +
   19.21 +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
   19.22 +{
   19.23 +	pgd_t * pgd;
   19.24 +	pmd_t * pmd;
   19.25 +	pte_t * pte = NULL;
   19.26 +
   19.27 +	pgd = pgd_offset(mm, addr);
   19.28 +	if (pgd_none(*pgd))
   19.29 +		goto end;
   19.30 +	if (pgd_bad(*pgd)) {
   19.31 +		pgd_ERROR(*pgd);
   19.32 +		pgd_clear(pgd);
   19.33 +		goto end;
   19.34 +	}
   19.35 +
   19.36 +	pmd = pmd_offset(pgd, addr);
   19.37 +	if (pmd_none(*pmd))
   19.38 +		goto end;
   19.39 +	if (pmd_bad(*pmd)) {
   19.40 +		pmd_ERROR(*pmd);
   19.41 +		pmd_clear(pmd);
   19.42 +		goto end;
   19.43 +	}
   19.44 +
   19.45 +	pte = pte_offset(pmd, addr);
   19.46 +	if (pte_none(*pte))
   19.47 +		pte = NULL;
   19.48 +end:
   19.49 +	return pte;
   19.50 +}
   19.51 +
   19.52 +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
   19.53 +{
   19.54 +	pmd_t * pmd;
   19.55 +	pte_t * pte = NULL;
   19.56 +
   19.57 +	pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr);
   19.58 +	if (pmd)
   19.59 +		pte = pte_alloc(mm, pmd, addr);
   19.60 +	return pte;
   19.61 +}
   19.62 +
   19.63 +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst)
   19.64 +{
   19.65 +	int error = 0;
   19.66 +	pte_t pte;
   19.67 +
   19.68 +	if (!pte_none(*src)) {
   19.69 +		pte = ptep_get_and_clear(src);
   19.70 +		if (!dst) {
   19.71 +			/* No dest?  We must put it back. */
   19.72 +			dst = src;
   19.73 +			error++;
   19.74 +		}
   19.75 +		set_pte(dst, pte);
   19.76 +	}
   19.77 +	return error;
   19.78 +}
   19.79 +
   19.80 +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr)
   19.81 +{
   19.82 +	int error = 0;
   19.83 +	pte_t * src;
   19.84 +
   19.85 +	spin_lock(&mm->page_table_lock);
   19.86 +	src = get_one_pte(mm, old_addr);
   19.87 +	if (src)
   19.88 +		error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr));
   19.89 +	spin_unlock(&mm->page_table_lock);
   19.90 +	return error;
   19.91 +}
   19.92 +
   19.93 +static int move_page_tables(struct mm_struct * mm,
   19.94 +	unsigned long new_addr, unsigned long old_addr, unsigned long len)
   19.95 +{
   19.96 +	unsigned long offset = len;
   19.97 +
   19.98 +	flush_cache_range(mm, old_addr, old_addr + len);
   19.99 +
  19.100 +	/*
  19.101 +	 * This is not the clever way to do this, but we're taking the
  19.102 +	 * easy way out on the assumption that most remappings will be
  19.103 +	 * only a few pages.. This also makes error recovery easier.
  19.104 +	 */
  19.105 +	while (offset) {
  19.106 +		offset -= PAGE_SIZE;
  19.107 +		if (move_one_page(mm, old_addr + offset, new_addr + offset))
  19.108 +			goto oops_we_failed;
  19.109 +	}
  19.110 +	flush_tlb_range(mm, old_addr, old_addr + len);
  19.111 +	return 0;
  19.112 +
  19.113 +	/*
  19.114 +	 * Ok, the move failed because we didn't have enough pages for
  19.115 +	 * the new page table tree. This is unlikely, but we have to
  19.116 +	 * take the possibility into account. In that case we just move
  19.117 +	 * all the pages back (this will work, because we still have
  19.118 +	 * the old page tables)
  19.119 +	 */
  19.120 +oops_we_failed:
  19.121 +	XENO_flush_page_update_queue();
  19.122 +	flush_cache_range(mm, new_addr, new_addr + len);
  19.123 +	while ((offset += PAGE_SIZE) < len)
  19.124 +		move_one_page(mm, new_addr + offset, old_addr + offset);
  19.125 +	XENO_flush_page_update_queue();
  19.126 +	zap_page_range(mm, new_addr, len);
  19.127 +	return -1;
  19.128 +}
  19.129 +
  19.130 +static inline unsigned long move_vma(struct vm_area_struct * vma,
  19.131 +	unsigned long addr, unsigned long old_len, unsigned long new_len,
  19.132 +	unsigned long new_addr)
  19.133 +{
  19.134 +	struct mm_struct * mm = vma->vm_mm;
  19.135 +	struct vm_area_struct * new_vma, * next, * prev;
  19.136 +	int allocated_vma;
  19.137 +
  19.138 +	new_vma = NULL;
  19.139 +	next = find_vma_prev(mm, new_addr, &prev);
  19.140 +	if (next) {
  19.141 +		if (prev && prev->vm_end == new_addr &&
  19.142 +		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
  19.143 +			spin_lock(&mm->page_table_lock);
  19.144 +			prev->vm_end = new_addr + new_len;
  19.145 +			spin_unlock(&mm->page_table_lock);
  19.146 +			new_vma = prev;
  19.147 +			if (next != prev->vm_next)
  19.148 +				BUG();
  19.149 +			if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) {
  19.150 +				spin_lock(&mm->page_table_lock);
  19.151 +				prev->vm_end = next->vm_end;
  19.152 +				__vma_unlink(mm, next, prev);
  19.153 +				spin_unlock(&mm->page_table_lock);
  19.154 +
  19.155 +				mm->map_count--;
  19.156 +				kmem_cache_free(vm_area_cachep, next);
  19.157 +			}
  19.158 +		} else if (next->vm_start == new_addr + new_len &&
  19.159 +			   can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
  19.160 +			spin_lock(&mm->page_table_lock);
  19.161 +			next->vm_start = new_addr;
  19.162 +			spin_unlock(&mm->page_table_lock);
  19.163 +			new_vma = next;
  19.164 +		}
  19.165 +	} else {
  19.166 +		prev = find_vma(mm, new_addr-1);
  19.167 +		if (prev && prev->vm_end == new_addr &&
  19.168 +		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
  19.169 +			spin_lock(&mm->page_table_lock);
  19.170 +			prev->vm_end = new_addr + new_len;
  19.171 +			spin_unlock(&mm->page_table_lock);
  19.172 +			new_vma = prev;
  19.173 +		}
  19.174 +	}
  19.175 +
  19.176 +	allocated_vma = 0;
  19.177 +	if (!new_vma) {
  19.178 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  19.179 +		if (!new_vma)
  19.180 +			goto out;
  19.181 +		allocated_vma = 1;
  19.182 +	}
  19.183 +
  19.184 +	if (!move_page_tables(current->mm, new_addr, addr, old_len)) {
  19.185 +		if (allocated_vma) {
  19.186 +			*new_vma = *vma;
  19.187 +			new_vma->vm_start = new_addr;
  19.188 +			new_vma->vm_end = new_addr+new_len;
  19.189 +			new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
  19.190 +			new_vma->vm_raend = 0;
  19.191 +			if (new_vma->vm_file)
  19.192 +				get_file(new_vma->vm_file);
  19.193 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
  19.194 +				new_vma->vm_ops->open(new_vma);
  19.195 +			insert_vm_struct(current->mm, new_vma);
  19.196 +		}
  19.197 +		do_munmap(current->mm, addr, old_len);
  19.198 +		current->mm->total_vm += new_len >> PAGE_SHIFT;
  19.199 +		if (new_vma->vm_flags & VM_LOCKED) {
  19.200 +			current->mm->locked_vm += new_len >> PAGE_SHIFT;
  19.201 +			make_pages_present(new_vma->vm_start,
  19.202 +					   new_vma->vm_end);
  19.203 +		}
  19.204 +		return new_addr;
  19.205 +	}
  19.206 +	if (allocated_vma)
  19.207 +		kmem_cache_free(vm_area_cachep, new_vma);
  19.208 + out:
  19.209 +	return -ENOMEM;
  19.210 +}
  19.211 +
  19.212 +/*
  19.213 + * Expand (or shrink) an existing mapping, potentially moving it at the
  19.214 + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
  19.215 + *
  19.216 + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
  19.217 + * This option implies MREMAP_MAYMOVE.
  19.218 + */
  19.219 +unsigned long do_mremap(unsigned long addr,
  19.220 +	unsigned long old_len, unsigned long new_len,
  19.221 +	unsigned long flags, unsigned long new_addr)
  19.222 +{
  19.223 +	struct vm_area_struct *vma;
  19.224 +	unsigned long ret = -EINVAL;
  19.225 +
  19.226 +	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
  19.227 +		goto out;
  19.228 +
  19.229 +	if (addr & ~PAGE_MASK)
  19.230 +		goto out;
  19.231 +
  19.232 +	old_len = PAGE_ALIGN(old_len);
  19.233 +	new_len = PAGE_ALIGN(new_len);
  19.234 +
  19.235 +	/* new_addr is only valid if MREMAP_FIXED is specified */
  19.236 +	if (flags & MREMAP_FIXED) {
  19.237 +		if (new_addr & ~PAGE_MASK)
  19.238 +			goto out;
  19.239 +		if (!(flags & MREMAP_MAYMOVE))
  19.240 +			goto out;
  19.241 +
  19.242 +		if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
  19.243 +			goto out;
  19.244 +
  19.245 +		/* Check if the location we're moving into overlaps the
  19.246 +		 * old location at all, and fail if it does.
  19.247 +		 */
  19.248 +		if ((new_addr <= addr) && (new_addr+new_len) > addr)
  19.249 +			goto out;
  19.250 +
  19.251 +		if ((addr <= new_addr) && (addr+old_len) > new_addr)
  19.252 +			goto out;
  19.253 +
  19.254 +		do_munmap(current->mm, new_addr, new_len);
  19.255 +	}
  19.256 +
  19.257 +	/*
  19.258 +	 * Always allow a shrinking remap: that just unmaps
  19.259 +	 * the unnecessary pages..
  19.260 +	 */
  19.261 +	ret = addr;
  19.262 +	if (old_len >= new_len) {
  19.263 +		do_munmap(current->mm, addr+new_len, old_len - new_len);
  19.264 +		if (!(flags & MREMAP_FIXED) || (new_addr == addr))
  19.265 +			goto out;
  19.266 +	}
  19.267 +
  19.268 +	/*
  19.269 +	 * Ok, we need to grow..  or relocate.
  19.270 +	 */
  19.271 +	ret = -EFAULT;
  19.272 +	vma = find_vma(current->mm, addr);
  19.273 +	if (!vma || vma->vm_start > addr)
  19.274 +		goto out;
  19.275 +	/* We can't remap across vm area boundaries */
  19.276 +	if (old_len > vma->vm_end - addr)
  19.277 +		goto out;
  19.278 +	if (vma->vm_flags & VM_DONTEXPAND) {
  19.279 +		if (new_len > old_len)
  19.280 +			goto out;
  19.281 +	}
  19.282 +	if (vma->vm_flags & VM_LOCKED) {
  19.283 +		unsigned long locked = current->mm->locked_vm << PAGE_SHIFT;
  19.284 +		locked += new_len - old_len;
  19.285 +		ret = -EAGAIN;
  19.286 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
  19.287 +			goto out;
  19.288 +	}
  19.289 +	ret = -ENOMEM;
  19.290 +	if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
  19.291 +	    > current->rlim[RLIMIT_AS].rlim_cur)
  19.292 +		goto out;
  19.293 +	/* Private writable mapping? Check memory availability.. */
  19.294 +	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
  19.295 +	    !(flags & MAP_NORESERVE)				 &&
  19.296 +	    !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT))
  19.297 +		goto out;
  19.298 +
  19.299 +	/* old_len exactly to the end of the area..
  19.300 +	 * And we're not relocating the area.
  19.301 +	 */
  19.302 +	if (old_len == vma->vm_end - addr &&
  19.303 +	    !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
  19.304 +	    (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
  19.305 +		unsigned long max_addr = TASK_SIZE;
  19.306 +		if (vma->vm_next)
  19.307 +			max_addr = vma->vm_next->vm_start;
  19.308 +		/* can we just expand the current mapping? */
  19.309 +		if (max_addr - addr >= new_len) {
  19.310 +			int pages = (new_len - old_len) >> PAGE_SHIFT;
  19.311 +			spin_lock(&vma->vm_mm->page_table_lock);
  19.312 +			vma->vm_end = addr + new_len;
  19.313 +			spin_unlock(&vma->vm_mm->page_table_lock);
  19.314 +			current->mm->total_vm += pages;
  19.315 +			if (vma->vm_flags & VM_LOCKED) {
  19.316 +				current->mm->locked_vm += pages;
  19.317 +				make_pages_present(addr + old_len,
  19.318 +						   addr + new_len);
  19.319 +			}
  19.320 +			ret = addr;
  19.321 +			goto out;
  19.322 +		}
  19.323 +	}
  19.324 +
  19.325 +	/*
  19.326 +	 * We weren't able to just expand or shrink the area,
  19.327 +	 * we need to create a new one and move it..
  19.328 +	 */
  19.329 +	ret = -ENOMEM;
  19.330 +	if (flags & MREMAP_MAYMOVE) {
  19.331 +		if (!(flags & MREMAP_FIXED)) {
  19.332 +			unsigned long map_flags = 0;
  19.333 +			if (vma->vm_flags & VM_SHARED)
  19.334 +				map_flags |= MAP_SHARED;
  19.335 +
  19.336 +			new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags);
  19.337 +			ret = new_addr;
  19.338 +			if (new_addr & ~PAGE_MASK)
  19.339 +				goto out;
  19.340 +		}
  19.341 +		ret = move_vma(vma, addr, old_len, new_len, new_addr);
  19.342 +	}
  19.343 +out:
  19.344 +	return ret;
  19.345 +}
  19.346 +
  19.347 +asmlinkage unsigned long sys_mremap(unsigned long addr,
  19.348 +	unsigned long old_len, unsigned long new_len,
  19.349 +	unsigned long flags, unsigned long new_addr)
  19.350 +{
  19.351 +	unsigned long ret;
  19.352 +
  19.353 +	down_write(&current->mm->mmap_sem);
  19.354 +	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
  19.355 +	up_write(&current->mm->mmap_sem);
  19.356 +	return ret;
  19.357 +}
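
sys_mremap() above is the kernel side of the mremap(2) system call: do_mremap() shrinks in place, grows in place when the next vma leaves room, and otherwise relocates the mapping via move_vma()/move_page_tables(). A small userspace sketch that exercises the relocation path (illustrative only, not part of the changeset; it assumes a glibc environment where _GNU_SOURCE exposes mremap() and MREMAP_MAYMOVE):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 64 * 4096;

	/* Start from a small anonymous private mapping. */
	char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
	               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }
	memset(p, 0xaa, old_len);

	/* Grow the mapping; the kernel is allowed to move it, which is the
	 * case handled by move_vma() and move_page_tables() above. */
	char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) { perror("mremap"); return 1; }

	printf("old=%p new=%p contents preserved: %d\n",
	       (void *)p, (void *)q, q[0] == (char)0xaa);
	munmap(q, new_len);
	return 0;
}
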
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/xenolinux-2.4.16-sparse/mm/swapfile.c	Fri Jan 03 18:24:03 2003 +0000
    20.3 @@ -0,0 +1,1291 @@
    20.4 +/*
    20.5 + *  linux/mm/swapfile.c
    20.6 + *
    20.7 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    20.8 + *  Swap reorganised 29.12.95, Stephen Tweedie
    20.9 + */
   20.10 +
   20.11 +#include <linux/slab.h>
   20.12 +#include <linux/smp_lock.h>
   20.13 +#include <linux/kernel_stat.h>
   20.14 +#include <linux/swap.h>
   20.15 +#include <linux/swapctl.h>
   20.16 +#include <linux/blkdev.h> /* for blk_size */
   20.17 +#include <linux/vmalloc.h>
   20.18 +#include <linux/pagemap.h>
   20.19 +#include <linux/shm.h>
   20.20 +#include <linux/compiler.h>
   20.21 +
   20.22 +#include <asm/pgtable.h>
   20.23 +
   20.24 +spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
   20.25 +unsigned int nr_swapfiles;
   20.26 +int total_swap_pages;
   20.27 +static int swap_overflow;
   20.28 +
   20.29 +static const char Bad_file[] = "Bad swap file entry ";
   20.30 +static const char Unused_file[] = "Unused swap file entry ";
   20.31 +static const char Bad_offset[] = "Bad swap offset entry ";
   20.32 +static const char Unused_offset[] = "Unused swap offset entry ";
   20.33 +
   20.34 +struct swap_list_t swap_list = {-1, -1};
   20.35 +
   20.36 +struct swap_info_struct swap_info[MAX_SWAPFILES];
   20.37 +
   20.38 +#define SWAPFILE_CLUSTER 256
   20.39 +
   20.40 +static inline int scan_swap_map(struct swap_info_struct *si)
   20.41 +{
   20.42 +	unsigned long offset;
   20.43 +	/* 
   20.44 +	 * We try to cluster swap pages by allocating them
   20.45 +	 * sequentially in swap.  Once we've allocated
   20.46 +	 * SWAPFILE_CLUSTER pages this way, however, we resort to
   20.47 +	 * first-free allocation, starting a new cluster.  This
   20.48 +	 * prevents us from scattering swap pages all over the entire
   20.49 +	 * swap partition, so that we reduce overall disk seek times
   20.50 +	 * between swap pages.  -- sct */
   20.51 +	if (si->cluster_nr) {
   20.52 +		while (si->cluster_next <= si->highest_bit) {
   20.53 +			offset = si->cluster_next++;
   20.54 +			if (si->swap_map[offset])
   20.55 +				continue;
   20.56 +			si->cluster_nr--;
   20.57 +			goto got_page;
   20.58 +		}
   20.59 +	}
   20.60 +	si->cluster_nr = SWAPFILE_CLUSTER;
   20.61 +
   20.62 +	/* try to find an empty (even not aligned) cluster. */
   20.63 +	offset = si->lowest_bit;
   20.64 + check_next_cluster:
   20.65 +	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
   20.66 +	{
   20.67 +		int nr;
   20.68 +		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
   20.69 +			if (si->swap_map[nr])
   20.70 +			{
   20.71 +				offset = nr+1;
   20.72 +				goto check_next_cluster;
   20.73 +			}
   20.74 +		/* We found a completely empty cluster, so start
   20.75 +		 * using it.
   20.76 +		 */
   20.77 +		goto got_page;
   20.78 +	}
   20.79 +	/* No luck, so now go fine-grained as usual. -Andrea */
   20.80 +	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
   20.81 +		if (si->swap_map[offset])
   20.82 +			continue;
   20.83 +		si->lowest_bit = offset+1;
   20.84 +	got_page:
   20.85 +		if (offset == si->lowest_bit)
   20.86 +			si->lowest_bit++;
   20.87 +		if (offset == si->highest_bit)
   20.88 +			si->highest_bit--;
   20.89 +		if (si->lowest_bit > si->highest_bit) {
   20.90 +			si->lowest_bit = si->max;
   20.91 +			si->highest_bit = 0;
   20.92 +		}
   20.93 +		si->swap_map[offset] = 1;
   20.94 +		nr_swap_pages--;
   20.95 +		si->cluster_next = offset+1;
   20.96 +		return offset;
   20.97 +	}
   20.98 +	si->lowest_bit = si->max;
   20.99 +	si->highest_bit = 0;
  20.100 +	return 0;
  20.101 +}
  20.102 +
  20.103 +swp_entry_t get_swap_page(void)
  20.104 +{
  20.105 +	struct swap_info_struct * p;
  20.106 +	unsigned long offset;
  20.107 +	swp_entry_t entry;
  20.108 +	int type, wrapped = 0;
  20.109 +
  20.110 +	entry.val = 0;	/* Out of memory */
  20.111 +	swap_list_lock();
  20.112 +	type = swap_list.next;
  20.113 +	if (type < 0)
  20.114 +		goto out;
  20.115 +	if (nr_swap_pages <= 0)
  20.116 +		goto out;
  20.117 +
  20.118 +	while (1) {
  20.119 +		p = &swap_info[type];
  20.120 +		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  20.121 +			swap_device_lock(p);
  20.122 +			offset = scan_swap_map(p);
  20.123 +			swap_device_unlock(p);
  20.124 +			if (offset) {
  20.125 +				entry = SWP_ENTRY(type,offset);
  20.126 +				type = swap_info[type].next;
  20.127 +				if (type < 0 ||
  20.128 +					p->prio != swap_info[type].prio) {
  20.129 +						swap_list.next = swap_list.head;
  20.130 +				} else {
  20.131 +					swap_list.next = type;
  20.132 +				}
  20.133 +				goto out;
  20.134 +			}
  20.135 +		}
  20.136 +		type = p->next;
  20.137 +		if (!wrapped) {
  20.138 +			if (type < 0 || p->prio != swap_info[type].prio) {
  20.139 +				type = swap_list.head;
  20.140 +				wrapped = 1;
  20.141 +			}
  20.142 +		} else
  20.143 +			if (type < 0)
  20.144 +				goto out;	/* out of swap space */
  20.145 +	}
  20.146 +out:
  20.147 +	swap_list_unlock();
  20.148 +	return entry;
  20.149 +}
  20.150 +
  20.151 +static struct swap_info_struct * swap_info_get(swp_entry_t entry)
  20.152 +{
  20.153 +	struct swap_info_struct * p;
  20.154 +	unsigned long offset, type;
  20.155 +
  20.156 +	if (!entry.val)
  20.157 +		goto out;
  20.158 +	type = SWP_TYPE(entry);
  20.159 +	if (type >= nr_swapfiles)
  20.160 +		goto bad_nofile;
  20.161 +	p = & swap_info[type];
  20.162 +	if (!(p->flags & SWP_USED))
  20.163 +		goto bad_device;
  20.164 +	offset = SWP_OFFSET(entry);
  20.165 +	if (offset >= p->max)
  20.166 +		goto bad_offset;
  20.167 +	if (!p->swap_map[offset])
  20.168 +		goto bad_free;
  20.169 +	swap_list_lock();
  20.170 +	if (p->prio > swap_info[swap_list.next].prio)
  20.171 +		swap_list.next = type;
  20.172 +	swap_device_lock(p);
  20.173 +	return p;
  20.174 +
  20.175 +bad_free:
  20.176 +	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
  20.177 +	goto out;
  20.178 +bad_offset:
  20.179 +	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
  20.180 +	goto out;
  20.181 +bad_device:
  20.182 +	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
  20.183 +	goto out;
  20.184 +bad_nofile:
  20.185 +	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
  20.186 +out:
  20.187 +	return NULL;
  20.188 +}	
  20.189 +
  20.190 +static void swap_info_put(struct swap_info_struct * p)
  20.191 +{
  20.192 +	swap_device_unlock(p);
  20.193 +	swap_list_unlock();
  20.194 +}
  20.195 +
  20.196 +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
  20.197 +{
  20.198 +	int count = p->swap_map[offset];
  20.199 +
  20.200 +	if (count < SWAP_MAP_MAX) {
  20.201 +		count--;
  20.202 +		p->swap_map[offset] = count;
  20.203 +		if (!count) {
  20.204 +			if (offset < p->lowest_bit)
  20.205 +				p->lowest_bit = offset;
  20.206 +			if (offset > p->highest_bit)
  20.207 +				p->highest_bit = offset;
  20.208 +			nr_swap_pages++;
  20.209 +		}
  20.210 +	}
  20.211 +	return count;
  20.212 +}
  20.213 +
  20.214 +/*
  20.215 + * Caller has made sure that the swapdevice corresponding to entry
  20.216 + * is still around or has not been recycled.
  20.217 + */
  20.218 +void swap_free(swp_entry_t entry)
  20.219 +{
  20.220 +	struct swap_info_struct * p;
  20.221 +
  20.222 +	p = swap_info_get(entry);
  20.223 +	if (p) {
  20.224 +		swap_entry_free(p, SWP_OFFSET(entry));
  20.225 +		swap_info_put(p);
  20.226 +	}
  20.227 +}
  20.228 +
  20.229 +/*
  20.230 + * Check if we're the only user of a swap page,
  20.231 + * when the page is locked.
  20.232 + */
  20.233 +static int exclusive_swap_page(struct page *page)
  20.234 +{
  20.235 +	int retval = 0;
  20.236 +	struct swap_info_struct * p;
  20.237 +	swp_entry_t entry;
  20.238 +
  20.239 +	entry.val = page->index;
  20.240 +	p = swap_info_get(entry);
  20.241 +	if (p) {
  20.242 +		/* Is the only swap cache user the cache itself? */
  20.243 +		if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  20.244 +			/* Recheck the page count with the pagecache lock held.. */
  20.245 +			spin_lock(&pagecache_lock);
  20.246 +			if (page_count(page) - !!page->buffers == 2)
  20.247 +				retval = 1;
  20.248 +			spin_unlock(&pagecache_lock);
  20.249 +		}
  20.250 +		swap_info_put(p);
  20.251 +	}
  20.252 +	return retval;
  20.253 +}
  20.254 +
  20.255 +/*
  20.256 + * We can use this swap cache entry directly
  20.257 + * if there are no other references to it.
  20.258 + *
  20.259 + * Here "exclusive_swap_page()" does the real
  20.260 + * work, but we opportunistically check whether
  20.261 + * we need to get all the locks first..
  20.262 + */
  20.263 +int can_share_swap_page(struct page *page)
  20.264 +{
  20.265 +	int retval = 0;
  20.266 +
  20.267 +	if (!PageLocked(page))
  20.268 +		BUG();
  20.269 +	switch (page_count(page)) {
  20.270 +	case 3:
  20.271 +		if (!page->buffers)
  20.272 +			break;
  20.273 +		/* Fallthrough */
  20.274 +	case 2:
  20.275 +		if (!PageSwapCache(page))
  20.276 +			break;
  20.277 +		retval = exclusive_swap_page(page);
  20.278 +		break;
  20.279 +	case 1:
  20.280 +		if (PageReserved(page))
  20.281 +			break;
  20.282 +		retval = 1;
  20.283 +	}
  20.284 +	return retval;
  20.285 +}
  20.286 +
  20.287 +/*
  20.288 + * Work out if there are any other processes sharing this
  20.289 + * swap cache page. Free it if you can. Return success.
  20.290 + */
  20.291 +int remove_exclusive_swap_page(struct page *page)
  20.292 +{
  20.293 +	int retval;
  20.294 +	struct swap_info_struct * p;
  20.295 +	swp_entry_t entry;
  20.296 +
  20.297 +	if (!PageLocked(page))
  20.298 +		BUG();
  20.299 +	if (!PageSwapCache(page))
  20.300 +		return 0;
  20.301 +	if (page_count(page) - !!page->buffers != 2)	/* 2: us + cache */
  20.302 +		return 0;
  20.303 +
  20.304 +	entry.val = page->index;
  20.305 +	p = swap_info_get(entry);
  20.306 +	if (!p)
  20.307 +		return 0;
  20.308 +
  20.309 +	/* Is the only swap cache user the cache itself? */
  20.310 +	retval = 0;
  20.311 +	if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  20.312 +		/* Recheck the page count with the pagecache lock held.. */
  20.313 +		spin_lock(&pagecache_lock);
  20.314 +		if (page_count(page) - !!page->buffers == 2) {
  20.315 +			__delete_from_swap_cache(page);
  20.316 +			SetPageDirty(page);
  20.317 +			retval = 1;
  20.318 +		}
  20.319 +		spin_unlock(&pagecache_lock);
  20.320 +	}
  20.321 +	swap_info_put(p);
  20.322 +
  20.323 +	if (retval) {
  20.324 +		block_flushpage(page, 0);
  20.325 +		swap_free(entry);
  20.326 +		page_cache_release(page);
  20.327 +	}
  20.328 +
  20.329 +	return retval;
  20.330 +}
  20.331 +
  20.332 +/*
  20.333 + * Free the swap entry like above, but also try to
  20.334 + * free the page cache entry if it is the last user.
  20.335 + */
  20.336 +void free_swap_and_cache(swp_entry_t entry)
  20.337 +{
  20.338 +	struct swap_info_struct * p;
  20.339 +	struct page *page = NULL;
  20.340 +
  20.341 +	p = swap_info_get(entry);
  20.342 +	if (p) {
  20.343 +		if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
  20.344 +			page = find_trylock_page(&swapper_space, entry.val);
  20.345 +		swap_info_put(p);
  20.346 +	}
  20.347 +	if (page) {
  20.348 +		page_cache_get(page);
  20.349 +		/* Only cache user (+us), or swap space full? Free it! */
  20.350 +		if (page_count(page) == 2 || vm_swap_full()) {
  20.351 +			delete_from_swap_cache(page);
  20.352 +			SetPageDirty(page);
  20.353 +		}
  20.354 +		UnlockPage(page);
  20.355 +		page_cache_release(page);
  20.356 +	}
  20.357 +}
  20.358 +
  20.359 +/*
  20.360 + * The swap entry has been read in advance, and we return 1 to indicate
  20.361 + * that the page has been used or is no longer needed.
  20.362 + *
  20.363 + * Always set the resulting pte to be nowrite (the same as COW pages
  20.364 + * after one process has exited).  We don't know just how many PTEs will
  20.365 + * share this swap entry, so be cautious and let do_wp_page work out
  20.366 + * what to do if a write is requested later.
  20.367 + */
  20.368 +/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  20.369 +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
  20.370 +	pte_t *dir, swp_entry_t entry, struct page* page)
  20.371 +{
  20.372 +	pte_t pte = *dir;
  20.373 +
  20.374 +	if (likely(pte_to_swp_entry(pte).val != entry.val))
  20.375 +		return;
  20.376 +	if (unlikely(pte_none(pte) || pte_present(pte)))
  20.377 +		return;
  20.378 +	get_page(page);
  20.379 +	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
  20.380 +	swap_free(entry);
  20.381 +	++vma->vm_mm->rss;
  20.382 +}
  20.383 +
  20.384 +/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  20.385 +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
  20.386 +	unsigned long address, unsigned long size, unsigned long offset,
  20.387 +	swp_entry_t entry, struct page* page)
  20.388 +{
  20.389 +	pte_t * pte;
  20.390 +	unsigned long end;
  20.391 +
  20.392 +	if (pmd_none(*dir))
  20.393 +		return;
  20.394 +	if (pmd_bad(*dir)) {
  20.395 +		pmd_ERROR(*dir);
  20.396 +		pmd_clear(dir);
  20.397 +		return;
  20.398 +	}
  20.399 +	pte = pte_offset(dir, address);
  20.400 +	offset += address & PMD_MASK;
  20.401 +	address &= ~PMD_MASK;
  20.402 +	end = address + size;
  20.403 +	if (end > PMD_SIZE)
  20.404 +		end = PMD_SIZE;
  20.405 +	do {
  20.406 +		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
  20.407 +		address += PAGE_SIZE;
  20.408 +		pte++;
  20.409 +	} while (address && (address < end));
  20.410 +}
  20.411 +
  20.412 +/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  20.413 +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
  20.414 +	unsigned long address, unsigned long size,
  20.415 +	swp_entry_t entry, struct page* page)
  20.416 +{
  20.417 +	pmd_t * pmd;
  20.418 +	unsigned long offset, end;
  20.419 +
  20.420 +	if (pgd_none(*dir))
  20.421 +		return;
  20.422 +	if (pgd_bad(*dir)) {
  20.423 +		pgd_ERROR(*dir);
  20.424 +		pgd_clear(dir);
  20.425 +		return;
  20.426 +	}
  20.427 +	pmd = pmd_offset(dir, address);
  20.428 +	offset = address & PGDIR_MASK;
  20.429 +	address &= ~PGDIR_MASK;
  20.430 +	end = address + size;
  20.431 +	if (end > PGDIR_SIZE)
  20.432 +		end = PGDIR_SIZE;
  20.433 +	if (address >= end)
  20.434 +		BUG();
  20.435 +	do {
  20.436 +		unuse_pmd(vma, pmd, address, end - address, offset, entry,
  20.437 +			  page);
  20.438 +		address = (address + PMD_SIZE) & PMD_MASK;
  20.439 +		pmd++;
  20.440 +	} while (address && (address < end));
  20.441 +}
  20.442 +
  20.443 +/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  20.444 +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
  20.445 +			swp_entry_t entry, struct page* page)
  20.446 +{
  20.447 +	unsigned long start = vma->vm_start, end = vma->vm_end;
  20.448 +
  20.449 +	if (start >= end)
  20.450 +		BUG();
  20.451 +	do {
  20.452 +		unuse_pgd(vma, pgdir, start, end - start, entry, page);
  20.453 +		start = (start + PGDIR_SIZE) & PGDIR_MASK;
  20.454 +		pgdir++;
  20.455 +	} while (start && (start < end));
  20.456 +}
  20.457 +
  20.458 +static void unuse_process(struct mm_struct * mm,
  20.459 +			swp_entry_t entry, struct page* page)
  20.460 +{
  20.461 +	struct vm_area_struct* vma;
  20.462 +
  20.463 +	/*
  20.464 +	 * Go through process' page directory.
  20.465 +	 */
  20.466 +	spin_lock(&mm->page_table_lock);
  20.467 +	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  20.468 +		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
  20.469 +		unuse_vma(vma, pgd, entry, page);
  20.470 +	}
  20.471 +	XENO_flush_page_update_queue();
  20.472 +	spin_unlock(&mm->page_table_lock);
  20.473 +	return;
  20.474 +}
  20.475 +
  20.476 +/*
  20.477 + * Scan swap_map from current position to next entry still in use.
  20.478 + * Recycle to start on reaching the end, returning 0 when empty.
  20.479 + */
  20.480 +static int find_next_to_unuse(struct swap_info_struct *si, int prev)
  20.481 +{
  20.482 +	int max = si->max;
  20.483 +	int i = prev;
  20.484 +	int count;
  20.485 +
  20.486 +	/*
  20.487 +	 * No need for swap_device_lock(si) here: we're just looking
  20.488 +	 * for whether an entry is in use, not modifying it; false
  20.489 +	 * hits are okay, and sys_swapoff() has already prevented new
  20.490 +	 * allocations from this area (while holding swap_list_lock()).
  20.491 +	 */
  20.492 +	for (;;) {
  20.493 +		if (++i >= max) {
  20.494 +			if (!prev) {
  20.495 +				i = 0;
  20.496 +				break;
  20.497 +			}
  20.498 +			/*
  20.499 +			 * No entries in use at top of swap_map,
  20.500 +			 * loop back to start and recheck there.
  20.501 +			 */
  20.502 +			max = prev + 1;
  20.503 +			prev = 0;
  20.504 +			i = 1;
  20.505 +		}
  20.506 +		count = si->swap_map[i];
  20.507 +		if (count && count != SWAP_MAP_BAD)
  20.508 +			break;
  20.509 +	}
  20.510 +	return i;
  20.511 +}
  20.512 +
  20.513 +/*
  20.514 + * We completely avoid races by reading each swap page in advance,
  20.515 + * and then search for the process using it.  All the necessary
  20.516 + * page table adjustments can then be made atomically.
  20.517 + */
  20.518 +static int try_to_unuse(unsigned int type)
  20.519 +{
  20.520 +	struct swap_info_struct * si = &swap_info[type];
  20.521 +	struct mm_struct *start_mm;
  20.522 +	unsigned short *swap_map;
  20.523 +	unsigned short swcount;
  20.524 +	struct page *page;
  20.525 +	swp_entry_t entry;
  20.526 +	int i = 0;
  20.527 +	int retval = 0;
  20.528 +	int reset_overflow = 0;
  20.529 +
  20.530 +	/*
  20.531 +	 * When searching mms for an entry, a good strategy is to
  20.532 +	 * start at the first mm we freed the previous entry from
  20.533 +	 * (though actually we don't notice whether we or coincidence
  20.534 +	 * freed the entry).  Initialize this start_mm with a hold.
  20.535 +	 *
  20.536 +	 * A simpler strategy would be to start at the last mm we
  20.537 +	 * freed the previous entry from; but that would take less
  20.538 +	 * advantage of mmlist ordering (now preserved by swap_out()),
  20.539 +	 * which clusters forked address spaces together, most recent
  20.540 +	 * child immediately after parent.  If we race with dup_mmap(),
  20.541 +	 * we very much want to resolve parent before child, otherwise
  20.542 +	 * we may miss some entries: using last mm would invert that.
  20.543 +	 */
  20.544 +	start_mm = &init_mm;
  20.545 +	atomic_inc(&init_mm.mm_users);
  20.546 +
  20.547 +	/*
  20.548 +	 * Keep on scanning until all entries have gone.  Usually,
  20.549 +	 * one pass through swap_map is enough, but not necessarily:
  20.550 +	 * mmput() removes mm from mmlist before exit_mmap() and its
  20.551 +	 * zap_page_range().  That's not too bad, those entries are
  20.552 +	 * on their way out, and handled faster there than here.
  20.553 +	 * do_munmap() behaves similarly, taking the range out of mm's
  20.554 +	 * vma list before zap_page_range().  But unfortunately, when
  20.555 +	 * unmapping a part of a vma, it takes the whole out first,
  20.556 +	 * then reinserts what's left after (might even reschedule if
  20.557 +	 * open() method called) - so swap entries may be invisible
  20.558 +	 * to swapoff for a while, then reappear - but that is rare.
  20.559 +	 */
  20.560 +	while ((i = find_next_to_unuse(si, i))) {
  20.561 +		/* 
  20.562 +		 * Get a page for the entry, using the existing swap
  20.563 +		 * cache page if there is one.  Otherwise, get a clean
  20.564 +		 * page and read the swap into it. 
  20.565 +		 */
  20.566 +		swap_map = &si->swap_map[i];
  20.567 +		entry = SWP_ENTRY(type, i);
  20.568 +		page = read_swap_cache_async(entry);
  20.569 +		if (!page) {
  20.570 +			/*
  20.571 +			 * Either swap_duplicate() failed because entry
  20.572 +			 * has been freed independently, and will not be
  20.573 +			 * reused since sys_swapoff() already disabled
  20.574 +			 * allocation from here, or alloc_page() failed.
  20.575 +			 */
  20.576 +			if (!*swap_map)
  20.577 +				continue;
  20.578 +			retval = -ENOMEM;
  20.579 +			break;
  20.580 +		}
  20.581 +
  20.582 +		/*
  20.583 +		 * Don't hold on to start_mm if it looks like exiting.
  20.584 +		 */
  20.585 +		if (atomic_read(&start_mm->mm_users) == 1) {
  20.586 +			mmput(start_mm);
  20.587 +			start_mm = &init_mm;
  20.588 +			atomic_inc(&init_mm.mm_users);
  20.589 +		}
  20.590 +
  20.591 +		/*
  20.592 +		 * Wait for and lock page.  When do_swap_page races with
  20.593 +		 * try_to_unuse, do_swap_page can handle the fault much
  20.594 +		 * faster than try_to_unuse can locate the entry.  This
  20.595 +		 * apparently redundant "wait_on_page" lets try_to_unuse
  20.596 +		 * defer to do_swap_page in such a case - in some tests,
  20.597 +		 * do_swap_page and try_to_unuse repeatedly compete.
  20.598 +		 */
  20.599 +		wait_on_page(page);
  20.600 +		lock_page(page);
  20.601 +
  20.602 +		/*
  20.603 +		 * Remove all references to entry, without blocking.
  20.604 +		 * Whenever we reach init_mm, there's no address space
  20.605 +		 * to search, but use it as a reminder to search shmem.
  20.606 +		 */
  20.607 +		swcount = *swap_map;
  20.608 +		if (swcount > 1) {
  20.609 +			flush_page_to_ram(page);
  20.610 +			if (start_mm == &init_mm)
  20.611 +				shmem_unuse(entry, page);
  20.612 +			else
  20.613 +				unuse_process(start_mm, entry, page);
  20.614 +		}
  20.615 +		if (*swap_map > 1) {
  20.616 +			int set_start_mm = (*swap_map >= swcount);
  20.617 +			struct list_head *p = &start_mm->mmlist;
  20.618 +			struct mm_struct *new_start_mm = start_mm;
  20.619 +			struct mm_struct *mm;
  20.620 +
  20.621 +			spin_lock(&mmlist_lock);
  20.622 +			while (*swap_map > 1 &&
  20.623 +					(p = p->next) != &start_mm->mmlist) {
  20.624 +				mm = list_entry(p, struct mm_struct, mmlist);
  20.625 +				swcount = *swap_map;
  20.626 +				if (mm == &init_mm) {
  20.627 +					set_start_mm = 1;
  20.628 +					shmem_unuse(entry, page);
  20.629 +				} else
  20.630 +					unuse_process(mm, entry, page);
  20.631 +				if (set_start_mm && *swap_map < swcount) {
  20.632 +					new_start_mm = mm;
  20.633 +					set_start_mm = 0;
  20.634 +				}
  20.635 +			}
  20.636 +			atomic_inc(&new_start_mm->mm_users);
  20.637 +			spin_unlock(&mmlist_lock);
  20.638 +			mmput(start_mm);
  20.639 +			start_mm = new_start_mm;
  20.640 +		}
  20.641 +
  20.642 +		/*
  20.643 +		 * How could swap count reach 0x7fff when the maximum
  20.644 +		 * pid is 0x7fff, and there's no way to repeat a swap
  20.645 +		 * page within an mm (except in shmem, where it's the
  20.646 +		 * shared object which takes the reference count)?
  20.647 +		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
  20.648 +		 *
  20.649 +		 * If that's wrong, then we should worry more about
  20.650 +		 * exit_mmap() and do_munmap() cases described above:
  20.651 +		 * we might be resetting SWAP_MAP_MAX too early here.
  20.652 +		 * We know "Undead"s can happen, they're okay, so don't
  20.653 +		 * report them; but do report if we reset SWAP_MAP_MAX.
  20.654 +		 */
  20.655 +		if (*swap_map == SWAP_MAP_MAX) {
  20.656 +			swap_list_lock();
  20.657 +			swap_device_lock(si);
  20.658 +			nr_swap_pages++;
  20.659 +			*swap_map = 1;
  20.660 +			swap_device_unlock(si);
  20.661 +			swap_list_unlock();
  20.662 +			reset_overflow = 1;
  20.663 +		}
  20.664 +
  20.665 +		/*
  20.666 +		 * If a reference remains (rare), we would like to leave
  20.667 +		 * the page in the swap cache; but try_to_swap_out could
  20.668 +		 * then re-duplicate the entry once we drop page lock,
  20.669 +		 * so we might loop indefinitely; also, that page could
  20.670 +		 * not be swapped out to other storage meanwhile.  So:
  20.671 +		 * delete from cache even if there's another reference,
  20.672 +		 * after ensuring that the data has been saved to disk -
  20.673 +		 * since if the reference remains (rarer), it will be
  20.674 +		 * read from disk into another page.  Splitting into two
  20.675 +		 * pages would be incorrect if swap supported "shared
  20.676 +		 * private" pages, but they are handled by tmpfs files.
  20.677 +		 * Note shmem_unuse already deleted it from the swap cache.
  20.678 +		 */
  20.679 +		swcount = *swap_map;
  20.680 +		if ((swcount > 0) != PageSwapCache(page))
  20.681 +			BUG();
  20.682 +		if ((swcount > 1) && PageDirty(page)) {
  20.683 +			rw_swap_page(WRITE, page);
  20.684 +			lock_page(page);
  20.685 +		}
  20.686 +		if (PageSwapCache(page))
  20.687 +			delete_from_swap_cache(page);
  20.688 +
  20.689 +		/*
  20.690 +		 * So that we could skip searching mms once the swap count
  20.691 +		 * went to 1, we did not mark any present ptes as dirty: we
  20.692 +		 * must mark the page dirty so try_to_swap_out preserves it.
  20.693 +		 */
  20.694 +		SetPageDirty(page);
  20.695 +		UnlockPage(page);
  20.696 +		page_cache_release(page);
  20.697 +
  20.698 +		/*
  20.699 +		 * Make sure that we aren't completely killing
  20.700 +		 * interactive performance.  Interruptible check on
  20.701 +		 * signal_pending() would be nice, but changes the spec?
  20.702 +		 */
  20.703 +		if (current->need_resched)
  20.704 +			schedule();
  20.705 +	}
  20.706 +
  20.707 +	mmput(start_mm);
  20.708 +	if (reset_overflow) {
  20.709 +		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
  20.710 +		swap_overflow = 0;
  20.711 +	}
  20.712 +	return retval;
  20.713 +}
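
The core of try_to_unuse() above is the walk over every mm in the system through the mm->mmlist chain, pinning the most promising mm via mm_users before mmlist_lock is dropped. A minimal sketch of that idiom, for illustration only (not part of this changeset), assuming the 2.4 definitions of mmlist, mmlist_lock and list_entry():

    #include <linux/sched.h>	/* struct mm_struct, mmlist_lock, init_mm */
    #include <linux/list.h>	/* list_entry() */

    /*
     * Sketch only: visit every mm reachable from start_mm via mm->mmlist.
     * mmlist_lock is held across the walk, so nothing here may block; a
     * caller that wants to keep one of the mms must bump mm_users before
     * dropping the lock, exactly as try_to_unuse() does above.
     */
    static void walk_mmlist_sketch(struct mm_struct *start_mm)
    {
    	struct list_head *p = &start_mm->mmlist;
    	struct mm_struct *mm;

    	spin_lock(&mmlist_lock);
    	while ((p = p->next) != &start_mm->mmlist) {
    		mm = list_entry(p, struct mm_struct, mmlist);
    		/* operate on mm; &init_mm is the cue to search shmem instead */
    	}
    	spin_unlock(&mmlist_lock);
    }
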
  20.714 +
  20.715 +asmlinkage long sys_swapoff(const char * specialfile)
  20.716 +{
  20.717 +	struct swap_info_struct * p = NULL;
  20.718 +	unsigned short *swap_map;
  20.719 +	struct nameidata nd;
  20.720 +	int i, type, prev;
  20.721 +	int err;
  20.722 +	
  20.723 +	if (!capable(CAP_SYS_ADMIN))
  20.724 +		return -EPERM;
  20.725 +
  20.726 +	err = user_path_walk(specialfile, &nd);
  20.727 +	if (err)
  20.728 +		goto out;
  20.729 +
  20.730 +	lock_kernel();
  20.731 +	prev = -1;
  20.732 +	swap_list_lock();
  20.733 +	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
  20.734 +		p = swap_info + type;
  20.735 +		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  20.736 +			if (p->swap_file == nd.dentry)
  20.737 +				break;
  20.738 +		}
  20.739 +		prev = type;
  20.740 +	}
  20.741 +	err = -EINVAL;
  20.742 +	if (type < 0) {
  20.743 +		swap_list_unlock();
  20.744 +		goto out_dput;
  20.745 +	}
  20.746 +
  20.747 +	if (prev < 0) {
  20.748 +		swap_list.head = p->next;
  20.749 +	} else {
  20.750 +		swap_info[prev].next = p->next;
  20.751 +	}
  20.752 +	if (type == swap_list.next) {
  20.753 +		/* just pick something that's safe... */
  20.754 +		swap_list.next = swap_list.head;
  20.755 +	}
  20.756 +	nr_swap_pages -= p->pages;
  20.757 +	total_swap_pages -= p->pages;
  20.758 +	p->flags = SWP_USED;
  20.759 +	swap_list_unlock();
  20.760 +	unlock_kernel();
  20.761 +	err = try_to_unuse(type);
  20.762 +	lock_kernel();
  20.763 +	if (err) {
  20.764 +		/* re-insert swap space back into swap_list */
  20.765 +		swap_list_lock();
  20.766 +		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
  20.767 +			if (p->prio >= swap_info[i].prio)
  20.768 +				break;
  20.769 +		p->next = i;
  20.770 +		if (prev < 0)
  20.771 +			swap_list.head = swap_list.next = p - swap_info;
  20.772 +		else
  20.773 +			swap_info[prev].next = p - swap_info;
  20.774 +		nr_swap_pages += p->pages;
  20.775 +		total_swap_pages += p->pages;
  20.776 +		p->flags = SWP_WRITEOK;
  20.777 +		swap_list_unlock();
  20.778 +		goto out_dput;
  20.779 +	}
  20.780 +	if (p->swap_device)
  20.781 +		blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
  20.782 +	path_release(&nd);
  20.783 +
  20.784 +	swap_list_lock();
  20.785 +	swap_device_lock(p);
  20.786 +	nd.mnt = p->swap_vfsmnt;
  20.787 +	nd.dentry = p->swap_file;
  20.788 +	p->swap_vfsmnt = NULL;
  20.789 +	p->swap_file = NULL;
  20.790 +	p->swap_device = 0;
  20.791 +	p->max = 0;
  20.792 +	swap_map = p->swap_map;
  20.793 +	p->swap_map = NULL;
  20.794 +	p->flags = 0;
  20.795 +	swap_device_unlock(p);
  20.796 +	swap_list_unlock();
  20.797 +	vfree(swap_map);
  20.798 +	err = 0;
  20.799 +
  20.800 +out_dput:
  20.801 +	unlock_kernel();
  20.802 +	path_release(&nd);
  20.803 +out:
  20.804 +	return err;
  20.805 +}
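
sys_swapoff() is reached from userspace through the swapoff(2) wrapper. A minimal caller, for illustration only (the device path below is hypothetical, and CAP_SYS_ADMIN is required just as the check above enforces):

    /* Userspace sketch: disable one swap area by path. */
    #include <stdio.h>
    #include <sys/swap.h>

    int main(void)
    {
    	if (swapoff("/dev/hda2") != 0) {	/* illustrative path */
    		perror("swapoff");
    		return 1;
    	}
    	return 0;
    }
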
  20.806 +
  20.807 +int get_swaparea_info(char *buf)
  20.808 +{
  20.809 +	char * page = (char *) __get_free_page(GFP_KERNEL);
  20.810 +	struct swap_info_struct *ptr = swap_info;
  20.811 +	int i, j, len = 0, usedswap;
  20.812 +
  20.813 +	if (!page)
  20.814 +		return -ENOMEM;
  20.815 +
  20.816 +	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
  20.817 +	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  20.818 +		if ((ptr->flags & SWP_USED) && ptr->swap_map) {
  20.819 +			char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
  20.820 +						page, PAGE_SIZE);
  20.821 +
  20.822 +			len += sprintf(buf + len, "%-31s ", path);
  20.823 +
  20.824 +			if (!ptr->swap_device)
  20.825 +				len += sprintf(buf + len, "file\t\t");
  20.826 +			else
  20.827 +				len += sprintf(buf + len, "partition\t");
  20.828 +
  20.829 +			usedswap = 0;
  20.830 +			for (j = 0; j < ptr->max; ++j)
  20.831 +				switch (ptr->swap_map[j]) {
  20.832 +					case SWAP_MAP_BAD:
  20.833 +					case 0:
  20.834 +						continue;
  20.835 +					default:
  20.836 +						usedswap++;
  20.837 +				}
  20.838 +			len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), 
  20.839 +				usedswap << (PAGE_SHIFT - 10), ptr->prio);
  20.840 +		}
  20.841 +	}
  20.842 +	free_page((unsigned long) page);
  20.843 +	return len;
  20.844 +}
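
get_swaparea_info() produces the text returned by /proc/swaps in this tree, and the << (PAGE_SHIFT - 10) shifts turn page counts into the kilobyte figures of the Size and Used columns. A worked example, assuming the usual i386 PAGE_SHIFT of 12:

    pages << (PAGE_SHIFT - 10)  ==  pages << 2  ==  pages * 4
    e.g. an area with 131072 good pages is reported as 524288 (kB) under Size
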
  20.845 +
  20.846 +int is_swap_partition(kdev_t dev) {
  20.847 +	struct swap_info_struct *ptr = swap_info;
  20.848 +	int i;
  20.849 +
  20.850 +	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  20.851 +		if (ptr->flags & SWP_USED)
  20.852 +			if (ptr->swap_device == dev)
  20.853 +				return 1;
  20.854 +	}
  20.855 +	return 0;
  20.856 +}
  20.857 +
  20.858 +/*
  20.859 + * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
  20.860 + *
  20.861 + * The swapon system call
  20.862 + */
  20.863 +asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
  20.864 +{
  20.865 +	struct swap_info_struct * p;
  20.866 +	struct nameidata nd;
  20.867 +	struct inode * swap_inode;
  20.868 +	unsigned int type;
  20.869 +	int i, j, prev;
  20.870 +	int error;
  20.871 +	static int least_priority = 0;
  20.872 +	union swap_header *swap_header = 0;
  20.873 +	int swap_header_version;
  20.874 +	int nr_good_pages = 0;
  20.875 +	unsigned long maxpages = 1;
  20.876 +	int swapfilesize;
  20.877 +	struct block_device *bdev = NULL;
  20.878 +	unsigned short *swap_map;
  20.879 +	
  20.880 +	if (!capable(CAP_SYS_ADMIN))
  20.881 +		return -EPERM;
  20.882 +	lock_kernel();
  20.883 +	swap_list_lock();
  20.884 +	p = swap_info;
  20.885 +	for (type = 0 ; type < nr_swapfiles ; type++,p++)
  20.886 +		if (!(p->flags & SWP_USED))
  20.887 +			break;
  20.888 +	error = -EPERM;
  20.889 +	if (type >= MAX_SWAPFILES) {
  20.890 +		swap_list_unlock();
  20.891 +		goto out;
  20.892 +	}
  20.893 +	if (type >= nr_swapfiles)
  20.894 +		nr_swapfiles = type+1;
  20.895 +	p->flags = SWP_USED;
  20.896 +	p->swap_file = NULL;
  20.897 +	p->swap_vfsmnt = NULL;
  20.898 +	p->swap_device = 0;
  20.899 +	p->swap_map = NULL;
  20.900 +	p->lowest_bit = 0;
  20.901 +	p->highest_bit = 0;
  20.902 +	p->cluster_nr = 0;
  20.903 +	p->sdev_lock = SPIN_LOCK_UNLOCKED;
  20.904 +	p->next = -1;
  20.905 +	if (swap_flags & SWAP_FLAG_PREFER) {
  20.906 +		p->prio =
  20.907 +		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
  20.908 +	} else {
  20.909 +		p->prio = --least_priority;
  20.910 +	}
  20.911 +	swap_list_unlock();
  20.912 +	error = user_path_walk(specialfile, &nd);
  20.913 +	if (error)
  20.914 +		goto bad_swap_2;
  20.915 +
  20.916 +	p->swap_file = nd.dentry;
  20.917 +	p->swap_vfsmnt = nd.mnt;
  20.918 +	swap_inode = nd.dentry->d_inode;
  20.919 +	error = -EINVAL;
  20.920 +
  20.921 +	if (S_ISBLK(swap_inode->i_mode)) {
  20.922 +		kdev_t dev = swap_inode->i_rdev;
  20.923 +		struct block_device_operations *bdops;
  20.924 +
  20.925 +		p->swap_device = dev;
  20.926 +		set_blocksize(dev, PAGE_SIZE);
  20.927 +		
  20.928 +		bd_acquire(swap_inode);
  20.929 +		bdev = swap_inode->i_bdev;
  20.930 +		bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
  20.931 +		if (bdops) bdev->bd_op = bdops;
  20.932 +
  20.933 +		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
  20.934 +		if (error)
  20.935 +			goto bad_swap_2;
  20.936 +		set_blocksize(dev, PAGE_SIZE);
  20.937 +		error = -ENODEV;
  20.938 +		if (!dev || (blk_size[MAJOR(dev)] &&
  20.939 +		     !blk_size[MAJOR(dev)][MINOR(dev)]))
  20.940 +			goto bad_swap;
  20.941 +		swapfilesize = 0;
  20.942 +		if (blk_size[MAJOR(dev)])
  20.943 +			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
  20.944 +				>> (PAGE_SHIFT - 10);
  20.945 +	} else if (S_ISREG(swap_inode->i_mode))
  20.946 +		swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
  20.947 +	else
  20.948 +		goto bad_swap;
  20.949 +
  20.950 +	error = -EBUSY;
  20.951 +	for (i = 0 ; i < nr_swapfiles ; i++) {
  20.952 +		struct swap_info_struct *q = &swap_info[i];
  20.953 +		if (i == type || !q->swap_file)
  20.954 +			continue;
  20.955 +		if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
  20.956 +			goto bad_swap;
  20.957 +	}
  20.958 +
  20.959 +	swap_header = (void *) __get_free_page(GFP_USER);
  20.960 +	if (!swap_header) {
  20.961 +		printk("Unable to start swapping: out of memory :-)\n");
  20.962 +		error = -ENOMEM;
  20.963 +		goto bad_swap;
  20.964 +	}
  20.965 +
  20.966 +	lock_page(virt_to_page(swap_header));
  20.967 +	rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
  20.968 +
  20.969 +	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
  20.970 +		swap_header_version = 1;
  20.971 +	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
  20.972 +		swap_header_version = 2;
  20.973 +	else {
  20.974 +		printk("Unable to find swap-space signature\n");
  20.975 +		error = -EINVAL;
  20.976 +		goto bad_swap;
  20.977 +	}
  20.978 +	
  20.979 +	switch (swap_header_version) {
  20.980 +	case 1:
  20.981 +		memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
  20.982 +		j = 0;
  20.983 +		p->lowest_bit = 0;
  20.984 +		p->highest_bit = 0;
  20.985 +		for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
  20.986 +			if (test_bit(i,(char *) swap_header)) {
  20.987 +				if (!p->lowest_bit)
  20.988 +					p->lowest_bit = i;
  20.989 +				p->highest_bit = i;
  20.990 +				maxpages = i+1;
  20.991 +				j++;
  20.992 +			}
  20.993 +		}
  20.994 +		nr_good_pages = j;
  20.995 +		p->swap_map = vmalloc(maxpages * sizeof(short));
  20.996 +		if (!p->swap_map) {
  20.997 +			error = -ENOMEM;		
  20.998 +			goto bad_swap;
  20.999 +		}
 20.1000 +		for (i = 1 ; i < maxpages ; i++) {
 20.1001 +			if (test_bit(i,(char *) swap_header))
 20.1002 +				p->swap_map[i] = 0;
 20.1003 +			else
 20.1004 +				p->swap_map[i] = SWAP_MAP_BAD;
 20.1005 +		}
 20.1006 +		break;
 20.1007 +
 20.1008 +	case 2:
 20.1009 +		/* Check the swap header's sub-version and the size of
 20.1010 +		   the swap file and bad block lists */
 20.1011 +		if (swap_header->info.version != 1) {
 20.1012 +			printk(KERN_WARNING
 20.1013 +			       "Unable to handle swap header version %d\n",
 20.1014 +			       swap_header->info.version);
 20.1015 +			error = -EINVAL;
 20.1016 +			goto bad_swap;
 20.1017 +		}
 20.1018 +
 20.1019 +		p->lowest_bit  = 1;
 20.1020 +		maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
 20.1021 +		if (maxpages > swap_header->info.last_page)
 20.1022 +			maxpages = swap_header->info.last_page;
 20.1023 +		p->highest_bit = maxpages - 1;
 20.1024 +
 20.1025 +		error = -EINVAL;
 20.1026 +		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 20.1027 +			goto bad_swap;
 20.1028 +		
 20.1029 +		/* OK, set up the swap map and apply the bad block list */
 20.1030 +		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
 20.1031 +			error = -ENOMEM;
 20.1032 +			goto bad_swap;
 20.1033 +		}
 20.1034 +
 20.1035 +		error = 0;
 20.1036 +		memset(p->swap_map, 0, maxpages * sizeof(short));
 20.1037 +		for (i=0; i<swap_header->info.nr_badpages; i++) {
 20.1038 +			int page = swap_header->info.badpages[i];
 20.1039 +			if (page <= 0 || page >= swap_header->info.last_page)
 20.1040 +				error = -EINVAL;
 20.1041 +			else
 20.1042 +				p->swap_map[page] = SWAP_MAP_BAD;
 20.1043 +		}
 20.1044 +		nr_good_pages = swap_header->info.last_page -
 20.1045 +				swap_header->info.nr_badpages -
 20.1046 +				1 /* header page */;
 20.1047 +		if (error) 
 20.1048 +			goto bad_swap;
 20.1049 +	}
 20.1050 +	
 20.1051 +	if (swapfilesize && maxpages > swapfilesize) {
 20.1052 +		printk(KERN_WARNING
 20.1053 +		       "Swap area shorter than signature indicates\n");
 20.1054 +		error = -EINVAL;
 20.1055 +		goto bad_swap;
 20.1056 +	}
 20.1057 +	if (!nr_good_pages) {
 20.1058 +		printk(KERN_WARNING "Empty swap-file\n");
 20.1059 +		error = -EINVAL;
 20.1060 +		goto bad_swap;
 20.1061 +	}
 20.1062 +	p->swap_map[0] = SWAP_MAP_BAD;
 20.1063 +	swap_list_lock();
 20.1064 +	swap_device_lock(p);
 20.1065 +	p->max = maxpages;
 20.1066 +	p->flags = SWP_WRITEOK;
 20.1067 +	p->pages = nr_good_pages;
 20.1068 +	nr_swap_pages += nr_good_pages;
 20.1069 +	total_swap_pages += nr_good_pages;
 20.1070 +	printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
 20.1071 +	       nr_good_pages<<(PAGE_SHIFT-10), p->prio);
 20.1072 +
 20.1073 +	/* insert swap space into swap_list: */
 20.1074 +	prev = -1;
 20.1075 +	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
 20.1076 +		if (p->prio >= swap_info[i].prio) {
 20.1077 +			break;
 20.1078 +		}
 20.1079 +		prev = i;
 20.1080 +	}
 20.1081 +	p->next = i;
 20.1082 +	if (prev < 0) {
 20.1083 +		swap_list.head = swap_list.next = p - swap_info;
 20.1084 +	} else {
 20.1085 +		swap_info[prev].next = p - swap_info;
 20.1086 +	}
 20.1087 +	swap_device_unlock(p);
 20.1088 +	swap_list_unlock();
 20.1089 +	error = 0;
 20.1090 +	goto out;
 20.1091 +bad_swap:
 20.1092 +	if (bdev)
 20.1093 +		blkdev_put(bdev, BDEV_SWAP);
 20.1094 +bad_swap_2:
 20.1095 +	swap_list_lock();
 20.1096 +	swap_map = p->swap_map;
 20.1097 +	nd.mnt = p->swap_vfsmnt;
 20.1098 +	nd.dentry = p->swap_file;
 20.1099 +	p->swap_device = 0;
 20.1100 +	p->swap_file = NULL;
 20.1101 +	p->swap_vfsmnt = NULL;
 20.1102 +	p->swap_map = NULL;
 20.1103 +	p->flags = 0;
 20.1104 +	if (!(swap_flags & SWAP_FLAG_PREFER))
 20.1105 +		++least_priority;
 20.1106 +	swap_list_unlock();
 20.1107 +	if (swap_map)
 20.1108 +		vfree(swap_map);
 20.1109 +	path_release(&nd);
 20.1110 +out:
 20.1111 +	if (swap_header)
 20.1112 +		free_page((long) swap_header);
 20.1113 +	unlock_kernel();
 20.1114 +	return error;
 20.1115 +}
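
The priority logic at the top of sys_swapon() unpacks a value that userspace encodes into swap_flags. An illustrative caller using the swapon(2) wrapper and the SWAP_FLAG_* constants from <sys/swap.h> (the device path is hypothetical):

    /* Userspace sketch: enable a swap area at priority 5, packing the
     * priority into swap_flags exactly as sys_swapon() above unpacks it. */
    #include <stdio.h>
    #include <sys/swap.h>

    int main(void)
    {
    	int flags = SWAP_FLAG_PREFER |
    	            ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

    	if (swapon("/dev/hda2", flags) != 0) {	/* illustrative path */
    		perror("swapon");
    		return 1;
    	}
    	return 0;
    }
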
 20.1116 +
 20.1117 +void si_swapinfo(struct sysinfo *val)
 20.1118 +{
 20.1119 +	unsigned int i;
 20.1120 +	unsigned long nr_to_be_unused = 0;
 20.1121 +
 20.1122 +	swap_list_lock();
 20.1123 +	for (i = 0; i < nr_swapfiles; i++) {
 20.1124 +		unsigned int j;
 20.1125 +		if (swap_info[i].flags != SWP_USED)
 20.1126 +			continue;
 20.1127 +		for (j = 0; j < swap_info[i].max; ++j) {
 20.1128 +			switch (swap_info[i].swap_map[j]) {
 20.1129 +				case 0:
 20.1130 +				case SWAP_MAP_BAD:
 20.1131 +					continue;
 20.1132 +				default:
 20.1133 +					nr_to_be_unused++;
 20.1134 +			}
 20.1135 +		}
 20.1136 +	}
 20.1137 +	val->freeswap = nr_swap_pages + nr_to_be_unused;
 20.1138 +	val->totalswap = total_swap_pages + nr_to_be_unused;
 20.1139 +	swap_list_unlock();
 20.1140 +}
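
si_swapinfo() fills the swap fields that sys_sysinfo() hands back to userspace, so its totals are what sysinfo(2) reports (freeswap and totalswap are expressed in units of mem_unit). A small reader, for illustration only:

    /* Userspace sketch: print the swap totals fed by si_swapinfo(). */
    #include <stdio.h>
    #include <sys/sysinfo.h>

    int main(void)
    {
    	struct sysinfo si;

    	if (sysinfo(&si) != 0) {
    		perror("sysinfo");
    		return 1;
    	}
    	printf("swap: %lu of %lu units free (unit = %u bytes)\n",
    	       si.freeswap, si.totalswap, si.mem_unit);
    	return 0;
    }
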
 20.1141 +
 20.1142 +/*
 20.1143 + * Verify that a swap entry is valid and increment its swap map count.
 20.1144 + *
 20.1145 + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 20.1146 + * "permanent", but will be reclaimed by the next swapoff.
 20.1147 + */
 20.1148 +int swap_duplicate(swp_entry_t entry)
 20.1149 +{
 20.1150 +	struct swap_info_struct * p;
 20.1151 +	unsigned long offset, type;
 20.1152 +	int result = 0;
 20.1153 +
 20.1154 +	type = SWP_TYPE(entry);
 20.1155 +	if (type >= nr_swapfiles)
 20.1156 +		goto bad_file;
 20.1157 +	p = type + swap_info;
 20.1158 +	offset = SWP_OFFSET(entry);
 20.1159 +
 20.1160 +	swap_device_lock(p);
 20.1161 +	if (offset < p->max && p->swap_map[offset]) {
 20.1162 +		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
 20.1163 +			p->swap_map[offset]++;
 20.1164 +			result = 1;
 20.1165 +		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
 20.1166 +			if (swap_overflow++ < 5)
 20.1167 +				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
 20.1168 +			p->swap_map[offset] = SWAP_MAP_MAX;
 20.1169 +			result = 1;
 20.1170 +		}
 20.1171 +	}
 20.1172 +	swap_device_unlock(p);
 20.1173 +out:
 20.1174 +	return result;
 20.1175 +
 20.1176 +bad_file:
 20.1177 +	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 20.1178 +	goto out;
 20.1179 +}
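
The commonest taker of these extra references is fork(): when copy_page_range() in mm/memory.c meets a pte that is swapped out, it duplicates the swap entry rather than any page frame. A distilled sketch of that caller (illustrative only, assuming the 2.4 pte helpers from asm/pgtable.h):

    #include <linux/mm.h>
    #include <linux/swap.h>
    #include <asm/pgtable.h>

    /*
     * Sketch only: what the fork() path does for a swapped-out pte -
     * the swap entry, not a page frame, gains the reference.
     */
    static inline void copy_swap_pte_sketch(pte_t *dst_pte, pte_t pte)
    {
    	if (!pte_none(pte) && !pte_present(pte)) {
    		swap_duplicate(pte_to_swp_entry(pte));
    		set_pte(dst_pte, pte);
    	}
    }
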
 20.1180 +
 20.1181 +/*
 20.1182 + * Page lock needs to be held in all cases to prevent races with
 20.1183 + * swap file deletion.
 20.1184 + */
 20.1185 +int swap_count(struct page *page)
 20.1186 +{
 20.1187 +	struct swap_info_struct * p;
 20.1188 +	unsigned long offset, type;
 20.1189 +	swp_entry_t entry;
 20.1190 +	int retval = 0;
 20.1191 +
 20.1192 +	entry.val = page->index;
 20.1193 +	if (!entry.val)
 20.1194 +		goto bad_entry;
 20.1195 +	type = SWP_TYPE(entry);
 20.1196 +	if (type >= nr_swapfiles)
 20.1197 +		goto bad_file;
 20.1198 +	p = type + swap_info;
 20.1199 +	offset = SWP_OFFSET(entry);
 20.1200 +	if (offset >= p->max)
 20.1201 +		goto bad_offset;
 20.1202 +	if (!p->swap_map[offset])
 20.1203 +		goto bad_unused;
 20.1204 +	retval = p->swap_map[offset];
 20.1205 +out:
 20.1206 +	return retval;
 20.1207 +
 20.1208 +bad_entry:
 20.1209 +	printk(KERN_ERR "swap_count: null entry!\n");
 20.1210 +	goto out;
 20.1211 +bad_file:
 20.1212 +	printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val);
 20.1213 +	goto out;
 20.1214 +bad_offset:
 20.1215 +	printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val);
 20.1216 +	goto out;
 20.1217 +bad_unused:
 20.1218 +	printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val);
 20.1219 +	goto out;
 20.1220 +}
 20.1221 +
 20.1222 +/*
 20.1223 + * Prior swap_duplicate protects against swap device deletion.
 20.1224 + */
 20.1225 +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
 20.1226 +			kdev_t *dev, struct inode **swapf)
 20.1227 +{
 20.1228 +	unsigned long type;
 20.1229 +	struct swap_info_struct *p;
 20.1230 +
 20.1231 +	type = SWP_TYPE(entry);
 20.1232 +	if (type >= nr_swapfiles) {
 20.1233 +		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
 20.1234 +		return;
 20.1235 +	}
 20.1236 +
 20.1237 +	p = &swap_info[type];
 20.1238 +	*offset = SWP_OFFSET(entry);
 20.1239 +	if (*offset >= p->max && *offset != 0) {
 20.1240 +		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
 20.1241 +		return;
 20.1242 +	}
 20.1243 +	if (p->swap_map && !p->swap_map[*offset]) {
 20.1244 +		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
 20.1245 +		return;
 20.1246 +	}
 20.1247 +	if (!(p->flags & SWP_USED)) {
 20.1248 +		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
 20.1249 +		return;
 20.1250 +	}
 20.1251 +
 20.1252 +	if (p->swap_device) {
 20.1253 +		*dev = p->swap_device;
 20.1254 +	} else if (p->swap_file) {
 20.1255 +		*swapf = p->swap_file->d_inode;
 20.1256 +	} else {
 20.1257 +		printk(KERN_ERR "rw_swap_page: no swap file or device\n");
 20.1258 +	}
 20.1259 +	return;
 20.1260 +}
 20.1261 +
 20.1262 +/*
 20.1263 + * swap_device_lock prevents swap_map being freed. Don't grab an extra
 20.1264 + * reference on the swaphandle; it doesn't matter if it becomes unused.
 20.1265 + */
 20.1266 +int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 20.1267 +{
 20.1268 +	int ret = 0, i = 1 << page_cluster;
 20.1269 +	unsigned long toff;
 20.1270 +	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 20.1271 +
 20.1272 +	if (!page_cluster)	/* no readahead */
 20.1273 +		return 0;
 20.1274 +	toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
 20.1275 +	if (!toff)		/* first page is swap header */
 20.1276 +		toff++, i--;
 20.1277 +	*offset = toff;
 20.1278 +
 20.1279 +	swap_device_lock(swapdev);
 20.1280 +	do {
 20.1281 +		/* Don't read-ahead past the end of the swap area */
 20.1282 +		if (toff >= swapdev->max)
 20.1283 +			break;
 20.1284 +		/* Don't read in free or bad pages */
 20.1285 +		if (!swapdev->swap_map[toff])
 20.1286 +			break;
 20.1287 +		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
 20.1288 +			break;
 20.1289 +		toff++;
 20.1290 +		ret++;
 20.1291 +	} while (--i);
 20.1292 +	swap_device_unlock(swapdev);
 20.1293 +	return ret;
 20.1294 +}
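
valid_swaphandles() sizes the readahead window used when a swap entry is faulted back in: the target offset is rounded down to a 2^page_cluster boundary and consecutive usable slots are counted from there. A worked example, assuming page_cluster = 3:

    entry offset = 21, page_cluster = 3
    	i    = 1 << 3         = 8 slots in the window
    	toff = (21 >> 3) << 3 = 16, so *offset is set to 16
    	the loop scans slots 16..23, stopping early at the first free or
    	SWAP_MAP_BAD slot, and returns how many are worth reading ahead

    If toff were 0, slot 0 holds the swap header, so the window is shifted
    to start at slot 1 and shrunk by one (toff++, i--).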