direct-io.hg

changeset 2314:b1347b2eb538

bitkeeper revision 1.1159.1.92 (4125c2a1hkXswFK5ZlWTGYbKQihTmQ)

Merge xenbk@gandalf:/var/bk/xeno-unstable.bk
into wray-m-3.hpl.hp.com:/home/mjw/repos-bk/xeno-unstable.bk
author mjw@wray-m-3.hpl.hp.com
date Fri Aug 20 09:21:37 2004 +0000 (2004-08-20)
parents 8aa79ed4d0b4 76d1b88c0c2e
children 6f8b5d4e7a39
files .rootkeys BitKeeper/etc/ignore linux-2.4.26-xen-sparse/arch/xen/config.in linux-2.4.26-xen-sparse/arch/xen/defconfig-xen0 linux-2.4.26-xen-sparse/arch/xen/defconfig-xenU linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c linux-2.4.26-xen-sparse/arch/xen/mm/ioremap.c linux-2.4.26-xen-sparse/drivers/char/mem.c linux-2.4.26-xen-sparse/include/asm-xen/pgtable-2level.h linux-2.4.26-xen-sparse/include/linux/mm.h linux-2.4.26-xen-sparse/mm/page_alloc.c linux-2.6.7-xen-sparse/arch/xen/Kconfig linux-2.6.7-xen-sparse/arch/xen/configs/xen0_defconfig linux-2.6.7-xen-sparse/arch/xen/configs/xenU_defconfig linux-2.6.7-xen-sparse/arch/xen/i386/kernel/pci-dma.c linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c linux-2.6.7-xen-sparse/arch/xen/i386/mm/ioremap.c linux-2.6.7-xen-sparse/drivers/char/mem.c linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/io.h linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h linux-2.6.7-xen-sparse/include/linux/bio.h linux-2.6.7-xen-sparse/include/linux/page-flags.h linux-2.6.7-xen-sparse/include/linux/skbuff.h linux-2.6.7-xen-sparse/mm/page_alloc.c tools/examples/Makefile tools/examples/xmdefconfig tools/examples/xmdefconfig-example tools/examples/xmdefconfig-netbsd tools/examples/xmexample1 tools/examples/xmexample2 tools/libxc/xc.h tools/libxc/xc_linux_save.c tools/python/xen/xm/opts.py xen/arch/x86/memory.c xen/arch/x86/setup.c xen/arch/x86/shadow.c xen/arch/x86/smp.c xen/common/dom0_ops.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xen/common/page_alloc.c xen/include/asm-x86/mm.h xen/include/hypervisor-ifs/hypervisor-if.h xen/include/xen/sched.h
line diff
     1.1 --- a/.rootkeys	Fri Aug 20 09:11:43 2004 +0000
     1.2 +++ b/.rootkeys	Fri Aug 20 09:21:37 2004 +0000
     1.3 @@ -111,6 +111,7 @@ 3e5a4e68mTr0zcp9SXDbnd-XLrrfxw linux-2.4
     1.4  3f1056a9L_kqHcFheV00KbKBzv9j5w linux-2.4.26-xen-sparse/include/asm-xen/vga.h
     1.5  40659defgWA92arexpMGn8X3QMDj3w linux-2.4.26-xen-sparse/include/asm-xen/xor.h
     1.6  3f056927gMHl7mWB89rb73JahbhQIA linux-2.4.26-xen-sparse/include/linux/blk.h
     1.7 +4124f66fPHG6yvB_vXmesjvzrJ3yMg linux-2.4.26-xen-sparse/include/linux/mm.h
     1.8  401c0590D_kwJDU59X8NyvqSv_Cl2A linux-2.4.26-xen-sparse/include/linux/sched.h
     1.9  40a248afgI0_JKthdYAe8beVfXSTpQ linux-2.4.26-xen-sparse/include/linux/skbuff.h
    1.10  401c0592pLrp_aCbQRo9GXiYQQaVVA linux-2.4.26-xen-sparse/include/linux/timer.h
    1.11 @@ -242,6 +243,9 @@ 3f108af1ylCIm82H052FVTfXACBHrw linux-2.6
    1.12  4122466356eIBnC9ot44WSVVIFyhQA linux-2.6.7-xen-sparse/include/asm-xen/queues.h
    1.13  3fa8e3f0kBLeE4To2vpdi3cpJbIkbQ linux-2.6.7-xen-sparse/include/asm-xen/suspend.h
    1.14  3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
    1.15 +4124d8c4aocX7A-jIbuGraWN84pxGQ linux-2.6.7-xen-sparse/include/linux/bio.h
    1.16 +4124f66fp5QwbDHEfoUIa7pqO5Xhag linux-2.6.7-xen-sparse/include/linux/page-flags.h
    1.17 +4124f66f4NaKNa0xPiGGykn9QaZk3w linux-2.6.7-xen-sparse/include/linux/skbuff.h
    1.18  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
    1.19  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
    1.20  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.21 @@ -264,9 +268,8 @@ 405ff55dawQyCHFEnJ067ChPRoXBBA tools/exa
    1.22  40ee75a9xFz6S05sDKu-JCLqyVTkDA tools/examples/network
    1.23  40ee75a967sxgcRY4Q7zXoVUaJ4flA tools/examples/vif-bridge
    1.24  40ee75a93cqxHp6MiYXxxwR5j2_8QQ tools/examples/xend-config.sxp
    1.25 -41090ec8Pj_bkgCBpg2W7WfmNkumEA tools/examples/xmdefconfig
    1.26 -40cf2937oKlROYOJTN8GWwWM5AmjBg tools/examples/xmdefconfig-example
    1.27 -40dfd40auJwNnb8NoiSnRkvZaaXkUg tools/examples/xmdefconfig-netbsd
    1.28 +41090ec8Pj_bkgCBpg2W7WfmNkumEA tools/examples/xmexample1
    1.29 +40cf2937oKlROYOJTN8GWwWM5AmjBg tools/examples/xmexample2
    1.30  3fbba6dbDfYvJSsw9500b4SZyUhxjQ tools/libxc/Makefile
    1.31  3fbba6dc1uU7U3IFeF6A-XEOYF2MkQ tools/libxc/rpm.spec
    1.32  3fbba6dcrNxtygEcgJYAJJ1gCQqfsA tools/libxc/xc.h
     2.1 --- a/BitKeeper/etc/ignore	Fri Aug 20 09:11:43 2004 +0000
     2.2 +++ b/BitKeeper/etc/ignore	Fri Aug 20 09:21:37 2004 +0000
     2.3 @@ -35,6 +35,7 @@ linux-xen-sparse
     2.4  patches/*
     2.5  tools/*/build/lib*/*.py
     2.6  tools/balloon/balloon
     2.7 +tools/check/.*
     2.8  tools/misc/miniterm/miniterm
     2.9  tools/misc/xen_cpuperf
    2.10  tools/web-shutdown.tap
     3.1 --- a/linux-2.4.26-xen-sparse/arch/xen/config.in	Fri Aug 20 09:11:43 2004 +0000
     3.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/config.in	Fri Aug 20 09:21:37 2004 +0000
     3.3 @@ -20,7 +20,10 @@ endmenu
     3.4  # The IBM S/390 patch needs this.
     3.5  define_bool CONFIG_NO_IDLE_HZ y
     3.6  
     3.7 -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" != "y" ]; then
     3.8 +if [ "$CONFIG_XEN_PHYSDEV_ACCESS" == "y" ]; then
     3.9 +   define_bool CONFIG_FOREIGN_PAGES y
    3.10 +else
    3.11 +   define_bool CONFIG_FOREIGN_PAGES n
    3.12     define_bool CONFIG_NETDEVICES y
    3.13     define_bool CONFIG_VT n
    3.14  fi
    3.15 @@ -103,7 +106,7 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
    3.16     bool 'HIGHMEM I/O support' CONFIG_HIGHIO
    3.17  fi
    3.18  
    3.19 -define_int CONFIG_FORCE_MAX_ZONEORDER 12
    3.20 +define_int CONFIG_FORCE_MAX_ZONEORDER 11
    3.21  
    3.22  #bool 'Symmetric multi-processing support' CONFIG_SMP
    3.23  #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
     4.1 --- a/linux-2.4.26-xen-sparse/arch/xen/defconfig-xen0	Fri Aug 20 09:11:43 2004 +0000
     4.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/defconfig-xen0	Fri Aug 20 09:21:37 2004 +0000
     4.3 @@ -13,6 +13,7 @@ CONFIG_UID16=y
     4.4  CONFIG_XEN_PRIVILEGED_GUEST=y
     4.5  CONFIG_XEN_PHYSDEV_ACCESS=y
     4.6  CONFIG_NO_IDLE_HZ=y
     4.7 +CONFIG_FOREIGN_PAGES=y
     4.8  
     4.9  #
    4.10  # Code maturity level options
    4.11 @@ -50,7 +51,7 @@ CONFIG_X86_TSC=y
    4.12  CONFIG_X86_L1_CACHE_SHIFT=5
    4.13  CONFIG_NOHIGHMEM=y
    4.14  # CONFIG_HIGHMEM4G is not set
    4.15 -CONFIG_FORCE_MAX_ZONEORDER=12
    4.16 +CONFIG_FORCE_MAX_ZONEORDER=11
    4.17  
    4.18  #
    4.19  # General setup
     5.1 --- a/linux-2.4.26-xen-sparse/arch/xen/defconfig-xenU	Fri Aug 20 09:11:43 2004 +0000
     5.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/defconfig-xenU	Fri Aug 20 09:21:37 2004 +0000
     5.3 @@ -13,6 +13,7 @@ CONFIG_UID16=y
     5.4  # CONFIG_XEN_PRIVILEGED_GUEST is not set
     5.5  # CONFIG_XEN_PHYSDEV_ACCESS is not set
     5.6  CONFIG_NO_IDLE_HZ=y
     5.7 +# CONFIG_FOREIGN_PAGES is not set
     5.8  CONFIG_NETDEVICES=y
     5.9  # CONFIG_VT is not set
    5.10  
    5.11 @@ -52,7 +53,6 @@ CONFIG_X86_TSC=y
    5.12  CONFIG_X86_L1_CACHE_SHIFT=5
    5.13  CONFIG_NOHIGHMEM=y
    5.14  # CONFIG_HIGHMEM4G is not set
    5.15 -CONFIG_FORCE_MAX_ZONEORDER=12
    5.16  
    5.17  #
    5.18  # General setup
     6.1 --- a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c	Fri Aug 20 09:11:43 2004 +0000
     6.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c	Fri Aug 20 09:21:37 2004 +0000
     6.3 @@ -36,13 +36,16 @@ typedef struct user_balloon_op {
     6.4  } user_balloon_op_t;
     6.5  /* END OF USER DEFINE */
     6.6  
     6.7 -/* Dead entry written into balloon-owned entries in the PMT. */
     6.8 -#define DEAD 0xdeadbeef
     6.9 -
    6.10  static struct proc_dir_entry *balloon_pde;
    6.11  unsigned long credit;
    6.12  static unsigned long current_pages, most_seen_pages;
    6.13  
    6.14 +/*
    6.15 + * Dead entry written into balloon-owned entries in the PMT.
    6.16 + * It is deliberately different to INVALID_P2M_ENTRY.
    6.17 + */
    6.18 +#define DEAD 0xdead1234
    6.19 +
    6.20  static inline pte_t *get_ptep(unsigned long addr)
    6.21  {
    6.22      pgd_t *pgd; pmd_t *pmd; pte_t *ptep;
    6.23 @@ -79,17 +82,16 @@ static unsigned long inflate_balloon(uns
    6.24      for ( i = 0; i < num_pages; i++, currp++ )
    6.25      {
    6.26  	struct page *page = alloc_page(GFP_HIGHUSER);
    6.27 -	unsigned long pfn =  page - mem_map;
    6.28 +	unsigned long pfn = page - mem_map;
    6.29  
    6.30          /* If allocation fails then free all reserved pages. */
    6.31 -        if ( page == 0 )
    6.32 +        if ( page == NULL )
    6.33          {
    6.34 -            printk(KERN_ERR "Unable to inflate balloon by %ld, only %ld pages free.",
    6.35 -                   num_pages, i);
    6.36 +            printk(KERN_ERR "Unable to inflate balloon by %ld, only"
    6.37 +                   " %ld pages free.", num_pages, i);
    6.38              currp = parray;
    6.39 -            for(j = 0; j < i; j++, ++currp){
    6.40 +            for ( j = 0; j < i; j++, currp++ )
    6.41                  __free_page((struct page *) (mem_map + *currp));
    6.42 -            }
    6.43  	    ret = -EFAULT;
    6.44              goto cleanup;
    6.45          }
    6.46 @@ -102,9 +104,8 @@ static unsigned long inflate_balloon(uns
    6.47      {
    6.48  	unsigned long mfn = phys_to_machine_mapping[*currp];
    6.49          curraddr = (unsigned long)page_address(mem_map + *currp);
    6.50 -	if (curraddr)
    6.51 +	if ( curraddr != 0 )
    6.52              queue_l1_entry_update(get_ptep(curraddr), 0);
    6.53 -
    6.54          phys_to_machine_mapping[*currp] = DEAD;
    6.55          *currp = mfn;
    6.56      }
    6.57 @@ -313,17 +314,18 @@ claim_new_pages(unsigned long num_pages)
    6.58      XEN_flush_page_update_queue();
    6.59      new_page_cnt = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation, 
    6.60                                  parray, num_pages, 0);
    6.61 -    if (new_page_cnt != num_pages)
    6.62 +    if ( new_page_cnt != num_pages )
    6.63      {
    6.64          printk(KERN_WARNING
    6.65              "claim_new_pages: xen granted only %lu of %lu requested pages\n",
    6.66              new_page_cnt, num_pages);
    6.67  
    6.68 -	/* XXX
    6.69 -	 * avoid xen lockup when user forgot to setdomainmaxmem.  xen
    6.70 -	 * usually can dribble out a few pages and then hangs
    6.71 +	/* 
    6.72 +	 * Avoid xen lockup when user forgot to setdomainmaxmem. Xen
    6.73 +	 * usually can dribble out a few pages and then hangs.
    6.74  	 */
    6.75 -	if (new_page_cnt < 1000) {
    6.76 +	if ( new_page_cnt < 1000 )
    6.77 +        {
    6.78              printk(KERN_WARNING "Remember to use setdomainmaxmem\n");
    6.79  	    HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, 
    6.80                                  parray, new_page_cnt, 0);
    6.81 @@ -331,7 +333,7 @@ claim_new_pages(unsigned long num_pages)
    6.82  	}
    6.83      }
    6.84      memcpy(phys_to_machine_mapping+most_seen_pages, parray,
    6.85 -            new_page_cnt * sizeof(unsigned long));
    6.86 +           new_page_cnt * sizeof(unsigned long));
    6.87  
    6.88      pagetable_extend(most_seen_pages,new_page_cnt);
    6.89  
    6.90 @@ -465,12 +467,15 @@ static int __init init_module(void)
    6.91      /* 
    6.92       * make a new phys map if mem= says xen can give us memory  to grow
    6.93       */
    6.94 -    if (max_pfn > start_info.nr_pages) {
    6.95 +    if ( max_pfn > start_info.nr_pages )
    6.96 +    {
    6.97          extern unsigned long *phys_to_machine_mapping;
    6.98          unsigned long *newmap;
    6.99          newmap = (unsigned long *)vmalloc(max_pfn * sizeof(unsigned long));
   6.100 -        phys_to_machine_mapping = memcpy(newmap, phys_to_machine_mapping,
   6.101 -            start_info.nr_pages * sizeof(unsigned long));
   6.102 +        memset(newmap, ~0, max_pfn * sizeof(unsigned long));
   6.103 +        memcpy(newmap, phys_to_machine_mapping,
   6.104 +               start_info.nr_pages * sizeof(unsigned long));
   6.105 +        phys_to_machine_mapping = newmap;
   6.106      }
   6.107  
   6.108      return 0;
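
The init_module() change above grows the phys-to-machine table when mem= allows the domain to be handed more memory than it booted with, and pre-fills the new slots with all-ones bytes so that unpopulated entries read back as an invalid marker rather than as machine frame 0. A minimal sketch of the same idea, not part of the changeset; grow_p2m_map() is a hypothetical name, and INVALID_P2M_ENTRY is the ~0UL marker added in pgtable-2level.h below:

    /* Illustrative sketch only: grow a p2m array, marking not-yet-populated
     * slots invalid before copying the existing entries over. */
    static unsigned long *grow_p2m_map(unsigned long *oldmap,
                                       unsigned long old_pages,
                                       unsigned long max_pfn)
    {
        unsigned long *newmap = vmalloc(max_pfn * sizeof(unsigned long));
        unsigned long i;

        if (newmap == NULL)
            return oldmap;                    /* keep the old map on failure */
        for (i = 0; i < max_pfn; i++)
            newmap[i] = INVALID_P2M_ENTRY;    /* same effect as memset(newmap, ~0, ...) */
        memcpy(newmap, oldmap, old_pages * sizeof(unsigned long));
        return newmap;
    }
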
     7.1 --- a/linux-2.4.26-xen-sparse/arch/xen/mm/ioremap.c	Fri Aug 20 09:11:43 2004 +0000
     7.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/mm/ioremap.c	Fri Aug 20 09:21:37 2004 +0000
     7.3 @@ -115,17 +115,10 @@ int direct_remap_area_pages(struct mm_st
     7.4  #define MAX_DIRECTMAP_MMU_QUEUE 130
     7.5      mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
     7.6  
     7.7 -    if ( domid != 0 )
     7.8 -    {
     7.9 -        u[0].ptr  = MMU_EXTENDED_COMMAND;
    7.10 -        u[0].val  = MMUEXT_SET_FOREIGNDOM;
    7.11 -        u[0].val |= (unsigned long)domid << 16;
    7.12 -        v = w = &u[1];
    7.13 -    }
    7.14 -    else
    7.15 -    {
    7.16 -        v = w = &u[0];
    7.17 -    }
    7.18 +    u[0].ptr  = MMU_EXTENDED_COMMAND;
    7.19 +    u[0].val  = MMUEXT_SET_FOREIGNDOM;
    7.20 +    u[0].val |= (unsigned long)domid << 16;
    7.21 +    v = w = &u[1];
    7.22  
    7.23      start_address = address;
    7.24  
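
The hunk above makes direct_remap_area_pages() always begin its batch of mmu_update requests with an MMUEXT_SET_FOREIGNDOM command, instead of only when domid != 0, so the target domain (including pseudo-domains such as DOMID_IO, which the /dev/mem driver below now records in file->private_data) is always stated explicitly. A rough sketch of the batching pattern, not part of the changeset; the per-PTE value arrays and the issue_mmu_updates() flush helper are placeholders, since the real flush path and hypercall signature live elsewhere in this tree:

    /* Illustrative sketch only: prefix a batch of PTE updates with a
     * SET_FOREIGNDOM command so Xen applies them on behalf of 'domid'. */
    static int remap_with_foreigndom(domid_t domid, unsigned long nr_updates,
                                     unsigned long *pte_machine_addr,
                                     unsigned long *new_pte_val)
    {
        mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v;
        unsigned long i;

        u[0].ptr  = MMU_EXTENDED_COMMAND;
        u[0].val  = MMUEXT_SET_FOREIGNDOM | ((unsigned long)domid << 16);
        v = &u[1];

        for (i = 0; i < nr_updates; i++)
        {
            v->ptr = pte_machine_addr[i];    /* machine address of the PTE to write */
            v->val = new_pte_val[i];         /* new PTE contents: frame | prot bits */
            if ( ++v == u + MAX_DIRECTMAP_MMU_QUEUE )
            {
                issue_mmu_updates(u, v - u); /* placeholder flush helper */
                v = &u[1];                   /* keep re-using the FOREIGNDOM prefix in u[0] */
            }
        }
        if ( v != &u[1] )
            issue_mmu_updates(u, v - u);
        return 0;
    }
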
     8.1 --- a/linux-2.4.26-xen-sparse/drivers/char/mem.c	Fri Aug 20 09:11:43 2004 +0000
     8.2 +++ b/linux-2.4.26-xen-sparse/drivers/char/mem.c	Fri Aug 20 09:21:37 2004 +0000
     8.3 @@ -237,6 +237,9 @@ static int mmap_mem(struct file * file, 
     8.4  	if (!(start_info.flags & SIF_PRIVILEGED))
     8.5  		return -ENXIO;
     8.6  
     8.7 +	if (file->private_data == NULL)
     8.8 +		file->private_data = (void *)(unsigned long)DOMID_IO;
     8.9 +
    8.10  	/* DONTCOPY is essential for Xen as copy_page_range is broken. */
    8.11  	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
    8.12  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
     9.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgtable-2level.h	Fri Aug 20 09:11:43 2004 +0000
     9.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgtable-2level.h	Fri Aug 20 09:21:37 2004 +0000
     9.3 @@ -58,7 +58,19 @@ static inline pmd_t * pmd_offset(pgd_t *
     9.4   *     then we'll have p2m(m2p(MFN))==MFN.
     9.5   * If we detect a special mapping then it doesn't have a 'struct page'.
     9.6   * We force !VALID_PAGE() by returning an out-of-range pointer.
     9.7 + *
     9.8 + * NB. These checks require that, for any MFN that is not in our reservation,
     9.9 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
     9.10 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
    9.11 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
    9.12 + * 
    9.13 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
    9.14 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
    9.15 + *      require. In all the cases we care about, the high bit gets shifted out
    9.16 + *      (e.g., phys_to_machine()) so behaviour there is correct.
    9.17   */
    9.18 +#define INVALID_P2M_ENTRY (~0UL)
    9.19 +#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1)))
    9.20  #define pte_page(_pte)                                        \
    9.21  ({                                                            \
    9.22      unsigned long mfn = (_pte).pte_low >> PAGE_SHIFT;         \
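
The INVALID_P2M_ENTRY and FOREIGN_FRAME definitions added above implement the rule spelled out in the comment: a machine frame outside our reservation must never satisfy p2m(m2p(MFN)) == MFN. Setting the top bit of the stored MFN guarantees that comparison fails (and makes pte_pfn() yield an out-of-range value), while macros that shift the entry up by PAGE_SHIFT simply lose the marker bit. A minimal sketch of the convention, not part of the changeset and with hypothetical helper names:

    /*
     * Illustrative sketch only. mark_foreign_p2m() and mfn_is_ours() are
     * hypothetical helpers; phys_to_machine_mapping and machine_to_phys_mapping
     * are the p2m and m2p tables used elsewhere in this tree.
     */
    static inline void mark_foreign_p2m(unsigned long pfn, unsigned long mfn)
    {
        /* Top bit set: this slot can never pass the p2m(m2p(mfn)) == mfn test. */
        phys_to_machine_mapping[pfn] = FOREIGN_FRAME(mfn);
    }

    static inline int mfn_is_ours(unsigned long mfn)
    {
        /* True only for frames in our reservation; FOREIGN_FRAME()-tagged and
           INVALID_P2M_ENTRY slots never match the raw MFN. */
        return phys_to_machine_mapping[machine_to_phys_mapping[mfn]] == mfn;
    }

The blkback change later in this changeset uses exactly this convention when mapping another domain's buffer frames: phys_to_machine_mapping[...] = FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT).
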
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/linux-2.4.26-xen-sparse/include/linux/mm.h	Fri Aug 20 09:21:37 2004 +0000
    10.3 @@ -0,0 +1,703 @@
    10.4 +#ifndef _LINUX_MM_H
    10.5 +#define _LINUX_MM_H
    10.6 +
    10.7 +#include <linux/sched.h>
    10.8 +#include <linux/errno.h>
    10.9 +
   10.10 +#ifdef __KERNEL__
   10.11 +
   10.12 +#include <linux/config.h>
   10.13 +#include <linux/string.h>
   10.14 +#include <linux/list.h>
   10.15 +#include <linux/mmzone.h>
   10.16 +#include <linux/swap.h>
   10.17 +#include <linux/rbtree.h>
   10.18 +
   10.19 +extern unsigned long max_mapnr;
   10.20 +extern unsigned long num_physpages;
   10.21 +extern unsigned long num_mappedpages;
   10.22 +extern void * high_memory;
   10.23 +extern int page_cluster;
   10.24 +/* The inactive_clean lists are per zone. */
   10.25 +extern struct list_head active_list;
   10.26 +extern struct list_head inactive_list;
   10.27 +
   10.28 +#include <asm/page.h>
   10.29 +#include <asm/pgtable.h>
   10.30 +#include <asm/atomic.h>
   10.31 +
   10.32 +/*
   10.33 + * Linux kernel virtual memory manager primitives.
   10.34 + * The idea being to have a "virtual" mm in the same way
   10.35 + * we have a virtual fs - giving a cleaner interface to the
   10.36 + * mm details, and allowing different kinds of memory mappings
   10.37 + * (from shared memory to executable loading to arbitrary
   10.38 + * mmap() functions).
   10.39 + */
   10.40 +
   10.41 +/*
   10.42 + * This struct defines a memory VMM memory area. There is one of these
   10.43 + * per VM-area/task.  A VM area is any part of the process virtual memory
   10.44 + * space that has a special rule for the page-fault handlers (ie a shared
   10.45 + * library, the executable area etc).
   10.46 + */
   10.47 +struct vm_area_struct {
   10.48 +	struct mm_struct * vm_mm;	/* The address space we belong to. */
   10.49 +	unsigned long vm_start;		/* Our start address within vm_mm. */
   10.50 +	unsigned long vm_end;		/* The first byte after our end address
   10.51 +					   within vm_mm. */
   10.52 +
   10.53 +	/* linked list of VM areas per task, sorted by address */
   10.54 +	struct vm_area_struct *vm_next;
   10.55 +
   10.56 +	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
   10.57 +	unsigned long vm_flags;		/* Flags, listed below. */
   10.58 +
   10.59 +	rb_node_t vm_rb;
   10.60 +
   10.61 +	/*
   10.62 +	 * For areas with an address space and backing store,
   10.63 +	 * one of the address_space->i_mmap{,shared} lists,
   10.64 +	 * for shm areas, the list of attaches, otherwise unused.
   10.65 +	 */
   10.66 +	struct vm_area_struct *vm_next_share;
   10.67 +	struct vm_area_struct **vm_pprev_share;
   10.68 +
   10.69 +	/* Function pointers to deal with this struct. */
   10.70 +	struct vm_operations_struct * vm_ops;
   10.71 +
   10.72 +	/* Information about our backing store: */
   10.73 +	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
   10.74 +					   units, *not* PAGE_CACHE_SIZE */
   10.75 +	struct file * vm_file;		/* File we map to (can be NULL). */
   10.76 +	unsigned long vm_raend;		/* XXX: put full readahead info here. */
   10.77 +	void * vm_private_data;		/* was vm_pte (shared mem) */
   10.78 +};
   10.79 +
   10.80 +/*
   10.81 + * vm_flags..
   10.82 + */
   10.83 +#define VM_READ		0x00000001	/* currently active flags */
   10.84 +#define VM_WRITE	0x00000002
   10.85 +#define VM_EXEC		0x00000004
   10.86 +#define VM_SHARED	0x00000008
   10.87 +
   10.88 +#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
   10.89 +#define VM_MAYWRITE	0x00000020
   10.90 +#define VM_MAYEXEC	0x00000040
   10.91 +#define VM_MAYSHARE	0x00000080
   10.92 +
   10.93 +#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
   10.94 +#define VM_GROWSUP	0x00000200
   10.95 +#define VM_SHM		0x00000400	/* shared memory area, don't swap out */
   10.96 +#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
   10.97 +
   10.98 +#define VM_EXECUTABLE	0x00001000
   10.99 +#define VM_LOCKED	0x00002000
  10.100 +#define VM_IO           0x00004000	/* Memory mapped I/O or similar */
  10.101 +
  10.102 +					/* Used by sys_madvise() */
  10.103 +#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
  10.104 +#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
  10.105 +
  10.106 +#define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
  10.107 +#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
  10.108 +#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
  10.109 +
  10.110 +#ifndef VM_STACK_FLAGS
  10.111 +#define VM_STACK_FLAGS	0x00000177
  10.112 +#endif
  10.113 +
  10.114 +#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
  10.115 +#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
  10.116 +#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
  10.117 +#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
  10.118 +#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
  10.119 +
  10.120 +/* read ahead limits */
  10.121 +extern int vm_min_readahead;
  10.122 +extern int vm_max_readahead;
  10.123 +
  10.124 +/*
  10.125 + * mapping from the currently active vm_flags protection bits (the
  10.126 + * low four bits) to a page protection mask..
  10.127 + */
  10.128 +extern pgprot_t protection_map[16];
  10.129 +
  10.130 +
  10.131 +/*
  10.132 + * These are the virtual MM functions - opening of an area, closing and
  10.133 + * unmapping it (needed to keep files on disk up-to-date etc), pointer
  10.134 + * to the functions called when a no-page or a wp-page exception occurs. 
  10.135 + */
  10.136 +struct vm_operations_struct {
  10.137 +	void (*open)(struct vm_area_struct * area);
  10.138 +	void (*close)(struct vm_area_struct * area);
  10.139 +	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
  10.140 +};
  10.141 +
  10.142 +/*
  10.143 + * Each physical page in the system has a struct page associated with
  10.144 + * it to keep track of whatever it is we are using the page for at the
  10.145 + * moment. Note that we have no way to track which tasks are using
  10.146 + * a page.
  10.147 + *
  10.148 + * Try to keep the most commonly accessed fields in single cache lines
  10.149 + * here (16 bytes or greater).  This ordering should be particularly
  10.150 + * beneficial on 32-bit processors.
  10.151 + *
  10.152 + * The first line is data used in page cache lookup, the second line
  10.153 + * is used for linear searches (eg. clock algorithm scans). 
  10.154 + *
  10.155 + * TODO: make this structure smaller, it could be as small as 32 bytes.
  10.156 + */
  10.157 +typedef struct page {
  10.158 +	struct list_head list;		/* ->mapping has some page lists. */
  10.159 +	struct address_space *mapping;	/* The inode (or ...) we belong to. */
  10.160 +	unsigned long index;		/* Our offset within mapping. */
  10.161 +	struct page *next_hash;		/* Next page sharing our hash bucket in
  10.162 +					   the pagecache hash table. */
  10.163 +	atomic_t count;			/* Usage count, see below. */
  10.164 +	unsigned long flags;		/* atomic flags, some possibly
  10.165 +					   updated asynchronously */
  10.166 +	struct list_head lru;		/* Pageout list, eg. active_list;
  10.167 +					   protected by pagemap_lru_lock !! */
  10.168 +	struct page **pprev_hash;	/* Complement to *next_hash. */
  10.169 +	struct buffer_head * buffers;	/* Buffer maps us to a disk block. */
  10.170 +
  10.171 +	/*
  10.172 +	 * On machines where all RAM is mapped into kernel address space,
  10.173 +	 * we can simply calculate the virtual address. On machines with
  10.174 +	 * highmem some memory is mapped into kernel virtual memory
  10.175 +	 * dynamically, so we need a place to store that address.
  10.176 +	 * Note that this field could be 16 bits on x86 ... ;)
  10.177 +	 *
  10.178 +	 * Architectures with slow multiplication can define
  10.179 +	 * WANT_PAGE_VIRTUAL in asm/page.h
  10.180 +	 */
  10.181 +#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
  10.182 +	void *virtual;			/* Kernel virtual address (NULL if
  10.183 +					   not kmapped, ie. highmem) */
  10.184 +#endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */
  10.185 +} mem_map_t;
  10.186 +
  10.187 +/*
  10.188 + * Methods to modify the page usage count.
  10.189 + *
  10.190 + * What counts for a page usage:
  10.191 + * - cache mapping   (page->mapping)
  10.192 + * - disk mapping    (page->buffers)
  10.193 + * - page mapped in a task's page tables, each mapping
  10.194 + *   is counted separately
  10.195 + *
  10.196 + * Also, many kernel routines increase the page count before a critical
  10.197 + * routine so they can be sure the page doesn't go away from under them.
  10.198 + */
  10.199 +#define get_page(p)		atomic_inc(&(p)->count)
  10.200 +#define put_page(p)		__free_page(p)
  10.201 +#define put_page_testzero(p) 	atomic_dec_and_test(&(p)->count)
  10.202 +#define page_count(p)		atomic_read(&(p)->count)
  10.203 +#define set_page_count(p,v) 	atomic_set(&(p)->count, v)
  10.204 +
  10.205 +/*
  10.206 + * Various page->flags bits:
  10.207 + *
  10.208 + * PG_reserved is set for special pages, which can never be swapped
  10.209 + * out. Some of them might not even exist (eg empty_bad_page)...
  10.210 + *
  10.211 + * Multiple processes may "see" the same page. E.g. for untouched
  10.212 + * mappings of /dev/null, all processes see the same page full of
  10.213 + * zeroes, and text pages of executables and shared libraries have
  10.214 + * only one copy in memory, at most, normally.
  10.215 + *
  10.216 + * For the non-reserved pages, page->count denotes a reference count.
  10.217 + *   page->count == 0 means the page is free.
  10.218 + *   page->count == 1 means the page is used for exactly one purpose
  10.219 + *   (e.g. a private data page of one process).
  10.220 + *
  10.221 + * A page may be used for kmalloc() or anyone else who does a
  10.222 + * __get_free_page(). In this case the page->count is at least 1, and
  10.223 + * all other fields are unused but should be 0 or NULL. The
  10.224 + * management of this page is the responsibility of the one who uses
  10.225 + * it.
  10.226 + *
  10.227 + * The other pages (we may call them "process pages") are completely
  10.228 + * managed by the Linux memory manager: I/O, buffers, swapping etc.
  10.229 + * The following discussion applies only to them.
  10.230 + *
  10.231 + * A page may belong to an inode's memory mapping. In this case,
  10.232 + * page->mapping is the pointer to the inode, and page->index is the
  10.233 + * file offset of the page, in units of PAGE_CACHE_SIZE.
  10.234 + *
  10.235 + * A page may have buffers allocated to it. In this case,
  10.236 + * page->buffers is a circular list of these buffer heads. Else,
  10.237 + * page->buffers == NULL.
  10.238 + *
  10.239 + * For pages belonging to inodes, the page->count is the number of
  10.240 + * attaches, plus 1 if buffers are allocated to the page, plus one
  10.241 + * for the page cache itself.
  10.242 + *
  10.243 + * All pages belonging to an inode are in these doubly linked lists:
  10.244 + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages;
  10.245 + * using the page->list list_head. These fields are also used for
  10.246 + * freelist managemet (when page->count==0).
  10.247 + *
  10.248 + * There is also a hash table mapping (mapping,index) to the page
  10.249 + * in memory if present. The lists for this hash table use the fields
  10.250 + * page->next_hash and page->pprev_hash.
  10.251 + *
  10.252 + * All process pages can do I/O:
  10.253 + * - inode pages may need to be read from disk,
  10.254 + * - inode pages which have been modified and are MAP_SHARED may need
  10.255 + *   to be written to disk,
  10.256 + * - private pages which have been modified may need to be swapped out
  10.257 + *   to swap space and (later) to be read back into memory.
  10.258 + * During disk I/O, PG_locked is used. This bit is set before I/O
  10.259 + * and reset when I/O completes. page_waitqueue(page) is a wait queue of all
  10.260 + * tasks waiting for the I/O on this page to complete.
  10.261 + * PG_uptodate tells whether the page's contents is valid.
  10.262 + * When a read completes, the page becomes uptodate, unless a disk I/O
  10.263 + * error happened.
  10.264 + *
  10.265 + * For choosing which pages to swap out, inode pages carry a
  10.266 + * PG_referenced bit, which is set any time the system accesses
  10.267 + * that page through the (mapping,index) hash table. This referenced
  10.268 + * bit, together with the referenced bit in the page tables, is used
  10.269 + * to manipulate page->age and move the page across the active,
  10.270 + * inactive_dirty and inactive_clean lists.
  10.271 + *
  10.272 + * Note that the referenced bit, the page->lru list_head and the
  10.273 + * active, inactive_dirty and inactive_clean lists are protected by
  10.274 + * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
  10.275 + *
  10.276 + * PG_skip is used on sparc/sparc64 architectures to "skip" certain
  10.277 + * parts of the address space.
  10.278 + *
  10.279 + * PG_error is set to indicate that an I/O error occurred on this page.
  10.280 + *
  10.281 + * PG_arch_1 is an architecture specific page state bit.  The generic
  10.282 + * code guarantees that this bit is cleared for a page when it first
  10.283 + * is entered into the page cache.
  10.284 + *
  10.285 + * PG_highmem pages are not permanently mapped into the kernel virtual
  10.286 + * address space, they need to be kmapped separately for doing IO on
  10.287 + * the pages. The struct page (these bits with information) are always
  10.288 + * mapped into kernel address space...
  10.289 + */
  10.290 +#define PG_locked		 0	/* Page is locked. Don't touch. */
  10.291 +#define PG_error		 1
  10.292 +#define PG_referenced		 2
  10.293 +#define PG_uptodate		 3
  10.294 +#define PG_dirty		 4
  10.295 +#define PG_unused		 5
  10.296 +#define PG_lru			 6
  10.297 +#define PG_active		 7
  10.298 +#define PG_slab			 8
  10.299 +#define PG_skip			10
  10.300 +#define PG_highmem		11
  10.301 +#define PG_checked		12	/* kill me in 2.5.<early>. */
  10.302 +#define PG_arch_1		13
  10.303 +#define PG_reserved		14
  10.304 +#define PG_launder		15	/* written out by VM pressure.. */
  10.305 +#define PG_fs_1			16	/* Filesystem specific */
  10.306 +#define PG_foreign		21	/* Page belongs to foreign allocator */
  10.307 +
  10.308 +#ifndef arch_set_page_uptodate
  10.309 +#define arch_set_page_uptodate(page)
  10.310 +#endif
  10.311 +
  10.312 +/* Make it prettier to test the above... */
  10.313 +#define UnlockPage(page)	unlock_page(page)
  10.314 +#define Page_Uptodate(page)	test_bit(PG_uptodate, &(page)->flags)
  10.315 +#define SetPageUptodate(page) \
  10.316 +	do {								\
  10.317 +		arch_set_page_uptodate(page);				\
  10.318 +		set_bit(PG_uptodate, &(page)->flags);			\
  10.319 +	} while (0)
  10.320 +#define ClearPageUptodate(page)	clear_bit(PG_uptodate, &(page)->flags)
  10.321 +#define PageDirty(page)		test_bit(PG_dirty, &(page)->flags)
  10.322 +#define SetPageDirty(page)	set_bit(PG_dirty, &(page)->flags)
  10.323 +#define ClearPageDirty(page)	clear_bit(PG_dirty, &(page)->flags)
  10.324 +#define PageLocked(page)	test_bit(PG_locked, &(page)->flags)
  10.325 +#define LockPage(page)		set_bit(PG_locked, &(page)->flags)
  10.326 +#define TryLockPage(page)	test_and_set_bit(PG_locked, &(page)->flags)
  10.327 +#define PageChecked(page)	test_bit(PG_checked, &(page)->flags)
  10.328 +#define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
  10.329 +#define ClearPageChecked(page)	clear_bit(PG_checked, &(page)->flags)
  10.330 +#define PageLaunder(page)	test_bit(PG_launder, &(page)->flags)
  10.331 +#define SetPageLaunder(page)	set_bit(PG_launder, &(page)->flags)
  10.332 +#define ClearPageLaunder(page)	clear_bit(PG_launder, &(page)->flags)
  10.333 +#define ClearPageArch1(page)	clear_bit(PG_arch_1, &(page)->flags)
  10.334 +
  10.335 +/* A foreign page uses a custom destructor rather than the buddy allocator. */
  10.336 +#ifdef CONFIG_FOREIGN_PAGES
  10.337 +#define PageForeign(page)	test_bit(PG_foreign, &(page)->flags)
  10.338 +#define SetPageForeign(page)	set_bit(PG_foreign, &(page)->flags)
  10.339 +#define ClearPageForeign(page)	clear_bit(PG_foreign, &(page)->flags)
  10.340 +#define PageForeignDestructor(page)	\
  10.341 +	( (void (*) (struct page *)) (page)->mapping )
  10.342 +#else
  10.343 +#define PageForeign(page)	0
  10.344 +#define PageForeignDestructor(page)	void
  10.345 +#endif
  10.346 +
  10.347 +/*
  10.348 + * The zone field is never updated after free_area_init_core()
  10.349 + * sets it, so none of the operations on it need to be atomic.
  10.350 + */
  10.351 +#define NODE_SHIFT 4
  10.352 +#define ZONE_SHIFT (BITS_PER_LONG - 8)
  10.353 +
  10.354 +struct zone_struct;
  10.355 +extern struct zone_struct *zone_table[];
  10.356 +
  10.357 +static inline zone_t *page_zone(struct page *page)
  10.358 +{
  10.359 +	return zone_table[page->flags >> ZONE_SHIFT];
  10.360 +}
  10.361 +
  10.362 +static inline void set_page_zone(struct page *page, unsigned long zone_num)
  10.363 +{
  10.364 +	page->flags &= ~(~0UL << ZONE_SHIFT);
  10.365 +	page->flags |= zone_num << ZONE_SHIFT;
  10.366 +}
  10.367 +
  10.368 +/*
  10.369 + * In order to avoid #ifdefs within C code itself, we define
  10.370 + * set_page_address to a noop for non-highmem machines, where
  10.371 + * the field isn't useful.
  10.372 + * The same is true for page_address() in arch-dependent code.
  10.373 + */
  10.374 +#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
  10.375 +
  10.376 +#define set_page_address(page, address)			\
  10.377 +	do {						\
  10.378 +		(page)->virtual = (address);		\
  10.379 +	} while(0)
  10.380 +
  10.381 +#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
  10.382 +#define set_page_address(page, address)  do { } while(0)
  10.383 +#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
  10.384 +
  10.385 +/*
  10.386 + * Permanent address of a page. Obviously must never be
  10.387 + * called on a highmem page.
  10.388 + */
  10.389 +#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
  10.390 +
  10.391 +#define page_address(page) ((page)->virtual)
  10.392 +
  10.393 +#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
  10.394 +
  10.395 +#define page_address(page)						\
  10.396 +	__va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT)	\
  10.397 +			+ page_zone(page)->zone_start_paddr)
  10.398 +
  10.399 +#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
  10.400 +
  10.401 +extern void FASTCALL(set_page_dirty(struct page *));
  10.402 +
  10.403 +/*
  10.404 + * The first mb is necessary to safely close the critical section opened by the
  10.405 + * TryLockPage(), the second mb is necessary to enforce ordering between
  10.406 + * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  10.407 + * parallel wait_on_page).
  10.408 + */
  10.409 +#define PageError(page)		test_bit(PG_error, &(page)->flags)
  10.410 +#define SetPageError(page)	set_bit(PG_error, &(page)->flags)
  10.411 +#define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
  10.412 +#define PageReferenced(page)	test_bit(PG_referenced, &(page)->flags)
  10.413 +#define SetPageReferenced(page)	set_bit(PG_referenced, &(page)->flags)
  10.414 +#define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
  10.415 +#define PageTestandClearReferenced(page)	test_and_clear_bit(PG_referenced, &(page)->flags)
  10.416 +#define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
  10.417 +#define PageSetSlab(page)	set_bit(PG_slab, &(page)->flags)
  10.418 +#define PageClearSlab(page)	clear_bit(PG_slab, &(page)->flags)
  10.419 +#define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
  10.420 +
  10.421 +#define PageActive(page)	test_bit(PG_active, &(page)->flags)
  10.422 +#define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
  10.423 +#define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
  10.424 +
  10.425 +#define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
  10.426 +#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
  10.427 +#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
  10.428 +
  10.429 +#ifdef CONFIG_HIGHMEM
  10.430 +#define PageHighMem(page)		test_bit(PG_highmem, &(page)->flags)
  10.431 +#else
  10.432 +#define PageHighMem(page)		0 /* needed to optimize away at compile time */
  10.433 +#endif
  10.434 +
  10.435 +#define SetPageReserved(page)		set_bit(PG_reserved, &(page)->flags)
  10.436 +#define ClearPageReserved(page)		clear_bit(PG_reserved, &(page)->flags)
  10.437 +
  10.438 +/*
  10.439 + * Error return values for the *_nopage functions
  10.440 + */
  10.441 +#define NOPAGE_SIGBUS	(NULL)
  10.442 +#define NOPAGE_OOM	((struct page *) (-1))
  10.443 +
  10.444 +/* The array of struct pages */
  10.445 +extern mem_map_t * mem_map;
  10.446 +
  10.447 +/*
  10.448 + * There is only one page-allocator function, and two main namespaces to
  10.449 + * it. The alloc_page*() variants return 'struct page *' and as such
  10.450 + * can allocate highmem pages, the *get*page*() variants return
  10.451 + * virtual kernel addresses to the allocated page(s).
  10.452 + */
  10.453 +extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
  10.454 +extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist));
  10.455 +extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
  10.456 +
  10.457 +static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
  10.458 +{
  10.459 +	/*
  10.460 +	 * Gets optimized away by the compiler.
  10.461 +	 */
  10.462 +	if (order >= MAX_ORDER)
  10.463 +		return NULL;
  10.464 +	return _alloc_pages(gfp_mask, order);
  10.465 +}
  10.466 +
  10.467 +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
  10.468 +
  10.469 +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order));
  10.470 +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
  10.471 +
  10.472 +#define __get_free_page(gfp_mask) \
  10.473 +		__get_free_pages((gfp_mask),0)
  10.474 +
  10.475 +#define __get_dma_pages(gfp_mask, order) \
  10.476 +		__get_free_pages((gfp_mask) | GFP_DMA,(order))
  10.477 +
  10.478 +/*
  10.479 + * The old interface name will be removed in 2.5:
  10.480 + */
  10.481 +#define get_free_page get_zeroed_page
  10.482 +
  10.483 +/*
  10.484 + * There is only one 'core' page-freeing function.
  10.485 + */
  10.486 +extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
  10.487 +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
  10.488 +
  10.489 +#define __free_page(page) __free_pages((page), 0)
  10.490 +#define free_page(addr) free_pages((addr),0)
  10.491 +
  10.492 +extern void show_free_areas(void);
  10.493 +extern void show_free_areas_node(pg_data_t *pgdat);
  10.494 +
  10.495 +extern void clear_page_tables(struct mm_struct *, unsigned long, int);
  10.496 +
  10.497 +extern int fail_writepage(struct page *);
  10.498 +struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused);
  10.499 +struct file *shmem_file_setup(char * name, loff_t size);
  10.500 +extern void shmem_lock(struct file * file, int lock);
  10.501 +extern int shmem_zero_setup(struct vm_area_struct *);
  10.502 +
  10.503 +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
  10.504 +extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
  10.505 +extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
  10.506 +extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
  10.507 +
  10.508 +extern int vmtruncate(struct inode * inode, loff_t offset);
  10.509 +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
  10.510 +extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
  10.511 +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
  10.512 +extern int make_pages_present(unsigned long addr, unsigned long end);
  10.513 +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
  10.514 +extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len);
  10.515 +extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len);
  10.516 +extern int ptrace_attach(struct task_struct *tsk);
  10.517 +extern int ptrace_detach(struct task_struct *, unsigned int);
  10.518 +extern void ptrace_disable(struct task_struct *);
  10.519 +extern int ptrace_check_attach(struct task_struct *task, int kill);
  10.520 +
  10.521 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
  10.522 +		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
  10.523 +
  10.524 +/*
  10.525 + * On a two-level page table, this ends up being trivial. Thus the
  10.526 + * inlining and the symmetry break with pte_alloc() that does all
  10.527 + * of this out-of-line.
  10.528 + */
  10.529 +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  10.530 +{
  10.531 +	if (pgd_none(*pgd))
  10.532 +		return __pmd_alloc(mm, pgd, address);
  10.533 +	return pmd_offset(pgd, address);
  10.534 +}
  10.535 +
  10.536 +extern int pgt_cache_water[2];
  10.537 +extern int check_pgt_cache(void);
  10.538 +
  10.539 +extern void free_area_init(unsigned long * zones_size);
  10.540 +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
  10.541 +	unsigned long * zones_size, unsigned long zone_start_paddr, 
  10.542 +	unsigned long *zholes_size);
  10.543 +extern void mem_init(void);
  10.544 +extern void show_mem(void);
  10.545 +extern void si_meminfo(struct sysinfo * val);
  10.546 +extern void swapin_readahead(swp_entry_t);
  10.547 +
  10.548 +extern struct address_space swapper_space;
  10.549 +#define PageSwapCache(page) ((page)->mapping == &swapper_space)
  10.550 +
  10.551 +static inline int is_page_cache_freeable(struct page * page)
  10.552 +{
  10.553 +	return page_count(page) - !!page->buffers == 1;
  10.554 +}
  10.555 +
  10.556 +extern int FASTCALL(can_share_swap_page(struct page *));
  10.557 +extern int FASTCALL(remove_exclusive_swap_page(struct page *));
  10.558 +
  10.559 +extern void __free_pte(pte_t);
  10.560 +
  10.561 +/* mmap.c */
  10.562 +extern void lock_vma_mappings(struct vm_area_struct *);
  10.563 +extern void unlock_vma_mappings(struct vm_area_struct *);
  10.564 +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
  10.565 +extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
  10.566 +extern void build_mmap_rb(struct mm_struct *);
  10.567 +extern void exit_mmap(struct mm_struct *);
  10.568 +
  10.569 +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
  10.570 +
  10.571 +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
  10.572 +	unsigned long len, unsigned long prot,
  10.573 +	unsigned long flag, unsigned long pgoff);
  10.574 +
  10.575 +static inline unsigned long do_mmap(struct file *file, unsigned long addr,
  10.576 +	unsigned long len, unsigned long prot,
  10.577 +	unsigned long flag, unsigned long offset)
  10.578 +{
  10.579 +	unsigned long ret = -EINVAL;
  10.580 +	if ((offset + PAGE_ALIGN(len)) < offset)
  10.581 +		goto out;
  10.582 +	if (!(offset & ~PAGE_MASK))
  10.583 +		ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
  10.584 +out:
  10.585 +	return ret;
  10.586 +}
  10.587 +
  10.588 +extern int do_munmap(struct mm_struct *, unsigned long, size_t);
  10.589 +
  10.590 +extern unsigned long do_brk(unsigned long, unsigned long);
  10.591 +
  10.592 +static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev)
  10.593 +{
  10.594 +	prev->vm_next = vma->vm_next;
  10.595 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  10.596 +	if (mm->mmap_cache == vma)
  10.597 +		mm->mmap_cache = prev;
  10.598 +}
  10.599 +
  10.600 +static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags)
  10.601 +{
  10.602 +	if (!vma->vm_file && vma->vm_flags == vm_flags)
  10.603 +		return 1;
  10.604 +	else
  10.605 +		return 0;
  10.606 +}
  10.607 +
  10.608 +struct zone_t;
  10.609 +/* filemap.c */
  10.610 +extern void remove_inode_page(struct page *);
  10.611 +extern unsigned long page_unuse(struct page *);
  10.612 +extern void truncate_inode_pages(struct address_space *, loff_t);
  10.613 +
  10.614 +/* generic vm_area_ops exported for stackable file systems */
  10.615 +extern int filemap_sync(struct vm_area_struct *, unsigned long,	size_t, unsigned int);
  10.616 +extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
  10.617 +
  10.618 +/*
  10.619 + * GFP bitmasks..
  10.620 + */
  10.621 +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */
  10.622 +#define __GFP_DMA	0x01
  10.623 +#define __GFP_HIGHMEM	0x02
  10.624 +
  10.625 +/* Action modifiers - doesn't change the zoning */
  10.626 +#define __GFP_WAIT	0x10	/* Can wait and reschedule? */
  10.627 +#define __GFP_HIGH	0x20	/* Should access emergency pools? */
  10.628 +#define __GFP_IO	0x40	/* Can start low memory physical IO? */
  10.629 +#define __GFP_HIGHIO	0x80	/* Can start high mem physical IO? */
  10.630 +#define __GFP_FS	0x100	/* Can call down to low-level FS? */
  10.631 +
  10.632 +#define GFP_NOHIGHIO	(__GFP_HIGH | __GFP_WAIT | __GFP_IO)
  10.633 +#define GFP_NOIO	(__GFP_HIGH | __GFP_WAIT)
  10.634 +#define GFP_NOFS	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
  10.635 +#define GFP_ATOMIC	(__GFP_HIGH)
  10.636 +#define GFP_USER	(             __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
  10.637 +#define GFP_HIGHUSER	(             __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM)
  10.638 +#define GFP_KERNEL	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
  10.639 +#define GFP_NFS		(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
  10.640 +#define GFP_KSWAPD	(             __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
  10.641 +
  10.642 +/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
  10.643 +   platforms, used as appropriate on others */
  10.644 +
  10.645 +#define GFP_DMA		__GFP_DMA
  10.646 +
  10.647 +static inline unsigned int pf_gfp_mask(unsigned int gfp_mask)
  10.648 +{
  10.649 +	/* avoid all memory balancing I/O methods if this task cannot block on I/O */
  10.650 +	if (current->flags & PF_NOIO)
  10.651 +		gfp_mask &= ~(__GFP_IO | __GFP_HIGHIO | __GFP_FS);
  10.652 +
  10.653 +	return gfp_mask;
  10.654 +}
  10.655 +	
  10.656 +/* vma is the first one with  address < vma->vm_end,
  10.657 + * and even  address < vma->vm_start. Have to extend vma. */
  10.658 +static inline int expand_stack(struct vm_area_struct * vma, unsigned long address)
  10.659 +{
  10.660 +	unsigned long grow;
  10.661 +
  10.662 +	/*
  10.663 +	 * vma->vm_start/vm_end cannot change under us because the caller is required
  10.664 +	 * to hold the mmap_sem in write mode. We need to get the spinlock only
  10.665 +	 * before relocating the vma range ourself.
  10.666 +	 */
  10.667 +	address &= PAGE_MASK;
  10.668 + 	spin_lock(&vma->vm_mm->page_table_lock);
  10.669 +	grow = (vma->vm_start - address) >> PAGE_SHIFT;
  10.670 +	if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
  10.671 +	    ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) {
  10.672 +		spin_unlock(&vma->vm_mm->page_table_lock);
  10.673 +		return -ENOMEM;
  10.674 +	}
  10.675 +	vma->vm_start = address;
  10.676 +	vma->vm_pgoff -= grow;
  10.677 +	vma->vm_mm->total_vm += grow;
  10.678 +	if (vma->vm_flags & VM_LOCKED)
  10.679 +		vma->vm_mm->locked_vm += grow;
  10.680 +	spin_unlock(&vma->vm_mm->page_table_lock);
  10.681 +	return 0;
  10.682 +}
  10.683 +
  10.684 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
  10.685 +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
  10.686 +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
  10.687 +					     struct vm_area_struct **pprev);
  10.688 +
  10.689 +/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
  10.690 +   NULL if none.  Assume start_addr < end_addr. */
  10.691 +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
  10.692 +{
  10.693 +	struct vm_area_struct * vma = find_vma(mm,start_addr);
  10.694 +
  10.695 +	if (vma && end_addr <= vma->vm_start)
  10.696 +		vma = NULL;
  10.697 +	return vma;
  10.698 +}
  10.699 +
  10.700 +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
  10.701 +
  10.702 +extern struct page * vmalloc_to_page(void *addr);
  10.703 +
  10.704 +#endif /* __KERNEL__ */
  10.705 +
  10.706 +#endif
    11.1 --- a/linux-2.4.26-xen-sparse/mm/page_alloc.c	Fri Aug 20 09:11:43 2004 +0000
    11.2 +++ b/linux-2.4.26-xen-sparse/mm/page_alloc.c	Fri Aug 20 09:21:37 2004 +0000
    11.3 @@ -89,6 +89,9 @@ static void __free_pages_ok (struct page
    11.4  	struct page *base;
    11.5  	zone_t *zone;
    11.6  
    11.7 +	if (PageForeign(page))
    11.8 +		return (PageForeignDestructor(page))(page);
    11.9 +
   11.10  	/*
   11.11  	 * Yes, think what happens when other parts of the kernel take 
   11.12  	 * a reference to a page in order to pin it for io. -ben
   11.13 @@ -102,7 +105,7 @@ static void __free_pages_ok (struct page
   11.14  	if (page->buffers)
   11.15  		BUG();
   11.16  	if (page->mapping)
   11.17 -		return (*(void(*)(struct page *))page->mapping)(page);
   11.18 +		BUG();
   11.19  	if (!VALID_PAGE(page))
   11.20  		BUG();
   11.21  	if (PageLocked(page))
    12.1 --- a/linux-2.6.7-xen-sparse/arch/xen/Kconfig	Fri Aug 20 09:11:43 2004 +0000
    12.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/Kconfig	Fri Aug 20 09:21:37 2004 +0000
    12.3 @@ -44,11 +44,15 @@ config XEN_WRITABLE_PAGETABLES
    12.4  
    12.5  endmenu
    12.6  
    12.7 -# Xen's block device backend driver needs 2^12 pages
    12.8 -config FORCE_MAX_ZONEORDER
    12.9 -        int
   12.10 -        default "12" if XEN_PHYSDEV_ACCESS
   12.11 -        default "11" if !XEN_PHYSDEV_ACCESS
   12.12 +config FOREIGN_PAGES
   12.13 +	bool
   12.14 +	default y if XEN_PHYSDEV_ACCESS
   12.15 +	default n if !XEN_PHYSDEV_ACCESS
   12.16 +
   12.17 +config PAGESIZED_SKBS
   12.18 +	bool
   12.19 +	default y if XEN_PHYSDEV_ACCESS
   12.20 +	default n if !XEN_PHYSDEV_ACCESS
   12.21  
   12.22  #config VT
   12.23  #	bool
    13.1 --- a/linux-2.6.7-xen-sparse/arch/xen/configs/xen0_defconfig	Fri Aug 20 09:11:43 2004 +0000
    13.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/configs/xen0_defconfig	Fri Aug 20 09:21:37 2004 +0000
    13.3 @@ -10,7 +10,8 @@ CONFIG_NO_IDLE_HZ=y
    13.4  #
    13.5  CONFIG_XEN_PRIVILEGED_GUEST=y
    13.6  CONFIG_XEN_PHYSDEV_ACCESS=y
    13.7 -CONFIG_FORCE_MAX_ZONEORDER=12
    13.8 +CONFIG_FOREIGN_PAGES=y
    13.9 +CONFIG_PAGESIZED_SKBS=y
   13.10  CONFIG_X86=y
   13.11  # CONFIG_X86_64 is not set
   13.12  
    14.1 --- a/linux-2.6.7-xen-sparse/arch/xen/configs/xenU_defconfig	Fri Aug 20 09:11:43 2004 +0000
    14.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/configs/xenU_defconfig	Fri Aug 20 09:21:37 2004 +0000
    14.3 @@ -10,7 +10,8 @@ CONFIG_NO_IDLE_HZ=y
    14.4  #
    14.5  # CONFIG_XEN_PRIVILEGED_GUEST is not set
    14.6  # CONFIG_XEN_PHYSDEV_ACCESS is not set
    14.7 -CONFIG_FORCE_MAX_ZONEORDER=11
    14.8 +# CONFIG_FOREIGN_PAGES is not set
    14.9 +# CONFIG_PAGESIZED_SKBS is not set
   14.10  CONFIG_X86=y
   14.11  # CONFIG_X86_64 is not set
   14.12  
    15.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/pci-dma.c	Fri Aug 20 09:11:43 2004 +0000
    15.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/pci-dma.c	Fri Aug 20 09:21:37 2004 +0000
    15.3 @@ -61,6 +61,8 @@ void *dma_alloc_coherent(struct device *
    15.4  			pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
    15.5  			pfn = pte->pte_low >> PAGE_SHIFT;
    15.6  			queue_l1_entry_update(pte, 0);
    15.7 +			phys_to_machine_mapping[(__pa(ret)>>PAGE_SHIFT)+i] =
    15.8 +				INVALID_P2M_ENTRY;
    15.9  			flush_page_update_queue();
   15.10  			if (HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, 
   15.11  						  &pfn, 1, 0) != 1) BUG();
   15.12 @@ -79,7 +81,6 @@ void *dma_alloc_coherent(struct device *
   15.13  				pfn+i, (__pa(ret)>>PAGE_SHIFT)+i);
   15.14  			phys_to_machine_mapping[(__pa(ret)>>PAGE_SHIFT)+i] =
   15.15  				pfn+i;
   15.16 -                        flush_page_update_queue();
   15.17  		}
   15.18  		flush_page_update_queue();
   15.19  	}
    16.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Aug 20 09:11:43 2004 +0000
    16.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Aug 20 09:21:37 2004 +0000
    16.3 @@ -299,7 +299,7 @@ unsigned long allocate_empty_lowmem_regi
    16.4          pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
    16.5          pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
    16.6          queue_l1_entry_update(pte, 0);
    16.7 -        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
    16.8 +        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = INVALID_P2M_ENTRY;
    16.9      }
   16.10  
   16.11      flush_page_update_queue();
    17.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/ioremap.c	Fri Aug 20 09:11:43 2004 +0000
    17.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/ioremap.c	Fri Aug 20 09:21:37 2004 +0000
    17.3 @@ -415,17 +415,10 @@ int direct_remap_area_pages(struct mm_st
    17.4  #define MAX_DIRECTMAP_MMU_QUEUE 130
    17.5      mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
    17.6  
    17.7 -    if ( domid != 0 )
    17.8 -    {
    17.9 -        u[0].ptr  = MMU_EXTENDED_COMMAND;
   17.10 -        u[0].val  = MMUEXT_SET_FOREIGNDOM;
   17.11 -        u[0].val |= (unsigned long)domid << 16;
   17.12 -        v = w = &u[1];
   17.13 -    }
   17.14 -    else
   17.15 -    {
   17.16 -        v = w = &u[0];
   17.17 -    }
   17.18 +    u[0].ptr  = MMU_EXTENDED_COMMAND;
   17.19 +    u[0].val  = MMUEXT_SET_FOREIGNDOM;
   17.20 +    u[0].val |= (unsigned long)domid << 16;
   17.21 +    v = w = &u[1];
   17.22  
   17.23      start_address = address;
   17.24  
    18.1 --- a/linux-2.6.7-xen-sparse/drivers/char/mem.c	Fri Aug 20 09:11:43 2004 +0000
    18.2 +++ b/linux-2.6.7-xen-sparse/drivers/char/mem.c	Fri Aug 20 09:21:37 2004 +0000
    18.3 @@ -247,6 +247,9 @@ static int mmap_mem(struct file * file, 
    18.4  	if (!(start_info.flags & SIF_PRIVILEGED))
    18.5  		return -ENXIO;
    18.6  
    18.7 +	if (file->private_data == NULL)
    18.8 +		file->private_data = (void *)(unsigned long)DOMID_IO;
    18.9 +
   18.10  	/* DONTCOPY is essential for Xen as copy_page_range is broken. */
   18.11  	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
   18.12  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    19.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Fri Aug 20 09:11:43 2004 +0000
    19.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Fri Aug 20 09:21:37 2004 +0000
    19.3 @@ -24,22 +24,15 @@
    19.4  #define MAX_PENDING_REQS 64
    19.5  #define BATCH_PER_DOMAIN 16
    19.6  
    19.7 -/*
    19.8 - * NB. We place a page of padding between each buffer page to avoid incorrect
    19.9 - * merging of requests by the IDE and SCSI merging routines. Otherwise, two
   19.10 - * adjacent buffers in a scatter-gather request would have adjacent page
   19.11 - * numbers: since the merge routines don't realise that this is in *pseudophys*
   19.12 - * space, not real space, they may collapse the s-g elements!
   19.13 - */
   19.14  static unsigned long mmap_vstart;
   19.15  #define MMAP_PAGES_PER_REQUEST \
   19.16 -    (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
   19.17 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
   19.18  #define MMAP_PAGES             \
   19.19      (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
   19.20  #define MMAP_VADDR(_req,_seg)                        \
   19.21      (mmap_vstart +                                   \
   19.22       ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
   19.23 -     ((_seg) * 2 * PAGE_SIZE))
   19.24 +     ((_seg) * PAGE_SIZE))
   19.25  
   19.26  /*
   19.27   * Each outstanding request that we've passed to the lower device layers has a 
   19.28 @@ -415,7 +408,7 @@ static void dispatch_rw_block_io(blkif_t
   19.29          mcl[i].args[3] = blkif->domid;
   19.30  
   19.31          phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
   19.32 -            phys_seg[i].buffer >> PAGE_SHIFT;
   19.33 +            FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT);
   19.34      }
   19.35  
   19.36      if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) )
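
Note: two things change in blkback here. The page of padding between mapped buffer pages is dropped (the pseudophys merging hazard it guarded against is now handled by the BIOVEC_PHYS_MERGEABLE override in io.h, below), and frames mapped in from the frontend are tagged with FOREIGN_FRAME() in the p2m table. A hedged sketch of the tagging step; the wrapper function is illustrative:

    /* Sketch: record a frontend-owned machine frame at a local virtual
     * address.  FOREIGN_FRAME() sets the top bit so that pte_pfn() and
     * pte_page() refuse to treat the mapping as one of our own pages. */
    static void map_foreign_buffer_sketch(unsigned long vaddr,
                                          unsigned long machine_addr)
    {
        phys_to_machine_mapping[__pa(vaddr) >> PAGE_SHIFT] =
            FOREIGN_FRAME(machine_addr >> PAGE_SHIFT);
    }
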
    20.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Fri Aug 20 09:11:43 2004 +0000
    20.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Fri Aug 20 09:21:37 2004 +0000
    20.3 @@ -1,5 +1,5 @@
    20.4  /******************************************************************************
    20.5 - * block.c
    20.6 + * blkfront.c
    20.7   * 
    20.8   * XenLinux virtual block-device driver.
    20.9   * 
   20.10 @@ -67,11 +67,12 @@ static inline int GET_ID_FROM_FREELIST( 
   20.11  {
   20.12      unsigned long free = rec_ring_free;
   20.13  
   20.14 -    if(free>BLKIF_RING_SIZE) BUG();
   20.15 +    if ( free > BLKIF_RING_SIZE )
   20.16 +        BUG();
   20.17  
   20.18      rec_ring_free = rec_ring[free].id;
   20.19  
   20.20 -    rec_ring[free].id = 0x0fffffee; // debug
   20.21 +    rec_ring[free].id = 0x0fffffee; /* debug */
   20.22  
   20.23      return free;
   20.24  }
   20.25 @@ -253,8 +254,6 @@ static int blkif_queue_request(struct re
   20.26      id = GET_ID_FROM_FREELIST();
   20.27      rec_ring[id].id = (unsigned long) req;
   20.28  
   20.29 -//printk(KERN_ALERT"r: %d req %p (%ld)\n",req_prod,req,id);
   20.30 -
   20.31      ring_req->id = id;
   20.32      ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
   20.33          BLKIF_OP_READ;
   20.34 @@ -300,8 +299,6 @@ void do_blkif_request(request_queue_t *r
   20.35  
   20.36      DPRINTK("Entered do_blkif_request\n"); 
   20.37  
   20.38 -//printk(KERN_ALERT"r: %d req\n",req_prod);
   20.39 -
   20.40      queued = 0;
   20.41  
   20.42      while ((req = elv_next_request(rq)) != NULL) {
   20.43 @@ -310,7 +307,8 @@ void do_blkif_request(request_queue_t *r
   20.44              continue;
   20.45          }
   20.46  
   20.47 -        if (BLKIF_RING_FULL) {
   20.48 +        if ( BLKIF_RING_FULL )
   20.49 +        {
   20.50              blk_stop_queue(rq);
   20.51              break;
   20.52          }
   20.53 @@ -358,11 +356,9 @@ static irqreturn_t blkif_int(int irq, vo
   20.54  	id = bret->id;
   20.55  	req = (struct request *)rec_ring[id].id;
   20.56  
   20.57 -//printk(KERN_ALERT"i: %d req %p (%ld)\n",i,req,id);
   20.58 -
   20.59  	blkif_completion( &rec_ring[id] );
   20.60  
   20.61 -	ADD_ID_TO_FREELIST(id);  // overwrites req
   20.62 +	ADD_ID_TO_FREELIST(id); /* overwrites req */
   20.63  
   20.64          switch ( bret->operation )
   20.65          {
   20.66 @@ -772,8 +768,6 @@ static int blkif_queue_request(unsigned 
   20.67      req->nr_segments   = 1;
   20.68      req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
   20.69  
   20.70 -//printk("N: %d req %p (%ld)\n",req_prod,rec_ring[xid].id,xid);
   20.71 -
   20.72      req_prod++;
   20.73  
   20.74      /* Keep a private copy so we can reissue requests when recovering. */    
   20.75 @@ -892,8 +886,6 @@ static void blkif_int(int irq, void *dev
   20.76  	id = bret->id;
   20.77  	bh = (struct buffer_head *)rec_ring[id].id; 
   20.78  
   20.79 -//printk("i: %d req %p (%ld)\n",i,bh,id);
   20.80 -
   20.81  	blkif_completion( &rec_ring[id] );
   20.82  
   20.83  	ADD_ID_TO_FREELIST(id);
   20.84 @@ -942,16 +934,11 @@ static inline void translate_req_to_pfn(
   20.85      xreq->operation     = req->operation;
   20.86      xreq->nr_segments   = req->nr_segments;
   20.87      xreq->device        = req->device;
   20.88 -    // preserve id
   20.89 +    /* preserve id */
   20.90      xreq->sector_number = req->sector_number;
   20.91  
   20.92      for ( i = 0; i < req->nr_segments; i++ )
   20.93 -    {
   20.94 -        xreq->frame_and_sects[i] = (req->frame_and_sects[i] & ~PAGE_MASK) |
   20.95 -            (machine_to_phys_mapping[req->frame_and_sects[i] >> PAGE_SHIFT] <<
   20.96 -             PAGE_SHIFT);
   20.97 -    }
   20.98 -    
   20.99 +        xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]);
  20.100  }
  20.101  
  20.102  static inline void translate_req_to_mfn(blkif_request_t *xreq,
  20.103 @@ -962,15 +949,11 @@ static inline void translate_req_to_mfn(
  20.104      xreq->operation     = req->operation;
  20.105      xreq->nr_segments   = req->nr_segments;
  20.106      xreq->device        = req->device;
  20.107 -    xreq->id            = req->id;   // copy id (unlike above)
  20.108 +    xreq->id            = req->id;   /* copy id (unlike above) */
  20.109      xreq->sector_number = req->sector_number;
  20.110  
  20.111      for ( i = 0; i < req->nr_segments; i++ )
  20.112 -    {
  20.113 -        xreq->frame_and_sects[i] = (req->frame_and_sects[i] & ~PAGE_MASK) |
  20.114 -            (phys_to_machine_mapping[req->frame_and_sects[i] >> PAGE_SHIFT] << 
  20.115 -             PAGE_SHIFT);
  20.116 -    }
  20.117 +        xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]);
  20.118  }
  20.119  
  20.120  
  20.121 @@ -978,7 +961,6 @@ static inline void translate_req_to_mfn(
  20.122  static inline void flush_requests(void)
  20.123  {
  20.124      DISABLE_SCATTERGATHER();
  20.125 -//printk(KERN_ALERT"flush %d\n",req_prod);
  20.126      wmb(); /* Ensure that the frontend can see the requests. */
  20.127      blk_ring->req_prod = req_prod;
  20.128      notify_via_evtchn(blkif_evtchn);
  20.129 @@ -1010,8 +992,6 @@ void blkif_control_send(blkif_request_t 
  20.130      blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req.id = id;
  20.131      rec_ring[id].id = (unsigned long) req;
  20.132  
  20.133 -//printk("c: %d req %p (%ld)\n",req_prod,req,id);
  20.134 -
  20.135      translate_req_to_pfn( &rec_ring[id], req );
  20.136  
  20.137      req_prod++;
  20.138 @@ -1094,13 +1074,13 @@ static void blkif_status_change(blkif_fe
  20.139                     " in state %d\n", blkif_state);
  20.140              break;
  20.141          }
  20.142 +
  20.143          blkif_evtchn = status->evtchn;
  20.144 -        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
  20.145 -        if ( (rc=request_irq(blkif_irq, blkif_int, 
  20.146 -                          SA_SAMPLE_RANDOM, "blkif", NULL)) )
  20.147 -	{
  20.148 +        blkif_irq    = bind_evtchn_to_irq(blkif_evtchn);
  20.149 +
  20.150 +        if ( (rc = request_irq(blkif_irq, blkif_int, 
  20.151 +                               SA_SAMPLE_RANDOM, "blkif", NULL)) )
  20.152  	    printk(KERN_ALERT"blkfront request_irq failed (%ld)\n",rc);
  20.153 -	}
  20.154  
  20.155          if ( recovery )
  20.156          {
  20.157 @@ -1109,31 +1089,28 @@ static void blkif_status_change(blkif_fe
  20.158  	    /* Hmm, requests might be re-ordered when we re-issue them.
  20.159  	       This will need to be fixed once we have barriers */
  20.160  
  20.161 -	    // req_prod = 0;   : already is zero
  20.162 -
  20.163 -	    // stage 1 : find active and move to safety
  20.164 -	    for ( i=0; i <BLKIF_RING_SIZE; i++ )
  20.165 +	    /* Stage 1 : Find active and move to safety. */
  20.166 +	    for ( i = 0; i < BLKIF_RING_SIZE; i++ )
  20.167  	    {
  20.168  		if ( rec_ring[i].id >= PAGE_OFFSET )
  20.169  		{
  20.170  		    translate_req_to_mfn(
  20.171 -			&blk_ring->ring[req_prod].req, &rec_ring[i] );
  20.172 -
  20.173 +			&blk_ring->ring[req_prod].req, &rec_ring[i]);
  20.174  		    req_prod++;
  20.175  		}
  20.176  	    }
  20.177  
  20.178 -printk(KERN_ALERT"blkfront: recovered %d descriptors\n",req_prod);
  20.179 +            printk(KERN_ALERT"blkfront: recovered %d descriptors\n",req_prod);
  20.180  	    
  20.181 -	    // stage 2 : set up shadow list
  20.182 -	    for ( i=0; i<req_prod; i++ )
  20.183 +            /* Stage 2 : Set up shadow list. */
  20.184 +	    for ( i = 0; i < req_prod; i++ )
  20.185  	    {
  20.186  		rec_ring[i].id = blk_ring->ring[i].req.id;		
  20.187  		blk_ring->ring[i].req.id = i;
  20.188 -		translate_req_to_pfn( &rec_ring[i], &blk_ring->ring[i].req );
  20.189 +		translate_req_to_pfn(&rec_ring[i], &blk_ring->ring[i].req);
  20.190  	    }
  20.191  
  20.192 -	    // stage 3 : set up free list
  20.193 +	    /* Stage 3 : Set up free list. */
  20.194  	    for ( ; i < BLKIF_RING_SIZE; i++ )
  20.195  		rec_ring[i].id = i+1;
  20.196  	    rec_ring_free = req_prod;
  20.197 @@ -1150,9 +1127,6 @@ printk(KERN_ALERT"blkfront: recovered %d
  20.198  
  20.199              /* Kicks things back into life. */
  20.200              flush_requests();
  20.201 -
  20.202 -
  20.203 -
  20.204          }
  20.205          else
  20.206          {
  20.207 @@ -1270,7 +1244,7 @@ void blkdev_resume(void)
  20.208  
  20.209  /* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */
  20.210  
  20.211 -void blkif_completion( blkif_request_t *req )
  20.212 +void blkif_completion(blkif_request_t *req)
  20.213  {
  20.214      int i;
  20.215  
  20.216 @@ -1281,10 +1255,8 @@ void blkif_completion( blkif_request_t *
  20.217  	{
  20.218  	    unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
  20.219  	    unsigned long mfn = phys_to_machine_mapping[pfn];
  20.220 -
  20.221  	    queue_machphys_update(mfn, pfn);
  20.222  	}
  20.223 -
  20.224  	break;
  20.225      }
  20.226      
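
Note: the open-coded frame translations in translate_req_to_pfn() and translate_req_to_mfn() are replaced above by machine_to_phys() and phys_to_machine(). Those helpers live in the asm-xen headers and are not shown in this changeset; the sketch below reconstructs what such an address-level helper amounts to from the code it replaces (the _sketch suffix marks it as illustrative):

    /* Sketch: swap the frame number while keeping the low bits, which in
     * frame_and_sects carry the first/last sector information. */
    static inline unsigned long phys_to_machine_sketch(unsigned long phys)
    {
        unsigned long mfn = phys_to_machine_mapping[phys >> PAGE_SHIFT];
        return (phys & ~PAGE_MASK) | (mfn << PAGE_SHIFT);
    }
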
    21.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Fri Aug 20 09:11:43 2004 +0000
    21.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Fri Aug 20 09:21:37 2004 +0000
    21.3 @@ -204,6 +204,12 @@ static void net_rx_action(unsigned long 
    21.4          mdata   = virt_to_machine(vdata);
    21.5          new_mfn = get_new_mfn();
    21.6          
    21.7 +        /*
    21.8 +         * Set the new P2M table entry before reassigning the old data page.
    21.9 +         * Heed the comment in pgtable-2level.h:pte_page(). :-)
   21.10 +         */
   21.11 +        phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
   21.12 +        
   21.13          mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
   21.14          mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;  
   21.15          mmu[1].ptr  = MMU_EXTENDED_COMMAND;
   21.16 @@ -250,8 +256,6 @@ static void net_rx_action(unsigned long 
   21.17          mdata   = ((mmu[2].ptr & PAGE_MASK) |
   21.18                     ((unsigned long)skb->data & ~PAGE_MASK));
   21.19          
   21.20 -        phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
   21.21 -        
   21.22          atomic_set(&(skb_shinfo(skb)->dataref), 1);
   21.23          skb_shinfo(skb)->nr_frags = 0;
   21.24          skb_shinfo(skb)->frag_list = NULL;
   21.25 @@ -372,7 +376,6 @@ static void net_tx_action(unsigned long 
   21.26      netif_tx_request_t txreq;
   21.27      u16 pending_idx;
   21.28      NETIF_RING_IDX i;
   21.29 -    struct page *page;
   21.30      multicall_entry_t *mcl;
   21.31      PEND_RING_IDX dc, dp;
   21.32  
   21.33 @@ -556,17 +559,16 @@ static void net_tx_action(unsigned long 
   21.34          }
   21.35  
   21.36          phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
   21.37 -            txreq.addr >> PAGE_SHIFT;
   21.38 +            FOREIGN_FRAME(txreq.addr >> PAGE_SHIFT);
   21.39  
   21.40          __skb_put(skb, PKT_PROT_LEN);
   21.41          memcpy(skb->data, 
   21.42                 (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
   21.43                 PKT_PROT_LEN);
   21.44  
   21.45 -        page = virt_to_page(MMAP_VADDR(pending_idx));
   21.46 -
   21.47          /* Append the packet payload as a fragment. */
   21.48 -        skb_shinfo(skb)->frags[0].page        = page;
   21.49 +        skb_shinfo(skb)->frags[0].page        = 
   21.50 +            virt_to_page(MMAP_VADDR(pending_idx));
   21.51          skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
   21.52          skb_shinfo(skb)->frags[0].page_offset = 
   21.53              (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
   21.54 @@ -577,17 +579,6 @@ static void net_tx_action(unsigned long 
   21.55          skb->dev      = netif->dev;
   21.56          skb->protocol = eth_type_trans(skb, skb->dev);
   21.57  
   21.58 -        /*
   21.59 -         * Destructor information. We hideously abuse the 'mapping' pointer,
   21.60 -         * which isn't otherwise used by us. The page deallocator is modified
   21.61 -         * to interpret a non-NULL value as a destructor function to be called.
   21.62 -         * This works okay because in all other cases the pointer must be NULL
   21.63 -         * when the page is freed (normally Linux will explicitly bug out if
   21.64 -         * it sees otherwise.
   21.65 -         */
   21.66 -        page->mapping = (struct address_space *)netif_page_release;
   21.67 -        set_page_count(page, 1);
   21.68 -
   21.69          netif->stats.tx_bytes += txreq.size;
   21.70          netif->stats.tx_packets++;
   21.71  
   21.72 @@ -603,8 +594,8 @@ static void netif_page_release(struct pa
   21.73      unsigned long flags;
   21.74      u16 pending_idx = page - virt_to_page(mmap_vstart);
   21.75  
   21.76 -    /* Stop the abuse. */
   21.77 -    page->mapping = NULL;
   21.78 +    /* Ready for next use. */
   21.79 +    set_page_count(page, 1);
   21.80  
   21.81      spin_lock_irqsave(&dealloc_lock, flags);
   21.82      dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
   21.83 @@ -738,6 +729,7 @@ static irqreturn_t netif_be_dbg(int irq,
   21.84  static int __init netback_init(void)
   21.85  {
   21.86      int i;
   21.87 +    struct page *page;
   21.88  
   21.89      if ( !(start_info.flags & SIF_NET_BE_DOMAIN) &&
   21.90  	 !(start_info.flags & SIF_INITDOMAIN) )
   21.91 @@ -753,6 +745,13 @@ static int __init netback_init(void)
   21.92      if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
   21.93          BUG();
   21.94  
   21.95 +    for ( i = 0; i < MAX_PENDING_REQS; i++ )
   21.96 +    {
   21.97 +        page = virt_to_page(MMAP_VADDR(i));
   21.98 +        SetPageForeign(page);
   21.99 +        PageForeignDestructor(page) = netif_page_release;
  21.100 +    }
  21.101 +
  21.102      pending_cons = 0;
  21.103      pending_prod = MAX_PENDING_REQS;
  21.104      for ( i = 0; i < MAX_PENDING_REQS; i++ )
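
Note: netback previously abused page->mapping as a per-packet destructor hook; it now registers a destructor once at init time through the PG_foreign machinery added in page-flags.h (below), so netif_page_release() only needs to reset the page count. A sketch of the registration step in generic form; the function is hypothetical and simply mirrors the netback_init() loop above:

    /* Sketch: mark a contiguous region of pages as "foreign" so that
     * dropping their last reference invokes a driver callback instead of
     * returning them to the buddy allocator. */
    static void mark_foreign_region_sketch(unsigned long vstart, int nr,
                                           void (*release)(struct page *))
    {
        int i;
        for ( i = 0; i < nr; i++ )
        {
            struct page *page = virt_to_page(vstart + (i * PAGE_SIZE));
            SetPageForeign(page);
            PageForeignDestructor(page) = release;
        }
    }
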
    22.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Fri Aug 20 09:11:43 2004 +0000
    22.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Fri Aug 20 09:21:37 2004 +0000
    22.3 @@ -263,9 +263,9 @@ static void network_alloc_rx_buffers(str
    22.4          
    22.5          rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
    22.6  
    22.7 -	/* remove this page from pseudo phys map (migration optimization) */
    22.8 +	/* Remove this page from pseudo phys map before passing back to Xen. */
    22.9  	phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] 
   22.10 -	    = 0x80000001;
   22.11 +	    = INVALID_P2M_ENTRY;
   22.12  
   22.13          rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
   22.14          rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
   22.15 @@ -478,15 +478,6 @@ static int netif_poll(struct net_device 
   22.16          mcl->args[2] = 0;
   22.17          mcl++;
   22.18          (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
   22.19 -
   22.20 -#if 0 
   22.21 -	if (unlikely(rx_mcl[0].args[5] != 0))
   22.22 -	    printk(KERN_ALERT"Hypercall0 failed %u\n",np->rx->resp_prod);
   22.23 -
   22.24 -	if (unlikely(rx_mcl[1].args[5] != 0))
   22.25 -	    printk(KERN_ALERT"Hypercall1 failed %u\n",np->rx->resp_prod);
   22.26 -#endif
   22.27 -
   22.28      }
   22.29  
   22.30      while ( (skb = __skb_dequeue(&rxq)) != NULL )
    23.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c	Fri Aug 20 09:11:43 2004 +0000
    23.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c	Fri Aug 20 09:21:37 2004 +0000
    23.3 @@ -138,17 +138,10 @@ static int privcmd_ioctl(struct inode *i
    23.4          if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end )
    23.5          { ret = -EFAULT; goto batch_err; }
    23.6  
    23.7 -        if ( m.dom != 0 )
    23.8 -        {
    23.9 -            u[0].ptr  = MMU_EXTENDED_COMMAND;
   23.10 -            u[0].val  = MMUEXT_SET_FOREIGNDOM;
   23.11 -            u[0].val |= (unsigned long)m.dom << 16;
   23.12 -            v = w = &u[1];
   23.13 -        }
   23.14 -        else
   23.15 -        {
   23.16 -            v = w = &u[0];
   23.17 -        }
   23.18 +        u[0].ptr  = MMU_EXTENDED_COMMAND;
   23.19 +        u[0].val  = MMUEXT_SET_FOREIGNDOM;
   23.20 +        u[0].val |= (unsigned long)m.dom << 16;
   23.21 +        v = w = &u[1];
   23.22  
   23.23          p = m.arr;
   23.24          addr = m.addr;
    24.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/io.h	Fri Aug 20 09:11:43 2004 +0000
    24.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/io.h	Fri Aug 20 09:21:37 2004 +0000
    24.3 @@ -88,6 +88,13 @@ static inline void * phys_to_virt(unsign
    24.4  #define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
    24.5  #define page_to_phys(page)       (phys_to_machine(page_to_pseudophys(page)))
    24.6  
    24.7 +#define bio_to_pseudophys(bio)	(page_to_pseudophys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
    24.8 +#define bvec_to_pseudophys(bv)	(page_to_pseudophys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
    24.9 +
   24.10 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
   24.11 +	(((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
   24.12 +	 ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == bvec_to_pseudophys((vec2))))
   24.13 +
   24.14  extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
   24.15  
   24.16  /**
    25.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Fri Aug 20 09:11:43 2004 +0000
    25.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Fri Aug 20 09:21:37 2004 +0000
    25.3 @@ -88,30 +88,33 @@ static inline pte_t ptep_get_and_clear(p
    25.4   *     not have MFN in our p2m table. Conversely, if the page is ours,
    25.5   *     then we'll have p2m(m2p(MFN))==MFN.
    25.6   * If we detect a special mapping then it doesn't have a 'struct page'.
    25.7 - * We force !VALID_PAGE() by returning an out-of-range pointer.
    25.8 + * We force !pfn_valid() by returning an out-of-range pointer.
    25.9 + *
   25.10 + * NB. These checks require that, for any MFN that is not in our reservation,
   25.11 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
    25.12 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
   25.13 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
   25.14 + * 
   25.15 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
   25.16 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
   25.17 + *      require. In all the cases we care about, the high bit gets shifted out
   25.18 + *      (e.g., phys_to_machine()) so behaviour there is correct.
   25.19   */
   25.20 -#define pte_page(_pte)                                        \
   25.21 -({                                                            \
   25.22 -    unsigned long mfn = (_pte).pte_low >> PAGE_SHIFT;         \
   25.23 -    unsigned long pfn = mfn_to_pfn(mfn);                      \
   25.24 -    if ( (pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn) )     \
   25.25 -        pfn = max_mapnr; /* special: force !VALID_PAGE() */   \
   25.26 -    pfn_to_page(pfn);                                         \
   25.27 -})
   25.28 -
   25.29 -#define pte_none(x)		(!(x).pte_low)
   25.30 -/* See comments above pte_page */
   25.31 -/* XXXcl check pte_present because msync.c:filemap_sync_pte calls
   25.32 - * without pte_present check */
   25.33 +#define INVALID_P2M_ENTRY (~0UL)
   25.34 +#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1)))
   25.35  #define pte_pfn(_pte)                                                   \
   25.36  ({                                                                      \
   25.37      unsigned long mfn = (_pte).pte_low >> PAGE_SHIFT;                   \
   25.38 -    unsigned long pfn = pte_present(_pte) ? mfn_to_pfn(mfn) : mfn;      \
   25.39 +    unsigned long pfn = mfn_to_pfn(mfn);                                \
   25.40      if ( (pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn) )               \
   25.41          pfn = max_mapnr; /* special: force !pfn_valid() */              \
   25.42      pfn;                                                                \
   25.43  })
   25.44  
   25.45 +#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
   25.46 +
   25.47 +#define pte_none(x)		(!(x).pte_low)
   25.48 +
   25.49  #define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
   25.50  #define pfn_pte_ma(pfn, prot)	__pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
   25.51  #define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
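
Note: the reworked pte_pfn() above is the heart of the p2m/m2p consistency scheme that INVALID_P2M_ENTRY and FOREIGN_FRAME() feed. The same test written out as a function, as a worked example (the function name is illustrative):

    /* Sketch: a machine frame is treated as local only if the m2p and
     * p2m tables agree on it.  Entries poisoned with INVALID_P2M_ENTRY
     * or tagged with FOREIGN_FRAME() fail the round trip and collapse to
     * max_mapnr, i.e. !pfn_valid(). */
    static unsigned long mfn_to_local_pfn_sketch(unsigned long mfn)
    {
        unsigned long pfn = mfn_to_pfn(mfn);
        if ( (pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn) )
            pfn = max_mapnr;   /* special: force !pfn_valid() */
        return pfn;
    }
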
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/linux-2.6.7-xen-sparse/include/linux/bio.h	Fri Aug 20 09:21:37 2004 +0000
    26.3 @@ -0,0 +1,304 @@
    26.4 +/*
    26.5 + * 2.5 block I/O model
    26.6 + *
    26.7 + * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
    26.8 + *
    26.9 + * This program is free software; you can redistribute it and/or modify
   26.10 + * it under the terms of the GNU General Public License version 2 as
   26.11 + * published by the Free Software Foundation.
   26.12 + *
   26.13 + * This program is distributed in the hope that it will be useful,
   26.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    26.15 + *
   26.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   26.17 + * GNU General Public License for more details.
   26.18 + *
   26.19 + * You should have received a copy of the GNU General Public Licens
   26.20 + * along with this program; if not, write to the Free Software
   26.21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
   26.22 + */
   26.23 +#ifndef __LINUX_BIO_H
   26.24 +#define __LINUX_BIO_H
   26.25 +
   26.26 +#include <linux/highmem.h>
   26.27 +#include <linux/mempool.h>
   26.28 +
   26.29 +/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
   26.30 +#include <asm/io.h>
   26.31 +#ifndef BIO_VMERGE_BOUNDARY
   26.32 +#define BIO_VMERGE_BOUNDARY	0
   26.33 +#endif
   26.34 +
   26.35 +#define BIO_DEBUG
   26.36 +
   26.37 +#ifdef BIO_DEBUG
   26.38 +#define BIO_BUG_ON	BUG_ON
   26.39 +#else
   26.40 +#define BIO_BUG_ON
   26.41 +#endif
   26.42 +
   26.43 +#define BIO_MAX_PAGES		(256)
   26.44 +#define BIO_MAX_SIZE		(BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
   26.45 +#define BIO_MAX_SECTORS		(BIO_MAX_SIZE >> 9)
   26.46 +
   26.47 +/*
   26.48 + * was unsigned short, but we might as well be ready for > 64kB I/O pages
   26.49 + */
   26.50 +struct bio_vec {
   26.51 +	struct page	*bv_page;
   26.52 +	unsigned int	bv_len;
   26.53 +	unsigned int	bv_offset;
   26.54 +};
   26.55 +
   26.56 +struct bio;
   26.57 +typedef int (bio_end_io_t) (struct bio *, unsigned int, int);
   26.58 +typedef void (bio_destructor_t) (struct bio *);
   26.59 +
   26.60 +/*
   26.61 + * main unit of I/O for the block layer and lower layers (ie drivers and
   26.62 + * stacking drivers)
   26.63 + */
   26.64 +struct bio {
   26.65 +	sector_t		bi_sector;
   26.66 +	struct bio		*bi_next;	/* request queue link */
   26.67 +	struct block_device	*bi_bdev;
   26.68 +	unsigned long		bi_flags;	/* status, command, etc */
   26.69 +	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
   26.70 +						 * top bits priority
   26.71 +						 */
   26.72 +
   26.73 +	unsigned short		bi_vcnt;	/* how many bio_vec's */
   26.74 +	unsigned short		bi_idx;		/* current index into bvl_vec */
   26.75 +
   26.76 +	/* Number of segments in this BIO after
   26.77 +	 * physical address coalescing is performed.
   26.78 +	 */
   26.79 +	unsigned short		bi_phys_segments;
   26.80 +
   26.81 +	/* Number of segments after physical and DMA remapping
   26.82 +	 * hardware coalescing is performed.
   26.83 +	 */
   26.84 +	unsigned short		bi_hw_segments;
   26.85 +
   26.86 +	unsigned int		bi_size;	/* residual I/O count */
   26.87 +	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
   26.88 +
   26.89 +	struct bio_vec		*bi_io_vec;	/* the actual vec list */
   26.90 +
   26.91 +	bio_end_io_t		*bi_end_io;
   26.92 +	atomic_t		bi_cnt;		/* pin count */
   26.93 +
   26.94 +	void			*bi_private;
   26.95 +
   26.96 +	bio_destructor_t	*bi_destructor;	/* destructor */
   26.97 +};
   26.98 +
   26.99 +/*
  26.100 + * bio flags
  26.101 + */
  26.102 +#define BIO_UPTODATE	0	/* ok after I/O completion */
  26.103 +#define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
  26.104 +#define BIO_EOF		2	/* out-out-bounds error */
  26.105 +#define BIO_SEG_VALID	3	/* nr_hw_seg valid */
  26.106 +#define BIO_CLONED	4	/* doesn't own data */
  26.107 +#define BIO_BOUNCED	5	/* bio is a bounce bio */
  26.108 +#define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
  26.109 +
  26.110 +/*
  26.111 + * top 4 bits of bio flags indicate the pool this bio came from
  26.112 + */
  26.113 +#define BIO_POOL_BITS		(4)
  26.114 +#define BIO_POOL_OFFSET		(BITS_PER_LONG - BIO_POOL_BITS)
  26.115 +#define BIO_POOL_MASK		(1UL << BIO_POOL_OFFSET)
  26.116 +#define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)	
  26.117 +
  26.118 +/*
  26.119 + * bio bi_rw flags
  26.120 + *
  26.121 + * bit 0 -- read (not set) or write (set)
  26.122 + * bit 1 -- rw-ahead when set
  26.123 + * bit 2 -- barrier
  26.124 + * bit 3 -- fail fast, don't want low level driver retries
  26.125 + * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  26.126 + */
  26.127 +#define BIO_RW		0
  26.128 +#define BIO_RW_AHEAD	1
  26.129 +#define BIO_RW_BARRIER	2
  26.130 +#define BIO_RW_FAILFAST	3
  26.131 +#define BIO_RW_SYNC	4
  26.132 +
  26.133 +/*
  26.134 + * various member access, note that bio_data should of course not be used
  26.135 + * on highmem page vectors
  26.136 + */
  26.137 +#define bio_iovec_idx(bio, idx)	(&((bio)->bi_io_vec[(idx)]))
  26.138 +#define bio_iovec(bio)		bio_iovec_idx((bio), (bio)->bi_idx)
  26.139 +#define bio_page(bio)		bio_iovec((bio))->bv_page
  26.140 +#define bio_offset(bio)		bio_iovec((bio))->bv_offset
  26.141 +#define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
  26.142 +#define bio_sectors(bio)	((bio)->bi_size >> 9)
  26.143 +#define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
  26.144 +#define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
  26.145 +#define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
  26.146 +#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
  26.147 +
  26.148 +/*
  26.149 + * will die
  26.150 + */
  26.151 +#define bio_to_phys(bio)	(page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
  26.152 +#define bvec_to_phys(bv)	(page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
  26.153 +
  26.154 +/*
  26.155 + * queues that have highmem support enabled may still need to revert to
  26.156 + * PIO transfers occasionally and thus map high pages temporarily. For
  26.157 + * permanent PIO fall back, user is probably better off disabling highmem
  26.158 + * I/O completely on that queue (see ide-dma for example)
  26.159 + */
  26.160 +#define __bio_kmap_atomic(bio, idx, kmtype)				\
  26.161 +	(kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) +	\
  26.162 +		bio_iovec_idx((bio), (idx))->bv_offset)
  26.163 +
  26.164 +#define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype)
  26.165 +
  26.166 +/*
  26.167 + * merge helpers etc
  26.168 + */
  26.169 +
  26.170 +#define __BVEC_END(bio)		bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
  26.171 +#define __BVEC_START(bio)	bio_iovec_idx((bio), (bio)->bi_idx)
  26.172 +/* Platforms may set this to restrict multi-page buffer merging. */
  26.173 +#ifndef BIOVEC_PHYS_MERGEABLE
  26.174 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
  26.175 +	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
  26.176 +#endif
  26.177 +#define BIOVEC_VIRT_MERGEABLE(vec1, vec2)	\
  26.178 +	((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
  26.179 +#define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
  26.180 +	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
  26.181 +#define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
  26.182 +	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
  26.183 +#define BIO_SEG_BOUNDARY(q, b1, b2) \
  26.184 +	BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
  26.185 +
  26.186 +#define bio_io_error(bio, bytes) bio_endio((bio), (bytes), -EIO)
  26.187 +
  26.188 +/*
  26.189 + * drivers should not use the __ version unless they _really_ want to
  26.190 + * run through the entire bio and not just pending pieces
  26.191 + */
  26.192 +#define __bio_for_each_segment(bvl, bio, i, start_idx)			\
  26.193 +	for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx);	\
  26.194 +	     i < (bio)->bi_vcnt;					\
  26.195 +	     bvl++, i++)
  26.196 +
  26.197 +#define bio_for_each_segment(bvl, bio, i)				\
  26.198 +	__bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
  26.199 +
  26.200 +/*
  26.201 + * get a reference to a bio, so it won't disappear. the intended use is
  26.202 + * something like:
  26.203 + *
  26.204 + * bio_get(bio);
  26.205 + * submit_bio(rw, bio);
  26.206 + * if (bio->bi_flags ...)
  26.207 + *	do_something
  26.208 + * bio_put(bio);
  26.209 + *
  26.210 + * without the bio_get(), it could potentially complete I/O before submit_bio
  26.211 + * returns. and then bio would be freed memory when if (bio->bi_flags ...)
  26.212 + * runs
  26.213 + */
  26.214 +#define bio_get(bio)	atomic_inc(&(bio)->bi_cnt)
  26.215 +
  26.216 +
  26.217 +/*
  26.218 + * A bio_pair is used when we need to split a bio.
  26.219 + * This can only happen for a bio that refers to just one
  26.220 + * page of data, and in the unusual situation when the
  26.221 + * page crosses a chunk/device boundary
  26.222 + *
  26.223 + * The address of the master bio is stored in bio1.bi_private
  26.224 + * The address of the pool the pair was allocated from is stored
  26.225 + *   in bio2.bi_private
  26.226 + */
  26.227 +struct bio_pair {
  26.228 +	struct bio	bio1, bio2;
  26.229 +	struct bio_vec	bv1, bv2;
  26.230 +	atomic_t	cnt;
  26.231 +	int		error;
  26.232 +};
  26.233 +extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
  26.234 +				  int first_sectors);
  26.235 +extern mempool_t *bio_split_pool;
  26.236 +extern void bio_pair_release(struct bio_pair *dbio);
  26.237 +
  26.238 +extern struct bio *bio_alloc(int, int);
  26.239 +extern void bio_put(struct bio *);
  26.240 +
  26.241 +extern void bio_endio(struct bio *, unsigned int, int);
  26.242 +struct request_queue;
  26.243 +extern int bio_phys_segments(struct request_queue *, struct bio *);
  26.244 +extern int bio_hw_segments(struct request_queue *, struct bio *);
  26.245 +
  26.246 +extern void __bio_clone(struct bio *, struct bio *);
  26.247 +extern struct bio *bio_clone(struct bio *, int);
  26.248 +
  26.249 +extern void bio_init(struct bio *);
  26.250 +
  26.251 +extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
  26.252 +extern int bio_get_nr_vecs(struct block_device *);
  26.253 +extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
  26.254 +				unsigned long, unsigned int, int);
  26.255 +extern void bio_unmap_user(struct bio *, int);
  26.256 +extern void bio_set_pages_dirty(struct bio *bio);
  26.257 +extern void bio_check_pages_dirty(struct bio *bio);
  26.258 +
  26.259 +#ifdef CONFIG_HIGHMEM
  26.260 +/*
  26.261 + * remember to add offset! and never ever reenable interrupts between a
  26.262 + * bvec_kmap_irq and bvec_kunmap_irq!!
  26.263 + *
  26.264 + * This function MUST be inlined - it plays with the CPU interrupt flags.
  26.265 + * Hence the `extern inline'.
  26.266 + */
  26.267 +extern inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
  26.268 +{
  26.269 +	unsigned long addr;
  26.270 +
  26.271 +	/*
  26.272 +	 * might not be a highmem page, but the preempt/irq count
  26.273 +	 * balancing is a lot nicer this way
  26.274 +	 */
  26.275 +	local_irq_save(*flags);
  26.276 +	addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);
  26.277 +
  26.278 +	BUG_ON(addr & ~PAGE_MASK);
  26.279 +
  26.280 +	return (char *) addr + bvec->bv_offset;
  26.281 +}
  26.282 +
  26.283 +extern inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
  26.284 +{
  26.285 +	unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
  26.286 +
  26.287 +	kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
  26.288 +	local_irq_restore(*flags);
  26.289 +}
  26.290 +
  26.291 +#else
  26.292 +#define bvec_kmap_irq(bvec, flags)	(page_address((bvec)->bv_page) + (bvec)->bv_offset)
  26.293 +#define bvec_kunmap_irq(buf, flags)	do { *(flags) = 0; } while (0)
  26.294 +#endif
  26.295 +
  26.296 +extern inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
  26.297 +				   unsigned long *flags)
  26.298 +{
  26.299 +	return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
  26.300 +}
  26.301 +#define __bio_kunmap_irq(buf, flags)	bvec_kunmap_irq(buf, flags)
  26.302 +
  26.303 +#define bio_kmap_irq(bio, flags) \
  26.304 +	__bio_kmap_irq((bio), (bio)->bi_idx, (flags))
  26.305 +#define bio_kunmap_irq(buf,flags)	__bio_kunmap_irq(buf, flags)
  26.306 +
  26.307 +#endif /* __LINUX_BIO_H */
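
Note: this copy of bio.h guards the default BIOVEC_PHYS_MERGEABLE with #ifndef so the Xen definition added to io.h earlier in this changeset takes precedence. Under Xen a merge requires adjacency in both machine address space and pseudophys space; the combined test, written out as a function for clarity (the name is illustrative):

    /* Sketch: two bio_vecs merge only if they are contiguous in machine
     * (bus) space and in the guest's pseudophys space.  Contiguity in
     * just one of the two views means the pages only appear adjacent by
     * accident of the p2m mapping. */
    static int biovec_mergeable_sketch(struct bio_vec *v1, struct bio_vec *v2)
    {
        return ((bvec_to_phys(v1) + v1->bv_len) == bvec_to_phys(v2)) &&
               ((bvec_to_pseudophys(v1) + v1->bv_len) == bvec_to_pseudophys(v2));
    }
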
    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/linux-2.6.7-xen-sparse/include/linux/page-flags.h	Fri Aug 20 09:21:37 2004 +0000
    27.3 @@ -0,0 +1,343 @@
    27.4 +/*
    27.5 + * Macros for manipulating and testing page->flags
    27.6 + */
    27.7 +
    27.8 +#ifndef PAGE_FLAGS_H
    27.9 +#define PAGE_FLAGS_H
   27.10 +
   27.11 +#include <linux/percpu.h>
   27.12 +#include <linux/cache.h>
   27.13 +#include <asm/pgtable.h>
   27.14 +
   27.15 +/*
   27.16 + * Various page->flags bits:
   27.17 + *
   27.18 + * PG_reserved is set for special pages, which can never be swapped out. Some
   27.19 + * of them might not even exist (eg empty_bad_page)...
   27.20 + *
   27.21 + * The PG_private bitflag is set if page->private contains a valid value.
   27.22 + *
   27.23 + * During disk I/O, PG_locked is used. This bit is set before I/O and
   27.24 + * reset when I/O completes. page_waitqueue(page) is a wait queue of all tasks
   27.25 + * waiting for the I/O on this page to complete.
   27.26 + *
   27.27 + * PG_uptodate tells whether the page's contents is valid.  When a read
   27.28 + * completes, the page becomes uptodate, unless a disk I/O error happened.
   27.29 + *
   27.30 + * For choosing which pages to swap out, inode pages carry a PG_referenced bit,
   27.31 + * which is set any time the system accesses that page through the (mapping,
   27.32 + * index) hash table.  This referenced bit, together with the referenced bit
   27.33 + * in the page tables, is used to manipulate page->age and move the page across
   27.34 + * the active, inactive_dirty and inactive_clean lists.
   27.35 + *
   27.36 + * Note that the referenced bit, the page->lru list_head and the active,
   27.37 + * inactive_dirty and inactive_clean lists are protected by the
   27.38 + * zone->lru_lock, and *NOT* by the usual PG_locked bit!
   27.39 + *
   27.40 + * PG_error is set to indicate that an I/O error occurred on this page.
   27.41 + *
   27.42 + * PG_arch_1 is an architecture specific page state bit.  The generic code
   27.43 + * guarantees that this bit is cleared for a page when it first is entered into
   27.44 + * the page cache.
   27.45 + *
   27.46 + * PG_highmem pages are not permanently mapped into the kernel virtual address
   27.47 + * space, they need to be kmapped separately for doing IO on the pages.  The
   27.48 + * struct page (these bits with information) are always mapped into kernel
   27.49 + * address space...
   27.50 + */
   27.51 +
   27.52 +/*
   27.53 + * Don't use the *_dontuse flags.  Use the macros.  Otherwise you'll break
   27.54 + * locked- and dirty-page accounting.  The top eight bits of page->flags are
   27.55 + * used for page->zone, so putting flag bits there doesn't work.
   27.56 + */
   27.57 +#define PG_locked	 	 0	/* Page is locked. Don't touch. */
   27.58 +#define PG_error		 1
   27.59 +#define PG_referenced		 2
   27.60 +#define PG_uptodate		 3
   27.61 +
   27.62 +#define PG_dirty	 	 4
   27.63 +#define PG_lru			 5
   27.64 +#define PG_active		 6
   27.65 +#define PG_slab			 7	/* slab debug (Suparna wants this) */
   27.66 +
   27.67 +#define PG_highmem		 8
   27.68 +#define PG_checked		 9	/* kill me in 2.5.<early>. */
   27.69 +#define PG_arch_1		10
   27.70 +#define PG_reserved		11
   27.71 +
   27.72 +#define PG_private		12	/* Has something at ->private */
   27.73 +#define PG_writeback		13	/* Page is under writeback */
   27.74 +#define PG_nosave		14	/* Used for system suspend/resume */
   27.75 +#define PG_maplock		15	/* Lock bit for rmap to ptes */
   27.76 +
   27.77 +#define PG_swapcache		16	/* Swap page: swp_entry_t in private */
   27.78 +#define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
   27.79 +#define PG_reclaim		18	/* To be reclaimed asap */
   27.80 +#define PG_compound		19	/* Part of a compound page */
   27.81 +
   27.82 +#define PG_anon			20	/* Anonymous: anon_vma in mapping */
   27.83 +#define PG_foreign		21	/* Page belongs to foreign allocator */
   27.84 +
   27.85 +
   27.86 +/*
   27.87 + * Global page accounting.  One instance per CPU.  Only unsigned longs are
   27.88 + * allowed.
   27.89 + */
   27.90 +struct page_state {
   27.91 +	unsigned long nr_dirty;		/* Dirty writeable pages */
   27.92 +	unsigned long nr_writeback;	/* Pages under writeback */
   27.93 +	unsigned long nr_unstable;	/* NFS unstable pages */
   27.94 +	unsigned long nr_page_table_pages;/* Pages used for pagetables */
   27.95 +	unsigned long nr_mapped;	/* mapped into pagetables */
   27.96 +	unsigned long nr_slab;		/* In slab */
   27.97 +#define GET_PAGE_STATE_LAST nr_slab
   27.98 +
   27.99 +	/*
  27.100 +	 * The below are zeroed by get_page_state().  Use get_full_page_state()
  27.101 +	 * to add up all these.
  27.102 +	 */
  27.103 +	unsigned long pgpgin;		/* Disk reads */
  27.104 +	unsigned long pgpgout;		/* Disk writes */
  27.105 +	unsigned long pswpin;		/* swap reads */
  27.106 +	unsigned long pswpout;		/* swap writes */
  27.107 +	unsigned long pgalloc_high;	/* page allocations */
  27.108 +
  27.109 +	unsigned long pgalloc_normal;
  27.110 +	unsigned long pgalloc_dma;
  27.111 +	unsigned long pgfree;		/* page freeings */
  27.112 +	unsigned long pgactivate;	/* pages moved inactive->active */
  27.113 +	unsigned long pgdeactivate;	/* pages moved active->inactive */
  27.114 +
  27.115 +	unsigned long pgfault;		/* faults (major+minor) */
  27.116 +	unsigned long pgmajfault;	/* faults (major only) */
  27.117 +	unsigned long pgrefill_high;	/* inspected in refill_inactive_zone */
  27.118 +	unsigned long pgrefill_normal;
  27.119 +	unsigned long pgrefill_dma;
  27.120 +
  27.121 +	unsigned long pgsteal_high;	/* total highmem pages reclaimed */
  27.122 +	unsigned long pgsteal_normal;
  27.123 +	unsigned long pgsteal_dma;
  27.124 +	unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
  27.125 +	unsigned long pgscan_kswapd_normal;
  27.126 +
  27.127 +	unsigned long pgscan_kswapd_dma;
  27.128 +	unsigned long pgscan_direct_high;/* total highmem pages scanned */
  27.129 +	unsigned long pgscan_direct_normal;
  27.130 +	unsigned long pgscan_direct_dma;
  27.131 +	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
  27.132 +
  27.133 +	unsigned long slabs_scanned;	/* slab objects scanned */
  27.134 +	unsigned long kswapd_steal;	/* pages reclaimed by kswapd */
  27.135 +	unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
  27.136 +	unsigned long pageoutrun;	/* kswapd's calls to page reclaim */
  27.137 +	unsigned long allocstall;	/* direct reclaim calls */
  27.138 +
  27.139 +	unsigned long pgrotated;	/* pages rotated to tail of the LRU */
  27.140 +};
  27.141 +
  27.142 +DECLARE_PER_CPU(struct page_state, page_states);
  27.143 +
  27.144 +extern void get_page_state(struct page_state *ret);
  27.145 +extern void get_full_page_state(struct page_state *ret);
  27.146 +extern unsigned long __read_page_state(unsigned offset);
  27.147 +
  27.148 +#define read_page_state(member) \
  27.149 +	__read_page_state(offsetof(struct page_state, member))
  27.150 +
  27.151 +#define mod_page_state(member, delta)					\
  27.152 +	do {								\
  27.153 +		unsigned long flags;					\
  27.154 +		local_irq_save(flags);					\
  27.155 +		__get_cpu_var(page_states).member += (delta);		\
  27.156 +		local_irq_restore(flags);				\
  27.157 +	} while (0)
  27.158 +
  27.159 +
  27.160 +#define inc_page_state(member)	mod_page_state(member, 1UL)
  27.161 +#define dec_page_state(member)	mod_page_state(member, 0UL - 1)
  27.162 +#define add_page_state(member,delta) mod_page_state(member, (delta))
  27.163 +#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
  27.164 +
  27.165 +#define mod_page_state_zone(zone, member, delta)			\
  27.166 +	do {								\
  27.167 +		unsigned long flags;					\
  27.168 +		local_irq_save(flags);					\
  27.169 +		if (is_highmem(zone))					\
  27.170 +			__get_cpu_var(page_states).member##_high += (delta);\
  27.171 +		else if (is_normal(zone))				\
  27.172 +			__get_cpu_var(page_states).member##_normal += (delta);\
  27.173 +		else							\
  27.174 +			__get_cpu_var(page_states).member##_dma += (delta);\
  27.175 +		local_irq_restore(flags);				\
  27.176 +	} while (0)
  27.177 +
  27.178 +/*
  27.179 + * Manipulation of page state flags
  27.180 + */
  27.181 +#define PageLocked(page)		\
  27.182 +		test_bit(PG_locked, &(page)->flags)
  27.183 +#define SetPageLocked(page)		\
  27.184 +		set_bit(PG_locked, &(page)->flags)
  27.185 +#define TestSetPageLocked(page)		\
  27.186 +		test_and_set_bit(PG_locked, &(page)->flags)
  27.187 +#define ClearPageLocked(page)		\
  27.188 +		clear_bit(PG_locked, &(page)->flags)
  27.189 +#define TestClearPageLocked(page)	\
  27.190 +		test_and_clear_bit(PG_locked, &(page)->flags)
  27.191 +
  27.192 +#define PageError(page)		test_bit(PG_error, &(page)->flags)
  27.193 +#define SetPageError(page)	set_bit(PG_error, &(page)->flags)
  27.194 +#define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
  27.195 +
  27.196 +#define PageReferenced(page)	test_bit(PG_referenced, &(page)->flags)
  27.197 +#define SetPageReferenced(page)	set_bit(PG_referenced, &(page)->flags)
  27.198 +#define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
  27.199 +#define TestClearPageReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags)
  27.200 +
  27.201 +#ifndef arch_set_page_uptodate
  27.202 +#define arch_set_page_uptodate(page) do { } while (0)
  27.203 +#endif
  27.204 +
  27.205 +#define PageUptodate(page)	test_bit(PG_uptodate, &(page)->flags)
  27.206 +#define SetPageUptodate(page) \
  27.207 +	do {								\
  27.208 +		arch_set_page_uptodate(page);				\
  27.209 +		set_bit(PG_uptodate, &(page)->flags);			\
  27.210 +	} while (0)
  27.211 +#define ClearPageUptodate(page)	clear_bit(PG_uptodate, &(page)->flags)
  27.212 +
  27.213 +#define PageDirty(page)		test_bit(PG_dirty, &(page)->flags)
  27.214 +#define SetPageDirty(page)	set_bit(PG_dirty, &(page)->flags)
  27.215 +#define TestSetPageDirty(page)	test_and_set_bit(PG_dirty, &(page)->flags)
  27.216 +#define ClearPageDirty(page)	clear_bit(PG_dirty, &(page)->flags)
  27.217 +#define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags)
  27.218 +
  27.219 +#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
  27.220 +#define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
  27.221 +#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
  27.222 +#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
  27.223 +
  27.224 +#define PageActive(page)	test_bit(PG_active, &(page)->flags)
  27.225 +#define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
  27.226 +#define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
  27.227 +#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
  27.228 +#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
  27.229 +
  27.230 +#define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
  27.231 +#define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
  27.232 +#define ClearPageSlab(page)	clear_bit(PG_slab, &(page)->flags)
  27.233 +#define TestClearPageSlab(page)	test_and_clear_bit(PG_slab, &(page)->flags)
  27.234 +#define TestSetPageSlab(page)	test_and_set_bit(PG_slab, &(page)->flags)
  27.235 +
  27.236 +#ifdef CONFIG_HIGHMEM
  27.237 +#define PageHighMem(page)	test_bit(PG_highmem, &(page)->flags)
  27.238 +#else
  27.239 +#define PageHighMem(page)	0 /* needed to optimize away at compile time */
  27.240 +#endif
  27.241 +
  27.242 +#define PageChecked(page)	test_bit(PG_checked, &(page)->flags)
  27.243 +#define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
  27.244 +#define ClearPageChecked(page)	clear_bit(PG_checked, &(page)->flags)
  27.245 +
  27.246 +#define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
  27.247 +#define SetPageReserved(page)	set_bit(PG_reserved, &(page)->flags)
  27.248 +#define ClearPageReserved(page)	clear_bit(PG_reserved, &(page)->flags)
  27.249 +
  27.250 +#define SetPagePrivate(page)	set_bit(PG_private, &(page)->flags)
  27.251 +#define ClearPagePrivate(page)	clear_bit(PG_private, &(page)->flags)
  27.252 +#define PagePrivate(page)	test_bit(PG_private, &(page)->flags)
  27.253 +
  27.254 +#define PageWriteback(page)	test_bit(PG_writeback, &(page)->flags)
  27.255 +#define SetPageWriteback(page)						\
  27.256 +	do {								\
  27.257 +		if (!test_and_set_bit(PG_writeback,			\
  27.258 +				&(page)->flags))			\
  27.259 +			inc_page_state(nr_writeback);			\
  27.260 +	} while (0)
  27.261 +#define TestSetPageWriteback(page)					\
  27.262 +	({								\
  27.263 +		int ret;						\
  27.264 +		ret = test_and_set_bit(PG_writeback,			\
  27.265 +					&(page)->flags);		\
  27.266 +		if (!ret)						\
  27.267 +			inc_page_state(nr_writeback);			\
  27.268 +		ret;							\
  27.269 +	})
  27.270 +#define ClearPageWriteback(page)					\
  27.271 +	do {								\
  27.272 +		if (test_and_clear_bit(PG_writeback,			\
  27.273 +				&(page)->flags))			\
  27.274 +			dec_page_state(nr_writeback);			\
  27.275 +	} while (0)
  27.276 +#define TestClearPageWriteback(page)					\
  27.277 +	({								\
  27.278 +		int ret;						\
  27.279 +		ret = test_and_clear_bit(PG_writeback,			\
  27.280 +				&(page)->flags);			\
  27.281 +		if (ret)						\
  27.282 +			dec_page_state(nr_writeback);			\
  27.283 +		ret;							\
  27.284 +	})
  27.285 +
  27.286 +#define PageNosave(page)	test_bit(PG_nosave, &(page)->flags)
  27.287 +#define SetPageNosave(page)	set_bit(PG_nosave, &(page)->flags)
  27.288 +#define TestSetPageNosave(page)	test_and_set_bit(PG_nosave, &(page)->flags)
  27.289 +#define ClearPageNosave(page)		clear_bit(PG_nosave, &(page)->flags)
  27.290 +#define TestClearPageNosave(page)	test_and_clear_bit(PG_nosave, &(page)->flags)
  27.291 +
  27.292 +#define PageMappedToDisk(page)	test_bit(PG_mappedtodisk, &(page)->flags)
  27.293 +#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
  27.294 +#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
  27.295 +
  27.296 +#define PageReclaim(page)	test_bit(PG_reclaim, &(page)->flags)
  27.297 +#define SetPageReclaim(page)	set_bit(PG_reclaim, &(page)->flags)
  27.298 +#define ClearPageReclaim(page)	clear_bit(PG_reclaim, &(page)->flags)
  27.299 +#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
  27.300 +
  27.301 +#define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
  27.302 +#define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
  27.303 +#define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
  27.304 +
  27.305 +#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
  27.306 +#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
  27.307 +#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
  27.308 +
  27.309 +/* A foreign page uses a custom destructor rather than the buddy allocator. */
  27.310 +#ifdef CONFIG_FOREIGN_PAGES
  27.311 +#define PageForeign(page)	test_bit(PG_foreign, &(page)->flags)
  27.312 +#define SetPageForeign(page)	set_bit(PG_foreign, &(page)->flags)
  27.313 +#define ClearPageForeign(page)	clear_bit(PG_foreign, &(page)->flags)
  27.314 +#define PageForeignDestructor(page)	\
  27.315 +	( (void (*) (struct page *)) (page)->mapping )
  27.316 +#else
  27.317 +#define PageForeign(page)	0
  27.318 +#define PageForeignDestructor(page)	void
  27.319 +#endif
  27.320 +
  27.321 +#ifdef CONFIG_SWAP
  27.322 +#define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
  27.323 +#define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
  27.324 +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags)
  27.325 +#else
  27.326 +#define PageSwapCache(page)	0
  27.327 +#endif
  27.328 +
  27.329 +struct page;	/* forward declaration */
  27.330 +
  27.331 +int test_clear_page_dirty(struct page *page);
  27.332 +int __clear_page_dirty(struct page *page);
  27.333 +int test_clear_page_writeback(struct page *page);
  27.334 +int test_set_page_writeback(struct page *page);
  27.335 +
  27.336 +static inline void clear_page_dirty(struct page *page)
  27.337 +{
  27.338 +	test_clear_page_dirty(page);
  27.339 +}
  27.340 +
  27.341 +static inline void set_page_writeback(struct page *page)
  27.342 +{
  27.343 +	test_set_page_writeback(page);
  27.344 +}
  27.345 +
  27.346 +#endif	/* PAGE_FLAGS_H */
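
Note: PG_foreign and PageForeignDestructor() are the new additions in this header; their consumer is the page free path in mm/page_alloc.c, which is part of this changeset but not shown in this excerpt. A hedged sketch of that dispatch (an assumption about the page_alloc.c side, and it presumes CONFIG_FOREIGN_PAGES):

    /* Sketch: divert a foreign page to its registered destructor instead
     * of handing it back to the buddy allocator. */
    static int free_foreign_page_sketch(struct page *page)
    {
        if ( !PageForeign(page) )
            return 0;                      /* fall through to buddy free */
        (*PageForeignDestructor(page))(page);
        return 1;                          /* reclaimed by its owner */
    }
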
    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/linux-2.6.7-xen-sparse/include/linux/skbuff.h	Fri Aug 20 09:21:37 2004 +0000
    28.3 @@ -0,0 +1,1073 @@
    28.4 +/*
    28.5 + *	Definitions for the 'struct sk_buff' memory handlers.
    28.6 + *
    28.7 + *	Authors:
    28.8 + *		Alan Cox, <gw4pts@gw4pts.ampr.org>
    28.9 + *		Florian La Roche, <rzsfl@rz.uni-sb.de>
   28.10 + *
   28.11 + *	This program is free software; you can redistribute it and/or
   28.12 + *	modify it under the terms of the GNU General Public License
   28.13 + *	as published by the Free Software Foundation; either version
   28.14 + *	2 of the License, or (at your option) any later version.
   28.15 + */
   28.16 +
   28.17 +#ifndef _LINUX_SKBUFF_H
   28.18 +#define _LINUX_SKBUFF_H
   28.19 +
   28.20 +#include <linux/config.h>
   28.21 +#include <linux/kernel.h>
   28.22 +#include <linux/compiler.h>
   28.23 +#include <linux/time.h>
   28.24 +#include <linux/cache.h>
   28.25 +
   28.26 +#include <asm/atomic.h>
   28.27 +#include <asm/types.h>
   28.28 +#include <linux/spinlock.h>
   28.29 +#include <linux/mm.h>
   28.30 +#include <linux/highmem.h>
   28.31 +#include <linux/poll.h>
   28.32 +#include <linux/net.h>
   28.33 +
   28.34 +#define HAVE_ALLOC_SKB		/* For the drivers to know */
   28.35 +#define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
   28.36 +#define SLAB_SKB 		/* Slabified skbuffs 	   */
   28.37 +
   28.38 +#define CHECKSUM_NONE 0
   28.39 +#define CHECKSUM_HW 1
   28.40 +#define CHECKSUM_UNNECESSARY 2
   28.41 +
   28.42 +#define SKB_DATA_ALIGN(X)	(((X) + (SMP_CACHE_BYTES - 1)) & \
   28.43 +				 ~(SMP_CACHE_BYTES - 1))
   28.44 +#define SKB_MAX_ORDER(X, ORDER)	(((PAGE_SIZE << (ORDER)) - (X) - \
   28.45 +				  sizeof(struct skb_shared_info)) & \
   28.46 +				  ~(SMP_CACHE_BYTES - 1))
   28.47 +#define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
   28.48 +#define SKB_MAX_ALLOC		(SKB_MAX_ORDER(0, 2))
   28.49 +
   28.50 +/* A. Checksumming of received packets by device.
   28.51 + *
   28.52 + *	NONE: device failed to checksum this packet.
   28.53 + *		skb->csum is undefined.
   28.54 + *
   28.55 + *	UNNECESSARY: device parsed packet and wouldbe verified checksum.
   28.56 + *		skb->csum is undefined.
   28.57 + *	      It is bad option, but, unfortunately, many of vendors do this.
   28.58 + *	      Apparently with secret goal to sell you new device, when you
   28.59 + *	      will add new protocol to your host. F.e. IPv6. 8)
   28.60 + *
   28.61 + *	HW: the most generic way. Device supplied checksum of _all_
   28.62 + *	    the packet as seen by netif_rx in skb->csum.
   28.63 + *	    NOTE: Even if device supports only some protocols, but
   28.64 + *	    is able to produce some skb->csum, it MUST use HW,
   28.65 + *	    not UNNECESSARY.
   28.66 + *
   28.67 + * B. Checksumming on output.
   28.68 + *
   28.69 + *	NONE: skb is checksummed by protocol or csum is not required.
   28.70 + *
   28.71 + *	HW: device is required to csum packet as seen by hard_start_xmit
   28.72 + *	from skb->h.raw to the end and to record the checksum
   28.73 + *	at skb->h.raw+skb->csum.
   28.74 + *
   28.75 + *	Device must show its capabilities in dev->features, set
   28.76 + *	at device setup time.
   28.77 + *	NETIF_F_HW_CSUM	- it is clever device, it is able to checksum
   28.78 + *			  everything.
   28.79 + *	NETIF_F_NO_CSUM - loopback or reliable single hop media.
   28.80 + *	NETIF_F_IP_CSUM - device is dumb. It is able to csum only
   28.81 + *			  TCP/UDP over IPv4. Sigh. Vendors like this
   28.82 + *			  way by an unknown reason. Though, see comment above
   28.83 + *			  about CHECKSUM_UNNECESSARY. 8)
   28.84 + *
   28.85 + *	Any questions? No questions, good. 		--ANK
   28.86 + */
   28.87 +
   28.88 +#ifdef __i386__
   28.89 +#define NET_CALLER(arg) (*(((void **)&arg) - 1))
   28.90 +#else
   28.91 +#define NET_CALLER(arg) __builtin_return_address(0)
   28.92 +#endif
   28.93 +
   28.94 +#ifdef CONFIG_NETFILTER
   28.95 +struct nf_conntrack {
   28.96 +	atomic_t use;
   28.97 +	void (*destroy)(struct nf_conntrack *);
   28.98 +};
   28.99 +
  28.100 +struct nf_ct_info {
  28.101 +	struct nf_conntrack *master;
  28.102 +};
  28.103 +
  28.104 +#ifdef CONFIG_BRIDGE_NETFILTER
  28.105 +struct nf_bridge_info {
  28.106 +	atomic_t use;
  28.107 +	struct net_device *physindev;
  28.108 +	struct net_device *physoutdev;
  28.109 +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
  28.110 +	struct net_device *netoutdev;
  28.111 +#endif
  28.112 +	unsigned int mask;
  28.113 +	unsigned long data[32 / sizeof(unsigned long)];
  28.114 +};
  28.115 +#endif
  28.116 +
  28.117 +#endif
  28.118 +
  28.119 +struct sk_buff_head {
  28.120 +	/* These two members must be first. */
  28.121 +	struct sk_buff	*next;
  28.122 +	struct sk_buff	*prev;
  28.123 +
  28.124 +	__u32		qlen;
  28.125 +	spinlock_t	lock;
  28.126 +};
  28.127 +
  28.128 +struct sk_buff;
  28.129 +
  28.130 +/* To allow 64K frame to be packed as single skb without frag_list */
  28.131 +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
  28.132 +
  28.133 +typedef struct skb_frag_struct skb_frag_t;
  28.134 +
  28.135 +struct skb_frag_struct {
  28.136 +	struct page *page;
  28.137 +	__u16 page_offset;
  28.138 +	__u16 size;
  28.139 +};
  28.140 +
  28.141 +/* This data is invariant across clones and lives at
  28.142 + * the end of the header data, ie. at skb->end.
  28.143 + */
  28.144 +struct skb_shared_info {
  28.145 +	atomic_t	dataref;
  28.146 +	unsigned int	nr_frags;
  28.147 +	unsigned short	tso_size;
  28.148 +	unsigned short	tso_segs;
  28.149 +	struct sk_buff	*frag_list;
  28.150 +	skb_frag_t	frags[MAX_SKB_FRAGS];
  28.151 +};
  28.152 +
  28.153 +/** 
  28.154 + *	struct sk_buff - socket buffer
  28.155 + *	@next: Next buffer in list
  28.156 + *	@prev: Previous buffer in list
  28.157 + *	@list: List we are on
  28.158 + *	@sk: Socket we are owned by
  28.159 + *	@stamp: Time we arrived
  28.160 + *	@dev: Device we arrived on/are leaving by
  28.161 + *      @real_dev: The real device we are using
  28.162 + *	@h: Transport layer header
  28.163 + *	@nh: Network layer header
  28.164 + *	@mac: Link layer header
  28.165 + *	@dst: FIXME: Describe this field
  28.166 + *	@cb: Control buffer. Free for use by every layer. Put private vars here
  28.167 + *	@len: Length of actual data
  28.168 + *	@data_len: Data length
  28.169 + *	@mac_len: Length of link layer header
  28.170 + *	@csum: Checksum
  28.171 + *	@__unused: Dead field, may be reused
  28.172 + *	@cloned: Head may be cloned (check refcnt to be sure)
  28.173 + *	@pkt_type: Packet class
  28.174 + *	@ip_summed: Driver fed us an IP checksum
  28.175 + *	@priority: Packet queueing priority
  28.176 + *	@users: User count - see {datagram,tcp}.c
  28.177 + *	@protocol: Packet protocol from driver
  28.178 + *	@security: Security level of packet
  28.179 + *	@truesize: Buffer size 
  28.180 + *	@head: Head of buffer
  28.181 + *	@data: Data head pointer
  28.182 + *	@tail: Tail pointer
  28.183 + *	@end: End pointer
  28.184 + *	@destructor: Destruct function
  28.185 + *	@nfmark: Can be used for communication between hooks
  28.186 + *	@nfcache: Cache info
  28.187 + *	@nfct: Associated connection, if any
  28.188 + *	@nf_debug: Netfilter debugging
  28.189 + *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  28.190 + *      @private: Data which is private to the HIPPI implementation
  28.191 + *	@tc_index: Traffic control index
  28.192 + */
  28.193 +
  28.194 +struct sk_buff {
  28.195 +	/* These two members must be first. */
  28.196 +	struct sk_buff		*next;
  28.197 +	struct sk_buff		*prev;
  28.198 +
  28.199 +	struct sk_buff_head	*list;
  28.200 +	struct sock		*sk;
  28.201 +	struct timeval		stamp;
  28.202 +	struct net_device	*dev;
  28.203 +	struct net_device	*real_dev;
  28.204 +
  28.205 +	union {
  28.206 +		struct tcphdr	*th;
  28.207 +		struct udphdr	*uh;
  28.208 +		struct icmphdr	*icmph;
  28.209 +		struct igmphdr	*igmph;
  28.210 +		struct iphdr	*ipiph;
  28.211 +		struct ipv6hdr	*ipv6h;
  28.212 +		unsigned char	*raw;
  28.213 +	} h;
  28.214 +
  28.215 +	union {
  28.216 +		struct iphdr	*iph;
  28.217 +		struct ipv6hdr	*ipv6h;
  28.218 +		struct arphdr	*arph;
  28.219 +		unsigned char	*raw;
  28.220 +	} nh;
  28.221 +
  28.222 +	union {
  28.223 +	  	struct ethhdr	*ethernet;
  28.224 +	  	unsigned char 	*raw;
  28.225 +	} mac;
  28.226 +
  28.227 +	struct  dst_entry	*dst;
  28.228 +	struct	sec_path	*sp;
  28.229 +
  28.230 +	/*
  28.231 +	 * This is the control buffer. It is free to use for every
  28.232 +	 * layer. Please put your private variables there. If you
  28.233 +	 * want to keep them across layers you have to do a skb_clone()
  28.234 +	 * first. This is owned by whoever has the skb queued ATM.
  28.235 +	 */
  28.236 +	char			cb[48];
  28.237 +
  28.238 +	unsigned int		len,
  28.239 +				data_len,
  28.240 +				mac_len,
  28.241 +				csum;
  28.242 +	unsigned char		local_df,
  28.243 +				cloned,
  28.244 +				pkt_type,
  28.245 +				ip_summed;
  28.246 +	__u32			priority;
  28.247 +	unsigned short		protocol,
  28.248 +				security;
  28.249 +
  28.250 +	void			(*destructor)(struct sk_buff *skb);
  28.251 +#ifdef CONFIG_NETFILTER
  28.252 +        unsigned long		nfmark;
  28.253 +	__u32			nfcache;
  28.254 +	struct nf_ct_info	*nfct;
  28.255 +#ifdef CONFIG_NETFILTER_DEBUG
  28.256 +        unsigned int		nf_debug;
  28.257 +#endif
  28.258 +#ifdef CONFIG_BRIDGE_NETFILTER
  28.259 +	struct nf_bridge_info	*nf_bridge;
  28.260 +#endif
  28.261 +#endif /* CONFIG_NETFILTER */
  28.262 +#if defined(CONFIG_HIPPI)
  28.263 +	union {
  28.264 +		__u32		ifield;
  28.265 +	} private;
  28.266 +#endif
  28.267 +#ifdef CONFIG_NET_SCHED
  28.268 +       __u32			tc_index;               /* traffic control index */
  28.269 +#endif
  28.270 +
  28.271 +	/* These elements must be at the end, see alloc_skb() for details.  */
  28.272 +	unsigned int		truesize;
  28.273 +	atomic_t		users;
  28.274 +	unsigned char		*head,
  28.275 +				*data,
  28.276 +				*tail,
  28.277 +				*end;
  28.278 +};
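The cb[] control buffer described above is conventionally accessed through a per-layer cast macro; a small sketch of that idiom (the private structure and macro are hypothetical, and must fit in the 48 bytes):

#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* Sketch: per-layer private state carried inside skb->cb. */
struct my_skb_cb {
	unsigned long	enqueue_time;
	int		state;
};

#define MY_SKB_CB(skb)	((struct my_skb_cb *)&((skb)->cb[0]))

static void my_stamp(struct sk_buff *skb)
{
	MY_SKB_CB(skb)->enqueue_time = jiffies;
	MY_SKB_CB(skb)->state = 0;
}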
  28.279 +
  28.280 +#ifdef __KERNEL__
  28.281 +/*
  28.282 + *	Handling routines are only of interest to the kernel
  28.283 + */
  28.284 +#include <linux/slab.h>
  28.285 +
  28.286 +#include <asm/system.h>
  28.287 +
  28.288 +extern void	       __kfree_skb(struct sk_buff *skb);
  28.289 +extern struct sk_buff *alloc_skb(unsigned int size, int priority);
  28.290 +extern void	       kfree_skbmem(struct sk_buff *skb);
  28.291 +extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority);
  28.292 +extern struct sk_buff *skb_copy(const struct sk_buff *skb, int priority);
  28.293 +extern struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask);
  28.294 +extern int	       pskb_expand_head(struct sk_buff *skb,
  28.295 +					int nhead, int ntail, int gfp_mask);
  28.296 +extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
  28.297 +					    unsigned int headroom);
  28.298 +extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
  28.299 +				       int newheadroom, int newtailroom,
  28.300 +				       int priority);
  28.301 +extern struct sk_buff *		skb_pad(struct sk_buff *skb, int pad);
  28.302 +#define dev_kfree_skb(a)	kfree_skb(a)
  28.303 +extern void	      skb_over_panic(struct sk_buff *skb, int len,
  28.304 +				     void *here);
  28.305 +extern void	      skb_under_panic(struct sk_buff *skb, int len,
  28.306 +				      void *here);
  28.307 +
  28.308 +/* Internal */
  28.309 +#define skb_shinfo(SKB)		((struct skb_shared_info *)((SKB)->end))
  28.310 +
  28.311 +/**
  28.312 + *	skb_queue_empty - check if a queue is empty
  28.313 + *	@list: queue head
  28.314 + *
  28.315 + *	Returns true if the queue is empty, false otherwise.
  28.316 + */
  28.317 +static inline int skb_queue_empty(const struct sk_buff_head *list)
  28.318 +{
  28.319 +	return list->next == (struct sk_buff *)list;
  28.320 +}
  28.321 +
  28.322 +/**
  28.323 + *	skb_get - reference buffer
  28.324 + *	@skb: buffer to reference
  28.325 + *
  28.326 + *	Makes another reference to a socket buffer and returns a pointer
  28.327 + *	to the buffer.
  28.328 + */
  28.329 +static inline struct sk_buff *skb_get(struct sk_buff *skb)
  28.330 +{
  28.331 +	atomic_inc(&skb->users);
  28.332 +	return skb;
  28.333 +}
  28.334 +
  28.335 +/*
   28.336 + * If users == 1, we are the only owner and can avoid a redundant
   28.337 + * atomic change.
  28.338 + */
  28.339 +
  28.340 +/**
  28.341 + *	kfree_skb - free an sk_buff
  28.342 + *	@skb: buffer to free
  28.343 + *
  28.344 + *	Drop a reference to the buffer and free it if the usage count has
  28.345 + *	hit zero.
  28.346 + */
  28.347 +static inline void kfree_skb(struct sk_buff *skb)
  28.348 +{
  28.349 +	if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users))
  28.350 +		__kfree_skb(skb);
  28.351 +}
  28.352 +
  28.353 +/* Use this if you didn't touch the skb state [for fast switching] */
  28.354 +static inline void kfree_skb_fast(struct sk_buff *skb)
  28.355 +{
  28.356 +	if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users))
  28.357 +		kfree_skbmem(skb);
  28.358 +}
  28.359 +
  28.360 +/**
  28.361 + *	skb_cloned - is the buffer a clone
  28.362 + *	@skb: buffer to check
  28.363 + *
  28.364 + *	Returns true if the buffer was generated with skb_clone() and is
  28.365 + *	one of multiple shared copies of the buffer. Cloned buffers are
  28.366 + *	shared data so must not be written to under normal circumstances.
  28.367 + */
  28.368 +static inline int skb_cloned(const struct sk_buff *skb)
  28.369 +{
  28.370 +	return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1;
  28.371 +}
  28.372 +
  28.373 +/**
  28.374 + *	skb_shared - is the buffer shared
  28.375 + *	@skb: buffer to check
  28.376 + *
  28.377 + *	Returns true if more than one person has a reference to this
  28.378 + *	buffer.
  28.379 + */
  28.380 +static inline int skb_shared(const struct sk_buff *skb)
  28.381 +{
  28.382 +	return atomic_read(&skb->users) != 1;
  28.383 +}
  28.384 +
  28.385 +/**
  28.386 + *	skb_share_check - check if buffer is shared and if so clone it
  28.387 + *	@skb: buffer to check
  28.388 + *	@pri: priority for memory allocation
  28.389 + *
  28.390 + *	If the buffer is shared the buffer is cloned and the old copy
  28.391 + *	drops a reference. A new clone with a single reference is returned.
   28.392 + *	If the buffer is not shared the original buffer is returned. When
   28.393 + *	called from interrupt context or with spinlocks held, @pri must
   28.394 + *	be %GFP_ATOMIC.
  28.395 + *
  28.396 + *	NULL is returned on a memory allocation failure.
  28.397 + */
  28.398 +static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri)
  28.399 +{
  28.400 +	might_sleep_if(pri & __GFP_WAIT);
  28.401 +	if (skb_shared(skb)) {
  28.402 +		struct sk_buff *nskb = skb_clone(skb, pri);
  28.403 +		kfree_skb(skb);
  28.404 +		skb = nskb;
  28.405 +	}
  28.406 +	return skb;
  28.407 +}
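A typical caller pattern for skb_share_check(), sketched under the assumption of a softirq/atomic receive context (hence GFP_ATOMIC); the handler itself is hypothetical:

#include <linux/skbuff.h>
#include <linux/errno.h>

/* Sketch: take a private reference before modifying a buffer that may
 * be shared. If NULL is returned, the original reference has already
 * been dropped inside skb_share_check(). */
static int my_rcv(struct sk_buff *skb)
{
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	/* ... now safe to modify skb metadata ... */
	kfree_skb(skb);
	return 0;
}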
  28.408 +
  28.409 +/*
  28.410 + *	Copy shared buffers into a new sk_buff. We effectively do COW on
   28.411 + *	packets to handle cases where we have both a local reader and a
   28.412 + *	forwarding path, and a couple of other messy ones. The usual case is
   28.413 + *	tcpdumping a packet that's being forwarded.
  28.414 + */
  28.415 +
  28.416 +/**
  28.417 + *	skb_unshare - make a copy of a shared buffer
  28.418 + *	@skb: buffer to check
  28.419 + *	@pri: priority for memory allocation
  28.420 + *
  28.421 + *	If the socket buffer is a clone then this function creates a new
  28.422 + *	copy of the data, drops a reference count on the old copy and returns
  28.423 + *	the new copy with the reference count at 1. If the buffer is not a clone
  28.424 + *	the original buffer is returned. When called with a spinlock held or
   28.425 + *	from interrupt state, @pri must be %GFP_ATOMIC.
  28.426 + *
  28.427 + *	%NULL is returned on a memory allocation failure.
  28.428 + */
  28.429 +static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri)
  28.430 +{
  28.431 +	might_sleep_if(pri & __GFP_WAIT);
  28.432 +	if (skb_cloned(skb)) {
  28.433 +		struct sk_buff *nskb = skb_copy(skb, pri);
  28.434 +		kfree_skb(skb);	/* Free our shared copy */
  28.435 +		skb = nskb;
  28.436 +	}
  28.437 +	return skb;
  28.438 +}
  28.439 +
  28.440 +/**
  28.441 + *	skb_peek
  28.442 + *	@list_: list to peek at
  28.443 + *
  28.444 + *	Peek an &sk_buff. Unlike most other operations you _MUST_
  28.445 + *	be careful with this one. A peek leaves the buffer on the
  28.446 + *	list and someone else may run off with it. You must hold
  28.447 + *	the appropriate locks or have a private queue to do this.
  28.448 + *
  28.449 + *	Returns %NULL for an empty list or a pointer to the head element.
  28.450 + *	The reference count is not incremented and the reference is therefore
  28.451 + *	volatile. Use with caution.
  28.452 + */
  28.453 +static inline struct sk_buff *skb_peek(struct sk_buff_head *list_)
  28.454 +{
  28.455 +	struct sk_buff *list = ((struct sk_buff *)list_)->next;
  28.456 +	if (list == (struct sk_buff *)list_)
  28.457 +		list = NULL;
  28.458 +	return list;
  28.459 +}
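A sketch of the locking discipline the comment above calls for: hold the queue lock for as long as the peeked pointer is used.

#include <linux/skbuff.h>
#include <linux/spinlock.h>

/* Sketch: report the length of the head packet without dequeueing it.
 * The pointer returned by skb_peek() is only stable while we hold
 * the queue lock. */
static unsigned int peek_head_len(struct sk_buff_head *q)
{
	struct sk_buff *skb;
	unsigned long flags;
	unsigned int len = 0;

	spin_lock_irqsave(&q->lock, flags);
	skb = skb_peek(q);
	if (skb)
		len = skb->len;
	spin_unlock_irqrestore(&q->lock, flags);

	return len;
}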
  28.460 +
  28.461 +/**
  28.462 + *	skb_peek_tail
  28.463 + *	@list_: list to peek at
  28.464 + *
  28.465 + *	Peek an &sk_buff. Unlike most other operations you _MUST_
  28.466 + *	be careful with this one. A peek leaves the buffer on the
  28.467 + *	list and someone else may run off with it. You must hold
  28.468 + *	the appropriate locks or have a private queue to do this.
  28.469 + *
  28.470 + *	Returns %NULL for an empty list or a pointer to the tail element.
  28.471 + *	The reference count is not incremented and the reference is therefore
  28.472 + *	volatile. Use with caution.
  28.473 + */
  28.474 +static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_)
  28.475 +{
  28.476 +	struct sk_buff *list = ((struct sk_buff *)list_)->prev;
  28.477 +	if (list == (struct sk_buff *)list_)
  28.478 +		list = NULL;
  28.479 +	return list;
  28.480 +}
  28.481 +
  28.482 +/**
  28.483 + *	skb_queue_len	- get queue length
  28.484 + *	@list_: list to measure
  28.485 + *
  28.486 + *	Return the length of an &sk_buff queue.
  28.487 + */
  28.488 +static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
  28.489 +{
  28.490 +	return list_->qlen;
  28.491 +}
  28.492 +
  28.493 +static inline void skb_queue_head_init(struct sk_buff_head *list)
  28.494 +{
  28.495 +	spin_lock_init(&list->lock);
  28.496 +	list->prev = list->next = (struct sk_buff *)list;
  28.497 +	list->qlen = 0;
  28.498 +}
  28.499 +
  28.500 +/*
  28.501 + *	Insert an sk_buff at the start of a list.
  28.502 + *
  28.503 + *	The "__skb_xxxx()" functions are the non-atomic ones that
  28.504 + *	can only be called with interrupts disabled.
  28.505 + */
  28.506 +
  28.507 +/**
  28.508 + *	__skb_queue_head - queue a buffer at the list head
  28.509 + *	@list: list to use
  28.510 + *	@newsk: buffer to queue
  28.511 + *
  28.512 + *	Queue a buffer at the start of a list. This function takes no locks
  28.513 + *	and you must therefore hold required locks before calling it.
  28.514 + *
  28.515 + *	A buffer cannot be placed on two lists at the same time.
  28.516 + */
  28.517 +extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
  28.518 +static inline void __skb_queue_head(struct sk_buff_head *list,
  28.519 +				    struct sk_buff *newsk)
  28.520 +{
  28.521 +	struct sk_buff *prev, *next;
  28.522 +
  28.523 +	newsk->list = list;
  28.524 +	list->qlen++;
  28.525 +	prev = (struct sk_buff *)list;
  28.526 +	next = prev->next;
  28.527 +	newsk->next = next;
  28.528 +	newsk->prev = prev;
  28.529 +	next->prev  = prev->next = newsk;
  28.530 +}
  28.531 +
  28.532 +/**
  28.533 + *	__skb_queue_tail - queue a buffer at the list tail
  28.534 + *	@list: list to use
  28.535 + *	@newsk: buffer to queue
  28.536 + *
  28.537 + *	Queue a buffer at the end of a list. This function takes no locks
  28.538 + *	and you must therefore hold required locks before calling it.
  28.539 + *
  28.540 + *	A buffer cannot be placed on two lists at the same time.
  28.541 + */
  28.542 +extern void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);
  28.543 +static inline void __skb_queue_tail(struct sk_buff_head *list,
  28.544 +				   struct sk_buff *newsk)
  28.545 +{
  28.546 +	struct sk_buff *prev, *next;
  28.547 +
  28.548 +	newsk->list = list;
  28.549 +	list->qlen++;
  28.550 +	next = (struct sk_buff *)list;
  28.551 +	prev = next->prev;
  28.552 +	newsk->next = next;
  28.553 +	newsk->prev = prev;
  28.554 +	next->prev  = prev->next = newsk;
  28.555 +}
  28.556 +
  28.557 +
  28.558 +/**
  28.559 + *	__skb_dequeue - remove from the head of the queue
  28.560 + *	@list: list to dequeue from
  28.561 + *
  28.562 + *	Remove the head of the list. This function does not take any locks
  28.563 + *	so must be used with appropriate locks held only. The head item is
  28.564 + *	returned or %NULL if the list is empty.
  28.565 + */
  28.566 +extern struct sk_buff *skb_dequeue(struct sk_buff_head *list);
  28.567 +static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
  28.568 +{
  28.569 +	struct sk_buff *next, *prev, *result;
  28.570 +
  28.571 +	prev = (struct sk_buff *) list;
  28.572 +	next = prev->next;
  28.573 +	result = NULL;
  28.574 +	if (next != prev) {
  28.575 +		result	     = next;
  28.576 +		next	     = next->next;
  28.577 +		list->qlen--;
  28.578 +		next->prev   = prev;
  28.579 +		prev->next   = next;
  28.580 +		result->next = result->prev = NULL;
  28.581 +		result->list = NULL;
  28.582 +	}
  28.583 +	return result;
  28.584 +}
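Most code uses the locked wrappers declared above rather than the __skb_* forms; a minimal producer/consumer sketch:

#include <linux/skbuff.h>

static struct sk_buff_head my_queue;

static void my_queue_init(void)
{
	skb_queue_head_init(&my_queue);
}

/* Producer: skb_queue_tail() takes the queue lock itself. */
static void my_enqueue(struct sk_buff *skb)
{
	skb_queue_tail(&my_queue, skb);
}

/* Consumer: returns NULL when the queue is empty. */
static struct sk_buff *my_dequeue(void)
{
	return skb_dequeue(&my_queue);
}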
  28.585 +
  28.586 +
  28.587 +/*
  28.588 + *	Insert a packet on a list.
  28.589 + */
  28.590 +extern void        skb_insert(struct sk_buff *old, struct sk_buff *newsk);
  28.591 +static inline void __skb_insert(struct sk_buff *newsk,
  28.592 +				struct sk_buff *prev, struct sk_buff *next,
  28.593 +				struct sk_buff_head *list)
  28.594 +{
  28.595 +	newsk->next = next;
  28.596 +	newsk->prev = prev;
  28.597 +	next->prev  = prev->next = newsk;
  28.598 +	newsk->list = list;
  28.599 +	list->qlen++;
  28.600 +}
  28.601 +
  28.602 +/*
  28.603 + *	Place a packet after a given packet in a list.
  28.604 + */
  28.605 +extern void	   skb_append(struct sk_buff *old, struct sk_buff *newsk);
  28.606 +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk)
  28.607 +{
  28.608 +	__skb_insert(newsk, old, old->next, old->list);
  28.609 +}
  28.610 +
  28.611 +/*
   28.612 + * Remove an sk_buff from a list. _Must_ be called atomically, and with
   28.613 + * the list known.
  28.614 + */
  28.615 +extern void	   skb_unlink(struct sk_buff *skb);
  28.616 +static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
  28.617 +{
  28.618 +	struct sk_buff *next, *prev;
  28.619 +
  28.620 +	list->qlen--;
  28.621 +	next	   = skb->next;
  28.622 +	prev	   = skb->prev;
  28.623 +	skb->next  = skb->prev = NULL;
  28.624 +	skb->list  = NULL;
  28.625 +	next->prev = prev;
  28.626 +	prev->next = next;
  28.627 +}
  28.628 +
  28.629 +
  28.630 +/* XXX: more streamlined implementation */
  28.631 +
  28.632 +/**
  28.633 + *	__skb_dequeue_tail - remove from the tail of the queue
  28.634 + *	@list: list to dequeue from
  28.635 + *
  28.636 + *	Remove the tail of the list. This function does not take any locks
  28.637 + *	so must be used with appropriate locks held only. The tail item is
  28.638 + *	returned or %NULL if the list is empty.
  28.639 + */
  28.640 +extern struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);
  28.641 +static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
  28.642 +{
  28.643 +	struct sk_buff *skb = skb_peek_tail(list);
  28.644 +	if (skb)
  28.645 +		__skb_unlink(skb, list);
  28.646 +	return skb;
  28.647 +}
  28.648 +
  28.649 +
  28.650 +static inline int skb_is_nonlinear(const struct sk_buff *skb)
  28.651 +{
  28.652 +	return skb->data_len;
  28.653 +}
  28.654 +
  28.655 +static inline unsigned int skb_headlen(const struct sk_buff *skb)
  28.656 +{
  28.657 +	return skb->len - skb->data_len;
  28.658 +}
  28.659 +
  28.660 +static inline int skb_pagelen(const struct sk_buff *skb)
  28.661 +{
  28.662 +	int i, len = 0;
  28.663 +
  28.664 +	for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
  28.665 +		len += skb_shinfo(skb)->frags[i].size;
  28.666 +	return len + skb_headlen(skb);
  28.667 +}
  28.668 +
  28.669 +static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
  28.670 +{
  28.671 +	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  28.672 +	frag->page = page;
  28.673 +	frag->page_offset = off;
  28.674 +	frag->size = size;
  28.675 +	skb_shinfo(skb)->nr_frags = i+1;
  28.676 +}
  28.677 +
  28.678 +#define SKB_PAGE_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->nr_frags)
  28.679 +#define SKB_FRAG_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->frag_list)
  28.680 +#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))
  28.681 +
  28.682 +/*
  28.683 + *	Add data to an sk_buff
  28.684 + */
  28.685 +static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
  28.686 +{
  28.687 +	unsigned char *tmp = skb->tail;
  28.688 +	SKB_LINEAR_ASSERT(skb);
  28.689 +	skb->tail += len;
  28.690 +	skb->len  += len;
  28.691 +	return tmp;
  28.692 +}
  28.693 +
  28.694 +/**
  28.695 + *	skb_put - add data to a buffer
  28.696 + *	@skb: buffer to use
  28.697 + *	@len: amount of data to add
  28.698 + *
  28.699 + *	This function extends the used data area of the buffer. If this would
  28.700 + *	exceed the total buffer size the kernel will panic. A pointer to the
  28.701 + *	first byte of the extra data is returned.
  28.702 + */
  28.703 +static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
  28.704 +{
  28.705 +	unsigned char *tmp = skb->tail;
  28.706 +	SKB_LINEAR_ASSERT(skb);
  28.707 +	skb->tail += len;
  28.708 +	skb->len  += len;
  28.709 +	if (unlikely(skb->tail>skb->end))
  28.710 +		skb_over_panic(skb, len, current_text_addr());
  28.711 +	return tmp;
  28.712 +}
  28.713 +
  28.714 +static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
  28.715 +{
  28.716 +	skb->data -= len;
  28.717 +	skb->len  += len;
  28.718 +	return skb->data;
  28.719 +}
  28.720 +
  28.721 +/**
  28.722 + *	skb_push - add data to the start of a buffer
  28.723 + *	@skb: buffer to use
  28.724 + *	@len: amount of data to add
  28.725 + *
  28.726 + *	This function extends the used data area of the buffer at the buffer
  28.727 + *	start. If this would exceed the total buffer headroom the kernel will
  28.728 + *	panic. A pointer to the first byte of the extra data is returned.
  28.729 + */
  28.730 +static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
  28.731 +{
  28.732 +	skb->data -= len;
  28.733 +	skb->len  += len;
  28.734 +	if (unlikely(skb->data<skb->head))
  28.735 +		skb_under_panic(skb, len, current_text_addr());
  28.736 +	return skb->data;
  28.737 +}
  28.738 +
  28.739 +static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
  28.740 +{
  28.741 +	skb->len -= len;
  28.742 +	BUG_ON(skb->len < skb->data_len);
  28.743 +	return skb->data += len;
  28.744 +}
  28.745 +
  28.746 +/**
  28.747 + *	skb_pull - remove data from the start of a buffer
  28.748 + *	@skb: buffer to use
  28.749 + *	@len: amount of data to remove
  28.750 + *
  28.751 + *	This function removes data from the start of a buffer, returning
  28.752 + *	the memory to the headroom. A pointer to the next data in the buffer
  28.753 + *	is returned. Once the data has been pulled future pushes will overwrite
  28.754 + *	the old data.
  28.755 + */
  28.756 +static inline unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
  28.757 +{
  28.758 +	return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
  28.759 +}
  28.760 +
  28.761 +extern unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta);
  28.762 +
  28.763 +static inline unsigned char *__pskb_pull(struct sk_buff *skb, unsigned int len)
  28.764 +{
  28.765 +	if (len > skb_headlen(skb) &&
  28.766 +	    !__pskb_pull_tail(skb, len-skb_headlen(skb)))
  28.767 +		return NULL;
  28.768 +	skb->len -= len;
  28.769 +	return skb->data += len;
  28.770 +}
  28.771 +
  28.772 +static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len)
  28.773 +{
  28.774 +	return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
  28.775 +}
  28.776 +
  28.777 +static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
  28.778 +{
  28.779 +	if (likely(len <= skb_headlen(skb)))
  28.780 +		return 1;
  28.781 +	if (unlikely(len > skb->len))
  28.782 +		return 0;
  28.783 +	return __pskb_pull_tail(skb, len-skb_headlen(skb)) != NULL;
  28.784 +}
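pskb_may_pull() is the usual guard before dereferencing a header that may still live in page fragments; a sketch of a receive handler using it (it assumes skb->nh has already been set by the caller, as ip_rcv() would have done):

#include <linux/skbuff.h>
#include <linux/ip.h>

/* Sketch: ensure at least the basic IPv4 header is in the linear area
 * before touching it. */
static int peek_ip_protocol(struct sk_buff *skb)
{
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		return -1;		/* truncated packet */

	return skb->nh.iph->protocol;
}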
  28.785 +
  28.786 +/**
  28.787 + *	skb_headroom - bytes at buffer head
  28.788 + *	@skb: buffer to check
  28.789 + *
  28.790 + *	Return the number of bytes of free space at the head of an &sk_buff.
  28.791 + */
  28.792 +static inline int skb_headroom(const struct sk_buff *skb)
  28.793 +{
  28.794 +	return skb->data - skb->head;
  28.795 +}
  28.796 +
  28.797 +/**
  28.798 + *	skb_tailroom - bytes at buffer end
  28.799 + *	@skb: buffer to check
  28.800 + *
  28.801 + *	Return the number of bytes of free space at the tail of an sk_buff
  28.802 + */
  28.803 +static inline int skb_tailroom(const struct sk_buff *skb)
  28.804 +{
  28.805 +	return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
  28.806 +}
  28.807 +
  28.808 +/**
  28.809 + *	skb_reserve - adjust headroom
  28.810 + *	@skb: buffer to alter
  28.811 + *	@len: bytes to move
  28.812 + *
  28.813 + *	Increase the headroom of an empty &sk_buff by reducing the tail
  28.814 + *	room. This is only allowed for an empty buffer.
  28.815 + */
  28.816 +static inline void skb_reserve(struct sk_buff *skb, unsigned int len)
  28.817 +{
  28.818 +	skb->data += len;
  28.819 +	skb->tail += len;
  28.820 +}
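Taken together, skb_reserve(), skb_put() and skb_push() give the usual way to lay out an outgoing packet; a sketch with hypothetical sizes and payload (checksum and link-layer header handling omitted):

#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/ip.h>
#include <linux/if_ether.h>

/* Sketch: allocate a linear skb, keep headroom for the headers, append
 * the payload, then push the IP header in front of it. */
static struct sk_buff *build_packet(const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(ETH_HLEN + sizeof(struct iphdr) + len, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room so headers can later be pushed in front of the data. */
	skb_reserve(skb, ETH_HLEN + sizeof(struct iphdr));

	/* Append the payload at the tail. */
	memcpy(skb_put(skb, len), payload, len);

	/* Prepend the IP header into the reserved headroom. */
	skb->nh.iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	return skb;	/* the Ethernet header would be pushed later */
}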
  28.821 +
  28.822 +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc);
  28.823 +
  28.824 +static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
  28.825 +{
  28.826 +	if (!skb->data_len) {
  28.827 +		skb->len  = len;
  28.828 +		skb->tail = skb->data + len;
  28.829 +	} else
  28.830 +		___pskb_trim(skb, len, 0);
  28.831 +}
  28.832 +
  28.833 +/**
  28.834 + *	skb_trim - remove end from a buffer
  28.835 + *	@skb: buffer to alter
  28.836 + *	@len: new length
  28.837 + *
  28.838 + *	Cut the length of a buffer down by removing data from the tail. If
  28.839 + *	the buffer is already under the length specified it is not modified.
  28.840 + */
  28.841 +static inline void skb_trim(struct sk_buff *skb, unsigned int len)
  28.842 +{
  28.843 +	if (skb->len > len)
  28.844 +		__skb_trim(skb, len);
  28.845 +}
  28.846 +
  28.847 +
  28.848 +static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
  28.849 +{
  28.850 +	if (!skb->data_len) {
  28.851 +		skb->len  = len;
  28.852 +		skb->tail = skb->data+len;
  28.853 +		return 0;
  28.854 +	}
  28.855 +	return ___pskb_trim(skb, len, 1);
  28.856 +}
  28.857 +
  28.858 +static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
  28.859 +{
  28.860 +	return (len < skb->len) ? __pskb_trim(skb, len) : 0;
  28.861 +}
  28.862 +
  28.863 +/**
  28.864 + *	skb_orphan - orphan a buffer
  28.865 + *	@skb: buffer to orphan
  28.866 + *
  28.867 + *	If a buffer currently has an owner then we call the owner's
  28.868 + *	destructor function and make the @skb unowned. The buffer continues
  28.869 + *	to exist but is no longer charged to its former owner.
  28.870 + */
  28.871 +static inline void skb_orphan(struct sk_buff *skb)
  28.872 +{
  28.873 +	if (skb->destructor)
  28.874 +		skb->destructor(skb);
  28.875 +	skb->destructor = NULL;
  28.876 +	skb->sk		= NULL;
  28.877 +}
  28.878 +
  28.879 +/**
  28.880 + *	__skb_queue_purge - empty a list
  28.881 + *	@list: list to empty
  28.882 + *
  28.883 + *	Delete all buffers on an &sk_buff list. Each buffer is removed from
  28.884 + *	the list and one reference dropped. This function does not take the
  28.885 + *	list lock and the caller must hold the relevant locks to use it.
  28.886 + */
  28.887 +extern void skb_queue_purge(struct sk_buff_head *list);
  28.888 +static inline void __skb_queue_purge(struct sk_buff_head *list)
  28.889 +{
  28.890 +	struct sk_buff *skb;
  28.891 +	while ((skb = __skb_dequeue(list)) != NULL)
  28.892 +		kfree_skb(skb);
  28.893 +}
  28.894 +
  28.895 +/**
  28.896 + *	__dev_alloc_skb - allocate an skbuff for sending
  28.897 + *	@length: length to allocate
  28.898 + *	@gfp_mask: get_free_pages mask, passed to alloc_skb
  28.899 + *
  28.900 + *	Allocate a new &sk_buff and assign it a usage count of one. The
  28.901 + *	buffer has unspecified headroom built in. Users should allocate
  28.902 + *	the headroom they think they need without accounting for the
  28.903 + *	built in space. The built in space is used for optimisations.
  28.904 + *
   28.905 + *	%NULL is returned if there is no free memory.
  28.906 + */
  28.907 +static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
  28.908 +					      int gfp_mask)
  28.909 +{
  28.910 +	struct sk_buff *skb;
  28.911 +#ifdef CONFIG_PAGESIZED_SKBS
  28.912 +	length = max(length, (unsigned int)(PAGE_SIZE - 16));
  28.913 +#endif
  28.914 +	skb = alloc_skb(length + 16, gfp_mask);
  28.915 +	if (likely(skb))
  28.916 +		skb_reserve(skb, 16);
  28.917 +	return skb;
  28.918 +}
  28.919 +
  28.920 +/**
  28.921 + *	dev_alloc_skb - allocate an skbuff for sending
  28.922 + *	@length: length to allocate
  28.923 + *
  28.924 + *	Allocate a new &sk_buff and assign it a usage count of one. The
  28.925 + *	buffer has unspecified headroom built in. Users should allocate
  28.926 + *	the headroom they think they need without accounting for the
  28.927 + *	built in space. The built in space is used for optimisations.
  28.928 + *
   28.929 + *	%NULL is returned if there is no free memory. Although this function
  28.930 + *	allocates memory it can be called from an interrupt.
  28.931 + */
  28.932 +static inline struct sk_buff *dev_alloc_skb(unsigned int length)
  28.933 +{
  28.934 +	return __dev_alloc_skb(length, GFP_ATOMIC);
  28.935 +}
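A common receive-side use of dev_alloc_skb(): allocate from interrupt context and reserve two bytes so the IP header that follows the 14-byte Ethernet header ends up 4-byte aligned (the buffer size below is hypothetical):

#include <linux/skbuff.h>

#define MYDRV_PKT_BUF_SZ	1536	/* hypothetical rx buffer size */

/* Sketch: rx buffer allocation as done by many Ethernet drivers. */
static struct sk_buff *mydrv_alloc_rx_skb(void)
{
	struct sk_buff *skb = dev_alloc_skb(MYDRV_PKT_BUF_SZ + 2);

	if (skb)
		skb_reserve(skb, 2);	/* align the eventual IP header */
	return skb;
}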
  28.936 +
  28.937 +/**
  28.938 + *	skb_cow - copy header of skb when it is required
  28.939 + *	@skb: buffer to cow
  28.940 + *	@headroom: needed headroom
  28.941 + *
  28.942 + *	If the skb passed lacks sufficient headroom or its data part
  28.943 + *	is shared, data is reallocated. If reallocation fails, an error
   28.944 + *	is returned and the original skb is not changed.
   28.945 + *
   28.946 + *	The result is an skb with a writable area skb->head...skb->tail
   28.947 + *	and at least @headroom bytes of space at the head.
  28.948 + */
  28.949 +static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
  28.950 +{
  28.951 +	int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
  28.952 +
  28.953 +	if (delta < 0)
  28.954 +		delta = 0;
  28.955 +
  28.956 +	if (delta || skb_cloned(skb))
  28.957 +		return pskb_expand_head(skb, (delta + 15) & ~15, 0, GFP_ATOMIC);
  28.958 +	return 0;
  28.959 +}
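A sketch of how packet-mangling code typically uses skb_cow() before rewriting a header; it assumes skb->nh already points at the IP header, and the checksum fix-up is omitted for brevity:

#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/errno.h>

/* Sketch: get a private, writable copy of the header area (0 means no
 * extra headroom is needed), then edit it. */
static int my_decrement_ttl(struct sk_buff *skb)
{
	if (skb_cow(skb, 0))
		return -ENOMEM;

	skb->nh.iph->ttl--;	/* checksum update omitted in this sketch */
	return 0;
}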
  28.960 +
  28.961 +/**
  28.962 + *	skb_padto	- pad an skbuff up to a minimal size
  28.963 + *	@skb: buffer to pad
  28.964 + *	@len: minimal length
  28.965 + *
  28.966 + *	Pads up a buffer to ensure the trailing bytes exist and are
  28.967 + *	blanked. If the buffer already contains sufficient data it
  28.968 + *	is untouched. Returns the buffer, which may be a replacement
   28.969 + *	for the original, or %NULL on out of memory, in which case
   28.970 + *	the original buffer is freed anyway.
  28.971 + */
  28.972 + 
  28.973 +static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len)
  28.974 +{
  28.975 +	unsigned int size = skb->len;
  28.976 +	if (likely(size >= len))
  28.977 +		return skb;
  28.978 +	return skb_pad(skb, len-size);
  28.979 +}
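The usual caller of skb_padto() is a driver transmit routine padding runt Ethernet frames for hardware that does not pad them itself (a sketch; the actual queuing step is left out):

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

/* Sketch: pad to the 60-byte Ethernet minimum. If padding fails the
 * buffer has already been freed, so just report the packet as consumed. */
static int mydrv_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->len < ETH_ZLEN) {
		skb = skb_padto(skb, ETH_ZLEN);
		if (!skb)
			return 0;
	}

	/* ... hand skb to the hardware here ... */
	return 0;
}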
  28.980 +
  28.981 +/**
  28.982 + *	skb_linearize - convert paged skb to linear one
   28.983 + *	@skb: buffer to linearize
  28.984 + *	@gfp: allocation mode
  28.985 + *
  28.986 + *	If there is no free memory -ENOMEM is returned, otherwise zero
  28.987 + *	is returned and the old skb data released.
  28.988 + */
  28.989 +extern int __skb_linearize(struct sk_buff *skb, int gfp);
  28.990 +static inline int skb_linearize(struct sk_buff *skb, int gfp)
  28.991 +{
  28.992 +	return __skb_linearize(skb, gfp);
  28.993 +}
  28.994 +
  28.995 +static inline void *kmap_skb_frag(const skb_frag_t *frag)
  28.996 +{
  28.997 +#ifdef CONFIG_HIGHMEM
  28.998 +	BUG_ON(in_irq());
  28.999 +
 28.1000 +	local_bh_disable();
 28.1001 +#endif
 28.1002 +	return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
 28.1003 +}
 28.1004 +
 28.1005 +static inline void kunmap_skb_frag(void *vaddr)
 28.1006 +{
 28.1007 +	kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ);
 28.1008 +#ifdef CONFIG_HIGHMEM
 28.1009 +	local_bh_enable();
 28.1010 +#endif
 28.1011 +}
 28.1012 +
 28.1013 +#define skb_queue_walk(queue, skb) \
 28.1014 +		for (skb = (queue)->next, prefetch(skb->next);	\
 28.1015 +		     (skb != (struct sk_buff *)(queue));	\
 28.1016 +		     skb = skb->next, prefetch(skb->next))
 28.1017 +
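skb_queue_walk() visits every buffer without unlinking it, so the caller must keep the queue stable (typically by holding list->lock); a small sketch:

#include <linux/skbuff.h>

/* Sketch: total the bytes sitting on a queue. The caller is assumed to
 * hold list->lock for the duration of the walk. */
static unsigned int queued_bytes(struct sk_buff_head *list)
{
	struct sk_buff *skb;
	unsigned int bytes = 0;

	skb_queue_walk(list, skb)
		bytes += skb->len;

	return bytes;
}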
 28.1018 +
 28.1019 +extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
 28.1020 +					 int noblock, int *err);
 28.1021 +extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 28.1022 +				     struct poll_table_struct *wait);
 28.1023 +extern int	       skb_copy_datagram(const struct sk_buff *from,
 28.1024 +					 int offset, char __user *to, int size);
 28.1025 +extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 28.1026 +					       int offset, struct iovec *to,
 28.1027 +					       int size);
 28.1028 +extern int	       skb_copy_and_csum_datagram(const struct sk_buff *skb,
 28.1029 +						  int offset, u8 __user *to,
 28.1030 +						  int len, unsigned int *csump);
 28.1031 +extern int	       skb_copy_and_csum_datagram_iovec(const
 28.1032 +							struct sk_buff *skb,
 28.1033 +							int hlen,
 28.1034 +							struct iovec *iov);
 28.1035 +extern void	       skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 28.1036 +extern unsigned int    skb_checksum(const struct sk_buff *skb, int offset,
 28.1037 +				    int len, unsigned int csum);
 28.1038 +extern int	       skb_copy_bits(const struct sk_buff *skb, int offset,
 28.1039 +				     void *to, int len);
 28.1040 +extern unsigned int    skb_copy_and_csum_bits(const struct sk_buff *skb,
 28.1041 +					      int offset, u8 *to, int len,
 28.1042 +					      unsigned int csum);
 28.1043 +extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 28.1044 +
 28.1045 +extern void skb_init(void);
 28.1046 +extern void skb_add_mtu(int mtu);
 28.1047 +
 28.1048 +#ifdef CONFIG_NETFILTER
 28.1049 +static inline void nf_conntrack_put(struct nf_ct_info *nfct)
 28.1050 +{
 28.1051 +	if (nfct && atomic_dec_and_test(&nfct->master->use))
 28.1052 +		nfct->master->destroy(nfct->master);
 28.1053 +}
 28.1054 +static inline void nf_conntrack_get(struct nf_ct_info *nfct)
 28.1055 +{
 28.1056 +	if (nfct)
 28.1057 +		atomic_inc(&nfct->master->use);
 28.1058 +}
 28.1059 +
 28.1060 +#ifdef CONFIG_BRIDGE_NETFILTER
 28.1061 +static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 28.1062 +{
 28.1063 +	if (nf_bridge && atomic_dec_and_test(&nf_bridge->use))
 28.1064 +		kfree(nf_bridge);
 28.1065 +}
 28.1066 +static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
 28.1067 +{
 28.1068 +	if (nf_bridge)
 28.1069 +		atomic_inc(&nf_bridge->use);
 28.1070 +}
 28.1071 +#endif
 28.1072 +
 28.1073 +#endif
 28.1074 +
 28.1075 +#endif	/* __KERNEL__ */
 28.1076 +#endif	/* _LINUX_SKBUFF_H */
    29.1 --- a/linux-2.6.7-xen-sparse/mm/page_alloc.c	Fri Aug 20 09:11:43 2004 +0000
    29.2 +++ b/linux-2.6.7-xen-sparse/mm/page_alloc.c	Fri Aug 20 09:21:37 2004 +0000
    29.3 @@ -497,9 +497,8 @@ static void fastcall free_hot_cold_page(
    29.4  	struct per_cpu_pages *pcp;
    29.5  	unsigned long flags;
    29.6  
    29.7 -	/* XXX Xen: use mapping pointer as skb/data-page destructor */
    29.8 -	if (page->mapping)
    29.9 -		return (*(void(*)(struct page *))page->mapping)(page);
   29.10 +	if (PageForeign(page))
   29.11 +		return (PageForeignDestructor(page))(page);
   29.12  
   29.13  	kernel_map_pages(page, 1, 0);
   29.14  	inc_page_state(pgfree);
    30.1 --- a/tools/examples/Makefile	Fri Aug 20 09:11:43 2004 +0000
    30.2 +++ b/tools/examples/Makefile	Fri Aug 20 09:21:37 2004 +0000
    30.3 @@ -5,9 +5,8 @@ XEND_INITD = init.d/xend
    30.4  # Xen configuration dir and configs to go there.
    30.5  XEN_CONFIG_DIR = /etc/xen
    30.6  XEN_CONFIGS = xend-config.sxp
    30.7 -XEN_CONFIGS += xmdefconfig 
    30.8 -XEN_CONFIGS += xmdefconfig-example
    30.9 -XEN_CONFIGS += xmdefconfig-netbsd
   30.10 +XEN_CONFIGS += xmexample1 
   30.11 +XEN_CONFIGS += xmexample2
   30.12  
   30.13  # Xen script dir and scripts to go there.
   30.14  XEN_SCRIPT_DIR = /etc/xen/scripts
    31.1 --- a/tools/examples/xmdefconfig	Fri Aug 20 09:11:43 2004 +0000
    31.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.3 @@ -1,85 +0,0 @@
    31.4 -#  -*- mode: python; -*-
    31.5 -#============================================================================
    31.6 -# Python configuration setup for 'xm create'.
    31.7 -# This script sets the parameters used when a domain is created using 'xm create'.
    31.8 -# You use a separate script for each domain you want to create, or 
    31.9 -# you can set the parameters for the domain on the xm command line.
   31.10 -#============================================================================
   31.11 -
   31.12 -#----------------------------------------------------------------------------
   31.13 -# Kernel image file.
   31.14 -kernel = "/boot/vmlinuz-2.4.26-xenU"
   31.15 -
   31.16 -# Optional ramdisk.
   31.17 -#ramdisk = "/boot/initrd.gz"
   31.18 -
   31.19 -# The domain build function. Default is 'linux'.
   31.20 -#builder='linux'
   31.21 -#builder='netbsd'
   31.22 -
   31.23 -# Initial memory allocation (in megabytes) for the new domain.
   31.24 -memory = 64
   31.25 -
   31.26 -# A name for your domain. All domains must have different names.
   31.27 -name = "ExampleDomain"
   31.28 -
   31.29 -# Which CPU to start domain on? 
   31.30 -#cpu = -1   # leave to Xen to pick
   31.31 -
   31.32 -#----------------------------------------------------------------------------
   31.33 -# Define network interfaces.
   31.34 -
   31.35 -# Number of network interfaces. Default is 1.
   31.36 -#nics=1
   31.37 -
   31.38 -# Optionally define mac and/or bridge for the network interfaces.
   31.39 -# Random MACs are assigned if not given.
   31.40 -#vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0' ]
   31.41 -
   31.42 -#----------------------------------------------------------------------------
   31.43 -# Define the disk devices you want the domain to have access to, and
   31.44 -# what you want them accessible as.
   31.45 -# Each disk entry is of the form phy:UNAME,DEV,MODE
   31.46 -# where UNAME is the device, DEV is the device name the domain will see,
   31.47 -# and MODE is r for read-only, w for read-write.
   31.48 -
   31.49 -disk = [ 'phy:hda1,xda1,r' ]
   31.50 -
   31.51 -#----------------------------------------------------------------------------
   31.52 -# Set the kernel command line for the new domain.
   31.53 -# You only need to define the IP parameters and hostname if the domain's
   31.54 -# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP.
   31.55 -# You can use 'extra' to set the runlevel and custom environment
   31.56 -# variables used by custom rc scripts (e.g. VMID=, usr= ).
   31.57 -
   31.58 -# Set if you want dhcp to allocate the IP address.
   31.59 -#dhcp="dhcp"
   31.60 -# Set netmask.
   31.61 -#netmask=
   31.62 -# Set default gateway.
   31.63 -#gateway=
   31.64 -# Set the hostname.
   31.65 -#hostname= "vm%d" % vmid
   31.66 -
   31.67 -# Set root device.
   31.68 -root = "/dev/xda1 ro"
   31.69 -
   31.70 -# Root device for nfs.
   31.71 -#root = "/dev/nfs"
   31.72 -# The nfs server.
   31.73 -#nfs_server = '169.254.1.0'  
   31.74 -# Root directory on the nfs server.
   31.75 -#nfs_root   = '/full/path/to/root/directory'
   31.76 -
   31.77 -# Sets runlevel 4.
   31.78 -extra = "4"
   31.79 -
   31.80 -#----------------------------------------------------------------------------
   31.81 -# Set according to whether you want the domain restarted when it exits.
   31.82 -# The default is 'onreboot', which restarts the domain when it shuts down
   31.83 -# with exit code reboot.
   31.84 -# Other values are 'always', and 'never'.
   31.85 -
   31.86 -#restart = 'onreboot'
   31.87 -
   31.88 -#============================================================================
    32.1 --- a/tools/examples/xmdefconfig-example	Fri Aug 20 09:11:43 2004 +0000
    32.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.3 @@ -1,120 +0,0 @@
    32.4 -#  -*- mode: python; -*-
    32.5 -#============================================================================
    32.6 -# Example Python setup script for 'xm create'.
    32.7 -# This script sets the parameters used when a domain is created using 'xm create'.
    32.8 -#
    32.9 -# This is a relatively advanced script that uses a parameter, vmid, to control
   32.10 -# the settings. So this script can be used to start a set of domains by
   32.11 -# setting the vmid parameter on the 'xm create' command line. For example:
   32.12 -#
   32.13 -# xm create vmid=1
   32.14 -# xm create vmid=2
   32.15 -# xm create vmid=3
   32.16 -#
   32.17 -# The vmid is purely a script variable, and has no effect on the the domain
   32.18 -# id assigned to the new domain.
   32.19 -#============================================================================
   32.20 -
   32.21 -# Define script variables here.
   32.22 -# xm_vars is defined automatically, use xm_vars.var() to define a variable.
   32.23 -
   32.24 -# This function checks that 'vmid' has been given a valid value.
   32.25 -# It is called automatically by 'xm create'.
   32.26 -def vmid_check(var, val):
   32.27 -    val = int(val)
   32.28 -    if val <= 0:
   32.29 -        raise ValueError
   32.30 -    return val
   32.31 -
   32.32 -# Define the 'vmid' variable so that 'xm create' knows about it.
   32.33 -xm_vars.var('vmid',
   32.34 -            use="Virtual machine id. Integer greater than 0.",
   32.35 -            check=vmid_check)
   32.36 -
   32.37 -# Check the defined variables have valid values..
   32.38 -xm_vars.check()
   32.39 -
   32.40 -#----------------------------------------------------------------------------
   32.41 -# Kernel image file.
   32.42 -kernel = "/boot/vmlinuz-2.4.26-xenU"
   32.43 -
   32.44 -# Optional ramdisk.
   32.45 -#ramdisk = "/boot/initrd.gz"
   32.46 -
   32.47 -# The domain build function. Default is 'linux'.
   32.48 -#builder='linux'
   32.49 -#builder='netbsd'
   32.50 -
   32.51 -# Initial memory allocation (in megabytes) for the new domain.
   32.52 -memory = 64
   32.53 -
   32.54 -# A name for the new domain. All domains have to have different names,
   32.55 -# so we use the vmid to create a name.
   32.56 -name = "VM%d" % vmid
   32.57 -
   32.58 -# Which CPU to start domain on? 
   32.59 -#cpu = -1   # leave to Xen to pick
   32.60 -cpu = vmid  # set based on vmid (mod number of CPUs)
   32.61 -
   32.62 -#----------------------------------------------------------------------------
   32.63 -# Define network interfaces.
   32.64 -
   32.65 -# Number of network interfaces. Default is 1.
   32.66 -#nics=1
   32.67 -
   32.68 -# Optionally define mac and/or bridge for the network interfaces.
   32.69 -# Random MACs are assigned if not given.
   32.70 -#vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0' ]
   32.71 -
   32.72 -#----------------------------------------------------------------------------
   32.73 -# Define the disk devices you want the domain to have access to, and
   32.74 -# what you want them accessible as.
   32.75 -# Each disk entry is of the form phy:UNAME,DEV,MODE
   32.76 -# where UNAME is the device, DEV is the device name the domain will see,
   32.77 -# and MODE is r for read-only, w for read-write.
   32.78 -
   32.79 -# This makes the disk device depend on the vmid - assuming
   32.80 -# that devices sda7, sda8 etc. exist. The device is exported
   32.81 -# to all domains as sda1.
   32.82 -# All domains get sda6 read-only (to use for /usr, see below).
   32.83 -disk = [ 'phy:sda%d,sda1,w' % (7+vmid),
   32.84 -         'phy:sda6,sda6,r' ]
   32.85 -
   32.86 -#----------------------------------------------------------------------------
   32.87 -# Set the kernel command line for the new domain.
   32.88 -# You only need to define the IP parameters and hostname if the domain's
   32.89 -# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP.
   32.90 -# You can use 'extra' to set the runlevel and custom environment
   32.91 -# variables used by custom rc scripts (e.g. VMID=, usr= ).
   32.92 -
   32.93 -# Set if you want dhcp to allocate the IP address.
   32.94 -#dhcp="dhcp"
   32.95 -# Set netmask.
   32.96 -#netmask=
   32.97 -# Set default gateway.
   32.98 -#gateway=
   32.99 -# Set the hostname.
  32.100 -#hostname= "vm%d" % vmid
  32.101 -
  32.102 -# Set root device.
  32.103 -root = "/dev/sda1 ro"
  32.104 -
  32.105 -# Root device for nfs.
  32.106 -#root = "/dev/nfs"
  32.107 -# The nfs server.
  32.108 -#nfs_server = '169.254.1.0'  
  32.109 -# Root directory on the nfs server.
  32.110 -#nfs_root   = '/full/path/to/root/directory'
  32.111 -
  32.112 -# Sets runlevel 4 and the device for /usr.
  32.113 -extra = "4 VMID=%d usr=/dev/sda6" % vmid
  32.114 -
  32.115 -#----------------------------------------------------------------------------
  32.116 -# Set according to whether you want the domain restarted when it exits.
  32.117 -# The default is 'onreboot', which restarts the domain when it shuts down
  32.118 -# with exit code reboot.
  32.119 -# Other values are 'always', and 'never'.
  32.120 -
  32.121 -#restart = 'onreboot'
  32.122 -
  32.123 -#============================================================================
    33.1 --- a/tools/examples/xmdefconfig-netbsd	Fri Aug 20 09:11:43 2004 +0000
    33.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.3 @@ -1,123 +0,0 @@
    33.4 -#  -*- mode: python; -*-
    33.5 -#============================================================================
    33.6 -# Example Python setup script for 'xm create'.
    33.7 -# This script sets the parameters used when a domain is created using 'xm create'.
    33.8 -#
    33.9 -# This is a relatively advanced script that uses a parameter, vmid, to control
   33.10 -# the settings. So this script can be used to start a set of domains by
   33.11 -# setting the vmid parameter on the 'xm create' command line. For example:
   33.12 -#
   33.13 -# xm create vmid=1
   33.14 -# xm create vmid=2
   33.15 -# xm create vmid=3
   33.16 -#
   33.17 -# The vmid is purely a script variable, and has no effect on the the domain
   33.18 -# id assigned to the new domain.
   33.19 -#============================================================================
   33.20 -
   33.21 -# Define script variables here.
   33.22 -# xm_vars is defined automatically, use xm_vars.var() to define a variable.
   33.23 -
   33.24 -# This function checks that 'vmid' has been given a valid value.
   33.25 -# It is called automatically by 'xm create'.
   33.26 -def vmid_check(var, val):
   33.27 -    val = int(val)
   33.28 -    if val <= 0:
   33.29 -        raise ValueError
   33.30 -    return val
   33.31 -
   33.32 -# Define the 'vmid' variable so that 'xm create' knows about it.
   33.33 -xm_vars.var('vmid',
   33.34 -            use="Virtual machine id. Integer greater than 0.",
   33.35 -            check=vmid_check)
   33.36 -
   33.37 -# Check the defined variables have valid values..
   33.38 -xm_vars.check()
   33.39 -
   33.40 -#----------------------------------------------------------------------------
   33.41 -# Kernel image file.
   33.42 -image = "/boot/netbsd"
   33.43 -
   33.44 -# The domain build function.
   33.45 -builder='netbsd'
   33.46 -
   33.47 -# Initial memory allocation (in megabytes) for the new domain.
   33.48 -memory = 16
   33.49 -
   33.50 -# A name for the new domain. All domains have to have different names,
   33.51 -# so we use the vmid to create a name.
   33.52 -name = "NETBSD%d" % vmid
   33.53 -
   33.54 -#----------------------------------------------------------------------------
   33.55 -# Define network interfaces.
   33.56 -
   33.57 -# Number of network interfaces. Default is 1.
   33.58 -#nics=1
   33.59 -
   33.60 -# Optionally define mac and/or bridge for the network interfaces.
   33.61 -# Random MACs are assigned if not given.
   33.62 -#vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0' ]
   33.63 -
   33.64 -# Specify IP address(es), for the new domain.  You need to
   33.65 -# configure IP addrs within the domain just as you do normally.  This
   33.66 -# is just to let Xen know about them so it can route packets
   33.67 -# appropriately.
   33.68 -
   33.69 -#ipaddr = [ xenctl.utils.add_offset_to_ip(xenctl.utils.get_current_ipaddr(),vmid),
   33.70 -#           xenctl.utils.add_offset_to_ip('169.254.1.0',vmid),
   33.71 -#           ]
   33.72 -
   33.73 -#----------------------------------------------------------------------------
   33.74 -# Define the disk devices you want the domain to have access to, and
   33.75 -# what you want them accessible as.
   33.76 -# Each disk entry is of the form phy:UNAME,DEV,MODE
   33.77 -# where UNAME is the device, DEV is the device name the domain will see,
   33.78 -# and MODE is r for read-only, w for read-write.
   33.79 -
   33.80 -# This makes the disk device depend on the vmid - assuming
   33.81 -# that devices sda7, sda8 etc. exist. The device is exported
   33.82 -# to all domains as sda1.
   33.83 -# All domains get sda6 read-only (to use for /usr, see below).
   33.84 -disk = [ 'phy:sda%d,sda1,w' % (7+vmid),
   33.85 -         'phy:sda6,sda6,r' ]
   33.86 -
   33.87 -#----------------------------------------------------------------------------
   33.88 -# Set the kernel command line for the new domain.
   33.89 -# You only need to define the IP parameters and hostname if the domain's
   33.90 -# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP.
   33.91 -# You can use 'extra' to set the runlevel and custom environment
   33.92 -# variables used by custom rc scripts (e.g. VMID=, usr= ).
   33.93 -
   33.94 -# Set if you want dhcp to allocate the IP address.
   33.95 -#dhcp="dhcp"
   33.96 -# Set netmask.
   33.97 -#netmask=
   33.98 -# Set default gateway.
   33.99 -#gateway=
  33.100 -# Set the hostname.
  33.101 -#hostname= "vm%d" % vmid
  33.102 -
  33.103 -# Set root device.
  33.104 -root = "/dev/sda1 ro"
  33.105 -
  33.106 -# Root device for nfs.
  33.107 -#root = "/dev/nfs"
  33.108 -# The nfs server.
  33.109 -#nfs_server = '169.254.1.0'  
  33.110 -# Root directory on the nfs server.
  33.111 -#nfs_root   = '/full/path/to/root/directory'
  33.112 -
  33.113 -# Sets runlevel 4 and the device for /usr.
  33.114 -#extra = "4 VMID=%d usr=/dev/sda6" % vmid
  33.115 -extra = "4 VMID=%d bootdev=xennet0" % vmid
  33.116 -
  33.117 -
  33.118 -#----------------------------------------------------------------------------
  33.119 -# Set according to whether you want the domain restarted when it exits.
  33.120 -# The default is 'onreboot', which restarts the domain when it shuts down
  33.121 -# with exit code reboot.
  33.122 -# Other values are 'always', and 'never'.
  33.123 -#
  33.124 -#restart = 'onreboot'
  33.125 -
  33.126 -#============================================================================
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/tools/examples/xmexample1	Fri Aug 20 09:21:37 2004 +0000
    34.3 @@ -0,0 +1,85 @@
    34.4 +#  -*- mode: python; -*-
    34.5 +#============================================================================
    34.6 +# Python configuration setup for 'xm create'.
    34.7 +# This script sets the parameters used when a domain is created using 'xm create'.
    34.8 +# You use a separate script for each domain you want to create, or 
    34.9 +# you can set the parameters for the domain on the xm command line.
   34.10 +#============================================================================
   34.11 +
   34.12 +#----------------------------------------------------------------------------
   34.13 +# Kernel image file.
   34.14 +kernel = "/boot/vmlinuz-2.4.26-xenU"
   34.15 +
   34.16 +# Optional ramdisk.
   34.17 +#ramdisk = "/boot/initrd.gz"
   34.18 +
   34.19 +# The domain build function. Default is 'linux'.
   34.20 +#builder='linux'
   34.21 +#builder='netbsd'
   34.22 +
   34.23 +# Initial memory allocation (in megabytes) for the new domain.
   34.24 +memory = 64
   34.25 +
   34.26 +# A name for your domain. All domains must have different names.
   34.27 +name = "ExampleDomain"
   34.28 +
   34.29 +# Which CPU to start domain on? 
   34.30 +#cpu = -1   # leave to Xen to pick
   34.31 +
   34.32 +#----------------------------------------------------------------------------
   34.33 +# Define network interfaces.
   34.34 +
   34.35 +# Number of network interfaces. Default is 1.
   34.36 +#nics=1
   34.37 +
   34.38 +# Optionally define mac and/or bridge for the network interfaces.
   34.39 +# Random MACs are assigned if not given.
   34.40 +#vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0' ]
   34.41 +
   34.42 +#----------------------------------------------------------------------------
   34.43 +# Define the disk devices you want the domain to have access to, and
   34.44 +# what you want them accessible as.
   34.45 +# Each disk entry is of the form phy:UNAME,DEV,MODE
   34.46 +# where UNAME is the device, DEV is the device name the domain will see,
   34.47 +# and MODE is r for read-only, w for read-write.
   34.48 +
   34.49 +disk = [ 'phy:hda1,xda1,r' ]
   34.50 +
   34.51 +#----------------------------------------------------------------------------
   34.52 +# Set the kernel command line for the new domain.
   34.53 +# You only need to define the IP parameters and hostname if the domain's
    34.54 +# IP config doesn't set them, e.g. in ifcfg-eth0 or via DHCP.
   34.55 +# You can use 'extra' to set the runlevel and custom environment
   34.56 +# variables used by custom rc scripts (e.g. VMID=, usr= ).
   34.57 +
   34.58 +# Set if you want dhcp to allocate the IP address.
   34.59 +#dhcp="dhcp"
   34.60 +# Set netmask.
   34.61 +#netmask=
   34.62 +# Set default gateway.
   34.63 +#gateway=
   34.64 +# Set the hostname.
   34.65 +#hostname= "vm%d" % vmid
   34.66 +
   34.67 +# Set root device.
   34.68 +root = "/dev/xda1 ro"
   34.69 +
   34.70 +# Root device for nfs.
   34.71 +#root = "/dev/nfs"
   34.72 +# The nfs server.
   34.73 +#nfs_server = '169.254.1.0'  
   34.74 +# Root directory on the nfs server.
   34.75 +#nfs_root   = '/full/path/to/root/directory'
   34.76 +
   34.77 +# Sets runlevel 4.
   34.78 +extra = "4"
   34.79 +
   34.80 +#----------------------------------------------------------------------------
   34.81 +# Set according to whether you want the domain restarted when it exits.
   34.82 +# The default is 'onreboot', which restarts the domain when it shuts down
   34.83 +# with exit code reboot.
   34.84 +# Other values are 'always', and 'never'.
   34.85 +
   34.86 +#restart = 'onreboot'
   34.87 +
   34.88 +#============================================================================
    35.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.2 +++ b/tools/examples/xmexample2	Fri Aug 20 09:21:37 2004 +0000
    35.3 @@ -0,0 +1,120 @@
    35.4 +#  -*- mode: python; -*-
    35.5 +#============================================================================
    35.6 +# Example Python setup script for 'xm create'.
    35.7 +# This script sets the parameters used when a domain is created using 'xm create'.
    35.8 +#
    35.9 +# This is a relatively advanced script that uses a parameter, vmid, to control
   35.10 +# the settings. So this script can be used to start a set of domains by
   35.11 +# setting the vmid parameter on the 'xm create' command line. For example:
   35.12 +#
   35.13 +# xm create vmid=1
   35.14 +# xm create vmid=2
   35.15 +# xm create vmid=3
   35.16 +#
    35.17 +# The vmid is purely a script variable, and has no effect on the domain
   35.18 +# id assigned to the new domain.
   35.19 +#============================================================================
   35.20 +
   35.21 +# Define script variables here.
   35.22 +# xm_vars is defined automatically, use xm_vars.var() to define a variable.
   35.23 +
   35.24 +# This function checks that 'vmid' has been given a valid value.
   35.25 +# It is called automatically by 'xm create'.
   35.26 +def vmid_check(var, val):
   35.27 +    val = int(val)
   35.28 +    if val <= 0:
   35.29 +        raise ValueError
   35.30 +    return val
   35.31 +
   35.32 +# Define the 'vmid' variable so that 'xm create' knows about it.
   35.33 +xm_vars.var('vmid',
   35.34 +            use="Virtual machine id. Integer greater than 0.",
   35.35 +            check=vmid_check)
   35.36 +
    35.37 +# Check that the defined variables have valid values.
   35.38 +xm_vars.check()
   35.39 +
   35.40 +#----------------------------------------------------------------------------
   35.41 +# Kernel image file.
   35.42 +kernel = "/boot/vmlinuz-2.4.26-xenU"
   35.43 +
   35.44 +# Optional ramdisk.
   35.45 +#ramdisk = "/boot/initrd.gz"
   35.46 +
   35.47 +# The domain build function. Default is 'linux'.
   35.48 +#builder='linux'
   35.49 +#builder='netbsd'
   35.50 +
   35.51 +# Initial memory allocation (in megabytes) for the new domain.
   35.52 +memory = 64
   35.53 +
   35.54 +# A name for the new domain. All domains have to have different names,
   35.55 +# so we use the vmid to create a name.
   35.56 +name = "VM%d" % vmid
   35.57 +
    35.58 +# Which CPU to start the domain on?
   35.59 +#cpu = -1   # leave to Xen to pick
   35.60 +cpu = vmid  # set based on vmid (mod number of CPUs)
   35.61 +
   35.62 +#----------------------------------------------------------------------------
   35.63 +# Define network interfaces.
   35.64 +
   35.65 +# Number of network interfaces. Default is 1.
   35.66 +#nics=1
   35.67 +
   35.68 +# Optionally define mac and/or bridge for the network interfaces.
   35.69 +# Random MACs are assigned if not given.
   35.70 +#vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0' ]
   35.71 +
   35.72 +#----------------------------------------------------------------------------
   35.73 +# Define the disk devices you want the domain to have access to, and
   35.74 +# what you want them accessible as.
   35.75 +# Each disk entry is of the form phy:UNAME,DEV,MODE
   35.76 +# where UNAME is the device, DEV is the device name the domain will see,
   35.77 +# and MODE is r for read-only, w for read-write.
   35.78 +
   35.79 +# This makes the disk device depend on the vmid - assuming
   35.80 +# that devices sda7, sda8 etc. exist. The device is exported
   35.81 +# to all domains as sda1.
   35.82 +# All domains get sda6 read-only (to use for /usr, see below).
   35.83 +disk = [ 'phy:sda%d,sda1,w' % (7+vmid),
   35.84 +         'phy:sda6,sda6,r' ]
   35.85 +
   35.86 +#----------------------------------------------------------------------------
   35.87 +# Set the kernel command line for the new domain.
   35.88 +# You only need to define the IP parameters and hostname if the domain's
    35.89 +# own IP configuration doesn't set them, e.g. in ifcfg-eth0 or via DHCP.
   35.90 +# You can use 'extra' to set the runlevel and custom environment
   35.91 +# variables used by custom rc scripts (e.g. VMID=, usr= ).
   35.92 +
   35.93 +# Set if you want dhcp to allocate the IP address.
   35.94 +#dhcp="dhcp"
   35.95 +# Set netmask.
   35.96 +#netmask=
   35.97 +# Set default gateway.
   35.98 +#gateway=
   35.99 +# Set the hostname.
  35.100 +#hostname= "vm%d" % vmid
  35.101 +
  35.102 +# Set root device.
  35.103 +root = "/dev/sda1 ro"
  35.104 +
  35.105 +# Root device for nfs.
  35.106 +#root = "/dev/nfs"
  35.107 +# The nfs server.
  35.108 +#nfs_server = '169.254.1.0'  
  35.109 +# Root directory on the nfs server.
  35.110 +#nfs_root   = '/full/path/to/root/directory'
  35.111 +
  35.112 +# Sets runlevel 4 and the device for /usr.
  35.113 +extra = "4 VMID=%d usr=/dev/sda6" % vmid
  35.114 +
  35.115 +#----------------------------------------------------------------------------
  35.116 +# Set according to whether you want the domain restarted when it exits.
  35.117 +# The default is 'onreboot', which restarts the domain when it shuts down
  35.118 +# with exit code reboot.
  35.119 +# Other values are 'always', and 'never'.
  35.120 +
  35.121 +#restart = 'onreboot'
  35.122 +
  35.123 +#============================================================================
    36.1 --- a/tools/libxc/xc.h	Fri Aug 20 09:11:43 2004 +0000
    36.2 +++ b/tools/libxc/xc.h	Fri Aug 20 09:21:37 2004 +0000
    36.3 @@ -154,6 +154,8 @@ int xc_rrobin_global_set(int xc_handle, 
    36.4  int xc_rrobin_global_get(int xc_handle, u64 *slice);
    36.5  
    36.6  #define DOMID_SELF              (0x7FF0U)
    36.7 +#define DOMID_IO                (0x7FF1U)
    36.8 +#define DOMID_XEN               (0x7FF2U)
    36.9  
   36.10  typedef struct {
    36.11 +#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
    37.1 --- a/tools/libxc/xc_linux_save.c	Fri Aug 20 09:11:43 2004 +0000
    37.2 +++ b/tools/libxc/xc_linux_save.c	Fri Aug 20 09:21:37 2004 +0000
    37.3 @@ -295,7 +295,7 @@ int xc_linux_save(int xc_handle, XcIOCon
    37.4      int rc = 1, i, j, k, last_iter, iter = 0;
    37.5      unsigned long mfn;
    37.6      u32 domid = ioctxt->domain;
    37.7 -    int live = 0; // (ioctxt->flags & XCFLAGS_LIVE);
    37.8 +    int live =  (ioctxt->flags & XCFLAGS_LIVE);
    37.9      int debug = (ioctxt->flags & XCFLAGS_DEBUG);
   37.10      int sent_last_iter, skip_this_iter;
   37.11  
   37.12 @@ -423,7 +423,7 @@ int xc_linux_save(int xc_handle, XcIOCon
   37.13      mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
   37.14  
   37.15      live_mfn_to_pfn_table = 
   37.16 -	mfn_mapper_map_single(xc_handle, 0x7FFFU, 
   37.17 +	mfn_mapper_map_single(xc_handle, DOMID_XEN, 
   37.18  			      PAGE_SIZE*1024, PROT_READ, 
   37.19  			      mfn_to_pfn_table_start_mfn );
   37.20  
   37.21 @@ -440,7 +440,8 @@ int xc_linux_save(int xc_handle, XcIOCon
   37.22  
   37.23      /* Domain is still running at this point */
   37.24  
   37.25 -    if( live ){ 
   37.26 +    if( live ){
   37.27 +printf("GO LIVE!!\n");
   37.28          if ( xc_shadow_control( xc_handle, domid, 
   37.29                                  DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
   37.30                                  NULL, 0, NULL ) < 0 ) {
    38.1 --- a/tools/python/xen/xm/opts.py	Fri Aug 20 09:11:43 2004 +0000
    38.2 +++ b/tools/python/xen/xm/opts.py	Fri Aug 20 09:21:37 2004 +0000
    38.3 @@ -347,7 +347,7 @@ class Opts:
    38.4  
    38.5      def var_usage(self):
    38.6          if self.vars:
    38.7 -            print 'The defconfig defines the following variables:'
    38.8 +            print 'The config file defines the following variables:'
    38.9              for var in self.vars:
   38.10                  var.show()
   38.11                  print
   38.12 @@ -372,11 +372,11 @@ class Opts:
   38.13              else:
   38.14                  p = self.vals.defconfig
   38.15              if os.path.exists(p):
   38.16 -                self.info('Using defconfig file %s.' % p)
   38.17 +                self.info('Using config file "%s".' % p)
   38.18                  self.load(p, help)
   38.19                  break
   38.20          else:
   38.21 -            self.err("Cannot open defconfig file %s" % self.vals.defconfig)
   38.22 +            self.err('Cannot open config file "%s"' % self.vals.defconfig)
   38.23  
   38.24      def load(self, defconfig, help):
   38.25          """Load a defconfig file. Local variables in the file
    39.1 --- a/xen/arch/x86/memory.c	Fri Aug 20 09:11:43 2004 +0000
    39.2 +++ b/xen/arch/x86/memory.c	Fri Aug 20 09:21:37 2004 +0000
    39.3 @@ -137,14 +137,49 @@ static struct {
    39.4   */
    39.5  #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
    39.6  
    39.7 -void ptwr_init_backpointers(void);
    39.8 +/* Private domain structs for DOMID_XEN and DOMID_IO. */
    39.9 +static struct domain *dom_xen, *dom_io;
   39.10  
   39.11  void arch_init_memory(void)
   39.12  {
   39.13 +    static void ptwr_init_backpointers(void);
   39.14 +    unsigned long mfn;
   39.15 +
   39.16      memset(percpu_info, 0, sizeof(percpu_info));
   39.17  
   39.18      vm_assist_info[VMASST_TYPE_writeable_pagetables].enable =
   39.19          ptwr_init_backpointers;
   39.20 +
   39.21 +    /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
   39.22 +    memset(machine_to_phys_mapping, 0x55, 4<<20);
   39.23 +
   39.24 +    /*
   39.25 +     * Initialise our DOMID_XEN domain.
   39.26 +     * Any Xen-heap pages that we will allow to be mapped will have
   39.27 +     * their domain field set to dom_xen.
   39.28 +     */
   39.29 +    dom_xen = alloc_domain_struct();
   39.30 +    atomic_set(&dom_xen->refcnt, 1);
   39.31 +    dom_xen->domain = DOMID_XEN;
   39.32 +
   39.33 +    /*
   39.34 +     * Initialise our DOMID_IO domain.
   39.35 +     * This domain owns no pages but is considered a special case when
   39.36 +     * mapping I/O pages, as the mappings occur at the priv of the caller.
   39.37 +     */
   39.38 +    dom_io = alloc_domain_struct();
   39.39 +    atomic_set(&dom_io->refcnt, 1);
   39.40 +    dom_io->domain = DOMID_IO;
   39.41 +
   39.42 +    /* M2P table is mappable read-only by privileged domains. */
   39.43 +    for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
   39.44 +          mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
   39.45 +          mfn++ )
   39.46 +    {
   39.47 +        frame_table[mfn].u.inuse.count_info = 1 | PGC_allocated;
   39.48 +        frame_table[mfn].u.inuse.type_info  = 1 | PGT_gdt_page; /* non-RW */
   39.49 +        frame_table[mfn].u.inuse.domain     = dom_xen;
   39.50 +    }
   39.51  }
   39.52  
   39.53  static void __invalidate_shadow_ldt(struct domain *d)
   39.54 @@ -178,7 +213,7 @@ static inline void invalidate_shadow_ldt
   39.55  }
   39.56  
   39.57  
   39.58 -int alloc_segdesc_page(struct pfn_info *page)
   39.59 +static int alloc_segdesc_page(struct pfn_info *page)
   39.60  {
   39.61      unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
   39.62      int i;
   39.63 @@ -345,11 +380,15 @@ get_page_from_l1e(
   39.64  
   39.65      if ( unlikely(!pfn_is_ram(pfn)) )
   39.66      {
   39.67 -        if ( IS_PRIV(current) )
   39.68 +        /* Revert to caller privileges if FD == DOMID_IO. */
   39.69 +        if ( d == dom_io )
   39.70 +            d = current;
   39.71 +
   39.72 +        if ( IS_PRIV(d) )
   39.73              return 1;
   39.74  
   39.75 -        if ( IS_CAPABLE_PHYSDEV(current) )
   39.76 -            return domain_iomem_in_pfn(current, pfn);
   39.77 +        if ( IS_CAPABLE_PHYSDEV(d) )
   39.78 +            return domain_iomem_in_pfn(d, pfn);
   39.79  
   39.80          MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
   39.81          return 0;
   39.82 @@ -827,9 +866,16 @@ static int do_extended_command(unsigned 
   39.83  
   39.84          if ( !IS_PRIV(d) )
   39.85          {
   39.86 -            MEM_LOG("Dom %u has no privilege to set subject domain",
   39.87 -                    d->domain);
   39.88 -            okay = 0;
   39.89 +            switch ( domid )
   39.90 +            {
   39.91 +            case DOMID_IO:
   39.92 +                get_knownalive_domain(e = dom_io);
   39.93 +                break;
   39.94 +            default:
   39.95 +                MEM_LOG("Dom %u cannot set foreign dom\n", d->domain);
   39.96 +                okay = 0;
   39.97 +                break;
   39.98 +            }
   39.99          }
  39.100          else
  39.101          {
  39.102 @@ -839,8 +885,19 @@ static int do_extended_command(unsigned 
  39.103              percpu_info[cpu].foreign = e = find_domain_by_id(domid);
  39.104              if ( e == NULL )
  39.105              {
  39.106 -                MEM_LOG("Unknown domain '%u'", domid);
  39.107 -                okay = 0;
  39.108 +                switch ( domid )
  39.109 +                {
  39.110 +                case DOMID_XEN:
  39.111 +                    get_knownalive_domain(e = dom_xen);
  39.112 +                    break;
  39.113 +                case DOMID_IO:
  39.114 +                    get_knownalive_domain(e = dom_io);
  39.115 +                    break;
  39.116 +                default:
  39.117 +                    MEM_LOG("Unknown domain '%u'", domid);
  39.118 +                    okay = 0;
  39.119 +                    break;
  39.120 +                }
  39.121              }
  39.122          }
  39.123          break;
  39.124 @@ -926,7 +983,7 @@ static int do_extended_command(unsigned 
  39.125           * the lock so they'll spin waiting for us.
  39.126           */
  39.127          if ( unlikely(e->tot_pages++ == 0) )
  39.128 -            get_domain(e);
  39.129 +            get_knownalive_domain(e);
  39.130          list_add_tail(&page->list, &e->page_list);
  39.131  
  39.132      reassign_fail:        
  39.133 @@ -1493,7 +1550,7 @@ int ptwr_do_page_fault(unsigned long add
  39.134      return 0;
  39.135  }
  39.136  
  39.137 -void ptwr_init_backpointers(void)
  39.138 +static void ptwr_init_backpointers(void)
  39.139  {
  39.140      struct pfn_info *page;
  39.141      unsigned long pde;
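The dom_xen accounting introduced above is what the user-space tools rely on when they map the machine_to_phys table. A minimal sketch of the consumer side, mirroring the xc_linux_save.c hunk earlier in this changeset (the return type and the already-open xc_handle are assumptions for illustration):

    /* Sketch: map the M2P table read-only by naming DOMID_XEN as the owner.
     * The frames were pinned in arch_init_memory() and belong to dom_xen. */
    static unsigned long *map_m2p_sketch(int xc_handle)
    {
        unsigned long m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
        return mfn_mapper_map_single(xc_handle, DOMID_XEN,
                                     PAGE_SIZE * 1024, PROT_READ,
                                     m2p_start_mfn);
    }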
    40.1 --- a/xen/arch/x86/setup.c	Fri Aug 20 09:11:43 2004 +0000
    40.2 +++ b/xen/arch/x86/setup.c	Fri Aug 20 09:21:37 2004 +0000
    40.3 @@ -16,6 +16,7 @@
    40.4  #include <asm/domain_page.h>
    40.5  #include <asm/pdb.h>
    40.6  
    40.7 +extern void arch_init_memory(void);
    40.8  extern void init_IRQ(void);
    40.9  extern void trap_init(void);
   40.10  extern void time_init(void);
   40.11 @@ -360,6 +361,8 @@ void __init start_of_day(void)
   40.12      time_init(); /* installs software handler for HZ clock. */
   40.13      init_apic_mappings(); /* make APICs addressable in our pagetables. */
   40.14  
   40.15 +    arch_init_memory();
   40.16 +
   40.17  #ifndef CONFIG_SMP    
   40.18      APIC_init_uniprocessor();
   40.19  #else
    41.1 --- a/xen/arch/x86/shadow.c	Fri Aug 20 09:11:43 2004 +0000
    41.2 +++ b/xen/arch/x86/shadow.c	Fri Aug 20 09:21:37 2004 +0000
    41.3 @@ -29,41 +29,6 @@ hypercall lock anyhow (at least initiall
    41.4  ********/
    41.5  
    41.6  
    41.7 -/**
    41.8 -
    41.9 -FIXME:
   41.10 -
   41.11 -The shadow table flush command is dangerous on SMP systems as the
   41.12 -guest may be using the L2 on one CPU while the other is trying to 
   41.13 -blow the table away. 
   41.14 -
   41.15 -The current save restore code works around this by not calling FLUSH,
   41.16 -but by calling CLEAN2 which leaves all L2s in tact (this is probably
   41.17 -quicker anyhow).
   41.18 -
   41.19 -Even so, we have to be very careful. The flush code may need to cause
   41.20 -a TLB flush on another CPU. It needs to do this while holding the
   41.21 -shadow table lock. The trouble is, the guest may be in the shadow page
   41.22 -fault handler spinning waiting to grab the shadow lock. It may have
   41.23 -intterupts disabled, hence we can't use the normal flush_tlb_cpu
   41.24 -mechanism.
   41.25 -
   41.26 -For the moment, we have a grim race whereby the spinlock in the shadow
   41.27 -fault handler is actually a try lock, in a loop with a helper for the
   41.28 -tlb flush code.
   41.29 -
   41.30 -A better soloution would be to take a new flush lock, then raise a
   41.31 -per-domain soft irq on the other CPU.  The softirq will switch to
   41.32 -init's PTs, then do an atomic inc of a variable to count himself in,
   41.33 -then spin on a lock.  Having noticed that the other guy has counted
   41.34 -in, flush the shadow table, then release him by dropping the lock. He
   41.35 -will then reload cr3 from mm.page_table on the way out of the softirq.
   41.36 -
   41.37 -In domian-softirq context we know that the guy holds no locks and has
   41.38 -interrupts enabled. Nothing can go wrong ;-)
   41.39 -
   41.40 -**/
   41.41 -
   41.42  static inline void free_shadow_page(struct mm_struct *m, 
   41.43                                      struct pfn_info *page)
   41.44  {
   41.45 @@ -381,9 +346,9 @@ static int shadow_mode_table_op(struct d
   41.46  		d->mm.shadow_dirty_net_count   = 0;
   41.47  		d->mm.shadow_dirty_block_count = 0;
   41.48  	
   41.49 -		sc->pages = d->tot_pages;
   41.50 +		sc->pages = d->max_pages;
   41.51  
   41.52 -		if( d->tot_pages > sc->pages || 
   41.53 +		if( d->max_pages > sc->pages || 
   41.54  			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   41.55  		{
   41.56  			rc = -EINVAL;
   41.57 @@ -393,10 +358,10 @@ static int shadow_mode_table_op(struct d
   41.58  	
   41.59  #define chunk (8*1024) // do this in 1KB chunks for L1 cache
   41.60  	
   41.61 -		for(i=0;i<d->tot_pages;i+=chunk)
   41.62 +		for(i=0;i<d->max_pages;i+=chunk)
   41.63  		{
   41.64 -			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   41.65 -							(chunk):(d->tot_pages-i) ) + 7) / 8;
   41.66 +			int bytes = ((  ((d->max_pages-i) > (chunk))?
   41.67 +							(chunk):(d->max_pages-i) ) + 7) / 8;
   41.68  	    
   41.69  			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   41.70  						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   41.71 @@ -428,21 +393,21 @@ static int shadow_mode_table_op(struct d
   41.72  		sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
   41.73  		sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
   41.74  	
   41.75 -		if( d->tot_pages > sc->pages || 
   41.76 +		if( d->max_pages > sc->pages || 
   41.77  			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   41.78  		{
   41.79  			rc = -EINVAL;
   41.80  			goto out;
   41.81  		}
   41.82  	
   41.83 -		sc->pages = d->tot_pages;
   41.84 +		sc->pages = d->max_pages;
   41.85  	
   41.86  #define chunk (8*1024) // do this in 1KB chunks for L1 cache
   41.87  	
   41.88 -		for(i=0;i<d->tot_pages;i+=chunk)
   41.89 +		for(i=0;i<d->max_pages;i+=chunk)
   41.90  		{
   41.91 -			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   41.92 -							(chunk):(d->tot_pages-i) ) + 7) / 8;
   41.93 +			int bytes = ((  ((d->max_pages-i) > (chunk))?
   41.94 +							(chunk):(d->max_pages-i) ) + 7) / 8;
   41.95  	    
   41.96  			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   41.97  						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   41.98 @@ -475,7 +440,13 @@ int shadow_mode_control(struct domain *d
   41.99      unsigned int cmd = sc->op;
  41.100      int rc = 0;
  41.101  
  41.102 -    spin_lock(&d->mm.shadow_lock);
  41.103 +	if (d == current)
  41.104 +		printk("Attempt to control your _own_ shadow tables. I hope you know what you're doing!\n");
  41.105 +
  41.106 +	domain_pause(d);
  41.107 +	synchronise_pagetables(d->processor);
  41.108 +
  41.109 +	spin_lock(&d->mm.shadow_lock);
  41.110  
  41.111      if ( cmd == DOM0_SHADOW_CONTROL_OP_OFF )
  41.112      {
  41.113 @@ -502,10 +473,10 @@ int shadow_mode_control(struct domain *d
  41.114          rc = -EINVAL;
  41.115      }
  41.116  
  41.117 -	flush_tlb_cpu(d->processor);
  41.118 -   
  41.119      spin_unlock(&d->mm.shadow_lock);
  41.120  
  41.121 +	domain_unpause(d);
  41.122 +
  41.123      return rc;
  41.124  }
  41.125  
  41.126 @@ -518,6 +489,7 @@ static inline struct pfn_info *alloc_sha
  41.127  void unshadow_table( unsigned long gpfn, unsigned int type )
  41.128  {
  41.129      unsigned long spfn;
  41.130 +	struct domain *d = frame_table[gpfn].u.inuse.domain;
  41.131  
  41.132      SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
  41.133              type,
  41.134 @@ -530,11 +502,11 @@ void unshadow_table( unsigned long gpfn,
  41.135      // even in the SMP guest case, there won't be a race here as
  41.136      // this CPU was the one that cmpxchg'ed the page to invalid
  41.137  
  41.138 -    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
  41.139 +    spfn = __shadow_status(&d->mm, gpfn) & PSH_pfn_mask;
  41.140  
  41.141 -    delete_shadow_status(&current->mm, gpfn);
  41.142 +    delete_shadow_status(&d->mm, gpfn);
  41.143  
  41.144 -    free_shadow_page( &current->mm, &frame_table[spfn] );
  41.145 +    free_shadow_page(&d->mm, &frame_table[spfn] );
  41.146  
  41.147  }
  41.148  
  41.149 @@ -651,15 +623,7 @@ int shadow_fault( unsigned long va, long
  41.150  
  41.151      // take the lock and reread gpte
  41.152  
  41.153 -    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
  41.154 -	{
  41.155 -		extern volatile unsigned long flush_cpumask;
  41.156 -		if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
  41.157 -			local_flush_tlb();
  41.158 -		rep_nop();
  41.159 -	}
  41.160 -	
  41.161 -	ASSERT(spin_is_locked(&current->mm.shadow_lock));
  41.162 +	spin_lock(&current->mm.shadow_lock);
  41.163  	
  41.164      if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
  41.165      {
    42.1 --- a/xen/arch/x86/smp.c	Fri Aug 20 09:11:43 2004 +0000
    42.2 +++ b/xen/arch/x86/smp.c	Fri Aug 20 09:21:37 2004 +0000
    42.3 @@ -212,7 +212,7 @@ static inline void send_IPI_allbutself(i
    42.4   */
    42.5  
    42.6  static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED;
    42.7 -volatile unsigned long flush_cpumask;
    42.8 +static unsigned long flush_cpumask;
    42.9  
   42.10  asmlinkage void smp_invalidate_interrupt(void)
   42.11  {
    43.1 --- a/xen/common/dom0_ops.c	Fri Aug 20 09:11:43 2004 +0000
    43.2 +++ b/xen/common/dom0_ops.c	Fri Aug 20 09:21:37 2004 +0000
    43.3 @@ -25,13 +25,14 @@
    43.4  
    43.5  extern unsigned int alloc_new_dom_mem(struct domain *, unsigned int);
    43.6  extern long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op);
    43.7 -extern void arch_getdomaininfo_ctxt(struct domain *, full_execution_context_t *);
    43.8 +extern void arch_getdomaininfo_ctxt(
    43.9 +    struct domain *, full_execution_context_t *);
   43.10  
   43.11  static inline int is_free_domid(domid_t dom)
   43.12  {
   43.13      struct domain *d;
   43.14  
   43.15 -    if ( dom >= DOMID_SELF )
   43.16 +    if ( dom >= DOMID_FIRST_RESERVED )
   43.17          return 0;
   43.18  
   43.19      if ( (d = find_domain_by_id(dom)) == NULL )
   43.20 @@ -66,7 +67,7 @@ static int allocate_domid(domid_t *pdom)
   43.21      }
   43.22  
   43.23      /* Couldn't find a free domain id in 0..topdom, try higher. */
   43.24 -    for ( dom = topdom; dom < DOMID_SELF; dom++ )
   43.25 +    for ( dom = topdom; dom < DOMID_FIRST_RESERVED; dom++ )
   43.26      {
   43.27          if ( is_free_domid(dom) )
   43.28          {
   43.29 @@ -167,7 +168,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   43.30          domid_t           dom;
   43.31  
   43.32          dom = op->u.createdomain.domain;
   43.33 -        if ( (dom > 0) && (dom < DOMID_SELF) )
   43.34 +        if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
   43.35          {
   43.36              ret = -EINVAL;
   43.37              if ( !is_free_domid(dom) )
    44.1 --- a/xen/common/domain.c	Fri Aug 20 09:11:43 2004 +0000
    44.2 +++ b/xen/common/domain.c	Fri Aug 20 09:21:37 2004 +0000
    44.3 @@ -39,11 +39,18 @@ struct domain *do_createdomain(domid_t d
    44.4      d->domain    = dom_id;
    44.5      d->processor = cpu;
    44.6      d->create_time = NOW();
    44.7 -    /* Initialise the sleep_lock */
    44.8      spin_lock_init(&d->sleep_lock);
    44.9   
   44.10      memcpy(&d->thread, &idle0_task.thread, sizeof(d->thread));
   44.11  
   44.12 +    spin_lock_init(&d->page_alloc_lock);
   44.13 +    INIT_LIST_HEAD(&d->page_list);
   44.14 +    d->max_pages = d->tot_pages = 0;
   44.15 +
   44.16 +    /* Per-domain PCI-device list. */
   44.17 +    spin_lock_init(&d->pcidev_lock);
   44.18 +    INIT_LIST_HEAD(&d->pcidev_list);
   44.19 +
   44.20      if ( d->domain != IDLE_DOMAIN_ID )
   44.21      {
   44.22          if ( init_event_channels(d) != 0 )
   44.23 @@ -59,16 +66,8 @@ struct domain *do_createdomain(domid_t d
   44.24  
   44.25          d->addr_limit = USER_DS;
   44.26          
   44.27 -        spin_lock_init(&d->page_alloc_lock);
   44.28 -        INIT_LIST_HEAD(&d->page_list);
   44.29 -        d->max_pages = d->tot_pages = 0;
   44.30 -
   44.31  	arch_do_createdomain(d);
   44.32  
   44.33 -        /* Per-domain PCI-device list. */
   44.34 -        spin_lock_init(&d->pcidev_lock);
   44.35 -        INIT_LIST_HEAD(&d->pcidev_list);
   44.36 -
   44.37          sched_add_domain(d);
   44.38  
   44.39          write_lock_irqsave(&tasklist_lock, flags);
    45.1 --- a/xen/common/kernel.c	Fri Aug 20 09:11:43 2004 +0000
    45.2 +++ b/xen/common/kernel.c	Fri Aug 20 09:21:37 2004 +0000
    45.3 @@ -304,9 +304,6 @@ void cmain(multiboot_info_t *mbi)
    45.4  
    45.5      start_of_day();
    45.6  
    45.7 -    /* Add CPU0 idle task to the task hash list */
    45.8 -    task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
    45.9 -
   45.10      /* Create initial domain 0. */
   45.11      new_dom = do_createdomain(0, 0);
   45.12      if ( new_dom == NULL )
    46.1 --- a/xen/common/memory.c	Fri Aug 20 09:11:43 2004 +0000
    46.2 +++ b/xen/common/memory.c	Fri Aug 20 09:21:37 2004 +0000
    46.3 @@ -37,14 +37,8 @@ struct pfn_info *frame_table;
    46.4  unsigned long frame_table_size;
    46.5  unsigned long max_page;
    46.6  
    46.7 -extern void arch_init_memory(void);
    46.8 -
    46.9  void __init init_frametable(void *frametable_vstart, unsigned long nr_pages)
   46.10  {
   46.11 -    unsigned long mfn;
   46.12 -
   46.13 -    arch_init_memory();
   46.14 -
   46.15      max_page = nr_pages;
   46.16      frame_table_size = nr_pages * sizeof(struct pfn_info);
   46.17      frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
   46.18 @@ -54,17 +48,4 @@ void __init init_frametable(void *framet
   46.19          panic("Not enough memory for frame table - reduce Xen heap size?\n");
   46.20  
   46.21      memset(frame_table, 0, frame_table_size);
   46.22 -
   46.23 -    /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
   46.24 -    memset(machine_to_phys_mapping, 0x55, 4<<20);
   46.25 -
   46.26 -    /* Pin the ownership of the MP table so that DOM0 can map it later. */
   46.27 -    for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
   46.28 -          mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
   46.29 -          mfn++ )
   46.30 -    {
   46.31 -        frame_table[mfn].u.inuse.count_info = 1 | PGC_allocated;
   46.32 -        frame_table[mfn].u.inuse.type_info = 1 | PGT_gdt_page; /* non-RW */
   46.33 -        frame_table[mfn].u.inuse.domain = &idle0_task;
   46.34 -    }
   46.35  }
    47.1 --- a/xen/common/page_alloc.c	Fri Aug 20 09:11:43 2004 +0000
    47.2 +++ b/xen/common/page_alloc.c	Fri Aug 20 09:21:37 2004 +0000
    47.3 @@ -393,7 +393,7 @@ struct pfn_info *alloc_domheap_pages(str
    47.4      }
    47.5  
    47.6      if ( unlikely(d->tot_pages == 0) )
    47.7 -        get_domain(d);
    47.8 +        get_knownalive_domain(d);
    47.9  
   47.10      d->tot_pages += 1 << order;
   47.11  
   47.12 @@ -422,7 +422,7 @@ void free_domheap_pages(struct pfn_info 
   47.13          drop_dom_ref = (d->xenheap_pages == 0);
   47.14          spin_unlock_recursive(&d->page_alloc_lock);
   47.15      }
   47.16 -    else
   47.17 +    else if ( likely(d != NULL) )
   47.18      {
   47.19          /* NB. May recursively lock from domain_relinquish_memory(). */
   47.20          spin_lock_recursive(&d->page_alloc_lock);
   47.21 @@ -442,6 +442,12 @@ void free_domheap_pages(struct pfn_info 
   47.22  
   47.23          free_heap_pages(MEMZONE_DOM, pg, order);
   47.24      }
   47.25 +    else
   47.26 +    {
   47.27 +        /* Freeing an anonymous domain-heap page. */
   47.28 +        free_heap_pages(MEMZONE_DOM, pg, order);
   47.29 +        drop_dom_ref = 0;
   47.30 +    }
   47.31  
   47.32      if ( drop_dom_ref )
   47.33          put_domain(d);
    48.1 --- a/xen/include/asm-x86/mm.h	Fri Aug 20 09:11:43 2004 +0000
    48.2 +++ b/xen/include/asm-x86/mm.h	Fri Aug 20 09:21:37 2004 +0000
    48.3 @@ -108,7 +108,7 @@ struct pfn_info
    48.4          /* _dom holds an allocation reference */                            \
    48.5          (_pfn)->u.inuse.count_info = PGC_allocated | 1;                     \
    48.6          if ( unlikely((_dom)->xenheap_pages++ == 0) )                       \
    48.7 -            get_domain(_dom);                                               \
    48.8 +            get_knownalive_domain(_dom);                                    \
    48.9          spin_unlock(&(_dom)->page_alloc_lock);                              \
   48.10      } while ( 0 )
   48.11  
    49.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h	Fri Aug 20 09:11:43 2004 +0000
    49.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h	Fri Aug 20 09:21:37 2004 +0000
    49.3 @@ -87,8 +87,10 @@
    49.4   * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
    49.5   * Updates an entry in a page table. If updating an L1 table, and the new
    49.6   * table entry is valid/present, the mapped frame must belong to the FD, if
    49.7 - * an FD has been specified. If attempting to map an I/O page, then the FD
    49.8 - * is ignored, but the calling domain must have sufficient privilege.
    49.9 + * an FD has been specified. If attempting to map an I/O page then the
   49.10 + * caller assumes the privilege of the FD.
   49.11 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
   49.12 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
   49.13   * ptr[:2]  -- Machine address of the page-table entry to modify.
   49.14   * val      -- Value to write.
   49.15   * 
   49.16 @@ -121,6 +123,7 @@
   49.17   *   val[7:0] == MMUEXT_SET_FOREIGNDOM:
   49.18   *   val[31:15] -- Domain to set as the Foreign Domain (FD).
   49.19   *                 (NB. DOMID_SELF is not recognised)
   49.20 + *                 If FD != DOMID_IO then the caller must be privileged.
   49.21   * 
   49.22   *   val[7:0] == MMUEXT_REASSIGN_PAGE:
   49.23   *   ptr[:2]  -- A machine address within the page to be reassigned to the FD.
   49.24 @@ -186,9 +189,31 @@
   49.25  #ifndef __ASSEMBLY__
   49.26  
   49.27  typedef u16 domid_t;
   49.28 +
   49.29 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
   49.30 +#define DOMID_FIRST_RESERVED (0x7FF0U)
   49.31 +
   49.32  /* DOMID_SELF is used in certain contexts to refer to oneself. */
   49.33 -#define DOMID_SELF  (0x7FF0U)
   49.34 -/* NB. IDs >= 0x7FF1 are reserved for future use. */
   49.35 +#define DOMID_SELF (0x7FF0U)
   49.36 +
   49.37 +/*
   49.38 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
   49.39 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
   49.40 + * is useful to ensure that no mappings to the OS's own heap are accidentally
   49.41 + * installed. (e.g., in Linux this could cause havoc as reference counts
   49.42 + * aren't adjusted on the I/O-mapping code path).
   49.43 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
   49.44 + * be specified by any calling domain.
   49.45 + */
   49.46 +#define DOMID_IO   (0x7FF1U)
   49.47 +
   49.48 +/*
   49.49 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
   49.50 + * Xen's heap space (e.g., the machine_to_phys table).
   49.51 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
   49.52 + * the caller is privileged.
   49.53 + */
   49.54 +#define DOMID_XEN  (0x7FF2U)
   49.55  
   49.56  /*
   49.57   * Send an array of these to HYPERVISOR_mmu_update().
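As a sketch of how a guest or control tool would use the new reserved ids through the interface documented above: select DOMID_IO with an MMUEXT_SET_FOREIGNDOM extended command, then issue normal PT updates for the I/O frames. The exact bit position of the FD in val, the PTE flags, and the helper name are assumptions for illustration, not taken from this changeset:

    /* Hypothetical helper: install one I/O mapping at caller privilege. */
    static void map_io_frame_sketch(unsigned long pte_machine_addr,
                                    unsigned long io_frame_machine_addr)
    {
        mmu_update_t req[2];

        /* 1. FD := DOMID_IO; per the note above this is permitted even for
         *    unprivileged callers, and restricts updates to I/O mappings. */
        req[0].ptr = MMU_EXTENDED_COMMAND;
        req[0].val = MMUEXT_SET_FOREIGNDOM |
                     ((unsigned long)DOMID_IO << 16);  /* FD field; shift assumed */

        /* 2. Point the L1 entry at the I/O frame. */
        req[1].ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
        req[1].val = io_frame_machine_addr | _PAGE_PRESENT | _PAGE_RW;

        /* The two entries would then be passed to HYPERVISOR_mmu_update();
         * the wrapper's exact signature is not shown in this changeset. */
    }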
    50.1 --- a/xen/include/xen/sched.h	Fri Aug 20 09:11:43 2004 +0000
    50.2 +++ b/xen/include/xen/sched.h	Fri Aug 20 09:21:37 2004 +0000
    50.3 @@ -164,11 +164,26 @@ struct domain *alloc_domain_struct();
    50.4  #define DOMAIN_DESTRUCTED (1<<31) /* assumes atomic_t is >= 32 bits */
    50.5  #define put_domain(_d) \
    50.6    if ( atomic_dec_and_test(&(_d)->refcnt) ) domain_destruct(_d)
    50.7 +
    50.8 +/*
    50.9 + * Use this when you don't have an existing reference to @d. It returns
   50.10 + * FALSE if @d is being destructed.
   50.11 + */
   50.12  static inline int get_domain(struct domain *d)
   50.13  {
   50.14      atomic_inc(&d->refcnt);
   50.15      return !(atomic_read(&d->refcnt) & DOMAIN_DESTRUCTED);
   50.16  }
   50.17 +
   50.18 +/*
   50.19 + * Use this when you already have, or are borrowing, a reference to @d.
   50.20 + * In this case we know that @d cannot be destructed under our feet.
   50.21 + */
   50.22 +static inline void get_knownalive_domain(struct domain *d)
   50.23 +{
   50.24 +    atomic_inc(&d->refcnt);
   50.25 +    ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTRUCTED));
   50.26 +}
   50.27    
   50.28  extern struct domain *do_createdomain(
   50.29      domid_t dom_id, unsigned int cpu);
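A short sketch of the intended split between the two reference helpers above; the caller and both domain pointers are hypothetical:

    static void refcount_sketch(struct domain *unknown, struct domain *held)
    {
        /* No existing reference: the domain may be mid-destruction, so the
         * boolean result of get_domain() must be checked. */
        if ( get_domain(unknown) )
        {
            /* ... safe to use 'unknown' here ... */
            put_domain(unknown);
        }

        /* A reference to 'held' is already owned or borrowed, so it cannot be
         * destructed under our feet; take the unchecked fast path. */
        get_knownalive_domain(held);
        /* ... e.g. hand this extra reference to another structure ... */
        put_domain(held);
    }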