ia64/xen-unstable

changeset 2009:bf9e767bd96f

bitkeeper revision 1.1108.33.17 (410a9834dLHZXL-7q43y8gsiD74zLQ)

Merge freefall.cl.cam.ac.uk:/auto/groups/xeno/users/cl349/BK/xeno.bk-26dom0
into freefall.cl.cam.ac.uk:/local/scratch/cl349/xeno.bk-26dom0
author cl349@freefall.cl.cam.ac.uk
date Fri Jul 30 18:49:24 2004 +0000 (2004-07-30)
parents 0612b896b449 10ed1c2146f2
children 6a62546fdf51
files	.rootkeys
	linux-2.4.26-xen-sparse/mkbuildtree
	linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
	linux-2.6.7-xen-sparse/drivers/xen/Makefile
	linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
	linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
	linux-2.6.7-xen-sparse/drivers/xen/net/network.c
	linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
	linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
	linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
	linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
	linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
	linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
	linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
	linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
	linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h
	linux-2.6.7-xen-sparse/mm/page_alloc.c
line diff
     1.1 --- a/.rootkeys	Fri Jul 30 14:22:41 2004 +0000
     1.2 +++ b/.rootkeys	Fri Jul 30 18:49:24 2004 +0000
     1.3 @@ -203,9 +203,14 @@ 40f56239fsLjvtD8YBRAWphps4FDjg linux-2.6
     1.4  3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c
     1.5  40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile
     1.6  40f56239DoibTX6R-ZYd3QTXAB8_TA linux-2.6.7-xen-sparse/drivers/xen/evtchn/evtchn.c
     1.7 -40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
     1.8 -40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
     1.9 -405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/net/network.c
    1.10 +410a9817HEVJvred5Oy_uKH3HFJC5Q linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
    1.11 +4097ba831lpGeLlPg-bfV8XarVVuoQ linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
    1.12 +4097ba83wvv8yi5P5xugCUBAdb6O-A linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
    1.13 +4097ba83byY5bTSugJGZ1exTxIcMKw linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
    1.14 +4087cf0dGmSbFhFZyIZBJzvqxY-qBw linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
    1.15 +40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
    1.16 +40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
    1.17 +405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
    1.18  4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.7-xen-sparse/drivers/xen/privcmd/Makefile
    1.19  3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c
    1.20  40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/desc.h
    1.21 @@ -251,6 +256,7 @@ 3f108af1ylCIm82H052FVTfXACBHrw linux-2.6
    1.22  40f5623cBiQhPHILVLrl3xa6bDBaRg linux-2.6.7-xen-sparse/include/asm-xen/xen.h
    1.23  3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
    1.24  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
    1.25 +410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
    1.26  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.27  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
    1.28  401d7e160vaxMBAUSLSicuZ7AQjJ3w tools/examples/Makefile
     2.1 --- a/linux-2.4.26-xen-sparse/mkbuildtree	Fri Jul 30 14:22:41 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/mkbuildtree	Fri Jul 30 18:49:24 2004 +0000
     2.3 @@ -238,4 +238,4 @@ cd ${AD}/arch/xen/drivers/dom0
     2.4  ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c
     2.5  
     2.6  cd ${AD}/arch/xen/drivers/netif/frontend
     2.7 -ln -sf ../../../../../${LINUX_26}/drivers/xen/net/network.c main.c
     2.8 +ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c
     3.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Jul 30 14:22:41 2004 +0000
     3.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Jul 30 18:49:24 2004 +0000
     3.3 @@ -8,6 +8,8 @@
     3.4  
     3.5  #include <linux/config.h>
     3.6  #include <linux/sched.h>
     3.7 +#include <linux/mm.h>
     3.8 +#include <linux/vmalloc.h>
     3.9  #include <asm/hypervisor.h>
    3.10  #include <asm/page.h>
    3.11  #include <asm/pgtable.h>
    3.12 @@ -126,8 +128,11 @@ static inline void __flush_page_update_q
    3.13  #endif
    3.14      idx = 0;
    3.15      wmb(); /* Make sure index is cleared first to avoid double updates. */
    3.16 -    if (unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0))
    3.17 -	panic("Failed to execute MMU updates");
    3.18 +    if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0) )
    3.19 +    {
    3.20 +        printk(KERN_ALERT "Failed to execute MMU updates.\n");
    3.21 +        BUG();
    3.22 +    }
    3.23  }
    3.24  
    3.25  void _flush_page_update_queue(void)
    3.26 @@ -262,3 +267,91 @@ void queue_machphys_update(unsigned long
    3.27      increment_index();
    3.28      spin_unlock_irqrestore(&update_lock, flags);
    3.29  }
    3.30 +
    3.31 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS
    3.32 +
    3.33 +unsigned long allocate_empty_lowmem_region(unsigned long pages)
    3.34 +{
    3.35 +    pgd_t         *pgd; 
    3.36 +    pmd_t         *pmd;
    3.37 +    pte_t         *pte;
    3.38 +    unsigned long *pfn_array;
    3.39 +    unsigned long  vstart;
    3.40 +    unsigned long  i;
    3.41 +    int            ret;
    3.42 +    unsigned int   order = get_order(pages*PAGE_SIZE);
    3.43 +
    3.44 +    vstart = __get_free_pages(GFP_KERNEL, order);
    3.45 +    if ( vstart == 0 )
    3.46 +        return 0UL;
    3.47 +
    3.48 +    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
    3.49 +    if ( pfn_array == NULL )
    3.50 +        BUG();
    3.51 +
    3.52 +    for ( i = 0; i < (1<<order); i++ )
    3.53 +    {
    3.54 +        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
    3.55 +        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
    3.56 +        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
    3.57 +        pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
    3.58 +        queue_l1_entry_update(pte, 0);
    3.59 +        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
    3.60 +    }
    3.61 +
    3.62 +    flush_page_update_queue();
    3.63 +
    3.64 +    ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, 
    3.65 +                                pfn_array, 1<<order);
    3.66 +    if ( unlikely(ret != (1<<order)) )
    3.67 +    {
    3.68 +        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
    3.69 +        BUG();
    3.70 +    }
    3.71 +
    3.72 +    vfree(pfn_array);
    3.73 +
    3.74 +    return vstart;
    3.75 +}
    3.76 +
    3.77 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
    3.78 +{
    3.79 +    pgd_t         *pgd; 
    3.80 +    pmd_t         *pmd;
    3.81 +    pte_t         *pte;
    3.82 +    unsigned long *pfn_array;
    3.83 +    unsigned long  i;
    3.84 +    int            ret;
    3.85 +    unsigned int   order = get_order(pages*PAGE_SIZE);
    3.86 +
    3.87 +    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
    3.88 +    if ( pfn_array == NULL )
    3.89 +        BUG();
    3.90 +
    3.91 +    ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
    3.92 +                                pfn_array, 1<<order);
    3.93 +    if ( unlikely(ret != (1<<order)) )
    3.94 +    {
    3.95 +        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
    3.96 +               ret);
    3.97 +        BUG();
    3.98 +    }
    3.99 +
   3.100 +    for ( i = 0; i < (1<<order); i++ )
   3.101 +    {
   3.102 +        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
   3.103 +        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
   3.104 +        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
   3.105 +        queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
   3.106 +        queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT);
   3.107 +        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i];
   3.108 +    }
   3.109 +
   3.110 +    flush_page_update_queue();
   3.111 +
   3.112 +    vfree(pfn_array);
   3.113 +
   3.114 +    free_pages(vstart, order);
   3.115 +}
   3.116 +
   3.117 +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
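
The two CONFIG_XEN_PHYSDEV_ACCESS helpers added above punch a hole in the lowmem direct map: allocate_empty_lowmem_region() takes page-aligned kernel memory, clears its PTEs, and hands the machine frames back to Xen via MEMOP_decrease_reservation, while deallocate_lowmem_region() repopulates the region and frees the pages. A minimal usage sketch follows; it is an illustration only, not code from this changeset, assumes the usual kernel headers, and NR_MMAP_PAGES is a hypothetical constant.

/* Sketch: a physical-device backend reserving an empty lowmem region at
 * load time and releasing it on exit.  Only the two helpers added to
 * hypervisor.c above are real; everything else is hypothetical. */
#define NR_MMAP_PAGES 256

static unsigned long mmap_vstart;

static int __init example_reserve_region(void)
{
    /* Kernel VA of a region whose PTEs are now clear and whose machine
     * frames have been returned to Xen. */
    mmap_vstart = allocate_empty_lowmem_region(NR_MMAP_PAGES);
    if ( mmap_vstart == 0 )
        return -ENOMEM;
    return 0;
}

static void __exit example_release_region(void)
{
    /* Re-populate the region with machine frames, then free the pages. */
    deallocate_lowmem_region(mmap_vstart, NR_MMAP_PAGES);
}
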
     4.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/Makefile	Fri Jul 30 14:22:41 2004 +0000
     4.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/Makefile	Fri Jul 30 18:49:24 2004 +0000
     4.3 @@ -3,5 +3,7 @@
     4.4  obj-y	+= block/
     4.5  obj-y	+= console/
     4.6  obj-y	+= evtchn/
     4.7 -obj-y	+= net/
     4.8 +obj-y	+= netfront/
     4.9  obj-y	+= privcmd/
    4.10 +
    4.11 +obj-$(CONFIG_XEN_PHYSDEV_ACCESS)	+= netback/
     5.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig	Fri Jul 30 14:22:41 2004 +0000
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,6 +0,0 @@
     5.4 -
     5.5 -config XENNET
     5.6 -	tristate "Xen network driver"
     5.7 -	depends on NETDEVICES && ARCH_XEN
     5.8 -	help
     5.9 -	  Network driver for Xen
     6.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile	Fri Jul 30 14:22:41 2004 +0000
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,2 +0,0 @@
     6.4 -
     6.5 -obj-y	:= network.o
     7.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/net/network.c	Fri Jul 30 14:22:41 2004 +0000
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,882 +0,0 @@
     7.4 -/******************************************************************************
     7.5 - * Virtual network driver for conversing with remote driver backends.
     7.6 - * 
     7.7 - * Copyright (c) 2002-2004, K A Fraser
     7.8 - */
     7.9 -
    7.10 -#include <linux/config.h>
    7.11 -#include <linux/module.h>
    7.12 -#include <linux/version.h>
    7.13 -#include <linux/kernel.h>
    7.14 -#include <linux/sched.h>
    7.15 -#include <linux/slab.h>
    7.16 -#include <linux/string.h>
    7.17 -#include <linux/errno.h>
    7.18 -#include <linux/netdevice.h>
    7.19 -#include <linux/inetdevice.h>
    7.20 -#include <linux/etherdevice.h>
    7.21 -#include <linux/skbuff.h>
    7.22 -#include <linux/init.h>
    7.23 -
    7.24 -#include <asm/io.h>
    7.25 -#include <net/sock.h>
    7.26 -#include <net/pkt_sched.h>
    7.27 -
    7.28 -#include <asm-xen/evtchn.h>
    7.29 -#include <asm-xen/ctrl_if.h>
    7.30 -
    7.31 -#include <asm/page.h>
    7.32 -
    7.33 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
    7.34 -#include <asm-xen/netif.h>
    7.35 -#else
    7.36 -#include "../netif.h"
    7.37 -#define irqreturn_t void
    7.38 -#define IRQ_HANDLED
    7.39 -#endif
    7.40 -
    7.41 -#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
    7.42 -
    7.43 -static void network_tx_buf_gc(struct net_device *dev);
    7.44 -static void network_alloc_rx_buffers(struct net_device *dev);
    7.45 -
    7.46 -static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
    7.47 -static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
    7.48 -static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
    7.49 -
    7.50 -static struct list_head dev_list;
    7.51 -
    7.52 -struct net_private
    7.53 -{
    7.54 -    struct list_head list;
    7.55 -    struct net_device *dev;
    7.56 -
    7.57 -    struct net_device_stats stats;
    7.58 -    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
    7.59 -    unsigned int tx_full;
    7.60 -    
    7.61 -    netif_tx_interface_t *tx;
    7.62 -    netif_rx_interface_t *rx;
    7.63 -
    7.64 -    spinlock_t   tx_lock;
    7.65 -    spinlock_t   rx_lock;
    7.66 -
    7.67 -    unsigned int handle;
    7.68 -    unsigned int evtchn;
    7.69 -    unsigned int irq;
    7.70 -
    7.71 -    /* What is the status of our connection to the remote backend? */
    7.72 -#define BEST_CLOSED       0
    7.73 -#define BEST_DISCONNECTED 1
    7.74 -#define BEST_CONNECTED    2
    7.75 -    unsigned int backend_state;
    7.76 -
    7.77 -    /* Is this interface open or closed (down or up)? */
    7.78 -#define UST_CLOSED        0
    7.79 -#define UST_OPEN          1
    7.80 -    unsigned int user_state;
    7.81 -
    7.82 -    /*
    7.83 -     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
    7.84 -     * array is an index into a chain of free entries.
    7.85 -     */
    7.86 -    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
    7.87 -    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
    7.88 -};
    7.89 -
    7.90 -/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
    7.91 -#define ADD_ID_TO_FREELIST(_list, _id)             \
    7.92 -    (_list)[(_id)] = (_list)[0];                   \
    7.93 -    (_list)[0]     = (void *)(unsigned long)(_id);
    7.94 -#define GET_ID_FROM_FREELIST(_list)                \
    7.95 - ({ unsigned long _id = (unsigned long)(_list)[0]; \
    7.96 -    (_list)[0]  = (_list)[_id];                    \
    7.97 -    (unsigned short)_id; })
    7.98 -
    7.99 -static struct net_device *find_dev_by_handle(unsigned int handle)
   7.100 -{
   7.101 -    struct list_head *ent;
   7.102 -    struct net_private *np;
   7.103 -    list_for_each ( ent, &dev_list )
   7.104 -    {
   7.105 -        np = list_entry(ent, struct net_private, list);
   7.106 -        if ( np->handle == handle )
   7.107 -            return np->dev;
   7.108 -    }
   7.109 -    return NULL;
   7.110 -}
   7.111 -
   7.112 -/** Network interface info. */
   7.113 -struct netif_ctrl {
   7.114 -    /** Number of interfaces. */
   7.115 -    int interface_n;
   7.116 -    /** Number of connected interfaces. */
   7.117 -    int connected_n;
   7.118 -    /** Error code. */
   7.119 -    int err;
   7.120 -};
   7.121 -
   7.122 -static struct netif_ctrl netctrl;
   7.123 -
   7.124 -static void netctrl_init(void)
   7.125 -{
   7.126 -    memset(&netctrl, 0, sizeof(netctrl));
   7.127 -    netctrl.interface_n = -1;
   7.128 -}
   7.129 -
   7.130 -/** Get or set a network interface error.
   7.131 - */
   7.132 -static int netctrl_err(int err)
   7.133 -{
   7.134 -    if(err < 0 && !netctrl.err){
   7.135 -        netctrl.err = err;
   7.136 -        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
   7.137 -    }
   7.138 -    return netctrl.err;
   7.139 -}
   7.140 -
   7.141 -/** Test if all network interfaces are connected.
   7.142 - *
   7.143 - * @return 1 if all connected, 0 if not, negative error code otherwise
   7.144 - */
   7.145 -static int netctrl_connected(void)
   7.146 -{
   7.147 -    int ok = 0;
   7.148 -    ok = (netctrl.err ? netctrl.err :
   7.149 -          (netctrl.connected_n == netctrl.interface_n));
   7.150 -    return ok;
   7.151 -}
   7.152 -
   7.153 -/** Count the connected network interfaces.
   7.154 - *
   7.155 - * @return connected count
   7.156 - */
   7.157 -static int netctrl_connected_count(void)
   7.158 -{
   7.159 -    
   7.160 -    struct list_head *ent;
   7.161 -    struct net_private *np;
   7.162 -    unsigned int connected;
   7.163 -
   7.164 -    connected = 0;
   7.165 -    
   7.166 -    list_for_each(ent, &dev_list)
   7.167 -    {
   7.168 -        np = list_entry(ent, struct net_private, list);
   7.169 -        if ( np->backend_state == BEST_CONNECTED )
   7.170 -            connected++;
   7.171 -    }
   7.172 -
   7.173 -    netctrl.connected_n = connected;
   7.174 -    return connected;
   7.175 -}
   7.176 -
   7.177 -static int network_open(struct net_device *dev)
   7.178 -{
   7.179 -    struct net_private *np = dev->priv;
   7.180 -
   7.181 -    memset(&np->stats, 0, sizeof(np->stats));
   7.182 -
   7.183 -    np->user_state = UST_OPEN;
   7.184 -
   7.185 -    network_alloc_rx_buffers(dev);
   7.186 -    np->rx->event = np->rx_resp_cons + 1;
   7.187 -
   7.188 -    netif_start_queue(dev);
   7.189 -
   7.190 -    return 0;
   7.191 -}
   7.192 -
   7.193 -
   7.194 -static void network_tx_buf_gc(struct net_device *dev)
   7.195 -{
   7.196 -    NETIF_RING_IDX i, prod;
   7.197 -    unsigned short id;
   7.198 -    struct net_private *np = dev->priv;
   7.199 -    struct sk_buff *skb;
   7.200 -
   7.201 -    if ( np->backend_state != BEST_CONNECTED )
   7.202 -        return;
   7.203 -
   7.204 -    do {
   7.205 -        prod = np->tx->resp_prod;
   7.206 -
   7.207 -        for ( i = np->tx_resp_cons; i != prod; i++ )
   7.208 -        {
   7.209 -            id  = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
   7.210 -            skb = np->tx_skbs[id];
   7.211 -            ADD_ID_TO_FREELIST(np->tx_skbs, id);
   7.212 -            dev_kfree_skb_any(skb);
   7.213 -        }
   7.214 -        
   7.215 -        np->tx_resp_cons = prod;
   7.216 -        
   7.217 -        /*
   7.218 -         * Set a new event, then check for race with update of tx_cons. Note
   7.219 -         * that it is essential to schedule a callback, no matter how few
   7.220 -         * buffers are pending. Even if there is space in the transmit ring,
   7.221 -         * higher layers may be blocked because too much data is outstanding:
   7.222 -         * in such cases notification from Xen is likely to be the only kick
   7.223 -         * that we'll get.
   7.224 -         */
   7.225 -        np->tx->event = 
   7.226 -            prod + ((np->tx->req_prod - prod) >> 1) + 1;
   7.227 -        mb();
   7.228 -    }
   7.229 -    while ( prod != np->tx->resp_prod );
   7.230 -
   7.231 -    if ( np->tx_full && 
   7.232 -         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
   7.233 -    {
   7.234 -        np->tx_full = 0;
   7.235 -        if ( np->user_state == UST_OPEN )
   7.236 -            netif_wake_queue(dev);
   7.237 -    }
   7.238 -}
   7.239 -
   7.240 -
   7.241 -static void network_alloc_rx_buffers(struct net_device *dev)
   7.242 -{
   7.243 -    unsigned short id;
   7.244 -    struct net_private *np = dev->priv;
   7.245 -    struct sk_buff *skb;
   7.246 -    NETIF_RING_IDX i = np->rx->req_prod;
   7.247 -    int nr_pfns = 0;
   7.248 -
   7.249 -    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
   7.250 -    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
   7.251 -         unlikely(np->backend_state != BEST_CONNECTED) )
   7.252 -        return;
   7.253 -
   7.254 -    do {
   7.255 -        skb = dev_alloc_skb(RX_BUF_SIZE);
   7.256 -        if ( unlikely(skb == NULL) )
   7.257 -            break;
   7.258 -
   7.259 -        skb->dev = dev;
   7.260 -
   7.261 -        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
   7.262 -            panic("alloc_skb needs to provide us page-aligned buffers.");
   7.263 -
   7.264 -        id = GET_ID_FROM_FREELIST(np->rx_skbs);
   7.265 -
   7.266 -        np->rx_skbs[id] = skb;
   7.267 -        
   7.268 -        np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
   7.269 -        
   7.270 -        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
   7.271 -
   7.272 -        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
   7.273 -        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
   7.274 -        rx_mcl[nr_pfns].args[1] = 0;
   7.275 -        rx_mcl[nr_pfns].args[2] = 0;
   7.276 -
   7.277 -        nr_pfns++;
   7.278 -    }
   7.279 -    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
   7.280 -
   7.281 -    if ( unlikely(nr_pfns == 0) )
   7.282 -        return;
   7.283 -
   7.284 -    /*
   7.285 -     * We may have allocated buffers which have entries outstanding in the page
   7.286 -     * update queue -- make sure we flush those first!
   7.287 -     */
   7.288 -    flush_page_update_queue();
   7.289 -
   7.290 -    /* After all PTEs have been zapped we blow away stale TLB entries. */
   7.291 -    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
   7.292 -
   7.293 -    /* Give away a batch of pages. */
   7.294 -    rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
   7.295 -    rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
   7.296 -    rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
   7.297 -    rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
   7.298 -
   7.299 -    /* Zap PTEs and give away pages in one big multicall. */
   7.300 -    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
   7.301 -
   7.302 -    /* Check return status of HYPERVISOR_dom_mem_op(). */
   7.303 -    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
   7.304 -        panic("Unable to reduce memory reservation\n");
   7.305 -
   7.306 -    np->rx->req_prod = i;
   7.307 -}
   7.308 -
   7.309 -
   7.310 -static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
   7.311 -{
   7.312 -    unsigned short id;
   7.313 -    struct net_private *np = (struct net_private *)dev->priv;
   7.314 -    netif_tx_request_t *tx;
   7.315 -    NETIF_RING_IDX i;
   7.316 -
   7.317 -    if ( unlikely(np->tx_full) )
   7.318 -    {
   7.319 -        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
   7.320 -        netif_stop_queue(dev);
   7.321 -        return -ENOBUFS;
   7.322 -    }
   7.323 -
   7.324 -    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
   7.325 -                  PAGE_SIZE) )
   7.326 -    {
   7.327 -        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
   7.328 -        if ( unlikely(new_skb == NULL) )
   7.329 -            return 1;
   7.330 -        skb_put(new_skb, skb->len);
   7.331 -        memcpy(new_skb->data, skb->data, skb->len);
   7.332 -        dev_kfree_skb(skb);
   7.333 -        skb = new_skb;
   7.334 -    }
   7.335 -    
   7.336 -    spin_lock_irq(&np->tx_lock);
   7.337 -
   7.338 -    if ( np->backend_state != BEST_CONNECTED )
   7.339 -    {
   7.340 -        spin_unlock_irq(&np->tx_lock);
   7.341 -        return 1;
   7.342 -    }
   7.343 -
   7.344 -    i = np->tx->req_prod;
   7.345 -
   7.346 -    id = GET_ID_FROM_FREELIST(np->tx_skbs);
   7.347 -    np->tx_skbs[id] = skb;
   7.348 -
   7.349 -    tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
   7.350 -
   7.351 -    tx->id   = id;
   7.352 -    tx->addr = virt_to_machine(skb->data);
   7.353 -    tx->size = skb->len;
   7.354 -
   7.355 -    wmb();
   7.356 -    np->tx->req_prod = i + 1;
   7.357 -
   7.358 -    network_tx_buf_gc(dev);
   7.359 -
   7.360 -    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
   7.361 -    {
   7.362 -        np->tx_full = 1;
   7.363 -        netif_stop_queue(dev);
   7.364 -    }
   7.365 -
   7.366 -    spin_unlock_irq(&np->tx_lock);
   7.367 -
   7.368 -    np->stats.tx_bytes += skb->len;
   7.369 -    np->stats.tx_packets++;
   7.370 -
   7.371 -    /* Only notify Xen if there are no outstanding responses. */
   7.372 -    mb();
   7.373 -    if ( np->tx->resp_prod == i )
   7.374 -        notify_via_evtchn(np->evtchn);
   7.375 -
   7.376 -    return 0;
   7.377 -}
   7.378 -
   7.379 -
   7.380 -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
   7.381 -{
   7.382 -    struct net_device *dev = dev_id;
   7.383 -    struct net_private *np = dev->priv;
   7.384 -    unsigned long flags;
   7.385 -
   7.386 -    spin_lock_irqsave(&np->tx_lock, flags);
   7.387 -    network_tx_buf_gc(dev);
   7.388 -    spin_unlock_irqrestore(&np->tx_lock, flags);
   7.389 -
   7.390 -    if ( (np->rx_resp_cons != np->rx->resp_prod) &&
   7.391 -         (np->user_state == UST_OPEN) )
   7.392 -        netif_rx_schedule(dev);
   7.393 -
   7.394 -    return IRQ_HANDLED;
   7.395 -}
   7.396 -
   7.397 -
   7.398 -static int netif_poll(struct net_device *dev, int *pbudget)
   7.399 -{
   7.400 -    struct net_private *np = dev->priv;
   7.401 -    struct sk_buff *skb;
   7.402 -    netif_rx_response_t *rx;
   7.403 -    NETIF_RING_IDX i;
   7.404 -    mmu_update_t *mmu = rx_mmu;
   7.405 -    multicall_entry_t *mcl = rx_mcl;
   7.406 -    int work_done, budget, more_to_do = 1;
   7.407 -    struct sk_buff_head rxq;
   7.408 -    unsigned long flags;
   7.409 -
   7.410 -    spin_lock(&np->rx_lock);
   7.411 -
   7.412 -    if ( np->backend_state != BEST_CONNECTED )
   7.413 -    {
   7.414 -        spin_unlock(&np->rx_lock);
   7.415 -        return 0;
   7.416 -    }
   7.417 -
   7.418 -    skb_queue_head_init(&rxq);
   7.419 -
   7.420 -    if ( (budget = *pbudget) > dev->quota )
   7.421 -        budget = dev->quota;
   7.422 -
   7.423 -    for ( i = np->rx_resp_cons, work_done = 0; 
   7.424 -          (i != np->rx->resp_prod) && (work_done < budget); 
   7.425 -          i++, work_done++ )
   7.426 -    {
   7.427 -        rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
   7.428 -
   7.429 -        /*
   7.430 -         * An error here is very odd. Usually indicates a backend bug,
   7.431 -         * low-memory condition, or that we didn't have reservation headroom.
   7.432 -         * Whatever - print an error and queue the id again straight away.
   7.433 -         */
   7.434 -        if ( unlikely(rx->status <= 0) )
   7.435 -        {
   7.436 -            /* Gate this error. We get a (valid) slew of them on suspend. */
   7.437 -            if ( np->user_state == UST_OPEN )
   7.438 -                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
   7.439 -            np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
   7.440 -            wmb();
   7.441 -            np->rx->req_prod++;
   7.442 -            continue;
   7.443 -        }
   7.444 -
   7.445 -        skb = np->rx_skbs[rx->id];
   7.446 -        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
   7.447 -
   7.448 -        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
   7.449 -        skb_put(skb, rx->status);
   7.450 -
   7.451 -        np->stats.rx_packets++;
   7.452 -        np->stats.rx_bytes += rx->status;
   7.453 -
   7.454 -        /* Remap the page. */
   7.455 -        mmu->ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
   7.456 -        mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
   7.457 -        mmu++;
   7.458 -        mcl->op = __HYPERVISOR_update_va_mapping;
   7.459 -        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
   7.460 -        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
   7.461 -        mcl->args[2] = 0;
   7.462 -        mcl++;
   7.463 -
   7.464 -        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
   7.465 -            rx->addr >> PAGE_SHIFT;
   7.466 -
   7.467 -        __skb_queue_tail(&rxq, skb);
   7.468 -    }
   7.469 -
   7.470 -    /* Do all the remapping work, and M->P updates, in one big hypercall. */
   7.471 -    if ( likely((mcl - rx_mcl) != 0) )
   7.472 -    {
   7.473 -        mcl->op = __HYPERVISOR_mmu_update;
   7.474 -        mcl->args[0] = (unsigned long)rx_mmu;
   7.475 -        mcl->args[1] = mmu - rx_mmu;
   7.476 -        mcl->args[2] = 0;
   7.477 -        mcl++;
   7.478 -        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
   7.479 -    }
   7.480 -
   7.481 -    while ( (skb = __skb_dequeue(&rxq)) != NULL )
   7.482 -    {
   7.483 -        /* Set the shared-info area, which is hidden behind the real data. */
   7.484 -        atomic_set(&(skb_shinfo(skb)->dataref), 1);
   7.485 -        skb_shinfo(skb)->nr_frags = 0;
   7.486 -        skb_shinfo(skb)->frag_list = NULL;
   7.487 -
   7.488 -        /* Ethernet-specific work. Delayed to here as it peeks the header. */
   7.489 -        skb->protocol = eth_type_trans(skb, dev);
   7.490 -
   7.491 -        /* Pass it up. */
   7.492 -        netif_rx(skb);
   7.493 -        dev->last_rx = jiffies;
   7.494 -    }
   7.495 -
   7.496 -    np->rx_resp_cons = i;
   7.497 -
   7.498 -    network_alloc_rx_buffers(dev);
   7.499 -
   7.500 -    *pbudget   -= work_done;
   7.501 -    dev->quota -= work_done;
   7.502 -
   7.503 -    if ( work_done < budget )
   7.504 -    {
   7.505 -        local_irq_save(flags);
   7.506 -
   7.507 -        np->rx->event = i + 1;
   7.508 -    
   7.509 -        /* Deal with hypervisor racing our resetting of rx_event. */
   7.510 -        mb();
   7.511 -        if ( np->rx->resp_prod == i )
   7.512 -        {
   7.513 -            __netif_rx_complete(dev);
   7.514 -            more_to_do = 0;
   7.515 -        }
   7.516 -
   7.517 -        local_irq_restore(flags);
   7.518 -    }
   7.519 -
   7.520 -    spin_unlock(&np->rx_lock);
   7.521 -
   7.522 -    return more_to_do;
   7.523 -}
   7.524 -
   7.525 -
   7.526 -static int network_close(struct net_device *dev)
   7.527 -{
   7.528 -    struct net_private *np = dev->priv;
   7.529 -    np->user_state = UST_CLOSED;
   7.530 -    netif_stop_queue(np->dev);
   7.531 -    return 0;
   7.532 -}
   7.533 -
   7.534 -
   7.535 -static struct net_device_stats *network_get_stats(struct net_device *dev)
   7.536 -{
   7.537 -    struct net_private *np = (struct net_private *)dev->priv;
   7.538 -    return &np->stats;
   7.539 -}
   7.540 -
   7.541 -
   7.542 -static void network_connect(struct net_device *dev,
   7.543 -                            netif_fe_interface_status_changed_t *status)
   7.544 -{
   7.545 -    struct net_private *np;
   7.546 -    int i, requeue_idx;
   7.547 -    netif_tx_request_t *tx;
   7.548 -
   7.549 -    np = dev->priv;
   7.550 -    spin_lock_irq(&np->rx_lock);
   7.551 -    spin_lock(&np->tx_lock);
   7.552 -
   7.553 -    /* Recovery procedure: */
   7.554 -
   7.555 -    /* Step 1: Reinitialise variables. */
   7.556 -    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
   7.557 -    np->rx->event = 1;
   7.558 -
   7.559 -    /* Step 2: Rebuild the RX and TX ring contents.
   7.560 -     * NB. We could just free the queued TX packets now but we hope
   7.561 -     * that sending them out might do some good.  We have to rebuild
   7.562 -     * the RX ring because some of our pages are currently flipped out
   7.563 -     * so we can't just free the RX skbs.
   7.564 -     * NB2. Freelist index entries are always going to be less than
   7.565 -     *  __PAGE_OFFSET, whereas pointers to skbs will always be equal or
   7.566 -     * greater than __PAGE_OFFSET: we use this property to distinguish
   7.567 -     * them.
   7.568 -     */
   7.569 -
   7.570 -    /* Rebuild the TX buffer freelist and the TX ring itself.
   7.571 -     * NB. This reorders packets.  We could keep more private state
   7.572 -     * to avoid this but maybe it doesn't matter so much given the
   7.573 -     * interface has been down.
   7.574 -     */
   7.575 -    for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
   7.576 -    {
   7.577 -            if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
   7.578 -            {
   7.579 -                struct sk_buff *skb = np->tx_skbs[i];
   7.580 -                
   7.581 -                tx = &np->tx->ring[requeue_idx++].req;
   7.582 -                
   7.583 -                tx->id   = i;
   7.584 -                tx->addr = virt_to_machine(skb->data);
   7.585 -                tx->size = skb->len;
   7.586 -                
   7.587 -                np->stats.tx_bytes += skb->len;
   7.588 -                np->stats.tx_packets++;
   7.589 -            }
   7.590 -    }
   7.591 -    wmb();
   7.592 -    np->tx->req_prod = requeue_idx;
   7.593 -
   7.594 -    /* Rebuild the RX buffer freelist and the RX ring itself. */
   7.595 -    for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
   7.596 -        if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
   7.597 -            np->rx->ring[requeue_idx++].req.id = i;
   7.598 -    wmb();                
   7.599 -    np->rx->req_prod = requeue_idx;
   7.600 -
   7.601 -    /* Step 3: All public and private state should now be sane.  Get
   7.602 -     * ready to start sending and receiving packets and give the driver
   7.603 -     * domain a kick because we've probably just requeued some
   7.604 -     * packets.
   7.605 -     */
   7.606 -    np->backend_state = BEST_CONNECTED;
   7.607 -    notify_via_evtchn(status->evtchn);  
   7.608 -    network_tx_buf_gc(dev);
   7.609 -
   7.610 -    if ( np->user_state == UST_OPEN )
   7.611 -        netif_start_queue(dev);
   7.612 -
   7.613 -    spin_unlock(&np->tx_lock);
   7.614 -    spin_unlock_irq(&np->rx_lock);
   7.615 -}
   7.616 -
   7.617 -static void netif_status_change(netif_fe_interface_status_changed_t *status)
   7.618 -{
   7.619 -    ctrl_msg_t                   cmsg;
   7.620 -    netif_fe_interface_connect_t up;
   7.621 -    struct net_device *dev;
   7.622 -    struct net_private *np;
   7.623 -    
   7.624 -    if ( netctrl.interface_n <= 0 )
   7.625 -    {
   7.626 -        printk(KERN_WARNING "Status change: no interfaces\n");
   7.627 -        return;
   7.628 -    }
   7.629 -
   7.630 -    dev = find_dev_by_handle(status->handle);
   7.631 -    if(!dev){
   7.632 -        printk(KERN_WARNING "Status change: invalid netif handle %u\n",
   7.633 -               status->handle);
   7.634 -         return;
   7.635 -    }
   7.636 -    np  = dev->priv;
   7.637 -    
   7.638 -    switch ( status->status )
   7.639 -    {
   7.640 -    case NETIF_INTERFACE_STATUS_DESTROYED:
   7.641 -        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
   7.642 -               np->backend_state);
   7.643 -        break;
   7.644 -
   7.645 -    case NETIF_INTERFACE_STATUS_DISCONNECTED:
   7.646 -        if ( np->backend_state != BEST_CLOSED )
   7.647 -        {
   7.648 -            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
   7.649 -                   " in state %d\n", np->backend_state);
   7.650 -	    printk(KERN_INFO "Attempting to reconnect network interface\n");
   7.651 -
   7.652 -            /* Begin interface recovery.
   7.653 -	     *
   7.654 -	     * NB. Whilst we're recovering, we turn the carrier state off.  We
   7.655 -	     * take measures to ensure that this device isn't used for
   7.656 -	     * anything.  We also stop the queue for this device.  Various
   7.657 -	     * different approaches (e.g. continuing to buffer packets) have
   7.658 -	     * been tested but don't appear to improve the overall impact on
   7.659 -             * TCP connections.
   7.660 -	     *
   7.661 -             * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
   7.662 -             * is initiated by a special "RESET" message - disconnect could
   7.663 -             * just mean we're not allowed to use this interface any more.
   7.664 -             */
   7.665 -
   7.666 -            /* Stop old i/f to prevent errors whilst we rebuild the state. */
   7.667 -            spin_lock_irq(&np->tx_lock);
   7.668 -            spin_lock(&np->rx_lock);
   7.669 -            netif_stop_queue(dev);
   7.670 -            np->backend_state = BEST_DISCONNECTED;
   7.671 -            spin_unlock(&np->rx_lock);
   7.672 -            spin_unlock_irq(&np->tx_lock);
   7.673 -
   7.674 -            /* Free resources. */
   7.675 -            free_irq(np->irq, dev);
   7.676 -            unbind_evtchn_from_irq(np->evtchn);
   7.677 -	    free_page((unsigned long)np->tx);
   7.678 -            free_page((unsigned long)np->rx);
   7.679 -        }
   7.680 -
   7.681 -        /* Move from CLOSED to DISCONNECTED state. */
   7.682 -        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
   7.683 -        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
   7.684 -        memset(np->tx, 0, PAGE_SIZE);
   7.685 -        memset(np->rx, 0, PAGE_SIZE);
   7.686 -        np->backend_state = BEST_DISCONNECTED;
   7.687 -
   7.688 -        /* Construct an interface-CONNECT message for the domain controller. */
   7.689 -        cmsg.type      = CMSG_NETIF_FE;
   7.690 -        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
   7.691 -        cmsg.length    = sizeof(netif_fe_interface_connect_t);
   7.692 -        up.handle      = status->handle;
   7.693 -        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
   7.694 -        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
   7.695 -        memcpy(cmsg.msg, &up, sizeof(up));
   7.696 -        
   7.697 -        /* Tell the controller to bring up the interface. */
   7.698 -        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
   7.699 -        break;
   7.700 -
   7.701 -    case NETIF_INTERFACE_STATUS_CONNECTED:
   7.702 -        if ( np->backend_state == BEST_CLOSED )
   7.703 -        {
   7.704 -            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
   7.705 -                   " in state %d\n", np->backend_state);
   7.706 -            break;
   7.707 -        }
   7.708 -
   7.709 -        memcpy(dev->dev_addr, status->mac, ETH_ALEN);
   7.710 -
   7.711 -        network_connect(dev, status);
   7.712 -
   7.713 -        np->evtchn = status->evtchn;
   7.714 -        np->irq = bind_evtchn_to_irq(np->evtchn);
   7.715 -        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
   7.716 -                          dev->name, dev);
   7.717 -        
   7.718 -        netctrl_connected_count();
   7.719 -        break;
   7.720 -
   7.721 -    default:
   7.722 -        printk(KERN_WARNING "Status change to unknown value %d\n", 
   7.723 -               status->status);
   7.724 -        break;
   7.725 -    }
   7.726 -}
   7.727 -
   7.728 -/** Create a network device.
   7.729 - * @param handle device handle
   7.730 - * @param val return parameter for created device
   7.731 - * @return 0 on success, error code otherwise
   7.732 - */
   7.733 -static int create_netdev(int handle, struct net_device **val)
   7.734 -{
   7.735 -    int i, err = 0;
   7.736 -    struct net_device *dev = NULL;
   7.737 -    struct net_private *np = NULL;
   7.738 -
   7.739 -    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
   7.740 -    {
   7.741 -        printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
   7.742 -        err = -ENOMEM;
   7.743 -        goto exit;
   7.744 -    }
   7.745 -
   7.746 -    np                = dev->priv;
   7.747 -    np->backend_state = BEST_CLOSED;
   7.748 -    np->user_state    = UST_CLOSED;
   7.749 -    np->handle        = handle;
   7.750 -    
   7.751 -    spin_lock_init(&np->tx_lock);
   7.752 -    spin_lock_init(&np->rx_lock);
   7.753 -
   7.754 -    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
   7.755 -    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
   7.756 -        np->tx_skbs[i] = (void *)(i+1);
   7.757 -    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
   7.758 -        np->rx_skbs[i] = (void *)(i+1);
   7.759 -
   7.760 -    dev->open            = network_open;
   7.761 -    dev->hard_start_xmit = network_start_xmit;
   7.762 -    dev->stop            = network_close;
   7.763 -    dev->get_stats       = network_get_stats;
   7.764 -    dev->poll            = netif_poll;
   7.765 -    dev->weight          = 64;
   7.766 -    
   7.767 -    if ( (err = register_netdev(dev)) != 0 )
   7.768 -    {
   7.769 -        printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
   7.770 -        goto exit;
   7.771 -    }
   7.772 -    np->dev = dev;
   7.773 -    list_add(&np->list, &dev_list);
   7.774 -
   7.775 -  exit:
   7.776 -    if ( (err != 0) && (dev != NULL ) )
   7.777 -        kfree(dev);
   7.778 -    else if ( val != NULL )
   7.779 -        *val = dev;
   7.780 -    return err;
   7.781 -}
   7.782 -
   7.783 -/*
   7.784 - * Initialize the network control interface. Set the number of network devices
   7.785 - * and create them.
   7.786 - */
   7.787 -static void netif_driver_status_change(
   7.788 -    netif_fe_driver_status_changed_t *status)
   7.789 -{
   7.790 -    int err = 0;
   7.791 -    int i;
   7.792 -    
   7.793 -    netctrl.interface_n = status->nr_interfaces;
   7.794 -    netctrl.connected_n = 0;
   7.795 -
   7.796 -    for ( i = 0; i < netctrl.interface_n; i++ )
   7.797 -    {
   7.798 -        if ( (err = create_netdev(i, NULL)) != 0 )
   7.799 -        {
   7.800 -            netctrl_err(err);
   7.801 -            break;
   7.802 -        }
   7.803 -    }
   7.804 -}
   7.805 -
   7.806 -static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
   7.807 -{
   7.808 -    int respond = 1;
   7.809 -
   7.810 -    switch ( msg->subtype )
   7.811 -    {
   7.812 -    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
   7.813 -        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
   7.814 -            goto error;
   7.815 -        netif_status_change((netif_fe_interface_status_changed_t *)
   7.816 -                            &msg->msg[0]);
   7.817 -        break;
   7.818 -
   7.819 -    case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
   7.820 -        if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
   7.821 -            goto error;
   7.822 -        netif_driver_status_change((netif_fe_driver_status_changed_t *)
   7.823 -                                   &msg->msg[0]);
   7.824 -        /* Message is a response */
   7.825 -        respond = 0;
   7.826 -        break;
   7.827 -
   7.828 -    error:
   7.829 -    default:
   7.830 -        msg->length = 0;
   7.831 -        break;
   7.832 -    }
   7.833 -
   7.834 -    if ( respond )
   7.835 -        ctrl_if_send_response(msg);
   7.836 -}
   7.837 -
   7.838 -
   7.839 -static int __init netif_init(void)
   7.840 -{
   7.841 -    ctrl_msg_t                       cmsg;
   7.842 -    netif_fe_driver_status_changed_t st;
   7.843 -    int err = 0, wait_i, wait_n = 20;
   7.844 -
   7.845 -    if ( (start_info.flags & SIF_INITDOMAIN) ||
   7.846 -         (start_info.flags & SIF_NET_BE_DOMAIN) )
   7.847 -        return 0;
   7.848 -
   7.849 -    printk("Initialising Xen virtual ethernet frontend driver");
   7.850 -
   7.851 -    INIT_LIST_HEAD(&dev_list);
   7.852 -
   7.853 -    netctrl_init();
   7.854 -
   7.855 -    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
   7.856 -                                    CALLBACK_IN_BLOCKING_CONTEXT);
   7.857 -
   7.858 -    /* Send a driver-UP notification to the domain controller. */
   7.859 -    cmsg.type      = CMSG_NETIF_FE;
   7.860 -    cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
   7.861 -    cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
   7.862 -    st.status      = NETIF_DRIVER_STATUS_UP;
   7.863 -    st.nr_interfaces = 0;
   7.864 -    memcpy(cmsg.msg, &st, sizeof(st));
   7.865 -    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
   7.866 -
   7.867 -    /* Wait for all interfaces to be connected. */
   7.868 -    for ( wait_i = 0; ; wait_i++)
   7.869 -    {
   7.870 -        if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
   7.871 -        {
   7.872 -            err = (err > 0) ? 0 : err;
   7.873 -            break;
   7.874 -        }
   7.875 -        set_current_state(TASK_INTERRUPTIBLE);
   7.876 -        schedule_timeout(1);
   7.877 -     }
   7.878 -
   7.879 -    if ( err )
   7.880 -        ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
   7.881 -
   7.882 -    return err;
   7.883 -}
   7.884 -
   7.885 -__initcall(netif_init);
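
The removed network.c reappears as netfront/netfront.c (the .rootkeys hunk above moves the same file key, and mkbuildtree now symlinks the new path). One idiom worth noting is its id freelist for the {tx,rx}_skbs arrays: slot 0 heads a chain of free indices, and occupied slots hold skb pointers, which are always at or above __PAGE_OFFSET and therefore distinguishable from small index values. The standalone program below is an editor's illustration of that idiom (macros copied from the file above, everything else hypothetical), not part of the changeset.

/* Illustration of the index-freelist idiom from network.c; compile with gcc
 * (GET_ID_FROM_FREELIST uses a GCC statement expression). */
#include <stdio.h>

#define RING_SIZE 8

#define ADD_ID_TO_FREELIST(_list, _id)             \
    (_list)[(_id)] = (_list)[0];                   \
    (_list)[0]     = (void *)(unsigned long)(_id);
#define GET_ID_FROM_FREELIST(_list)                \
 ({ unsigned long _id = (unsigned long)(_list)[0]; \
    (_list)[0]  = (_list)[_id];                    \
    (unsigned short)_id; })

static void *skbs[RING_SIZE+1];

int main(void)
{
    int i;
    unsigned short a, b;

    /* Chain every slot onto the freelist: slot i points at index i+1. */
    for ( i = 0; i <= RING_SIZE; i++ )
        skbs[i] = (void *)(unsigned long)(i+1);

    a = GET_ID_FROM_FREELIST(skbs);                   /* yields 1 */
    b = GET_ID_FROM_FREELIST(skbs);                   /* yields 2 */
    printf("allocated ids %u and %u\n", (unsigned)a, (unsigned)b);

    ADD_ID_TO_FREELIST(skbs, a);                      /* id 1 is free again */
    printf("next id: %u\n", (unsigned)GET_ID_FROM_FREELIST(skbs)); /* 1 */
    return 0;
}
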
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile	Fri Jul 30 18:49:24 2004 +0000
     8.3 @@ -0,0 +1,2 @@
     8.4 +
     8.5 +obj-y	:= netback.o control.o interface.o
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h	Fri Jul 30 18:49:24 2004 +0000
     9.3 @@ -0,0 +1,97 @@
     9.4 +/******************************************************************************
     9.5 + * arch/xen/drivers/netif/backend/common.h
     9.6 + */
     9.7 +
     9.8 +#ifndef __NETIF__BACKEND__COMMON_H__
     9.9 +#define __NETIF__BACKEND__COMMON_H__
    9.10 +
    9.11 +#include <linux/config.h>
    9.12 +#include <linux/version.h>
    9.13 +#include <linux/module.h>
    9.14 +#include <linux/interrupt.h>
    9.15 +#include <linux/slab.h>
    9.16 +#include <linux/ip.h>
    9.17 +#include <linux/in.h>
    9.18 +#include <linux/netdevice.h>
    9.19 +#include <linux/etherdevice.h>
    9.20 +#include <asm-xen/ctrl_if.h>
    9.21 +#include <asm/io.h>
    9.22 +#include <asm/pgalloc.h>
    9.23 +#include <asm-xen/netif.h>
    9.24 +
    9.25 +#if 0
    9.26 +#define ASSERT(_p) \
    9.27 +    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
    9.28 +    __LINE__, __FILE__); *(int*)0=0; }
    9.29 +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
    9.30 +                           __FILE__ , __LINE__ , ## _a )
    9.31 +#else
    9.32 +#define ASSERT(_p) ((void)0)
    9.33 +#define DPRINTK(_f, _a...) ((void)0)
    9.34 +#endif
    9.35 +
    9.36 +typedef struct netif_st {
    9.37 +    /* Unique identifier for this interface. */
    9.38 +    domid_t          domid;
    9.39 +    unsigned int     handle;
    9.40 +
    9.41 +    /* Physical parameters of the comms window. */
    9.42 +    unsigned long    tx_shmem_frame;
    9.43 +    unsigned long    rx_shmem_frame;
    9.44 +    unsigned int     evtchn;
    9.45 +    int              irq;
    9.46 +
    9.47 +    /* The shared rings and indexes. */
    9.48 +    netif_tx_interface_t *tx;
    9.49 +    netif_rx_interface_t *rx;
    9.50 +
    9.51 +    /* Private indexes into shared ring. */
    9.52 +    NETIF_RING_IDX rx_req_cons;
    9.53 +    NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */
    9.54 +    NETIF_RING_IDX tx_req_cons;
    9.55 +    NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */
    9.56 +
    9.57 +    /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
    9.58 +    unsigned long   credit_bytes;
    9.59 +    unsigned long   credit_usec;
    9.60 +    unsigned long   remaining_credit;
    9.61 +    struct timer_list credit_timeout;
    9.62 +
    9.63 +    /* Miscellaneous private stuff. */
    9.64 +    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
    9.65 +    /*
    9.66 +     * DISCONNECT response is deferred until pending requests are ack'ed.
    9.67 +     * We therefore need to store the id from the original request.
    9.68 +     */
    9.69 +    u8               disconnect_rspid;
    9.70 +    struct netif_st *hash_next;
    9.71 +    struct list_head list;  /* scheduling list */
    9.72 +    atomic_t         refcnt;
    9.73 +    spinlock_t       rx_lock, tx_lock;
    9.74 +    struct net_device *dev;
    9.75 +    struct net_device_stats stats;
    9.76 +} netif_t;
    9.77 +
    9.78 +void netif_create(netif_be_create_t *create);
    9.79 +void netif_destroy(netif_be_destroy_t *destroy);
    9.80 +void netif_connect(netif_be_connect_t *connect);
    9.81 +int  netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id);
    9.82 +void __netif_disconnect_complete(netif_t *netif);
    9.83 +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle);
    9.84 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
    9.85 +#define netif_put(_b)                             \
    9.86 +    do {                                          \
    9.87 +        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
    9.88 +            __netif_disconnect_complete(_b);      \
    9.89 +    } while (0)
    9.90 +
    9.91 +void netif_interface_init(void);
    9.92 +void netif_ctrlif_init(void);
    9.93 +
    9.94 +void netif_deschedule(netif_t *netif);
    9.95 +
    9.96 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
    9.97 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
    9.98 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
    9.99 +
   9.100 +#endif /* __NETIF__BACKEND__COMMON_H__ */
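
The netif_get()/netif_put() pair above encodes the deferred-disconnect protocol described in the struct comment: the DISCONNECT response is sent only once the last reference is dropped, via __netif_disconnect_complete(). A hedged sketch of the expected calling pattern follows; the example function names are hypothetical, and only the macros and types come from common.h above.

/* Sketch: pin the interface while a request from it is outstanding.
 * Dropping the final reference on a DISCONNECTING interface triggers the
 * deferred __netif_disconnect_complete() response. */
static void example_queue_request(netif_t *netif)
{
    netif_get(netif);            /* hold a reference for the request lifetime */
    /* ... record the pending work against this netif ... */
}

static void example_complete_request(netif_t *netif)
{
    /* ... write the response onto the shared ring ... */
    netif_put(netif);            /* last ref may complete the disconnect */
}
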
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c	Fri Jul 30 18:49:24 2004 +0000
    10.3 @@ -0,0 +1,65 @@
    10.4 +/******************************************************************************
    10.5 + * arch/xen/drivers/netif/backend/control.c
    10.6 + * 
    10.7 + * Routines for interfacing with the control plane.
    10.8 + * 
    10.9 + * Copyright (c) 2004, Keir Fraser
   10.10 + */
   10.11 +
   10.12 +#include "common.h"
   10.13 +
   10.14 +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
   10.15 +{
   10.16 +    switch ( msg->subtype )
   10.17 +    {
   10.18 +    case CMSG_NETIF_BE_CREATE:
   10.19 +        if ( msg->length != sizeof(netif_be_create_t) )
   10.20 +            goto parse_error;
   10.21 +        netif_create((netif_be_create_t *)&msg->msg[0]);
   10.22 +        break;        
   10.23 +    case CMSG_NETIF_BE_DESTROY:
   10.24 +        if ( msg->length != sizeof(netif_be_destroy_t) )
   10.25 +            goto parse_error;
   10.26 +        netif_destroy((netif_be_destroy_t *)&msg->msg[0]);
   10.27 +        break;        
   10.28 +    case CMSG_NETIF_BE_CONNECT:
   10.29 +        if ( msg->length != sizeof(netif_be_connect_t) )
   10.30 +            goto parse_error;
   10.31 +        netif_connect((netif_be_connect_t *)&msg->msg[0]);
   10.32 +        break;        
   10.33 +    case CMSG_NETIF_BE_DISCONNECT:
   10.34 +        if ( msg->length != sizeof(netif_be_disconnect_t) )
   10.35 +            goto parse_error;
   10.36 +        if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) )
   10.37 +            return; /* Sending the response is deferred until later. */
   10.38 +        break;        
   10.39 +    default:
   10.40 +        goto parse_error;
   10.41 +    }
   10.42 +
   10.43 +    ctrl_if_send_response(msg);
   10.44 +    return;
   10.45 +
   10.46 + parse_error:
   10.47 +    DPRINTK("Parse error while reading message subtype %d, len %d\n",
   10.48 +            msg->subtype, msg->length);
   10.49 +    msg->length = 0;
   10.50 +    ctrl_if_send_response(msg);
   10.51 +}
   10.52 +
   10.53 +void netif_ctrlif_init(void)
   10.54 +{
   10.55 +    ctrl_msg_t                       cmsg;
   10.56 +    netif_be_driver_status_changed_t st;
   10.57 +
   10.58 +    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
   10.59 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
   10.60 +
   10.61 +    /* Send a driver-UP notification to the domain controller. */
   10.62 +    cmsg.type      = CMSG_NETIF_BE;
   10.63 +    cmsg.subtype   = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED;
   10.64 +    cmsg.length    = sizeof(netif_be_driver_status_changed_t);
   10.65 +    st.status      = NETIF_DRIVER_STATUS_UP;
   10.66 +    memcpy(cmsg.msg, &st, sizeof(st));
   10.67 +    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
   10.68 +}
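
control.c registers netif_ctrlif_rx() as the CMSG_NETIF_BE receiver and then announces NETIF_DRIVER_STATUS_UP to the domain controller; common.h also declares netif_interface_init(). A plausible top-level init for the backend, sketched by the editor and not shown in this (truncated) changeset, would simply call both, registering the control interface last so no messages arrive before the interface code is ready:

/* Hypothetical init function; only netif_interface_init() and
 * netif_ctrlif_init() are real declarations from common.h. */
static int __init example_netback_init(void)
{
    printk("Initialising Xen netif backend\n");
    netif_interface_init();   /* interface.c state (e.g. the netif hash)   */
    netif_ctrlif_init();      /* register CMSG_NETIF_BE handler, send UP   */
    return 0;
}
__initcall(example_netback_init);
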
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c	Fri Jul 30 18:49:24 2004 +0000
    11.3 @@ -0,0 +1,288 @@
    11.4 +/******************************************************************************
    11.5 + * arch/xen/drivers/netif/backend/interface.c
    11.6 + * 
    11.7 + * Network-device interface management.
    11.8 + * 
    11.9 + * Copyright (c) 2004, Keir Fraser
   11.10 + */
   11.11 +
   11.12 +#include "common.h"
   11.13 +#include <linux/rtnetlink.h>
   11.14 +
   11.15 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
   11.16 +#define VMALLOC_VMADDR(x) ((unsigned long)(x))
   11.17 +#endif
   11.18 +
   11.19 +#define NETIF_HASHSZ 1024
   11.20 +#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1))
   11.21 +
   11.22 +static netif_t *netif_hash[NETIF_HASHSZ];
   11.23 +
   11.24 +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
   11.25 +{
   11.26 +    netif_t *netif = netif_hash[NETIF_HASH(domid, handle)];
   11.27 +    while ( (netif != NULL) && 
   11.28 +            ((netif->domid != domid) || (netif->handle != handle)) )
   11.29 +        netif = netif->hash_next;
   11.30 +    return netif;
   11.31 +}
   11.32 +
   11.33 +void __netif_disconnect_complete(netif_t *netif)
   11.34 +{
   11.35 +    ctrl_msg_t            cmsg;
   11.36 +    netif_be_disconnect_t disc;
   11.37 +
   11.38 +    /*
   11.39 +     * These can't be done in __netif_disconnect() because at that point there
   11.40 +     * may be outstanding requests at the disc whose asynchronous responses
   11.41 +     * must still be notified to the remote driver.
   11.42 +     */
   11.43 +    unbind_evtchn_from_irq(netif->evtchn);
   11.44 +    vfree(netif->tx); /* Frees netif->rx as well. */
   11.45 +    rtnl_lock();
   11.46 +    (void)dev_close(netif->dev);
   11.47 +    rtnl_unlock();
   11.48 +
   11.49 +    /* Construct the deferred response message. */
   11.50 +    cmsg.type         = CMSG_NETIF_BE;
   11.51 +    cmsg.subtype      = CMSG_NETIF_BE_DISCONNECT;
   11.52 +    cmsg.id           = netif->disconnect_rspid;
   11.53 +    cmsg.length       = sizeof(netif_be_disconnect_t);
   11.54 +    disc.domid        = netif->domid;
   11.55 +    disc.netif_handle = netif->handle;
   11.56 +    disc.status       = NETIF_BE_STATUS_OKAY;
   11.57 +    memcpy(cmsg.msg, &disc, sizeof(disc));
   11.58 +
   11.59 +    /*
   11.60 +     * Make sure message is constructed /before/ status change, because
   11.61 +     * after the status change the 'netif' structure could be deallocated at
   11.62 +     * any time. Also make sure we send the response /after/ status change,
   11.63 +     * as otherwise a subsequent CONNECT request could spuriously fail if
   11.64 +     * another CPU doesn't see the status change yet.
   11.65 +     */
   11.66 +    mb();
   11.67 +    if ( netif->status != DISCONNECTING )
   11.68 +        BUG();
   11.69 +    netif->status = DISCONNECTED;
   11.70 +    mb();
   11.71 +
   11.72 +    /* Send the successful response. */
   11.73 +    ctrl_if_send_response(&cmsg);
   11.74 +}
   11.75 +
   11.76 +void netif_create(netif_be_create_t *create)
   11.77 +{
   11.78 +    int                err = 0;
   11.79 +    domid_t            domid  = create->domid;
   11.80 +    unsigned int       handle = create->netif_handle;
   11.81 +    struct net_device *dev;
   11.82 +    netif_t          **pnetif, *netif;
   11.83 +    char               name[IFNAMSIZ] = {};
   11.84 +
   11.85 +    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
   11.86 +    dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
   11.87 +    if ( dev == NULL )
   11.88 +    {
   11.89 +        DPRINTK("Could not create netif: out of memory\n");
   11.90 +        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
   11.91 +        return;
   11.92 +    }
   11.93 +
   11.94 +    netif = dev->priv;
   11.95 +    memset(netif, 0, sizeof(*netif));
   11.96 +    netif->domid  = domid;
   11.97 +    netif->handle = handle;
   11.98 +    netif->status = DISCONNECTED;
   11.99 +    spin_lock_init(&netif->rx_lock);
  11.100 +    spin_lock_init(&netif->tx_lock);
  11.101 +    atomic_set(&netif->refcnt, 0);
  11.102 +    netif->dev = dev;
  11.103 +
  11.104 +    netif->credit_bytes = netif->remaining_credit = ~0UL;
  11.105 +    netif->credit_usec  = 0UL;
  11.106 +    /*init_ac_timer(&new_vif->credit_timeout);*/
  11.107 +
  11.108 +    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
  11.109 +    while ( *pnetif != NULL )
  11.110 +    {
  11.111 +        if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) )
  11.112 +        {
  11.113 +            DPRINTK("Could not create netif: already exists\n");
  11.114 +            create->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
  11.115 +            kfree(dev);
  11.116 +            return;
  11.117 +        }
  11.118 +        pnetif = &(*pnetif)->hash_next;
  11.119 +    }
  11.120 +
  11.121 +    dev->hard_start_xmit = netif_be_start_xmit;
  11.122 +    dev->get_stats       = netif_be_get_stats;
  11.123 +    memcpy(dev->dev_addr, create->mac, ETH_ALEN);
  11.124 +
  11.125 +    /* Disable queuing. */
  11.126 +    dev->tx_queue_len = 0;
  11.127 +
  11.128 +    /* Force a different MAC from remote end. */
  11.129 +    dev->dev_addr[2] ^= 1;
  11.130 +
  11.131 +    if ( (err = register_netdev(dev)) != 0 )
  11.132 +    {
  11.133 +        DPRINTK("Could not register new net device %s: err=%d\n",
  11.134 +                dev->name, err);
  11.135 +        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
  11.136 +        free_netdev(dev);
  11.137 +        return;
  11.138 +    }
  11.139 +
  11.140 +    netif->hash_next = *pnetif;
  11.141 +    *pnetif = netif;
  11.142 +
  11.143 +    DPRINTK("Successfully created netif\n");
  11.144 +    create->status = NETIF_BE_STATUS_OKAY;
  11.145 +}
  11.146 +
  11.147 +void netif_destroy(netif_be_destroy_t *destroy)
  11.148 +{
  11.149 +    domid_t       domid  = destroy->domid;
  11.150 +    unsigned int  handle = destroy->netif_handle;
  11.151 +    netif_t     **pnetif, *netif;
  11.152 +
  11.153 +    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
  11.154 +    while ( (netif = *pnetif) != NULL )
  11.155 +    {
  11.156 +        if ( (netif->domid == domid) && (netif->handle == handle) )
  11.157 +        {
  11.158 +            if ( netif->status != DISCONNECTED )
  11.159 +                goto still_connected;
  11.160 +            goto destroy;
  11.161 +        }
  11.162 +        pnetif = &netif->hash_next;
  11.163 +    }
  11.164 +
  11.165 +    destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
  11.166 +    return;
  11.167 +
  11.168 + still_connected:
  11.169 +    destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
  11.170 +    return;
  11.171 +
  11.172 + destroy:
  11.173 +    *pnetif = netif->hash_next;
  11.174 +    unregister_netdev(netif->dev);
  11.175 +    free_netdev(netif->dev);
  11.176 +    destroy->status = NETIF_BE_STATUS_OKAY;
  11.177 +}
  11.178 +
  11.179 +void netif_connect(netif_be_connect_t *connect)
  11.180 +{
  11.181 +    domid_t       domid  = connect->domid;
  11.182 +    unsigned int  handle = connect->netif_handle;
  11.183 +    unsigned int  evtchn = connect->evtchn;
  11.184 +    unsigned long tx_shmem_frame = connect->tx_shmem_frame;
  11.185 +    unsigned long rx_shmem_frame = connect->rx_shmem_frame;
  11.186 +    struct vm_struct *vma;
  11.187 +    pgprot_t      prot;
  11.188 +    int           error;
  11.189 +    netif_t      *netif;
  11.190 +#if 0
  11.191 +    struct net_device *eth0_dev;
  11.192 +#endif
  11.193 +
  11.194 +    netif = netif_find_by_handle(domid, handle);
  11.195 +    if ( unlikely(netif == NULL) )
  11.196 +    {
  11.197 +        DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", 
  11.198 +                connect->domid, connect->netif_handle); 
  11.199 +        connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
  11.200 +        return;
  11.201 +    }
  11.202 +
  11.203 +    if ( netif->status != DISCONNECTED )
  11.204 +    {
  11.205 +        connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
  11.206 +        return;
  11.207 +    }
  11.208 +
  11.209 +    if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL )
  11.210 +    {
  11.211 +        connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
  11.212 +        return;
  11.213 +    }
  11.214 +
  11.215 +    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
  11.216 +    error  = direct_remap_area_pages(&init_mm, 
  11.217 +                                     VMALLOC_VMADDR(vma->addr),
  11.218 +                                     tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
  11.219 +                                     prot, domid);
  11.220 +    error |= direct_remap_area_pages(&init_mm, 
  11.221 +                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
  11.222 +                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
  11.223 +                                     prot, domid);
  11.224 +    if ( error != 0 )
  11.225 +    {
  11.226 +        if ( error == -ENOMEM )
  11.227 +            connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
  11.228 +        else if ( error == -EFAULT )
  11.229 +            connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
  11.230 +        else
  11.231 +            connect->status = NETIF_BE_STATUS_ERROR;
  11.232 +        vfree(vma->addr);
  11.233 +        return;
  11.234 +    }
  11.235 +
  11.236 +    netif->evtchn         = evtchn;
  11.237 +    netif->irq            = bind_evtchn_to_irq(evtchn);
  11.238 +    netif->tx_shmem_frame = tx_shmem_frame;
  11.239 +    netif->rx_shmem_frame = rx_shmem_frame;
  11.240 +    netif->tx             = 
  11.241 +        (netif_tx_interface_t *)vma->addr;
  11.242 +    netif->rx             = 
  11.243 +        (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
  11.244 +    netif->status         = CONNECTED;
  11.245 +    netif_get(netif);
  11.246 +
  11.247 +    rtnl_lock();
  11.248 +    (void)dev_open(netif->dev);
  11.249 +    rtnl_unlock();
  11.250 +
  11.251 +    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
  11.252 +    netif_start_queue(netif->dev);
  11.253 +
  11.254 +    connect->status = NETIF_BE_STATUS_OKAY;
  11.255 +}
  11.256 +
  11.257 +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
  11.258 +{
  11.259 +    domid_t       domid  = disconnect->domid;
  11.260 +    unsigned int  handle = disconnect->netif_handle;
  11.261 +    netif_t      *netif;
  11.262 +
  11.263 +    netif = netif_find_by_handle(domid, handle);
  11.264 +    if ( unlikely(netif == NULL) )
  11.265 +    {
  11.266 +        DPRINTK("netif_disconnect attempted for non-existent netif"
  11.267 +                " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); 
  11.268 +        disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
  11.269 +        return 1; /* Caller will send response error message. */
  11.270 +    }
  11.271 +
  11.272 +    if ( netif->status == CONNECTED )
  11.273 +    {
  11.274 +        netif->status = DISCONNECTING;
  11.275 +        netif->disconnect_rspid = rsp_id;
  11.276 +        wmb(); /* Let other CPUs see the status change. */
  11.277 +        netif_stop_queue(netif->dev);
  11.278 +        free_irq(netif->irq, netif);
  11.279 +        netif_deschedule(netif);
  11.280 +        netif_put(netif);
  11.281 +        return 0; /* Caller should not send response message. */
  11.282 +    }
  11.283 +
  11.284 +    disconnect->status = NETIF_BE_STATUS_OKAY;
  11.285 +    return 1;
  11.286 +}
  11.287 +
  11.288 +void netif_interface_init(void)
  11.289 +{
  11.290 +    memset(netif_hash, 0, sizeof(netif_hash));
  11.291 +}
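
netif_create() and netif_destroy() above both walk the netif_hash[] bucket through a pointer-to-pointer (netif_t **pnetif), which lets the unlink in netif_destroy() splice out either the bucket head or a mid-chain entry without tracking a separate 'prev' pointer. A minimal user-space sketch of the same pattern, assuming nothing beyond standard C (the node/bucket names are illustrative and not part of this changeset):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        unsigned int  key;
        struct node  *next;
    };

    /* Push a new entry on the front of the chain. */
    static void chain_insert(struct node **bucket, unsigned int key)
    {
        struct node *np = malloc(sizeof(*np));
        np->key  = key;
        np->next = *bucket;
        *bucket  = np;
    }

    /* Walk the chain via a pointer-to-pointer so that unlinking works the
     * same way whether the match is the bucket head or a later entry. */
    static int chain_remove(struct node **bucket, unsigned int key)
    {
        struct node **pnp, *np;
        for ( pnp = bucket; (np = *pnp) != NULL; pnp = &np->next )
        {
            if ( np->key == key )
            {
                *pnp = np->next;   /* splice out without a 'prev' pointer */
                free(np);
                return 0;
            }
        }
        return -1; /* not found */
    }

    int main(void)
    {
        struct node *bucket = NULL;
        chain_insert(&bucket, 1);
        chain_insert(&bucket, 2);
        printf("remove 1: %d\n", chain_remove(&bucket, 1));  /* 0  */
        printf("remove 3: %d\n", chain_remove(&bucket, 3));  /* -1 */
        return 0;
    }
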
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Fri Jul 30 18:49:24 2004 +0000
    12.3 @@ -0,0 +1,778 @@
    12.4 +/******************************************************************************
    12.5 + * arch/xen/drivers/netif/backend/main.c
    12.6 + * 
    12.7 + * Back-end of the driver for virtual network devices. This portion of the
    12.8 + * driver exports a 'unified' network-device interface that can be accessed
    12.9 + * by any operating system that implements a compatible front end. A
   12.10 + * reference front-end implementation can be found in:
   12.11 + *  arch/xen/drivers/netif/frontend
   12.12 + * 
   12.13 + * Copyright (c) 2002-2004, K A Fraser
   12.14 + */
   12.15 +
   12.16 +#include "common.h"
   12.17 +
   12.18 +static void netif_page_release(struct page *page);
   12.19 +static void make_tx_response(netif_t *netif, 
   12.20 +                             u16      id,
   12.21 +                             s8       st);
   12.22 +static int  make_rx_response(netif_t *netif, 
   12.23 +                             u16      id, 
   12.24 +                             s8       st,
   12.25 +                             memory_t addr,
   12.26 +                             u16      size);
   12.27 +
   12.28 +static void net_tx_action(unsigned long unused);
   12.29 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
   12.30 +
   12.31 +static void net_rx_action(unsigned long unused);
   12.32 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
   12.33 +
   12.34 +static struct sk_buff_head rx_queue;
   12.35 +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2];
   12.36 +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3];
   12.37 +static unsigned char rx_notify[NR_EVENT_CHANNELS];
   12.38 +
   12.39 +/* Don't currently gate addition of an interface to the tx scheduling list. */
   12.40 +#define tx_work_exists(_if) (1)
   12.41 +
   12.42 +#define MAX_PENDING_REQS 256
   12.43 +static unsigned long mmap_vstart;
   12.44 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
   12.45 +
   12.46 +#define PKT_PROT_LEN (ETH_HLEN + 20)
   12.47 +
   12.48 +static struct {
   12.49 +    netif_tx_request_t req;
   12.50 +    netif_t *netif;
   12.51 +} pending_tx_info[MAX_PENDING_REQS];
   12.52 +static u16 pending_ring[MAX_PENDING_REQS];
   12.53 +typedef unsigned int PEND_RING_IDX;
   12.54 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
   12.55 +static PEND_RING_IDX pending_prod, pending_cons;
   12.56 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
   12.57 +
   12.58 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
   12.59 +static u16 dealloc_ring[MAX_PENDING_REQS];
   12.60 +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED;
   12.61 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
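
pending_ring[] and dealloc_ring[] above are classic power-of-two rings: the indices (pending_prod/pending_cons, dealloc_prod/dealloc_cons) are free-running unsigned counters that are reduced modulo the ring size only when indexing via MASK_PEND_IDX, so differences such as the one in NR_PENDING_REQS stay correct across wraparound. A small self-contained sketch of that indexing scheme (RING_SIZE, ring_put and ring_get are illustrative names, not part of the driver):

    #include <stdio.h>

    #define RING_SIZE 8                      /* must be a power of two */
    #define MASK(_i)  ((_i) & (RING_SIZE-1)) /* cheap modulo, as MASK_PEND_IDX does */

    static unsigned short ring[RING_SIZE];
    static unsigned int prod, cons;          /* free-running; masked only on access */

    static int ring_put(unsigned short v)
    {
        if ( (prod - cons) == RING_SIZE )    /* full: unsigned subtraction survives wrap */
            return -1;
        ring[MASK(prod++)] = v;
        return 0;
    }

    static int ring_get(unsigned short *v)
    {
        if ( prod == cons )                  /* empty */
            return -1;
        *v = ring[MASK(cons++)];
        return 0;
    }

    int main(void)
    {
        unsigned short v;
        for ( v = 0; v < 10; v++ )
            if ( ring_put(v) != 0 )
                printf("ring full at %d\n", v);  /* fires for 8 and 9 */
        while ( ring_get(&v) == 0 )
            printf("got %d\n", v);
        return 0;
    }
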
   12.62 +
   12.63 +static struct sk_buff_head tx_queue;
   12.64 +static multicall_entry_t tx_mcl[MAX_PENDING_REQS];
   12.65 +
   12.66 +static struct list_head net_schedule_list;
   12.67 +static spinlock_t net_schedule_list_lock;
   12.68 +
   12.69 +#define MAX_MFN_ALLOC 64
   12.70 +static unsigned long mfn_list[MAX_MFN_ALLOC];
   12.71 +static unsigned int alloc_index = 0;
   12.72 +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
   12.73 +
   12.74 +static void __refresh_mfn_list(void)
   12.75 +{
   12.76 +    int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
   12.77 +                                    mfn_list, MAX_MFN_ALLOC);
   12.78 +    if ( unlikely(ret != MAX_MFN_ALLOC) )
   12.79 +        BUG();
   12.80 +    alloc_index = MAX_MFN_ALLOC;
   12.81 +}
   12.82 +
   12.83 +static unsigned long get_new_mfn(void)
   12.84 +{
   12.85 +    unsigned long mfn, flags;
   12.86 +    spin_lock_irqsave(&mfn_lock, flags);
   12.87 +    if ( alloc_index == 0 )
   12.88 +        __refresh_mfn_list();
   12.89 +    mfn = mfn_list[--alloc_index];
   12.90 +    spin_unlock_irqrestore(&mfn_lock, flags);
   12.91 +    return mfn;
   12.92 +}
   12.93 +
   12.94 +static void dealloc_mfn(unsigned long mfn)
   12.95 +{
   12.96 +    unsigned long flags;
   12.97 +    spin_lock_irqsave(&mfn_lock, flags);
   12.98 +    if ( alloc_index != MAX_MFN_ALLOC )
   12.99 +        mfn_list[alloc_index++] = mfn;
  12.100 +    else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1) != 1 )
  12.101 +        BUG();
  12.102 +    spin_unlock_irqrestore(&mfn_lock, flags);
  12.103 +}
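
get_new_mfn() and dealloc_mfn() above amortise the cost of HYPERVISOR_dom_mem_op() by keeping a small cache of machine frames (mfn_list[]) and only issuing the bulk increase/decrease-reservation operation when the cache runs dry or overflows. A rough user-space sketch of the same caching idea, where bulk_alloc() merely stands in for the expensive bulk call (all names below are illustrative):

    #include <stdio.h>

    #define POOL_SIZE 4

    static unsigned long pool[POOL_SIZE];
    static unsigned int  pool_index;
    static unsigned long next_item = 100;

    static void bulk_alloc(void)                 /* stand-in for the bulk operation */
    {
        for ( pool_index = 0; pool_index < POOL_SIZE; pool_index++ )
            pool[pool_index] = next_item++;
    }

    static unsigned long get_item(void)
    {
        if ( pool_index == 0 )                   /* cache empty: refill in one batch */
            bulk_alloc();
        return pool[--pool_index];
    }

    static void put_item(unsigned long item)
    {
        if ( pool_index != POOL_SIZE )
            pool[pool_index++] = item;           /* cache it for reuse */
        else
            printf("returning %lu to the allocator\n", item);
    }

    int main(void)
    {
        unsigned long a = get_item(), b = get_item();
        printf("%lu %lu\n", a, b);               /* 103 102 */
        put_item(a);
        put_item(b);
        return 0;
    }
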
  12.104 +
  12.105 +static inline void maybe_schedule_tx_action(void)
  12.106 +{
  12.107 +    smp_mb();
  12.108 +    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
  12.109 +         !list_empty(&net_schedule_list) )
  12.110 +        tasklet_schedule(&net_tx_tasklet);
  12.111 +}
  12.112 +
  12.113 +/*
  12.114 + * This is the primary RECEIVE function for a network interface.
  12.115 + * Note that, from the p.o.v. of /this/ OS it looks like a transmit.
  12.116 + */
  12.117 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
  12.118 +{
  12.119 +    netif_t *netif = (netif_t *)dev->priv;
  12.120 +
  12.121 +    ASSERT(skb->dev == dev);
  12.122 +
  12.123 +    /* Drop the packet if the target domain has no receive buffers. */
  12.124 +    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
  12.125 +         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
  12.126 +        goto drop;
  12.127 +
  12.128 +    /*
  12.129 +     * We do not copy the packet unless:
  12.130 +     *  1. The data is shared; or
  12.131 +     *  2. It spans a page boundary; or
  12.132 +     *  3. We cannot be sure the whole data page is allocated.
  12.133 +     * The copying method is taken from skb_copy().
  12.134 + * NB. We also could not cope with fragmented packets, but we won't get
  12.135 + *     any because we do not advertise the NETIF_F_SG feature.
  12.136 +     */
  12.137 +    if ( skb_shared(skb) || skb_cloned(skb) || 
  12.138 +         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
  12.139 +         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
  12.140 +    {
  12.141 +        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
  12.142 +        int hlen = skb->data - skb->head;
  12.143 +        if ( unlikely(nskb == NULL) )
  12.144 +            goto drop;
  12.145 +        skb_reserve(nskb, hlen);
  12.146 +        __skb_put(nskb, skb->len);
  12.147 +        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
  12.148 +        nskb->dev = skb->dev;
  12.149 +        dev_kfree_skb(skb);
  12.150 +        skb = nskb;
  12.151 +    }
  12.152 +
  12.153 +    netif->rx_req_cons++;
  12.154 +
  12.155 +    skb_queue_tail(&rx_queue, skb);
  12.156 +    tasklet_schedule(&net_rx_tasklet);
  12.157 +
  12.158 +    return 0;
  12.159 +
  12.160 + drop:
  12.161 +    netif->stats.rx_dropped++;
  12.162 +    dev_kfree_skb(skb);
  12.163 +    return 0;
  12.164 +}
  12.165 +
  12.166 +#if 0
  12.167 +static void xen_network_done_notify(void)
  12.168 +{
  12.169 +    static struct net_device *eth0_dev = NULL;
  12.170 +    if ( unlikely(eth0_dev == NULL) )
  12.171 +        eth0_dev = __dev_get_by_name("eth0");
  12.172 +    netif_rx_schedule(eth0_dev);
  12.173 +}
  12.174 +/* 
  12.175 + * Add following to poll() function in NAPI driver (Tigon3 is example):
  12.176 + *  if ( xen_network_done() )
  12.177 + *      tg3_enable_ints(tp); 
  12.178 + */
  12.179 +int xen_network_done(void)
  12.180 +{
  12.181 +    return skb_queue_empty(&rx_queue);
  12.182 +}
  12.183 +#endif
  12.184 +
  12.185 +static void net_rx_action(unsigned long unused)
  12.186 +{
  12.187 +    netif_t *netif;
  12.188 +    s8 status;
  12.189 +    u16 size, id, evtchn;
  12.190 +    mmu_update_t *mmu;
  12.191 +    multicall_entry_t *mcl;
  12.192 +    unsigned long vdata, mdata, new_mfn;
  12.193 +    struct sk_buff_head rxq;
  12.194 +    struct sk_buff *skb;
  12.195 +    u16 notify_list[NETIF_RX_RING_SIZE];
  12.196 +    int notify_nr = 0;
  12.197 +
  12.198 +    skb_queue_head_init(&rxq);
  12.199 +
  12.200 +    mcl = rx_mcl;
  12.201 +    mmu = rx_mmu;
  12.202 +    while ( (skb = skb_dequeue(&rx_queue)) != NULL )
  12.203 +    {
  12.204 +        netif   = (netif_t *)skb->dev->priv;
  12.205 +        vdata   = (unsigned long)skb->data;
  12.206 +        mdata   = virt_to_machine(vdata);
  12.207 +        new_mfn = get_new_mfn();
  12.208 +        
  12.209 +        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
  12.210 +        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;        
  12.211 +        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
  12.212 +        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
  12.213 +        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
  12.214 +        mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
  12.215 +        mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
  12.216 +        mmu[2].val  = MMUEXT_REASSIGN_PAGE;
  12.217 +
  12.218 +        mcl[0].op = __HYPERVISOR_update_va_mapping;
  12.219 +        mcl[0].args[0] = vdata >> PAGE_SHIFT;
  12.220 +        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
  12.221 +        mcl[0].args[2] = 0;
  12.222 +        mcl[1].op = __HYPERVISOR_mmu_update;
  12.223 +        mcl[1].args[0] = (unsigned long)mmu;
  12.224 +        mcl[1].args[1] = 3;
  12.225 +        mcl[1].args[2] = 0;
  12.226 +
  12.227 +        mcl += 2;
  12.228 +        mmu += 3;
  12.229 +
  12.230 +        __skb_queue_tail(&rxq, skb);
  12.231 +
  12.232 +        /* Filled the batch queue? */
  12.233 +        if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) )
  12.234 +            break;
  12.235 +    }
  12.236 +
  12.237 +    if ( mcl == rx_mcl )
  12.238 +        return;
  12.239 +
  12.240 +    mcl[-2].args[2] = UVMF_FLUSH_TLB;
  12.241 +    if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) )
  12.242 +        BUG();
  12.243 +
  12.244 +    mcl = rx_mcl;
  12.245 +    mmu = rx_mmu;
  12.246 +    while ( (skb = __skb_dequeue(&rxq)) != NULL )
  12.247 +    {
  12.248 +        netif   = (netif_t *)skb->dev->priv;
  12.249 +        size    = skb->tail - skb->data;
  12.250 +
  12.251 +        /* Rederive the machine addresses. */
  12.252 +        new_mfn = mcl[0].args[1] >> PAGE_SHIFT;
  12.253 +        mdata   = ((mmu[2].ptr & PAGE_MASK) |
  12.254 +                   ((unsigned long)skb->data & ~PAGE_MASK));
  12.255 +        
  12.256 +        phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
  12.257 +        
  12.258 +        atomic_set(&(skb_shinfo(skb)->dataref), 1);
  12.259 +        skb_shinfo(skb)->nr_frags = 0;
  12.260 +        skb_shinfo(skb)->frag_list = NULL;
  12.261 +
  12.262 +        netif->stats.rx_bytes += size;
  12.263 +        netif->stats.rx_packets++;
  12.264 +
  12.265 +        /* The update_va_mapping() must not fail. */
  12.266 +        if ( unlikely(mcl[0].args[5] != 0) )
  12.267 +            BUG();
  12.268 +
  12.269 +        /* Check the reassignment error code. */
  12.270 +        status = NETIF_RSP_OKAY;
  12.271 +        if ( unlikely(mcl[1].args[5] != 0) )
  12.272 +        {
  12.273 +            DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid);
  12.274 +            dealloc_mfn(mdata >> PAGE_SHIFT);
  12.275 +            status = NETIF_RSP_ERROR;
  12.276 +        }
  12.277 +
  12.278 +        evtchn = netif->evtchn;
  12.279 +        id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id;
  12.280 +        if ( make_rx_response(netif, id, status, mdata, size) &&
  12.281 +             (rx_notify[evtchn] == 0) )
  12.282 +        {
  12.283 +            rx_notify[evtchn] = 1;
  12.284 +            notify_list[notify_nr++] = evtchn;
  12.285 +        }
  12.286 +
  12.287 +        dev_kfree_skb(skb);
  12.288 +
  12.289 +        mcl += 2;
  12.290 +        mmu += 3;
  12.291 +    }
  12.292 +
  12.293 +    while ( notify_nr != 0 )
  12.294 +    {
  12.295 +        evtchn = notify_list[--notify_nr];
  12.296 +        rx_notify[evtchn] = 0;
  12.297 +        notify_via_evtchn(evtchn);
  12.298 +    }
  12.299 +
  12.300 +    /* More work to do? */
  12.301 +    if ( !skb_queue_empty(&rx_queue) )
  12.302 +        tasklet_schedule(&net_rx_tasklet);
  12.303 +#if 0
  12.304 +    else
  12.305 +        xen_network_done_notify();
  12.306 +#endif
  12.307 +}
  12.308 +
  12.309 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
  12.310 +{
  12.311 +    netif_t *netif = dev->priv;
  12.312 +    return &netif->stats;
  12.313 +}
  12.314 +
  12.315 +static int __on_net_schedule_list(netif_t *netif)
  12.316 +{
  12.317 +    return netif->list.next != NULL;
  12.318 +}
  12.319 +
  12.320 +static void remove_from_net_schedule_list(netif_t *netif)
  12.321 +{
  12.322 +    spin_lock_irq(&net_schedule_list_lock);
  12.323 +    if ( likely(__on_net_schedule_list(netif)) )
  12.324 +    {
  12.325 +        list_del(&netif->list);
  12.326 +        netif->list.next = NULL;
  12.327 +        netif_put(netif);
  12.328 +    }
  12.329 +    spin_unlock_irq(&net_schedule_list_lock);
  12.330 +}
  12.331 +
  12.332 +static void add_to_net_schedule_list_tail(netif_t *netif)
  12.333 +{
  12.334 +    if ( __on_net_schedule_list(netif) )
  12.335 +        return;
  12.336 +
  12.337 +    spin_lock_irq(&net_schedule_list_lock);
  12.338 +    if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) )
  12.339 +    {
  12.340 +        list_add_tail(&netif->list, &net_schedule_list);
  12.341 +        netif_get(netif);
  12.342 +    }
  12.343 +    spin_unlock_irq(&net_schedule_list_lock);
  12.344 +}
  12.345 +
  12.346 +static inline void netif_schedule_work(netif_t *netif)
  12.347 +{
  12.348 +    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
  12.349 +         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
  12.350 +    {
  12.351 +        add_to_net_schedule_list_tail(netif);
  12.352 +        maybe_schedule_tx_action();
  12.353 +    }
  12.354 +}
  12.355 +
  12.356 +void netif_deschedule(netif_t *netif)
  12.357 +{
  12.358 +    remove_from_net_schedule_list(netif);
  12.359 +}
  12.360 +
  12.361 +#if 0
  12.362 +static void tx_credit_callback(unsigned long data)
  12.363 +{
  12.364 +    netif_t *netif = (netif_t *)data;
  12.365 +    netif->remaining_credit = netif->credit_bytes;
  12.366 +    netif_schedule_work(netif);
  12.367 +}
  12.368 +#endif
  12.369 +
  12.370 +static void net_tx_action(unsigned long unused)
  12.371 +{
  12.372 +    struct list_head *ent;
  12.373 +    struct sk_buff *skb;
  12.374 +    netif_t *netif;
  12.375 +    netif_tx_request_t txreq;
  12.376 +    u16 pending_idx;
  12.377 +    NETIF_RING_IDX i;
  12.378 +    struct page *page;
  12.379 +    multicall_entry_t *mcl;
  12.380 +    PEND_RING_IDX dc, dp;
  12.381 +
  12.382 +    if ( (dc = dealloc_cons) == (dp = dealloc_prod) )
  12.383 +        goto skip_dealloc;
  12.384 +
  12.385 +    mcl = tx_mcl;
  12.386 +    while ( dc != dp )
  12.387 +    {
  12.388 +        pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
  12.389 +        mcl[0].op = __HYPERVISOR_update_va_mapping;
  12.390 +        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
  12.391 +        mcl[0].args[1] = 0;
  12.392 +        mcl[0].args[2] = 0;
  12.393 +        mcl++;        
  12.394 +    }
  12.395 +
  12.396 +    mcl[-1].args[2] = UVMF_FLUSH_TLB;
  12.397 +    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
  12.398 +        BUG();
  12.399 +
  12.400 +    mcl = tx_mcl;
  12.401 +    while ( dealloc_cons != dp )
  12.402 +    {
  12.403 +        /* The update_va_mapping() must not fail. */
  12.404 +        if ( unlikely(mcl[0].args[5] != 0) )
  12.405 +            BUG();
  12.406 +
  12.407 +        pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
  12.408 +
  12.409 +        netif = pending_tx_info[pending_idx].netif;
  12.410 +
  12.411 +        spin_lock(&netif->tx_lock);
  12.412 +        make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
  12.413 +                         NETIF_RSP_OKAY);
  12.414 +        spin_unlock(&netif->tx_lock);
  12.415 +        
  12.416 +        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  12.417 +        
  12.418 +        /*
  12.419 +         * Scheduling checks must happen after the above response is posted.
  12.420 +         * This avoids a possible race with a guest OS on another CPU.
  12.421 +         */
  12.422 +        mb();
  12.423 +        if ( (netif->tx_req_cons != netif->tx->req_prod) &&
  12.424 +             ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
  12.425 +            add_to_net_schedule_list_tail(netif);
  12.426 +        
  12.427 +        netif_put(netif);
  12.428 +
  12.429 +        mcl++;
  12.430 +    }
  12.431 +
  12.432 + skip_dealloc:
  12.433 +    mcl = tx_mcl;
  12.434 +    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
  12.435 +            !list_empty(&net_schedule_list) )
  12.436 +    {
  12.437 +        /* Get a netif from the list with work to do. */
  12.438 +        ent = net_schedule_list.next;
  12.439 +        netif = list_entry(ent, netif_t, list);
  12.440 +        netif_get(netif);
  12.441 +        remove_from_net_schedule_list(netif);
  12.442 +
  12.443 +        /* Work to do? */
  12.444 +        i = netif->tx_req_cons;
  12.445 +        if ( (i == netif->tx->req_prod) ||
  12.446 +             ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
  12.447 +        {
  12.448 +            netif_put(netif);
  12.449 +            continue;
  12.450 +        }
  12.451 +        memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, 
  12.452 +               sizeof(txreq));
  12.453 +        netif->tx_req_cons++;
  12.454 +
  12.455 +#if 0
  12.456 +        /* Credit-based scheduling. */
  12.457 +        if ( txreq.size > netif->remaining_credit )
  12.458 +        {
  12.459 +            s_time_t now = NOW(), next_credit = 
  12.460 +                netif->credit_timeout.expires + MICROSECS(netif->credit_usec);
  12.461 +            if ( next_credit <= now )
  12.462 +            {
  12.463 +                netif->credit_timeout.expires = now;
  12.464 +                netif->remaining_credit = netif->credit_bytes;
  12.465 +            }
  12.466 +            else
  12.467 +            {
  12.468 +                netif->remaining_credit = 0;
  12.469 +                netif->credit_timeout.expires  = next_credit;
  12.470 +                netif->credit_timeout.data     = (unsigned long)netif;
  12.471 +                netif->credit_timeout.function = tx_credit_callback;
  12.472 +                netif->credit_timeout.cpu      = smp_processor_id();
  12.473 +                add_ac_timer(&netif->credit_timeout);
  12.474 +                break;
  12.475 +            }
  12.476 +        }
  12.477 +        netif->remaining_credit -= txreq.size;
  12.478 +#endif
  12.479 +
  12.480 +        netif_schedule_work(netif);
  12.481 +
  12.482 +        if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
  12.483 +             unlikely(txreq.size > ETH_FRAME_LEN) )
  12.484 +        {
  12.485 +            DPRINTK("Bad packet size: %d\n", txreq.size);
  12.486 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  12.487 +            netif_put(netif);
  12.488 +            continue; 
  12.489 +        }
  12.490 +
  12.491 +        /* No crossing a page boundary as the payload mustn't fragment. */
  12.492 +        if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) 
  12.493 +        {
  12.494 +            DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", 
  12.495 +                    txreq.addr, txreq.size, 
  12.496 +                    (txreq.addr &~PAGE_MASK) + txreq.size);
  12.497 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  12.498 +            netif_put(netif);
  12.499 +            continue;
  12.500 +        }
  12.501 +
  12.502 +        pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
  12.503 +
  12.504 +        if ( unlikely((skb = alloc_skb(PKT_PROT_LEN+16, GFP_ATOMIC)) == NULL) )
  12.505 +        {
  12.506 +            DPRINTK("Can't allocate a skb in start_xmit.\n");
  12.507 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  12.508 +            netif_put(netif);
  12.509 +            break;
  12.510 +        }
  12.511 +
  12.512 +        /* Packets passed to netif_rx() must have some headroom. */
  12.513 +        skb_reserve(skb, 16);
  12.514 +
  12.515 +        mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
  12.516 +        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
  12.517 +        mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
  12.518 +        mcl[0].args[2] = 0;
  12.519 +        mcl[0].args[3] = netif->domid;
  12.520 +        mcl++;
  12.521 +
  12.522 +        memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq));
  12.523 +        pending_tx_info[pending_idx].netif = netif;
  12.524 +        *((u16 *)skb->data) = pending_idx;
  12.525 +
  12.526 +        __skb_queue_tail(&tx_queue, skb);
  12.527 +
  12.528 +        pending_cons++;
  12.529 +
  12.530 +        /* Filled the batch queue? */
  12.531 +        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
  12.532 +            break;
  12.533 +    }
  12.534 +
  12.535 +    if ( mcl == tx_mcl )
  12.536 +        return;
  12.537 +
  12.538 +    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
  12.539 +        BUG();
  12.540 +
  12.541 +    mcl = tx_mcl;
  12.542 +    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
  12.543 +    {
  12.544 +        pending_idx = *((u16 *)skb->data);
  12.545 +        netif       = pending_tx_info[pending_idx].netif;
  12.546 +        memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq));
  12.547 +
  12.548 +        /* Check the remap error code. */
  12.549 +        if ( unlikely(mcl[0].args[5] != 0) )
  12.550 +        {
  12.551 +            DPRINTK("Bad page frame\n");
  12.552 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  12.553 +            netif_put(netif);
  12.554 +            kfree_skb(skb);
  12.555 +            mcl++;
  12.556 +            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  12.557 +            continue;
  12.558 +        }
  12.559 +
  12.560 +        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
  12.561 +            txreq.addr >> PAGE_SHIFT;
  12.562 +
  12.563 +        __skb_put(skb, PKT_PROT_LEN);
  12.564 +        memcpy(skb->data, 
  12.565 +               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
  12.566 +               PKT_PROT_LEN);
  12.567 +
  12.568 +        page = virt_to_page(MMAP_VADDR(pending_idx));
  12.569 +
  12.570 +        /* Append the packet payload as a fragment. */
  12.571 +        skb_shinfo(skb)->frags[0].page        = page;
  12.572 +        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
  12.573 +        skb_shinfo(skb)->frags[0].page_offset = 
  12.574 +            (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
  12.575 +        skb_shinfo(skb)->nr_frags = 1;
  12.576 +        skb->data_len  = txreq.size - PKT_PROT_LEN;
  12.577 +        skb->len      += skb->data_len;
  12.578 +
  12.579 +        skb->dev      = netif->dev;
  12.580 +        skb->protocol = eth_type_trans(skb, skb->dev);
  12.581 +
  12.582 +        /*
  12.583 +         * Destructor information. We hideously abuse the 'mapping' pointer,
  12.584 +         * which isn't otherwise used by us. The page deallocator is modified
  12.585 +         * to interpret a non-NULL value as a destructor function to be called.
  12.586 +         * This works okay because in all other cases the pointer must be NULL
  12.587 +         * when the page is freed (normally Linux will explicitly bug out if
  12.588 +         * it sees otherwise).
  12.589 +         */
  12.590 +        page->mapping = (struct address_space *)netif_page_release;
  12.591 +        set_page_count(page, 1);
  12.592 +
  12.593 +        netif->stats.tx_bytes += txreq.size;
  12.594 +        netif->stats.tx_packets++;
  12.595 +
  12.596 +        netif_rx(skb);
  12.597 +        netif->dev->last_rx = jiffies;
  12.598 +
  12.599 +        mcl++;
  12.600 +    }
  12.601 +}
  12.602 +
  12.603 +static void netif_page_release(struct page *page)
  12.604 +{
  12.605 +    unsigned long flags;
  12.606 +    u16 pending_idx = page - virt_to_page(mmap_vstart);
  12.607 +
  12.608 +    /* Stop the abuse. */
  12.609 +    page->mapping = NULL;
  12.610 +
  12.611 +    spin_lock_irqsave(&dealloc_lock, flags);
  12.612 +    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
  12.613 +    spin_unlock_irqrestore(&dealloc_lock, flags);
  12.614 +
  12.615 +    tasklet_schedule(&net_tx_tasklet);
  12.616 +}
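
netif_page_release() works because the page deallocator change described in the 'Destructor information' comment in net_tx_action() above treats a non-NULL page->mapping as a release callback. The underlying pattern, reusing an otherwise-unused pointer field as an optional destructor, can be sketched in plain C as follows (struct buf and the function names are illustrative, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct buf {
        void (*release)(struct buf *buf);   /* stand-in for page->mapping */
        char  data[64];
    };

    static void my_release(struct buf *buf)
    {
        printf("custom release for %p\n", (void *)buf);
        buf->release = NULL;                 /* "stop the abuse" before freeing */
        free(buf);
    }

    static void put_buf(struct buf *buf)
    {
        if ( buf->release != NULL )          /* deallocator checks for a destructor */
            buf->release(buf);
        else
            free(buf);
    }

    int main(void)
    {
        struct buf *a = calloc(1, sizeof(*a));
        struct buf *b = calloc(1, sizeof(*b));
        b->release = my_release;             /* only 'b' takes the special path */
        put_buf(a);
        put_buf(b);
        return 0;
    }
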
  12.617 +
  12.618 +#if 0
  12.619 +long flush_bufs_for_netif(netif_t *netif)
  12.620 +{
  12.621 +    NETIF_RING_IDX i;
  12.622 +
  12.623 +    /* Return any outstanding receive buffers to the guest OS. */
  12.624 +    spin_lock(&netif->rx_lock);
  12.625 +    for ( i = netif->rx_req_cons; 
  12.626 +          (i != netif->rx->req_prod) &&
  12.627 +              ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
  12.628 +          i++ )
  12.629 +    {
  12.630 +        make_rx_response(netif,
  12.631 +                         netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
  12.632 +                         NETIF_RSP_DROPPED, 0, 0);
  12.633 +    }
  12.634 +    netif->rx_req_cons = i;
  12.635 +    spin_unlock(&netif->rx_lock);
  12.636 +
  12.637 +    /*
  12.638 +     * Flush pending transmit buffers. The guest may still have to wait for
  12.639 +     * buffers that are queued at a physical NIC.
  12.640 +     */
  12.641 +    spin_lock(&netif->tx_lock);
  12.642 +    for ( i = netif->tx_req_cons; 
  12.643 +          (i != netif->tx->req_prod) &&
  12.644 +              ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE);
  12.645 +          i++ )
  12.646 +    {
  12.647 +        make_tx_response(netif,
  12.648 +                         netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id,
  12.649 +                         NETIF_RSP_DROPPED);
  12.650 +    }
  12.651 +    netif->tx_req_cons = i;
  12.652 +    spin_unlock(&netif->tx_lock);
  12.653 +
  12.654 +    return 0;
  12.655 +}
  12.656 +#endif
  12.657 +
  12.658 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
  12.659 +{
  12.660 +    netif_t *netif = dev_id;
  12.661 +    if ( tx_work_exists(netif) )
  12.662 +    {
  12.663 +        add_to_net_schedule_list_tail(netif);
  12.664 +        maybe_schedule_tx_action();
  12.665 +    }
  12.666 +    return IRQ_HANDLED;
  12.667 +}
  12.668 +
  12.669 +static void make_tx_response(netif_t *netif, 
  12.670 +                             u16      id,
  12.671 +                             s8       st)
  12.672 +{
  12.673 +    NETIF_RING_IDX i = netif->tx_resp_prod;
  12.674 +    netif_tx_response_t *resp;
  12.675 +
  12.676 +    resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp;
  12.677 +    resp->id     = id;
  12.678 +    resp->status = st;
  12.679 +    wmb();
  12.680 +    netif->tx->resp_prod = netif->tx_resp_prod = ++i;
  12.681 +
  12.682 +    mb(); /* Update producer before checking event threshold. */
  12.683 +    if ( i == netif->tx->event )
  12.684 +        notify_via_evtchn(netif->evtchn);
  12.685 +}
  12.686 +
  12.687 +static int make_rx_response(netif_t *netif, 
  12.688 +                            u16      id, 
  12.689 +                            s8       st,
  12.690 +                            memory_t addr,
  12.691 +                            u16      size)
  12.692 +{
  12.693 +    NETIF_RING_IDX i = netif->rx_resp_prod;
  12.694 +    netif_rx_response_t *resp;
  12.695 +
  12.696 +    resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
  12.697 +    resp->addr   = addr;
  12.698 +    resp->id     = id;
  12.699 +    resp->status = (s16)size;
  12.700 +    if ( st < 0 )
  12.701 +        resp->status = (s16)st;
  12.702 +    wmb();
  12.703 +    netif->rx->resp_prod = netif->rx_resp_prod = ++i;
  12.704 +
  12.705 +    mb(); /* Update producer before checking event threshold. */
  12.706 +    return (i == netif->rx->event);
  12.707 +}
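
make_rx_response() overloads the 16-bit status field: on success it carries the packet size, on failure a negative NETIF_RSP_* code, which is why the frontend below treats rx->status <= 0 as an error and otherwise feeds it straight to skb_put(). A tiny sketch of that convention (the RSP_* values here are placeholders, not the real NETIF_RSP_* constants from the shared netif.h header):

    #include <stdio.h>

    #define RSP_ERROR   (-1)   /* placeholder error codes */
    #define RSP_DROPPED (-2)

    /* Success is reported as the packet size, failure as a negative code. */
    static short encode_status(short err, unsigned short size)
    {
        return (err < 0) ? err : (short)size;
    }

    int main(void)
    {
        printf("%d\n", encode_status(0, 1514));        /* 1514: size of a good packet */
        printf("%d\n", encode_status(RSP_DROPPED, 0)); /* -2: dropped                 */
        return 0;
    }
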
  12.708 +
  12.709 +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
  12.710 +{
  12.711 +    struct list_head *ent;
  12.712 +    netif_t *netif;
  12.713 +    int i = 0;
  12.714 +
  12.715 +    printk(KERN_ALERT "netif_schedule_list:\n");
  12.716 +    spin_lock_irq(&net_schedule_list_lock);
  12.717 +
  12.718 +    list_for_each ( ent, &net_schedule_list )
  12.719 +    {
  12.720 +        netif = list_entry(ent, netif_t, list);
  12.721 +        printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n",
  12.722 +               i, netif->rx_req_cons, netif->rx_resp_prod);               
  12.723 +        printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
  12.724 +               netif->tx_req_cons, netif->tx_resp_prod);
  12.725 +        printk(KERN_ALERT "   shared(rx_req_prod=%08x rx_resp_prod=%08x\n",
  12.726 +               netif->rx->req_prod, netif->rx->resp_prod);
  12.727 +        printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
  12.728 +               netif->rx->event, netif->tx->req_prod);
  12.729 +        printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
  12.730 +               netif->tx->resp_prod, netif->tx->event);
  12.731 +        i++;
  12.732 +    }
  12.733 +
  12.734 +    spin_unlock_irq(&net_schedule_list_lock);
  12.735 +    printk(KERN_ALERT " ** End of netif_schedule_list **\n");
  12.736 +
  12.737 +    return IRQ_HANDLED;
  12.738 +}
  12.739 +
  12.740 +static int __init netback_init(void)
  12.741 +{
  12.742 +    int i;
  12.743 +
  12.744 +    if ( !(start_info.flags & SIF_NET_BE_DOMAIN) &&
  12.745 +         !(start_info.flags & SIF_INITDOMAIN) )
  12.746 +        return 0;
  12.747 +
  12.748 +    printk("Initialising Xen netif backend\n");
  12.749 +
  12.750 +    skb_queue_head_init(&rx_queue);
  12.751 +    skb_queue_head_init(&tx_queue);
  12.752 +
  12.753 +    netif_interface_init();
  12.754 +
  12.755 +    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
  12.756 +        BUG();
  12.757 +
  12.758 +    pending_cons = 0;
  12.759 +    pending_prod = MAX_PENDING_REQS;
  12.760 +    for ( i = 0; i < MAX_PENDING_REQS; i++ )
  12.761 +        pending_ring[i] = i;
  12.762 +
  12.763 +    spin_lock_init(&net_schedule_list_lock);
  12.764 +    INIT_LIST_HEAD(&net_schedule_list);
  12.765 +
  12.766 +    netif_ctrlif_init();
  12.767 +
  12.768 +    (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG),
  12.769 +                      netif_be_dbg, SA_SHIRQ, 
  12.770 +                      "net-be-dbg", &netif_be_dbg);
  12.771 +
  12.772 +    return 0;
  12.773 +}
  12.774 +
  12.775 +static void netback_cleanup(void)
  12.776 +{
  12.777 +    BUG();
  12.778 +}
  12.779 +
  12.780 +module_init(netback_init);
  12.781 +module_exit(netback_cleanup);
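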
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig	Fri Jul 30 18:49:24 2004 +0000
    13.3 @@ -0,0 +1,6 @@
    13.4 +
    13.5 +config XENNET
    13.6 +	tristate "Xen network driver"
    13.7 +	depends on NETDEVICES && ARCH_XEN
    13.8 +	help
    13.9 +	  Network driver for Xen
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile	Fri Jul 30 18:49:24 2004 +0000
    14.3 @@ -0,0 +1,2 @@
    14.4 +
    14.5 +obj-y	:= netfront.o
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Fri Jul 30 18:49:24 2004 +0000
    15.3 @@ -0,0 +1,882 @@
    15.4 +/******************************************************************************
    15.5 + * Virtual network driver for conversing with remote driver backends.
    15.6 + * 
    15.7 + * Copyright (c) 2002-2004, K A Fraser
    15.8 + */
    15.9 +
   15.10 +#include <linux/config.h>
   15.11 +#include <linux/module.h>
   15.12 +#include <linux/version.h>
   15.13 +#include <linux/kernel.h>
   15.14 +#include <linux/sched.h>
   15.15 +#include <linux/slab.h>
   15.16 +#include <linux/string.h>
   15.17 +#include <linux/errno.h>
   15.18 +#include <linux/netdevice.h>
   15.19 +#include <linux/inetdevice.h>
   15.20 +#include <linux/etherdevice.h>
   15.21 +#include <linux/skbuff.h>
   15.22 +#include <linux/init.h>
   15.23 +
   15.24 +#include <asm/io.h>
   15.25 +#include <net/sock.h>
   15.26 +#include <net/pkt_sched.h>
   15.27 +
   15.28 +#include <asm-xen/evtchn.h>
   15.29 +#include <asm-xen/ctrl_if.h>
   15.30 +
   15.31 +#include <asm/page.h>
   15.32 +
   15.33 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
   15.34 +#include <asm-xen/netif.h>
   15.35 +#else
   15.36 +#include "../netif.h"
   15.37 +#define irqreturn_t void
   15.38 +#define IRQ_HANDLED
   15.39 +#endif
   15.40 +
   15.41 +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
   15.42 +
   15.43 +static void network_tx_buf_gc(struct net_device *dev);
   15.44 +static void network_alloc_rx_buffers(struct net_device *dev);
   15.45 +
   15.46 +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
   15.47 +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
   15.48 +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
   15.49 +
   15.50 +static struct list_head dev_list;
   15.51 +
   15.52 +struct net_private
   15.53 +{
   15.54 +    struct list_head list;
   15.55 +    struct net_device *dev;
   15.56 +
   15.57 +    struct net_device_stats stats;
   15.58 +    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
   15.59 +    unsigned int tx_full;
   15.60 +    
   15.61 +    netif_tx_interface_t *tx;
   15.62 +    netif_rx_interface_t *rx;
   15.63 +
   15.64 +    spinlock_t   tx_lock;
   15.65 +    spinlock_t   rx_lock;
   15.66 +
   15.67 +    unsigned int handle;
   15.68 +    unsigned int evtchn;
   15.69 +    unsigned int irq;
   15.70 +
   15.71 +    /* What is the status of our connection to the remote backend? */
   15.72 +#define BEST_CLOSED       0
   15.73 +#define BEST_DISCONNECTED 1
   15.74 +#define BEST_CONNECTED    2
   15.75 +    unsigned int backend_state;
   15.76 +
   15.77 +    /* Is this interface open or closed (down or up)? */
   15.78 +#define UST_CLOSED        0
   15.79 +#define UST_OPEN          1
   15.80 +    unsigned int user_state;
   15.81 +
   15.82 +    /*
   15.83 +     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
   15.84 +     * array is an index into a chain of free entries.
   15.85 +     */
   15.86 +    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
   15.87 +    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
   15.88 +};
   15.89 +
   15.90 +/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[]. */
   15.91 +#define ADD_ID_TO_FREELIST(_list, _id)             \
   15.92 +    (_list)[(_id)] = (_list)[0];                   \
   15.93 +    (_list)[0]     = (void *)(unsigned long)(_id);
   15.94 +#define GET_ID_FROM_FREELIST(_list)                \
   15.95 + ({ unsigned long _id = (unsigned long)(_list)[0]; \
   15.96 +    (_list)[0]  = (_list)[_id];                    \
   15.97 +    (unsigned short)_id; })
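
The two macros above thread a free-list through the {tx,rx}_skbs[] arrays themselves: entry 0 holds the index of the first free slot and each free slot stores the index of the next, with the small integer ids cast into the pointer slots (they are always below __PAGE_OFFSET, so they cannot be mistaken for real skb pointers, a property the recovery code in network_connect() relies on). The same trick on a plain integer array, as a runnable sketch (slot, add_id and get_id are illustrative names):

    #include <stdio.h>

    #define NR_IDS 4

    /* Entry 0 holds the free-list head; each free entry chains to the next
     * free id.  No extra storage is needed beyond the array itself. */
    static unsigned long slot[NR_IDS + 1];

    static void add_id(unsigned long id)
    {
        slot[id] = slot[0];
        slot[0]  = id;
    }

    static unsigned long get_id(void)
    {
        unsigned long id = slot[0];
        slot[0] = slot[id];
        return id;
    }

    int main(void)
    {
        unsigned long i, a, b;
        for ( i = 1; i <= NR_IDS; i++ )   /* initially every id is free */
            add_id(i);
        a = get_id();                     /* 4 */
        b = get_id();                     /* 3 */
        printf("%lu %lu\n", a, b);
        add_id(a);                        /* slot 'a' is free again */
        printf("%lu\n", get_id());        /* hands 'a' straight back: 4 */
        return 0;
    }
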
   15.98 +
   15.99 +static struct net_device *find_dev_by_handle(unsigned int handle)
  15.100 +{
  15.101 +    struct list_head *ent;
  15.102 +    struct net_private *np;
  15.103 +    list_for_each ( ent, &dev_list )
  15.104 +    {
  15.105 +        np = list_entry(ent, struct net_private, list);
  15.106 +        if ( np->handle == handle )
  15.107 +            return np->dev;
  15.108 +    }
  15.109 +    return NULL;
  15.110 +}
  15.111 +
  15.112 +/** Network interface info. */
  15.113 +struct netif_ctrl {
  15.114 +    /** Number of interfaces. */
  15.115 +    int interface_n;
  15.116 +    /** Number of connected interfaces. */
  15.117 +    int connected_n;
  15.118 +    /** Error code. */
  15.119 +    int err;
  15.120 +};
  15.121 +
  15.122 +static struct netif_ctrl netctrl;
  15.123 +
  15.124 +static void netctrl_init(void)
  15.125 +{
  15.126 +    memset(&netctrl, 0, sizeof(netctrl));
  15.127 +    netctrl.interface_n = -1;
  15.128 +}
  15.129 +
  15.130 +/** Get or set a network interface error.
  15.131 + */
  15.132 +static int netctrl_err(int err)
  15.133 +{
  15.134 +    if ( (err < 0) && !netctrl.err ) {
  15.135 +        netctrl.err = err;
  15.136 +        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
  15.137 +    }
  15.138 +    return netctrl.err;
  15.139 +}
  15.140 +
  15.141 +/** Test if all network interfaces are connected.
  15.142 + *
  15.143 + * @return 1 if all connected, 0 if not, negative error code otherwise
  15.144 + */
  15.145 +static int netctrl_connected(void)
  15.146 +{
  15.147 +    int ok = 0;
  15.148 +    ok = (netctrl.err ? netctrl.err :
  15.149 +          (netctrl.connected_n == netctrl.interface_n));
  15.150 +    return ok;
  15.151 +}
  15.152 +
  15.153 +/** Count the connected network interfaces.
  15.154 + *
  15.155 + * @return connected count
  15.156 + */
  15.157 +static int netctrl_connected_count(void)
  15.158 +{
  15.159 +    
  15.160 +    struct list_head *ent;
  15.161 +    struct net_private *np;
  15.162 +    unsigned int connected;
  15.163 +
  15.164 +    connected = 0;
  15.165 +    
  15.166 +    list_for_each(ent, &dev_list)
  15.167 +    {
  15.168 +        np = list_entry(ent, struct net_private, list);
  15.169 +        if ( np->backend_state == BEST_CONNECTED )
  15.170 +            connected++;
  15.171 +    }
  15.172 +
  15.173 +    netctrl.connected_n = connected;
  15.174 +    return connected;
  15.175 +}
  15.176 +
  15.177 +static int network_open(struct net_device *dev)
  15.178 +{
  15.179 +    struct net_private *np = dev->priv;
  15.180 +
  15.181 +    memset(&np->stats, 0, sizeof(np->stats));
  15.182 +
  15.183 +    np->user_state = UST_OPEN;
  15.184 +
  15.185 +    network_alloc_rx_buffers(dev);
  15.186 +    np->rx->event = np->rx_resp_cons + 1;
  15.187 +
  15.188 +    netif_start_queue(dev);
  15.189 +
  15.190 +    return 0;
  15.191 +}
  15.192 +
  15.193 +
  15.194 +static void network_tx_buf_gc(struct net_device *dev)
  15.195 +{
  15.196 +    NETIF_RING_IDX i, prod;
  15.197 +    unsigned short id;
  15.198 +    struct net_private *np = dev->priv;
  15.199 +    struct sk_buff *skb;
  15.200 +
  15.201 +    if ( np->backend_state != BEST_CONNECTED )
  15.202 +        return;
  15.203 +
  15.204 +    do {
  15.205 +        prod = np->tx->resp_prod;
  15.206 +
  15.207 +        for ( i = np->tx_resp_cons; i != prod; i++ )
  15.208 +        {
  15.209 +            id  = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
  15.210 +            skb = np->tx_skbs[id];
  15.211 +            ADD_ID_TO_FREELIST(np->tx_skbs, id);
  15.212 +            dev_kfree_skb_any(skb);
  15.213 +        }
  15.214 +        
  15.215 +        np->tx_resp_cons = prod;
  15.216 +        
  15.217 +        /*
  15.218 +         * Set a new event, then check for race with update of tx_cons. Note
  15.219 +         * that it is essential to schedule a callback, no matter how few
  15.220 +         * buffers are pending. Even if there is space in the transmit ring,
  15.221 +         * higher layers may be blocked because too much data is outstanding:
  15.222 +         * in such cases notification from Xen is likely to be the only kick
  15.223 +         * that we'll get.
  15.224 +         */
  15.225 +        np->tx->event = 
  15.226 +            prod + ((np->tx->req_prod - prod) >> 1) + 1;
  15.227 +        mb();
  15.228 +    }
  15.229 +    while ( prod != np->tx->resp_prod );
  15.230 +
  15.231 +    if ( np->tx_full && 
  15.232 +         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
  15.233 +    {
  15.234 +        np->tx_full = 0;
  15.235 +        if ( np->user_state == UST_OPEN )
  15.236 +            netif_wake_queue(dev);
  15.237 +    }
  15.238 +}
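
The tx->event computation above implements interrupt mitigation: rather than requesting a notification for every response, the frontend asks to be woken once roughly half of the requests still outstanding have completed, plus one so the threshold is always strictly ahead of the current response index. A minimal sketch of just that arithmetic, with illustrative names:

    #include <stdio.h>

    /* Next event threshold given the response and request producer indices. */
    static unsigned int next_event(unsigned int resp_prod, unsigned int req_prod)
    {
        return resp_prod + ((req_prod - resp_prod) >> 1) + 1;
    }

    int main(void)
    {
        /* 16 requests posted, 4 responses seen so far: notify at resp_prod == 11. */
        printf("event index = %u\n", next_event(4, 16));
        return 0;
    }
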
  15.239 +
  15.240 +
  15.241 +static void network_alloc_rx_buffers(struct net_device *dev)
  15.242 +{
  15.243 +    unsigned short id;
  15.244 +    struct net_private *np = dev->priv;
  15.245 +    struct sk_buff *skb;
  15.246 +    NETIF_RING_IDX i = np->rx->req_prod;
  15.247 +    int nr_pfns = 0;
  15.248 +
  15.249 +    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
  15.250 +    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
  15.251 +         unlikely(np->backend_state != BEST_CONNECTED) )
  15.252 +        return;
  15.253 +
  15.254 +    do {
  15.255 +        skb = dev_alloc_skb(RX_BUF_SIZE);
  15.256 +        if ( unlikely(skb == NULL) )
  15.257 +            break;
  15.258 +
  15.259 +        skb->dev = dev;
  15.260 +
  15.261 +        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
  15.262 +            panic("alloc_skb needs to provide us page-aligned buffers.");
  15.263 +
  15.264 +        id = GET_ID_FROM_FREELIST(np->rx_skbs);
  15.265 +
  15.266 +        np->rx_skbs[id] = skb;
  15.267 +        
  15.268 +        np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
  15.269 +        
  15.270 +        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
  15.271 +
  15.272 +        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
  15.273 +        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
  15.274 +        rx_mcl[nr_pfns].args[1] = 0;
  15.275 +        rx_mcl[nr_pfns].args[2] = 0;
  15.276 +
  15.277 +        nr_pfns++;
  15.278 +    }
  15.279 +    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
  15.280 +
  15.281 +    if ( unlikely(nr_pfns == 0) )
  15.282 +        return;
  15.283 +
  15.284 +    /*
  15.285 +     * We may have allocated buffers which have entries outstanding in the page
  15.286 +     * update queue -- make sure we flush those first!
  15.287 +     */
  15.288 +    flush_page_update_queue();
  15.289 +
  15.290 +    /* After all PTEs have been zapped we blow away stale TLB entries. */
  15.291 +    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
  15.292 +
  15.293 +    /* Give away a batch of pages. */
  15.294 +    rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
  15.295 +    rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
  15.296 +    rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
  15.297 +    rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
  15.298 +
  15.299 +    /* Zap PTEs and give away pages in one big multicall. */
  15.300 +    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
  15.301 +
  15.302 +    /* Check return status of HYPERVISOR_dom_mem_op(). */
  15.303 +    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
  15.304 +        panic("Unable to reduce memory reservation\n");
  15.305 +
  15.306 +    np->rx->req_prod = i;
  15.307 +}
  15.308 +
  15.309 +
  15.310 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
  15.311 +{
  15.312 +    unsigned short id;
  15.313 +    struct net_private *np = (struct net_private *)dev->priv;
  15.314 +    netif_tx_request_t *tx;
  15.315 +    NETIF_RING_IDX i;
  15.316 +
  15.317 +    if ( unlikely(np->tx_full) )
  15.318 +    {
  15.319 +        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
  15.320 +        netif_stop_queue(dev);
  15.321 +        return -ENOBUFS;
  15.322 +    }
  15.323 +
  15.324 +    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
  15.325 +                  PAGE_SIZE) )
  15.326 +    {
  15.327 +        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
  15.328 +        if ( unlikely(new_skb == NULL) )
  15.329 +            return 1;
  15.330 +        skb_put(new_skb, skb->len);
  15.331 +        memcpy(new_skb->data, skb->data, skb->len);
  15.332 +        dev_kfree_skb(skb);
  15.333 +        skb = new_skb;
  15.334 +    }
  15.335 +    
  15.336 +    spin_lock_irq(&np->tx_lock);
  15.337 +
  15.338 +    if ( np->backend_state != BEST_CONNECTED )
  15.339 +    {
  15.340 +        spin_unlock_irq(&np->tx_lock);
  15.341 +        return 1;
  15.342 +    }
  15.343 +
  15.344 +    i = np->tx->req_prod;
  15.345 +
  15.346 +    id = GET_ID_FROM_FREELIST(np->tx_skbs);
  15.347 +    np->tx_skbs[id] = skb;
  15.348 +
  15.349 +    tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
  15.350 +
  15.351 +    tx->id   = id;
  15.352 +    tx->addr = virt_to_machine(skb->data);
  15.353 +    tx->size = skb->len;
  15.354 +
  15.355 +    wmb();
  15.356 +    np->tx->req_prod = i + 1;
  15.357 +
  15.358 +    network_tx_buf_gc(dev);
  15.359 +
  15.360 +    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
  15.361 +    {
  15.362 +        np->tx_full = 1;
  15.363 +        netif_stop_queue(dev);
  15.364 +    }
  15.365 +
  15.366 +    spin_unlock_irq(&np->tx_lock);
  15.367 +
  15.368 +    np->stats.tx_bytes += skb->len;
  15.369 +    np->stats.tx_packets++;
  15.370 +
  15.371 +    /* Only notify Xen if there are no outstanding responses. */
  15.372 +    mb();
  15.373 +    if ( np->tx->resp_prod == i )
  15.374 +        notify_via_evtchn(np->evtchn);
  15.375 +
  15.376 +    return 0;
  15.377 +}
  15.378 +
  15.379 +
  15.380 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
  15.381 +{
  15.382 +    struct net_device *dev = dev_id;
  15.383 +    struct net_private *np = dev->priv;
  15.384 +    unsigned long flags;
  15.385 +
  15.386 +    spin_lock_irqsave(&np->tx_lock, flags);
  15.387 +    network_tx_buf_gc(dev);
  15.388 +    spin_unlock_irqrestore(&np->tx_lock, flags);
  15.389 +
  15.390 +    if ( (np->rx_resp_cons != np->rx->resp_prod) &&
  15.391 +         (np->user_state == UST_OPEN) )
  15.392 +        netif_rx_schedule(dev);
  15.393 +
  15.394 +    return IRQ_HANDLED;
  15.395 +}
  15.396 +
  15.397 +
  15.398 +static int netif_poll(struct net_device *dev, int *pbudget)
  15.399 +{
  15.400 +    struct net_private *np = dev->priv;
  15.401 +    struct sk_buff *skb;
  15.402 +    netif_rx_response_t *rx;
  15.403 +    NETIF_RING_IDX i;
  15.404 +    mmu_update_t *mmu = rx_mmu;
  15.405 +    multicall_entry_t *mcl = rx_mcl;
  15.406 +    int work_done, budget, more_to_do = 1;
  15.407 +    struct sk_buff_head rxq;
  15.408 +    unsigned long flags;
  15.409 +
  15.410 +    spin_lock(&np->rx_lock);
  15.411 +
  15.412 +    if ( np->backend_state != BEST_CONNECTED )
  15.413 +    {
  15.414 +        spin_unlock(&np->rx_lock);
  15.415 +        return 0;
  15.416 +    }
  15.417 +
  15.418 +    skb_queue_head_init(&rxq);
  15.419 +
  15.420 +    if ( (budget = *pbudget) > dev->quota )
  15.421 +        budget = dev->quota;
  15.422 +
  15.423 +    for ( i = np->rx_resp_cons, work_done = 0; 
  15.424 +          (i != np->rx->resp_prod) && (work_done < budget); 
  15.425 +          i++, work_done++ )
  15.426 +    {
  15.427 +        rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
  15.428 +
  15.429 +        /*
  15.430 +         * An error here is very odd. Usually indicates a backend bug,
  15.431 +         * low-memory condition, or that we didn't have reservation headroom.
  15.432 +         * Whatever - print an error and queue the id again straight away.
  15.433 +         */
  15.434 +        if ( unlikely(rx->status <= 0) )
  15.435 +        {
  15.436 +            /* Gate this error. We get a (valid) slew of them on suspend. */
  15.437 +            if ( np->user_state == UST_OPEN )
  15.438 +                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
  15.439 +            np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
  15.440 +            wmb();
  15.441 +            np->rx->req_prod++;
  15.442 +            continue;
  15.443 +        }
  15.444 +
  15.445 +        skb = np->rx_skbs[rx->id];
  15.446 +        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
  15.447 +
  15.448 +        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
  15.449 +        skb_put(skb, rx->status);
  15.450 +
  15.451 +        np->stats.rx_packets++;
  15.452 +        np->stats.rx_bytes += rx->status;
  15.453 +
  15.454 +        /* Remap the page. */
  15.455 +        mmu->ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
  15.456 +        mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
  15.457 +        mmu++;
  15.458 +        mcl->op = __HYPERVISOR_update_va_mapping;
  15.459 +        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
  15.460 +        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
  15.461 +        mcl->args[2] = 0;
  15.462 +        mcl++;
  15.463 +
  15.464 +        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
  15.465 +            rx->addr >> PAGE_SHIFT;
  15.466 +
  15.467 +        __skb_queue_tail(&rxq, skb);
  15.468 +    }
  15.469 +
  15.470 +    /* Do all the remapping work, and M->P updates, in one big hypercall. */
  15.471 +    if ( likely((mcl - rx_mcl) != 0) )
  15.472 +    {
  15.473 +        mcl->op = __HYPERVISOR_mmu_update;
  15.474 +        mcl->args[0] = (unsigned long)rx_mmu;
  15.475 +        mcl->args[1] = mmu - rx_mmu;
  15.476 +        mcl->args[2] = 0;
  15.477 +        mcl++;
  15.478 +        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
  15.479 +    }
  15.480 +
  15.481 +    while ( (skb = __skb_dequeue(&rxq)) != NULL )
  15.482 +    {
  15.483 +        /* Set the shared-info area, which is hidden behind the real data. */
  15.484 +        atomic_set(&(skb_shinfo(skb)->dataref), 1);
  15.485 +        skb_shinfo(skb)->nr_frags = 0;
  15.486 +        skb_shinfo(skb)->frag_list = NULL;
  15.487 +
  15.488 +        /* Ethernet-specific work. Delayed to here as it peeks the header. */
  15.489 +        skb->protocol = eth_type_trans(skb, dev);
  15.490 +
  15.491 +        /* Pass it up. */
  15.492 +        netif_rx(skb);
  15.493 +        dev->last_rx = jiffies;
  15.494 +    }
  15.495 +
  15.496 +    np->rx_resp_cons = i;
  15.497 +
  15.498 +    network_alloc_rx_buffers(dev);
  15.499 +
  15.500 +    *pbudget   -= work_done;
  15.501 +    dev->quota -= work_done;
  15.502 +
  15.503 +    if ( work_done < budget )
  15.504 +    {
  15.505 +        local_irq_save(flags);
  15.506 +
  15.507 +        np->rx->event = i + 1;
  15.508 +    
  15.509 +        /* Deal with hypervisor racing our resetting of rx_event. */
  15.510 +        mb();
  15.511 +        if ( np->rx->resp_prod == i )
  15.512 +        {
  15.513 +            __netif_rx_complete(dev);
  15.514 +            more_to_do = 0;
  15.515 +        }
  15.516 +
  15.517 +        local_irq_restore(flags);
  15.518 +    }
  15.519 +
  15.520 +    spin_unlock(&np->rx_lock);
  15.521 +
  15.522 +    return more_to_do;
  15.523 +}
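
The poll loop above queues one MMU_MACHPHYS_UPDATE entry and one update_va_mapping multicall entry per received buffer, then flushes everything with a single HYPERVISOR_multicall ("one big hypercall", as the comment puts it). A minimal userspace sketch of that batch-then-flush pattern, assuming nothing about the real hypercall interface (the op/batch types and names below are illustrative stand-ins):

/*
 * Minimal userspace sketch of the batch-then-flush pattern in netif_poll():
 * queue one operation per received buffer, then issue a single expensive
 * call covering all of them.  The op/batch types and names are illustrative
 * stand-ins, not Xen interfaces.
 */
#include <stdio.h>
#include <stddef.h>

struct op { unsigned long ptr, val; };

#define BATCH_MAX 256

static struct op batch[BATCH_MAX];
static size_t batch_n;

static void queue_op(unsigned long ptr, unsigned long val)
{
    if (batch_n < BATCH_MAX) {
        batch[batch_n].ptr = ptr;
        batch[batch_n].val = val;
        batch_n++;
    }
}

static void flush_batch(void)
{
    if (batch_n == 0)
        return;                 /* mirrors the (mcl - rx_mcl) != 0 test */
    printf("one batched call covering %zu queued ops\n", batch_n);
    batch_n = 0;
}

int main(void)
{
    unsigned long pkt;

    for (pkt = 0; pkt < 5; pkt++)       /* one entry per "received packet" */
        queue_op(pkt * 4096, pkt);
    flush_batch();                      /* analogous to HYPERVISOR_multicall */
    return 0;
}

The point of the batching is to pay the expensive call once per poll rather than once per packet.
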
  15.524 +
  15.525 +
  15.526 +static int network_close(struct net_device *dev)
  15.527 +{
  15.528 +    struct net_private *np = dev->priv;
  15.529 +    np->user_state = UST_CLOSED;
  15.530 +    netif_stop_queue(np->dev);
  15.531 +    return 0;
  15.532 +}
  15.533 +
  15.534 +
  15.535 +static struct net_device_stats *network_get_stats(struct net_device *dev)
  15.536 +{
  15.537 +    struct net_private *np = (struct net_private *)dev->priv;
  15.538 +    return &np->stats;
  15.539 +}
  15.540 +
  15.541 +
  15.542 +static void network_connect(struct net_device *dev,
  15.543 +                            netif_fe_interface_status_changed_t *status)
  15.544 +{
  15.545 +    struct net_private *np;
  15.546 +    int i, requeue_idx;
  15.547 +    netif_tx_request_t *tx;
  15.548 +
  15.549 +    np = dev->priv;
  15.550 +    spin_lock_irq(&np->rx_lock);
  15.551 +    spin_lock(&np->tx_lock);
  15.552 +
  15.553 +    /* Recovery procedure: */
  15.554 +
  15.555 +    /* Step 1: Reinitialise variables. */
  15.556 +    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
  15.557 +    np->rx->event = 1;
  15.558 +
  15.559 +    /* Step 2: Rebuild the RX and TX ring contents.
  15.560 +     * NB. We could just free the queued TX packets now but we hope
  15.561 +     * that sending them out might do some good.  We have to rebuild
  15.562 +     * the RX ring because some of our pages are currently flipped out
  15.563 +     * so we can't just free the RX skbs.
  15.564 +     * NB2. Freelist index entries are always going to be less than
  15.565 +     * __PAGE_OFFSET, whereas pointers to skbs will always be equal to or
  15.566 +     * greater than __PAGE_OFFSET: we use this property to distinguish
  15.567 +     * them.
  15.568 +     */
  15.569 +
  15.570 +    /* Rebuild the TX buffer freelist and the TX ring itself.
  15.571 +     * NB. This reorders packets.  We could keep more private state
  15.572 +     * to avoid this but maybe it doesn't matter so much given the
  15.573 +     * interface has been down.
  15.574 +     */
  15.575 +    for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
  15.576 +    {
  15.577 +        if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
  15.578 +        {
  15.579 +            struct sk_buff *skb = np->tx_skbs[i];
  15.580 +
  15.581 +            tx = &np->tx->ring[requeue_idx++].req;
  15.582 +
  15.583 +            tx->id   = i;
  15.584 +            tx->addr = virt_to_machine(skb->data);
  15.585 +            tx->size = skb->len;
  15.586 +
  15.587 +            np->stats.tx_bytes += skb->len;
  15.588 +            np->stats.tx_packets++;
  15.589 +        }
  15.590 +    }
  15.591 +    wmb();
  15.592 +    np->tx->req_prod = requeue_idx;
  15.593 +
  15.594 +    /* Rebuild the RX buffer freelist and the RX ring itself. */
  15.595 +    for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
  15.596 +        if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
  15.597 +            np->rx->ring[requeue_idx++].req.id = i;
  15.598 +    wmb();                
  15.599 +    np->rx->req_prod = requeue_idx;
  15.600 +
  15.601 +    /* Step 3: All public and private state should now be sane.  Get
  15.602 +     * ready to start sending and receiving packets and give the driver
  15.603 +     * domain a kick because we've probably just requeued some
  15.604 +     * packets.
  15.605 +     */
  15.606 +    np->backend_state = BEST_CONNECTED;
  15.607 +    notify_via_evtchn(status->evtchn);  
  15.608 +    network_tx_buf_gc(dev);
  15.609 +
  15.610 +    if ( np->user_state == UST_OPEN )
  15.611 +        netif_start_queue(dev);
  15.612 +
  15.613 +    spin_unlock(&np->tx_lock);
  15.614 +    spin_unlock_irq(&np->rx_lock);
  15.615 +}
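
network_connect() relies on the property spelled out in the NB2 comment above: an entry in tx_skbs[]/rx_skbs[] is either a small freelist index or a real skb pointer, and a single comparison against __PAGE_OFFSET tells the two apart. A self-contained userspace sketch of that test, under invented values (PAGE_OFFSET_SIM and the ring size are stand-ins, not the driver's constants):

/*
 * Userspace sketch of the trick in the NB2 comment: a slot holds either a
 * small freelist index or a real pointer, and pointers are always at or
 * above __PAGE_OFFSET.  PAGE_OFFSET_SIM and RING_SIZE are invented values.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_OFFSET_SIM 0xC0000000UL   /* stand-in for __PAGE_OFFSET on i386 */
#define RING_SIZE       8

static void *slots[RING_SIZE + 1];
static int bufs[2];                    /* pretend in-flight packet buffers */

int main(void)
{
    int i, live = 0;

    /* Chain every slot into a freelist, as create_netdev() does: slot i -> i+1. */
    for (i = 0; i <= RING_SIZE; i++)
        slots[i] = (void *)(uintptr_t)(i + 1);

    /* Pretend slots 2 and 5 hold real (high) pointers to live buffers. */
    slots[2] = (void *)((uintptr_t)&bufs[0] | PAGE_OFFSET_SIM);
    slots[5] = (void *)((uintptr_t)&bufs[1] | PAGE_OFFSET_SIM);

    for (i = 1; i <= RING_SIZE; i++)
        if ((uintptr_t)slots[i] >= PAGE_OFFSET_SIM)
            live++;                    /* same comparison as the requeue loops */

    printf("%d slots hold live buffers, %d hold freelist indices\n",
           live, RING_SIZE - live);
    return 0;
}
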
  15.616 +
  15.617 +static void netif_status_change(netif_fe_interface_status_changed_t *status)
  15.618 +{
  15.619 +    ctrl_msg_t                   cmsg;
  15.620 +    netif_fe_interface_connect_t up;
  15.621 +    struct net_device *dev;
  15.622 +    struct net_private *np;
  15.623 +    
  15.624 +    if ( netctrl.interface_n <= 0 )
  15.625 +    {
  15.626 +        printk(KERN_WARNING "Status change: no interfaces\n");
  15.627 +        return;
  15.628 +    }
  15.629 +
  15.630 +    dev = find_dev_by_handle(status->handle);
  15.631 +    if ( dev == NULL ) {
  15.632 +        printk(KERN_WARNING "Status change: invalid netif handle %u\n",
  15.633 +               status->handle);
  15.634 +        return;
  15.635 +    }
  15.636 +    np  = dev->priv;
  15.637 +    
  15.638 +    switch ( status->status )
  15.639 +    {
  15.640 +    case NETIF_INTERFACE_STATUS_DESTROYED:
  15.641 +        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
  15.642 +               np->backend_state);
  15.643 +        break;
  15.644 +
  15.645 +    case NETIF_INTERFACE_STATUS_DISCONNECTED:
  15.646 +        if ( np->backend_state != BEST_CLOSED )
  15.647 +        {
  15.648 +            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
  15.649 +                   " in state %d\n", np->backend_state);
  15.650 +            printk(KERN_INFO "Attempting to reconnect network interface\n");
  15.651 +
  15.652 +            /* Begin interface recovery.
  15.653 +             *
  15.654 +             * NB. Whilst we're recovering, we turn the carrier state off.  We
  15.655 +             * take measures to ensure that this device isn't used for
  15.656 +             * anything.  We also stop the queue for this device.  Various
  15.657 +             * different approaches (e.g. continuing to buffer packets) have
  15.658 +             * been tested but don't appear to improve the overall impact on
  15.659 +             * TCP connections.
  15.660 +             *
  15.661 +             * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
  15.662 +             * is initiated by a special "RESET" message - disconnect could
  15.663 +             * just mean we're not allowed to use this interface any more.
  15.664 +             */
  15.665 +
  15.666 +            /* Stop old i/f to prevent errors whilst we rebuild the state. */
  15.667 +            spin_lock_irq(&np->tx_lock);
  15.668 +            spin_lock(&np->rx_lock);
  15.669 +            netif_stop_queue(dev);
  15.670 +            np->backend_state = BEST_DISCONNECTED;
  15.671 +            spin_unlock(&np->rx_lock);
  15.672 +            spin_unlock_irq(&np->tx_lock);
  15.673 +
  15.674 +            /* Free resources. */
  15.675 +            free_irq(np->irq, dev);
  15.676 +            unbind_evtchn_from_irq(np->evtchn);
  15.677 +            free_page((unsigned long)np->tx);
  15.678 +            free_page((unsigned long)np->rx);
  15.679 +        }
  15.680 +
  15.681 +        /* Move from CLOSED to DISCONNECTED state. */
  15.682 +        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
  15.683 +        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
  15.684 +        memset(np->tx, 0, PAGE_SIZE);
  15.685 +        memset(np->rx, 0, PAGE_SIZE);
  15.686 +        np->backend_state = BEST_DISCONNECTED;
  15.687 +
  15.688 +        /* Construct an interface-CONNECT message for the domain controller. */
  15.689 +        cmsg.type      = CMSG_NETIF_FE;
  15.690 +        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
  15.691 +        cmsg.length    = sizeof(netif_fe_interface_connect_t);
  15.692 +        up.handle      = status->handle;
  15.693 +        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
  15.694 +        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
  15.695 +        memcpy(cmsg.msg, &up, sizeof(up));
  15.696 +        
  15.697 +        /* Tell the controller to bring up the interface. */
  15.698 +        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  15.699 +        break;
  15.700 +
  15.701 +    case NETIF_INTERFACE_STATUS_CONNECTED:
  15.702 +        if ( np->backend_state == BEST_CLOSED )
  15.703 +        {
  15.704 +            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
  15.705 +                   " in state %d\n", np->backend_state);
  15.706 +            break;
  15.707 +        }
  15.708 +
  15.709 +        memcpy(dev->dev_addr, status->mac, ETH_ALEN);
  15.710 +
  15.711 +        network_connect(dev, status);
  15.712 +
  15.713 +        np->evtchn = status->evtchn;
  15.714 +        np->irq = bind_evtchn_to_irq(np->evtchn);
  15.715 +        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
  15.716 +                          dev->name, dev);
  15.717 +        
  15.718 +        netctrl_connected_count();
  15.719 +        break;
  15.720 +
  15.721 +    default:
  15.722 +        printk(KERN_WARNING "Status change to unknown value %d\n", 
  15.723 +               status->status);
  15.724 +        break;
  15.725 +    }
  15.726 +}
  15.727 +
  15.728 +/** Create a network device.
  15.729 + * @param handle device handle
  15.730 + * @param val return parameter for created device
  15.731 + * @return 0 on success, error code otherwise
  15.732 + */
  15.733 +static int create_netdev(int handle, struct net_device **val)
  15.734 +{
  15.735 +    int i, err = 0;
  15.736 +    struct net_device *dev = NULL;
  15.737 +    struct net_private *np = NULL;
  15.738 +
  15.739 +    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
  15.740 +    {
  15.741 +        printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
  15.742 +        err = -ENOMEM;
  15.743 +        goto exit;
  15.744 +    }
  15.745 +
  15.746 +    np                = dev->priv;
  15.747 +    np->backend_state = BEST_CLOSED;
  15.748 +    np->user_state    = UST_CLOSED;
  15.749 +    np->handle        = handle;
  15.750 +    
  15.751 +    spin_lock_init(&np->tx_lock);
  15.752 +    spin_lock_init(&np->rx_lock);
  15.753 +
  15.754 +    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
  15.755 +    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
  15.756 +        np->tx_skbs[i] = (void *)(i+1);
  15.757 +    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
  15.758 +        np->rx_skbs[i] = (void *)(i+1);
  15.759 +
  15.760 +    dev->open            = network_open;
  15.761 +    dev->hard_start_xmit = network_start_xmit;
  15.762 +    dev->stop            = network_close;
  15.763 +    dev->get_stats       = network_get_stats;
  15.764 +    dev->poll            = netif_poll;
  15.765 +    dev->weight          = 64;
  15.766 +    
  15.767 +    if ( (err = register_netdev(dev)) != 0 )
  15.768 +    {
  15.769 +        printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
  15.770 +        goto exit;
  15.771 +    }
  15.772 +    np->dev = dev;
  15.773 +    list_add(&np->list, &dev_list);
  15.774 +
  15.775 +  exit:
  15.776 +    if ( (err != 0) && (dev != NULL) )
  15.777 +        kfree(dev);
  15.778 +    else if ( val != NULL )
  15.779 +        *val = dev;
  15.780 +    return err;
  15.781 +}
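
create_netdev() seeds tx_skbs[]/rx_skbs[] so that slot i holds the value i+1, i.e. each free slot names the next free slot, which suggests a chain with slot 0 acting as the head. ADD_ID_TO_FREELIST (used in the receive path above) and presumably a matching "get" operation are defined earlier in netfront.c and are not shown in this hunk; the userspace model below reconstructs the same idea with local helper names, not the driver's macros:

/*
 * Userspace model of the id freelist seeded in create_netdev(): slot i is
 * initialised to i+1, so slot 0 serves as the list head and each free slot
 * names the next free one.  get_id()/free_id() are local stand-ins for the
 * driver's macros, which are outside this hunk.
 */
#include <stdio.h>

#define RING_SIZE 8

static unsigned long freelist[RING_SIZE + 1];

static unsigned long get_id(void)
{
    unsigned long id = freelist[0];

    freelist[0] = freelist[id];        /* unlink the head entry */
    return id;
}

static void free_id(unsigned long id)
{
    freelist[id] = freelist[0];        /* chain the old head behind this id */
    freelist[0]  = id;                 /* and make this id the new head */
}

int main(void)
{
    unsigned long i, a, b;

    for (i = 0; i <= RING_SIZE; i++)   /* same seeding as create_netdev() */
        freelist[i] = i + 1;

    a = get_id();
    b = get_id();
    printf("allocated ids %lu and %lu\n", a, b);
    free_id(a);
    printf("after freeing %lu, the next id handed out is %lu\n", a, get_id());
    return 0;
}
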
  15.782 +
  15.783 +/*
  15.784 + * Initialize the network control interface. Set the number of network devices
  15.785 + * and create them.
  15.786 + */
  15.787 +static void netif_driver_status_change(
  15.788 +    netif_fe_driver_status_changed_t *status)
  15.789 +{
  15.790 +    int err = 0;
  15.791 +    int i;
  15.792 +    
  15.793 +    netctrl.interface_n = status->nr_interfaces;
  15.794 +    netctrl.connected_n = 0;
  15.795 +
  15.796 +    for ( i = 0; i < netctrl.interface_n; i++ )
  15.797 +    {
  15.798 +        if ( (err = create_netdev(i, NULL)) != 0 )
  15.799 +        {
  15.800 +            netctrl_err(err);
  15.801 +            break;
  15.802 +        }
  15.803 +    }
  15.804 +}
  15.805 +
  15.806 +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
  15.807 +{
  15.808 +    int respond = 1;
  15.809 +
  15.810 +    switch ( msg->subtype )
  15.811 +    {
  15.812 +    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
  15.813 +        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
  15.814 +            goto error;
  15.815 +        netif_status_change((netif_fe_interface_status_changed_t *)
  15.816 +                            &msg->msg[0]);
  15.817 +        break;
  15.818 +
  15.819 +    case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
  15.820 +        if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
  15.821 +            goto error;
  15.822 +        netif_driver_status_change((netif_fe_driver_status_changed_t *)
  15.823 +                                   &msg->msg[0]);
  15.824 +        /* This message is a response to our driver-UP message: no reply needed. */
  15.825 +        respond = 0;
  15.826 +        break;
  15.827 +
  15.828 +    error:
  15.829 +    default:
  15.830 +        msg->length = 0;
  15.831 +        break;
  15.832 +    }
  15.833 +
  15.834 +    if ( respond )
  15.835 +        ctrl_if_send_response(msg);
  15.836 +}
  15.837 +
  15.838 +
  15.839 +static int __init netif_init(void)
  15.840 +{
  15.841 +    ctrl_msg_t                       cmsg;
  15.842 +    netif_fe_driver_status_changed_t st;
  15.843 +    int err = 0, wait_i, wait_n = 20;
  15.844 +
  15.845 +    if ( (start_info.flags & SIF_INITDOMAIN) ||
  15.846 +         (start_info.flags & SIF_NET_BE_DOMAIN) )
  15.847 +        return 0;
  15.848 +
  15.849 +    printk("Initialising Xen virtual ethernet frontend driver");
  15.850 +
  15.851 +    INIT_LIST_HEAD(&dev_list);
  15.852 +
  15.853 +    netctrl_init();
  15.854 +
  15.855 +    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
  15.856 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
  15.857 +
  15.858 +    /* Send a driver-UP notification to the domain controller. */
  15.859 +    cmsg.type      = CMSG_NETIF_FE;
  15.860 +    cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
  15.861 +    cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
  15.862 +    st.status      = NETIF_DRIVER_STATUS_UP;
  15.863 +    st.nr_interfaces = 0;
  15.864 +    memcpy(cmsg.msg, &st, sizeof(st));
  15.865 +    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  15.866 +
  15.867 +    /* Wait for all interfaces to be connected. */
  15.868 +    for ( wait_i = 0; ; wait_i++ )
  15.869 +    {
  15.870 +        if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
  15.871 +        {
  15.872 +            err = (err > 0) ? 0 : err;
  15.873 +            break;
  15.874 +        }
  15.875 +        set_current_state(TASK_INTERRUPTIBLE);
  15.876 +        schedule_timeout(1);
  15.877 +    }
  15.878 +
  15.879 +    if ( err )
  15.880 +        ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
  15.881 +
  15.882 +    return err;
  15.883 +}
  15.884 +
  15.885 +__initcall(netif_init);
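
netif_init() ends with a bounded wait: up to wait_n polls of netctrl_connected(), where (reading the loop) a positive return ends the wait successfully, zero means "keep polling", and running out of attempts yields -ENETDOWN. A userspace sketch of that retry shape, with an invented check_connected() stub standing in for netctrl_connected() and usleep() for schedule_timeout():

/*
 * Userspace sketch of the bounded wait at the end of netif_init(): poll a
 * tri-state check (>0 connected, 0 still pending, <0 failed) at most wait_n
 * times, sleeping between attempts.  check_connected() is an invented stub.
 */
#include <stdio.h>
#include <unistd.h>

static int check_connected(void)
{
    static int polls;
    return (++polls >= 3) ? 1 : 0;     /* pretend we connect on the third poll */
}

int main(void)
{
    int err, wait_i, wait_n = 20;

    for (wait_i = 0; ; wait_i++) {
        err = (wait_i < wait_n) ? check_connected() : -1 /* i.e. -ENETDOWN */;
        if (err != 0) {
            err = (err > 0) ? 0 : err; /* positive result means success */
            break;
        }
        usleep(10000);
    }
    printf("finished after %d polls, err=%d\n", wait_i + 1, err);
    return 0;
}
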
    16.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h	Fri Jul 30 14:22:41 2004 +0000
    16.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h	Fri Jul 30 18:49:24 2004 +0000
    16.3 @@ -151,6 +151,12 @@ static inline int flush_page_update_queu
    16.4  #define xen_flush_page_update_queue() (_flush_page_update_queue())
    16.5  void MULTICALL_flush_page_update_queue(void);
    16.6  
    16.7 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS
    16.8 +/* Allocate a contiguous empty region of low memory. Return virtual start. */
    16.9 +unsigned long allocate_empty_lowmem_region(unsigned long pages);
   16.10 +/* Deallocate a contiguous region of low memory. Return it to the allocator. */
   16.11 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages);
   16.12 +#endif
   16.13  
   16.14  /*
   16.15   * Assembler stubs for hyper-calls.
   16.16 @@ -389,9 +395,12 @@ static inline int HYPERVISOR_update_va_m
   16.17          "b" (page_nr), "c" ((new_val).pte_low), "d" (flags) : "memory" );
   16.18  
   16.19      if ( unlikely(ret < 0) )
   16.20 -        panic("Failed update VA mapping: %08lx, %08lx, %08lx",
   16.21 -              page_nr, (new_val).pte_low, flags);
   16.22 -    
   16.23 +    {
   16.24 +        printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
   16.25 +               page_nr, (new_val).pte_low, flags);
   16.26 +        BUG();
   16.27 +    }
   16.28 +
   16.29      return ret;
   16.30  }
   16.31  
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/linux-2.6.7-xen-sparse/mm/page_alloc.c	Fri Jul 30 18:49:24 2004 +0000
    17.3 @@ -0,0 +1,2017 @@
    17.4 +/*
    17.5 + *  linux/mm/page_alloc.c
    17.6 + *
    17.7 + *  Manages the free list; the system allocates free pages here.
    17.8 + *  Note that kmalloc() lives in slab.c
    17.9 + *
   17.10 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   17.11 + *  Swap reorganised 29.12.95, Stephen Tweedie
   17.12 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   17.13 + *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   17.14 + *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   17.15 + *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   17.16 + *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   17.17 + *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   17.18 + */
   17.19 +
   17.20 +#include <linux/config.h>
   17.21 +#include <linux/stddef.h>
   17.22 +#include <linux/mm.h>
   17.23 +#include <linux/swap.h>
   17.24 +#include <linux/interrupt.h>
   17.25 +#include <linux/pagemap.h>
   17.26 +#include <linux/bootmem.h>
   17.27 +#include <linux/compiler.h>
   17.28 +#include <linux/module.h>
   17.29 +#include <linux/suspend.h>
   17.30 +#include <linux/pagevec.h>
   17.31 +#include <linux/blkdev.h>
   17.32 +#include <linux/slab.h>
   17.33 +#include <linux/notifier.h>
   17.34 +#include <linux/topology.h>
   17.35 +#include <linux/sysctl.h>
   17.36 +#include <linux/cpu.h>
   17.37 +
   17.38 +#include <asm/tlbflush.h>
   17.39 +
   17.40 +DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
   17.41 +struct pglist_data *pgdat_list;
   17.42 +unsigned long totalram_pages;
   17.43 +unsigned long totalhigh_pages;
   17.44 +int nr_swap_pages;
   17.45 +int numnodes = 1;
   17.46 +int sysctl_lower_zone_protection = 0;
   17.47 +
   17.48 +EXPORT_SYMBOL(totalram_pages);
   17.49 +EXPORT_SYMBOL(nr_swap_pages);
   17.50 +
   17.51 +/*
   17.52 + * Used by page_zone() to look up the address of the struct zone whose
   17.53 + * id is encoded in the upper bits of page->flags
   17.54 + */
   17.55 +struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
   17.56 +EXPORT_SYMBOL(zone_table);
   17.57 +
   17.58 +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
   17.59 +int min_free_kbytes = 1024;
   17.60 +
   17.61 +/*
   17.62 + * Temporary debugging check for pages not lying within a given zone.
   17.63 + */
   17.64 +static int bad_range(struct zone *zone, struct page *page)
   17.65 +{
   17.66 +	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
   17.67 +		return 1;
   17.68 +	if (page_to_pfn(page) < zone->zone_start_pfn)
   17.69 +		return 1;
   17.70 +	if (zone != page_zone(page))
   17.71 +		return 1;
   17.72 +	return 0;
   17.73 +}
   17.74 +
   17.75 +static void bad_page(const char *function, struct page *page)
   17.76 +{
   17.77 +	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
   17.78 +		function, current->comm, page);
   17.79 +	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
   17.80 +		(unsigned long)page->flags, page->mapping,
   17.81 +		(int)page->mapcount, page_count(page));
   17.82 +	printk(KERN_EMERG "Backtrace:\n");
   17.83 +	dump_stack();
   17.84 +	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
   17.85 +	page->flags &= ~(1 << PG_private	|
   17.86 +			1 << PG_locked	|
   17.87 +			1 << PG_lru	|
   17.88 +			1 << PG_active	|
   17.89 +			1 << PG_dirty	|
   17.90 +			1 << PG_maplock |
   17.91 +			1 << PG_anon    |
   17.92 +			1 << PG_swapcache |
   17.93 +			1 << PG_writeback);
   17.94 +	set_page_count(page, 0);
   17.95 +	page->mapping = NULL;
   17.96 +	page->mapcount = 0;
   17.97 +}
   17.98 +
   17.99 +#ifndef CONFIG_HUGETLB_PAGE
  17.100 +#define prep_compound_page(page, order) do { } while (0)
  17.101 +#define destroy_compound_page(page, order) do { } while (0)
  17.102 +#else
  17.103 +/*
  17.104 + * Higher-order pages are called "compound pages".  They are structured thusly:
  17.105 + *
  17.106 + * The first PAGE_SIZE page is called the "head page".
  17.107 + *
  17.108 + * The remaining PAGE_SIZE pages are called "tail pages".
  17.109 + *
  17.110 + * All pages have PG_compound set.  All pages have their ->private pointing at
  17.111 + * the head page (even the head page has this).
  17.112 + *
  17.113 + * The first tail page's ->mapping, if non-zero, holds the address of the
  17.114 + * compound page's put_page() function.
  17.115 + *
  17.116 + * The order of the allocation is stored in the first tail page's ->index
  17.117 + * This is only for debug at present.  This usage means that zero-order pages
  17.118 + * may not be compound.
  17.119 + */
  17.120 +static void prep_compound_page(struct page *page, unsigned long order)
  17.121 +{
  17.122 +	int i;
  17.123 +	int nr_pages = 1 << order;
  17.124 +
  17.125 +	page[1].mapping = 0;
  17.126 +	page[1].index = order;
  17.127 +	for (i = 0; i < nr_pages; i++) {
  17.128 +		struct page *p = page + i;
  17.129 +
  17.130 +		SetPageCompound(p);
  17.131 +		p->private = (unsigned long)page;
  17.132 +	}
  17.133 +}
  17.134 +
  17.135 +static void destroy_compound_page(struct page *page, unsigned long order)
  17.136 +{
  17.137 +	int i;
  17.138 +	int nr_pages = 1 << order;
  17.139 +
  17.140 +	if (!PageCompound(page))
  17.141 +		return;
  17.142 +
  17.143 +	if (page[1].index != order)
  17.144 +		bad_page(__FUNCTION__, page);
  17.145 +
  17.146 +	for (i = 0; i < nr_pages; i++) {
  17.147 +		struct page *p = page + i;
  17.148 +
  17.149 +		if (!PageCompound(p))
  17.150 +			bad_page(__FUNCTION__, page);
  17.151 +		if (p->private != (unsigned long)page)
  17.152 +			bad_page(__FUNCTION__, page);
  17.153 +		ClearPageCompound(p);
  17.154 +	}
  17.155 +}
  17.156 +#endif		/* CONFIG_HUGETLB_PAGE */
  17.157 +
  17.158 +/*
  17.159 + * Freeing function for a buddy system allocator.
  17.160 + *
  17.161 + * The concept of a buddy system is to maintain direct-mapped table
  17.162 + * (containing bit values) for memory blocks of various "orders".
  17.163 + * The bottom level table contains the map for the smallest allocatable
  17.164 + * units of memory (here, pages), and each level above it describes
  17.165 + * pairs of units from the levels below, hence, "buddies".
  17.166 + * At a high level, all that happens here is marking the table entry
  17.167 + * at the bottom level available, and propagating the changes upward
  17.168 + * as necessary, plus some accounting needed to play nicely with other
  17.169 + * parts of the VM system.
  17.170 + * At each level, we keep one bit for each pair of blocks, which
  17.171 + * is set to 1 iff only one of the pair is allocated.  So when we
  17.172 + * are allocating or freeing one, we can derive the state of the
  17.173 + * other.  That is, if we allocate a small block, and both were   
  17.174 + * free, the remainder of the region must be split into blocks.   
  17.175 + * If a block is freed, and its buddy is also free, then this
  17.176 + * triggers coalescing into a block of larger size.            
  17.177 + *
  17.178 + * -- wli
  17.179 + */
  17.180 +
  17.181 +static inline void __free_pages_bulk (struct page *page, struct page *base,
  17.182 +		struct zone *zone, struct free_area *area, unsigned long mask,
  17.183 +		unsigned int order)
  17.184 +{
  17.185 +	unsigned long page_idx, index;
  17.186 +
  17.187 +	if (order)
  17.188 +		destroy_compound_page(page, order);
  17.189 +	page_idx = page - base;
  17.190 +	if (page_idx & ~mask)
  17.191 +		BUG();
  17.192 +	index = page_idx >> (1 + order);
  17.193 +
  17.194 +	zone->free_pages -= mask;
  17.195 +	while (mask + (1 << (MAX_ORDER-1))) {
  17.196 +		struct page *buddy1, *buddy2;
  17.197 +
  17.198 +		BUG_ON(area >= zone->free_area + MAX_ORDER);
  17.199 +		if (!__test_and_change_bit(index, area->map))
  17.200 +			/*
  17.201 +			 * the buddy page is still allocated.
  17.202 +			 */
  17.203 +			break;
  17.204 +		/*
  17.205 +		 * Move the buddy up one level.
  17.206 +		 * This code is taking advantage of the identity:
  17.207 +		 * 	-mask = 1+~mask
  17.208 +		 */
  17.209 +		buddy1 = base + (page_idx ^ -mask);
  17.210 +		buddy2 = base + page_idx;
  17.211 +		BUG_ON(bad_range(zone, buddy1));
  17.212 +		BUG_ON(bad_range(zone, buddy2));
  17.213 +		list_del(&buddy1->lru);
  17.214 +		mask <<= 1;
  17.215 +		area++;
  17.216 +		index >>= 1;
  17.217 +		page_idx &= mask;
  17.218 +	}
  17.219 +	list_add(&(base + page_idx)->lru, &area->free_list);
  17.220 +}
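
The free loop above leans on the identity noted in its comment: with mask = ~0UL << order, the value -mask equals the block size 1 << order, so page_idx ^ -mask flips exactly the bit that separates a block from its buddy. A standalone arithmetic check of that relation (the starting index is an arbitrary example):

/*
 * Standalone check of the index arithmetic in __free_pages_bulk(): with
 * mask = ~0UL << order, -mask equals the block size (1 << order), so
 * page_idx ^ -mask is the index of the block's buddy.
 */
#include <stdio.h>

int main(void)
{
    unsigned int order;
    unsigned long page_idx = 8;                     /* example block start */

    for (order = 0; order < 4; order++) {
        unsigned long mask = (~0UL) << order;
        unsigned long buddy_idx = page_idx ^ -mask; /* same trick as the loop */

        printf("order %u: block at %lu, buddy at %lu, block size %lu\n",
               order, page_idx, buddy_idx, -mask);
    }
    return 0;
}

For a block starting at index 8 this prints buddies 9, 10, 12 and 0 for orders 0 through 3, exactly the pairings the coalescing loop walks as it moves up the free areas.
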
  17.221 +
  17.222 +static inline void free_pages_check(const char *function, struct page *page)
  17.223 +{
  17.224 +	if (	page_mapped(page) ||
  17.225 +		page->mapping != NULL ||
  17.226 +		page_count(page) != 0 ||
  17.227 +		(page->flags & (
  17.228 +			1 << PG_lru	|
  17.229 +			1 << PG_private |
  17.230 +			1 << PG_locked	|
  17.231 +			1 << PG_active	|
  17.232 +			1 << PG_reclaim	|
  17.233 +			1 << PG_slab	|
  17.234 +			1 << PG_maplock |
  17.235 +			1 << PG_anon    |
  17.236 +			1 << PG_swapcache |
  17.237 +			1 << PG_writeback )))
  17.238 +		bad_page(function, page);
  17.239 +	if (PageDirty(page))
  17.240 +		ClearPageDirty(page);
  17.241 +}
  17.242 +
  17.243 +/*
  17.244 + * Frees a list of pages. 
  17.245 + * Assumes all pages on list are in same zone, and of same order.
  17.246 + * count is the number of pages to free, or 0 for all on the list.
  17.247 + *
  17.248 + * If the zone was previously in an "all pages pinned" state then look to
  17.249 + * see if this freeing clears that state.
  17.250 + *
  17.251 + * And clear the zone's pages_scanned counter, to hold off the "all pages are
  17.252 + * pinned" detection logic.
  17.253 + */
  17.254 +static int
  17.255 +free_pages_bulk(struct zone *zone, int count,
  17.256 +		struct list_head *list, unsigned int order)
  17.257 +{
  17.258 +	unsigned long mask, flags;
  17.259 +	struct free_area *area;
  17.260 +	struct page *base, *page = NULL;
  17.261 +	int ret = 0;
  17.262 +
  17.263 +	mask = (~0UL) << order;
  17.264 +	base = zone->zone_mem_map;
  17.265 +	area = zone->free_area + order;
  17.266 +	spin_lock_irqsave(&zone->lock, flags);
  17.267 +	zone->all_unreclaimable = 0;
  17.268 +	zone->pages_scanned = 0;
  17.269 +	while (!list_empty(list) && count--) {
  17.270 +		page = list_entry(list->prev, struct page, lru);
  17.271 +		/* have to delete it as __free_pages_bulk manipulates the list */
  17.272 +		list_del(&page->lru);
  17.273 +		__free_pages_bulk(page, base, zone, area, mask, order);
  17.274 +		ret++;
  17.275 +	}
  17.276 +	spin_unlock_irqrestore(&zone->lock, flags);
  17.277 +	return ret;
  17.278 +}
  17.279 +
  17.280 +void __free_pages_ok(struct page *page, unsigned int order)
  17.281 +{
  17.282 +	LIST_HEAD(list);
  17.283 +	int i;
  17.284 +
  17.285 +	mod_page_state(pgfree, 1 << order);
  17.286 +	for (i = 0 ; i < (1 << order) ; ++i)
  17.287 +		free_pages_check(__FUNCTION__, page + i);
  17.288 +	list_add(&page->lru, &list);
  17.289 +	kernel_map_pages(page, 1<<order, 0);
  17.290 +	free_pages_bulk(page_zone(page), 1, &list, order);
  17.291 +}
  17.292 +
  17.293 +#define MARK_USED(index, order, area) \
  17.294 +	__change_bit((index) >> (1+(order)), (area)->map)
  17.295 +
  17.296 +static inline struct page *
  17.297 +expand(struct zone *zone, struct page *page,
  17.298 +	 unsigned long index, int low, int high, struct free_area *area)
  17.299 +{
  17.300 +	unsigned long size = 1 << high;
  17.301 +
  17.302 +	while (high > low) {
  17.303 +		BUG_ON(bad_range(zone, page));
  17.304 +		area--;
  17.305 +		high--;
  17.306 +		size >>= 1;
  17.307 +		list_add(&page->lru, &area->free_list);
  17.308 +		MARK_USED(index, high, area);
  17.309 +		index += size;
  17.310 +		page += size;
  17.311 +	}
  17.312 +	return page;
  17.313 +}
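
expand() hands the front sub-block of a too-large block back to successively smaller free lists and keeps splitting the remainder until only a block of the requested order is left. A standalone trace of that split for an order-3 block satisfying an order-0 request (the indices are purely illustrative):

/*
 * Standalone trace of the split performed by expand(): the front sub-blocks
 * go back on smaller free lists and the final page is handed to the caller.
 */
#include <stdio.h>

int main(void)
{
    unsigned int low = 0, high = 3;        /* requested order vs. found order */
    unsigned long index = 0, size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        printf("put order-%u block at index %lu (%lu pages) back on its free list\n",
               high, index, size);
        index += size;                     /* the kept remainder starts here */
    }
    printf("return the order-%u page at index %lu\n", low, index);
    return 0;
}
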
  17.314 +
  17.315 +static inline void set_page_refs(struct page *page, int order)
  17.316 +{
  17.317 +#ifdef CONFIG_MMU
  17.318 +	set_page_count(page, 1);
  17.319 +#else
  17.320 +	int i;
  17.321 +
  17.322 +	/*
  17.323 +	 * We need to reference all the pages for this order, otherwise if
  17.324 +	 * anyone accesses one of the pages with (get/put) it will be freed.
  17.325 +	 */
  17.326 +	for (i = 0; i < (1 << order); i++)
  17.327 +		set_page_count(page+i, 1);
  17.328 +#endif /* CONFIG_MMU */
  17.329 +}
  17.330 +
  17.331 +/*
  17.332 + * This page is about to be returned from the page allocator
  17.333 + */
  17.334 +static void prep_new_page(struct page *page, int order)
  17.335 +{
  17.336 +	if (page->mapping || page_mapped(page) ||
  17.337 +	    (page->flags & (
  17.338 +			1 << PG_private	|
  17.339 +			1 << PG_locked	|
  17.340 +			1 << PG_lru	|
  17.341 +			1 << PG_active	|
  17.342 +			1 << PG_dirty	|
  17.343 +			1 << PG_reclaim	|
  17.344 +			1 << PG_maplock |
  17.345 +			1 << PG_anon    |
  17.346 +			1 << PG_swapcache |
  17.347 +			1 << PG_writeback )))
  17.348 +		bad_page(__FUNCTION__, page);
  17.349 +
  17.350 +	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
  17.351 +			1 << PG_referenced | 1 << PG_arch_1 |
  17.352 +			1 << PG_checked | 1 << PG_mappedtodisk);
  17.353 +	page->private = 0;
  17.354 +	set_page_refs(page, order);
  17.355 +}
  17.356 +
  17.357 +/* 
  17.358 + * Do the hard work of removing an element from the buddy allocator.
  17.359 + * Call me with the zone->lock already held.
  17.360 + */
  17.361 +static struct page *__rmqueue(struct zone *zone, unsigned int order)
  17.362 +{
  17.363 +	struct free_area * area;
  17.364 +	unsigned int current_order;
  17.365 +	struct page *page;
  17.366 +	unsigned int index;
  17.367 +
  17.368 +	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  17.369 +		area = zone->free_area + current_order;
  17.370 +		if (list_empty(&area->free_list))
  17.371 +			continue;
  17.372 +
  17.373 +		page = list_entry(area->free_list.next, struct page, lru);
  17.374 +		list_del(&page->lru);
  17.375 +		index = page - zone->zone_mem_map;
  17.376 +		if (current_order != MAX_ORDER-1)
  17.377 +			MARK_USED(index, current_order, area);
  17.378 +		zone->free_pages -= 1UL << order;
  17.379 +		return expand(zone, page, index, order, current_order, area);
  17.380 +	}
  17.381 +
  17.382 +	return NULL;
  17.383 +}
  17.384 +
  17.385 +/* 
  17.386 + * Obtain a specified number of elements from the buddy allocator, all under
  17.387 + * a single hold of the lock, for efficiency.  Add them to the supplied list.
  17.388 + * Returns the number of new pages which were placed at *list.
  17.389 + */
  17.390 +static int rmqueue_bulk(struct zone *zone, unsigned int order, 
  17.391 +			unsigned long count, struct list_head *list)
  17.392 +{
  17.393 +	unsigned long flags;
  17.394 +	int i;
  17.395 +	int allocated = 0;
  17.396 +	struct page *page;
  17.397 +	
  17.398 +	spin_lock_irqsave(&zone->lock, flags);
  17.399 +	for (i = 0; i < count; ++i) {
  17.400 +		page = __rmqueue(zone, order);
  17.401 +		if (page == NULL)
  17.402 +			break;
  17.403 +		allocated++;
  17.404 +		list_add_tail(&page->lru, list);
  17.405 +	}
  17.406 +	spin_unlock_irqrestore(&zone->lock, flags);
  17.407 +	return allocated;
  17.408 +}
  17.409 +
  17.410 +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  17.411 +static void __drain_pages(unsigned int cpu)
  17.412 +{
  17.413 +	struct zone *zone;
  17.414 +	int i;
  17.415 +
  17.416 +	for_each_zone(zone) {
  17.417 +		struct per_cpu_pageset *pset;
  17.418 +
  17.419 +		pset = &zone->pageset[cpu];
  17.420 +		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
  17.421 +			struct per_cpu_pages *pcp;
  17.422 +
  17.423 +			pcp = &pset->pcp[i];
  17.424 +			pcp->count -= free_pages_bulk(zone, pcp->count,
  17.425 +						&pcp->list, 0);
  17.426 +		}
  17.427 +	}
  17.428 +}
  17.429 +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
  17.430 +
  17.431 +#ifdef CONFIG_PM
  17.432 +int is_head_of_free_region(struct page *page)
  17.433 +{
  17.434 +        struct zone *zone = page_zone(page);
  17.435 +        unsigned long flags;
  17.436 +	int order;
  17.437 +	struct list_head *curr;
  17.438 +
  17.439 +	/*
  17.440 +	 * Should not matter as we need quiescent system for
  17.441 +	 * suspend anyway, but...
  17.442 +	 */
  17.443 +	spin_lock_irqsave(&zone->lock, flags);
  17.444 +	for (order = MAX_ORDER - 1; order >= 0; --order)
  17.445 +		list_for_each(curr, &zone->free_area[order].free_list)
  17.446 +			if (page == list_entry(curr, struct page, lru)) {
  17.447 +				spin_unlock_irqrestore(&zone->lock, flags);
  17.448 +				return 1 << order;
  17.449 +			}
  17.450 +	spin_unlock_irqrestore(&zone->lock, flags);
  17.451 +        return 0;
  17.452 +}
  17.453 +
  17.454 +/*
  17.455 + * Spill all of this CPU's per-cpu pages back into the buddy allocator.
  17.456 + */
  17.457 +void drain_local_pages(void)
  17.458 +{
  17.459 +	unsigned long flags;
  17.460 +
  17.461 +	local_irq_save(flags);	
  17.462 +	__drain_pages(smp_processor_id());
  17.463 +	local_irq_restore(flags);	
  17.464 +}
  17.465 +#endif /* CONFIG_PM */
  17.466 +
  17.467 +static void zone_statistics(struct zonelist *zonelist, struct zone *z)
  17.468 +{
  17.469 +#ifdef CONFIG_NUMA
  17.470 +	unsigned long flags;
  17.471 +	int cpu;
  17.472 +	pg_data_t *pg = z->zone_pgdat;
  17.473 +	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
  17.474 +	struct per_cpu_pageset *p;
  17.475 +
  17.476 +	local_irq_save(flags);
  17.477 +	cpu = smp_processor_id();
  17.478 +	p = &z->pageset[cpu];
  17.479 +	if (pg == orig) {
  17.480 +		z->pageset[cpu].numa_hit++;
  17.481 +	} else {
  17.482 +		p->numa_miss++;
  17.483 +		zonelist->zones[0]->pageset[cpu].numa_foreign++;
  17.484 +	}
  17.485 +	if (pg == NODE_DATA(numa_node_id()))
  17.486 +		p->local_node++;
  17.487 +	else
  17.488 +		p->other_node++;
  17.489 +	local_irq_restore(flags);
  17.490 +#endif
  17.491 +}
  17.492 +
  17.493 +/*
  17.494 + * Free a 0-order page
  17.495 + */
  17.496 +static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
  17.497 +static void fastcall free_hot_cold_page(struct page *page, int cold)
  17.498 +{
  17.499 +	struct zone *zone = page_zone(page);
  17.500 +	struct per_cpu_pages *pcp;
  17.501 +	unsigned long flags;
  17.502 +
  17.503 +	/* XXX Xen: use mapping pointer as skb/data-page destructor */
  17.504 +	if (page->mapping)
  17.505 +		return (*(void(*)(struct page *))page->mapping)(page);
  17.506 +
  17.507 +	kernel_map_pages(page, 1, 0);
  17.508 +	inc_page_state(pgfree);
  17.509 +	free_pages_check(__FUNCTION__, page);
  17.510 +	pcp = &zone->pageset[get_cpu()].pcp[cold];
  17.511 +	local_irq_save(flags);
  17.512 +	if (pcp->count >= pcp->high)
  17.513 +		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
  17.514 +	list_add(&page->lru, &pcp->list);
  17.515 +	pcp->count++;
  17.516 +	local_irq_restore(flags);
  17.517 +	put_cpu();
  17.518 +}
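
The Xen-specific check at the top of free_hot_cold_page() treats a non-NULL page->mapping as a destructor for skb/data pages, presumably so such pages can be reclaimed by their owning driver rather than dropping back onto the per-CPU free lists. A userspace model of that interception, with invented types and names (struct fake_page and the callback are illustrative, not kernel structures):

/*
 * Userspace model of the Xen check at the top of free_hot_cold_page(): if a
 * page carries a destructor in its otherwise-unused mapping field, the free
 * path hands the page to that callback instead of the per-CPU free lists.
 */
#include <stdio.h>

struct fake_page {
    void (*mapping_destructor)(struct fake_page *);  /* stand-in for page->mapping */
    int id;
};

static void driver_page_release(struct fake_page *pg)
{
    printf("page %d handed back to its owning driver\n", pg->id);
}

static void free_one_page(struct fake_page *pg)
{
    if (pg->mapping_destructor) {          /* the Xen hook: divert the free */
        pg->mapping_destructor(pg);
        return;
    }
    printf("page %d freed to the generic allocator\n", pg->id);
}

int main(void)
{
    struct fake_page plain   = { NULL, 1 };
    struct fake_page granted = { driver_page_release, 2 };

    free_one_page(&plain);
    free_one_page(&granted);
    return 0;
}
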
  17.519 +
  17.520 +void fastcall free_hot_page(struct page *page)
  17.521 +{
  17.522 +	free_hot_cold_page(page, 0);
  17.523 +}
  17.524 +	
  17.525 +void fastcall free_cold_page(struct page *page)
  17.526 +{
  17.527 +	free_hot_cold_page(page, 1);
  17.528 +}
  17.529 +
  17.530 +/*
  17.531 + * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  17.532 + * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  17.533 + * or two.
  17.534 + */
  17.535 +
  17.536 +static struct page *
  17.537 +buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
  17.538 +{
  17.539 +	unsigned long flags;
  17.540 +	struct page *page = NULL;
  17.541 +	int cold = !!(gfp_flags & __GFP_COLD);
  17.542 +
  17.543 +	if (order == 0) {
  17.544 +		struct per_cpu_pages *pcp;
  17.545 +
  17.546 +		pcp = &zone->pageset[get_cpu()].pcp[cold];
  17.547 +		local_irq_save(flags);
  17.548 +		if (pcp->count <= pcp->low)
  17.549 +			pcp->count += rmqueue_bulk(zone, 0,
  17.550 +						pcp->batch, &pcp->list);
  17.551 +		if (pcp->count) {
  17.552 +			page = list_entry(pcp->list.next, struct page, lru);
  17.553 +			list_del(&page->lru);
  17.554 +			pcp->count--;
  17.555 +		}
  17.556 +		local_irq_restore(flags);
  17.557 +		put_cpu();
  17.558 +	}
  17.559 +
  17.560 +	if (page == NULL) {
  17.561 +		spin_lock_irqsave(&zone->lock, flags);
  17.562 +		page = __rmqueue(zone, order);
  17.563 +		spin_unlock_irqrestore(&zone->lock, flags);
  17.564 +	}
  17.565 +
  17.566 +	if (page != NULL) {
  17.567 +		BUG_ON(bad_range(zone, page));
  17.568 +		mod_page_state_zone(zone, pgalloc, 1 << order);
  17.569 +		prep_new_page(page, order);
  17.570 +		if (order && (gfp_flags & __GFP_COMP))
  17.571 +			prep_compound_page(page, order);
  17.572 +	}
  17.573 +	return page;
  17.574 +}
  17.575 +
  17.576 +/*
  17.577 + * This is the 'heart' of the zoned buddy allocator.
  17.578 + *
  17.579 + * Herein lies the mysterious "incremental min".  That's the
  17.580 + *
  17.581 + *	local_low = z->pages_low;
  17.582 + *	min += local_low;
  17.583 + *
  17.584 + * thing.  The intent here is to provide additional protection to low zones for
  17.585 + * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
  17.586 + * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
  17.587 + * request.  This preserves additional space in those lower zones for requests
  17.588 + * which really do need memory from those zones.  It means that on a decent
  17.589 + * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
  17.590 + * zone untouched.
  17.591 + */
  17.592 +struct page * fastcall
  17.593 +__alloc_pages(unsigned int gfp_mask, unsigned int order,
  17.594 +		struct zonelist *zonelist)
  17.595 +{
  17.596 +	const int wait = gfp_mask & __GFP_WAIT;
  17.597 +	unsigned long min;
  17.598 +	struct zone **zones;
  17.599 +	struct page *page;
  17.600 +	struct reclaim_state reclaim_state;
  17.601 +	struct task_struct *p = current;
  17.602 +	int i;
  17.603 +	int alloc_type;
  17.604 +	int do_retry;
  17.605 +
  17.606 +	might_sleep_if(wait);
  17.607 +
  17.608 +	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
  17.609 +	if (zones[0] == NULL)     /* no zones in the zonelist */
  17.610 +		return NULL;
  17.611 +
  17.612 +	alloc_type = zone_idx(zones[0]);
  17.613 +
  17.614 +	/* Go through the zonelist once, looking for a zone with enough free */
  17.615 +	for (i = 0; zones[i] != NULL; i++) {
  17.616 +		struct zone *z = zones[i];
  17.617 +
  17.618 +		min = (1<<order) + z->protection[alloc_type];
  17.619 +
  17.620 +		/*
  17.621 +		 * We let real-time tasks dip their real-time paws a little
  17.622 +		 * deeper into reserves.
  17.623 +		 */
  17.624 +		if (rt_task(p))
  17.625 +			min -= z->pages_low >> 1;
  17.626 +
  17.627 +		if (z->free_pages >= min ||
  17.628 +				(!wait && z->free_pages >= z->pages_high)) {
  17.629 +			page = buffered_rmqueue(z, order, gfp_mask);
  17.630 +			if (page) {
  17.631 +				zone_statistics(zonelist, z);
  17.632 +				goto got_pg;
  17.633 +			}
  17.634 +		}
  17.635 +	}
  17.636 +
  17.637 +	/* we're somewhat low on memory, failed to find what we needed */
  17.638 +	for (i = 0; zones[i] != NULL; i++)
  17.639 +		wakeup_kswapd(zones[i]);
  17.640 +
  17.641 +	/* Go through the zonelist again, taking __GFP_HIGH into account */
  17.642 +	for (i = 0; zones[i] != NULL; i++) {
  17.643 +		struct zone *z = zones[i];
  17.644 +
  17.645 +		min = (1<<order) + z->protection[alloc_type];
  17.646 +
  17.647 +		if (gfp_mask & __GFP_HIGH)
  17.648 +			min -= z->pages_low >> 2;
  17.649 +		if (rt_task(p))
  17.650 +			min -= z->pages_low >> 1;
  17.651 +
  17.652 +		if (z->free_pages >= min ||
  17.653 +				(!wait && z->free_pages >= z->pages_high)) {
  17.654 +			page = buffered_rmqueue(z, order, gfp_mask);
  17.655 +			if (page) {
  17.656 +				zone_statistics(zonelist, z);
  17.657 +				goto got_pg;
  17.658 +			}
  17.659 +		}
  17.660 +	}
  17.661 +
  17.662 +	/* here we're in the low-on-memory slow path */
  17.663 +
  17.664 +rebalance:
  17.665 +	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
  17.666 +		/* go through the zonelist yet again, ignoring mins */
  17.667 +		for (i = 0; zones[i] != NULL; i++) {
  17.668 +			struct zone *z = zones[i];
  17.669 +
  17.670 +			page = buffered_rmqueue(z, order, gfp_mask);
  17.671 +			if (page) {
  17.672 +				zone_statistics(zonelist, z);
  17.673 +				goto got_pg;
  17.674 +			}
  17.675 +		}
  17.676 +		goto nopage;
  17.677 +	}
  17.678 +
  17.679 +	/* Atomic allocations - we can't balance anything */
  17.680 +	if (!wait)
  17.681 +		goto nopage;
  17.682 +
  17.683 +	p->flags |= PF_MEMALLOC;
  17.684 +	reclaim_state.reclaimed_slab = 0;
  17.685 +	p->reclaim_state = &reclaim_state;
  17.686 +
  17.687 +	try_to_free_pages(zones, gfp_mask, order);
  17.688 +
  17.689 +	p->reclaim_state = NULL;
  17.690 +	p->flags &= ~PF_MEMALLOC;
  17.691 +
  17.692 +	/* go through the zonelist yet one more time */
  17.693 +	for (i = 0; zones[i] != NULL; i++) {
  17.694 +		struct zone *z = zones[i];
  17.695 +
  17.696 +		min = (1UL << order) + z->protection[alloc_type];
  17.697 +
  17.698 +		if (z->free_pages >= min ||
  17.699 +				(!wait && z->free_pages >= z->pages_high)) {
  17.700 +			page = buffered_rmqueue(z, order, gfp_mask);
  17.701 +			if (page) {
  17.702 + 				zone_statistics(zonelist, z);
  17.703 +				goto got_pg;
  17.704 +			}
  17.705 +		}
  17.706 +	}
  17.707 +
  17.708 +	/*
  17.709 +	 * Don't let big-order allocations loop unless the caller explicitly
  17.710 +	 * requests that.  Wait for some write requests to complete then retry.
  17.711 +	 *
  17.712 +	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
  17.713 +	 * may not be true in other implementations.
  17.714 +	 */
  17.715 +	do_retry = 0;
  17.716 +	if (!(gfp_mask & __GFP_NORETRY)) {
  17.717 +		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
  17.718 +			do_retry = 1;
  17.719 +		if (gfp_mask & __GFP_NOFAIL)
  17.720 +			do_retry = 1;
  17.721 +	}
  17.722 +	if (do_retry) {
  17.723 +		blk_congestion_wait(WRITE, HZ/50);
  17.724 +		goto rebalance;
  17.725 +	}
  17.726 +
  17.727 +nopage:
  17.728 +	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
  17.729 +		printk(KERN_WARNING "%s: page allocation failure."
  17.730 +			" order:%d, mode:0x%x\n",
  17.731 +			p->comm, order, gfp_mask);
  17.732 +		dump_stack();
  17.733 +	}
  17.734 +	return NULL;
  17.735 +got_pg:
  17.736 +	kernel_map_pages(page, 1 << order, 1);
  17.737 +	return page;
  17.738 +}
  17.739 +
  17.740 +EXPORT_SYMBOL(__alloc_pages);
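
The zone scans in __alloc_pages() all apply the same test: compute min = (1 << order) + z->protection[alloc_type], lower it for __GFP_HIGH and real-time callers on the later passes, and take pages from the zone only if z->free_pages clears the resulting threshold. A worked numeric example of those adjustments (all values invented):

/*
 * Worked example of the watermark test repeated in __alloc_pages():
 * min = (1 << order) + z->protection[alloc_type], lowered for __GFP_HIGH and
 * real-time callers, then compared with z->free_pages.  All numbers invented.
 */
#include <stdio.h>

int main(void)
{
    unsigned long free_pages = 250;
    unsigned long protection = 256;    /* z->protection[alloc_type] */
    unsigned long pages_low  = 64;     /* z->pages_low */
    unsigned int  order      = 2;      /* request for 4 contiguous pages */
    unsigned long min, min_high, min_rt;

    min = (1UL << order) + protection;
    printf("plain caller:     min=%lu free=%lu -> %s\n", min, free_pages,
           free_pages >= min ? "allocate" : "skip this zone");

    min_high = min - (pages_low >> 2); /* __GFP_HIGH dips a quarter of pages_low deeper */
    printf("__GFP_HIGH:       min=%lu free=%lu -> %s\n", min_high, free_pages,
           free_pages >= min_high ? "allocate" : "skip this zone");

    min_rt = min - (pages_low >> 1);   /* rt_task() dips half of pages_low deeper */
    printf("real-time caller: min=%lu free=%lu -> %s\n", min_rt, free_pages,
           free_pages >= min_rt ? "allocate" : "skip this zone");
    return 0;
}

With these numbers the plain request skips the zone (250 < 260) while the __GFP_HIGH and real-time variants succeed, which is exactly the "dip deeper into reserves" behaviour described in the function's comments.
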
  17.741 +
  17.742 +#ifdef CONFIG_NUMA
  17.743 +/* Early boot: Everything is done by one cpu, but the data structures will be
  17.744 + * used by all cpus - spread them on all nodes.
  17.745 + */
  17.746 +static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
  17.747 +{
  17.748 +	static int nodenr;
  17.749 +	int i = nodenr;
  17.750 +	struct page *page;
  17.751 +
  17.752 +	for (;;) {
  17.753 +		if (i > nodenr + numnodes)
  17.754 +			return 0;
  17.755 +		if (node_present_pages(i%numnodes)) {
  17.756 +			struct zone **z;
  17.757 +			/* The node contains memory. Check that there is
  17.758 +			 * memory in the intended zonelist.
  17.759 +			 */
  17.760 +			z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
  17.761 +			while (*z) {
  17.762 +				if ( (*z)->free_pages > (1UL<<order))
  17.763 +					goto found_node;
  17.764 +				z++;
  17.765 +			}
  17.766 +		}
  17.767 +		i++;
  17.768 +	}
  17.769 +found_node:
  17.770 +	nodenr = i+1;
  17.771 +	page = alloc_pages_node(i%numnodes, gfp_mask, order);
  17.772 +	if (!page)
  17.773 +		return 0;
  17.774 +	return (unsigned long) page_address(page);
  17.775 +}
  17.776 +#endif
  17.777 +
  17.778 +/*
  17.779 + * Common helper functions.
  17.780 + */
  17.781 +fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
  17.782 +{
  17.783 +	struct page * page;
  17.784 +
  17.785 +#ifdef CONFIG_NUMA
  17.786 +	if (unlikely(system_state == SYSTEM_BOOTING))
  17.787 +		return get_boot_pages(gfp_mask, order);
  17.788 +#endif
  17.789 +	page = alloc_pages(gfp_mask, order);
  17.790 +	if (!page)
  17.791 +		return 0;
  17.792 +	return (unsigned long) page_address(page);
  17.793 +}
  17.794 +
  17.795 +EXPORT_SYMBOL(__get_free_pages);
  17.796 +
  17.797 +fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
  17.798 +{
  17.799 +	struct page * page;
  17.800 +
  17.801 +	/*
  17.802 +	 * get_zeroed_page() returns a 32-bit address, which cannot represent
  17.803 +	 * a highmem page
  17.804 +	 */
  17.805 +	BUG_ON(gfp_mask & __GFP_HIGHMEM);
  17.806 +
  17.807 +	page = alloc_pages(gfp_mask, 0);
  17.808 +	if (page) {
  17.809 +		void *address = page_address(page);
  17.810 +		clear_page(address);
  17.811 +		return (unsigned long) address;
  17.812 +	}
  17.813 +	return 0;
  17.814 +}
  17.815 +
  17.816 +EXPORT_SYMBOL(get_zeroed_page);
  17.817 +
  17.818 +void __pagevec_free(struct pagevec *pvec)
  17.819 +{
  17.820 +	int i = pagevec_count(pvec);
  17.821 +
  17.822 +	while (--i >= 0)
  17.823 +		free_hot_cold_page(pvec->pages[i], pvec->cold);
  17.824 +}
  17.825 +
  17.826 +fastcall void __free_pages(struct page *page, unsigned int order)
  17.827 +{
  17.828 +	if (!PageReserved(page) && put_page_testzero(page)) {
  17.829 +		if (order == 0)
  17.830 +			free_hot_page(page);
  17.831 +		else
  17.832 +			__free_pages_ok(page, order);
  17.833 +	}
  17.834 +}
  17.835 +
  17.836 +EXPORT_SYMBOL(__free_pages);
  17.837 +
  17.838 +fastcall void free_pages(unsigned long addr, unsigned int order)
  17.839 +{
  17.840 +	if (addr != 0) {
  17.841 +		BUG_ON(!virt_addr_valid(addr));
  17.842 +		__free_pages(virt_to_page(addr), order);
  17.843 +	}
  17.844 +}
  17.845 +
  17.846 +EXPORT_SYMBOL(free_pages);
  17.847 +
  17.848 +/*
  17.849 + * Total amount of free (allocatable) RAM:
  17.850 + */
  17.851 +unsigned int nr_free_pages(void)
  17.852 +{
  17.853 +	unsigned int sum = 0;
  17.854 +	struct zone *zone;
  17.855 +
  17.856 +	for_each_zone(zone)
  17.857 +		sum += zone->free_pages;
  17.858 +
  17.859 +	return sum;
  17.860 +}
  17.861 +
  17.862 +EXPORT_SYMBOL(nr_free_pages);
  17.863 +
  17.864 +unsigned int nr_used_zone_pages(void)
  17.865 +{
  17.866 +	unsigned int pages = 0;
  17.867 +	struct zone *zone;
  17.868 +
  17.869 +	for_each_zone(zone)
  17.870 +		pages += zone->nr_active + zone->nr_inactive;
  17.871 +
  17.872 +	return pages;
  17.873 +}
  17.874 +
  17.875 +#ifdef CONFIG_NUMA
  17.876 +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
  17.877 +{
  17.878 +	unsigned int i, sum = 0;
  17.879 +
  17.880 +	for (i = 0; i < MAX_NR_ZONES; i++)
  17.881 +		sum += pgdat->node_zones[i].free_pages;
  17.882 +
  17.883 +	return sum;
  17.884 +}
  17.885 +#endif
  17.886 +
  17.887 +static unsigned int nr_free_zone_pages(int offset)
  17.888 +{
  17.889 +	pg_data_t *pgdat;
  17.890 +	unsigned int sum = 0;
  17.891 +
  17.892 +	for_each_pgdat(pgdat) {
  17.893 +		struct zonelist *zonelist = pgdat->node_zonelists + offset;
  17.894 +		struct zone **zonep = zonelist->zones;
  17.895 +		struct zone *zone;
  17.896 +
  17.897 +		for (zone = *zonep++; zone; zone = *zonep++) {
  17.898 +			unsigned long size = zone->present_pages;
  17.899 +			unsigned long high = zone->pages_high;
  17.900 +			if (size > high)
  17.901 +				sum += size - high;
  17.902 +		}
  17.903 +	}
  17.904 +
  17.905 +	return sum;
  17.906 +}
  17.907 +
  17.908 +/*
  17.909 + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
  17.910 + */
  17.911 +unsigned int nr_free_buffer_pages(void)
  17.912 +{
  17.913 +	return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
  17.914 +}
  17.915 +
  17.916 +/*
  17.917 + * Amount of free RAM allocatable within all zones
  17.918 + */
  17.919 +unsigned int nr_free_pagecache_pages(void)
  17.920 +{
  17.921 +	return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
  17.922 +}
  17.923 +
  17.924 +#ifdef CONFIG_HIGHMEM
  17.925 +unsigned int nr_free_highpages (void)
  17.926 +{
  17.927 +	pg_data_t *pgdat;
  17.928 +	unsigned int pages = 0;
  17.929 +
  17.930 +	for_each_pgdat(pgdat)
  17.931 +		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
  17.932 +
  17.933 +	return pages;
  17.934 +}
  17.935 +#endif
  17.936 +
  17.937 +#ifdef CONFIG_NUMA
  17.938 +static void show_node(struct zone *zone)
  17.939 +{
  17.940 +	printk("Node %d ", zone->zone_pgdat->node_id);
  17.941 +}
  17.942 +#else
  17.943 +#define show_node(zone)	do { } while (0)
  17.944 +#endif
  17.945 +
  17.946 +/*
  17.947 + * Accumulate the page_state information across all CPUs.
  17.948 + * The result is unavoidably approximate - it can change
  17.949 + * during and after execution of this function.
  17.950 + */
  17.951 +DEFINE_PER_CPU(struct page_state, page_states) = {0};
  17.952 +EXPORT_PER_CPU_SYMBOL(page_states);
  17.953 +
  17.954 +atomic_t nr_pagecache = ATOMIC_INIT(0);
  17.955 +EXPORT_SYMBOL(nr_pagecache);
  17.956 +#ifdef CONFIG_SMP
  17.957 +DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
  17.958 +#endif
  17.959 +
  17.960 +void __get_page_state(struct page_state *ret, int nr)
  17.961 +{
  17.962 +	int cpu = 0;
  17.963 +
  17.964 +	memset(ret, 0, sizeof(*ret));
  17.965 +	while (cpu < NR_CPUS) {
  17.966 +		unsigned long *in, *out, off;
  17.967 +
  17.968 +		if (!cpu_possible(cpu)) {
  17.969 +			cpu++;
  17.970 +			continue;
  17.971 +		}
  17.972 +
  17.973 +		in = (unsigned long *)&per_cpu(page_states, cpu);
  17.974 +		cpu++;
  17.975 +		if (cpu < NR_CPUS && cpu_possible(cpu))
  17.976 +			prefetch(&per_cpu(page_states, cpu));
  17.977 +		out = (unsigned long *)ret;
  17.978 +		for (off = 0; off < nr; off++)
  17.979 +			*out++ += *in++;
  17.980 +	}
  17.981 +}
  17.982 +
  17.983 +void get_page_state(struct page_state *ret)
  17.984 +{
  17.985 +	int nr;
  17.986 +
  17.987 +	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
  17.988 +	nr /= sizeof(unsigned long);
  17.989 +
  17.990 +	__get_page_state(ret, nr + 1);
  17.991 +}
  17.992 +
  17.993 +void get_full_page_state(struct page_state *ret)
  17.994 +{
  17.995 +	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
  17.996 +}
  17.997 +
  17.998 +unsigned long __read_page_state(unsigned offset)
  17.999 +{
 17.1000 +	unsigned long ret = 0;
 17.1001 +	int cpu;
 17.1002 +
 17.1003 +	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 17.1004 +		unsigned long in;
 17.1005 +
 17.1006 +		if (!cpu_possible(cpu))
 17.1007 +			continue;
 17.1008 +
 17.1009 +		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
 17.1010 +		ret += *((unsigned long *)in);
 17.1011 +	}
 17.1012 +	return ret;
 17.1013 +}
 17.1014 +
 17.1015 +void get_zone_counts(unsigned long *active,
 17.1016 +		unsigned long *inactive, unsigned long *free)
 17.1017 +{
 17.1018 +	struct zone *zone;
 17.1019 +
 17.1020 +	*active = 0;
 17.1021 +	*inactive = 0;
 17.1022 +	*free = 0;
 17.1023 +	for_each_zone(zone) {
 17.1024 +		*active += zone->nr_active;
 17.1025 +		*inactive += zone->nr_inactive;
 17.1026 +		*free += zone->free_pages;
 17.1027 +	}
 17.1028 +}
 17.1029 +
 17.1030 +void si_meminfo(struct sysinfo *val)
 17.1031 +{
 17.1032 +	val->totalram = totalram_pages;
 17.1033 +	val->sharedram = 0;
 17.1034 +	val->freeram = nr_free_pages();
 17.1035 +	val->bufferram = nr_blockdev_pages();
 17.1036 +#ifdef CONFIG_HIGHMEM
 17.1037 +	val->totalhigh = totalhigh_pages;
 17.1038 +	val->freehigh = nr_free_highpages();
 17.1039 +#else
 17.1040 +	val->totalhigh = 0;
 17.1041 +	val->freehigh = 0;
 17.1042 +#endif
 17.1043 +	val->mem_unit = PAGE_SIZE;
 17.1044 +}
 17.1045 +
 17.1046 +EXPORT_SYMBOL(si_meminfo);
 17.1047 +
 17.1048 +#ifdef CONFIG_NUMA
 17.1049 +void si_meminfo_node(struct sysinfo *val, int nid)
 17.1050 +{
 17.1051 +	pg_data_t *pgdat = NODE_DATA(nid);
 17.1052 +
 17.1053 +	val->totalram = pgdat->node_present_pages;
 17.1054 +	val->freeram = nr_free_pages_pgdat(pgdat);
 17.1055 +	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 17.1056 +	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 17.1057 +	val->mem_unit = PAGE_SIZE;
 17.1058 +}
 17.1059 +#endif
 17.1060 +
 17.1061 +#define K(x) ((x) << (PAGE_SHIFT-10))
 17.1062 +
 17.1063 +/*
 17.1064 + * Show free area list (used inside shift_scroll-lock stuff)
 17.1065 + * We also calculate the percentage fragmentation. We do this by counting the
 17.1066 + * memory on each free list with the exception of the first item on the list.
 17.1067 + */
 17.1068 +void show_free_areas(void)
 17.1069 +{
 17.1070 +	struct page_state ps;
 17.1071 +	int cpu, temperature;
 17.1072 +	unsigned long active;
 17.1073 +	unsigned long inactive;
 17.1074 +	unsigned long free;
 17.1075 +	struct zone *zone;
 17.1076 +
 17.1077 +	for_each_zone(zone) {
 17.1078 +		show_node(zone);
 17.1079 +		printk("%s per-cpu:", zone->name);
 17.1080 +
 17.1081 +		if (!zone->present_pages) {
 17.1082 +			printk(" empty\n");
 17.1083 +			continue;
 17.1084 +		} else
 17.1085 +			printk("\n");
 17.1086 +
 17.1087 +		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
 17.1088 +			struct per_cpu_pageset *pageset;
 17.1089 +
 17.1090 +			if (!cpu_possible(cpu))
 17.1091 +				continue;
 17.1092 +
 17.1093 +			pageset = zone->pageset + cpu;
 17.1094 +
 17.1095 +			for (temperature = 0; temperature < 2; temperature++)
 17.1096 +				printk("cpu %d %s: low %d, high %d, batch %d\n",
 17.1097 +					cpu,
 17.1098 +					temperature ? "cold" : "hot",
 17.1099 +					pageset->pcp[temperature].low,
 17.1100 +					pageset->pcp[temperature].high,
 17.1101 +					pageset->pcp[temperature].batch);
 17.1102 +		}
 17.1103 +	}
 17.1104 +
 17.1105 +	get_page_state(&ps);
 17.1106 +	get_zone_counts(&active, &inactive, &free);
 17.1107 +
 17.1108 +	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
 17.1109 +		K(nr_free_pages()),
 17.1110 +		K(nr_free_highpages()));
 17.1111 +
 17.1112 +	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
 17.1113 +		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
 17.1114 +		active,
 17.1115 +		inactive,
 17.1116 +		ps.nr_dirty,
 17.1117 +		ps.nr_writeback,
 17.1118 +		ps.nr_unstable,
 17.1119 +		nr_free_pages(),
 17.1120 +		ps.nr_slab,
 17.1121 +		ps.nr_mapped,
 17.1122 +		ps.nr_page_table_pages);
 17.1123 +
 17.1124 +	for_each_zone(zone) {
 17.1125 +		int i;
 17.1126 +
 17.1127 +		show_node(zone);
 17.1128 +		printk("%s"
 17.1129 +			" free:%lukB"
 17.1130 +			" min:%lukB"
 17.1131 +			" low:%lukB"
 17.1132 +			" high:%lukB"
 17.1133 +			" active:%lukB"
 17.1134 +			" inactive:%lukB"
 17.1135 +			" present:%lukB"
 17.1136 +			"\n",
 17.1137 +			zone->name,
 17.1138 +			K(zone->free_pages),
 17.1139 +			K(zone->pages_min),
 17.1140 +			K(zone->pages_low),
 17.1141 +			K(zone->pages_high),
 17.1142 +			K(zone->nr_active),
 17.1143 +			K(zone->nr_inactive),
 17.1144 +			K(zone->present_pages)
 17.1145 +			);
 17.1146 +		printk("protections[]:");
 17.1147 +		for (i = 0; i < MAX_NR_ZONES; i++)
 17.1148 +			printk(" %lu", zone->protection[i]);
 17.1149 +		printk("\n");
 17.1150 +	}
 17.1151 +
 17.1152 +	for_each_zone(zone) {
 17.1153 +		struct list_head *elem;
 17.1154 + 		unsigned long nr, flags, order, total = 0;
 17.1155 +
 17.1156 +		show_node(zone);
 17.1157 +		printk("%s: ", zone->name);
 17.1158 +		if (!zone->present_pages) {
 17.1159 +			printk("empty\n");
 17.1160 +			continue;
 17.1161 +		}
 17.1162 +
 17.1163 +		spin_lock_irqsave(&zone->lock, flags);
 17.1164 +		for (order = 0; order < MAX_ORDER; order++) {
 17.1165 +			nr = 0;
 17.1166 +			list_for_each(elem, &zone->free_area[order].free_list)
 17.1167 +				++nr;
 17.1168 +			total += nr << order;
 17.1169 +			printk("%lu*%lukB ", nr, K(1UL) << order);
 17.1170 +		}
 17.1171 +		spin_unlock_irqrestore(&zone->lock, flags);
 17.1172 +		printk("= %lukB\n", K(total));
 17.1173 +	}
 17.1174 +
 17.1175 +	show_swap_cache_info();
 17.1176 +}
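The per-zone dump at the end of show_free_areas() totals the buddy free lists as "blocks at this order" shifted left by the order. A minimal standalone sketch of that accounting (the block counts are invented and this is plain userspace C, not the kernel's zone structures):

/* Standalone sketch of the buddy free-list accounting printed above:
 * total pages = sum over orders of (free blocks << order). Counts invented. */
#include <stdio.h>

#define MAX_ORDER 11
#define PAGE_KB   4UL	/* assume PAGE_SHIFT == 12, i.e. 4KB pages */

int main(void)
{
	unsigned long nr_free[MAX_ORDER] = { 5, 3, 2, 1, 0, 1, 0, 0, 0, 0, 1 };
	unsigned long order, total = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		total += nr_free[order] << order;
		printf("%lu*%lukB ", nr_free[order], PAGE_KB << order);
	}
	printf("= %lukB\n", total * PAGE_KB);	/* 1083 pages -> 4332kB here */
	return 0;
}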
 17.1177 +
 17.1178 +/*
 17.1179 + * Builds allocation fallback zone lists.
 17.1180 + */
 17.1181 +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
 17.1182 +{
 17.1183 +	switch (k) {
 17.1184 +		struct zone *zone;
 17.1185 +	default:
 17.1186 +		BUG();
 17.1187 +	case ZONE_HIGHMEM:
 17.1188 +		zone = pgdat->node_zones + ZONE_HIGHMEM;
 17.1189 +		if (zone->present_pages) {
 17.1190 +#ifndef CONFIG_HIGHMEM
 17.1191 +			BUG();
 17.1192 +#endif
 17.1193 +			zonelist->zones[j++] = zone;
 17.1194 +		}
 17.1195 +	case ZONE_NORMAL:
 17.1196 +		zone = pgdat->node_zones + ZONE_NORMAL;
 17.1197 +		if (zone->present_pages)
 17.1198 +			zonelist->zones[j++] = zone;
 17.1199 +	case ZONE_DMA:
 17.1200 +		zone = pgdat->node_zones + ZONE_DMA;
 17.1201 +		if (zone->present_pages)
 17.1202 +			zonelist->zones[j++] = zone;
 17.1203 +	}
 17.1204 +
 17.1205 +	return j;
 17.1206 +}
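The switch above relies on deliberate fall-through: asking for ZONE_HIGHMEM appends HighMem, then Normal, then DMA as fallbacks, so every zonelist ends with the most constrained zones. A toy userspace model of that ordering (zone names only, no kernel types, and without the BUG() checks):

/* Toy model of build_zonelists_node()'s fall-through ordering:
 * requesting zone k appends k and then every lower zone as a fallback. */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };
static const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

static int build_list(int k, const char **list, int j)
{
	switch (k) {
	default:
	case ZONE_HIGHMEM:
		list[j++] = names[ZONE_HIGHMEM];
		/* fall through */
	case ZONE_NORMAL:
		list[j++] = names[ZONE_NORMAL];
		/* fall through */
	case ZONE_DMA:
		list[j++] = names[ZONE_DMA];
	}
	return j;
}

int main(void)
{
	const char *list[MAX_NR_ZONES];
	int i, j = build_list(ZONE_HIGHMEM, list, 0);

	for (i = 0; i < j; i++)
		printf("%s ", list[i]);	/* prints: HighMem Normal DMA */
	printf("\n");
	return 0;
}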
 17.1207 +
 17.1208 +#ifdef CONFIG_NUMA
 17.1209 +#define MAX_NODE_LOAD (numnodes)
 17.1210 +static int __initdata node_load[MAX_NUMNODES];
 17.1211 +/**
 17.1212 + * find_next_best_node - find the next node that should appear in a given
 17.1213 + *    node's fallback list
 17.1214 + * @node: node whose fallback list we're appending
 17.1215 + * @used_node_mask: pointer to the bitmap of already used nodes
 17.1216 + *
 17.1217 + * We use a number of factors to determine which is the next node that should
 17.1218 + * appear on a given node's fallback list.  The node should not have appeared
 17.1219 + * already in @node's fallback list, and it should be the next closest node
 17.1220 + * according to the distance array (which contains arbitrary distance values
 17.1221 + * from each node to each node in the system), and should also prefer nodes
 17.1222 + * with no CPUs, since presumably they'll have very little allocation pressure
 17.1223 + * on them otherwise.
 17.1224 + * It returns -1 if no node is found.
 17.1225 + */
 17.1226 +static int __init find_next_best_node(int node, void *used_node_mask)
 17.1227 +{
 17.1228 +	int i, n, val;
 17.1229 +	int min_val = INT_MAX;
 17.1230 +	int best_node = -1;
 17.1231 +
 17.1232 +	for (i = 0; i < numnodes; i++) {
 17.1233 +		cpumask_t tmp;
 17.1234 +
 17.1235 +		/* Start from local node */
 17.1236 +		n = (node+i)%numnodes;
 17.1237 +
 17.1238 +		/* Don't want a node to appear more than once */
 17.1239 +		if (test_bit(n, used_node_mask))
 17.1240 +			continue;
 17.1241 +
 17.1242 +		/* Use the distance array to find the distance */
 17.1243 +		val = node_distance(node, n);
 17.1244 +
 17.1245 +		/* Give preference to headless and unused nodes */
 17.1246 +		tmp = node_to_cpumask(n);
 17.1247 +		if (!cpus_empty(tmp))
 17.1248 +			val += PENALTY_FOR_NODE_WITH_CPUS;
 17.1249 +
 17.1250 +		/* Slight preference for less loaded node */
 17.1251 +		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
 17.1252 +		val += node_load[n];
 17.1253 +
 17.1254 +		if (val < min_val) {
 17.1255 +			min_val = val;
 17.1256 +			best_node = n;
 17.1257 +		}
 17.1258 +	}
 17.1259 +
 17.1260 +	if (best_node >= 0)
 17.1261 +		set_bit(best_node, used_node_mask);
 17.1262 +
 17.1263 +	return best_node;
 17.1264 +}
 17.1265 +
 17.1266 +static void __init build_zonelists(pg_data_t *pgdat)
 17.1267 +{
 17.1268 +	int i, j, k, node, local_node;
 17.1269 +	int prev_node, load;
 17.1270 +	struct zonelist *zonelist;
 17.1271 +	DECLARE_BITMAP(used_mask, MAX_NUMNODES);
 17.1272 +
 17.1273 +	/* initialize zonelists */
 17.1274 +	for (i = 0; i < MAX_NR_ZONES; i++) {
 17.1275 +		zonelist = pgdat->node_zonelists + i;
 17.1276 +		memset(zonelist, 0, sizeof(*zonelist));
 17.1277 +		zonelist->zones[0] = NULL;
 17.1278 +	}
 17.1279 +
 17.1280 +	/* NUMA-aware ordering of nodes */
 17.1281 +	local_node = pgdat->node_id;
 17.1282 +	load = numnodes;
 17.1283 +	prev_node = local_node;
 17.1284 +	bitmap_zero(used_mask, MAX_NUMNODES);
 17.1285 +	while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
 17.1286 +		/*
 17.1287 +		 * We don't want to pressure a particular node.
 17.1288 +		 * So we add a penalty to the first node in the same
 17.1289 +		 * distance group, to make the ordering round-robin.
 17.1290 +		 */
 17.1291 +		if (node_distance(local_node, node) !=
 17.1292 +				node_distance(local_node, prev_node))
 17.1293 +			node_load[node] += load;
 17.1294 +		prev_node = node;
 17.1295 +		load--;
 17.1296 +		for (i = 0; i < MAX_NR_ZONES; i++) {
 17.1297 +			zonelist = pgdat->node_zonelists + i;
 17.1298 +			for (j = 0; zonelist->zones[j] != NULL; j++);
 17.1299 +
 17.1300 +			k = ZONE_NORMAL;
 17.1301 +			if (i & __GFP_HIGHMEM)
 17.1302 +				k = ZONE_HIGHMEM;
 17.1303 +			if (i & __GFP_DMA)
 17.1304 +				k = ZONE_DMA;
 17.1305 +
 17.1306 +	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 17.1307 +			zonelist->zones[j] = NULL;
 17.1308 +		}
 17.1309 +	}
 17.1310 +}
 17.1311 +
 17.1312 +#else	/* CONFIG_NUMA */
 17.1313 +
 17.1314 +static void __init build_zonelists(pg_data_t *pgdat)
 17.1315 +{
 17.1316 +	int i, j, k, node, local_node;
 17.1317 +
 17.1318 +	local_node = pgdat->node_id;
 17.1319 +	for (i = 0; i < MAX_NR_ZONES; i++) {
 17.1320 +		struct zonelist *zonelist;
 17.1321 +
 17.1322 +		zonelist = pgdat->node_zonelists + i;
 17.1323 +		memset(zonelist, 0, sizeof(*zonelist));
 17.1324 +
 17.1325 +		j = 0;
 17.1326 +		k = ZONE_NORMAL;
 17.1327 +		if (i & __GFP_HIGHMEM)
 17.1328 +			k = ZONE_HIGHMEM;
 17.1329 +		if (i & __GFP_DMA)
 17.1330 +			k = ZONE_DMA;
 17.1331 +
 17.1332 + 		j = build_zonelists_node(pgdat, zonelist, j, k);
 17.1333 + 		/*
 17.1334 + 		 * Now we build the zonelist so that it contains the zones
 17.1335 + 		 * of all the other nodes.
 17.1336 + 		 * We don't want to pressure a particular node, so when
 17.1337 + 		 * building the zones for node N, we make sure that the
 17.1338 + 		 * zones coming right after the local ones are those from
 17.1339 + 		 * node N+1 (modulo numnodes)
 17.1340 + 		 */
 17.1341 + 		for (node = local_node + 1; node < numnodes; node++)
 17.1342 + 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 17.1343 + 		for (node = 0; node < local_node; node++)
 17.1344 + 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 17.1345 + 
 17.1346 +		zonelist->zones[j] = NULL;
 17.1347 +	}
 17.1348 +}
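On non-NUMA builds the fallback node order for node N is therefore simply N, N+1, ..., numnodes-1, 0, ..., N-1. A trivial standalone illustration of that rotation (node count and local node are made up):

/* Fallback node order used by the non-NUMA build_zonelists():
 * the local node first, then the remaining nodes rotated past it. */
#include <stdio.h>

int main(void)
{
	int numnodes = 4, local_node = 2, node;	/* invented values */

	printf("%d ", local_node);
	for (node = local_node + 1; node < numnodes; node++)
		printf("%d ", node);
	for (node = 0; node < local_node; node++)
		printf("%d ", node);
	printf("\n");				/* prints: 2 3 0 1 */
	return 0;
}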
 17.1349 +
 17.1350 +#endif	/* CONFIG_NUMA */
 17.1351 +
 17.1352 +void __init build_all_zonelists(void)
 17.1353 +{
 17.1354 +	int i;
 17.1355 +
 17.1356 +	for(i = 0 ; i < numnodes ; i++)
 17.1357 +		build_zonelists(NODE_DATA(i));
 17.1358 +	printk("Built %i zonelists\n", numnodes);
 17.1359 +}
 17.1360 +
 17.1361 +/*
 17.1362 + * Helper functions to size the waitqueue hash table.
 17.1363 + * Essentially these want to choose hash table sizes sufficiently
 17.1364 + * large so that collisions trying to wait on pages are rare.
 17.1365 + * But in fact, the number of active page waitqueues on typical
 17.1366 + * systems is ridiculously low, less than 200. So this is still
 17.1367 + * conservative, even though it seems large.
 17.1368 + *
 17.1369 + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 17.1370 + * waitqueues, i.e. the size of the waitq table given the number of pages.
 17.1371 + */
 17.1372 +#define PAGES_PER_WAITQUEUE	256
 17.1373 +
 17.1374 +static inline unsigned long wait_table_size(unsigned long pages)
 17.1375 +{
 17.1376 +	unsigned long size = 1;
 17.1377 +
 17.1378 +	pages /= PAGES_PER_WAITQUEUE;
 17.1379 +
 17.1380 +	while (size < pages)
 17.1381 +		size <<= 1;
 17.1382 +
 17.1383 +	/*
 17.1384 +	 * Once we have dozens or even hundreds of threads sleeping
 17.1385 +	 * on IO we've got bigger problems than wait queue collision.
 17.1386 +	 * Limit the size of the wait table to a reasonable size.
 17.1387 +	 */
 17.1388 +	size = min(size, 4096UL);
 17.1389 +
 17.1390 +	return max(size, 4UL);
 17.1391 +}
 17.1392 +
 17.1393 +/*
 17.1394 + * This is an integer logarithm so that shifts can be used later
 17.1395 + * to extract the more random high bits from the multiplicative
 17.1396 + * hash function before the remainder is taken.
 17.1397 + */
 17.1398 +static inline unsigned long wait_table_bits(unsigned long size)
 17.1399 +{
 17.1400 +	return ffz(~size);
 17.1401 +}
 17.1402 +
 17.1403 +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 17.1404 +
 17.1405 +static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 17.1406 +		unsigned long *zones_size, unsigned long *zholes_size)
 17.1407 +{
 17.1408 +	unsigned long realtotalpages, totalpages = 0;
 17.1409 +	int i;
 17.1410 +
 17.1411 +	for (i = 0; i < MAX_NR_ZONES; i++)
 17.1412 +		totalpages += zones_size[i];
 17.1413 +	pgdat->node_spanned_pages = totalpages;
 17.1414 +
 17.1415 +	realtotalpages = totalpages;
 17.1416 +	if (zholes_size)
 17.1417 +		for (i = 0; i < MAX_NR_ZONES; i++)
 17.1418 +			realtotalpages -= zholes_size[i];
 17.1419 +	pgdat->node_present_pages = realtotalpages;
 17.1420 +	printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 17.1421 +}
 17.1422 +
 17.1423 +
 17.1424 +/*
 17.1425 + * Initially all pages are reserved - free ones are freed
 17.1426 + * up by free_all_bootmem() once the early boot process is
 17.1427 + * done. Non-atomic initialization, single-pass.
 17.1428 + */
 17.1429 +void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
 17.1430 +		unsigned long zone, unsigned long start_pfn)
 17.1431 +{
 17.1432 +	struct page *page;
 17.1433 +
 17.1434 +	for (page = start; page < (start + size); page++) {
 17.1435 +		set_page_zone(page, NODEZONE(nid, zone));
 17.1436 +		set_page_count(page, 0);
 17.1437 +		SetPageReserved(page);
 17.1438 +		INIT_LIST_HEAD(&page->lru);
 17.1439 +#ifdef WANT_PAGE_VIRTUAL
 17.1440 +		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
 17.1441 +		if (zone != ZONE_HIGHMEM)
 17.1442 +			set_page_address(page, __va(start_pfn << PAGE_SHIFT));
 17.1443 +#endif
 17.1444 +		start_pfn++;
 17.1445 +	}
 17.1446 +}
 17.1447 +
 17.1448 +#ifndef __HAVE_ARCH_MEMMAP_INIT
 17.1449 +#define memmap_init(start, size, nid, zone, start_pfn) \
 17.1450 +	memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
 17.1451 +#endif
 17.1452 +
 17.1453 +/*
 17.1454 + * Set up the zone data structures:
 17.1455 + *   - mark all pages reserved
 17.1456 + *   - mark all memory queues empty
 17.1457 + *   - clear the memory bitmaps
 17.1458 + */
 17.1459 +static void __init free_area_init_core(struct pglist_data *pgdat,
 17.1460 +		unsigned long *zones_size, unsigned long *zholes_size)
 17.1461 +{
 17.1462 +	unsigned long i, j;
 17.1463 +	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 17.1464 +	int cpu, nid = pgdat->node_id;
 17.1465 +	struct page *lmem_map = pgdat->node_mem_map;
 17.1466 +	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 17.1467 +
 17.1468 +	pgdat->nr_zones = 0;
 17.1469 +	init_waitqueue_head(&pgdat->kswapd_wait);
 17.1470 +	
 17.1471 +	for (j = 0; j < MAX_NR_ZONES; j++) {
 17.1472 +		struct zone *zone = pgdat->node_zones + j;
 17.1473 +		unsigned long size, realsize;
 17.1474 +		unsigned long batch;
 17.1475 +
 17.1476 +		zone_table[NODEZONE(nid, j)] = zone;
 17.1477 +		realsize = size = zones_size[j];
 17.1478 +		if (zholes_size)
 17.1479 +			realsize -= zholes_size[j];
 17.1480 +
 17.1481 +		zone->spanned_pages = size;
 17.1482 +		zone->present_pages = realsize;
 17.1483 +		zone->name = zone_names[j];
 17.1484 +		spin_lock_init(&zone->lock);
 17.1485 +		spin_lock_init(&zone->lru_lock);
 17.1486 +		zone->zone_pgdat = pgdat;
 17.1487 +		zone->free_pages = 0;
 17.1488 +
 17.1489 +		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 17.1490 +
 17.1491 +		/*
 17.1492 +		 * The per-cpu-pages pools are set to around 1000th of the
 17.1493 +		 * size of the zone.  But no more than 1/4 of a meg - there's
 17.1494 +		 * no point in going beyond the size of L2 cache.
 17.1495 +		 *
 17.1496 +		 * OK, so we don't know how big the cache is.  So guess.
 17.1497 +		 */
 17.1498 +		batch = zone->present_pages / 1024;
 17.1499 +		if (batch * PAGE_SIZE > 256 * 1024)
 17.1500 +			batch = (256 * 1024) / PAGE_SIZE;
 17.1501 +		batch /= 4;		/* We effectively *= 4 below */
 17.1502 +		if (batch < 1)
 17.1503 +			batch = 1;
 17.1504 +
 17.1505 +		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 17.1506 +			struct per_cpu_pages *pcp;
 17.1507 +
 17.1508 +			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
 17.1509 +			pcp->count = 0;
 17.1510 +			pcp->low = 2 * batch;
 17.1511 +			pcp->high = 6 * batch;
 17.1512 +			pcp->batch = 1 * batch;
 17.1513 +			INIT_LIST_HEAD(&pcp->list);
 17.1514 +
 17.1515 +			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
 17.1516 +			pcp->count = 0;
 17.1517 +			pcp->low = 0;
 17.1518 +			pcp->high = 2 * batch;
 17.1519 +			pcp->batch = 1 * batch;
 17.1520 +			INIT_LIST_HEAD(&pcp->list);
 17.1521 +		}
 17.1522 +		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
 17.1523 +				zone_names[j], realsize, batch);
 17.1524 +		INIT_LIST_HEAD(&zone->active_list);
 17.1525 +		INIT_LIST_HEAD(&zone->inactive_list);
 17.1526 +		atomic_set(&zone->nr_scan_active, 0);
 17.1527 +		atomic_set(&zone->nr_scan_inactive, 0);
 17.1528 +		zone->nr_active = 0;
 17.1529 +		zone->nr_inactive = 0;
 17.1530 +		if (!size)
 17.1531 +			continue;
 17.1532 +
 17.1533 +		/*
 17.1534 +		 * The per-page waitqueue mechanism uses hashed waitqueues
 17.1535 +		 * per zone.
 17.1536 +		 */
 17.1537 +		zone->wait_table_size = wait_table_size(size);
 17.1538 +		zone->wait_table_bits =
 17.1539 +			wait_table_bits(zone->wait_table_size);
 17.1540 +		zone->wait_table = (wait_queue_head_t *)
 17.1541 +			alloc_bootmem_node(pgdat, zone->wait_table_size
 17.1542 +						* sizeof(wait_queue_head_t));
 17.1543 +
 17.1544 +		for(i = 0; i < zone->wait_table_size; ++i)
 17.1545 +			init_waitqueue_head(zone->wait_table + i);
 17.1546 +
 17.1547 +		pgdat->nr_zones = j+1;
 17.1548 +
 17.1549 +		zone->zone_mem_map = lmem_map;
 17.1550 +		zone->zone_start_pfn = zone_start_pfn;
 17.1551 +
 17.1552 +		if ((zone_start_pfn) & (zone_required_alignment-1))
 17.1553 +			printk("BUG: wrong zone alignment, it will crash\n");
 17.1554 +
 17.1555 +		memmap_init(lmem_map, size, nid, j, zone_start_pfn);
 17.1556 +
 17.1557 +		zone_start_pfn += size;
 17.1558 +		lmem_map += size;
 17.1559 +
 17.1560 +		for (i = 0; ; i++) {
 17.1561 +			unsigned long bitmap_size;
 17.1562 +
 17.1563 +			INIT_LIST_HEAD(&zone->free_area[i].free_list);
 17.1564 +			if (i == MAX_ORDER-1) {
 17.1565 +				zone->free_area[i].map = NULL;
 17.1566 +				break;
 17.1567 +			}
 17.1568 +
 17.1569 +			/*
 17.1570 +			 * Page buddy system uses "index >> (i+1)",
 17.1571 +			 * where "index" is at most "size-1".
 17.1572 +			 *
 17.1573 +			 * The extra "+3" is to round down to byte
 17.1574 +			 * size (8 bits per byte assumption). Thus
 17.1575 +			 * we get "(size-1) >> (i+4)" as the last byte
 17.1576 +			 * we can access.
 17.1577 +			 *
 17.1578 +			 * The "+1" is because we want to round the
 17.1579 +			 * byte allocation up rather than down. So
 17.1580 +			 * we should have had a "+7" before we shifted
 17.1581 +			 * down by three. Also, we have to add one as
 17.1582 +			 * we actually _use_ the last bit (it's [0,n]
 17.1583 +			 * inclusive, not [0,n[).
 17.1584 +			 *
 17.1585 +			 * So we actually had +7+1 before we shift
 17.1586 +			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
 17.1587 +			 * (modulo overflows, which we do not have).
 17.1588 +			 *
 17.1589 +			 * Finally, we LONG_ALIGN because all bitmap
 17.1590 +			 * operations are on longs.
 17.1591 +			 */
 17.1592 +			bitmap_size = (size-1) >> (i+4);
 17.1593 +			bitmap_size = LONG_ALIGN(bitmap_size+1);
 17.1594 +			zone->free_area[i].map = 
 17.1595 +			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
 17.1596 +		}
 17.1597 +	}
 17.1598 +}
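Two of the calculations in free_area_init_core() are easy to sanity-check outside the kernel: the per-cpu pool batch (about 1/1024 of the zone, capped at 256KB worth of pages, then divided by four) and the per-order buddy bitmap size ((size-1) >> (order+4), plus one, long-aligned). A standalone worked example; the zone size and page size are assumptions:

/* Userspace check of the batch and bitmap sizing used above. */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

int main(void)
{
	unsigned long present_pages = 262144;	/* a 1GB zone with 4K pages (invented) */
	unsigned long batch, order, size = present_pages;

	batch = present_pages / 1024;		/* ~1/1024 of the zone */
	if (batch * PAGE_SIZE > 256 * 1024)
		batch = (256 * 1024) / PAGE_SIZE;	/* cap at 256KB worth of pages */
	batch /= 4;				/* the pcp fields multiply it back up */
	if (batch < 1)
		batch = 1;
	printf("batch=%lu -> hot pcp low=%lu high=%lu\n", batch, 2 * batch, 6 * batch);
	/* batch=16 -> hot pcp low=32 high=96 */

	for (order = 0; order < 3; order++) {	/* first few buddy bitmap sizes */
		unsigned long bitmap_size = LONG_ALIGN(((size - 1) >> (order + 4)) + 1);
		printf("order %lu: %lu bytes of bitmap\n", order, bitmap_size);
	}
	/* order 0: 16384, order 1: 8192, order 2: 4096 bytes */
	return 0;
}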
 17.1599 +
 17.1600 +void __init free_area_init_node(int nid, struct pglist_data *pgdat,
 17.1601 +		struct page *node_mem_map, unsigned long *zones_size,
 17.1602 +		unsigned long node_start_pfn, unsigned long *zholes_size)
 17.1603 +{
 17.1604 +	unsigned long size;
 17.1605 +
 17.1606 +	pgdat->node_id = nid;
 17.1607 +	pgdat->node_start_pfn = node_start_pfn;
 17.1608 +	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
 17.1609 +	if (!node_mem_map) {
 17.1610 +		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
 17.1611 +		node_mem_map = alloc_bootmem_node(pgdat, size);
 17.1612 +	}
 17.1613 +	pgdat->node_mem_map = node_mem_map;
 17.1614 +
 17.1615 +	free_area_init_core(pgdat, zones_size, zholes_size);
 17.1616 +}
 17.1617 +
 17.1618 +#ifndef CONFIG_DISCONTIGMEM
 17.1619 +static bootmem_data_t contig_bootmem_data;
 17.1620 +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 17.1621 +
 17.1622 +EXPORT_SYMBOL(contig_page_data);
 17.1623 +
 17.1624 +void __init free_area_init(unsigned long *zones_size)
 17.1625 +{
 17.1626 +	free_area_init_node(0, &contig_page_data, NULL, zones_size,
 17.1627 +			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 17.1628 +	mem_map = contig_page_data.node_mem_map;
 17.1629 +}
 17.1630 +#endif
 17.1631 +
 17.1632 +#ifdef CONFIG_PROC_FS
 17.1633 +
 17.1634 +#include <linux/seq_file.h>
 17.1635 +
 17.1636 +static void *frag_start(struct seq_file *m, loff_t *pos)
 17.1637 +{
 17.1638 +	pg_data_t *pgdat;
 17.1639 +	loff_t node = *pos;
 17.1640 +
 17.1641 +	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
 17.1642 +		--node;
 17.1643 +
 17.1644 +	return pgdat;
 17.1645 +}
 17.1646 +
 17.1647 +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
 17.1648 +{
 17.1649 +	pg_data_t *pgdat = (pg_data_t *)arg;
 17.1650 +
 17.1651 +	(*pos)++;
 17.1652 +	return pgdat->pgdat_next;
 17.1653 +}
 17.1654 +
 17.1655 +static void frag_stop(struct seq_file *m, void *arg)
 17.1656 +{
 17.1657 +}
 17.1658 +
 17.1659 +/* 
 17.1660 + * This walks the freelist for each zone. Whilst this is slow, I'd rather 
 17.1661 + * be slow here than slow down the fast path by keeping stats - mjbligh
 17.1662 + */
 17.1663 +static int frag_show(struct seq_file *m, void *arg)
 17.1664 +{
 17.1665 +	pg_data_t *pgdat = (pg_data_t *)arg;
 17.1666 +	struct zone *zone;
 17.1667 +	struct zone *node_zones = pgdat->node_zones;
 17.1668 +	unsigned long flags;
 17.1669 +	int order;
 17.1670 +
 17.1671 +	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
 17.1672 +		if (!zone->present_pages)
 17.1673 +			continue;
 17.1674 +
 17.1675 +		spin_lock_irqsave(&zone->lock, flags);
 17.1676 +		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
 17.1677 +		for (order = 0; order < MAX_ORDER; ++order) {
 17.1678 +			unsigned long nr_bufs = 0;
 17.1679 +			struct list_head *elem;
 17.1680 +
 17.1681 +			list_for_each(elem, &(zone->free_area[order].free_list))
 17.1682 +				++nr_bufs;
 17.1683 +			seq_printf(m, "%6lu ", nr_bufs);
 17.1684 +		}
 17.1685 +		spin_unlock_irqrestore(&zone->lock, flags);
 17.1686 +		seq_putc(m, '\n');
 17.1687 +	}
 17.1688 +	return 0;
 17.1689 +}
 17.1690 +
 17.1691 +struct seq_operations fragmentation_op = {
 17.1692 +	.start	= frag_start,
 17.1693 +	.next	= frag_next,
 17.1694 +	.stop	= frag_stop,
 17.1695 +	.show	= frag_show,
 17.1696 +};
 17.1697 +
 17.1698 +static char *vmstat_text[] = {
 17.1699 +	"nr_dirty",
 17.1700 +	"nr_writeback",
 17.1701 +	"nr_unstable",
 17.1702 +	"nr_page_table_pages",
 17.1703 +	"nr_mapped",
 17.1704 +	"nr_slab",
 17.1705 +
 17.1706 +	"pgpgin",
 17.1707 +	"pgpgout",
 17.1708 +	"pswpin",
 17.1709 +	"pswpout",
 17.1710 +	"pgalloc_high",
 17.1711 +
 17.1712 +	"pgalloc_normal",
 17.1713 +	"pgalloc_dma",
 17.1714 +	"pgfree",
 17.1715 +	"pgactivate",
 17.1716 +	"pgdeactivate",
 17.1717 +
 17.1718 +	"pgfault",
 17.1719 +	"pgmajfault",
 17.1720 +	"pgrefill_high",
 17.1721 +	"pgrefill_normal",
 17.1722 +	"pgrefill_dma",
 17.1723 +
 17.1724 +	"pgsteal_high",
 17.1725 +	"pgsteal_normal",
 17.1726 +	"pgsteal_dma",
 17.1727 +	"pgscan_kswapd_high",
 17.1728 +	"pgscan_kswapd_normal",
 17.1729 +
 17.1730 +	"pgscan_kswapd_dma",
 17.1731 +	"pgscan_direct_high",
 17.1732 +	"pgscan_direct_normal",
 17.1733 +	"pgscan_direct_dma",
 17.1734 +	"pginodesteal",
 17.1735 +
 17.1736 +	"slabs_scanned",
 17.1737 +	"kswapd_steal",
 17.1738 +	"kswapd_inodesteal",
 17.1739 +	"pageoutrun",
 17.1740 +	"allocstall",
 17.1741 +
 17.1742 +	"pgrotated",
 17.1743 +};
 17.1744 +
 17.1745 +static void *vmstat_start(struct seq_file *m, loff_t *pos)
 17.1746 +{
 17.1747 +	struct page_state *ps;
 17.1748 +
 17.1749 +	if (*pos >= ARRAY_SIZE(vmstat_text))
 17.1750 +		return NULL;
 17.1751 +
 17.1752 +	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
 17.1753 +	m->private = ps;
 17.1754 +	if (!ps)
 17.1755 +		return ERR_PTR(-ENOMEM);
 17.1756 +	get_full_page_state(ps);
 17.1757 +	ps->pgpgin /= 2;		/* sectors -> kbytes */
 17.1758 +	ps->pgpgout /= 2;
 17.1759 +	return (unsigned long *)ps + *pos;
 17.1760 +}
 17.1761 +
 17.1762 +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
 17.1763 +{
 17.1764 +	(*pos)++;
 17.1765 +	if (*pos >= ARRAY_SIZE(vmstat_text))
 17.1766 +		return NULL;
 17.1767 +	return (unsigned long *)m->private + *pos;
 17.1768 +}
 17.1769 +
 17.1770 +static int vmstat_show(struct seq_file *m, void *arg)
 17.1771 +{
 17.1772 +	unsigned long *l = arg;
 17.1773 +	unsigned long off = l - (unsigned long *)m->private;
 17.1774 +
 17.1775 +	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
 17.1776 +	return 0;
 17.1777 +}
 17.1778 +
 17.1779 +static void vmstat_stop(struct seq_file *m, void *arg)
 17.1780 +{
 17.1781 +	kfree(m->private);
 17.1782 +	m->private = NULL;
 17.1783 +}
 17.1784 +
 17.1785 +struct seq_operations vmstat_op = {
 17.1786 +	.start	= vmstat_start,
 17.1787 +	.next	= vmstat_next,
 17.1788 +	.stop	= vmstat_stop,
 17.1789 +	.show	= vmstat_show,
 17.1790 +};
 17.1791 +
 17.1792 +#endif /* CONFIG_PROC_FS */
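The /proc/vmstat code above works because struct page_state is laid out as consecutive unsigned longs, so (unsigned long *)ps + *pos walks its fields in the same order as the vmstat_text table. A tiny standalone illustration of that pairing, using a simplified stand-in struct rather than the kernel's:

/* Minimal model of the vmstat_start()/vmstat_show() trick: a struct made
 * entirely of unsigned longs is walked as an array, with a parallel name table. */
#include <stdio.h>
#include <stddef.h>

struct toy_state {			/* stand-in for struct page_state */
	unsigned long nr_dirty;
	unsigned long nr_writeback;
	unsigned long nr_mapped;
};

static const char *toy_text[] = { "nr_dirty", "nr_writeback", "nr_mapped" };

int main(void)
{
	struct toy_state st = { .nr_dirty = 12, .nr_writeback = 3, .nr_mapped = 900 };
	unsigned long *l = (unsigned long *)&st;
	size_t i;

	for (i = 0; i < sizeof(st) / sizeof(unsigned long); i++)
		printf("%s %lu\n", toy_text[i], l[i]);
	return 0;
}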
 17.1793 +
 17.1794 +#ifdef CONFIG_HOTPLUG_CPU
 17.1795 +static int page_alloc_cpu_notify(struct notifier_block *self,
 17.1796 +				 unsigned long action, void *hcpu)
 17.1797 +{
 17.1798 +	int cpu = (unsigned long)hcpu;
 17.1799 +	long *count;
 17.1800 +
 17.1801 +	if (action == CPU_DEAD) {
 17.1802 +		/* Drain local pagecache count. */
 17.1803 +		count = &per_cpu(nr_pagecache_local, cpu);
 17.1804 +		atomic_add(*count, &nr_pagecache);
 17.1805 +		*count = 0;
 17.1806 +		local_irq_disable();
 17.1807 +		__drain_pages(cpu);
 17.1808 +		local_irq_enable();
 17.1809 +	}
 17.1810 +	return NOTIFY_OK;
 17.1811 +}
 17.1812 +#endif /* CONFIG_HOTPLUG_CPU */
 17.1813 +
 17.1814 +void __init page_alloc_init(void)
 17.1815 +{
 17.1816 +	hotcpu_notifier(page_alloc_cpu_notify, 0);
 17.1817 +}
 17.1818 +
 17.1819 +static unsigned long higherzone_val(struct zone *z, int max_zone,
 17.1820 +					int alloc_type)
 17.1821 +{
 17.1822 +	int z_idx = zone_idx(z);
 17.1823 +	struct zone *higherzone;
 17.1824 +	unsigned long pages;
 17.1825 +
 17.1826 +	/* there is no higher zone to get a contribution from */
 17.1827 +	if (z_idx == MAX_NR_ZONES-1)
 17.1828 +		return 0;
 17.1829 +
 17.1830 +	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
 17.1831 +
 17.1832 +	/* We always start with the higher zone's protection value */
 17.1833 +	pages = higherzone->protection[alloc_type];
 17.1834 +
 17.1835 +	/*
 17.1836 +	 * We get a lower-zone-protection contribution only if there are
 17.1837 +	 * pages in the higher zone and if we're not the highest zone
 17.1838 +	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
 17.1839 +	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
 17.1840 +	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
 17.1841 +	 */
 17.1842 +	if (higherzone->present_pages && z_idx < alloc_type)
 17.1843 +		pages += higherzone->pages_low * sysctl_lower_zone_protection;
 17.1844 +
 17.1845 +	return pages;
 17.1846 +}
 17.1847 +
 17.1848 +/*
 17.1849 + * setup_per_zone_protection - called whenever min_free_kbytes or
 17.1850 + *	sysctl_lower_zone_protection changes.  Ensures that each zone
 17.1851 + *	has a correct pages_protected value, so an adequate number of
 17.1852 + *	pages are left in the zone after a successful __alloc_pages().
 17.1853 + *
 17.1854 + *	This algorithm is way confusing.  It tries to keep the same behavior
 17.1855 + *	as we had with the incremental min iterative algorithm.
 17.1856 + */
 17.1857 +static void setup_per_zone_protection(void)
 17.1858 +{
 17.1859 +	struct pglist_data *pgdat;
 17.1860 +	struct zone *zones, *zone;
 17.1861 +	int max_zone;
 17.1862 +	int i, j;
 17.1863 +
 17.1864 +	for_each_pgdat(pgdat) {
 17.1865 +		zones = pgdat->node_zones;
 17.1866 +
 17.1867 +		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
 17.1868 +			if (zones[i].present_pages)
 17.1869 +				max_zone = i;
 17.1870 +
 17.1871 +		/*
 17.1872 +		 * For each of the different allocation types:
 17.1873 +		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
 17.1874 +		 */
 17.1875 +		for (i = 0; i < MAX_NR_ZONES; i++) {
 17.1876 +			/*
 17.1877 +			 * For each of the zones:
 17.1878 +			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
 17.1879 +			 */
 17.1880 +			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
 17.1881 +				zone = &zones[j];
 17.1882 +
 17.1883 +				/*
 17.1884 +				 * We never protect zones that don't have memory
 17.1885 +				 * in them (j>max_zone) or zones that aren't in
 17.1886 +				 * the zonelists for a certain type of
 17.1887 +				 * allocation (j>i).  We have to assign these to
 17.1888 +				 * zero because the lower zones take
 17.1889 +				 * contributions from the higher zones.
 17.1890 +				 */
 17.1891 +				if (j > max_zone || j > i) {
 17.1892 +					zone->protection[i] = 0;
 17.1893 +					continue;
 17.1894 +				}
 17.1895 +				/*
 17.1896 +				 * The contribution of the next higher zone
 17.1897 +				 */
 17.1898 +				zone->protection[i] = higherzone_val(zone,
 17.1899 +								max_zone, i);
 17.1900 +				zone->protection[i] += zone->pages_low;
 17.1901 +			}
 17.1902 +		}
 17.1903 +	}
 17.1904 +}
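Because the inner loop walks zones from highest to lowest, each zone's protection[alloc_type] ends up as the next higher zone's protection plus (when that zone is populated and in this allocation's zonelist) its pages_low scaled by sysctl_lower_zone_protection, plus the zone's own pages_low. A standalone re-run of that accumulation for one node; the pages_low, present_pages and sysctl values are invented:

/* Toy re-run of setup_per_zone_protection() for a single three-zone node. */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

int main(void)
{
	const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	unsigned long pages_low[MAX_NR_ZONES] = { 32, 512, 128 };	/* invented */
	unsigned long present[MAX_NR_ZONES]  = { 4096, 225280, 32768 };	/* invented */
	unsigned long prot[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	unsigned long sysctl_lower_zone_protection = 1;
	int i, j, max_zone = ZONE_HIGHMEM;

	for (i = 0; i < MAX_NR_ZONES; i++) {		/* alloc type: DMA, KERNEL, HIGHMEM */
		for (j = MAX_NR_ZONES - 1; j >= 0; j--) {	/* zones, highest first */
			if (j > max_zone || j > i) {
				prot[j][i] = 0;		/* zone not protected for this type */
				continue;
			}
			if (j < MAX_NR_ZONES - 1) {
				/* start from the next higher zone's protection value */
				prot[j][i] = prot[j + 1][i];
				/* lower-zone-protection contribution, only when the higher
				 * zone has memory and is in this allocation's zonelist */
				if (present[j + 1] && j < i)
					prot[j][i] += pages_low[j + 1] *
						sysctl_lower_zone_protection;
			}
			prot[j][i] += pages_low[j];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++) {
		printf("%-7s protections[]:", names[j]);
		for (i = 0; i < MAX_NR_ZONES; i++)
			printf(" %lu", prot[j][i]);
		printf("\n");
	}
	/* DMA: 32 1056 1312,  Normal: 0 512 768,  HighMem: 0 0 128 */
	return 0;
}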
 17.1905 +
 17.1906 +/*
 17.1907 + * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
 17.1908 + *	that the pages_{min,low,high} values for each zone are set correctly 
 17.1909 + *	with respect to min_free_kbytes.
 17.1910 + */
 17.1911 +static void setup_per_zone_pages_min(void)
 17.1912 +{
 17.1913 +	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 17.1914 +	unsigned long lowmem_pages = 0;
 17.1915 +	struct zone *zone;
 17.1916 +	unsigned long flags;
 17.1917 +
 17.1918 +	/* Calculate total number of !ZONE_HIGHMEM pages */
 17.1919 +	for_each_zone(zone) {
 17.1920 +		if (!is_highmem(zone))
 17.1921 +			lowmem_pages += zone->present_pages;
 17.1922 +	}
 17.1923 +
 17.1924 +	for_each_zone(zone) {
 17.1925 +		spin_lock_irqsave(&zone->lru_lock, flags);
 17.1926 +		if (is_highmem(zone)) {
 17.1927 +			/*
 17.1928 +			 * Often, highmem doesn't need to reserve any pages.
 17.1929 +			 * But the pages_min/low/high values are also used for
 17.1930 +			 * batching up page reclaim activity so we need a
 17.1931 +			 * decent value here.
 17.1932 +			 */
 17.1933 +			int min_pages;
 17.1934 +
 17.1935 +			min_pages = zone->present_pages / 1024;
 17.1936 +			if (min_pages < SWAP_CLUSTER_MAX)
 17.1937 +				min_pages = SWAP_CLUSTER_MAX;
 17.1938 +			if (min_pages > 128)
 17.1939 +				min_pages = 128;
 17.1940 +			zone->pages_min = min_pages;
 17.1941 +		} else {
 17.1942 +			/* if it's a lowmem zone, reserve a number of pages 
 17.1943 +			 * proportionate to the zone's size.
 17.1944 +			 */
 17.1945 +			zone->pages_min = (pages_min * zone->present_pages) / 
 17.1946 +			                   lowmem_pages;
 17.1947 +		}
 17.1948 +
 17.1949 +		zone->pages_low = zone->pages_min * 2;
 17.1950 +		zone->pages_high = zone->pages_min * 3;
 17.1951 +		spin_unlock_irqrestore(&zone->lru_lock, flags);
 17.1952 +	}
 17.1953 +}
 17.1954 +
 17.1955 +/*
 17.1956 + * Initialise min_free_kbytes.
 17.1957 + *
 17.1958 + * For small machines we want it small (128k min).  For large machines
 17.1959 + * we want it large (16MB max).  But it is not linear, because network
 17.1960 + * bandwidth does not increase linearly with machine size.  We use
 17.1961 + *
 17.1962 + *	min_free_kbytes = sqrt(lowmem_kbytes)
 17.1963 + *
 17.1964 + * which yields
 17.1965 + *
 17.1966 + * 16MB:	128k
 17.1967 + * 32MB:	181k
 17.1968 + * 64MB:	256k
 17.1969 + * 128MB:	362k
 17.1970 + * 256MB:	512k
 17.1971 + * 512MB:	724k
 17.1972 + * 1024MB:	1024k
 17.1973 + * 2048MB:	1448k
 17.1974 + * 4096MB:	2048k
 17.1975 + * 8192MB:	2896k
 17.1976 + * 16384MB:	4096k
 17.1977 + */
 17.1978 +static int __init init_per_zone_pages_min(void)
 17.1979 +{
 17.1980 +	unsigned long lowmem_kbytes;
 17.1981 +
 17.1982 +	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
 17.1983 +
 17.1984 +	min_free_kbytes = int_sqrt(lowmem_kbytes);
 17.1985 +	if (min_free_kbytes < 128)
 17.1986 +		min_free_kbytes = 128;
 17.1987 +	if (min_free_kbytes > 16384)
 17.1988 +		min_free_kbytes = 16384;
 17.1989 +	setup_per_zone_pages_min();
 17.1990 +	setup_per_zone_protection();
 17.1991 +	return 0;
 17.1992 +}
 17.1993 +module_init(init_per_zone_pages_min)
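The table in the comment above is just the integer square root of lowmem_kbytes clamped to [128, 16384]; a quick userspace check of a few rows, using libm's sqrt() in place of the kernel's int_sqrt() (compile with -lm):

/* Reproduce a few rows of the min_free_kbytes table:
 * clamp(sqrt(lowmem_kbytes), 128, 16384). */
#include <math.h>
#include <stdio.h>

int main(void)
{
	unsigned long mb[] = { 16, 256, 1024, 4096, 16384 };
	int i;

	for (i = 0; i < 5; i++) {
		unsigned long lowmem_kbytes = mb[i] * 1024;
		unsigned long min_free = (unsigned long)sqrt((double)lowmem_kbytes);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 16384)
			min_free = 16384;
		printf("%5luMB: %luk\n", mb[i], min_free);
	}
	/* 16MB: 128k, 256MB: 512k, 1024MB: 1024k, 4096MB: 2048k, 16384MB: 4096k */
	return 0;
}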
 17.1994 +
 17.1995 +/*
 17.1996 + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
 17.1997 + *	that we can call two helper functions whenever min_free_kbytes
 17.1998 + *	changes.
 17.1999 + */
 17.2000 +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 17.2001 +		struct file *file, void __user *buffer, size_t *length)
 17.2002 +{
 17.2003 +	proc_dointvec(table, write, file, buffer, length);
 17.2004 +	setup_per_zone_pages_min();
 17.2005 +	setup_per_zone_protection();
 17.2006 +	return 0;
 17.2007 +}
 17.2008 +
 17.2009 +/*
 17.2010 + * lower_zone_protection_sysctl_handler - just a wrapper around
 17.2011 + *	proc_dointvec() so that we can call setup_per_zone_protection()
 17.2012 + *	whenever sysctl_lower_zone_protection changes.
 17.2013 + */
 17.2014 +int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
 17.2015 +		 struct file *file, void __user *buffer, size_t *length)
 17.2016 +{
 17.2017 +	proc_dointvec_minmax(table, write, file, buffer, length);
 17.2018 +	setup_per_zone_protection();
 17.2019 +	return 0;
 17.2020 +}