ia64/xen-unstable

changeset 1354:00059c1948cf

bitkeeper revision 1.891.1.5 (409ba2e8A6F60eP06BqyZUGapsn8XA)

Network interface for the new IO model is now complete.
author kaf24@scramble.cl.cam.ac.uk
date Fri May 07 14:53:28 2004 +0000 (2004-05-07)
parents 74d515393e65
children a2abb67d5518 088303b99385
files .rootkeys tools/examples/xc_dom_create.py tools/xenctl/lib/utils.py tools/xend/lib/domain_controller.h tools/xend/lib/main.py tools/xend/lib/manager.py tools/xend/lib/netif.py xen/common/dom_mem_ops.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xenolinux-2.4.26-sparse/arch/xen/config.in xenolinux-2.4.26-sparse/arch/xen/defconfig xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h xenolinux-2.4.26-sparse/include/asm-xen/io.h xenolinux-2.4.26-sparse/include/asm-xen/pci.h xenolinux-2.4.26-sparse/mkbuildtree xenolinux-2.4.26-sparse/mm/page_alloc.c
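
For orientation, here is a minimal Python 2 sketch (an editor's illustration, not part of the changeset) of how a management tool is expected to request the new per-domain network interface, mirroring the xc_dom_create.py and manager.py hunks below. It assumes the xenctl package from this tree is importable and that xend is running; dom_id is a placeholder.

    import xenctl.utils

    def create_vif(dom_id):
        # Ask xend to create the (handle-0) network interface for the domain,
        # exactly as xc_dom_create.py does in the hunk below.
        cmsg = 'new_network_interface(dom=' + str(dom_id) + ')'
        rsp = xenctl.utils.xend_control_message(cmsg)
        if not rsp['success']:
            print "Error creating network interface"
            print "Error type: " + rsp['error_type']
            if rsp['error_type'] == 'exception':
                print "Exception type: " + rsp['exception_type']
                print "Exception val:  " + rsp['exception_value']
            return False
        return True
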
line diff
     1.1 --- a/.rootkeys	Thu May 06 14:53:19 2004 +0000
     1.2 +++ b/.rootkeys	Fri May 07 14:53:28 2004 +0000
     1.3 @@ -107,6 +107,7 @@ 4055ad97wMLUj0BZT0e_T0EwQN0Bvw tools/xen
     1.4  4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h
     1.5  4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py
     1.6  4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py
     1.7 +409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py
     1.8  40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c
     1.9  4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py
    1.10  4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend
    1.11 @@ -735,6 +736,7 @@ 3e5a4e678ddsQOpbSiRdy1GRcDc9WA xenolinux
    1.12  3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h
    1.13  3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h
    1.14  3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h
    1.15 +409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h
    1.16  3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
    1.17  3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
    1.18  3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h
    1.19 @@ -762,6 +764,7 @@ 406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux
    1.20  3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c
    1.21  3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c
    1.22  3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c
    1.23 +409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c
    1.24  3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c
    1.25  3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c
    1.26  407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
     2.1 --- a/tools/examples/xc_dom_create.py	Thu May 06 14:53:19 2004 +0000
     2.2 +++ b/tools/examples/xc_dom_create.py	Fri May 07 14:53:28 2004 +0000
     2.3 @@ -333,7 +333,18 @@ def make_domain():
     2.4                  xc.domain_destroy ( dom=id )
     2.5                  sys.exit()
     2.6  
     2.7 -    if not new_io_world:
     2.8 +    if new_io_world:
     2.9 +        cmsg = 'new_network_interface(dom='+str(id)+')'
    2.10 +        xend_response = xenctl.utils.xend_control_message(cmsg)
    2.11 +        if not xend_response['success']:
    2.12 +            print "Error creating network interface"
    2.13 +            print "Error type: " + xend_response['error_type']
    2.14 +            if xend_response['error_type'] == 'exception':
    2.15 +                print "Exception type: " + xend_response['exception_type']
    2.16 +                print "Exception val:  " + xend_response['exception_value']
    2.17 +            xc.domain_destroy ( dom=id )
    2.18 +            sys.exit()
    2.19 +    else:
    2.20          # setup virtual firewall rules for all aliases
    2.21          for ip in vfr_ipaddr:
    2.22              xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
     3.1 --- a/tools/xenctl/lib/utils.py	Thu May 06 14:53:19 2004 +0000
     3.2 +++ b/tools/xenctl/lib/utils.py	Fri May 07 14:53:28 2004 +0000
     3.3 @@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'):
     3.4              return m.group(1)
     3.5      return None
     3.6  
     3.7 -def get_current_ipgw(dev='eth0'):
     3.8 -    """Return a string containing the IP gateway for the given
     3.9 -    network interface (default 'eth0').
    3.10 -    """
    3.11 +def get_current_ipgw():
    3.12 +    """Return a string containing the default IP gateway."""
    3.13      fd = os.popen( '/sbin/route -n' )
    3.14      lines = fd.readlines()
    3.15      for line in lines:
    3.16 -        m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
    3.17 -                       '\s+\S+\s+\S*G.*' + dev + '.*', line )
    3.18 +        m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
    3.19 +                       '\s+0.0.0.0+\s+\S*G.*', line )
    3.20          if m:
    3.21              return m.group(1)
    3.22      return None
     4.1 --- a/tools/xend/lib/domain_controller.h	Thu May 06 14:53:19 2004 +0000
     4.2 +++ b/tools/xend/lib/domain_controller.h	Fri May 07 14:53:28 2004 +0000
     4.3 @@ -468,7 +468,6 @@ typedef struct {
     4.4      unsigned int   evtchn;            /* Event channel for notifications.    */
     4.5      unsigned long  tx_shmem_frame;    /* Page cont. tx shared comms window.  */
     4.6      unsigned long  rx_shmem_frame;    /* Page cont. rx shared comms window.  */
     4.7 -    unsigned long  shmem_frame;       
     4.8      /* OUT */
     4.9      unsigned int   status;
    4.10  } netif_be_connect_t; 
     5.1 --- a/tools/xend/lib/main.py	Thu May 06 14:53:19 2004 +0000
     5.2 +++ b/tools/xend/lib/main.py	Fri May 07 14:53:28 2004 +0000
     5.3 @@ -5,7 +5,7 @@
     5.4  ###########################################################
     5.5  
     5.6  import errno, re, os, pwd, select, signal, socket, struct, sys, time
     5.7 -import xend.blkif, xend.console, xend.manager, xend.utils, Xc
     5.8 +import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc
     5.9  
    5.10  
    5.11  # The following parameters could be placed in a configuration file.
    5.12 @@ -19,6 +19,8 @@ UNIX_SOCK    = 'management_sock' # relat
    5.13  CMSG_CONSOLE  = 0
    5.14  CMSG_BLKIF_BE = 1
    5.15  CMSG_BLKIF_FE = 2
    5.16 +CMSG_NETIF_BE = 3
    5.17 +CMSG_NETIF_FE = 4
    5.18  
    5.19  
    5.20  def port_from_dom(dom):
    5.21 @@ -162,6 +164,10 @@ def daemon_loop():
    5.22              if xend.blkif.interface.list.has_key(idx):
    5.23                  blk_if = xend.blkif.interface.list[idx]
    5.24  
    5.25 +            net_if = False
    5.26 +            if xend.netif.interface.list.has_key(idx):
    5.27 +                net_if = xend.netif.interface.list[idx]
    5.28 +
    5.29              # If we pick up a disconnect notification then we do any necessary
    5.30              # cleanup.
    5.31              if type == notifier.EXCEPTION:
    5.32 @@ -175,6 +181,9 @@ def daemon_loop():
    5.33                      if blk_if:
    5.34                          blk_if.destroy()
    5.35                          del blk_if
    5.36 +                    if net_if:
    5.37 +                        net_if.destroy()
    5.38 +                        del net_if
    5.39                      continue
    5.40  
    5.41              # Process incoming requests.
    5.42 @@ -188,6 +197,10 @@ def daemon_loop():
    5.43                      blk_if.ctrlif_rx_req(port, msg)
    5.44                  elif type == CMSG_BLKIF_BE and port == dom0_port:
    5.45                      xend.blkif.backend_rx_req(port, msg)
    5.46 +                elif type == CMSG_NETIF_FE and net_if:
    5.47 +                    net_if.ctrlif_rx_req(port, msg)
    5.48 +                elif type == CMSG_NETIF_BE and port == dom0_port:
    5.49 +                    xend.netif.backend_rx_req(port, msg)
    5.50                  else:
    5.51                      port.write_response(msg)
    5.52  
    5.53 @@ -198,6 +211,8 @@ def daemon_loop():
    5.54                  type = (msg.get_header())['type']
    5.55                  if type == CMSG_BLKIF_BE and port == dom0_port:
    5.56                      xend.blkif.backend_rx_rsp(port, msg)
    5.57 +                elif type == CMSG_NETIF_BE and port == dom0_port:
    5.58 +                    xend.netif.backend_rx_rsp(port, msg)
    5.59  
    5.60              # Send console data.
    5.61              if con_if and con_if.ctrlif_transmit_work(port):
    5.62 @@ -207,10 +222,18 @@ def daemon_loop():
    5.63              if blk_if and blk_if.ctrlif_transmit_work(port):
    5.64                  work_done = True
    5.65  
    5.66 +            # Send netif messages.
    5.67 +            if net_if and net_if.ctrlif_transmit_work(port):
    5.68 +                work_done = True
    5.69 +
    5.70              # Back-end block-device work.
    5.71              if port == dom0_port and xend.blkif.backend_do_work(port):
    5.72                  work_done = True
    5.73                  
    5.74 +            # Back-end network-device work.
    5.75 +            if port == dom0_port and xend.netif.backend_do_work(port):
    5.76 +                work_done = True
    5.77 +                
    5.78              # Finally, notify the remote end of any work that we did.
    5.79              if work_done:
    5.80                  port.notify()
     6.1 --- a/tools/xend/lib/manager.py	Thu May 06 14:53:19 2004 +0000
     6.2 +++ b/tools/xend/lib/manager.py	Fri May 07 14:53:28 2004 +0000
     6.3 @@ -4,7 +4,7 @@
     6.4  ## Copyright (c) 2004, K A Fraser (University of Cambridge)
     6.5  #############################################################
     6.6  
     6.7 -import xend.blkif, xend.console, xend.main, xend.utils
     6.8 +import xend.blkif, xend.netif, xend.console, xend.main, xend.utils
     6.9  
    6.10  
    6.11  ##
    6.12 @@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, 
    6.13  
    6.14      # Response is deferred until back-end driver sends acknowledgement.
    6.15      return None
    6.16 +
    6.17 +
    6.18 +##
    6.19 +## new_network_interface:
    6.20 +##  Create a new network interface for the specified domain @dom.
    6.21 +##
    6.22 +def new_network_interface(dom, handle=-1):
    6.23 +    # By default we create an interface with handle zero.
    6.24 +    if handle < 0:
    6.25 +        handle = 0
    6.26 +
    6.27 +    # We only support one interface per domain, which must have handle zero.
    6.28 +    if handle != 0:
    6.29 +        response = { 'success': False }
    6.30 +        response['error_type'] = 'Bad handle %d (only handle 0 ' + \
    6.31 +                                 'is supported)' % handle
    6.32 +        return response
    6.33 +
    6.34 +    # Find local event-channel port associated with the specified domain.
    6.35 +    port = xend.main.port_from_dom(dom)
    6.36 +    if not port:
    6.37 +        response = { 'success': False }
    6.38 +        response['error_type'] = 'Unknown domain %d' % dom
    6.39 +        return response
    6.40 +
    6.41 +    # The interface must not already exist.
    6.42 +    if xend.netif.interface.list.has_key(port.local_port):
    6.43 +        response = { 'success': False }
    6.44 +        response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \
    6.45 +                                 'exists' % (dom, handle)
    6.46 +        return response
    6.47 +
    6.48 +    # Create the new interface. Initially no virtual devices are attached.
    6.49 +    xend.netif.interface(dom, port.local_port)
    6.50 +
    6.51 +    # Response is deferred until back-end driver sends acknowledgement.
    6.52 +    return None
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/tools/xend/lib/netif.py	Fri May 07 14:53:28 2004 +0000
     7.3 @@ -0,0 +1,144 @@
     7.4 +
     7.5 +###################################################################
     7.6 +## xend/netif.py -- Network-interface management functions for Xend
     7.7 +## Copyright (c) 2004, K A Fraser (University of Cambridge)
     7.8 +###################################################################
     7.9 +
    7.10 +import errno, random, re, os, select, signal, socket, struct, sys
    7.11 +import xend.main, xend.console, xend.manager, xend.utils, Xc
    7.12 +
    7.13 +CMSG_NETIF_BE = 3
    7.14 +CMSG_NETIF_FE = 4
    7.15 +CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED =  0
    7.16 +CMSG_NETIF_FE_DRIVER_STATUS_CHANGED    = 32
    7.17 +CMSG_NETIF_FE_INTERFACE_CONNECT        = 33
    7.18 +CMSG_NETIF_FE_INTERFACE_DISCONNECT     = 34
    7.19 +CMSG_NETIF_BE_CREATE      = 0
    7.20 +CMSG_NETIF_BE_DESTROY     = 1
    7.21 +CMSG_NETIF_BE_CONNECT     = 2
    7.22 +CMSG_NETIF_BE_DISCONNECT  = 3
    7.23 +
    7.24 +pendmsg = None
    7.25 +pendaddr = None
    7.26 +
    7.27 +def backend_tx_req(msg):
    7.28 +    port = xend.main.dom0_port
    7.29 +    if port.space_to_write_request():
    7.30 +        port.write_request(msg)
    7.31 +        port.notify()
    7.32 +    else:
    7.33 +        xend.netif.pendmsg = msg
    7.34 +
    7.35 +def backend_rx_req(port, msg):
    7.36 +    port.write_response(msg)
    7.37 +
    7.38 +def backend_rx_rsp(port, msg):
    7.39 +    subtype = (msg.get_header())['subtype']
    7.40 +    print "Received netif-be response, subtype %d" % subtype
    7.41 +    if subtype == CMSG_NETIF_BE_CREATE:
    7.42 +        rsp = { 'success': True }
    7.43 +        xend.main.send_management_response(rsp, xend.netif.pendaddr)
    7.44 +    elif subtype == CMSG_NETIF_BE_CONNECT:
    7.45 +        (dom,hnd,evtchn,tx_frame,rx_frame,st) = \
    7.46 +           struct.unpack("QIILLI", msg.get_payload())
    7.47 +        netif = interface.list[xend.main.port_from_dom(dom).local_port]
    7.48 +        msg = xend.utils.message(CMSG_NETIF_FE, \
    7.49 +                                 CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
    7.50 +        msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \
    7.51 +                                       netif.evtchn['port2'], \
    7.52 +                                       netif.mac[0],netif.mac[1], \
    7.53 +                                       netif.mac[2],netif.mac[3], \
    7.54 +                                       netif.mac[4],netif.mac[5], \
    7.55 +                                       0,0))
    7.56 +        netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg)
    7.57 +
    7.58 +def backend_do_work(port):
    7.59 +    global pendmsg
    7.60 +    if pendmsg and port.space_to_write_request():
    7.61 +        port.write_request(pendmsg)
    7.62 +        pendmsg = None
    7.63 +        return True
    7.64 +    return False
    7.65 +
    7.66 +
    7.67 +class interface:
    7.68 +
    7.69 +    # Dictionary of all network-device interfaces.
    7.70 +    list = {}
    7.71 +
    7.72 +
    7.73 +    # NB. 'key' is an opaque value that has no meaning in this class.
    7.74 +    def __init__(self, dom, key):
    7.75 +        self.dom     = dom
    7.76 +        self.key     = key
    7.77 +        self.pendmsg = None
    7.78 +
    7.79 +        # VIFs get a random MAC address with a "special" vendor id.
    7.80 +        # 
    7.81 +        # NB. The vendor is currently an "obsolete" one that used to belong
    7.82 +        # to DEC (AA-00-00). Using it is probably a bit rude :-)
    7.83 +        # 
    7.84 +        # NB2. The first bit of the first random octet is set to zero for
    7.85 +        # all dynamic MAC addresses. This may allow us to manually specify
    7.86 +        # MAC addresses for some VIFs with no fear of clashes.
    7.87 +        self.mac = [ 0xaa, 0x00, 0x00 ]
    7.88 +        self.mac.append(int(random.random()*128))
    7.89 +        self.mac.append(int(random.random()*256))
    7.90 +        self.mac.append(int(random.random()*256))
    7.91 +                
    7.92 +        interface.list[key] = self
    7.93 +        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0)
    7.94 +        msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \
    7.95 +                                       self.mac[0],self.mac[1], \
    7.96 +                                       self.mac[2],self.mac[3], \
    7.97 +                                       self.mac[4],self.mac[5], \
    7.98 +                                       0,0,0))
    7.99 +        xend.netif.pendaddr = xend.main.mgmt_req_addr
   7.100 +        backend_tx_req(msg)
   7.101 +
   7.102 +
   7.103 +    # Completely destroy this interface.
   7.104 +    def destroy(self):
   7.105 +        del interface.list[self.key]
   7.106 +        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0)
   7.107 +        msg.append_payload(struct.pack("QII",self.dom,0,0))
   7.108 +        backend_tx_req(msg)        
   7.109 +
   7.110 +
   7.111 +    # The parameter @port is the control-interface event channel. This method
   7.112 +    # returns True if messages were written to the control interface.
   7.113 +    def ctrlif_transmit_work(self, port):
   7.114 +        if self.pendmsg and port.space_to_write_request():
   7.115 +            port.write_request(self.pendmsg)
   7.116 +            self.pendmsg = None
   7.117 +            return True
   7.118 +        return False
   7.119 +
   7.120 +    def ctrlif_tx_req(self, port, msg):
   7.121 +        if port.space_to_write_request():
   7.122 +            port.write_request(msg)
   7.123 +            port.notify()
   7.124 +        else:
   7.125 +            self.pendmsg = msg
   7.126 +
   7.127 +    def ctrlif_rx_req(self, port, msg):
   7.128 +        port.write_response(msg)
   7.129 +        subtype = (msg.get_header())['subtype']
   7.130 +        if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
   7.131 +            msg = xend.utils.message(CMSG_NETIF_FE, \
   7.132 +                                     CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
   7.133 +            msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \
   7.134 +                                           self.mac[1],self.mac[2], \
   7.135 +                                           self.mac[3],self.mac[4], \
   7.136 +                                           self.mac[5],0,0))
   7.137 +            self.ctrlif_tx_req(port, msg)
   7.138 +        elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT:
   7.139 +            (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload())
   7.140 +            xc = Xc.new()
   7.141 +            self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom)
   7.142 +            msg = xend.utils.message(CMSG_NETIF_BE, \
   7.143 +                                     CMSG_NETIF_BE_CONNECT, 0)
   7.144 +            msg.append_payload(struct.pack("QIILLI",self.dom,0, \
   7.145 +                                           self.evtchn['port1'],tx_frame, \
   7.146 +                                           rx_frame,0))
   7.147 +            backend_tx_req(msg)
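
As a standalone illustration of the wire format the new netif.py above puts on the control interface: the CMSG_NETIF_BE_CREATE payload is built with struct.pack("QIBBBBBBBBI", dom, handle, mac[0..5], 0, 0, status), using a random MAC under the AA-00-00 prefix. The following Python 2 sketch (an editor's illustration, not part of the changeset) reproduces just that packing.

    import random, struct

    CMSG_NETIF_BE        = 3
    CMSG_NETIF_BE_CREATE = 0

    def build_create_payload(dom, handle=0):
        # Same MAC scheme as interface.__init__: AA-00-00 prefix, top bit of
        # the first random octet clear so manually assigned MACs cannot clash.
        mac = [ 0xaa, 0x00, 0x00,
                int(random.random()*128),
                int(random.random()*256),
                int(random.random()*256) ]
        return struct.pack("QIBBBBBBBBI", dom, handle,
                           mac[0], mac[1], mac[2], mac[3], mac[4], mac[5],
                           0, 0, 0)

    # The backend's connect acknowledgement is unpacked the same way:
    #   (dom, hnd, evtchn, tx_frame, rx_frame, status) = \
    #       struct.unpack("QIILLI", msg.get_payload())
    payload = build_create_payload(5)
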
     8.1 --- a/xen/common/dom_mem_ops.c	Thu May 06 14:53:19 2004 +0000
     8.2 +++ b/xen/common/dom_mem_ops.c	Fri May 07 14:53:28 2004 +0000
     8.3 @@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_st
     8.4      {
     8.5          /* Leave some slack pages; e.g., for the network. */
     8.6          if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
     8.7 -                                   (PAGE_SHIFT-10))) ) 
     8.8 +                                   (PAGE_SHIFT-10))) )
     8.9 +        {
    8.10 +            DPRINTK("Not enough slack: %u %u\n",
    8.11 +                    free_pfns,
    8.12 +                    SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10));
    8.13              break;
    8.14 +        }
    8.15  
    8.16          /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
    8.17          if ( unlikely((page = alloc_domain_page(p)) == NULL) )
    8.18 +        {
    8.19 +            DPRINTK("Could not allocate a frame\n");
    8.20              break;
    8.21 -        
    8.22 +        }
    8.23 +
    8.24          /* Inform the domain of the new page's machine address. */ 
    8.25          mpfn = (unsigned long)(page - frame_table);
    8.26          copy_to_user(op.pages, &mpfn, sizeof(mpfn));
     9.1 --- a/xen/common/domain.c	Thu May 06 14:53:19 2004 +0000
     9.2 +++ b/xen/common/domain.c	Fri May 07 14:53:28 2004 +0000
     9.3 @@ -334,6 +334,8 @@ struct pfn_info *alloc_domain_page(struc
     9.4          spin_lock(&p->page_list_lock);
     9.5          if ( unlikely(p->tot_pages >= p->max_pages) )
     9.6          {
     9.7 +            DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
     9.8 +                    p->domain, p->tot_pages, p->max_pages);
     9.9              spin_unlock(&p->page_list_lock);
    9.10              goto free_and_exit;
    9.11          }
    9.12 @@ -884,7 +886,7 @@ int construct_dom0(struct task_struct *p
    9.13          page->type_and_flags  = 0;
    9.14          page->count_and_flags = PGC_allocated | 1;
    9.15          list_add_tail(&page->list, &p->page_list);
    9.16 -        p->tot_pages++;
    9.17 +        p->tot_pages++; p->max_pages++;
    9.18      }
    9.19  
    9.20      mpt_alloc = (vpt_start - v_start) + alloc_start;
    10.1 --- a/xen/common/kernel.c	Thu May 06 14:53:19 2004 +0000
    10.2 +++ b/xen/common/kernel.c	Fri May 07 14:53:28 2004 +0000
    10.3 @@ -105,7 +105,6 @@ static struct {
    10.4  void cmain(unsigned long magic, multiboot_info_t *mbi)
    10.5  {
    10.6      struct task_struct *new_dom;
    10.7 -    dom0_createdomain_t dom0_params;
    10.8      unsigned long max_page;
    10.9      unsigned char *cmdline;
   10.10      module_t *mod = (module_t *)__va(mbi->mods_addr);
   10.11 @@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboo
   10.12      task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
   10.13  
   10.14      /* Create initial domain 0. */
   10.15 -    dom0_params.memory_kb = opt_dom0_mem;
   10.16      new_dom = do_createdomain(0, 0);
   10.17      if ( new_dom == NULL )
   10.18          panic("Error creating domain 0\n");
    11.1 --- a/xen/common/memory.c	Thu May 06 14:53:19 2004 +0000
    11.2 +++ b/xen/common/memory.c	Fri May 07 14:53:28 2004 +0000
    11.3 @@ -940,17 +940,25 @@ static int do_extended_command(unsigned 
    11.4          }
    11.5          break;
    11.6  
    11.7 +        /* XXX This function is racey! */
    11.8      case MMUEXT_REASSIGN_PAGE:
    11.9 -        if ( !IS_PRIV(current) )
   11.10 +        if ( unlikely(!IS_PRIV(current)) )
   11.11          {
   11.12              MEM_LOG("Dom %llu has no privilege to reassign page ownership",
   11.13                      current->domain);
   11.14              okay = 0;
   11.15          }
   11.16 -        else if ( percpu_info[cpu].gps != NULL )
   11.17 +        else if ( likely(percpu_info[cpu].gps != NULL) )
   11.18          {
   11.19 +            current->tot_pages--;
   11.20 +            percpu_info[cpu].gps->tot_pages++;
   11.21              page->u.domain = percpu_info[cpu].gps;
   11.22          }
   11.23 +        else
   11.24 +        {
   11.25 +            MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn);
   11.26 +            okay = 0;
   11.27 +        }
   11.28          break;
   11.29  
   11.30      case MMUEXT_RESET_SUBJECTDOM:
    12.1 --- a/xenolinux-2.4.26-sparse/arch/xen/config.in	Thu May 06 14:53:19 2004 +0000
    12.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/config.in	Fri May 07 14:53:28 2004 +0000
    12.3 @@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
    12.4     bool 'HIGHMEM I/O support' CONFIG_HIGHIO
    12.5  fi
    12.6  
    12.7 +define_int CONFIG_FORCE_MAX_ZONEORDER 12
    12.8 +
    12.9  #bool 'Symmetric multi-processing support' CONFIG_SMP
   12.10  #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
   12.11  #   define_bool CONFIG_HAVE_DEC_LOCK y
    13.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig	Thu May 06 14:53:19 2004 +0000
    13.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig	Fri May 07 14:53:28 2004 +0000
    13.3 @@ -50,6 +50,7 @@ CONFIG_X86_TSC=y
    13.4  CONFIG_X86_L1_CACHE_SHIFT=5
    13.5  CONFIG_NOHIGHMEM=y
    13.6  # CONFIG_HIGHMEM4G is not set
    13.7 +CONFIG_FORCE_MAX_ZONEORDER=12
    13.8  
    13.9  #
   13.10  # General setup
   13.11 @@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
   13.12  # Network testing
   13.13  #
   13.14  # CONFIG_NET_PKTGEN is not set
   13.15 +CONFIG_NETDEVICES=y
   13.16  
   13.17  #
   13.18  # Block devices
    14.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev	Thu May 06 14:53:19 2004 +0000
    14.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev	Fri May 07 14:53:28 2004 +0000
    14.3 @@ -51,6 +51,7 @@ CONFIG_X86_TSC=y
    14.4  CONFIG_X86_L1_CACHE_SHIFT=5
    14.5  CONFIG_NOHIGHMEM=y
    14.6  # CONFIG_HIGHMEM4G is not set
    14.7 +CONFIG_FORCE_MAX_ZONEORDER=12
    14.8  
    14.9  #
   14.10  # General setup
    15.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h	Thu May 06 14:53:19 2004 +0000
    15.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h	Fri May 07 14:53:28 2004 +0000
    15.3 @@ -10,6 +10,7 @@
    15.4  #include <linux/rbtree.h>
    15.5  #include <linux/interrupt.h>
    15.6  #include <linux/slab.h>
    15.7 +#include <linux/blkdev.h>
    15.8  #include <asm/ctrl_if.h>
    15.9  #include <asm/io.h>
   15.10  #include "../blkif.h"
    16.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c	Thu May 06 14:53:19 2004 +0000
    16.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c	Fri May 07 14:53:28 2004 +0000
    16.3 @@ -74,7 +74,8 @@ void blkif_ctrlif_init(void)
    16.4      ctrl_msg_t                       cmsg;
    16.5      blkif_be_driver_status_changed_t st;
    16.6  
    16.7 -    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
    16.8 +    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
    16.9 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
   16.10  
   16.11      /* Send a driver-UP notification to the domain controller. */
   16.12      cmsg.type      = CMSG_BLKIF_BE;
    17.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c	Thu May 06 14:53:19 2004 +0000
    17.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c	Fri May 07 14:53:28 2004 +0000
    17.3 @@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *cre
    17.4      unsigned int  handle = create->blkif_handle;
    17.5      blkif_t     **pblkif, *blkif;
    17.6  
    17.7 -    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL )
    17.8 +    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
    17.9      {
   17.10          DPRINTK("Could not create blkif: out of memory\n");
   17.11          create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    18.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c	Thu May 06 14:53:19 2004 +0000
    18.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c	Fri May 07 14:53:28 2004 +0000
    18.3 @@ -24,17 +24,15 @@
    18.4  #define MAX_PENDING_REQS 64
    18.5  #define BATCH_PER_DOMAIN 16
    18.6  
    18.7 -static struct vm_struct *mmap_vma;
    18.8 -#define MMAP_PAGES_PER_SEGMENT \
    18.9 -    ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1)
   18.10 +static unsigned long mmap_vstart;
   18.11  #define MMAP_PAGES_PER_REQUEST \
   18.12 -    (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
   18.13 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
   18.14  #define MMAP_PAGES             \
   18.15      (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
   18.16 -#define MMAP_VADDR(_req,_seg)            \
   18.17 -    ((unsigned long)mmap_vma->addr +     \
   18.18 +#define MMAP_VADDR(_req,_seg)                        \
   18.19 +    (mmap_vstart +                                   \
   18.20       ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
   18.21 -     ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE))
   18.22 +     ((_seg) * PAGE_SIZE))
   18.23  
   18.24  /*
   18.25   * Each outstanding request that we've passed to the lower device layers has a 
   18.26 @@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blki
   18.27      prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
   18.28      for ( i = 0; i < req->nr_segments; i++ )
   18.29      {
   18.30 -        if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) )
   18.31 +        /* Make sure the buffer is page-sized. */
   18.32 +        if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) ||
   18.33 +             (blkif_last_sect(req->frame_and_sects[i]) != 7) )
   18.34              goto bad_descriptor;
   18.35          rc = direct_remap_area_pages(&init_mm, 
   18.36                                       MMAP_VADDR(pending_idx, i),
   18.37 -                                     req->buffer_and_sects[i] & PAGE_MASK, 
   18.38 +                                     req->frame_and_sects[i] & PAGE_MASK, 
   18.39                                       PAGE_SIZE, prot, blkif->domid);
   18.40          if ( rc != 0 )
   18.41              goto bad_descriptor;
   18.42 @@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t
   18.43      extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
   18.44      struct buffer_head *bh;
   18.45      int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
   18.46 -    unsigned short nr_sects;
   18.47 -    unsigned long buffer;
   18.48 +    short nr_sects;
   18.49 +    unsigned long buffer, fas;
   18.50      int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
   18.51      pending_req_t *pending_req;
   18.52      pgprot_t       prot;
   18.53  
   18.54      /* We map virtual scatter/gather segments to physical segments. */
   18.55      int new_segs, nr_psegs = 0;
   18.56 -    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
   18.57 +    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
   18.58  
   18.59      /* Check that number of segments is sane. */
   18.60      if ( unlikely(req->nr_segments == 0) || 
   18.61 @@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t
   18.62       */
   18.63      for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
   18.64      {
   18.65 -        buffer   = req->buffer_and_sects[i] & ~0x1FF;
   18.66 -        nr_sects = req->buffer_and_sects[i] &  0x1FF;
   18.67 -
   18.68 -        if ( unlikely(nr_sects == 0) )
   18.69 -            continue;
   18.70 +        fas      = req->frame_and_sects[i];
   18.71 +        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
   18.72 +        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
   18.73  
   18.74 -        if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) )
   18.75 -        {
   18.76 -            DPRINTK("Too many sectors in segment\n");
   18.77 +        if ( nr_sects <= 0 )
   18.78              goto bad_descriptor;
   18.79 -        }
   18.80  
   18.81          phys_seg[nr_psegs].dev           = req->device;
   18.82          phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
   18.83 @@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t
   18.84          }
   18.85    
   18.86          nr_psegs += new_segs;
   18.87 -        ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2);
   18.88 +        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
   18.89      }
   18.90  
   18.91      /* Nonsensical zero-sized request? */
   18.92 @@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t
   18.93  
   18.94      for ( i = 0; i < nr_psegs; i++ )
   18.95      {
   18.96 -        unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 
   18.97 -                            (phys_seg[i].nr_sects << 9) + 
   18.98 -                            (PAGE_SIZE - 1)) & PAGE_MASK;
   18.99          int rc = direct_remap_area_pages(&init_mm, 
  18.100                                           MMAP_VADDR(pending_idx, i),
  18.101                                           phys_seg[i].buffer & PAGE_MASK, 
  18.102 -                                         sz, prot, blkif->domid);
  18.103 +                                         PAGE_SIZE, prot, blkif->domid);
  18.104          if ( rc != 0 )
  18.105          {
  18.106              DPRINTK("invalid buffer\n");
  18.107 @@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t
  18.108                                MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
  18.109              goto bad_descriptor;
  18.110          }
  18.111 +        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
  18.112 +            phys_seg[i].buffer >> PAGE_SHIFT;
  18.113      }
  18.114  
  18.115      pending_req = &pending_reqs[pending_idx];
  18.116 @@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t
  18.117          bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
  18.118          bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
  18.119              (phys_seg[i].buffer & ~PAGE_MASK);
  18.120 +//        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
  18.121          bh->b_end_io        = end_block_io_op;
  18.122          bh->b_private       = pending_req;
  18.123  
  18.124 @@ -456,13 +451,13 @@ static int __init init_module(void)
  18.125  {
  18.126      int i;
  18.127  
  18.128 +    if ( !(start_info.flags & SIF_INITDOMAIN) )
  18.129 +        return 0;
  18.130 +
  18.131      blkif_interface_init();
  18.132  
  18.133 -    if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL )
  18.134 -    {
  18.135 -        printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n");
  18.136 -        return -ENOMEM;
  18.137 -    }
  18.138 +    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
  18.139 +        BUG();
  18.140  
  18.141      pending_cons = 0;
  18.142      pending_prod = MAX_PENDING_REQS;
  18.143 @@ -484,6 +479,7 @@ static int __init init_module(void)
  18.144  
  18.145  static void cleanup_module(void)
  18.146  {
  18.147 +    BUG();
  18.148  }
  18.149  
  18.150  module_init(init_module);
    19.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c	Thu May 06 14:53:19 2004 +0000
    19.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c	Fri May 07 14:53:28 2004 +0000
    19.3 @@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *c
    19.4          }
    19.5      }
    19.6  
    19.7 -    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) )
    19.8 +    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
    19.9      {
   19.10          DPRINTK("vbd_create: out of memory\n");
   19.11          create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
   19.12 @@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow)
   19.13      } 
   19.14  
   19.15      if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 
   19.16 -                               GFP_ATOMIC)) == NULL) )
   19.17 +                               GFP_KERNEL)) == NULL) )
   19.18      {
   19.19          DPRINTK("vbd_grow: out of memory\n");
   19.20          grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    20.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h	Thu May 06 14:53:19 2004 +0000
    20.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h	Fri May 07 14:53:28 2004 +0000
    20.3 @@ -26,19 +26,22 @@
    20.4   */
    20.5  #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
    20.6  
    20.7 -#define BLKIF_MAX_SECTORS_PER_SEGMENT  16
    20.8 -
    20.9  typedef struct {
   20.10      u8             operation;        /* BLKIF_OP_???                         */
   20.11      u8             nr_segments;      /* number of segments                   */
   20.12      blkif_vdev_t   device;           /* only for read/write requests         */
   20.13      unsigned long  id;               /* private guest value, echoed in resp  */
   20.14      blkif_sector_t sector_number;    /* start sector idx on disk (r/w only)  */
   20.15 -    /* Least 9 bits is 'nr_sects'. High 23 bits is the address.       */
   20.16 -    /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */
   20.17 -    unsigned long  buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   20.18 +    /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame.   */
   20.19 +    /* @first_sect: first sector in frame to transfer (inclusive).           */
   20.20 +    /* @last_sect: last sector in frame to transfer (inclusive).             */
   20.21 +    /* @frame: machine page frame number.                                    */
   20.22 +    unsigned long  frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   20.23  } blkif_request_t;
   20.24  
   20.25 +#define blkif_first_sect(_fas) (((_fas)>>3)&7)
   20.26 +#define blkif_last_sect(_fas)  ((_fas)&7)
   20.27 +
   20.28  typedef struct {
   20.29      unsigned long   id;              /* copied from request */
   20.30      u8              operation;       /* copied from request */
   20.31 @@ -79,8 +82,8 @@ typedef struct {
   20.32   *  @device      == unused (zero)
   20.33   *  @id          == any value (echoed in response message)
   20.34   *  @sector_num  == unused (zero)
   20.35 - *  @buffer_and_sects == list of page-aligned, page-sized buffers.
   20.36 - *                       (i.e., nr_sects == 8).
   20.37 + *  @frame_and_sects == list of page-sized buffers.
   20.38 + *                       (i.e., @first_sect == 0, @last_sect == 7).
   20.39   * 
   20.40   * The response is a list of vdisk_t elements copied into the out-of-band
   20.41   * probe buffer. On success the response status field contains the number
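
The frame_and_sects encoding introduced in blkif.h above (bits 2:0 = last_sect, bits 5:3 = first_sect, page-aligned machine frame in the upper bits) can be summarised by this small Python sketch (an editor's illustration, not part of the changeset). It simply mirrors the blkif_first_sect/blkif_last_sect macros and the frontend's "buffer_ma | (fsect<<3) | lsect" construction, assuming 4kB pages and 512-byte sectors.

    PAGE_SHIFT = 12
    PAGE_MASK  = ~((1 << PAGE_SHIFT) - 1)

    def encode_fas(frame_ma, first_sect, last_sect):
        # frame_ma must be page-aligned; sector indices within a page are 0..7.
        return (frame_ma & PAGE_MASK) | (first_sect << 3) | last_sect

    def blkif_first_sect(fas):
        return (fas >> 3) & 7

    def blkif_last_sect(fas):
        return fas & 7

    # A whole-page buffer, as the probe path requires:
    #   first_sect == 0, last_sect == 7.
    fas = encode_fas(0x12345000, 0, 7)
    assert (blkif_first_sect(fas), blkif_last_sect(fas)) == (0, 7)
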
    21.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c	Thu May 06 14:53:19 2004 +0000
    21.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c	Fri May 07 14:53:28 2004 +0000
    21.3 @@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linu
    21.4  static unsigned int blkif_state = BLKIF_STATE_CLOSED;
    21.5  static unsigned int blkif_evtchn, blkif_irq;
    21.6  
    21.7 -static struct tq_struct blkif_statechange_tq;
    21.8 -
    21.9  static int blkif_control_rsp_valid;
   21.10  static blkif_response_t blkif_control_rsp;
   21.11  
   21.12 @@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned 
   21.13      struct gendisk     *gd;
   21.14      blkif_request_t    *req;
   21.15      struct buffer_head *bh;
   21.16 +    unsigned int        fsect, lsect;
   21.17  
   21.18 -    if ( unlikely(nr_sectors >= (1<<9)) )
   21.19 -        BUG();
   21.20 +    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
   21.21 +    lsect = fsect + nr_sectors - 1;
   21.22 +
   21.23 +    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
   21.24      if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
   21.25          BUG();
   21.26 +    if ( lsect > 7 )
   21.27 +        BUG();
   21.28 +
   21.29 +    buffer_ma &= PAGE_MASK;
   21.30  
   21.31      if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
   21.32          return 1;
   21.33 @@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned 
   21.34              bh = (struct buffer_head *)id;
   21.35              bh->b_reqnext = (struct buffer_head *)req->id;
   21.36              req->id = id;
   21.37 -            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
   21.38 -            if ( ++req->nr_segments < MAX_BLK_SEGS )
   21.39 +            req->frame_and_sects[req->nr_segments] = 
   21.40 +                buffer_ma | (fsect<<3) | lsect;
   21.41 +            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
   21.42                  sg_next_sect += nr_sectors;
   21.43              else
   21.44                  DISABLE_SCATTERGATHER();
   21.45 @@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned 
   21.46      req->sector_number = (blkif_sector_t)sector_number;
   21.47      req->device        = device; 
   21.48      req->nr_segments   = 1;
   21.49 -    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
   21.50 +    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
   21.51      req_prod++;
   21.52  
   21.53      return 0;
   21.54 @@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t 
   21.55  }
   21.56  
   21.57  
   21.58 -static void blkif_bringup_phase1(void *unused)
   21.59 +static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
   21.60  {
   21.61      ctrl_msg_t                   cmsg;
   21.62      blkif_fe_interface_connect_t up;
   21.63  
   21.64 -    /* Move from CLOSED to DISCONNECTED state. */
   21.65 -    blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
   21.66 -    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
   21.67 -    blkif_state  = BLKIF_STATE_DISCONNECTED;
   21.68 -
   21.69 -    /* Construct an interface-CONNECT message for the domain controller. */
   21.70 -    cmsg.type      = CMSG_BLKIF_FE;
   21.71 -    cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
   21.72 -    cmsg.length    = sizeof(blkif_fe_interface_connect_t);
   21.73 -    up.handle      = 0;
   21.74 -    up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
   21.75 -    memcpy(cmsg.msg, &up, sizeof(up));
   21.76 -
   21.77 -    /* Tell the controller to bring up the interface. */
   21.78 -    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
   21.79 -}
   21.80 -
   21.81 -static void blkif_bringup_phase2(void *unused)
   21.82 -{
   21.83 -    blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
   21.84 -    (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
   21.85 -
   21.86 -    /* Probe for discs that are attached to the interface. */
   21.87 -    xlvbd_init();
   21.88 -
   21.89 -    blkif_state = BLKIF_STATE_CONNECTED;
   21.90 -
   21.91 -    /* Kick pending requests. */
   21.92 -    spin_lock_irq(&io_request_lock);
   21.93 -    kick_pending_request_queues();
   21.94 -    spin_unlock_irq(&io_request_lock);
   21.95 -}
   21.96 -
   21.97 -static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
   21.98 -{
   21.99      if ( status->handle != 0 )
  21.100      {
  21.101          printk(KERN_WARNING "Status change on unsupported blkif %d\n",
  21.102 @@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe
  21.103                     " in state %d\n", blkif_state);
  21.104              break;
  21.105          }
  21.106 -        blkif_statechange_tq.routine = blkif_bringup_phase1;
  21.107 -        schedule_task(&blkif_statechange_tq);
  21.108 +
  21.109 +        /* Move from CLOSED to DISCONNECTED state. */
  21.110 +        blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
  21.111 +        blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
  21.112 +        blkif_state  = BLKIF_STATE_DISCONNECTED;
  21.113 +
  21.114 +        /* Construct an interface-CONNECT message for the domain controller. */
  21.115 +        cmsg.type      = CMSG_BLKIF_FE;
  21.116 +        cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
  21.117 +        cmsg.length    = sizeof(blkif_fe_interface_connect_t);
  21.118 +        up.handle      = 0;
  21.119 +        up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
  21.120 +        memcpy(cmsg.msg, &up, sizeof(up));
  21.121 +        
  21.122 +        /* Tell the controller to bring up the interface. */
  21.123 +        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  21.124          break;
  21.125  
  21.126      case BLKIF_INTERFACE_STATUS_CONNECTED:
  21.127 @@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe
  21.128                     " in state %d\n", blkif_state);
  21.129              break;
  21.130          }
  21.131 +
  21.132          blkif_evtchn = status->evtchn;
  21.133 -        blkif_statechange_tq.routine = blkif_bringup_phase2;
  21.134 -        schedule_task(&blkif_statechange_tq);
  21.135 +        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
  21.136 +        (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
  21.137 +        
  21.138 +        /* Probe for discs that are attached to the interface. */
  21.139 +        xlvbd_init();
  21.140 +        
  21.141 +        blkif_state = BLKIF_STATE_CONNECTED;
  21.142 +        
  21.143 +        /* Kick pending requests. */
  21.144 +        spin_lock_irq(&io_request_lock);
  21.145 +        kick_pending_request_queues();
  21.146 +        spin_unlock_irq(&io_request_lock);
  21.147          break;
  21.148  
  21.149      default:
  21.150 @@ -675,7 +671,11 @@ int __init xlblk_init(void)
  21.151      ctrl_msg_t                       cmsg;
  21.152      blkif_fe_driver_status_changed_t st;
  21.153  
  21.154 -    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx);
  21.155 +    if ( start_info.flags & SIF_INITDOMAIN )
  21.156 +        return 0;
  21.157 +
  21.158 +    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
  21.159 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
  21.160  
  21.161      /* Send a driver-UP notification to the domain controller. */
  21.162      cmsg.type      = CMSG_BLKIF_FE;
    22.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Thu May 06 14:53:19 2004 +0000
    22.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Fri May 07 14:53:28 2004 +0000
    22.3 @@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *d
    22.4      memset(&req, 0, sizeof(req));
    22.5      req.operation   = BLKIF_OP_PROBE;
    22.6      req.nr_segments = 1;
    22.7 -    req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
    22.8 +    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
    22.9  
   22.10      blkif_control_send(&req, &rsp);
   22.11  
    23.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c	Thu May 06 14:53:19 2004 +0000
    23.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c	Fri May 07 14:53:28 2004 +0000
    23.3 @@ -513,7 +513,7 @@ static int __init xencons_init(void)
    23.4      }
    23.5      else
    23.6      {
    23.7 -        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx);
    23.8 +        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0);
    23.9      }
   23.10  
   23.11      printk("Xen virtual console successfully installed\n");
    24.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c	Thu May 06 14:53:19 2004 +0000
    24.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c	Fri May 07 14:53:28 2004 +0000
    24.3 @@ -10,8 +10,6 @@
    24.4  
    24.5  static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
    24.6  {
    24.7 -    DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype);
    24.8 -    
    24.9      switch ( msg->subtype )
   24.10      {
   24.11      case CMSG_NETIF_BE_CREATE:
   24.12 @@ -54,7 +52,8 @@ void netif_ctrlif_init(void)
   24.13      ctrl_msg_t                       cmsg;
   24.14      netif_be_driver_status_changed_t st;
   24.15  
   24.16 -    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx);
   24.17 +    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
   24.18 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
   24.19  
   24.20      /* Send a driver-UP notification to the domain controller. */
   24.21      cmsg.type      = CMSG_NETIF_BE;
    25.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c	Thu May 06 14:53:19 2004 +0000
    25.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c	Fri May 07 14:53:28 2004 +0000
    25.3 @@ -7,6 +7,7 @@
    25.4   */
    25.5  
    25.6  #include "common.h"
    25.7 +#include <linux/rtnetlink.h>
    25.8  
    25.9  #define NETIF_HASHSZ 1024
   25.10  #define NETIF_HASH(_d,_h) \
   25.11 @@ -14,6 +15,7 @@
   25.12  
   25.13  static netif_t *netif_hash[NETIF_HASHSZ];
   25.14  static struct net_device *bridge_dev;
   25.15 +static struct net_bridge *bridge_br;
   25.16  
   25.17  netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
   25.18  {
   25.19 @@ -36,8 +38,10 @@ void __netif_disconnect_complete(netif_t
   25.20       */
   25.21      unbind_evtchn_from_irq(netif->evtchn);
   25.22      vfree(netif->tx); /* Frees netif->rx as well. */
   25.23 -    (void)br_del_if((struct net_bridge *)bridge_dev->priv, netif->dev);
   25.24 +    rtnl_lock();
   25.25 +    (void)br_del_if(bridge_br, netif->dev);
   25.26      (void)dev_close(netif->dev);
   25.27 +    rtnl_unlock();
   25.28  
   25.29      /* Construct the deferred response message. */
   25.30      cmsg.type         = CMSG_NETIF_BE;
   25.31 @@ -73,7 +77,7 @@ void netif_create(netif_be_create_t *cre
   25.32      struct net_device *dev;
   25.33      netif_t          **pnetif, *netif;
   25.34  
   25.35 -    dev = alloc_netdev(sizeof(netif_t), "netif-be-%d", ether_setup);
   25.36 +    dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup);
   25.37      if ( dev == NULL )
   25.38      {
   25.39          DPRINTK("Could not create netif: out of memory\n");
   25.40 @@ -111,7 +115,10 @@ void netif_create(netif_be_create_t *cre
   25.41      dev->hard_start_xmit = netif_be_start_xmit;
   25.42      dev->get_stats       = netif_be_get_stats;
   25.43      memcpy(dev->dev_addr, create->mac, ETH_ALEN);
   25.44 -    
   25.45 +
   25.46 +    /* XXX In bridge mode we should force a different MAC from remote end. */
   25.47 +    dev->dev_addr[2] ^= 1;
   25.48 +
   25.49      if ( register_netdev(dev) != 0 )
   25.50      {
   25.51          DPRINTK("Could not register new net device\n");
   25.52 @@ -225,15 +232,27 @@ void netif_connect(netif_be_connect_t *c
   25.53      netif->status         = CONNECTED;
   25.54      netif_get(netif);
   25.55  
   25.56 +    rtnl_lock();
   25.57 +
   25.58      (void)dev_open(netif->dev);
   25.59 -    (void)br_add_if((struct net_bridge *)bridge_dev->priv, netif->dev);
   25.60 -    /* At this point we try to ensure that eth0 is attached to the bridge. */
   25.61 +    (void)br_add_if(bridge_br, netif->dev);
   25.62 +
   25.63 +    /*
   25.64 +     * The default config is a very simple binding to eth0.
   25.65 +     * If eth0 is being used as an IP interface by this OS then someone
   25.66 +     * must add eth0's IP address to nbe-br, and change the routing table
   25.67 +     * to refer to nbe-br instead of eth0.
   25.68 +     */
   25.69 +    (void)dev_open(bridge_dev);
   25.70      if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
   25.71      {
   25.72          (void)dev_open(eth0_dev);
   25.73 -        (void)br_add_if((struct net_bridge *)bridge_dev->priv, eth0_dev);
   25.74 +        (void)br_add_if(bridge_br, eth0_dev);
   25.75      }
   25.76 -    (void)request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif);
   25.77 +
   25.78 +    rtnl_unlock();
   25.79 +
   25.80 +    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
   25.81      netif_start_queue(netif->dev);
   25.82  
   25.83      connect->status = NETIF_BE_STATUS_OKAY;
   25.84 @@ -271,8 +290,11 @@ int netif_disconnect(netif_be_disconnect
   25.85  void netif_interface_init(void)
   25.86  {
   25.87      memset(netif_hash, 0, sizeof(netif_hash));
   25.88 -    if ( br_add_bridge("netif-backend") != 0 )
   25.89 +    if ( br_add_bridge("nbe-br") != 0 )
   25.90          BUG();
   25.91 -    bridge_dev = __dev_get_by_name("netif-be-bridge");
   25.92 -    (void)dev_open(bridge_dev);
   25.93 +    bridge_dev = __dev_get_by_name("nbe-br");
   25.94 +    bridge_br  = (struct net_bridge *)bridge_dev->priv;
   25.95 +    bridge_br->bridge_hello_time = bridge_br->hello_time = 0;
   25.96 +    bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
   25.97 +    bridge_br->stp_enabled = 0;
   25.98  }
    26.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Thu May 06 14:53:19 2004 +0000
    26.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Fri May 07 14:53:28 2004 +0000
    26.3 @@ -14,7 +14,7 @@
    26.4  #include <asm/hypervisor-ifs/dom_mem_ops.h>
    26.5  
    26.6  static void net_tx_action(unsigned long unused);
    26.7 -static void tx_skb_release(struct sk_buff *skb);
    26.8 +static void netif_page_release(struct page *page);
    26.9  static void make_tx_response(netif_t *netif, 
   26.10                               u16      id,
   26.11                               s8       st);
   26.12 @@ -30,13 +30,13 @@ static DECLARE_TASKLET(net_tx_tasklet, n
   26.13  #define tx_work_exists(_if) (1)
   26.14  
   26.15  #define MAX_PENDING_REQS 256
   26.16 -unsigned long mmap_vstart;
   26.17 +static unsigned long mmap_vstart;
   26.18  #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
   26.19  
   26.20  #define PKT_PROT_LEN (ETH_HLEN + 20)
   26.21  
   26.22 -/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/
   26.23  static u16 pending_id[MAX_PENDING_REQS];
   26.24 +static netif_t *pending_netif[MAX_PENDING_REQS];
   26.25  static u16 pending_ring[MAX_PENDING_REQS];
   26.26  static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
   26.27  typedef unsigned int PEND_RING_IDX;
   26.28 @@ -60,8 +60,7 @@ static void __refresh_mfn_list(void)
   26.29      op.u.increase.pages = mfn_list;
   26.30      if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC )
   26.31      {
   26.32 -        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
   26.33 -               ret);
   26.34 +        printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret);
   26.35          BUG();
   26.36      }
   26.37      alloc_index = MAX_MFN_ALLOC;
   26.38 @@ -100,10 +99,10 @@ int netif_be_start_xmit(struct sk_buff *
   26.39  {
   26.40      netif_t *netif = (netif_t *)dev->priv;
   26.41      s8 status = NETIF_RSP_OKAY;
   26.42 -    u16 size, id;
   26.43 +    u16 size=0, id;
   26.44      mmu_update_t mmu[6];
   26.45      pgd_t *pgd; pmd_t *pmd; pte_t *pte;
   26.46 -    unsigned long vdata, new_mfn;
   26.47 +    unsigned long vdata, mdata=0, new_mfn;
   26.48  
   26.49      /* Drop the packet if the target domain has no receive buffers. */
   26.50      if ( (netif->rx_req_cons == netif->rx->req_prod) ||
   26.51 @@ -126,16 +125,23 @@ int netif_be_start_xmit(struct sk_buff *
   26.52           (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
   26.53           ((skb->end - skb->head) < (PAGE_SIZE/2)) )
   26.54      {
   26.55 -        struct sk_buff *nskb = dev_alloc_skb(PAGE_SIZE-1024);
   26.56 +        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
   26.57          int hlen = skb->data - skb->head;
   26.58 +        if ( unlikely(nskb == NULL) )
   26.59 +        {
   26.60 +            DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid);
   26.61 +            status = NETIF_RSP_ERROR;
   26.62 +            goto out;
   26.63 +        }
   26.64          skb_reserve(nskb, hlen);
   26.65 -        skb_put(nskb, skb->len);
   26.66 +        __skb_put(nskb, skb->len);
   26.67          (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
   26.68          dev_kfree_skb(skb);
   26.69          skb = nskb;
   26.70      }
   26.71  
   26.72      vdata = (unsigned long)skb->data;
   26.73 +    mdata = virt_to_machine(vdata);
   26.74      size  = skb->tail - skb->data;
   26.75  
   26.76      new_mfn = get_new_mfn();
   26.77 @@ -153,7 +159,7 @@ int netif_be_start_xmit(struct sk_buff *
   26.78      mmu[1].ptr |= MMU_EXTENDED_COMMAND;
   26.79      mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H;
   26.80  
   26.81 -    mmu[2].ptr  = virt_to_machine(vdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
   26.82 +    mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
   26.83      mmu[2].val  = MMUEXT_REASSIGN_PAGE;
   26.84  
   26.85      mmu[3].ptr  = MMU_EXTENDED_COMMAND;
   26.86 @@ -167,6 +173,7 @@ int netif_be_start_xmit(struct sk_buff *
   26.87  
   26.88      if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) )
   26.89      {
   26.90 +        DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid);
   26.91          dealloc_mfn(new_mfn);
   26.92          status = NETIF_RSP_ERROR;
   26.93          goto out;
   26.94 @@ -174,12 +181,12 @@ int netif_be_start_xmit(struct sk_buff *
   26.95  
   26.96      phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn;
   26.97  
   26.98 -    netif->stats.tx_bytes += size;
   26.99 -    netif->stats.tx_packets++;
  26.100 +    netif->stats.rx_bytes += size;
  26.101 +    netif->stats.rx_packets++;
  26.102  
  26.103   out:
  26.104      spin_lock(&netif->rx_lock);
  26.105 -    make_rx_response(netif, id, status, virt_to_machine(vdata), size);
  26.106 +    make_rx_response(netif, id, status, mdata, size);
  26.107      spin_unlock(&netif->rx_lock);    
  26.108      dev_kfree_skb(skb);
  26.109      return 0;
  26.110 @@ -220,6 +227,16 @@ static void add_to_net_schedule_list_tai
  26.111      spin_unlock(&net_schedule_list_lock);
  26.112  }
  26.113  
  26.114 +static inline void netif_schedule_work(netif_t *netif)
  26.115 +{
  26.116 +    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
  26.117 +         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
  26.118 +    {
  26.119 +        add_to_net_schedule_list_tail(netif);
  26.120 +        maybe_schedule_tx_action();
  26.121 +    }
  26.122 +}
  26.123 +
  26.124  void netif_deschedule(netif_t *netif)
  26.125  {
  26.126      remove_from_net_schedule_list(netif);
  26.127 @@ -229,14 +246,8 @@ void netif_deschedule(netif_t *netif)
  26.128  static void tx_credit_callback(unsigned long data)
  26.129  {
  26.130      netif_t *netif = (netif_t *)data;
  26.131 -
  26.132      netif->remaining_credit = netif->credit_bytes;
  26.133 -
  26.134 -    if ( tx_work_exists(netif) )
  26.135 -    {
  26.136 -        add_to_net_schedule_list_tail(netif);
  26.137 -        maybe_schedule_tx_action();
  26.138 -    }    
  26.139 +    netif_schedule_work(netif);
  26.140  }
  26.141  #endif
  26.142  
  26.143 @@ -249,6 +260,7 @@ static void net_tx_action(unsigned long 
  26.144      u16 pending_idx;
  26.145      NETIF_RING_IDX i;
  26.146      pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);
  26.147 +    struct page *page;
  26.148  
  26.149      while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
  26.150              !list_empty(&net_schedule_list) )
  26.151 @@ -261,7 +273,7 @@ static void net_tx_action(unsigned long 
  26.152  
  26.153          /* Work to do? */
  26.154          i = netif->tx_req_cons;
  26.155 -        if ( (i == netif->tx->req_prod) && 
  26.156 +        if ( (i == netif->tx->req_prod) ||
  26.157               ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
  26.158          {
  26.159              netif_put(netif);
  26.160 @@ -296,7 +308,7 @@ static void net_tx_action(unsigned long 
  26.161          netif->remaining_credit -= tx.size;
  26.162  #endif
  26.163  
  26.164 -        add_to_net_schedule_list_tail(netif);
  26.165 +        netif_schedule_work(netif);
  26.166  
  26.167          if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
  26.168               unlikely(txreq.size > ETH_FRAME_LEN) )
  26.169 @@ -335,6 +347,7 @@ static void net_tx_action(unsigned long 
  26.170  
  26.171          if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) )
  26.172          {
  26.173 +            DPRINTK("Can't allocate a skb in start_xmit.\n");
  26.174              make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  26.175              netif_put(netif);
  26.176              vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
  26.177 @@ -346,29 +359,29 @@ static void net_tx_action(unsigned long 
  26.178                 (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
  26.179                 PKT_PROT_LEN);
  26.180  
  26.181 -        skb->dev        = netif->dev;
  26.182 -        skb->protocol   = eth_type_trans(skb, skb->dev);
  26.183 -        
  26.184 +        page = virt_to_page(MMAP_VADDR(pending_idx));
  26.185 +
  26.186          /* Append the packet payload as a fragment. */
  26.187 -        skb_shinfo(skb)->frags[0].page        = 
  26.188 -            virt_to_page(MMAP_VADDR(pending_idx));
  26.189 -        skb_shinfo(skb)->frags[0].size        =
  26.190 -            txreq.size - PKT_PROT_LEN;
  26.191 +        skb_shinfo(skb)->frags[0].page        = page;
  26.192 +        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
  26.193          skb_shinfo(skb)->frags[0].page_offset = 
  26.194              (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
  26.195          skb_shinfo(skb)->nr_frags = 1;
  26.196          skb->data_len  = txreq.size - PKT_PROT_LEN;
  26.197          skb->len      += skb->data_len;
  26.198  
  26.199 -        /* Destructor information. */
  26.200 -        skb->destructor = tx_skb_release;
  26.201 -        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif;
  26.202 -        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx;
  26.203 +        skb->dev      = netif->dev;
  26.204 +        skb->protocol = eth_type_trans(skb, skb->dev);
  26.205  
  26.206 -        netif->stats.rx_bytes += txreq.size;
  26.207 -        netif->stats.rx_packets++;
  26.208 +        /* Destructor information. */
  26.209 +        atomic_set(&page->count, 1);
  26.210 +        page->mapping = (struct address_space *)netif_page_release;
  26.211 +        pending_id[pending_idx] = txreq.id;
  26.212 +        pending_netif[pending_idx] = netif;
  26.213  
  26.214 -        pending_id[pending_idx] = txreq.id;
  26.215 +        netif->stats.tx_bytes += txreq.size;
  26.216 +        netif->stats.tx_packets++;
  26.217 +
  26.218          pending_cons++;
  26.219  
  26.220          netif_rx(skb);
  26.221 @@ -376,28 +389,34 @@ static void net_tx_action(unsigned long 
  26.222      }
  26.223  }
  26.224  
  26.225 -/* Destructor function for tx skbs. */
  26.226 -static void tx_skb_release(struct sk_buff *skb)
  26.227 +static void netif_page_release(struct page *page)
  26.228  {
  26.229      unsigned long flags;
  26.230 -    netif_t *netif = (netif_t *)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page;
  26.231 -    u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size;
  26.232 +    netif_t *netif;
  26.233 +    u16 pending_idx;
  26.234 +
  26.235 +    pending_idx = page - virt_to_page(mmap_vstart);
  26.236 +
  26.237 +    netif = pending_netif[pending_idx];
  26.238  
  26.239      vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
  26.240 -    
  26.241 -    skb_shinfo(skb)->nr_frags = 0; 
  26.242 -    
  26.243 +        
  26.244      spin_lock(&netif->tx_lock);
  26.245      make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
  26.246      spin_unlock(&netif->tx_lock);
  26.247 -    
  26.248 +
  26.249 +    /*
  26.250 +     * Scheduling checks must happen after the above response is posted.
  26.251 +     * This avoids a possible race with a guest OS on another CPU.
  26.252 +     */
  26.253 +    mb();
  26.254 +    netif_schedule_work(netif);
  26.255 +
  26.256      netif_put(netif);
  26.257   
  26.258      spin_lock_irqsave(&pend_prod_lock, flags);
  26.259      pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  26.260      spin_unlock_irqrestore(&pend_prod_lock, flags);
  26.261 - 
  26.262 -    maybe_schedule_tx_action();        
  26.263  }
  26.264  
  26.265  #if 0
  26.266 @@ -493,9 +512,26 @@ static void make_rx_response(netif_t    
  26.267  
  26.268  static int __init init_module(void)
  26.269  {
  26.270 +    int i;
  26.271 +
  26.272 +    if ( !(start_info.flags & SIF_INITDOMAIN) )
  26.273 +        return 0;
  26.274 +
  26.275      netif_interface_init();
  26.276 -    mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS);
  26.277 +
  26.278 +    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
  26.279 +        BUG();
  26.280 +
  26.281 +    pending_cons = 0;
  26.282 +    pending_prod = MAX_PENDING_REQS;
  26.283 +    for ( i = 0; i < MAX_PENDING_REQS; i++ )
  26.284 +        pending_ring[i] = i;
  26.285 +
  26.286 +    spin_lock_init(&net_schedule_list_lock);
  26.287 +    INIT_LIST_HEAD(&net_schedule_list);
  26.288 +
  26.289      netif_ctrlif_init();
  26.290 +
  26.291      return 0;
  26.292  }
  26.293  
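
The backend now tracks pages lent to the transmit path with a ring of free slot indices (pending_ring/pending_prod/pending_cons) instead of an skb destructor. Below is a minimal user-space model of that ring; the NR_PENDING_REQS definition and the allocation side are assumptions, since only their uses appear in the hunks above.

/* User-space model of the pending-index ring: a power-of-two array of free
 * slot indices with free-running producer/consumer counters, masked on use. */
#include <assert.h>
#include <stdio.h>

#define MAX_PENDING_REQS 256                    /* power of two, as in the patch */
#define MASK_PEND_IDX(_i) ((_i) & (MAX_PENDING_REQS - 1))

static unsigned short pending_ring[MAX_PENDING_REQS];
static unsigned int pending_prod, pending_cons; /* free-running indices */

/* Assumed definition: slots currently in flight (only its use is shown above). */
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

static void pending_init(void)
{
    int i;
    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;            /* every slot starts out free */
    for (i = 0; i < MAX_PENDING_REQS; i++)
        pending_ring[i] = i;
}

/* Take a free slot (caller checks NR_PENDING_REQS < MAX_PENDING_REQS first). */
static unsigned short pending_get(void)
{
    return pending_ring[MASK_PEND_IDX(pending_cons++)];
}

/* Give a slot back, as netif_page_release() does under pend_prod_lock. */
static void pending_put(unsigned short idx)
{
    pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;
}

int main(void)
{
    unsigned short a, b;

    pending_init();
    a = pending_get();
    b = pending_get();
    assert(NR_PENDING_REQS == 2);               /* two pages now in flight */
    pending_put(a);
    pending_put(b);
    assert(NR_PENDING_REQS == 0);
    printf("pending ring model ok (a=%u b=%u)\n", a, b);
    return 0;
}
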
    27.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Thu May 06 14:53:19 2004 +0000
    27.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Fri May 07 14:53:28 2004 +0000
    27.3 @@ -25,20 +25,18 @@
    27.4  #include <net/sock.h>
    27.5  #include <net/pkt_sched.h>
    27.6  
    27.7 -#include "../netif.h"
    27.8 +#include <asm/evtchn.h>
    27.9 +#include <asm/ctrl_if.h>
   27.10 +#include <asm/hypervisor-ifs/dom_mem_ops.h>
   27.11  
   27.12 -static struct tq_struct netif_statechange_tq;
   27.13 +#include "../netif.h"
   27.14  
   27.15  #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
   27.16  
   27.17 -static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
   27.18  static void network_tx_buf_gc(struct net_device *dev);
   27.19  static void network_alloc_rx_buffers(struct net_device *dev);
   27.20  static void cleanup_module(void);
   27.21  
   27.22 -/* Dynamically-mapped IRQs. */
   27.23 -static int network_irq, debug_irq;
   27.24 -
   27.25  static struct list_head dev_list;
   27.26  
   27.27  struct net_private
   27.28 @@ -47,7 +45,7 @@ struct net_private
   27.29      struct net_device *dev;
   27.30  
   27.31      struct net_device_stats stats;
   27.32 -    NET_RING_IDX rx_resp_cons, tx_resp_cons;
   27.33 +    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
   27.34      unsigned int tx_full;
   27.35      
   27.36      netif_tx_interface_t *tx;
   27.37 @@ -69,8 +67,8 @@ struct net_private
   27.38       * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
   27.39       * array is an index into a chain of free entries.
   27.40       */
   27.41 -    struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1];
   27.42 -    struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1];
   27.43 +    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
   27.44 +    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
   27.45  };
   27.46  
   27.47  /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
   27.48 @@ -91,7 +89,7 @@ static struct net_device *find_dev_by_ha
   27.49      {
   27.50          np = list_entry(ent, struct net_private, list);
   27.51          if ( np->handle == handle )
   27.52 -            return np;
   27.53 +            return np->dev;
   27.54      }
   27.55      return NULL;
   27.56  }
   27.57 @@ -100,8 +98,7 @@ static struct net_device *find_dev_by_ha
   27.58  static int network_open(struct net_device *dev)
   27.59  {
   27.60      struct net_private *np = dev->priv;
   27.61 -    netop_t netop;
   27.62 -    int i, ret;
   27.63 +    int i;
   27.64  
   27.65      if ( np->state != NETIF_STATE_CONNECTED )
   27.66          return -EINVAL;
   27.67 @@ -111,15 +108,16 @@ static int network_open(struct net_devic
   27.68      spin_lock_init(&np->tx_lock);
   27.69  
   27.70      /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
   27.71 -    for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ )
   27.72 +    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
   27.73          np->tx_skbs[i] = (void *)(i+1);
   27.74 -    for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ )
   27.75 +    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
   27.76          np->rx_skbs[i] = (void *)(i+1);
   27.77  
   27.78      wmb();
   27.79      np->state = NETIF_STATE_ACTIVE;
   27.80  
   27.81      network_alloc_rx_buffers(dev);
   27.82 +    np->rx->event = np->rx_resp_cons + 1;
   27.83  
   27.84      netif_start_queue(dev);
   27.85  
   27.86 @@ -131,18 +129,17 @@ static int network_open(struct net_devic
   27.87  
   27.88  static void network_tx_buf_gc(struct net_device *dev)
   27.89  {
   27.90 -    NET_RING_IDX i, prod;
   27.91 +    NETIF_RING_IDX i, prod;
   27.92      unsigned short id;
   27.93      struct net_private *np = dev->priv;
   27.94      struct sk_buff *skb;
   27.95 -    tx_entry_t *tx_ring = np->net_ring->tx_ring;
   27.96  
   27.97      do {
   27.98 -        prod = np->net_idx->tx_resp_prod;
   27.99 +        prod = np->tx->resp_prod;
  27.100  
  27.101          for ( i = np->tx_resp_cons; i != prod; i++ )
  27.102          {
  27.103 -            id  = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
  27.104 +            id  = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id;
  27.105              skb = np->tx_skbs[id];
  27.106              ADD_ID_TO_FREELIST(np->tx_skbs, id);
  27.107              dev_kfree_skb_any(skb);
  27.108 @@ -158,14 +155,14 @@ static void network_tx_buf_gc(struct net
  27.109           * in such cases notification from Xen is likely to be the only kick
  27.110           * that we'll get.
  27.111           */
  27.112 -        np->net_idx->tx_event = 
  27.113 -            prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
  27.114 +        np->tx->event = 
  27.115 +            prod + ((np->tx->req_prod - prod) >> 1) + 1;
  27.116          mb();
  27.117      }
  27.118 -    while ( prod != np->net_idx->tx_resp_prod );
  27.119 +    while ( prod != np->tx->resp_prod );
  27.120  
  27.121      if ( np->tx_full && 
  27.122 -         ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) )
  27.123 +         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
  27.124      {
  27.125          np->tx_full = 0;
  27.126          if ( np->state == NETIF_STATE_ACTIVE )
  27.127 @@ -189,10 +186,14 @@ static void network_alloc_rx_buffers(str
  27.128      unsigned short id;
  27.129      struct net_private *np = dev->priv;
  27.130      struct sk_buff *skb;
  27.131 -    netop_t netop;
  27.132 -    NET_RING_IDX i = np->net_idx->rx_req_prod;
  27.133 +    NETIF_RING_IDX i = np->rx->req_prod;
  27.134 +    dom_mem_op_t op;
  27.135 +    unsigned long pfn_array[NETIF_RX_RING_SIZE];
  27.136 +    int ret, nr_pfns = 0;
  27.137 +    pte_t *pte;
  27.138  
  27.139 -    if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || 
  27.140 +    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
  27.141 +    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
  27.142           unlikely(np->state != NETIF_STATE_ACTIVE) )
  27.143          return;
  27.144  
  27.145 @@ -209,13 +210,13 @@ static void network_alloc_rx_buffers(str
  27.146          id = GET_ID_FROM_FREELIST(np->rx_skbs);
  27.147          np->rx_skbs[id] = skb;
  27.148  
  27.149 -        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id   = id;
  27.150 -        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 
  27.151 -            virt_to_machine(get_ppte(skb->head));
  27.152 -
  27.153 -        np->rx_bufs_to_notify++;
  27.154 +        np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;
  27.155 +        
  27.156 +        pte = get_ppte(skb->head);
  27.157 +        pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT;
  27.158 +        queue_l1_entry_update(pte, 0);
  27.159      }
  27.160 -    while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
  27.161 +    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
  27.162  
  27.163      /*
  27.164       * We may have allocated buffers which have entries outstanding in the page
  27.165 @@ -223,17 +224,16 @@ static void network_alloc_rx_buffers(str
  27.166       */
  27.167      flush_page_update_queue();
  27.168  
  27.169 -    np->net_idx->rx_req_prod = i;
  27.170 -    np->net_idx->rx_event    = np->rx_resp_cons + 1;
  27.171 -        
  27.172 -    /* Batch Xen notifications. */
  27.173 -    if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
  27.174 +    op.op = MEMOP_RESERVATION_DECREASE;
  27.175 +    op.u.decrease.size  = nr_pfns;
  27.176 +    op.u.decrease.pages = pfn_array;
  27.177 +    if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns )
  27.178      {
  27.179 -        netop.cmd = NETOP_PUSH_BUFFERS;
  27.180 -        netop.vif = np->idx;
  27.181 -        (void)HYPERVISOR_net_io_op(&netop);
  27.182 -        np->rx_bufs_to_notify = 0;
  27.183 +        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
  27.184 +        BUG();
  27.185      }
  27.186 +
  27.187 +    np->rx->req_prod = i;
  27.188  }
  27.189  
  27.190  
  27.191 @@ -241,9 +241,8 @@ static int network_start_xmit(struct sk_
  27.192  {
  27.193      unsigned short id;
  27.194      struct net_private *np = (struct net_private *)dev->priv;
  27.195 -    tx_req_entry_t *tx;
  27.196 -    netop_t netop;
  27.197 -    NET_RING_IDX i;
  27.198 +    netif_tx_request_t *tx;
  27.199 +    NETIF_RING_IDX i;
  27.200  
  27.201      if ( unlikely(np->tx_full) )
  27.202      {
  27.203 @@ -262,27 +261,27 @@ static int network_start_xmit(struct sk_
  27.204          memcpy(new_skb->data, skb->data, skb->len);
  27.205          dev_kfree_skb(skb);
  27.206          skb = new_skb;
  27.207 -    }   
  27.208 +    }
  27.209      
  27.210      spin_lock_irq(&np->tx_lock);
  27.211  
  27.212 -    i = np->net_idx->tx_req_prod;
  27.213 +    i = np->tx->req_prod;
  27.214  
  27.215      id = GET_ID_FROM_FREELIST(np->tx_skbs);
  27.216      np->tx_skbs[id] = skb;
  27.217  
  27.218 -    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
  27.219 +    tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req;
  27.220  
  27.221      tx->id   = id;
  27.222 -    tx->addr = phys_to_machine(virt_to_phys(skb->data));
  27.223 +    tx->addr = virt_to_machine(skb->data);
  27.224      tx->size = skb->len;
  27.225  
  27.226      wmb();
  27.227 -    np->net_idx->tx_req_prod = i + 1;
  27.228 +    np->tx->req_prod = i + 1;
  27.229  
  27.230      network_tx_buf_gc(dev);
  27.231  
  27.232 -    if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
  27.233 +    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
  27.234      {
  27.235          np->tx_full = 1;
  27.236          netif_stop_queue(dev);
  27.237 @@ -295,12 +294,8 @@ static int network_start_xmit(struct sk_
  27.238  
  27.239      /* Only notify Xen if there are no outstanding responses. */
  27.240      mb();
  27.241 -    if ( np->net_idx->tx_resp_prod == i )
  27.242 -    {
  27.243 -        netop.cmd = NETOP_PUSH_BUFFERS;
  27.244 -        netop.vif = np->idx;
  27.245 -        (void)HYPERVISOR_net_io_op(&netop);
  27.246 -    }
  27.247 +    if ( np->tx->resp_prod == i )
  27.248 +        notify_via_evtchn(np->evtchn);
  27.249  
  27.250      return 0;
  27.251  }
  27.252 @@ -312,22 +307,24 @@ static void netif_int(int irq, void *dev
  27.253      struct net_private *np = dev->priv;
  27.254      unsigned long flags;
  27.255      struct sk_buff *skb;
  27.256 -    rx_resp_entry_t *rx;
  27.257 -    NET_RING_IDX i;
  27.258 +    netif_rx_response_t *rx;
  27.259 +    NETIF_RING_IDX i;
  27.260 +    mmu_update_t mmu[2];
  27.261 +    pte_t *pte;
  27.262  
  27.263      spin_lock_irqsave(&np->tx_lock, flags);
  27.264      network_tx_buf_gc(dev);
  27.265      spin_unlock_irqrestore(&np->tx_lock, flags);
  27.266  
  27.267   again:
  27.268 -    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
  27.269 +    for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ )
  27.270      {
  27.271 -        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
  27.272 +        rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;
  27.273  
  27.274          skb = np->rx_skbs[rx->id];
  27.275          ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
  27.276  
  27.277 -        if ( unlikely(rx->status != RING_STATUS_OK) )
  27.278 +        if ( unlikely(rx->status <= 0) )
  27.279          {
  27.280              /* Gate this error. We get a (valid) slew of them on suspend. */
  27.281              if ( np->state == NETIF_STATE_ACTIVE )
  27.282 @@ -336,6 +333,17 @@ static void netif_int(int irq, void *dev
  27.283              continue;
  27.284          }
  27.285  
  27.286 +        /* Remap the page. */
  27.287 +        pte = get_ppte(skb->head);
  27.288 +        mmu[0].ptr  = virt_to_machine(pte);
  27.289 +        mmu[0].val  = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
  27.290 +        mmu[1].ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
  27.291 +        mmu[1].val  = __pa(skb->head) >> PAGE_SHIFT;
  27.292 +        if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
  27.293 +            BUG();
  27.294 +        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
  27.295 +            rx->addr >> PAGE_SHIFT;
  27.296 +
  27.297          /*
   27.298           * Set up shinfo -- from alloc_skb. This was particularly nasty: the
  27.299           * shared info is hidden at the back of the data area (presumably so it
  27.300 @@ -348,13 +356,13 @@ static void netif_int(int irq, void *dev
  27.301          phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
  27.302              (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
  27.303  
  27.304 -        skb->data = skb->tail = skb->head + rx->offset;
  27.305 -        skb_put(skb, rx->size);
  27.306 +        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
  27.307 +        skb_put(skb, rx->status);
  27.308          skb->protocol = eth_type_trans(skb, dev);
  27.309  
  27.310          np->stats.rx_packets++;
  27.311  
  27.312 -        np->stats.rx_bytes += rx->size;
  27.313 +        np->stats.rx_bytes += rx->status;
  27.314          netif_rx(skb);
  27.315          dev->last_rx = jiffies;
  27.316      }
  27.317 @@ -362,10 +370,11 @@ static void netif_int(int irq, void *dev
  27.318      np->rx_resp_cons = i;
  27.319  
  27.320      network_alloc_rx_buffers(dev);
  27.321 +    np->rx->event = np->rx_resp_cons + 1;
  27.322      
  27.323      /* Deal with hypervisor racing our resetting of rx_event. */
  27.324      mb();
  27.325 -    if ( np->net_idx->rx_resp_prod != i )
  27.326 +    if ( np->rx->resp_prod != i )
  27.327          goto again;
  27.328  }
  27.329  
  27.330 @@ -373,16 +382,11 @@ static void netif_int(int irq, void *dev
  27.331  static int network_close(struct net_device *dev)
  27.332  {
  27.333      struct net_private *np = dev->priv;
  27.334 -    netop_t netop;
  27.335  
  27.336      netif_stop_queue(np->dev);
  27.337  
  27.338 -    netop.cmd = NETOP_FLUSH_BUFFERS;
  27.339 -    netop.vif = np->idx;
  27.340 -    (void)HYPERVISOR_net_io_op(&netop);
  27.341 -
  27.342 -    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
  27.343 -            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
  27.344 +    while ( (np->rx_resp_cons != np->rx->req_prod) ||
  27.345 +            (np->tx_resp_cons != np->tx->req_prod) )
  27.346      {
  27.347          barrier();
  27.348          current->state = TASK_INTERRUPTIBLE;
  27.349 @@ -406,55 +410,12 @@ static struct net_device_stats *network_
  27.350  }
  27.351  
  27.352  
  27.353 -static void netif_bringup_phase1(void *unused)
  27.354 +static void netif_status_change(netif_fe_interface_status_changed_t *status)
  27.355  {
  27.356      ctrl_msg_t                   cmsg;
  27.357      netif_fe_interface_connect_t up;
  27.358      struct net_device *dev;
  27.359      struct net_private *np;
  27.360 -
  27.361 -    dev = find_dev_by_handle(0);
  27.362 -    np  = dev->priv;
  27.363 -    
  27.364 -    /* Move from CLOSED to DISCONNECTED state. */
  27.365 -    np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
  27.366 -    np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
  27.367 -    memset(np->tx, 0, PAGE_SIZE);
  27.368 -    memset(np->rx, 0, PAGE_SIZE);
  27.369 -    np->state  = NETIF_STATE_DISCONNECTED;
  27.370 -
  27.371 -    /* Construct an interface-CONNECT message for the domain controller. */
  27.372 -    cmsg.type      = CMSG_NETIF_FE;
  27.373 -    cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
  27.374 -    cmsg.length    = sizeof(netif_fe_interface_connect_t);
  27.375 -    up.handle      = 0;
  27.376 -    up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
  27.377 -    up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
  27.378 -    memcpy(cmsg.msg, &up, sizeof(up));
  27.379 -
  27.380 -    /* Tell the controller to bring up the interface. */
  27.381 -    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  27.382 -}
  27.383 -
  27.384 -static void netif_bringup_phase2(void *unused)
  27.385 -{
  27.386 -    struct net_device *dev;
  27.387 -    struct net_private *np;
  27.388 -
  27.389 -    dev = find_dev_by_handle(0);
  27.390 -    np  = dev->priv;
  27.391 -    
  27.392 -    np->irq = bind_evtchn_to_irq(np->evtchn);
  27.393 -    (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
  27.394 -                      "netif", dev);
  27.395 -
  27.396 -    np->state = NETIF_STATE_CONNECTED;
  27.397 -}
  27.398 -
  27.399 -static void netif_status_change(netif_fe_interface_status_changed_t *status)
  27.400 -{
  27.401 -    struct net_device *dev;
  27.402 -    struct net_private *np;
  27.403      
  27.404      if ( status->handle != 0 )
  27.405      {
  27.406 @@ -470,31 +431,53 @@ static void netif_status_change(netif_fe
  27.407      {
  27.408      case NETIF_INTERFACE_STATUS_DESTROYED:
  27.409          printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
  27.410 -               netif_state);
  27.411 +               np->state);
  27.412          break;
  27.413  
  27.414      case NETIF_INTERFACE_STATUS_DISCONNECTED:
  27.415          if ( np->state != NETIF_STATE_CLOSED )
  27.416          {
  27.417              printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
  27.418 -                   " in state %d\n", netif_state);
  27.419 +                   " in state %d\n", np->state);
  27.420              break;
  27.421          }
  27.422 -        netif_statechange_tq.routine = netif_bringup_phase1;
  27.423 -        schedule_task(&netif_statechange_tq);
  27.424 +
  27.425 +        /* Move from CLOSED to DISCONNECTED state. */
  27.426 +        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
  27.427 +        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
  27.428 +        memset(np->tx, 0, PAGE_SIZE);
  27.429 +        memset(np->rx, 0, PAGE_SIZE);
  27.430 +        np->state  = NETIF_STATE_DISCONNECTED;
  27.431 +
  27.432 +        /* Construct an interface-CONNECT message for the domain controller. */
  27.433 +        cmsg.type      = CMSG_NETIF_FE;
  27.434 +        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
  27.435 +        cmsg.length    = sizeof(netif_fe_interface_connect_t);
  27.436 +        up.handle      = 0;
  27.437 +        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
  27.438 +        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
  27.439 +        memcpy(cmsg.msg, &up, sizeof(up));
  27.440 +        
  27.441 +        /* Tell the controller to bring up the interface. */
  27.442 +        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  27.443          break;
  27.444  
  27.445      case NETIF_INTERFACE_STATUS_CONNECTED:
  27.446          if ( np->state == NETIF_STATE_CLOSED )
  27.447          {
  27.448              printk(KERN_WARNING "Unexpected netif-CONNECTED message"
  27.449 -                   " in state %d\n", netif_state);
  27.450 +                   " in state %d\n", np->state);
  27.451              break;
  27.452          }
  27.453 -        np->evtchn = status->evtchn;
  27.454 +
  27.455          memcpy(dev->dev_addr, status->mac, ETH_ALEN);
  27.456 -        netif_statechange_tq.routine = netif_bringup_phase2;
  27.457 -        schedule_task(&netif_statechange_tq);
  27.458 +
  27.459 +        np->evtchn = status->evtchn;
  27.460 +        np->irq = bind_evtchn_to_irq(np->evtchn);
  27.461 +        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
  27.462 +                      dev->name, dev);
  27.463 +        
  27.464 +        np->state = NETIF_STATE_CONNECTED;
  27.465          break;
  27.466  
  27.467      default:
  27.468 @@ -532,10 +515,13 @@ static int __init init_module(void)
  27.469  {
  27.470      ctrl_msg_t                       cmsg;
  27.471      netif_fe_driver_status_changed_t st;
  27.472 -    int i, err;
  27.473 +    int err;
  27.474      struct net_device *dev;
  27.475      struct net_private *np;
  27.476  
  27.477 +    if ( start_info.flags & SIF_INITDOMAIN )
  27.478 +        return 0;
  27.479 +
  27.480      INIT_LIST_HEAD(&dev_list);
  27.481  
  27.482      if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
  27.483 @@ -562,7 +548,8 @@ static int __init init_module(void)
  27.484      np->dev = dev;
  27.485      list_add(&np->list, &dev_list);
  27.486  
  27.487 -    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
  27.488 +    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
  27.489 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
  27.490  
  27.491      /* Send a driver-UP notification to the domain controller. */
  27.492      cmsg.type      = CMSG_NETIF_FE;
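
The frontend keeps outstanding skbs in the tx_skbs[]/rx_skbs[] arrays, with entry 0 heading a chain of free ids as the comment near the top of this file notes. The GET_ID_FROM_FREELIST/ADD_ID_TO_FREELIST macros themselves are not shown in the diff, so the sketch below is a user-space model of their assumed behaviour.

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8                     /* small stand-in for NETIF_TX_RING_SIZE */

static void *skbs[RING_SIZE + 1];       /* entry 0 heads the chain of free ids */

static void freelist_init(void)
{
    int i;
    /* Chain every entry to its successor, as network_open() does above. */
    for (i = 0; i <= RING_SIZE; i++)
        skbs[i] = (void *)(unsigned long)(i + 1);
}

/* Assumed GET_ID_FROM_FREELIST behaviour: pop the head of the chain. */
static unsigned short get_id(void)
{
    unsigned short id = (unsigned short)(unsigned long)skbs[0];
    skbs[0] = skbs[id];
    return id;
}

/* Assumed ADD_ID_TO_FREELIST behaviour: push a freed id back on the chain. */
static void add_id(unsigned short id)
{
    skbs[id] = skbs[0];
    skbs[0] = (void *)(unsigned long)id;
}

int main(void)
{
    unsigned short a, b;

    freelist_init();
    a = get_id();
    b = get_id();                       /* ids 1 and 2 come off the chain first */
    skbs[a] = (void *)&a;               /* an allocated slot holds the real pointer */
    add_id(b);
    add_id(a);
    assert(get_id() == a);              /* freed ids are reused LIFO */
    printf("freelist model ok (a=%u b=%u)\n", a, b);
    return 0;
}
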
    28.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c	Thu May 06 14:53:19 2004 +0000
    28.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c	Fri May 07 14:53:28 2004 +0000
    28.3 @@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_acti
    28.4  static CONTROL_RING_IDX ctrl_if_tx_resp_cons;
    28.5  static CONTROL_RING_IDX ctrl_if_rx_req_cons;
    28.6  
    28.7 -/* Incoming message requests: primary message type -> message handler. */
    28.8 +/* Incoming message requests. */
    28.9 +    /* Primary message type -> message handler. */
   28.10  static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256];
   28.11 +    /* Primary message type -> callback in process context? */
   28.12 +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)];
   28.13 +    /* Is it late enough during bootstrap to use schedule_task()? */
   28.14 +static int safe_to_schedule_task;
   28.15 +    /* Passed to schedule_task(). */
   28.16 +static struct tq_struct ctrl_if_rxmsg_deferred_tq;
   28.17 +    /* Queue up messages to be handled in process context. */
   28.18 +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE];
   28.19 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod;
   28.20 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons;
   28.21  
   28.22  /* Incoming message responses: message identifier -> message handler/id. */
   28.23  static struct {
   28.24 @@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigne
   28.25      }
   28.26  }
   28.27  
   28.28 +static void __ctrl_if_rxmsg_deferred(void *unused)
   28.29 +{
   28.30 +    ctrl_msg_t *msg;
   28.31 +
   28.32 +    while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
   28.33 +    {
   28.34 +        msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
   28.35 +            ctrl_if_rxmsg_deferred_cons++)];
   28.36 +        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
   28.37 +    }
   28.38 +}
   28.39 +
   28.40  static void __ctrl_if_rx_tasklet(unsigned long data)
   28.41  {
   28.42      control_if_t *ctrl_if = get_ctrl_if();
   28.43 -    ctrl_msg_t   *msg;
   28.44 +    ctrl_msg_t    msg, *pmsg;
   28.45  
   28.46      while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
   28.47      {
   28.48 -        /*
   28.49 -         * We need no locking or barriers here. There will be one and only one
   28.50 -         * response as a result of each callback, so the callback handler
   28.51 -         * doesn't need to worry about the 'msg' being overwritten until:
   28.52 -         *  1. It returns (if the message must persist then it must be copied).
   28.53 -         *  2. A response is sent (the response may overwrite the request).
   28.54 -         */
   28.55 -        msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
   28.56 -        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
   28.57 +        pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
   28.58 +        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
   28.59 +        if ( msg.length != 0 )
   28.60 +            memcpy(msg.msg, pmsg->msg, msg.length);
   28.61 +        if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) )
   28.62 +        {
   28.63 +            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
   28.64 +                ctrl_if_rxmsg_deferred_prod++)];
   28.65 +            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
   28.66 +            schedule_task(&ctrl_if_rxmsg_deferred_tq);
   28.67 +        }
   28.68 +        else
   28.69 +        {
   28.70 +            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
   28.71 +        }
   28.72      }
   28.73  }
   28.74  
   28.75 @@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *m
   28.76      ctrl_if_notify_controller();
   28.77  }
   28.78  
   28.79 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd)
   28.80 +int ctrl_if_register_receiver(
   28.81 +    u8 type, 
   28.82 +    ctrl_msg_handler_t hnd, 
   28.83 +    unsigned int flags)
   28.84  {
   28.85 -    unsigned long flags;
   28.86 +    unsigned long _flags;
   28.87      int inuse;
   28.88  
   28.89 -    spin_lock_irqsave(&ctrl_if_lock, flags);
   28.90 +    spin_lock_irqsave(&ctrl_if_lock, _flags);
   28.91  
   28.92      inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler);
   28.93  
   28.94      if ( inuse )
   28.95 +    {
   28.96          printk(KERN_INFO "Receiver %p already established for control "
   28.97                 "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type);
   28.98 +    }
   28.99      else
  28.100 +    {
  28.101          ctrl_if_rxmsg_handler[type] = hnd;
  28.102 +        clear_bit(type, &ctrl_if_rxmsg_blocking_context);
  28.103 +        if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
  28.104 +        {
  28.105 +            set_bit(type, &ctrl_if_rxmsg_blocking_context);
  28.106 +            if ( !safe_to_schedule_task )
  28.107 +                BUG();
  28.108 +        }
  28.109 +    }
  28.110  
  28.111 -    spin_unlock_irqrestore(&ctrl_if_lock, flags);
  28.112 +    spin_unlock_irqrestore(&ctrl_if_lock, _flags);
  28.113  
  28.114      return !inuse;
  28.115  }
  28.116 @@ -326,6 +369,7 @@ void __init ctrl_if_init(void)
  28.117  
  28.118      for ( i = 0; i < 256; i++ )
  28.119          ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler;
  28.120 +    ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred;
  28.121  
  28.122      spin_lock_init(&ctrl_if_lock);
  28.123  
  28.124 @@ -333,6 +377,15 @@ void __init ctrl_if_init(void)
  28.125  }
  28.126  
  28.127  
  28.128 +/* This is called after it is safe to call schedule_task(). */
  28.129 +static int __init ctrl_if_late_setup(void)
  28.130 +{
  28.131 +    safe_to_schedule_task = 1;
  28.132 +    return 0;
  28.133 +}
  28.134 +__initcall(ctrl_if_late_setup);
  28.135 +
  28.136 +
  28.137  /*
  28.138   * !! The following are DANGEROUS FUNCTIONS !!
  28.139   * Use with care [for example, see xencons_force_flush()].
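
The ctrl_if changes above are what make CALLBACK_IN_BLOCKING_CONTEXT possible: every request is copied out of the shared ring first, then either handled in the tasklet or parked in a private ring that a schedule_task() callback drains in process context. A self-contained sketch of that flow, with a simplified message layout and a stand-in for the per-type blocking bitmap:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define CONTROL_RING_SIZE 8
#define MASK_CONTROL_IDX(_i) ((_i) & (CONTROL_RING_SIZE - 1))

/* Simplified stand-in for ctrl_msg_t: header fields, then the payload. */
typedef struct { unsigned char type, length; char msg[60]; } ctrl_msg_t;

static ctrl_msg_t rx_ring[CONTROL_RING_SIZE];   /* stands in for the shared ring */
static unsigned int rx_req_prod, rx_req_cons;

static ctrl_msg_t deferred[CONTROL_RING_SIZE];  /* private ring drained later */
static unsigned int deferred_prod, deferred_cons;

/* Stand-in for test_bit() on ctrl_if_rxmsg_blocking_context. */
static int type_needs_process_context(unsigned char t) { return t == 42; }

static void handle_inline(ctrl_msg_t *m)   { printf("tasklet handled type %u\n", m->type); }
static void handle_blocking(ctrl_msg_t *m) { printf("process context handled type %u\n", m->type); }

/* "Tasklet" pass: copy each request out of the shared ring before using it,
 * then dispatch it inline or park it for the deferred pass. */
static void rx_tasklet(void)
{
    while (rx_req_cons != rx_req_prod) {
        ctrl_msg_t msg, *pmsg = &rx_ring[MASK_CONTROL_IDX(rx_req_cons++)];
        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
        if (msg.length != 0)
            memcpy(msg.msg, pmsg->msg, msg.length);
        if (type_needs_process_context(msg.type))
            deferred[MASK_CONTROL_IDX(deferred_prod++)] = msg;  /* schedule_task() here */
        else
            handle_inline(&msg);
    }
}

/* "schedule_task() callback" pass: drain the private ring where sleeping is allowed. */
static void rx_deferred(void)
{
    while (deferred_cons != deferred_prod)
        handle_blocking(&deferred[MASK_CONTROL_IDX(deferred_cons++)]);
}

int main(void)
{
    rx_ring[0].type = 7;
    rx_ring[1].type = 42;
    rx_req_prod = 2;
    rx_tasklet();
    rx_deferred();
    return 0;
}
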
    29.1 --- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c	Thu May 06 14:53:19 2004 +0000
    29.2 +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c	Fri May 07 14:53:28 2004 +0000
    29.3 @@ -1626,7 +1626,7 @@ int __init blk_dev_init(void)
    29.4  	jsfd_init();
    29.5  #endif
    29.6  
    29.7 -#ifdef CONFIG_XEN_VBD
    29.8 +#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO)
    29.9      xlblk_init();
   29.10  #endif
   29.11  
    30.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h	Thu May 06 14:53:19 2004 +0000
    30.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h	Fri May 07 14:53:28 2004 +0000
    30.3 @@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *m
    30.4   * Register a receiver for typed messages from the domain controller. The 
    30.5   * handler (@hnd) is called for every received message of specified @type.
    30.6   * Returns TRUE (non-zero) if the handler was successfully registered.
     30.7 + * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
    30.8 + * occur in a context in which it is safe to yield (i.e., process context).
    30.9   */
   30.10 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd);
   30.11 +#define CALLBACK_IN_BLOCKING_CONTEXT 1
   30.12 +int ctrl_if_register_receiver(
   30.13 +    u8 type, 
   30.14 +    ctrl_msg_handler_t hnd,
   30.15 +    unsigned int flags);
   30.16  
   30.17  /*
   30.18   * Unregister a receiver for typed messages from the domain controller. The 
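
A sketch of registering under the new three-argument signature, mirroring the netif frontend call elsewhere in this changeset; my_ctrlif_rx and my_driver_init are hypothetical, and the handler signature is taken from how ctrl_if.c invokes receivers (kernel context, not standalone):

#include <linux/init.h>
#include <asm/ctrl_if.h>

/* Handler signature follows how ctrl_if.c calls registered receivers. */
static void my_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    /* Invoked via schedule_task(), so blocking calls are permitted here. */
    ctrl_if_send_response(msg);
}

static int __init my_driver_init(void)
{
    /* Pass 0 as the flags word to keep the old tasklet-context behaviour. */
    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, my_ctrlif_rx,
                                    CALLBACK_IN_BLOCKING_CONTEXT);
    return 0;
}
__initcall(my_driver_init);
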
    31.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h	Thu May 06 14:53:19 2004 +0000
    31.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h	Fri May 07 14:53:28 2004 +0000
    31.3 @@ -159,46 +159,11 @@ extern void iounmap(void *addr);
    31.4  extern void *bt_ioremap(unsigned long offset, unsigned long size);
    31.5  extern void bt_iounmap(void *addr, unsigned long size);
    31.6  
    31.7 -#ifdef CONFIG_XEN_PHYSDEV_ACCESS
    31.8 -
    31.9 -#ifdef CONFIG_HIGHMEM
   31.10 -#error "Highmem is not yet compatible with physical device access"
   31.11 -#endif
   31.12 -
   31.13 -/*
   31.14 - * The bus translation macros need special care if we are executing device
   31.15 - * accesses to/from other domains' memory. In these cases the virtual address
   31.16 - * is actually a temporary mapping in the 'vmalloc' space. The physical
   31.17 - * address will therefore be >max_low_pfn, and will not have a valid entry
   31.18 - * in the phys_to_mach mapping table.
   31.19 - */
   31.20 -static inline unsigned long phys_to_bus(unsigned long phys)
   31.21 -{
   31.22 -    extern unsigned long max_pfn;
   31.23 -    pgd_t *pgd; pmd_t *pmd; pte_t *pte;
   31.24 -    void *addr;
   31.25 -    unsigned long bus;
   31.26 -    if ( (phys >> PAGE_SHIFT) < max_pfn )
   31.27 -        return phys_to_machine(phys);
   31.28 -    addr = phys_to_virt(phys);
   31.29 -    pgd = pgd_offset_k(   (unsigned long)addr);
   31.30 -    pmd = pmd_offset(pgd, (unsigned long)addr);
   31.31 -    pte = pte_offset(pmd, (unsigned long)addr);
   31.32 -    bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK);
   31.33 -    return bus;
   31.34 -}
   31.35 -
   31.36 -#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x))
   31.37 -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
   31.38 -#define page_to_bus(_x) phys_to_bus(page_to_phys(_x))
   31.39 -
   31.40 -#else
   31.41 -
   31.42  #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x))
   31.43  #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
   31.44  #define page_to_bus(_x) phys_to_machine(page_to_phys(_x))
   31.45 -
   31.46 -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
   31.47 +#define bus_to_phys(_x) machine_to_phys(_x)
   31.48 +#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT))
   31.49  
   31.50  /*
   31.51   * readX/writeX() are used to access memory mapped devices. On some
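
With the CONFIG_XEN_PHYSDEV_ACCESS special case removed, bus addresses are always machine addresses. A small kernel-context sketch of the invariant the simplified macros provide (check_bus_roundtrip is illustrative only):

#include <asm/io.h>
#include <asm/page.h>

/* virt_to_bus() is now always phys_to_machine(virt_to_phys(p)), and
 * bus_to_virt() inverts it through machine_to_phys(). */
static void check_bus_roundtrip(void *p)
{
    unsigned long bus = virt_to_bus(p);

    if (bus_to_virt(bus) != p)
        BUG();
}
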
    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h	Fri May 07 14:53:28 2004 +0000
    32.3 @@ -0,0 +1,283 @@
    32.4 +#ifndef __i386_PCI_H
    32.5 +#define __i386_PCI_H
    32.6 +
    32.7 +#include <linux/config.h>
    32.8 +
    32.9 +#ifdef __KERNEL__
   32.10 +
   32.11 +/* Can be used to override the logic in pci_scan_bus for skipping
   32.12 +   already-configured bus numbers - to be used for buggy BIOSes
   32.13 +   or architectures with incomplete PCI setup by the loader */
   32.14 +
   32.15 +#ifdef CONFIG_PCI
   32.16 +extern unsigned int pcibios_assign_all_busses(void);
   32.17 +#else
   32.18 +#define pcibios_assign_all_busses()	0
   32.19 +#endif
   32.20 +#define pcibios_scan_all_fns()		0
   32.21 +
   32.22 +extern unsigned long pci_mem_start;
   32.23 +#define PCIBIOS_MIN_IO		0x1000
   32.24 +#define PCIBIOS_MIN_MEM		(pci_mem_start)
   32.25 +
   32.26 +void pcibios_config_init(void);
   32.27 +struct pci_bus * pcibios_scan_root(int bus);
   32.28 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
   32.29 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
   32.30 +
   32.31 +void pcibios_set_master(struct pci_dev *dev);
   32.32 +void pcibios_penalize_isa_irq(int irq);
   32.33 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
   32.34 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
   32.35 +
   32.36 +/* Dynamic DMA mapping stuff.
   32.37 + * i386 has everything mapped statically.
   32.38 + */
   32.39 +
   32.40 +#include <linux/types.h>
   32.41 +#include <linux/slab.h>
   32.42 +#include <asm/scatterlist.h>
   32.43 +#include <linux/string.h>
   32.44 +#include <asm/io.h>
   32.45 +
   32.46 +struct pci_dev;
   32.47 +
   32.48 +/* The networking and block device layers use this boolean for bounce
   32.49 + * buffer decisions.
   32.50 + */
   32.51 +#define PCI_DMA_BUS_IS_PHYS	(0)
   32.52 +
   32.53 +/* Allocate and map kernel buffer using consistent mode DMA for a device.
   32.54 + * hwdev should be valid struct pci_dev pointer for PCI devices,
   32.55 + * NULL for PCI-like buses (ISA, EISA).
   32.56 + * Returns non-NULL cpu-view pointer to the buffer if successful and
   32.57 + * sets *dma_addrp to the pci side dma address as well, else *dma_addrp
   32.58 + * is undefined.
   32.59 + */
   32.60 +extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
   32.61 +				  dma_addr_t *dma_handle);
   32.62 +
   32.63 +/* Free and unmap a consistent DMA buffer.
   32.64 + * cpu_addr is what was returned from pci_alloc_consistent,
    32.65 + * size must be the same as what was passed into pci_alloc_consistent,
   32.66 + * and likewise dma_addr must be the same as what *dma_addrp was set to.
   32.67 + *
   32.68 + * References to the memory and mappings associated with cpu_addr/dma_addr
   32.69 + * past this call are illegal.
   32.70 + */
   32.71 +extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
   32.72 +				void *vaddr, dma_addr_t dma_handle);
   32.73 +
   32.74 +/* Map a single buffer of the indicated size for DMA in streaming mode.
   32.75 + * The 32-bit bus address to use is returned.
   32.76 + *
   32.77 + * Once the device is given the dma address, the device owns this memory
   32.78 + * until either pci_unmap_single or pci_dma_sync_single is performed.
   32.79 + */
   32.80 +static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
   32.81 +					size_t size, int direction)
   32.82 +{
   32.83 +	if (direction == PCI_DMA_NONE)
   32.84 +		out_of_line_bug();
   32.85 +	flush_write_buffers();
   32.86 +	return virt_to_bus(ptr);
   32.87 +}
   32.88 +
   32.89 +/* Unmap a single streaming mode DMA translation.  The dma_addr and size
   32.90 + * must match what was provided for in a previous pci_map_single call.  All
   32.91 + * other usages are undefined.
   32.92 + *
    32.93 + * After this call, reads by the cpu to the buffer are guaranteed to see
   32.94 + * whatever the device wrote there.
   32.95 + */
   32.96 +static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
   32.97 +				    size_t size, int direction)
   32.98 +{
   32.99 +	if (direction == PCI_DMA_NONE)
  32.100 +		out_of_line_bug();
  32.101 +	/* Nothing to do */
  32.102 +}
  32.103 +
  32.104 +/*
  32.105 + * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. identical
  32.106 + * to pci_map_single, but takes a struct page instead of a virtual address
  32.107 + */
  32.108 +static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
  32.109 +				      unsigned long offset, size_t size, int direction)
  32.110 +{
  32.111 +	if (direction == PCI_DMA_NONE)
  32.112 +		out_of_line_bug();
  32.113 +
  32.114 +	return page_to_bus(page) + offset;
  32.115 +}
  32.116 +
  32.117 +static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
  32.118 +				  size_t size, int direction)
  32.119 +{
  32.120 +	if (direction == PCI_DMA_NONE)
  32.121 +		out_of_line_bug();
  32.122 +	/* Nothing to do */
  32.123 +}
  32.124 +
  32.125 +/* pci_unmap_{page,single} is a nop so... */
  32.126 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
  32.127 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
  32.128 +#define pci_unmap_addr(PTR, ADDR_NAME)		(0)
  32.129 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)	do { } while (0)
  32.130 +#define pci_unmap_len(PTR, LEN_NAME)		(0)
  32.131 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
  32.132 +
  32.133 +/* Map a set of buffers described by scatterlist in streaming
   32.134 + * mode for DMA.  This is the scatter-gather version of the
  32.135 + * above pci_map_single interface.  Here the scatter gather list
  32.136 + * elements are each tagged with the appropriate dma address
  32.137 + * and length.  They are obtained via sg_dma_{address,length}(SG).
  32.138 + *
  32.139 + * NOTE: An implementation may be able to use a smaller number of
  32.140 + *       DMA address/length pairs than there are SG table elements.
  32.141 + *       (for example via virtual mapping capabilities)
  32.142 + *       The routine returns the number of addr/length pairs actually
  32.143 + *       used, at most nents.
  32.144 + *
  32.145 + * Device ownership issues as mentioned above for pci_map_single are
  32.146 + * the same here.
  32.147 + */
  32.148 +static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
  32.149 +			     int nents, int direction)
  32.150 +{
  32.151 +	int i;
  32.152 +
  32.153 +	if (direction == PCI_DMA_NONE)
  32.154 +		out_of_line_bug();
  32.155 + 
  32.156 + 	/*
  32.157 + 	 * temporary 2.4 hack
  32.158 + 	 */
  32.159 + 	for (i = 0; i < nents; i++ ) {
  32.160 + 		if (sg[i].address && sg[i].page)
  32.161 + 			out_of_line_bug();
  32.162 + 		else if (!sg[i].address && !sg[i].page)
  32.163 + 			out_of_line_bug();
  32.164 + 
  32.165 + 		if (sg[i].address)
  32.166 + 			sg[i].dma_address = virt_to_bus(sg[i].address);
  32.167 + 		else
  32.168 + 			sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
  32.169 + 	}
  32.170 + 
  32.171 +	flush_write_buffers();
  32.172 +	return nents;
  32.173 +}
  32.174 +
  32.175 +/* Unmap a set of streaming mode DMA translations.
  32.176 + * Again, cpu read rules concerning calls here are the same as for
  32.177 + * pci_unmap_single() above.
  32.178 + */
  32.179 +static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
  32.180 +				int nents, int direction)
  32.181 +{
  32.182 +	if (direction == PCI_DMA_NONE)
  32.183 +		out_of_line_bug();
  32.184 +	/* Nothing to do */
  32.185 +}
  32.186 +
  32.187 +/* Make physical memory consistent for a single
  32.188 + * streaming mode DMA translation after a transfer.
  32.189 + *
  32.190 + * If you perform a pci_map_single() but wish to interrogate the
  32.191 + * buffer using the cpu, yet do not wish to teardown the PCI dma
  32.192 + * mapping, you must call this function before doing so.  At the
  32.193 + * next point you give the PCI dma address back to the card, the
  32.194 + * device again owns the buffer.
  32.195 + */
  32.196 +static inline void pci_dma_sync_single(struct pci_dev *hwdev,
  32.197 +				       dma_addr_t dma_handle,
  32.198 +				       size_t size, int direction)
  32.199 +{
  32.200 +	if (direction == PCI_DMA_NONE)
  32.201 +		out_of_line_bug();
  32.202 +	flush_write_buffers();
  32.203 +}
  32.204 +
  32.205 +/* Make physical memory consistent for a set of streaming
  32.206 + * mode DMA translations after a transfer.
  32.207 + *
  32.208 + * The same as pci_dma_sync_single but for a scatter-gather list,
  32.209 + * same rules and usage.
  32.210 + */
  32.211 +static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
  32.212 +				   struct scatterlist *sg,
  32.213 +				   int nelems, int direction)
  32.214 +{
  32.215 +	if (direction == PCI_DMA_NONE)
  32.216 +		out_of_line_bug();
  32.217 +	flush_write_buffers();
  32.218 +}
  32.219 +
  32.220 +/* Return whether the given PCI device DMA address mask can
  32.221 + * be supported properly.  For example, if your device can
  32.222 + * only drive the low 24-bits during PCI bus mastering, then
  32.223 + * you would pass 0x00ffffff as the mask to this function.
  32.224 + */
  32.225 +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
  32.226 +{
  32.227 +        /*
  32.228 +         * we fall back to GFP_DMA when the mask isn't all 1s,
  32.229 +         * so we can't guarantee allocations that must be
  32.230 +         * within a tighter range than GFP_DMA..
  32.231 +         */
  32.232 +        if(mask < 0x00ffffff)
  32.233 +                return 0;
  32.234 +
  32.235 +	return 1;
  32.236 +}
  32.237 +
  32.238 +/* This is always fine. */
  32.239 +#define pci_dac_dma_supported(pci_dev, mask)	(1)
  32.240 +
  32.241 +static __inline__ dma64_addr_t
  32.242 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
  32.243 +{
  32.244 +	return ((dma64_addr_t) page_to_bus(page) +
  32.245 +		(dma64_addr_t) offset);
  32.246 +}
  32.247 +
  32.248 +static __inline__ struct page *
  32.249 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
  32.250 +{
  32.251 +	return bus_to_page(dma_addr);
  32.252 +}
  32.253 +
  32.254 +static __inline__ unsigned long
  32.255 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
  32.256 +{
  32.257 +	return (dma_addr & ~PAGE_MASK);
  32.258 +}
  32.259 +
  32.260 +static __inline__ void
  32.261 +pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
  32.262 +{
  32.263 +	flush_write_buffers();
  32.264 +}
  32.265 +
  32.266 +/* These macros should be used after a pci_map_sg call has been done
  32.267 + * to get bus addresses of each of the SG entries and their lengths.
  32.268 + * You should only work with the number of sg entries pci_map_sg
  32.269 + * returns.
  32.270 + */
  32.271 +#define sg_dma_address(sg)	((sg)->dma_address)
  32.272 +#define sg_dma_len(sg)		((sg)->length)
  32.273 +
  32.274 +/* Return the index of the PCI controller for device. */
  32.275 +static inline int pci_controller_num(struct pci_dev *dev)
  32.276 +{
  32.277 +	return 0;
  32.278 +}
  32.279 +
  32.280 +#define HAVE_PCI_MMAP
  32.281 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
  32.282 +			       enum pci_mmap_state mmap_state, int write_combine);
  32.283 +
  32.284 +#endif /* __KERNEL__ */
  32.285 +
  32.286 +#endif /* __i386_PCI_H */
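
This header is the stock i386 DMA-mapping layer with its bus conversions routed through the Xen-aware macros in <asm-xen/io.h>. A hedged sketch of typical driver usage follows; mydev_send and mydev_tell_hw are hypothetical and not part of this changeset (kernel context, not standalone):

#include <linux/pci.h>

/* mydev_tell_hw(): hypothetical helper that writes the bus address and
 * length into the (imaginary) device's descriptor registers. */
static void mydev_tell_hw(dma_addr_t bus, size_t len);

static void mydev_send(struct pci_dev *pdev, void *buf, size_t len)
{
    dma_addr_t bus;

    /* Under Xen this returns virt_to_bus(buf), i.e. a machine address
     * that the device can master directly. */
    bus = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);

    mydev_tell_hw(bus, len);

    /* A no-op on this architecture, but keeps the driver portable. */
    pci_unmap_single(pdev, bus, len, PCI_DMA_TODEVICE);
}
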
    33.1 --- a/xenolinux-2.4.26-sparse/mkbuildtree	Thu May 06 14:53:19 2004 +0000
    33.2 +++ b/xenolinux-2.4.26-sparse/mkbuildtree	Fri May 07 14:53:28 2004 +0000
    33.3 @@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h
    33.4  ln -sf ../asm-i386/namei.h 
    33.5  ln -sf ../asm-i386/param.h 
    33.6  ln -sf ../asm-i386/parport.h 
    33.7 -ln -sf ../asm-i386/pci.h
    33.8  ln -sf ../asm-i386/pgtable-3level.h 
    33.9  ln -sf ../asm-i386/poll.h 
   33.10  ln -sf ../asm-i386/posix_types.h 
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c	Fri May 07 14:53:28 2004 +0000
    34.3 @@ -0,0 +1,930 @@
    34.4 +/*
    34.5 + *  linux/mm/page_alloc.c
    34.6 + *
    34.7 + *  Manages the free list, the system allocates free pages here.
    34.8 + *  Note that kmalloc() lives in slab.c
    34.9 + *
   34.10 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   34.11 + *  Swap reorganised 29.12.95, Stephen Tweedie
   34.12 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   34.13 + *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   34.14 + *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   34.15 + *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   34.16 + */
   34.17 +
   34.18 +#include <linux/config.h>
   34.19 +#include <linux/mm.h>
   34.20 +#include <linux/swap.h>
   34.21 +#include <linux/swapctl.h>
   34.22 +#include <linux/interrupt.h>
   34.23 +#include <linux/pagemap.h>
   34.24 +#include <linux/bootmem.h>
   34.25 +#include <linux/slab.h>
   34.26 +#include <linux/module.h>
   34.27 +
   34.28 +int nr_swap_pages;
   34.29 +int nr_active_pages;
   34.30 +int nr_inactive_pages;
   34.31 +LIST_HEAD(inactive_list);
   34.32 +LIST_HEAD(active_list);
   34.33 +pg_data_t *pgdat_list;
   34.34 +
   34.35 +/*
   34.36 + *
   34.37 + * The zone_table array is used to look up the address of the
   34.38 + * struct zone corresponding to a given zone number (ZONE_DMA,
   34.39 + * ZONE_NORMAL, or ZONE_HIGHMEM).
   34.40 + */
   34.41 +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
   34.42 +EXPORT_SYMBOL(zone_table);
   34.43 +
   34.44 +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
   34.45 +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
   34.46 +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
   34.47 +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
   34.48 +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
   34.49 +
   34.50 +int vm_gfp_debug = 0;
   34.51 +
   34.52 +/*
   34.53 + * Temporary debugging check.
   34.54 + */
   34.55 +#define BAD_RANGE(zone, page)						\
   34.56 +(									\
   34.57 +	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
   34.58 +	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
   34.59 +	|| ((zone) != page_zone(page))					\
   34.60 +)
   34.61 +
   34.62 +/*
   34.63 + * Freeing function for a buddy system allocator.
   34.64 + * Contrary to prior comments, this is *NOT* hairy, and there
   34.65 + * is no reason for anyone not to understand it.
   34.66 + *
   34.67 + * The concept of a buddy system is to maintain direct-mapped tables
   34.68 + * (containing bit values) for memory blocks of various "orders".
   34.69 + * The bottom level table contains the map for the smallest allocatable
   34.70 + * units of memory (here, pages), and each level above it describes
   34.71 + * pairs of units from the levels below, hence, "buddies".
   34.72 + * At a high level, all that happens here is marking the table entry
   34.73 + * at the bottom level available, and propagating the changes upward
   34.74 + * as necessary, plus some accounting needed to play nicely with other
   34.75 + * parts of the VM system.
   34.76 + * At each level, we keep one bit for each pair of blocks, which
   34.77 + * is set to 1 iff only one of the pair is allocated.  So when we
   34.78 + * are allocating or freeing one, we can derive the state of the
   34.79 + * other.  That is, if we allocate a small block, and both were   
   34.80 + * free, the remainder of the region must be split into blocks.   
   34.81 + * If a block is freed, and its buddy is also free, then this
   34.82 + * triggers coalescing into a block of larger size.            
   34.83 + *
   34.84 + * -- wli
   34.85 + */
   34.86 +
   34.87 +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
   34.88 +static void __free_pages_ok (struct page *page, unsigned int order)
   34.89 +{
   34.90 +	unsigned long index, page_idx, mask, flags;
   34.91 +	free_area_t *area;
   34.92 +	struct page *base;
   34.93 +	zone_t *zone;
   34.94 +
   34.95 +	/*
   34.96 +	 * Yes, think what happens when other parts of the kernel take 
   34.97 +	 * a reference to a page in order to pin it for io. -ben
   34.98 +	 */
   34.99 +	if (PageLRU(page)) {
  34.100 +		if (unlikely(in_interrupt()))
  34.101 +			BUG();
  34.102 +		lru_cache_del(page);
  34.103 +	}
  34.104 +
  34.105 +	if (page->buffers)
  34.106 +		BUG();
  34.107 +	if (page->mapping)
  34.108 +		return (*(void(*)(struct page *))page->mapping)(page);
  34.109 +	if (!VALID_PAGE(page))
  34.110 +		BUG();
  34.111 +	if (PageLocked(page))
  34.112 +		BUG();
  34.113 +	if (PageActive(page))
  34.114 +		BUG();
  34.115 +	ClearPageReferenced(page);
  34.116 +	ClearPageDirty(page);
  34.117 +
  34.118 +	if (current->flags & PF_FREE_PAGES)
  34.119 +		goto local_freelist;
  34.120 + back_local_freelist:
  34.121 +
  34.122 +	zone = page_zone(page);
  34.123 +
  34.124 +	mask = (~0UL) << order;
  34.125 +	base = zone->zone_mem_map;
  34.126 +	page_idx = page - base;
  34.127 +	if (page_idx & ~mask)
  34.128 +		BUG();
  34.129 +	index = page_idx >> (1 + order);
  34.130 +
  34.131 +	area = zone->free_area + order;
  34.132 +
  34.133 +	spin_lock_irqsave(&zone->lock, flags);
  34.134 +
  34.135 +	zone->free_pages -= mask;
  34.136 +
  34.137 +	while (mask + (1 << (MAX_ORDER-1))) {
  34.138 +		struct page *buddy1, *buddy2;
  34.139 +
  34.140 +		if (area >= zone->free_area + MAX_ORDER)
  34.141 +			BUG();
  34.142 +		if (!__test_and_change_bit(index, area->map))
  34.143 +			/*
  34.144 +			 * the buddy page is still allocated.
  34.145 +			 */
  34.146 +			break;
  34.147 +		/*
  34.148 +		 * Move the buddy up one level.
  34.149 +		 * This code is taking advantage of the identity:
  34.150 +		 * 	-mask = 1+~mask
  34.151 +		 */
  34.152 +		buddy1 = base + (page_idx ^ -mask);
  34.153 +		buddy2 = base + page_idx;
  34.154 +		if (BAD_RANGE(zone,buddy1))
  34.155 +			BUG();
  34.156 +		if (BAD_RANGE(zone,buddy2))
  34.157 +			BUG();
  34.158 +
  34.159 +		list_del(&buddy1->list);
  34.160 +		mask <<= 1;
  34.161 +		area++;
  34.162 +		index >>= 1;
  34.163 +		page_idx &= mask;
  34.164 +	}
  34.165 +	list_add(&(base + page_idx)->list, &area->free_list);
  34.166 +
  34.167 +	spin_unlock_irqrestore(&zone->lock, flags);
  34.168 +	return;
  34.169 +
  34.170 + local_freelist:
  34.171 +	if (current->nr_local_pages)
  34.172 +		goto back_local_freelist;
  34.173 +	if (in_interrupt())
  34.174 +		goto back_local_freelist;		
  34.175 +
  34.176 +	list_add(&page->list, &current->local_pages);
  34.177 +	page->index = order;
  34.178 +	current->nr_local_pages++;
  34.179 +}
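/*
 * Notes on the arithmetic in __free_pages_ok() above (illustrative, not
 * from the original changeset): with mask == ~0UL << order the identity
 * -mask == 1UL << order holds, so "zone->free_pages -= mask" is just
 * "free_pages += 1UL << order", and the loop condition
 * "mask + (1 << (MAX_ORDER-1))" reaches zero exactly when the block has
 * grown to order MAX_ORDER-1 and no further coalescing is possible.
 * For order 2, -mask == 4 and buddy1 = base + (page_idx ^ 4): order-2
 * buddies differ only in bit 2 of their page index.
 */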
  34.180 +
  34.181 +#define MARK_USED(index, order, area) \
  34.182 +	__change_bit((index) >> (1+(order)), (area)->map)
  34.183 +
  34.184 +static inline struct page * expand (zone_t *zone, struct page *page,
  34.185 +	 unsigned long index, int low, int high, free_area_t * area)
  34.186 +{
  34.187 +	unsigned long size = 1 << high;
  34.188 +
  34.189 +	while (high > low) {
  34.190 +		if (BAD_RANGE(zone,page))
  34.191 +			BUG();
  34.192 +		area--;
  34.193 +		high--;
  34.194 +		size >>= 1;
  34.195 +		list_add(&(page)->list, &(area)->free_list);
  34.196 +		MARK_USED(index, high, area);
  34.197 +		index += size;
  34.198 +		page += size;
  34.199 +	}
  34.200 +	if (BAD_RANGE(zone,page))
  34.201 +		BUG();
  34.202 +	return page;
  34.203 +}
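/*
 * Worked example (illustrative): suppose rmqueue() below hands expand() an
 * order-3 block starting at index 0 to satisfy an order-0 request
 * (high == 3, low == 0).  Each pass returns the leading half to the free
 * lists: the order-2 block [0..3], then the order-1 block [4..5], then the
 * order-0 page [6], each split recorded via MARK_USED; page 7 is what the
 * caller finally receives.
 */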
  34.204 +
  34.205 +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
  34.206 +static struct page * rmqueue(zone_t *zone, unsigned int order)
  34.207 +{
  34.208 +	free_area_t * area = zone->free_area + order;
  34.209 +	unsigned int curr_order = order;
  34.210 +	struct list_head *head, *curr;
  34.211 +	unsigned long flags;
  34.212 +	struct page *page;
  34.213 +
  34.214 +	spin_lock_irqsave(&zone->lock, flags);
  34.215 +	do {
  34.216 +		head = &area->free_list;
  34.217 +		curr = head->next;
  34.218 +
  34.219 +		if (curr != head) {
  34.220 +			unsigned int index;
  34.221 +
  34.222 +			page = list_entry(curr, struct page, list);
  34.223 +			if (BAD_RANGE(zone,page))
  34.224 +				BUG();
  34.225 +			list_del(curr);
  34.226 +			index = page - zone->zone_mem_map;
  34.227 +			if (curr_order != MAX_ORDER-1)
  34.228 +				MARK_USED(index, curr_order, area);
  34.229 +			zone->free_pages -= 1UL << order;
  34.230 +
  34.231 +			page = expand(zone, page, index, order, curr_order, area);
  34.232 +			spin_unlock_irqrestore(&zone->lock, flags);
  34.233 +
  34.234 +			set_page_count(page, 1);
  34.235 +			if (BAD_RANGE(zone,page))
  34.236 +				BUG();
  34.237 +			if (PageLRU(page))
  34.238 +				BUG();
  34.239 +			if (PageActive(page))
  34.240 +				BUG();
  34.241 +			return page;	
  34.242 +		}
  34.243 +		curr_order++;
  34.244 +		area++;
  34.245 +	} while (curr_order < MAX_ORDER);
  34.246 +	spin_unlock_irqrestore(&zone->lock, flags);
  34.247 +
  34.248 +	return NULL;
  34.249 +}
  34.250 +
  34.251 +#ifndef CONFIG_DISCONTIGMEM
  34.252 +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
  34.253 +{
  34.254 +	return __alloc_pages(gfp_mask, order,
  34.255 +		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
  34.256 +}
  34.257 +#endif
  34.258 +
  34.259 +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
  34.260 +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
  34.261 +{
  34.262 +	struct page * page = NULL;
  34.263 +	int __freed;
  34.264 +
  34.265 +	if (in_interrupt())
  34.266 +		BUG();
  34.267 +
  34.268 +	current->allocation_order = order;
  34.269 +	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
  34.270 +
  34.271 +	__freed = try_to_free_pages_zone(classzone, gfp_mask);
  34.272 +
  34.273 +	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
  34.274 +
  34.275 +	if (current->nr_local_pages) {
  34.276 +		struct list_head * entry, * local_pages;
  34.277 +		struct page * tmp;
  34.278 +		int nr_pages;
  34.279 +
  34.280 +		local_pages = &current->local_pages;
  34.281 +
  34.282 +		if (likely(__freed)) {
  34.283 +			/* pick from the last inserted so we're lifo */
  34.284 +			entry = local_pages->next;
  34.285 +			do {
  34.286 +				tmp = list_entry(entry, struct page, list);
  34.287 +				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
  34.288 +					list_del(entry);
  34.289 +					current->nr_local_pages--;
  34.290 +					set_page_count(tmp, 1);
  34.291 +					page = tmp;
  34.292 +
  34.293 +					if (page->buffers)
  34.294 +						BUG();
  34.295 +					if (page->mapping)
  34.296 +						BUG();
  34.297 +					if (!VALID_PAGE(page))
  34.298 +						BUG();
  34.299 +					if (PageLocked(page))
  34.300 +						BUG();
  34.301 +					if (PageLRU(page))
  34.302 +						BUG();
  34.303 +					if (PageActive(page))
  34.304 +						BUG();
  34.305 +					if (PageDirty(page))
  34.306 +						BUG();
  34.307 +
  34.308 +					break;
  34.309 +				}
  34.310 +			} while ((entry = entry->next) != local_pages);
  34.311 +		}
  34.312 +
  34.313 +		nr_pages = current->nr_local_pages;
  34.314 +		/* free in reverse order so that the global order will be lifo */
  34.315 +		while ((entry = local_pages->prev) != local_pages) {
  34.316 +			list_del(entry);
  34.317 +			tmp = list_entry(entry, struct page, list);
  34.318 +			__free_pages_ok(tmp, tmp->index);
  34.319 +			if (!nr_pages--)
  34.320 +				BUG();
  34.321 +		}
  34.322 +		current->nr_local_pages = 0;
  34.323 +	}
  34.324 +
  34.325 +	*freed = __freed;
  34.326 +	return page;
  34.327 +}
  34.328 +
  34.329 +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
  34.330 +{
  34.331 +	long free = zone->free_pages - (1UL << order);
  34.332 +	return free >= 0 ? free : 0;
  34.333 +}
  34.334 +
  34.335 +/*
  34.336 + * This is the 'heart' of the zoned buddy allocator:
  34.337 + */
  34.338 +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
  34.339 +{
  34.340 +	zone_t **zone, * classzone;
  34.341 +	struct page * page;
  34.342 +	int freed, class_idx;
  34.343 +
  34.344 +	zone = zonelist->zones;
  34.345 +	classzone = *zone;
  34.346 +	class_idx = zone_idx(classzone);
  34.347 +
  34.348 +	for (;;) {
  34.349 +		zone_t *z = *(zone++);
  34.350 +		if (!z)
  34.351 +			break;
  34.352 +
  34.353 +		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
  34.354 +			page = rmqueue(z, order);
  34.355 +			if (page)
  34.356 +				return page;
  34.357 +		}
  34.358 +	}
  34.359 +
  34.360 +	classzone->need_balance = 1;
  34.361 +	mb();
  34.362 +	if (waitqueue_active(&kswapd_wait))
  34.363 +		wake_up_interruptible(&kswapd_wait);
  34.364 +
  34.365 +	zone = zonelist->zones;
  34.366 +	for (;;) {
  34.367 +		unsigned long min;
  34.368 +		zone_t *z = *(zone++);
  34.369 +		if (!z)
  34.370 +			break;
  34.371 +
  34.372 +		min = z->watermarks[class_idx].min;
  34.373 +		if (!(gfp_mask & __GFP_WAIT))
  34.374 +			min >>= 2;
  34.375 +		if (zone_free_pages(z, order) > min) {
  34.376 +			page = rmqueue(z, order);
  34.377 +			if (page)
  34.378 +				return page;
  34.379 +		}
  34.380 +	}
  34.381 +
   34.382 +	/* here we're in the low-on-memory slow path */
  34.383 +
  34.384 +	if ((current->flags & PF_MEMALLOC) && 
  34.385 +			(!in_interrupt() || (current->flags & PF_MEMDIE))) {
  34.386 +		zone = zonelist->zones;
  34.387 +		for (;;) {
  34.388 +			zone_t *z = *(zone++);
  34.389 +			if (!z)
  34.390 +				break;
  34.391 +
  34.392 +			page = rmqueue(z, order);
  34.393 +			if (page)
  34.394 +				return page;
  34.395 +		}
  34.396 +		return NULL;
  34.397 +	}
  34.398 +
  34.399 +	/* Atomic allocations - we can't balance anything */
  34.400 +	if (!(gfp_mask & __GFP_WAIT))
  34.401 +		goto out;
  34.402 +
  34.403 + rebalance:
  34.404 +	page = balance_classzone(classzone, gfp_mask, order, &freed);
  34.405 +	if (page)
  34.406 +		return page;
  34.407 +
  34.408 +	zone = zonelist->zones;
  34.409 +	if (likely(freed)) {
  34.410 +		for (;;) {
  34.411 +			zone_t *z = *(zone++);
  34.412 +			if (!z)
  34.413 +				break;
  34.414 +
  34.415 +			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
  34.416 +				page = rmqueue(z, order);
  34.417 +				if (page)
  34.418 +					return page;
  34.419 +			}
  34.420 +		}
  34.421 +		goto rebalance;
  34.422 +	} else {
   34.423 +		/*
   34.424 +		 * Check whether another task has been killed in the
   34.425 +		 * meantime; in that case the allocation can succeed.
   34.426 +		 */
  34.427 +		for (;;) {
  34.428 +			zone_t *z = *(zone++);
  34.429 +			if (!z)
  34.430 +				break;
  34.431 +
  34.432 +			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
  34.433 +				page = rmqueue(z, order);
  34.434 +				if (page)
  34.435 +					return page;
  34.436 +			}
  34.437 +		}
  34.438 +	}
  34.439 +
  34.440 + out:
  34.441 +	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
  34.442 +	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
  34.443 +	if (unlikely(vm_gfp_debug))
  34.444 +		dump_stack();
  34.445 +	return NULL;
  34.446 +}
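/*
 * Illustrative summary of the fallback ladder above, using the defaults
 * set up in free_area_init_core() below: a ZONE_NORMAL classzone of 16384
 * real pages gets mask = 16384/128 = 128, i.e. watermarks min = 128,
 * low = 256, high = 384 pages.  The first scan only takes zones above
 * "low"; after waking kswapd the second scan accepts anything above "min"
 * (quartered for !__GFP_WAIT callers); PF_MEMALLOC callers bypass the
 * watermarks entirely; atomic callers then give up; and after an
 * unsuccessful balance_classzone() the final scan insists on "high", in
 * case another task freed memory in the meantime.
 */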
  34.447 +
  34.448 +/*
  34.449 + * Common helper functions.
  34.450 + */
  34.451 +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
  34.452 +{
  34.453 +	struct page * page;
  34.454 +
  34.455 +	page = alloc_pages(gfp_mask, order);
  34.456 +	if (!page)
  34.457 +		return 0;
  34.458 +	return (unsigned long) page_address(page);
  34.459 +}
  34.460 +
  34.461 +unsigned long get_zeroed_page(unsigned int gfp_mask)
  34.462 +{
  34.463 +	struct page * page;
  34.464 +
  34.465 +	page = alloc_pages(gfp_mask, 0);
  34.466 +	if (page) {
  34.467 +		void *address = page_address(page);
  34.468 +		clear_page(address);
  34.469 +		return (unsigned long) address;
  34.470 +	}
  34.471 +	return 0;
  34.472 +}
  34.473 +
  34.474 +void __free_pages(struct page *page, unsigned int order)
  34.475 +{
  34.476 +	if (!PageReserved(page) && put_page_testzero(page))
  34.477 +		__free_pages_ok(page, order);
  34.478 +}
  34.479 +
  34.480 +void free_pages(unsigned long addr, unsigned int order)
  34.481 +{
  34.482 +	if (addr != 0)
  34.483 +		__free_pages(virt_to_page(addr), order);
  34.484 +}
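/*
 * Usage sketch for the helpers above (illustrative only; "buf" is a
 * hypothetical caller-side variable):
 *
 *	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);
 *	if (buf != 0) {
 *		memset((void *)buf, 0, 2 * PAGE_SIZE);
 *		free_pages(buf, 1);
 *	}
 *
 * Order 1 means two contiguous pages; the order passed to free_pages()
 * must match the one used for allocation.  get_zeroed_page(GFP_KERNEL)
 * is the order-0 variant that also clears the page before handing it back.
 */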
  34.485 +
  34.486 +/*
  34.487 + * Total amount of free (allocatable) RAM:
  34.488 + */
  34.489 +unsigned int nr_free_pages (void)
  34.490 +{
  34.491 +	unsigned int sum = 0;
  34.492 +	zone_t *zone;
  34.493 +
  34.494 +	for_each_zone(zone)
  34.495 +		sum += zone->free_pages;
  34.496 +
  34.497 +	return sum;
  34.498 +}
  34.499 +
  34.500 +/*
  34.501 + * Amount of free RAM allocatable as buffer memory:
  34.502 + */
  34.503 +unsigned int nr_free_buffer_pages (void)
  34.504 +{
  34.505 +	pg_data_t *pgdat;
  34.506 +	unsigned int sum = 0;
  34.507 +	zonelist_t *zonelist;
  34.508 +	zone_t **zonep, *zone;
  34.509 +
  34.510 +	for_each_pgdat(pgdat) {
  34.511 +		int class_idx;
  34.512 +		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
  34.513 +		zonep = zonelist->zones;
  34.514 +		zone = *zonep;
  34.515 +		class_idx = zone_idx(zone);
  34.516 +
  34.517 +		sum += zone->nr_cache_pages;
  34.518 +		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
  34.519 +			int free = zone->free_pages - zone->watermarks[class_idx].high;
  34.520 +			if (free <= 0)
  34.521 +				continue;
  34.522 +			sum += free;
  34.523 +		}
  34.524 +	}
  34.525 +
  34.526 +	return sum;
  34.527 +}
  34.528 +
  34.529 +#if CONFIG_HIGHMEM
  34.530 +unsigned int nr_free_highpages (void)
  34.531 +{
  34.532 +	pg_data_t *pgdat;
  34.533 +	unsigned int pages = 0;
  34.534 +
  34.535 +	for_each_pgdat(pgdat)
  34.536 +		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
  34.537 +
  34.538 +	return pages;
  34.539 +}
  34.540 +
  34.541 +unsigned int freeable_lowmem(void)
  34.542 +{
  34.543 +	unsigned int pages = 0;
  34.544 +	pg_data_t *pgdat;
  34.545 +
  34.546 +	for_each_pgdat(pgdat) {
  34.547 +		pages += pgdat->node_zones[ZONE_DMA].free_pages;
  34.548 +		pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
  34.549 +		pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
  34.550 +		pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
  34.551 +		pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
  34.552 +		pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
  34.553 +	}
  34.554 +
  34.555 +	return pages;
  34.556 +}
  34.557 +#endif
  34.558 +
  34.559 +#define K(x) ((x) << (PAGE_SHIFT-10))
  34.560 +
  34.561 +/*
   34.562 + * Show free area list (used inside shift_scroll-lock stuff).
   34.563 + * The per-order free block counts printed below give a rough picture
   34.564 + * of how fragmented each zone's free memory is.
  34.565 + */
  34.566 +void show_free_areas_core(pg_data_t *pgdat)
  34.567 +{
  34.568 + 	unsigned int order;
  34.569 +	unsigned type;
  34.570 +	pg_data_t *tmpdat = pgdat;
  34.571 +
  34.572 +	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
  34.573 +		K(nr_free_pages()),
  34.574 +		K(nr_free_highpages()));
  34.575 +
  34.576 +	while (tmpdat) {
  34.577 +		zone_t *zone;
  34.578 +		for (zone = tmpdat->node_zones;
  34.579 +			       	zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
  34.580 +			printk("Zone:%s freepages:%6lukB\n", 
  34.581 +					zone->name,
  34.582 +					K(zone->free_pages));
  34.583 +			
  34.584 +		tmpdat = tmpdat->node_next;
  34.585 +	}
  34.586 +
  34.587 +	printk("( Active: %d, inactive: %d, free: %d )\n",
  34.588 +	       nr_active_pages,
  34.589 +	       nr_inactive_pages,
  34.590 +	       nr_free_pages());
  34.591 +
  34.592 +	for (type = 0; type < MAX_NR_ZONES; type++) {
  34.593 +		struct list_head *head, *curr;
  34.594 +		zone_t *zone = pgdat->node_zones + type;
  34.595 + 		unsigned long nr, total, flags;
  34.596 +
  34.597 +		total = 0;
  34.598 +		if (zone->size) {
  34.599 +			spin_lock_irqsave(&zone->lock, flags);
  34.600 +		 	for (order = 0; order < MAX_ORDER; order++) {
  34.601 +				head = &(zone->free_area + order)->free_list;
  34.602 +				curr = head;
  34.603 +				nr = 0;
  34.604 +				for (;;) {
  34.605 +					if ((curr = curr->next) == head)
  34.606 +						break;
  34.607 +					nr++;
  34.608 +				}
  34.609 +				total += nr * (1 << order);
  34.610 +				printk("%lu*%lukB ", nr, K(1UL) << order);
  34.611 +			}
  34.612 +			spin_unlock_irqrestore(&zone->lock, flags);
  34.613 +		}
  34.614 +		printk("= %lukB)\n", K(total));
  34.615 +	}
  34.616 +
  34.617 +#ifdef SWAP_CACHE_INFO
  34.618 +	show_swap_cache_info();
  34.619 +#endif	
  34.620 +}
  34.621 +
  34.622 +void show_free_areas(void)
  34.623 +{
  34.624 +	show_free_areas_core(pgdat_list);
  34.625 +}
  34.626 +
  34.627 +/*
  34.628 + * Builds allocation fallback zone lists.
  34.629 + */
  34.630 +static inline void build_zonelists(pg_data_t *pgdat)
  34.631 +{
  34.632 +	int i, j, k;
  34.633 +
  34.634 +	for (i = 0; i <= GFP_ZONEMASK; i++) {
  34.635 +		zonelist_t *zonelist;
  34.636 +		zone_t *zone;
  34.637 +
  34.638 +		zonelist = pgdat->node_zonelists + i;
  34.639 +		memset(zonelist, 0, sizeof(*zonelist));
  34.640 +
  34.641 +		j = 0;
  34.642 +		k = ZONE_NORMAL;
  34.643 +		if (i & __GFP_HIGHMEM)
  34.644 +			k = ZONE_HIGHMEM;
  34.645 +		if (i & __GFP_DMA)
  34.646 +			k = ZONE_DMA;
  34.647 +
  34.648 +		switch (k) {
  34.649 +			default:
  34.650 +				BUG();
  34.651 +			/*
  34.652 +			 * fallthrough:
  34.653 +			 */
  34.654 +			case ZONE_HIGHMEM:
  34.655 +				zone = pgdat->node_zones + ZONE_HIGHMEM;
  34.656 +				if (zone->size) {
  34.657 +#ifndef CONFIG_HIGHMEM
  34.658 +					BUG();
  34.659 +#endif
  34.660 +					zonelist->zones[j++] = zone;
  34.661 +				}
  34.662 +			case ZONE_NORMAL:
  34.663 +				zone = pgdat->node_zones + ZONE_NORMAL;
  34.664 +				if (zone->size)
  34.665 +					zonelist->zones[j++] = zone;
  34.666 +			case ZONE_DMA:
  34.667 +				zone = pgdat->node_zones + ZONE_DMA;
  34.668 +				if (zone->size)
  34.669 +					zonelist->zones[j++] = zone;
  34.670 +		}
  34.671 +		zonelist->zones[j++] = NULL;
  34.672 +	} 
  34.673 +}
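/*
 * Worked example (illustrative): on a node with all three zones populated,
 * the deliberately fall-through switch above yields
 *
 *	no zone modifier (e.g. GFP_KERNEL):  { Normal, DMA, NULL }
 *	__GFP_HIGHMEM requests:              { HighMem, Normal, DMA, NULL }
 *	__GFP_DMA requests:                  { DMA, NULL }
 *
 * so __alloc_pages() only ever falls back from the requested class towards
 * ZONE_DMA, never upwards.
 */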
  34.674 +
  34.675 +/*
  34.676 + * Helper functions to size the waitqueue hash table.
  34.677 + * Essentially these want to choose hash table sizes sufficiently
  34.678 + * large so that collisions trying to wait on pages are rare.
  34.679 + * But in fact, the number of active page waitqueues on typical
  34.680 + * systems is ridiculously low, less than 200. So this is even
  34.681 + * conservative, even though it seems large.
  34.682 + *
  34.683 + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
  34.684 + * waitqueues, i.e. the size of the waitq table given the number of pages.
  34.685 + */
  34.686 +#define PAGES_PER_WAITQUEUE	256
  34.687 +
  34.688 +static inline unsigned long wait_table_size(unsigned long pages)
  34.689 +{
  34.690 +	unsigned long size = 1;
  34.691 +
  34.692 +	pages /= PAGES_PER_WAITQUEUE;
  34.693 +
  34.694 +	while (size < pages)
  34.695 +		size <<= 1;
  34.696 +
  34.697 +	/*
  34.698 +	 * Once we have dozens or even hundreds of threads sleeping
  34.699 +	 * on IO we've got bigger problems than wait queue collision.
  34.700 +	 * Limit the size of the wait table to a reasonable size.
  34.701 +	 */
  34.702 +	size = min(size, 4096UL);
  34.703 +
  34.704 +	return size;
  34.705 +}
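/*
 * Worked example (illustrative): a 512MB zone of 4KB pages spans 131072
 * pages; 131072 / PAGES_PER_WAITQUEUE = 512, which is already a power of
 * two, so wait_table_size() returns 512, comfortably under the 4096 cap.
 */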
  34.706 +
  34.707 +/*
  34.708 + * This is an integer logarithm so that shifts can be used later
  34.709 + * to extract the more random high bits from the multiplicative
  34.710 + * hash function before the remainder is taken.
  34.711 + */
  34.712 +static inline unsigned long wait_table_bits(unsigned long size)
  34.713 +{
  34.714 +	return ffz(~size);
  34.715 +}
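/*
 * Illustrative numbers: for the 512-entry table in the example above,
 * ffz(~512) == 9 since 512 == 1 << 9, so free_area_init_core() stores
 * wait_table_shift = BITS_PER_LONG - 9; shifting a multiplicative hash
 * right by that amount keeps its nine most-random high bits, as the
 * comment above intends.
 */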
  34.716 +
  34.717 +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
  34.718 +
  34.719 +/*
  34.720 + * Set up the zone data structures:
  34.721 + *   - mark all pages reserved
  34.722 + *   - mark all memory queues empty
  34.723 + *   - clear the memory bitmaps
  34.724 + */
  34.725 +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
  34.726 +	unsigned long *zones_size, unsigned long zone_start_paddr, 
  34.727 +	unsigned long *zholes_size, struct page *lmem_map)
  34.728 +{
  34.729 +	unsigned long i, j;
  34.730 +	unsigned long map_size;
  34.731 +	unsigned long totalpages, offset, realtotalpages;
  34.732 +	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
  34.733 +
  34.734 +	if (zone_start_paddr & ~PAGE_MASK)
  34.735 +		BUG();
  34.736 +
  34.737 +	totalpages = 0;
  34.738 +	for (i = 0; i < MAX_NR_ZONES; i++) {
  34.739 +		unsigned long size = zones_size[i];
  34.740 +		totalpages += size;
  34.741 +	}
  34.742 +	realtotalpages = totalpages;
  34.743 +	if (zholes_size)
  34.744 +		for (i = 0; i < MAX_NR_ZONES; i++)
  34.745 +			realtotalpages -= zholes_size[i];
  34.746 +			
  34.747 +	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
  34.748 +
  34.749 +	/*
   34.750 +	 * Some architectures (with lots of mem and discontiguous memory
   34.751 +	 * maps) have to search for a good mem_map area:
   34.752 +	 * For discontigmem, the conceptual mem_map array starts from
   34.753 +	 * PAGE_OFFSET; we need to align the actual array onto a mem_map
   34.754 +	 * boundary so that MAP_NR works.
  34.755 +	 */
  34.756 +	map_size = (totalpages + 1)*sizeof(struct page);
  34.757 +	if (lmem_map == (struct page *)0) {
  34.758 +		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
  34.759 +		lmem_map = (struct page *)(PAGE_OFFSET + 
  34.760 +			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
  34.761 +	}
  34.762 +	*gmap = pgdat->node_mem_map = lmem_map;
  34.763 +	pgdat->node_size = totalpages;
  34.764 +	pgdat->node_start_paddr = zone_start_paddr;
  34.765 +	pgdat->node_start_mapnr = (lmem_map - mem_map);
  34.766 +	pgdat->nr_zones = 0;
  34.767 +
  34.768 +	offset = lmem_map - mem_map;	
  34.769 +	for (j = 0; j < MAX_NR_ZONES; j++) {
  34.770 +		zone_t *zone = pgdat->node_zones + j;
  34.771 +		unsigned long mask;
  34.772 +		unsigned long size, realsize;
  34.773 +		int idx;
  34.774 +
  34.775 +		zone_table[nid * MAX_NR_ZONES + j] = zone;
  34.776 +		realsize = size = zones_size[j];
  34.777 +		if (zholes_size)
  34.778 +			realsize -= zholes_size[j];
  34.779 +
  34.780 +		printk("zone(%lu): %lu pages.\n", j, size);
  34.781 +		zone->size = size;
  34.782 +		zone->realsize = realsize;
  34.783 +		zone->name = zone_names[j];
  34.784 +		zone->lock = SPIN_LOCK_UNLOCKED;
  34.785 +		zone->zone_pgdat = pgdat;
  34.786 +		zone->free_pages = 0;
  34.787 +		zone->need_balance = 0;
   34.788 +		zone->nr_active_pages = zone->nr_inactive_pages = 0;
   34.789 +
  34.791 +		if (!size)
  34.792 +			continue;
  34.793 +
  34.794 +		/*
  34.795 +		 * The per-page waitqueue mechanism uses hashed waitqueues
  34.796 +		 * per zone.
  34.797 +		 */
  34.798 +		zone->wait_table_size = wait_table_size(size);
  34.799 +		zone->wait_table_shift =
  34.800 +			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
  34.801 +		zone->wait_table = (wait_queue_head_t *)
  34.802 +			alloc_bootmem_node(pgdat, zone->wait_table_size
  34.803 +						* sizeof(wait_queue_head_t));
  34.804 +
  34.805 +		for(i = 0; i < zone->wait_table_size; ++i)
  34.806 +			init_waitqueue_head(zone->wait_table + i);
  34.807 +
  34.808 +		pgdat->nr_zones = j+1;
  34.809 +
  34.810 +		mask = (realsize / zone_balance_ratio[j]);
  34.811 +		if (mask < zone_balance_min[j])
  34.812 +			mask = zone_balance_min[j];
  34.813 +		else if (mask > zone_balance_max[j])
  34.814 +			mask = zone_balance_max[j];
  34.815 +		zone->watermarks[j].min = mask;
  34.816 +		zone->watermarks[j].low = mask*2;
  34.817 +		zone->watermarks[j].high = mask*3;
  34.818 +		/* now set the watermarks of the lower zones in the "j" classzone */
  34.819 +		for (idx = j-1; idx >= 0; idx--) {
  34.820 +			zone_t * lower_zone = pgdat->node_zones + idx;
  34.821 +			unsigned long lower_zone_reserve;
  34.822 +			if (!lower_zone->size)
  34.823 +				continue;
  34.824 +
  34.825 +			mask = lower_zone->watermarks[idx].min;
  34.826 +			lower_zone->watermarks[j].min = mask;
  34.827 +			lower_zone->watermarks[j].low = mask*2;
  34.828 +			lower_zone->watermarks[j].high = mask*3;
  34.829 +
   34.830 +			/* now the trickier part: add the lowmem reserve this classzone must leave in the lower zone */
  34.831 +			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
  34.832 +			lower_zone->watermarks[j].min += lower_zone_reserve;
  34.833 +			lower_zone->watermarks[j].low += lower_zone_reserve;
  34.834 +			lower_zone->watermarks[j].high += lower_zone_reserve;
  34.835 +
  34.836 +			realsize += lower_zone->realsize;
  34.837 +		}
  34.838 +
  34.839 +		zone->zone_mem_map = mem_map + offset;
  34.840 +		zone->zone_start_mapnr = offset;
  34.841 +		zone->zone_start_paddr = zone_start_paddr;
  34.842 +
  34.843 +		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
  34.844 +			printk("BUG: wrong zone alignment, it will crash\n");
  34.845 +
  34.846 +		/*
  34.847 +		 * Initially all pages are reserved - free ones are freed
  34.848 +		 * up by free_all_bootmem() once the early boot process is
  34.849 +		 * done. Non-atomic initialization, single-pass.
  34.850 +		 */
  34.851 +		for (i = 0; i < size; i++) {
  34.852 +			struct page *page = mem_map + offset + i;
  34.853 +			set_page_zone(page, nid * MAX_NR_ZONES + j);
  34.854 +			set_page_count(page, 0);
  34.855 +			SetPageReserved(page);
  34.856 +			INIT_LIST_HEAD(&page->list);
  34.857 +			if (j != ZONE_HIGHMEM)
  34.858 +				set_page_address(page, __va(zone_start_paddr));
  34.859 +			zone_start_paddr += PAGE_SIZE;
  34.860 +		}
  34.861 +
  34.862 +		offset += size;
  34.863 +		for (i = 0; ; i++) {
  34.864 +			unsigned long bitmap_size;
  34.865 +
  34.866 +			INIT_LIST_HEAD(&zone->free_area[i].free_list);
  34.867 +			if (i == MAX_ORDER-1) {
  34.868 +				zone->free_area[i].map = NULL;
  34.869 +				break;
  34.870 +			}
  34.871 +
  34.872 +			/*
  34.873 +			 * Page buddy system uses "index >> (i+1)",
  34.874 +			 * where "index" is at most "size-1".
  34.875 +			 *
  34.876 +			 * The extra "+3" is to round down to byte
  34.877 +			 * size (8 bits per byte assumption). Thus
  34.878 +			 * we get "(size-1) >> (i+4)" as the last byte
  34.879 +			 * we can access.
  34.880 +			 *
  34.881 +			 * The "+1" is because we want to round the
  34.882 +			 * byte allocation up rather than down. So
  34.883 +			 * we should have had a "+7" before we shifted
  34.884 +			 * down by three. Also, we have to add one as
  34.885 +			 * we actually _use_ the last bit (it's [0,n]
  34.886 +			 * inclusive, not [0,n[).
  34.887 +			 *
  34.888 +			 * So we actually had +7+1 before we shift
  34.889 +			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
  34.890 +			 * (modulo overflows, which we do not have).
  34.891 +			 *
  34.892 +			 * Finally, we LONG_ALIGN because all bitmap
  34.893 +			 * operations are on longs.
  34.894 +			 */
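			/*
			 * Worked example (illustrative): size = 4096
			 * pages at i = 0 needs one bit per pair of
			 * pages, i.e. 2048 bits = 256 bytes; the
			 * formula gives (4095 >> 4) + 1 = 256, and
			 * LONG_ALIGN leaves that unchanged on both
			 * 32-bit and 64-bit longs.
			 */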
  34.895 +			bitmap_size = (size-1) >> (i+4);
  34.896 +			bitmap_size = LONG_ALIGN(bitmap_size+1);
  34.897 +			zone->free_area[i].map = 
  34.898 +			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
  34.899 +		}
  34.900 +	}
  34.901 +	build_zonelists(pgdat);
  34.902 +}
  34.903 +
  34.904 +void __init free_area_init(unsigned long *zones_size)
  34.905 +{
  34.906 +	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
  34.907 +}
  34.908 +
  34.909 +static int __init setup_mem_frac(char *str)
  34.910 +{
  34.911 +	int j = 0;
  34.912 +
  34.913 +	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
  34.914 +	printk("setup_mem_frac: ");
  34.915 +	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
  34.916 +	printk("\n");
  34.917 +	return 1;
  34.918 +}
  34.919 +
  34.920 +__setup("memfrac=", setup_mem_frac);
  34.921 +
  34.922 +static int __init setup_lower_zone_reserve(char *str)
  34.923 +{
  34.924 +	int j = 0;
  34.925 +
  34.926 +	while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
  34.927 +	printk("setup_lower_zone_reserve: ");
  34.928 +	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
  34.929 +	printk("\n");
  34.930 +	return 1;
  34.931 +}
  34.932 +
  34.933 +__setup("lower_zone_reserve=", setup_lower_zone_reserve);
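
/*
 * Usage sketch (illustrative): booting with "memfrac=32,64,128" makes
 * get_option() fill zone_balance_ratio[] with 32, 64 and 128 for the DMA,
 * Normal and HighMem zones respectively (a smaller ratio means larger
 * watermarks), and "lower_zone_reserve=512,64" overrides the two
 * lower_zone_reserve_ratio[] entries the same way; both handlers print
 * the values they parsed.
 */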