ia64/xen-unstable

changeset 1360:0fab6364d23b

bitkeeper revision 1.897 (40a0eb02jGwqt6POLmCY0eC1hpHfvw)

trivial merge
author iap10@labyrinth.cl.cam.ac.uk
date Tue May 11 15:02:26 2004 +0000 (2004-05-11)
parents 8d56cd44e887 a2abb67d5518
children 34951071caf8
files .rootkeys tools/examples/xc_dom_create.py tools/xc/py/Xc.c tools/xenctl/lib/utils.py tools/xend/lib/domain_controller.h tools/xend/lib/main.py tools/xend/lib/manager.py tools/xend/lib/netif.py xen/arch/i386/entry.S xen/common/dom_mem_ops.c xen/common/domain.c xen/common/kernel.c xen/common/memory.c xen/common/physdev.c xen/include/asm-i386/processor.h xen/include/hypervisor-ifs/hypervisor-if.h xenolinux-2.4.26-sparse/arch/xen/config.in xenolinux-2.4.26-sparse/arch/xen/defconfig xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/common.h xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h xenolinux-2.4.26-sparse/include/asm-xen/io.h xenolinux-2.4.26-sparse/include/asm-xen/pci.h xenolinux-2.4.26-sparse/mkbuildtree xenolinux-2.4.26-sparse/mm/page_alloc.c
line diff
     1.1 --- a/.rootkeys	Tue May 11 14:57:44 2004 +0000
     1.2 +++ b/.rootkeys	Tue May 11 15:02:26 2004 +0000
     1.3 @@ -107,6 +107,7 @@ 4055ad97wMLUj0BZT0e_T0EwQN0Bvw tools/xen
     1.4  4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h
     1.5  4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py
     1.6  4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py
     1.7 +409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py
     1.8  40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c
     1.9  4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py
    1.10  4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend
    1.11 @@ -735,6 +736,7 @@ 3e5a4e678ddsQOpbSiRdy1GRcDc9WA xenolinux
    1.12  3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h
    1.13  3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h
    1.14  3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h
    1.15 +409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h
    1.16  3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
    1.17  3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
    1.18  3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h
    1.19 @@ -762,6 +764,7 @@ 406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux
    1.20  3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c
    1.21  3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c
    1.22  3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c
    1.23 +409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c
    1.24  3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c
    1.25  3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c
    1.26  407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
     2.1 --- a/tools/examples/xc_dom_create.py	Tue May 11 14:57:44 2004 +0000
     2.2 +++ b/tools/examples/xc_dom_create.py	Tue May 11 15:02:26 2004 +0000
     2.3 @@ -333,7 +333,18 @@ def make_domain():
     2.4                  xc.domain_destroy ( dom=id )
     2.5                  sys.exit()
     2.6  
     2.7 -    if not new_io_world:
     2.8 +    if new_io_world:
     2.9 +        cmsg = 'new_network_interface(dom='+str(id)+')'
    2.10 +        xend_response = xenctl.utils.xend_control_message(cmsg)
    2.11 +        if not xend_response['success']:
    2.12 +            print "Error creating network interface"
    2.13 +            print "Error type: " + xend_response['error_type']
    2.14 +            if xend_response['error_type'] == 'exception':
    2.15 +                print "Exception type: " + xend_response['exception_type']
    2.16 +                print "Exception val:  " + xend_response['exception_value']
    2.17 +            xc.domain_destroy ( dom=id )
    2.18 +            sys.exit()
    2.19 +    else:
    2.20          # setup virtual firewall rules for all aliases
    2.21          for ip in vfr_ipaddr:
    2.22              xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
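
For reference, the xend control-message convention this hunk relies on: a command string such as 'new_network_interface(dom=N)' is sent with xenctl.utils.xend_control_message(), and the reply is a dict keyed by 'success', with error-detail fields present on failure. A minimal sketch of the response-handling convention (report_failure is a hypothetical helper; the key names are taken from the hunk above):

    def report_failure(resp):
        # Mirrors the error reporting in xc_dom_create.py above.
        print "Error type: " + resp['error_type']
        if resp['error_type'] == 'exception':
            print "Exception type: " + resp['exception_type']
            print "Exception val:  " + resp['exception_value']

    # Usage, assuming xenctl.utils is importable and 'id' is a domain id:
    #   resp = xenctl.utils.xend_control_message(
    #              'new_network_interface(dom=' + str(id) + ')')
    #   if not resp['success']:
    #       report_failure(resp)
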
     3.1 --- a/tools/xc/py/Xc.c	Tue May 11 14:57:44 2004 +0000
     3.2 +++ b/tools/xc/py/Xc.c	Tue May 11 15:02:26 2004 +0000
     3.3 @@ -13,6 +13,7 @@
     3.4  #include <sys/types.h>
     3.5  #include <sys/socket.h>
     3.6  #include <netdb.h>
     3.7 +#include <arpa/inet.h>
     3.8  
     3.9  /* Needed for Python versions earlier than 2.3. */
    3.10  #ifndef PyMODINIT_FUNC
    3.11 @@ -202,13 +203,13 @@ static PyObject *pyxc_linux_save(PyObjec
    3.12      if (progress) flags |= XCFLAGS_VERBOSE;
    3.13      if (live)     flags |= XCFLAGS_LIVE;
    3.14  
    3.15 -    if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0)
    3.16 +    if ( strncmp(state_file,"tcp:", strlen("tcp:")) == 0 )
    3.17      {
    3.18  #define max_namelen 64
    3.19  	char server[max_namelen];
    3.20  	char *port_s;
    3.21  	int port=777;
    3.22 -	int sd = 0;
    3.23 +	int sd = -1;
    3.24  	struct hostent *h;
    3.25  	struct sockaddr_in s;
    3.26  	int sockbufsize;
    3.27 @@ -216,19 +217,18 @@ static PyObject *pyxc_linux_save(PyObjec
    3.28  	int writerfn(void *fd, const void *buf, size_t count)
    3.29  	{
    3.30  	    int tot = 0, rc;
    3.31 -	    do 
    3.32 -	    {
    3.33 +	    do {
    3.34  		rc = write( (int) fd, ((char*)buf)+tot, count-tot );
    3.35 -		if (rc<0) { perror("WRITE"); return rc; };
    3.36 +		if ( rc < 0 ) { perror("WRITE"); return rc; };
    3.37  		tot += rc;
    3.38  	    }
    3.39 -	    while(tot<count);
    3.40 +	    while ( tot < count );
    3.41  	    return 0;
    3.42  	}
    3.43  
    3.44  	strncpy( server, state_file+strlen("tcp://"), max_namelen);
    3.45  	server[max_namelen-1]='\0';
    3.46 -	if( port_s = strchr(server,':') )
    3.47 +	if ( (port_s = strchr(server,':')) != NULL )
    3.48  	{
    3.49  	    *port_s = '\0';
    3.50  	    port = atoi(port_s+1);
    3.51 @@ -238,36 +238,36 @@ static PyObject *pyxc_linux_save(PyObjec
    3.52  	
    3.53  	h = gethostbyname(server);
    3.54  	sd = socket (AF_INET,SOCK_STREAM,0);
    3.55 -	if(sd<0) goto serr;
    3.56 +	if ( sd < 0 )
    3.57 +            goto serr;
    3.58  	s.sin_family = AF_INET;
    3.59  	bcopy ( h->h_addr, &(s.sin_addr.s_addr), h->h_length);
    3.60  	s.sin_port = htons(port);
    3.61 -	if( connect(sd, (struct sockaddr *) &s, sizeof(s)) ) 
    3.62 +	if ( connect(sd, (struct sockaddr *) &s, sizeof(s)) ) 
    3.63  	    goto serr;
    3.64  
    3.65  	sockbufsize=128*1024;
    3.66 -	if (setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, sizeof sockbufsize) < 0) 
    3.67 -	{
    3.68 +	if ( setsockopt(sd, SOL_SOCKET, SO_SNDBUF, 
    3.69 +                        &sockbufsize, sizeof sockbufsize) < 0 ) 
    3.70  	    goto serr;
    3.71 -	}
    3.72  
    3.73 -	if ( xc_linux_save(xc->xc_handle, dom, flags, writerfn, (void*)sd) == 0 )
    3.74 +	if ( xc_linux_save(xc->xc_handle, dom, flags, 
    3.75 +                           writerfn, (void*)sd) == 0 )
    3.76  	{
    3.77  	    close(sd);
    3.78  	    Py_INCREF(zero);
    3.79  	    return zero;
    3.80  	}
    3.81  
    3.82 -	serr:
    3.83 -
    3.84 +    serr:
    3.85  	PyErr_SetFromErrno(xc_error);
    3.86 -	if(sd)close(sd);
    3.87 +	if ( sd >= 0 ) close(sd);
    3.88  	return NULL;
    3.89      }    
    3.90      else
    3.91      {
    3.92 -	int fd;
    3.93 -	gzFile gfd;
    3.94 +	int fd = -1;
    3.95 +	gzFile gfd = NULL;
    3.96  
    3.97  	int writerfn(void *fd, const void *buf, size_t count)
    3.98  	{
    3.99 @@ -311,10 +311,11 @@ static PyObject *pyxc_linux_save(PyObjec
   3.100  
   3.101      err:
   3.102  	PyErr_SetFromErrno(xc_error);
   3.103 -	if(gfd)gzclose(gfd);
   3.104 -	if(fd)close(fd);
   3.105 +	if ( gfd != NULL )
   3.106 +            gzclose(gfd);
   3.107 +	if ( fd >= 0 )
   3.108 +            close(fd);
   3.109  	unlink(state_file);
   3.110 -
   3.111  	return NULL;
   3.112      }
   3.113  
   3.114 @@ -337,15 +338,16 @@ static PyObject *pyxc_linux_restore(PyOb
   3.115                                        &dom, &state_file, &progress) )
   3.116          return NULL;
   3.117  
   3.118 -    if (progress) flags |= XCFLAGS_VERBOSE;
   3.119 +    if ( progress )
   3.120 +        flags |= XCFLAGS_VERBOSE;
   3.121  
   3.122 -    if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0)
   3.123 +    if ( strncmp(state_file,"tcp:", strlen("tcp:")) == 0 )
   3.124      {
   3.125  #define max_namelen 64
   3.126  	char server[max_namelen];
   3.127  	char *port_s;
   3.128  	int port=777;
   3.129 -	int ld = 0, sd = 0;
   3.130 +	int ld = -1, sd = -1;
   3.131  	struct hostent *h;
   3.132  	struct sockaddr_in s, d, p;
   3.133  	socklen_t dlen, plen;
   3.134 @@ -357,20 +359,16 @@ static PyObject *pyxc_linux_restore(PyOb
   3.135  	    int rc, tot = 0;
   3.136  	    do { 
   3.137  		rc = read( (int) fd, ((char*)buf)+tot, count-tot ); 
   3.138 -		if (rc<0)
   3.139 -		    {
   3.140 -			perror("READ");
   3.141 -			return rc;
   3.142 -		    }
   3.143 +		if ( rc < 0 ) { perror("READ"); return rc; }
   3.144  		tot += rc;
   3.145 -	    } while( tot<count );
   3.146 -
   3.147 +	    } 
   3.148 +            while ( tot < count );
   3.149  	    return 0;
   3.150  	}
   3.151  
   3.152  	strncpy( server, state_file+strlen("tcp://"), max_namelen);
   3.153  	server[max_namelen-1]='\0';
   3.154 -	if( port_s = strchr(server,':') )
   3.155 +	if ( (port_s = strchr(server,':')) != NULL )
   3.156  	{
   3.157  	    *port_s = '\0';
   3.158  	    port = atoi(port_s+1);
   3.159 @@ -380,58 +378,55 @@ static PyObject *pyxc_linux_restore(PyOb
   3.160  	
   3.161  	h = gethostbyname(server);
   3.162  	ld = socket (AF_INET,SOCK_STREAM,0);
   3.163 -	if(ld<0) goto serr;
   3.164 +	if ( ld < 0 ) goto serr;
   3.165  	s.sin_family = AF_INET;
   3.166  	//bcopy ( h->h_addr, &(s.sin_addr.s_addr), h->h_length);
   3.167  	s.sin_addr.s_addr = htonl(INADDR_ANY);
   3.168  	s.sin_port = htons(port);
   3.169  
   3.170 -	if (setsockopt(ld, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) < 0)
   3.171 +	if ( setsockopt(ld, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) < 0 )
   3.172  	    goto serr;
   3.173  
   3.174 -	if( bind(ld, (struct sockaddr *) &s, sizeof(s)) ) 
   3.175 +	if ( bind(ld, (struct sockaddr *) &s, sizeof(s)) ) 
   3.176  	    goto serr;
   3.177  
   3.178 -	if( listen(ld, 1) )
   3.179 +	if ( listen(ld, 1) )
   3.180  	    goto serr;
   3.181  
   3.182  	dlen=sizeof(struct sockaddr);
   3.183 -	if( (sd = accept(ld, (struct sockaddr *) &d, &dlen )) < 0 )
   3.184 +	if ( (sd = accept(ld, (struct sockaddr *) &d, &dlen )) < 0 )
   3.185  	    goto serr;
   3.186  
   3.187          plen = sizeof(p);
   3.188 -	if (getpeername(sd, (struct sockaddr_in *) &p, 
   3.189 -			&plen) < 0) {
   3.190 +	if ( getpeername(sd, (struct sockaddr_in *) &p, 
   3.191 +                         &plen) < 0 )
   3.192  	    goto serr;
   3.193 -	}
   3.194  
   3.195 -	printf("Accepted connection from %s\n",
   3.196 -			inet_ntoa(p.sin_addr));
   3.197 +	printf("Accepted connection from %s\n", inet_ntoa(p.sin_addr));
   3.198  	
   3.199  	sockbufsize=128*1024;
   3.200 -	if (setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, sizeof sockbufsize) < 0) 
   3.201 -	{
   3.202 +	if ( setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, 
   3.203 +                        sizeof sockbufsize) < 0 ) 
   3.204  	    goto serr;
   3.205 -	}
   3.206  
   3.207 -	if ( xc_linux_restore(xc->xc_handle, dom, flags, readerfn, (void*)sd, &dom) == 0 )
   3.208 +	if ( xc_linux_restore(xc->xc_handle, dom, flags, 
   3.209 +                              readerfn, (void*)sd, &dom) == 0 )
   3.210  	{
   3.211  	    close(sd);
   3.212  	    Py_INCREF(zero);
   3.213  	    return zero;
   3.214  	}
   3.215  
   3.216 -	serr:
   3.217 -
   3.218 +    serr:
   3.219  	PyErr_SetFromErrno(xc_error);
   3.220 -	if(ld)close(ld);
   3.221 -	if(sd)close(sd);
   3.222 +	if ( ld >= 0 ) close(ld);
   3.223 +	if ( sd >= 0 ) close(sd);
   3.224  	return NULL;
   3.225      }    
   3.226      else
   3.227      {
   3.228 -	int fd;
   3.229 -	gzFile gfd;
   3.230 +	int fd = -1;
   3.231 +	gzFile gfd = NULL;
   3.232  
   3.233  	int readerfn(void *fd, void *buf, size_t count)
   3.234  	{
   3.235 @@ -442,7 +437,7 @@ static PyObject *pyxc_linux_restore(PyOb
   3.236  	    return ! (rc == count);
   3.237  	}
   3.238  
   3.239 -	if (strncmp(state_file,"file:",strlen("file:")) == 0)
   3.240 +	if ( strncmp(state_file,"file:",strlen("file:")) == 0 )
   3.241  	    state_file += strlen("file:");
   3.242  
   3.243  	if ( (fd = open(state_file, O_RDONLY)) == -1 )
   3.244 @@ -464,7 +459,8 @@ static PyObject *pyxc_linux_restore(PyOb
   3.245  	}
   3.246  
   3.247  
   3.248 -	if ( xc_linux_restore(xc->xc_handle, dom, flags, readerfn, gfd, &dom) == 0 )
   3.249 +	if ( xc_linux_restore(xc->xc_handle, dom, flags, 
   3.250 +                              readerfn, gfd, &dom) == 0 )
   3.251  	{
   3.252  	    gzclose(gfd);
   3.253  	    close(fd);
   3.254 @@ -475,8 +471,8 @@ static PyObject *pyxc_linux_restore(PyOb
   3.255  
   3.256      err:
   3.257  	PyErr_SetFromErrno(xc_error);
   3.258 -	if(gfd)gzclose(gfd);
   3.259 -	if(fd)close(fd);
   3.260 +	if ( gfd != NULL ) gzclose(gfd);
   3.261 +	if ( fd >= 0 ) close(fd);
   3.262  	return NULL;
   3.263      }
   3.264  
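
The nested writerfn/readerfn helpers above exist because write() and read() on a socket may transfer fewer bytes than requested, so both loop until the full count has moved. A minimal Python sketch of the same retry logic (write_all is a hypothetical name; os.write raises OSError on failure where the C code checks rc < 0):

    import os

    def write_all(fd, buf):
        # Loop until every byte is written; os.write() may accept
        # fewer bytes than offered on each call.
        tot = 0
        while tot < len(buf):
            tot += os.write(fd, buf[tot:])
        return 0
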
     4.1 --- a/tools/xenctl/lib/utils.py	Tue May 11 14:57:44 2004 +0000
     4.2 +++ b/tools/xenctl/lib/utils.py	Tue May 11 15:02:26 2004 +0000
     4.3 @@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'):
     4.4              return m.group(1)
     4.5      return None
     4.6  
     4.7 -def get_current_ipgw(dev='eth0'):
     4.8 -    """Return a string containing the IP gateway for the given
     4.9 -    network interface (default 'eth0').
    4.10 -    """
    4.11 +def get_current_ipgw():
    4.12 +    """Return a string containing the default IP gateway."""
    4.13      fd = os.popen( '/sbin/route -n' )
    4.14      lines = fd.readlines()
    4.15      for line in lines:
    4.16 -        m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
    4.17 -                       '\s+\S+\s+\S*G.*' + dev + '.*', line )
    4.18 +        m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
    4.19 +                       '\s+0.0.0.0+\s+\S*G.*', line )
    4.20          if m:
    4.21              return m.group(1)
    4.22      return None
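
The rewritten pattern matches the kernel's default route directly (destination and mask both 0.0.0.0, 'G' flag set) instead of keying on a device name. A quick check of the new regex against a representative /sbin/route -n line (the sample line is invented for illustration):

    import re
    line = '0.0.0.0         192.168.0.1     0.0.0.0         UG    0      0        0 eth0'
    m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
                   '\s+0.0.0.0+\s+\S*G.*', line )
    print m.group(1)   # -> '192.168.0.1'
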
     5.1 --- a/tools/xend/lib/domain_controller.h	Tue May 11 14:57:44 2004 +0000
     5.2 +++ b/tools/xend/lib/domain_controller.h	Tue May 11 15:02:26 2004 +0000
     5.3 @@ -342,6 +342,7 @@ typedef struct {
     5.4      unsigned int handle;
     5.5      unsigned int status;
     5.6      unsigned int evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
     5.7 +    u8           mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
     5.8  } netif_fe_interface_status_changed_t;
     5.9  
    5.10  /*
    5.11 @@ -373,7 +374,8 @@ typedef struct {
    5.12   */
    5.13  typedef struct {
    5.14      unsigned int  handle;
    5.15 -    unsigned long shmem_frame;
    5.16 +    unsigned long tx_shmem_frame;
    5.17 +    unsigned long rx_shmem_frame;
    5.18  } netif_fe_interface_connect_t;
    5.19  
    5.20  /*
    5.21 @@ -434,6 +436,7 @@ typedef struct {
    5.22      /* IN */
    5.23      domid_t        domid;             /* Domain attached to new interface.   */
    5.24      unsigned int   netif_handle;      /* Domain-specific interface handle.   */
    5.25 +    u8             mac[6];
    5.26      /* OUT */
    5.27      unsigned int   status;
    5.28  } netif_be_create_t; 
    5.29 @@ -463,7 +466,8 @@ typedef struct {
    5.30      domid_t        domid;             /* Domain attached to new interface.   */
    5.31      unsigned int   netif_handle;      /* Domain-specific interface handle.   */
    5.32      unsigned int   evtchn;            /* Event channel for notifications.    */
    5.33 -    unsigned long  shmem_frame;       /* Page cont. shared comms window.     */
    5.34 +    unsigned long  tx_shmem_frame;    /* Page cont. tx shared comms window.  */
    5.35 +    unsigned long  rx_shmem_frame;    /* Page cont. rx shared comms window.  */
    5.36      /* OUT */
    5.37      unsigned int   status;
    5.38  } netif_be_connect_t; 
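
These layout changes (per-direction shmem frames plus a MAC field) are what the "QIILLI" pack/unpack format in the new tools/xend/lib/netif.py mirrors: domid_t is 64-bit ('Q'), the frames are unsigned longs. An illustrative packing with invented field values (actual byte length depends on native sizes and alignment):

    import struct
    payload = struct.pack("QIILLI",
                          1,          # domid
                          0,          # netif_handle
                          5,          # evtchn
                          0x12345,    # tx_shmem_frame
                          0x12346,    # rx_shmem_frame
                          0)          # status
    print len(payload)
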
     6.1 --- a/tools/xend/lib/main.py	Tue May 11 14:57:44 2004 +0000
     6.2 +++ b/tools/xend/lib/main.py	Tue May 11 15:02:26 2004 +0000
     6.3 @@ -5,7 +5,7 @@
     6.4  ###########################################################
     6.5  
     6.6  import errno, re, os, pwd, select, signal, socket, struct, sys, time
     6.7 -import xend.blkif, xend.console, xend.manager, xend.utils, Xc
     6.8 +import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc
     6.9  
    6.10  
    6.11  # The following parameters could be placed in a configuration file.
    6.12 @@ -19,6 +19,8 @@ UNIX_SOCK    = 'management_sock' # relat
    6.13  CMSG_CONSOLE  = 0
    6.14  CMSG_BLKIF_BE = 1
    6.15  CMSG_BLKIF_FE = 2
    6.16 +CMSG_NETIF_BE = 3
    6.17 +CMSG_NETIF_FE = 4
    6.18  
    6.19  
    6.20  def port_from_dom(dom):
    6.21 @@ -162,6 +164,10 @@ def daemon_loop():
    6.22              if xend.blkif.interface.list.has_key(idx):
    6.23                  blk_if = xend.blkif.interface.list[idx]
    6.24  
    6.25 +            net_if = False
    6.26 +            if xend.netif.interface.list.has_key(idx):
    6.27 +                net_if = xend.netif.interface.list[idx]
    6.28 +
    6.29              # If we pick up a disconnect notification then we do any necessary
    6.30              # cleanup.
    6.31              if type == notifier.EXCEPTION:
    6.32 @@ -175,6 +181,9 @@ def daemon_loop():
    6.33                      if blk_if:
    6.34                          blk_if.destroy()
    6.35                          del blk_if
    6.36 +                    if net_if:
    6.37 +                        net_if.destroy()
    6.38 +                        del net_if
    6.39                      continue
    6.40  
    6.41              # Process incoming requests.
    6.42 @@ -188,6 +197,10 @@ def daemon_loop():
    6.43                      blk_if.ctrlif_rx_req(port, msg)
    6.44                  elif type == CMSG_BLKIF_BE and port == dom0_port:
    6.45                      xend.blkif.backend_rx_req(port, msg)
    6.46 +                elif type == CMSG_NETIF_FE and net_if:
    6.47 +                    net_if.ctrlif_rx_req(port, msg)
    6.48 +                elif type == CMSG_NETIF_BE and port == dom0_port:
    6.49 +                    xend.netif.backend_rx_req(port, msg)
    6.50                  else:
    6.51                      port.write_response(msg)
    6.52  
    6.53 @@ -198,6 +211,8 @@ def daemon_loop():
    6.54                  type = (msg.get_header())['type']
    6.55                  if type == CMSG_BLKIF_BE and port == dom0_port:
    6.56                      xend.blkif.backend_rx_rsp(port, msg)
    6.57 +                elif type == CMSG_NETIF_BE and port == dom0_port:
    6.58 +                    xend.netif.backend_rx_rsp(port, msg)
    6.59  
    6.60              # Send console data.
    6.61              if con_if and con_if.ctrlif_transmit_work(port):
    6.62 @@ -207,10 +222,18 @@ def daemon_loop():
    6.63              if blk_if and blk_if.ctrlif_transmit_work(port):
    6.64                  work_done = True
    6.65  
    6.66 +            # Send netif messages.
    6.67 +            if net_if and net_if.ctrlif_transmit_work(port):
    6.68 +                work_done = True
    6.69 +
    6.70              # Back-end block-device work.
    6.71              if port == dom0_port and xend.blkif.backend_do_work(port):
    6.72                  work_done = True
    6.73                  
    6.74 +            # Back-end network-device work.
    6.75 +            if port == dom0_port and xend.netif.backend_do_work(port):
    6.76 +                work_done = True
    6.77 +                
    6.78              # Finally, notify the remote end of any work that we did.
    6.79              if work_done:
    6.80                  port.notify()
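
The netif hooks wired in above reuse the flow-control idiom already used for blkif: write a request immediately if ring space allows, otherwise park it and let the per-iteration *_transmit_work / backend_do_work calls flush it once space opens up. A condensed sketch of that idiom (class name hypothetical; it assumes a port object with the space_to_write_request/write_request/notify methods used throughout this changeset):

    class Channel:
        def __init__(self):
            self.pendmsg = None
        def tx_req(self, port, msg):
            if port.space_to_write_request():
                port.write_request(msg)
                port.notify()
            else:
                self.pendmsg = msg          # park until there is space
        def transmit_work(self, port):
            if self.pendmsg and port.space_to_write_request():
                port.write_request(self.pendmsg)
                self.pendmsg = None
                return True                 # caller will port.notify()
            return False
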
     7.1 --- a/tools/xend/lib/manager.py	Tue May 11 14:57:44 2004 +0000
     7.2 +++ b/tools/xend/lib/manager.py	Tue May 11 15:02:26 2004 +0000
     7.3 @@ -4,7 +4,7 @@
     7.4  ## Copyright (c) 2004, K A Fraser (University of Cambridge)
     7.5  #############################################################
     7.6  
     7.7 -import xend.blkif, xend.console, xend.main, xend.utils
     7.8 +import xend.blkif, xend.netif, xend.console, xend.main, xend.utils
     7.9  
    7.10  
    7.11  ##
    7.12 @@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, 
    7.13  
    7.14      # Response is deferred until back-end driver sends acknowledgement.
    7.15      return None
    7.16 +
    7.17 +
    7.18 +##
    7.19 +## new_network_interface:
    7.20 +##  Create a new network interface for the specified domain @dom.
    7.21 +##
    7.22 +def new_network_interface(dom, handle=-1):
    7.23 +    # By default we create an interface with handle zero.
    7.24 +    if handle < 0:
    7.25 +        handle = 0
    7.26 +
    7.27 +    # We only support one interface per domain, which must have handle zero.
    7.28 +    if handle != 0:
    7.29 +        response = { 'success': False }
     7.30 +        response['error_type'] = ('Bad handle %d (only handle 0 '
     7.31 +                                  'is supported)') % handle
    7.32 +        return response
    7.33 +
    7.34 +    # Find local event-channel port associated with the specified domain.
    7.35 +    port = xend.main.port_from_dom(dom)
    7.36 +    if not port:
    7.37 +        response = { 'success': False }
    7.38 +        response['error_type'] = 'Unknown domain %d' % dom
    7.39 +        return response
    7.40 +
    7.41 +    # The interface must not already exist.
    7.42 +    if xend.netif.interface.list.has_key(port.local_port):
    7.43 +        response = { 'success': False }
     7.44 +        response['error_type'] = ('Interface (dom=%d,handle=%d) already '
     7.45 +                                  'exists') % (dom, handle)
    7.46 +        return response
    7.47 +
    7.48 +    # Create the new interface. Initially no virtual devices are attached.
    7.49 +    xend.netif.interface(dom, port.local_port)
    7.50 +
    7.51 +    # Response is deferred until back-end driver sends acknowledgement.
    7.52 +    return None
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/tools/xend/lib/netif.py	Tue May 11 15:02:26 2004 +0000
     8.3 @@ -0,0 +1,144 @@
     8.4 +
     8.5 +###################################################################
     8.6 +## xend/netif.py -- Network-interface management functions for Xend
     8.7 +## Copyright (c) 2004, K A Fraser (University of Cambridge)
     8.8 +###################################################################
     8.9 +
    8.10 +import errno, random, re, os, select, signal, socket, struct, sys
    8.11 +import xend.main, xend.console, xend.manager, xend.utils, Xc
    8.12 +
    8.13 +CMSG_NETIF_BE = 3
    8.14 +CMSG_NETIF_FE = 4
    8.15 +CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED =  0
    8.16 +CMSG_NETIF_FE_DRIVER_STATUS_CHANGED    = 32
    8.17 +CMSG_NETIF_FE_INTERFACE_CONNECT        = 33
    8.18 +CMSG_NETIF_FE_INTERFACE_DISCONNECT     = 34
    8.19 +CMSG_NETIF_BE_CREATE      = 0
    8.20 +CMSG_NETIF_BE_DESTROY     = 1
    8.21 +CMSG_NETIF_BE_CONNECT     = 2
    8.22 +CMSG_NETIF_BE_DISCONNECT  = 3
    8.23 +
    8.24 +pendmsg = None
    8.25 +pendaddr = None
    8.26 +
    8.27 +def backend_tx_req(msg):
    8.28 +    port = xend.main.dom0_port
    8.29 +    if port.space_to_write_request():
    8.30 +        port.write_request(msg)
    8.31 +        port.notify()
    8.32 +    else:
    8.33 +        xend.netif.pendmsg = msg
    8.34 +
    8.35 +def backend_rx_req(port, msg):
    8.36 +    port.write_response(msg)
    8.37 +
    8.38 +def backend_rx_rsp(port, msg):
    8.39 +    subtype = (msg.get_header())['subtype']
    8.40 +    print "Received netif-be response, subtype %d" % subtype
    8.41 +    if subtype == CMSG_NETIF_BE_CREATE:
    8.42 +        rsp = { 'success': True }
    8.43 +        xend.main.send_management_response(rsp, xend.netif.pendaddr)
    8.44 +    elif subtype == CMSG_NETIF_BE_CONNECT:
    8.45 +        (dom,hnd,evtchn,tx_frame,rx_frame,st) = \
    8.46 +           struct.unpack("QIILLI", msg.get_payload())
    8.47 +        netif = interface.list[xend.main.port_from_dom(dom).local_port]
    8.48 +        msg = xend.utils.message(CMSG_NETIF_FE, \
    8.49 +                                 CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
    8.50 +        msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \
    8.51 +                                       netif.evtchn['port2'], \
    8.52 +                                       netif.mac[0],netif.mac[1], \
    8.53 +                                       netif.mac[2],netif.mac[3], \
    8.54 +                                       netif.mac[4],netif.mac[5], \
    8.55 +                                       0,0))
    8.56 +        netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg)
    8.57 +
    8.58 +def backend_do_work(port):
    8.59 +    global pendmsg
    8.60 +    if pendmsg and port.space_to_write_request():
    8.61 +        port.write_request(pendmsg)
    8.62 +        pendmsg = None
    8.63 +        return True
    8.64 +    return False
    8.65 +
    8.66 +
    8.67 +class interface:
    8.68 +
    8.69 +    # Dictionary of all network-device interfaces.
    8.70 +    list = {}
    8.71 +
    8.72 +
    8.73 +    # NB. 'key' is an opaque value that has no meaning in this class.
    8.74 +    def __init__(self, dom, key):
    8.75 +        self.dom     = dom
    8.76 +        self.key     = key
    8.77 +        self.pendmsg = None
    8.78 +
    8.79 +        # VIFs get a random MAC address with a "special" vendor id.
    8.80 +        # 
    8.81 +        # NB. The vendor is currently an "obsolete" one that used to belong
    8.82 +        # to DEC (AA-00-00). Using it is probably a bit rude :-)
    8.83 +        # 
    8.84 +        # NB2. The first bit of the first random octet is set to zero for
    8.85 +        # all dynamic MAC addresses. This may allow us to manually specify
    8.86 +        # MAC addresses for some VIFs with no fear of clashes.
    8.87 +        self.mac = [ 0xaa, 0x00, 0x00 ]
    8.88 +        self.mac.append(int(random.random()*128))
    8.89 +        self.mac.append(int(random.random()*256))
    8.90 +        self.mac.append(int(random.random()*256))
    8.91 +                
    8.92 +        interface.list[key] = self
    8.93 +        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0)
    8.94 +        msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \
    8.95 +                                       self.mac[0],self.mac[1], \
    8.96 +                                       self.mac[2],self.mac[3], \
    8.97 +                                       self.mac[4],self.mac[5], \
    8.98 +                                       0,0,0))
    8.99 +        xend.netif.pendaddr = xend.main.mgmt_req_addr
   8.100 +        backend_tx_req(msg)
   8.101 +
   8.102 +
   8.103 +    # Completely destroy this interface.
   8.104 +    def destroy(self):
   8.105 +        del interface.list[self.key]
   8.106 +        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0)
   8.107 +        msg.append_payload(struct.pack("QII",self.dom,0,0))
   8.108 +        backend_tx_req(msg)        
   8.109 +
   8.110 +
   8.111 +    # The parameter @port is the control-interface event channel. This method
   8.112 +    # returns True if messages were written to the control interface.
   8.113 +    def ctrlif_transmit_work(self, port):
   8.114 +        if self.pendmsg and port.space_to_write_request():
   8.115 +            port.write_request(self.pendmsg)
   8.116 +            self.pendmsg = None
   8.117 +            return True
   8.118 +        return False
   8.119 +
   8.120 +    def ctrlif_tx_req(self, port, msg):
   8.121 +        if port.space_to_write_request():
   8.122 +            port.write_request(msg)
   8.123 +            port.notify()
   8.124 +        else:
   8.125 +            self.pendmsg = msg
   8.126 +
   8.127 +    def ctrlif_rx_req(self, port, msg):
   8.128 +        port.write_response(msg)
   8.129 +        subtype = (msg.get_header())['subtype']
   8.130 +        if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
   8.131 +            msg = xend.utils.message(CMSG_NETIF_FE, \
   8.132 +                                     CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
   8.133 +            msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \
   8.134 +                                           self.mac[1],self.mac[2], \
   8.135 +                                           self.mac[3],self.mac[4], \
   8.136 +                                           self.mac[5],0,0))
   8.137 +            self.ctrlif_tx_req(port, msg)
   8.138 +        elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT:
   8.139 +            (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload())
   8.140 +            xc = Xc.new()
   8.141 +            self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom)
   8.142 +            msg = xend.utils.message(CMSG_NETIF_BE, \
   8.143 +                                     CMSG_NETIF_BE_CONNECT, 0)
   8.144 +            msg.append_payload(struct.pack("QIILLI",self.dom,0, \
   8.145 +                                           self.evtchn['port1'],tx_frame, \
   8.146 +                                           rx_frame,0))
   8.147 +            backend_tx_req(msg)
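
A sketch of the MAC scheme the interface constructor's comment describes: fixed AA-00-00 vendor prefix, three random octets, with the top bit of the first random octet clear (int(random.random()*128) is always below 128), leaving that range free for manually assigned addresses:

    import random
    mac = [ 0xaa, 0x00, 0x00,
            int(random.random()*128),
            int(random.random()*256),
            int(random.random()*256) ]
    print ':'.join([ '%02x' % b for b in mac ])   # e.g. aa:00:00:3f:9c:02
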
     9.1 --- a/xen/arch/i386/entry.S	Tue May 11 14:57:44 2004 +0000
     9.2 +++ b/xen/arch/i386/entry.S	Tue May 11 15:02:26 2004 +0000
     9.3 @@ -145,16 +145,13 @@ NT_MASK		= 0x00004000
     9.4          pushl %ecx; \
     9.5          pushl %ebx; \
     9.6  
     9.7 -#define SAVE_ALL_NOSTI     \
     9.8 +#define SAVE_ALL \
     9.9          SAVE_ALL_NOSEGREGS \
    9.10          movl $(__HYPERVISOR_DS),%edx; \
    9.11          movl %edx,%ds; \
    9.12          movl %edx,%es; \
    9.13          movl %edx,%fs; \
    9.14          movl %edx,%gs;
    9.15 -
    9.16 -#define SAVE_ALL \
    9.17 -        SAVE_ALL_NOSTI \
    9.18          sti;
    9.19  
    9.20  #define GET_CURRENT(reg)   \
    9.21 @@ -406,7 +403,11 @@ create_bounce_frame:
    9.22          jz   1f /* jump if returning to an existing ring-1 activation */
    9.23          /* obtain ss/esp from TSS -- no current ring-1 activations */
    9.24          movzwl PROCESSOR(%ebx),%eax
    9.25 -        shll $8,%eax /* multiply by 256 */
    9.26 +        /* next 4 lines multiply %eax by 8320, which is sizeof(tss_struct) */
    9.27 +        movl %eax, %ecx
    9.28 +        shll $7, %ecx
    9.29 +        shll $13, %eax
    9.30 +        addl %ecx,%eax
    9.31          addl $init_tss + 12,%eax
    9.32          movl (%eax),%esi /* tss->esp1 */
    9.33  FAULT6: movl 4(%eax),%ds /* tss->ss1  */
    9.34 @@ -529,12 +530,18 @@ error_code:
    9.35  	movl  GS(%esp), %edi		# get the function address
    9.36  	movl  %eax, ORIG_EAX(%esp)
    9.37  	movl  %ecx, GS(%esp)
    9.38 -	movl  %esp,%edx
    9.39 -	pushl %esi			# push the error code
    9.40 -	pushl %edx			# push the pt_regs pointer
    9.41  	movl  $(__HYPERVISOR_DS),%edx
    9.42  	movl  %edx,%ds
    9.43  	movl  %edx,%es
    9.44 +	movl  %edx,%fs
    9.45 +	movl  %edx,%gs
    9.46 +	movl  EFLAGS(%esp),%edx
    9.47 +	testl $0x200,%edx               # Is IF asserted in saved EFLAGS?
    9.48 +	jz    1f                        # Don't STI if it isn't.
    9.49 +	sti
    9.50 +1:	movl  %esp,%edx
    9.51 +	pushl %esi			# push the error code
    9.52 +	pushl %edx			# push the pt_regs pointer
    9.53  	GET_CURRENT(%ebx)
    9.54  	call  *%edi
    9.55          addl  $8,%esp
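
The replacement shift-and-add sequence in create_bounce_frame avoids a MUL by decomposing sizeof(tss_struct): 8320 = 0x2080 = (1<<13) + (1<<7), so eax*8320 == (eax<<13) + (eax<<7). This matches the tss_struct size noted in the processor.h hunk below. A quick arithmetic check:

    for idx in range(4):
        assert (idx << 13) + (idx << 7) == idx * 8320 == idx * 0x2080
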
    10.1 --- a/xen/common/dom_mem_ops.c	Tue May 11 14:57:44 2004 +0000
    10.2 +++ b/xen/common/dom_mem_ops.c	Tue May 11 15:02:26 2004 +0000
    10.3 @@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_st
    10.4      {
    10.5          /* Leave some slack pages; e.g., for the network. */
    10.6          if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
    10.7 -                                   (PAGE_SHIFT-10))) ) 
    10.8 +                                   (PAGE_SHIFT-10))) )
    10.9 +        {
   10.10 +            DPRINTK("Not enough slack: %u %u\n",
   10.11 +                    free_pfns,
   10.12 +                    SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10));
   10.13              break;
   10.14 +        }
   10.15  
   10.16          /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
   10.17          if ( unlikely((page = alloc_domain_page(p)) == NULL) )
   10.18 +        {
   10.19 +            DPRINTK("Could not allocate a frame\n");
   10.20              break;
   10.21 -        
   10.22 +        }
   10.23 +
   10.24          /* Inform the domain of the new page's machine address. */ 
   10.25          mpfn = (unsigned long)(page - frame_table);
   10.26          copy_to_user(op.pages, &mpfn, sizeof(mpfn));
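
The slack test above converts a kilobyte budget to page frames: KB >> (PAGE_SHIFT-10), i.e. divide by 4 with 4KB pages. Illustrative numbers only; the real SLACK_DOMAIN_MEM_KILOBYTES value is not shown in this hunk:

    PAGE_SHIFT = 12
    SLACK_DOMAIN_MEM_KILOBYTES = 1024
    print SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT - 10)   # -> 256 pages
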
    11.1 --- a/xen/common/domain.c	Tue May 11 14:57:44 2004 +0000
    11.2 +++ b/xen/common/domain.c	Tue May 11 15:02:26 2004 +0000
    11.3 @@ -340,6 +340,8 @@ struct pfn_info *alloc_domain_page(struc
    11.4          spin_lock(&p->page_list_lock);
    11.5          if ( unlikely(p->tot_pages >= p->max_pages) )
    11.6          {
    11.7 +            DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
    11.8 +                    p->domain, p->tot_pages, p->max_pages);
    11.9              spin_unlock(&p->page_list_lock);
   11.10              goto free_and_exit;
   11.11          }
   11.12 @@ -894,7 +896,7 @@ int construct_dom0(struct task_struct *p
   11.13          page->type_and_flags  = 0;
   11.14          page->count_and_flags = PGC_allocated | 1;
   11.15          list_add_tail(&page->list, &p->page_list);
   11.16 -        p->tot_pages++;
   11.17 +        p->tot_pages++; p->max_pages++;
   11.18      }
   11.19  
   11.20      mpt_alloc = (vpt_start - v_start) + alloc_start;
    12.1 --- a/xen/common/kernel.c	Tue May 11 14:57:44 2004 +0000
    12.2 +++ b/xen/common/kernel.c	Tue May 11 15:02:26 2004 +0000
    12.3 @@ -105,7 +105,6 @@ static struct {
    12.4  void cmain(unsigned long magic, multiboot_info_t *mbi)
    12.5  {
    12.6      struct task_struct *new_dom;
    12.7 -    dom0_createdomain_t dom0_params;
    12.8      unsigned long max_page;
    12.9      unsigned char *cmdline;
   12.10      module_t *mod = (module_t *)__va(mbi->mods_addr);
   12.11 @@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboo
   12.12      task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
   12.13  
   12.14      /* Create initial domain 0. */
   12.15 -    dom0_params.memory_kb = opt_dom0_mem;
   12.16      new_dom = do_createdomain(0, 0);
   12.17      if ( new_dom == NULL )
   12.18          panic("Error creating domain 0\n");
    13.1 --- a/xen/common/memory.c	Tue May 11 14:57:44 2004 +0000
    13.2 +++ b/xen/common/memory.c	Tue May 11 15:02:26 2004 +0000
    13.3 @@ -415,6 +415,7 @@ static int get_page_from_l1e(l1_pgentry_
    13.4  {
    13.5      unsigned long l1v = l1_pgentry_val(l1e);
    13.6      unsigned long pfn = l1_pgentry_to_pagenr(l1e);
    13.7 +    extern int domain_iomem_in_pfn(struct task_struct *p, unsigned long pfn);
    13.8  
    13.9      if ( !(l1v & _PAGE_PRESENT) )
   13.10          return 1;
   13.11 @@ -428,7 +429,11 @@ static int get_page_from_l1e(l1_pgentry_
   13.12      if ( unlikely(!pfn_is_ram(pfn)) )
   13.13      {
   13.14          if ( IS_PRIV(current) )
   13.15 -            return 1;
   13.16 +            return 1;	
   13.17 +
   13.18 +	if ( IS_CAPABLE_PHYSDEV(current) )
   13.19 +            return domain_iomem_in_pfn(current, pfn);
   13.20 +
   13.21          MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
   13.22          return 0;
   13.23      }
   13.24 @@ -915,7 +920,8 @@ static int do_extended_command(unsigned 
   13.25          break;
   13.26  
   13.27      case MMUEXT_SET_SUBJECTDOM_H:
   13.28 -        percpu_info[cpu].subject_id |= ((domid_t)((ptr&~0xFFFF)|(val>>16)))<<32;
   13.29 +        percpu_info[cpu].subject_id |= 
   13.30 +            ((domid_t)((ptr&~0xFFFF)|(val>>16)))<<32;
   13.31  
   13.32          if ( !IS_PRIV(current) )
   13.33          {
   13.34 @@ -939,6 +945,33 @@ static int do_extended_command(unsigned 
   13.35          }
   13.36          break;
   13.37  
    13.38 +        /* XXX This function is racy! */
   13.39 +    case MMUEXT_REASSIGN_PAGE:
   13.40 +        if ( unlikely(!IS_PRIV(current)) )
   13.41 +        {
   13.42 +            MEM_LOG("Dom %llu has no privilege to reassign page ownership",
   13.43 +                    current->domain);
   13.44 +            okay = 0;
   13.45 +        }
   13.46 +        else if ( likely(percpu_info[cpu].gps != NULL) )
   13.47 +        {
   13.48 +            current->tot_pages--;
   13.49 +            percpu_info[cpu].gps->tot_pages++;
   13.50 +            page->u.domain = percpu_info[cpu].gps;
   13.51 +        }
   13.52 +        else
   13.53 +        {
   13.54 +            MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn);
   13.55 +            okay = 0;
   13.56 +        }
   13.57 +        break;
   13.58 +
   13.59 +    case MMUEXT_RESET_SUBJECTDOM:
   13.60 +        if ( percpu_info[cpu].gps != NULL )
   13.61 +            put_task_struct(percpu_info[cpu].gps);
   13.62 +        percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
   13.63 +        break;
   13.64 +
   13.65      default:
   13.66          MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
   13.67          okay = 0;
    14.1 --- a/xen/common/physdev.c	Tue May 11 14:57:44 2004 +0000
    14.2 +++ b/xen/common/physdev.c	Tue May 11 15:02:26 2004 +0000
    14.3 @@ -202,22 +202,55 @@ int physdev_pci_access_modify(
    14.4                            &p->io_bitmap_sel);
    14.5              }
    14.6          }
    14.7 -        else if ( r->flags & IORESOURCE_MEM )
    14.8 -        {
    14.9 -            /* allow domain to map IO memory for this device */
   14.10 -            INFO("Giving domain %llu memory resources (%lx - %lx) "
   14.11 -                 "for device %s\n", dom, r->start, r->end, pdev->slot_name);
   14.12 -            for ( j = r->start; j < r->end + 1; j += PAGE_SIZE )
   14.13 -                SHARE_PFN_WITH_DOMAIN(frame_table + (j >> PAGE_SHIFT), p);
   14.14 -        }
   14.15 -    }
   14.16  
   14.17 -
   14.18 +        /* rights to IO memory regions are checked when the domain maps them */
   14.19 +	}
   14.20   out:
   14.21      put_task_struct(p);
   14.22      return rc;
   14.23  }
   14.24  
   14.25 +/* Check if a domain controls a device with IO memory within frame @pfn.
   14.26 + * Returns: 1 if the domain should be allowed to map @pfn, 0 otherwise.  */
   14.27 +int domain_iomem_in_pfn(struct task_struct *p, unsigned long pfn)
   14.28 +{
   14.29 +    int ret = 0;
   14.30 +    struct list_head *l;
   14.31 +
   14.32 +    VERBOSE_INFO("Checking if physdev-capable domain %llu needs access to "
   14.33 +                 "pfn %08lx\n", p->domain, pfn);
   14.34 +    
   14.35 +    spin_lock(&p->pcidev_lock);
   14.36 +
   14.37 +    list_for_each(l, &p->pcidev_list)
   14.38 +    {
   14.39 +        int i;
   14.40 +        phys_dev_t *phys_dev = list_entry(l, phys_dev_t, node);
   14.41 +        struct pci_dev *pci_dev = phys_dev->dev;
   14.42 +
   14.43 +        for ( i = 0; (i < DEVICE_COUNT_RESOURCE) && (ret == 0); i++ )
   14.44 +        {
   14.45 +            struct resource *r = &pci_dev->resource[i];
   14.46 +            
   14.47 +            if ( r->flags & IORESOURCE_MEM )
   14.48 +                if ( (r->start >> PAGE_SHIFT) == pfn
   14.49 +                     || (r->end >> PAGE_SHIFT) == pfn
   14.50 +                     || ((r->start >> PAGE_SHIFT < pfn)
   14.51 +                         && (r->end >> PAGE_SHIFT > pfn)) )
   14.52 +                    ret = 1;
   14.53 +        }
   14.54 +
   14.55 +        if ( ret != 0 ) break;
   14.56 +    }
   14.57 +    
   14.58 +    spin_unlock(&p->pcidev_lock);
   14.59 +
   14.60 +    VERBOSE_INFO("Domain %llu %s mapping of pfn %08lx\n",
   14.61 +                 p->domain, ret ? "allowed" : "disallowed", pfn);
   14.62 +
   14.63 +    return ret;
   14.64 +}
   14.65 +
   14.66  /* check if a domain has general access to a device */
   14.67  inline static int check_dev_acc (struct task_struct *p,
   14.68                                   int bus, int dev, int func,
   14.69 @@ -235,8 +268,7 @@ inline static int check_dev_acc (struct 
   14.70      if ( bus > PCI_BUSMAX || dev > PCI_DEVMAX || func > PCI_FUNCMAX )
   14.71          return -EINVAL;
   14.72  
   14.73 -    VERBOSE_INFO("a=%c b=%x d=%x f=%x ", (acc == ACC_READ) ? 'R' : 'W',
   14.74 -                 mask, bus, dev, func);
   14.75 +    VERBOSE_INFO("b=%x d=%x f=%x ", bus, dev, func);
   14.76  
   14.77      /* check target device */
   14.78      target_devfn = PCI_DEVFN(dev, func);
   14.79 @@ -296,8 +328,8 @@ static int do_base_address_access(phys_d
   14.80          /* We could set *val to some value but the guest may well be in trouble
   14.81           * anyway if this write fails.  Hopefully the printk will give us a
   14.82           * clue what went wrong. */
   14.83 -        printk("Guest attempting sub-dword %s to BASE_ADDRESS %d\n", 
   14.84 -             (acc == ACC_READ) ? "read" : "write", idx);
   14.85 +        printk("Guest %llu attempting sub-dword %s to BASE_ADDRESS %d\n",
   14.86 +               pdev->owner->domain, (acc == ACC_READ) ? "read" : "write", idx);
   14.87          
   14.88          return -EPERM;
   14.89      }
   14.90 @@ -328,7 +360,7 @@ static int do_base_address_access(phys_d
   14.91              }
   14.92          }
   14.93          VERBOSE_INFO("fixed pci write: %02x:%02x:%02x reg=0x%02x len=0x%02x"
   14.94 -                     " val=0x%08x %lx\n", 
   14.95 +                     " val=0x%08x %x\n", 
   14.96                       dev->bus->number, PCI_SLOT(dev->devfn), 
   14.97                       PCI_FUNC(dev->devfn), reg, len, *val, pdev->state);
   14.98      }
   14.99 @@ -365,7 +397,7 @@ static int do_base_address_access(phys_d
  14.100              }
  14.101          }
  14.102          VERBOSE_INFO("fixed pci read: %02x:%02x:%02x reg=0x%02x len=0x%02x"
  14.103 -                     " val=0x%08x %lx\n", 
  14.104 +                     " val=0x%08x %x\n", 
  14.105                       dev->bus->number, PCI_SLOT(dev->devfn), 
  14.106                       PCI_FUNC(dev->devfn), reg, len, *val, pdev->state);
  14.107      }
  14.108 @@ -422,9 +454,9 @@ static int do_rom_address_access(phys_de
  14.109              }
  14.110          }
  14.111          VERBOSE_INFO("fixed pci write: %02x:%02x:%02x reg=0x%02x len=0x%02x"
  14.112 -                     " val=0x%08x %lx\n", 
  14.113 +                     " val=0x%08x %x\n", 
  14.114                       dev->bus->number, PCI_SLOT(dev->devfn), 
  14.115 -                     PCI_FUNC(dev->devfn), reg, len, *val, pdev->state);
  14.116 +                     PCI_FUNC(dev->devfn), PCI_ROM_ADDRESS, len, *val, pdev->state);
  14.117      }
  14.118      else if ( acc == ACC_READ )
  14.119      {
  14.120 @@ -442,9 +474,9 @@ static int do_rom_address_access(phys_de
  14.121              *val = *val | (orig_val & 0x1);
  14.122          }
  14.123          VERBOSE_INFO("fixed pci read: %02x:%02x:%02x reg=0x%02x len=0x%02x"
  14.124 -                     " val=0x%08x %lx\n", 
  14.125 +                     " val=0x%08x %x\n", 
  14.126                       dev->bus->number, PCI_SLOT(dev->devfn), 
  14.127 -                     PCI_FUNC(dev->devfn), reg, len, *val, pdev->state);
  14.128 +                     PCI_FUNC(dev->devfn), PCI_ROM_ADDRESS, len, *val, pdev->state);
  14.129      }
  14.130  
  14.131      return ret;
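
The containment test in domain_iomem_in_pfn reduces to a single range check on page frame numbers: a pfn is covered by a resource if it falls anywhere in [start>>PAGE_SHIFT, end>>PAGE_SHIFT]. A condensed sketch with invented resource values:

    PAGE_SHIFT = 12

    def iomem_covers(start, end, pfn):
        return (start >> PAGE_SHIFT) <= pfn <= (end >> PAGE_SHIFT)

    assert iomem_covers(0xfebf0000, 0xfebfffff, 0xfebf0)
    assert not iomem_covers(0xfebf0000, 0xfebfffff, 0xfec00)
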
    15.1 --- a/xen/include/asm-i386/processor.h	Tue May 11 14:57:44 2004 +0000
    15.2 +++ b/xen/include/asm-i386/processor.h	Tue May 11 15:02:26 2004 +0000
    15.3 @@ -375,7 +375,7 @@ struct tss_struct {
    15.4      unsigned short	trace, bitmap;
    15.5      unsigned long	io_bitmap[IO_BITMAP_SIZE+1];
    15.6      /*
    15.7 -     * pads the TSS to be cacheline-aligned (size is 0x100)
    15.8 +     * pads the TSS to be cacheline-aligned (total size is 0x2080)
    15.9       */
   15.10      unsigned long __cacheline_filler[5];
   15.11  };
    16.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h	Tue May 11 14:57:44 2004 +0000
    16.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h	Tue May 11 15:02:26 2004 +0000
    16.3 @@ -127,6 +127,12 @@
    16.4   *   (ptr[31:15],val[31:15]) -- dom[63:32]
    16.5   *   NB. This command must be immediately preceded by SET_SUBJECTDOM_L.
    16.6   * 
    16.7 + *   val[7:0] == MMUEXT_REASSIGN_PAGE:
    16.8 + *   ptr[:2]  -- machine address within page to be reassigned to the GPS.
    16.9 + * 
   16.10 + *   val[7:0] == MMUEXT_RESET_SUBJECTDOM:
   16.11 + *   Resets both the GPS and the PTS to their defaults (i.e., calling domain).
   16.12 + * 
   16.13   * Notes on constraints on the above arguments:
   16.14   *  [1] The page frame containing the machine address must belong to the PTS.
   16.15   *  [2] If the PTE is valid (i.e., bit 0 is set) then the specified page frame
   16.16 @@ -151,6 +157,8 @@
   16.17  #define MMUEXT_SET_SUBJECTDOM_L  9 /* (ptr[31:15],val[31:15]) = dom[31:0]    */
   16.18  #define MMUEXT_SET_SUBJECTDOM_H 10 /* (ptr[31:15],val[31:15]) = dom[63:32]   */
   16.19  #define SET_PAGETABLE_SUBJECTDOM (1<<14) /* OR into 'val' arg of SUBJECTDOM_H*/
   16.20 +#define MMUEXT_REASSIGN_PAGE    11
   16.21 +#define MMUEXT_RESET_SUBJECTDOM 12
   16.22  #define MMUEXT_CMD_MASK        255
   16.23  #define MMUEXT_CMD_SHIFT         8
   16.24  
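
A sketch of the extended-command encoding these definitions extend: the command selector rides in val[7:0], and for MMUEXT_REASSIGN_PAGE the machine address is passed in ptr with the low two bits available to the command (constants copied from the header above; the address is invented):

    MMUEXT_REASSIGN_PAGE = 11
    MMUEXT_CMD_MASK      = 255
    MMUEXT_CMD_SHIFT     = 8

    machine_addr = 0x12345000
    ptr = machine_addr          # ptr[:2] -- low two bits free
    val = MMUEXT_REASSIGN_PAGE  # command selector in val[7:0]
    assert (val & MMUEXT_CMD_MASK) == MMUEXT_REASSIGN_PAGE
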
    17.1 --- a/xenolinux-2.4.26-sparse/arch/xen/config.in	Tue May 11 14:57:44 2004 +0000
    17.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/config.in	Tue May 11 15:02:26 2004 +0000
    17.3 @@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
    17.4     bool 'HIGHMEM I/O support' CONFIG_HIGHIO
    17.5  fi
    17.6  
    17.7 +define_int CONFIG_FORCE_MAX_ZONEORDER 12
    17.8 +
    17.9  #bool 'Symmetric multi-processing support' CONFIG_SMP
   17.10  #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
   17.11  #   define_bool CONFIG_HAVE_DEC_LOCK y
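
CONFIG_FORCE_MAX_ZONEORDER=12 raises the buddy allocator's order limit (orders 0 through 11), presumably so the new backend drivers can obtain large contiguous regions. With 4KB pages the biggest block becomes 2^11 pages; a quick check of the arithmetic:

    PAGE_SIZE = 4096
    print (2 ** (12 - 1)) * PAGE_SIZE   # -> 8388608 bytes (8MB)
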
    18.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig	Tue May 11 14:57:44 2004 +0000
    18.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig	Tue May 11 15:02:26 2004 +0000
    18.3 @@ -50,6 +50,7 @@ CONFIG_X86_TSC=y
    18.4  CONFIG_X86_L1_CACHE_SHIFT=5
    18.5  CONFIG_NOHIGHMEM=y
    18.6  # CONFIG_HIGHMEM4G is not set
    18.7 +CONFIG_FORCE_MAX_ZONEORDER=12
    18.8  
    18.9  #
   18.10  # General setup
   18.11 @@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
   18.12  # Network testing
   18.13  #
   18.14  # CONFIG_NET_PKTGEN is not set
   18.15 +CONFIG_NETDEVICES=y
   18.16  
   18.17  #
   18.18  # Block devices
    19.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev	Tue May 11 14:57:44 2004 +0000
    19.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev	Tue May 11 15:02:26 2004 +0000
    19.3 @@ -51,6 +51,7 @@ CONFIG_X86_TSC=y
    19.4  CONFIG_X86_L1_CACHE_SHIFT=5
    19.5  CONFIG_NOHIGHMEM=y
    19.6  # CONFIG_HIGHMEM4G is not set
    19.7 +CONFIG_FORCE_MAX_ZONEORDER=12
    19.8  
    19.9  #
   19.10  # General setup
   19.11 @@ -89,19 +90,7 @@ CONFIG_BINFMT_ELF=y
   19.12  #
   19.13  # Parallel port support
   19.14  #
   19.15 -CONFIG_PARPORT=y
   19.16 -CONFIG_PARPORT_PC=y
   19.17 -# CONFIG_PARPORT_PC_FIFO is not set
   19.18 -# CONFIG_PARPORT_PC_SUPERIO is not set
   19.19 -# CONFIG_PARPORT_PC_PCMCIA is not set
   19.20 -# CONFIG_PARPORT_AMIGA is not set
   19.21 -# CONFIG_PARPORT_MFC3 is not set
   19.22 -# CONFIG_PARPORT_ATARI is not set
   19.23 -# CONFIG_PARPORT_GSC is not set
   19.24 -# CONFIG_PARPORT_SUNBPP is not set
   19.25 -# CONFIG_PARPORT_IP22 is not set
   19.26 -# CONFIG_PARPORT_OTHER is not set
   19.27 -CONFIG_PARPORT_1284=y
   19.28 +# CONFIG_PARPORT is not set
   19.29  
   19.30  #
   19.31  # Plug and Play configuration
   19.32 @@ -112,7 +101,7 @@ CONFIG_PNP=y
   19.33  #
   19.34  # Block devices
   19.35  #
   19.36 -CONFIG_BLK_DEV_FD=y
   19.37 +# CONFIG_BLK_DEV_FD is not set
   19.38  # CONFIG_BLK_DEV_XD is not set
   19.39  # CONFIG_PARIDE is not set
   19.40  # CONFIG_BLK_CPQ_DA is not set
   19.41 @@ -131,14 +120,14 @@ CONFIG_BLK_DEV_INITRD=y
   19.42  #
   19.43  # Multi-device support (RAID and LVM)
   19.44  #
   19.45 -CONFIG_MD=y
   19.46 -CONFIG_BLK_DEV_MD=y
   19.47 -CONFIG_MD_LINEAR=y
   19.48 -CONFIG_MD_RAID0=y
   19.49 -CONFIG_MD_RAID1=y
   19.50 -CONFIG_MD_RAID5=y
   19.51 -CONFIG_MD_MULTIPATH=y
   19.52 -CONFIG_BLK_DEV_LVM=y
   19.53 +# CONFIG_MD is not set
   19.54 +# CONFIG_BLK_DEV_MD is not set
   19.55 +# CONFIG_MD_LINEAR is not set
   19.56 +# CONFIG_MD_RAID0 is not set
   19.57 +# CONFIG_MD_RAID1 is not set
   19.58 +# CONFIG_MD_RAID5 is not set
   19.59 +# CONFIG_MD_MULTIPATH is not set
   19.60 +# CONFIG_BLK_DEV_LVM is not set
   19.61  
   19.62  #
   19.63  # Networking options
   19.64 @@ -234,7 +223,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
   19.65  #
   19.66  # CONFIG_DEV_APPLETALK is not set
   19.67  # CONFIG_DECNET is not set
   19.68 -# CONFIG_BRIDGE is not set
   19.69 +CONFIG_BRIDGE=y
   19.70  # CONFIG_X25 is not set
   19.71  # CONFIG_LAPB is not set
   19.72  # CONFIG_LLC is not set
   19.73 @@ -380,14 +369,7 @@ CONFIG_CHR_DEV_SG=y
   19.74  # CONFIG_SCSI_AHA1740 is not set
   19.75  CONFIG_SCSI_AACRAID=y
   19.76  # CONFIG_SCSI_AIC7XXX is not set
   19.77 -CONFIG_SCSI_AIC79XX=y
   19.78 -CONFIG_AIC79XX_CMDS_PER_DEVICE=32
   19.79 -CONFIG_AIC79XX_RESET_DELAY_MS=15000
   19.80 -# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
   19.81 -# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
   19.82 -CONFIG_AIC79XX_DEBUG_ENABLE=y
   19.83 -CONFIG_AIC79XX_DEBUG_MASK=0
   19.84 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
   19.85 +# CONFIG_SCSI_AIC79XX is not set
   19.86  # CONFIG_SCSI_AIC7XXX_OLD is not set
   19.87  # CONFIG_SCSI_DPT_I2O is not set
   19.88  # CONFIG_SCSI_ADVANSYS is not set
   19.89 @@ -397,9 +379,9 @@ CONFIG_SCSI_MEGARAID=y
   19.90  # CONFIG_SCSI_MEGARAID2 is not set
   19.91  CONFIG_SCSI_BUSLOGIC=y
   19.92  # CONFIG_SCSI_OMIT_FLASHPOINT is not set
   19.93 -CONFIG_SCSI_CPQFCTS=y
   19.94 +# CONFIG_SCSI_CPQFCTS is not set
   19.95  # CONFIG_SCSI_DMX3191D is not set
   19.96 -CONFIG_SCSI_DTC3280=y
   19.97 +# CONFIG_SCSI_DTC3280 is not set
   19.98  # CONFIG_SCSI_EATA is not set
   19.99  # CONFIG_SCSI_EATA_DMA is not set
  19.100  # CONFIG_SCSI_EATA_PIO is not set
  19.101 @@ -409,15 +391,11 @@ CONFIG_SCSI_DTC3280=y
  19.102  # CONFIG_SCSI_IPS is not set
  19.103  # CONFIG_SCSI_INITIO is not set
  19.104  # CONFIG_SCSI_INIA100 is not set
  19.105 -# CONFIG_SCSI_PPA is not set
  19.106 -# CONFIG_SCSI_IMM is not set
  19.107  # CONFIG_SCSI_NCR53C406A is not set
  19.108  # CONFIG_SCSI_NCR53C7xx is not set
  19.109 -CONFIG_SCSI_SYM53C8XX_2=y
  19.110 -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
  19.111 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
  19.112 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
  19.113 -# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
  19.114 +# CONFIG_SCSI_SYM53C8XX_2 is not set
  19.115 +# CONFIG_SCSI_NCR53C8XX is not set
  19.116 +# CONFIG_SCSI_SYM53C8XX is not set
  19.117  # CONFIG_SCSI_PAS16 is not set
  19.118  # CONFIG_SCSI_PCI2000 is not set
  19.119  # CONFIG_SCSI_PCI2220I is not set
  19.120 @@ -510,9 +488,7 @@ CONFIG_PCNET32=y
  19.121  # CONFIG_APRICOT is not set
  19.122  # CONFIG_B44 is not set
  19.123  # CONFIG_CS89x0 is not set
  19.124 -CONFIG_TULIP=y
  19.125 -# CONFIG_TULIP_MWI is not set
  19.126 -# CONFIG_TULIP_MMIO is not set
  19.127 +# CONFIG_TULIP is not set
  19.128  # CONFIG_DE4X5 is not set
  19.129  # CONFIG_DGRS is not set
  19.130  # CONFIG_DM9102 is not set
  19.131 @@ -545,8 +521,7 @@ CONFIG_TULIP=y
  19.132  #
  19.133  # Ethernet (1000 Mbit)
  19.134  #
  19.135 -CONFIG_ACENIC=y
  19.136 -# CONFIG_ACENIC_OMIT_TIGON_I is not set
  19.137 +# CONFIG_ACENIC is not set
  19.138  # CONFIG_DL2K is not set
  19.139  CONFIG_E1000=y
  19.140  # CONFIG_E1000_NAPI is not set
  19.141 @@ -621,9 +596,6 @@ CONFIG_VT_CONSOLE=y
  19.142  # CONFIG_SERIAL_NONSTANDARD is not set
  19.143  CONFIG_UNIX98_PTYS=y
  19.144  CONFIG_UNIX98_PTY_COUNT=256
  19.145 -# CONFIG_PRINTER is not set
  19.146 -# CONFIG_PPDEV is not set
  19.147 -# CONFIG_TIPAR is not set
  19.148  
  19.149  #
  19.150  # I2C support
  19.151 @@ -869,107 +841,7 @@ CONFIG_DUMMY_CONSOLE=y
  19.152  #
  19.153  # USB support
  19.154  #
  19.155 -CONFIG_USB=y
  19.156 -CONFIG_USB_DEBUG=y
  19.157 -
  19.158 -#
  19.159 -# Miscellaneous USB options
  19.160 -#
  19.161 -# CONFIG_USB_DEVICEFS is not set
  19.162 -# CONFIG_USB_BANDWIDTH is not set
  19.163 -
  19.164 -#
  19.165 -# USB Host Controller Drivers
  19.166 -#
  19.167 -# CONFIG_USB_EHCI_HCD is not set
  19.168 -CONFIG_USB_UHCI=y
  19.169 -# CONFIG_USB_UHCI_ALT is not set
  19.170 -CONFIG_USB_OHCI=y
  19.171 -# CONFIG_USB_SL811HS_ALT is not set
  19.172 -# CONFIG_USB_SL811HS is not set
  19.173 -
  19.174 -#
  19.175 -# USB Device Class drivers
  19.176 -#
  19.177 -# CONFIG_USB_AUDIO is not set
  19.178 -# CONFIG_USB_EMI26 is not set
  19.179 -# CONFIG_USB_BLUETOOTH is not set
  19.180 -# CONFIG_USB_MIDI is not set
  19.181 -# CONFIG_USB_STORAGE is not set
  19.182 -# CONFIG_USB_STORAGE_DEBUG is not set
  19.183 -# CONFIG_USB_STORAGE_DATAFAB is not set
  19.184 -# CONFIG_USB_STORAGE_FREECOM is not set
  19.185 -# CONFIG_USB_STORAGE_ISD200 is not set
  19.186 -# CONFIG_USB_STORAGE_DPCM is not set
  19.187 -# CONFIG_USB_STORAGE_HP8200e is not set
  19.188 -# CONFIG_USB_STORAGE_SDDR09 is not set
  19.189 -# CONFIG_USB_STORAGE_SDDR55 is not set
  19.190 -# CONFIG_USB_STORAGE_JUMPSHOT is not set
  19.191 -# CONFIG_USB_ACM is not set
  19.192 -# CONFIG_USB_PRINTER is not set
  19.193 -
  19.194 -#
  19.195 -# USB Human Interface Devices (HID)
  19.196 -#
  19.197 -# CONFIG_USB_HID is not set
  19.198 -
  19.199 -#
  19.200 -#     Input core support is needed for USB HID input layer or HIDBP support
  19.201 -#
  19.202 -# CONFIG_USB_HIDINPUT is not set
  19.203 -# CONFIG_USB_HIDDEV is not set
  19.204 -# CONFIG_USB_KBD is not set
  19.205 -# CONFIG_USB_MOUSE is not set
  19.206 -# CONFIG_USB_AIPTEK is not set
  19.207 -# CONFIG_USB_WACOM is not set
  19.208 -# CONFIG_USB_KBTAB is not set
  19.209 -# CONFIG_USB_POWERMATE is not set
  19.210 -
  19.211 -#
  19.212 -# USB Imaging devices
  19.213 -#
  19.214 -# CONFIG_USB_DC2XX is not set
  19.215 -# CONFIG_USB_MDC800 is not set
  19.216 -# CONFIG_USB_SCANNER is not set
  19.217 -# CONFIG_USB_MICROTEK is not set
  19.218 -# CONFIG_USB_HPUSBSCSI is not set
  19.219 -
  19.220 -#
  19.221 -# USB Multimedia devices
  19.222 -#
  19.223 -
  19.224 -#
  19.225 -#   Video4Linux support is needed for USB Multimedia device support
  19.226 -#
  19.227 -
  19.228 -#
  19.229 -# USB Network adaptors
  19.230 -#
  19.231 -# CONFIG_USB_PEGASUS is not set
  19.232 -# CONFIG_USB_RTL8150 is not set
  19.233 -# CONFIG_USB_KAWETH is not set
  19.234 -# CONFIG_USB_CATC is not set
  19.235 -# CONFIG_USB_CDCETHER is not set
  19.236 -# CONFIG_USB_USBNET is not set
  19.237 -
  19.238 -#
  19.239 -# USB port drivers
  19.240 -#
  19.241 -# CONFIG_USB_USS720 is not set
  19.242 -
  19.243 -#
  19.244 -# USB Serial Converter support
  19.245 -#
  19.246 -# CONFIG_USB_SERIAL is not set
  19.247 -
  19.248 -#
  19.249 -# USB Miscellaneous drivers
  19.250 -#
  19.251 -# CONFIG_USB_RIO500 is not set
  19.252 -# CONFIG_USB_AUERSWALD is not set
  19.253 -# CONFIG_USB_TIGL is not set
  19.254 -# CONFIG_USB_BRLVGER is not set
  19.255 -# CONFIG_USB_LCD is not set
  19.256 +# CONFIG_USB is not set
  19.257  
  19.258  #
  19.259  # Support for USB gadgets
    20.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h	Tue May 11 14:57:44 2004 +0000
    20.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h	Tue May 11 15:02:26 2004 +0000
    20.3 @@ -10,6 +10,7 @@
    20.4  #include <linux/rbtree.h>
    20.5  #include <linux/interrupt.h>
    20.6  #include <linux/slab.h>
    20.7 +#include <linux/blkdev.h>
    20.8  #include <asm/ctrl_if.h>
    20.9  #include <asm/io.h>
   20.10  #include "../blkif.h"
    21.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c	Tue May 11 14:57:44 2004 +0000
    21.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c	Tue May 11 15:02:26 2004 +0000
    21.3 @@ -74,7 +74,8 @@ void blkif_ctrlif_init(void)
    21.4      ctrl_msg_t                       cmsg;
    21.5      blkif_be_driver_status_changed_t st;
    21.6  
    21.7 -    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
    21.8 +    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
    21.9 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
   21.10  
   21.11      /* Send a driver-UP notification to the domain controller. */
   21.12      cmsg.type      = CMSG_BLKIF_BE;
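
Across these drivers the ctrl_if_register_receiver call gains a third argument: backends register with CALLBACK_IN_BLOCKING_CONTEXT so their message handlers run where they may sleep (which is also why the blkif backend's GFP_ATOMIC allocations become GFP_KERNEL below), while the console passes 0 to keep the default. A minimal sketch of the pattern, assuming only the signature implied by these call sites; example_ctrlif_rx and example_ctrlif_init are hypothetical names:

    #include <asm/ctrl_if.h>

    /* Runs in a blocking (process) context, so GFP_KERNEL allocations
     * and blocking control-channel sends are safe inside the handler. */
    static void example_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
    {
        /* ... dispatch on msg->subtype ... */
    }

    static void example_ctrlif_init(void)
    {
        (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, example_ctrlif_rx,
                                        CALLBACK_IN_BLOCKING_CONTEXT);
    }
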
    22.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c	Tue May 11 14:57:44 2004 +0000
    22.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c	Tue May 11 15:02:26 2004 +0000
    22.3 @@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *cre
    22.4      unsigned int  handle = create->blkif_handle;
    22.5      blkif_t     **pblkif, *blkif;
    22.6  
    22.7 -    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL )
    22.8 +    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
    22.9      {
   22.10          DPRINTK("Could not create blkif: out of memory\n");
   22.11          create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    23.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c	Tue May 11 14:57:44 2004 +0000
    23.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c	Tue May 11 15:02:26 2004 +0000
    23.3 @@ -24,17 +24,15 @@
    23.4  #define MAX_PENDING_REQS 64
    23.5  #define BATCH_PER_DOMAIN 16
    23.6  
    23.7 -static struct vm_struct *mmap_vma;
    23.8 -#define MMAP_PAGES_PER_SEGMENT \
    23.9 -    ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1)
   23.10 +static unsigned long mmap_vstart;
   23.11  #define MMAP_PAGES_PER_REQUEST \
   23.12 -    (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
   23.13 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
   23.14  #define MMAP_PAGES             \
   23.15      (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
   23.16 -#define MMAP_VADDR(_req,_seg)            \
   23.17 -    ((unsigned long)mmap_vma->addr +     \
   23.18 +#define MMAP_VADDR(_req,_seg)                        \
   23.19 +    (mmap_vstart +                                   \
   23.20       ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
   23.21 -     ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE))
   23.22 +     ((_seg) * PAGE_SIZE))
   23.23  
   23.24  /*
   23.25   * Each outstanding request that we've passed to the lower device layers has a 
   23.26 @@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blki
   23.27      prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
   23.28      for ( i = 0; i < req->nr_segments; i++ )
   23.29      {
   23.30 -        if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) )
   23.31 +        /* Make sure the buffer is page-sized. */
   23.32 +        if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) ||
   23.33 +             (blkif_last_sect(req->frame_and_sects[i]) != 7) )
   23.34              goto bad_descriptor;
   23.35          rc = direct_remap_area_pages(&init_mm, 
   23.36                                       MMAP_VADDR(pending_idx, i),
   23.37 -                                     req->buffer_and_sects[i] & PAGE_MASK, 
   23.38 +                                     req->frame_and_sects[i] & PAGE_MASK, 
   23.39                                       PAGE_SIZE, prot, blkif->domid);
   23.40          if ( rc != 0 )
   23.41              goto bad_descriptor;
   23.42 @@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t
   23.43      extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
   23.44      struct buffer_head *bh;
   23.45      int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
   23.46 -    unsigned short nr_sects;
   23.47 -    unsigned long buffer;
   23.48 +    short nr_sects;
   23.49 +    unsigned long buffer, fas;
   23.50      int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
   23.51      pending_req_t *pending_req;
   23.52      pgprot_t       prot;
   23.53  
   23.54      /* We map virtual scatter/gather segments to physical segments. */
   23.55      int new_segs, nr_psegs = 0;
   23.56 -    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
   23.57 +    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
   23.58  
   23.59      /* Check that number of segments is sane. */
   23.60      if ( unlikely(req->nr_segments == 0) || 
   23.61 @@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t
   23.62       */
   23.63      for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
   23.64      {
   23.65 -        buffer   = req->buffer_and_sects[i] & ~0x1FF;
   23.66 -        nr_sects = req->buffer_and_sects[i] &  0x1FF;
   23.67 -
   23.68 -        if ( unlikely(nr_sects == 0) )
   23.69 -            continue;
   23.70 +        fas      = req->frame_and_sects[i];
   23.71 +        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
   23.72 +        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
   23.73  
   23.74 -        if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) )
   23.75 -        {
   23.76 -            DPRINTK("Too many sectors in segment\n");
   23.77 +        if ( nr_sects <= 0 )
   23.78              goto bad_descriptor;
   23.79 -        }
   23.80  
   23.81          phys_seg[nr_psegs].dev           = req->device;
   23.82          phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
   23.83 @@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t
   23.84          }
   23.85    
   23.86          nr_psegs += new_segs;
   23.87 -        ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2);
   23.88 +        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
   23.89      }
   23.90  
   23.91      /* Nonsensical zero-sized request? */
   23.92 @@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t
   23.93  
   23.94      for ( i = 0; i < nr_psegs; i++ )
   23.95      {
   23.96 -        unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 
   23.97 -                            (phys_seg[i].nr_sects << 9) + 
   23.98 -                            (PAGE_SIZE - 1)) & PAGE_MASK;
   23.99          int rc = direct_remap_area_pages(&init_mm, 
  23.100                                           MMAP_VADDR(pending_idx, i),
  23.101                                           phys_seg[i].buffer & PAGE_MASK, 
  23.102 -                                         sz, prot, blkif->domid);
  23.103 +                                         PAGE_SIZE, prot, blkif->domid);
  23.104          if ( rc != 0 )
  23.105          {
  23.106              DPRINTK("invalid buffer\n");
  23.107 @@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t
  23.108                                MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
  23.109              goto bad_descriptor;
  23.110          }
  23.111 +        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
  23.112 +            phys_seg[i].buffer >> PAGE_SHIFT;
  23.113      }
  23.114  
  23.115      pending_req = &pending_reqs[pending_idx];
  23.116 @@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t
  23.117          bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
  23.118          bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
  23.119              (phys_seg[i].buffer & ~PAGE_MASK);
  23.120 +//        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
  23.121          bh->b_end_io        = end_block_io_op;
  23.122          bh->b_private       = pending_req;
  23.123  
  23.124 @@ -456,13 +451,13 @@ static int __init init_module(void)
  23.125  {
  23.126      int i;
  23.127  
  23.128 +    if ( !(start_info.flags & SIF_INITDOMAIN) )
  23.129 +        return 0;
  23.130 +
  23.131      blkif_interface_init();
  23.132  
  23.133 -    if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL )
  23.134 -    {
  23.135 -        printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n");
  23.136 -        return -ENOMEM;
  23.137 -    }
  23.138 +    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
  23.139 +        BUG();
  23.140  
  23.141      pending_cons = 0;
  23.142      pending_prod = MAX_PENDING_REQS;
  23.143 @@ -484,6 +479,7 @@ static int __init init_module(void)
  23.144  
  23.145  static void cleanup_module(void)
  23.146  {
  23.147 +    BUG();
  23.148  }
  23.149  
  23.150  module_init(init_module);
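
The backend no longer ioremaps through a vm_struct: it reserves a block of low-memory pages once at init and carves it into fixed windows, so segment _seg of pending request _req always lands on the same page (twelve pages per request: eleven segments plus one spare, matching the ASSERT that a virtual segment list expands into at most BLKIF_MAX_SEGMENTS_PER_REQUEST+1 physical segments). A standalone sketch of the address arithmetic, with mmap_vstart standing in for the value returned by allocate_empty_lowmem_region and the constants copied from this file:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
    #define MAX_PENDING_REQS       64
    #define MMAP_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
    #define MMAP_PAGES             (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)

    static unsigned long mmap_vstart = 0xd0000000UL;   /* hypothetical base */

    #define MMAP_VADDR(_req,_seg)                        \
        (mmap_vstart +                                   \
         ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
         ((_seg) * PAGE_SIZE))

    int main(void)
    {
        /* 64 requests x 12 pages = 768 reserved pages; windows never overlap. */
        printf("region: %lu pages\n", (unsigned long)MMAP_PAGES);
        printf("req 0, seg 0: %#lx\n", MMAP_VADDR(0, 0));   /* 0xd0000000 */
        printf("req 1, seg 3: %#lx\n", MMAP_VADDR(1, 3));   /* 0xd000f000 */
        return 0;
    }
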
    24.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c	Tue May 11 14:57:44 2004 +0000
    24.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c	Tue May 11 15:02:26 2004 +0000
    24.3 @@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *c
    24.4          }
    24.5      }
    24.6  
    24.7 -    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) )
    24.8 +    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
    24.9      {
   24.10          DPRINTK("vbd_create: out of memory\n");
   24.11          create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
   24.12 @@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow)
   24.13      } 
   24.14  
   24.15      if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 
   24.16 -                               GFP_ATOMIC)) == NULL) )
   24.17 +                               GFP_KERNEL)) == NULL) )
   24.18      {
   24.19          DPRINTK("vbd_grow: out of memory\n");
   24.20          grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    25.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h	Tue May 11 14:57:44 2004 +0000
    25.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h	Tue May 11 15:02:26 2004 +0000
    25.3 @@ -26,19 +26,22 @@
    25.4   */
    25.5  #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
    25.6  
    25.7 -#define BLKIF_MAX_SECTORS_PER_SEGMENT  16
    25.8 -
    25.9  typedef struct {
   25.10      u8             operation;        /* BLKIF_OP_???                         */
   25.11      u8             nr_segments;      /* number of segments                   */
   25.12      blkif_vdev_t   device;           /* only for read/write requests         */
   25.13      unsigned long  id;               /* private guest value, echoed in resp  */
   25.14      blkif_sector_t sector_number;    /* start sector idx on disk (r/w only)  */
   25.15 -    /* Least 9 bits is 'nr_sects'. High 23 bits is the address.       */
   25.16 -    /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */
   25.17 -    unsigned long  buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   25.18 +    /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame.   */
   25.19 +    /* @first_sect: first sector in frame to transfer (inclusive).           */
   25.20 +    /* @last_sect: last sector in frame to transfer (inclusive).             */
   25.21 +    /* @frame: machine page frame number.                                    */
   25.22 +    unsigned long  frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   25.23  } blkif_request_t;
   25.24  
   25.25 +#define blkif_first_sect(_fas) (((_fas)>>3)&7)
   25.26 +#define blkif_last_sect(_fas)  ((_fas)&7)
   25.27 +
   25.28  typedef struct {
   25.29      unsigned long   id;              /* copied from request */
   25.30      u8              operation;       /* copied from request */
   25.31 @@ -79,8 +82,8 @@ typedef struct {
   25.32   *  @device      == unused (zero)
   25.33   *  @id          == any value (echoed in response message)
   25.34   *  @sector_num  == unused (zero)
   25.35 - *  @buffer_and_sects == list of page-aligned, page-sized buffers.
   25.36 - *                       (i.e., nr_sects == 8).
   25.37 + *  @frame_and_sects == list of page-sized buffers.
   25.38 + *                       (i.e., @first_sect == 0, @last_sect == 7).
   25.39   * 
   25.40   * The response is a list of vdisk_t elements copied into the out-of-band
   25.41   * probe buffer. On success the response status field contains the number
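
A worked example of the new descriptor word may help: frame_and_sects keeps the machine frame in the high bits and the inclusive first/last 512-byte sector indices in bits 5:3 and 2:0, so a whole-page buffer is frame|0x07 and a partial extent can never leave its frame. A minimal userspace sketch assuming 4KB pages, using the macros defined above:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

    #define blkif_first_sect(_fas) (((_fas)>>3)&7)
    #define blkif_last_sect(_fas)  ((_fas)&7)

    int main(void)
    {
        unsigned long buffer_ma = 0x12345000UL;  /* page-aligned machine addr */
        unsigned int  fsect = 2, lsect = 5;      /* sectors 2..5, inclusive   */

        /* Encode, as the frontend does when queueing a request. */
        unsigned long fas = (buffer_ma & PAGE_MASK) | (fsect << 3) | lsect;

        /* Decode, as the backend does in dispatch_rw_block_io. */
        unsigned long buffer   = (fas & PAGE_MASK) |
                                 (blkif_first_sect(fas) << 9);
        long          nr_sects = blkif_last_sect(fas) -
                                 blkif_first_sect(fas) + 1;

        /* Prints fas=0x12345015 buffer=0x12345400 nr_sects=4. */
        printf("fas=%#lx buffer=%#lx nr_sects=%ld\n", fas, buffer, nr_sects);
        return 0;
    }
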
    26.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c	Tue May 11 14:57:44 2004 +0000
    26.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c	Tue May 11 15:02:26 2004 +0000
    26.3 @@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linu
    26.4  static unsigned int blkif_state = BLKIF_STATE_CLOSED;
    26.5  static unsigned int blkif_evtchn, blkif_irq;
    26.6  
    26.7 -static struct tq_struct blkif_statechange_tq;
    26.8 -
    26.9  static int blkif_control_rsp_valid;
   26.10  static blkif_response_t blkif_control_rsp;
   26.11  
   26.12 @@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned 
   26.13      struct gendisk     *gd;
   26.14      blkif_request_t    *req;
   26.15      struct buffer_head *bh;
   26.16 +    unsigned int        fsect, lsect;
   26.17  
   26.18 -    if ( unlikely(nr_sectors >= (1<<9)) )
   26.19 -        BUG();
   26.20 +    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
   26.21 +    lsect = fsect + nr_sectors - 1;
   26.22 +
   26.23 +    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
   26.24      if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
   26.25          BUG();
   26.26 +    if ( lsect > 7 )
   26.27 +        BUG();
   26.28 +
   26.29 +    buffer_ma &= PAGE_MASK;
   26.30  
   26.31      if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
   26.32          return 1;
   26.33 @@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned 
   26.34              bh = (struct buffer_head *)id;
   26.35              bh->b_reqnext = (struct buffer_head *)req->id;
   26.36              req->id = id;
   26.37 -            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
   26.38 -            if ( ++req->nr_segments < MAX_BLK_SEGS )
   26.39 +            req->frame_and_sects[req->nr_segments] = 
   26.40 +                buffer_ma | (fsect<<3) | lsect;
   26.41 +            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
   26.42                  sg_next_sect += nr_sectors;
   26.43              else
   26.44                  DISABLE_SCATTERGATHER();
   26.45 @@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned 
   26.46      req->sector_number = (blkif_sector_t)sector_number;
   26.47      req->device        = device; 
   26.48      req->nr_segments   = 1;
   26.49 -    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
   26.50 +    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
   26.51      req_prod++;
   26.52  
   26.53      return 0;
   26.54 @@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t 
   26.55  }
   26.56  
   26.57  
   26.58 -static void blkif_bringup_phase1(void *unused)
   26.59 +static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
   26.60  {
   26.61      ctrl_msg_t                   cmsg;
   26.62      blkif_fe_interface_connect_t up;
   26.63  
   26.64 -    /* Move from CLOSED to DISCONNECTED state. */
   26.65 -    blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
   26.66 -    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
   26.67 -    blkif_state  = BLKIF_STATE_DISCONNECTED;
   26.68 -
   26.69 -    /* Construct an interface-CONNECT message for the domain controller. */
   26.70 -    cmsg.type      = CMSG_BLKIF_FE;
   26.71 -    cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
   26.72 -    cmsg.length    = sizeof(blkif_fe_interface_connect_t);
   26.73 -    up.handle      = 0;
   26.74 -    up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
   26.75 -    memcpy(cmsg.msg, &up, sizeof(up));
   26.76 -
   26.77 -    /* Tell the controller to bring up the interface. */
   26.78 -    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
   26.79 -}
   26.80 -
   26.81 -static void blkif_bringup_phase2(void *unused)
   26.82 -{
   26.83 -    blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
   26.84 -    (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
   26.85 -
   26.86 -    /* Probe for discs that are attached to the interface. */
   26.87 -    xlvbd_init();
   26.88 -
   26.89 -    blkif_state = BLKIF_STATE_CONNECTED;
   26.90 -
   26.91 -    /* Kick pending requests. */
   26.92 -    spin_lock_irq(&io_request_lock);
   26.93 -    kick_pending_request_queues();
   26.94 -    spin_unlock_irq(&io_request_lock);
   26.95 -}
   26.96 -
   26.97 -static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
   26.98 -{
   26.99      if ( status->handle != 0 )
  26.100      {
  26.101          printk(KERN_WARNING "Status change on unsupported blkif %d\n",
  26.102 @@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe
  26.103                     " in state %d\n", blkif_state);
  26.104              break;
  26.105          }
  26.106 -        blkif_statechange_tq.routine = blkif_bringup_phase1;
  26.107 -        schedule_task(&blkif_statechange_tq);
  26.108 +
  26.109 +        /* Move from CLOSED to DISCONNECTED state. */
  26.110 +        blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
  26.111 +        blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
  26.112 +        blkif_state  = BLKIF_STATE_DISCONNECTED;
  26.113 +
  26.114 +        /* Construct an interface-CONNECT message for the domain controller. */
  26.115 +        cmsg.type      = CMSG_BLKIF_FE;
  26.116 +        cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
  26.117 +        cmsg.length    = sizeof(blkif_fe_interface_connect_t);
  26.118 +        up.handle      = 0;
  26.119 +        up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
  26.120 +        memcpy(cmsg.msg, &up, sizeof(up));
  26.121 +        
  26.122 +        /* Tell the controller to bring up the interface. */
  26.123 +        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  26.124          break;
  26.125  
  26.126      case BLKIF_INTERFACE_STATUS_CONNECTED:
  26.127 @@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe
  26.128                     " in state %d\n", blkif_state);
  26.129              break;
  26.130          }
  26.131 +
  26.132          blkif_evtchn = status->evtchn;
  26.133 -        blkif_statechange_tq.routine = blkif_bringup_phase2;
  26.134 -        schedule_task(&blkif_statechange_tq);
  26.135 +        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
  26.136 +        (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
  26.137 +        
  26.138 +        /* Probe for discs that are attached to the interface. */
  26.139 +        xlvbd_init();
  26.140 +        
  26.141 +        blkif_state = BLKIF_STATE_CONNECTED;
  26.142 +        
  26.143 +        /* Kick pending requests. */
  26.144 +        spin_lock_irq(&io_request_lock);
  26.145 +        kick_pending_request_queues();
  26.146 +        spin_unlock_irq(&io_request_lock);
  26.147          break;
  26.148  
  26.149      default:
  26.150 @@ -675,7 +671,11 @@ int __init xlblk_init(void)
  26.151      ctrl_msg_t                       cmsg;
  26.152      blkif_fe_driver_status_changed_t st;
  26.153  
  26.154 -    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx);
  26.155 +    if ( start_info.flags & SIF_INITDOMAIN )
  26.156 +        return 0;
  26.157 +
  26.158 +    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
  26.159 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
  26.160  
  26.161      /* Send a driver-UP notification to the domain controller. */
  26.162      cmsg.type      = CMSG_BLKIF_FE;
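
The frontend derives fsect/lsect from the buffer's machine address and the sector count; a buffer that is not sector-aligned, or an extent that would spill past sector 7 of its frame, is a hard BUG. A small sketch of those checks under the same 512-byte-sector assumption; validate_extent is a hypothetical standalone helper mirroring blkif_queue_request:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

    /* Hypothetical helper: checks an extent and returns the encoded word. */
    static unsigned long validate_extent(unsigned long buffer_ma,
                                         unsigned long nr_sectors)
    {
        unsigned int fsect = (buffer_ma & ~PAGE_MASK) >> 9;
        unsigned int lsect = fsect + nr_sectors - 1;

        assert((buffer_ma & ((1 << 9) - 1)) == 0);  /* sector-aligned     */
        assert(lsect <= 7);                         /* stays in one frame */

        return (buffer_ma & PAGE_MASK) | (fsect << 3) | lsect;
    }

    int main(void)
    {
        /* Sectors 4..7 of one frame: fsect=4, lsect=7 -- accepted. */
        printf("%#lx\n", validate_extent(0x20000800UL, 4));
        return 0;
    }
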
    27.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Tue May 11 14:57:44 2004 +0000
    27.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Tue May 11 15:02:26 2004 +0000
    27.3 @@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *d
    27.4      memset(&req, 0, sizeof(req));
    27.5      req.operation   = BLKIF_OP_PROBE;
    27.6      req.nr_segments = 1;
    27.7 -    req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
    27.8 +    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
    27.9  
   27.10      blkif_control_send(&req, &rsp);
   27.11  
    28.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c	Tue May 11 14:57:44 2004 +0000
    28.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c	Tue May 11 15:02:26 2004 +0000
    28.3 @@ -512,7 +512,7 @@ static int __init xencons_init(void)
    28.4      }
    28.5      else
    28.6      {
    28.7 -        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx);
    28.8 +        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0);
    28.9      }
   28.10  
   28.11      printk("Xen virtual console successfully installed\n");
    29.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/common.h	Tue May 11 14:57:44 2004 +0000
    29.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/common.h	Tue May 11 15:02:26 2004 +0000
    29.3 @@ -16,6 +16,7 @@
    29.4  #include <asm/ctrl_if.h>
    29.5  #include <asm/io.h>
    29.6  #include "../netif.h"
    29.7 +#include "../../../../../net/bridge/br_private.h"
    29.8  
    29.9  #ifndef NDEBUG
   29.10  #define ASSERT(_p) \
   29.11 @@ -28,7 +29,7 @@
   29.12  #define DPRINTK(_f, _a...) ((void)0)
   29.13  #endif
   29.14  
   29.15 -typedef struct {
   29.16 +typedef struct netif_st {
   29.17      /* Unique identifier for this interface. */
   29.18      domid_t          domid;
   29.19      unsigned int     handle;
   29.20 @@ -49,13 +50,7 @@ typedef struct {
   29.21      NETIF_RING_IDX tx_req_cons;
   29.22      NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */
   29.23  
   29.24 -    /* Usage accounting */
   29.25 -    long long total_bytes_sent;
   29.26 -    long long total_bytes_received;
   29.27 -    long long total_packets_sent;
   29.28 -    long long total_packets_received;
   29.29 -
   29.30 -    /* Trasnmit shaping: allow 'credit_bytes' every 'credit_usec'. */
   29.31 +    /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
   29.32      unsigned long   credit_bytes;
   29.33      unsigned long   credit_usec;
   29.34      unsigned long   remaining_credit;
   29.35 @@ -72,7 +67,8 @@ typedef struct {
   29.36      struct list_head list;  /* scheduling list */
   29.37      atomic_t         refcnt;
   29.38      spinlock_t       rx_lock, tx_lock;
   29.39 -    unsigned char    vmac[ETH_ALEN];
   29.40 +    struct net_device *dev;
   29.41 +    struct net_device_stats stats;
   29.42  } netif_t;
   29.43  
   29.44  void netif_create(netif_be_create_t *create);
   29.45 @@ -93,6 +89,8 @@ void netif_ctrlif_init(void);
   29.46  
   29.47  void netif_deschedule(netif_t *netif);
   29.48  
   29.49 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
   29.50 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
   29.51  void netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
   29.52  
   29.53  #endif /* __NETIF__BACKEND__COMMON_H__ */
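
netif_t now doubles as the private state of an ordinary Linux net_device: the per-interface vmac and the hand-rolled usage counters give way to dev and a standard net_device_stats, so each backend interface can be opened, given a MAC, and enslaved to the bridge like any other device. A sketch of the allocation pattern (the full version is netif_create in interface.c, below); make_backend_if is a hypothetical condensation:

    #include "common.h"  /* netif_t, netif_be_start_xmit, netif_be_get_stats */

    static netif_t *make_backend_if(void)
    {
        struct net_device *dev;
        netif_t *netif;

        /* priv storage for the netif_t is allocated with the device. */
        dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup);
        if ( dev == NULL )
            return NULL;

        netif = dev->priv;
        memset(netif, 0, sizeof(*netif));
        netif->dev = dev;

        dev->hard_start_xmit = netif_be_start_xmit;
        dev->get_stats       = netif_be_get_stats;
        return netif;
    }
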
    30.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c	Tue May 11 14:57:44 2004 +0000
    30.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c	Tue May 11 15:02:26 2004 +0000
    30.3 @@ -10,8 +10,6 @@
    30.4  
    30.5  static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
    30.6  {
    30.7 -    DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype);
    30.8 -    
    30.9      switch ( msg->subtype )
   30.10      {
   30.11      case CMSG_NETIF_BE_CREATE:
   30.12 @@ -54,7 +52,8 @@ void netif_ctrlif_init(void)
   30.13      ctrl_msg_t                       cmsg;
   30.14      netif_be_driver_status_changed_t st;
   30.15  
   30.16 -    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx);
   30.17 +    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
   30.18 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
   30.19  
   30.20      /* Send a driver-UP notification to the domain controller. */
   30.21      cmsg.type      = CMSG_NETIF_BE;
    31.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c	Tue May 11 14:57:44 2004 +0000
    31.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c	Tue May 11 15:02:26 2004 +0000
    31.3 @@ -7,13 +7,15 @@
    31.4   */
    31.5  
    31.6  #include "common.h"
    31.7 +#include <linux/rtnetlink.h>
    31.8  
    31.9  #define NETIF_HASHSZ 1024
   31.10  #define NETIF_HASH(_d,_h) \
   31.11      (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(NETIF_HASHSZ-1))
   31.12  
   31.13 -static kmem_cache_t *netif_cachep;
   31.14 -static netif_t      *netif_hash[NETIF_HASHSZ];
   31.15 +static netif_t *netif_hash[NETIF_HASHSZ];
   31.16 +static struct net_device *bridge_dev;
   31.17 +static struct net_bridge *bridge_br;
   31.18  
   31.19  netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
   31.20  {
   31.21 @@ -35,7 +37,11 @@ void __netif_disconnect_complete(netif_t
   31.22       * must still be notified to the remote driver.
   31.23       */
   31.24      unbind_evtchn_from_irq(netif->evtchn);
   31.25 -    vfree(netif->net_ring_base);
   31.26 +    vfree(netif->tx); /* Frees netif->rx as well. */
   31.27 +    rtnl_lock();
   31.28 +    (void)br_del_if(bridge_br, netif->dev);
   31.29 +    (void)dev_close(netif->dev);
   31.30 +    rtnl_unlock();
   31.31  
   31.32      /* Construct the deferred response message. */
   31.33      cmsg.type         = CMSG_NETIF_BE;
   31.34 @@ -66,24 +72,32 @@ void __netif_disconnect_complete(netif_t
   31.35  
   31.36  void netif_create(netif_be_create_t *create)
   31.37  {
   31.38 -    domid_t       domid  = create->domid;
   31.39 -    unsigned int  handle = create->netif_handle;
   31.40 -    netif_t     **pnetif, *netif;
   31.41 +    domid_t            domid  = create->domid;
   31.42 +    unsigned int       handle = create->netif_handle;
   31.43 +    struct net_device *dev;
   31.44 +    netif_t          **pnetif, *netif;
   31.45  
   31.46 -    if ( (netif = kmem_cache_alloc(netif_cachep, GFP_ATOMIC)) == NULL )
   31.47 +    dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup);
   31.48 +    if ( dev == NULL )
   31.49      {
   31.50          DPRINTK("Could not create netif: out of memory\n");
   31.51          create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
   31.52          return;
   31.53      }
   31.54  
   31.55 +    netif = dev->priv;
   31.56      memset(netif, 0, sizeof(*netif));
   31.57      netif->domid  = domid;
   31.58      netif->handle = handle;
   31.59      netif->status = DISCONNECTED;
   31.60 -    spin_lock_init(&netif->vbd_lock);
   31.61 -    spin_lock_init(&netif->net_ring_lock);
   31.62 +    spin_lock_init(&netif->rx_lock);
   31.63 +    spin_lock_init(&netif->tx_lock);
   31.64      atomic_set(&netif->refcnt, 0);
   31.65 +    netif->dev = dev;
   31.66 +
   31.67 +    netif->credit_bytes = netif->remaining_credit = ~0UL;
   31.68 +    netif->credit_usec  = 0UL;
   31.69 +    /*init_ac_timer(&new_vif->credit_timeout);*/
   31.70  
   31.71      pnetif = &netif_hash[NETIF_HASH(domid, handle)];
   31.72      while ( *pnetif != NULL )
   31.73 @@ -92,12 +106,27 @@ void netif_create(netif_be_create_t *cre
   31.74          {
   31.75              DPRINTK("Could not create netif: already exists\n");
   31.76              create->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
   31.77 -            kmem_cache_free(netif_cachep, netif);
   31.78 +            kfree(dev);
   31.79              return;
   31.80          }
   31.81          pnetif = &(*pnetif)->hash_next;
   31.82      }
   31.83  
   31.84 +    dev->hard_start_xmit = netif_be_start_xmit;
   31.85 +    dev->get_stats       = netif_be_get_stats;
   31.86 +    memcpy(dev->dev_addr, create->mac, ETH_ALEN);
   31.87 +
   31.88 +    /* XXX In bridge mode we should force a different MAC from remote end. */
   31.89 +    dev->dev_addr[2] ^= 1;
   31.90 +
   31.91 +    if ( register_netdev(dev) != 0 )
   31.92 +    {
   31.93 +        DPRINTK("Could not register new net device\n");
   31.94 +        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
   31.95 +        kfree(dev);
   31.96 +        return;
   31.97 +    }
   31.98 +
   31.99      netif->hash_next = *pnetif;
  31.100      *pnetif = netif;
  31.101  
  31.102 @@ -132,8 +161,8 @@ void netif_destroy(netif_be_destroy_t *d
  31.103  
  31.104   destroy:
  31.105      *pnetif = netif->hash_next;
  31.106 -    destroy_all_vbds(netif);
  31.107 -    kmem_cache_free(netif_cachep, netif);
  31.108 +    unregister_netdev(netif->dev);
  31.109 +    kfree(netif->dev);
  31.110      destroy->status = NETIF_BE_STATUS_OKAY;
  31.111  }
  31.112  
  31.113 @@ -142,11 +171,13 @@ void netif_connect(netif_be_connect_t *c
  31.114      domid_t       domid  = connect->domid;
  31.115      unsigned int  handle = connect->netif_handle;
  31.116      unsigned int  evtchn = connect->evtchn;
  31.117 -    unsigned long shmem_frame = connect->shmem_frame;
  31.118 +    unsigned long tx_shmem_frame = connect->tx_shmem_frame;
  31.119 +    unsigned long rx_shmem_frame = connect->rx_shmem_frame;
  31.120      struct vm_struct *vma;
  31.121      pgprot_t      prot;
  31.122      int           error;
  31.123      netif_t      *netif;
  31.124 +    struct net_device *eth0_dev;
  31.125  
  31.126      netif = netif_find_by_handle(domid, handle);
  31.127      if ( unlikely(netif == NULL) )
  31.128 @@ -157,16 +188,27 @@ void netif_connect(netif_be_connect_t *c
  31.129          return;
  31.130      }
  31.131  
  31.132 -    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
  31.133 +    if ( netif->status != DISCONNECTED )
  31.134 +    {
  31.135 +        connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
  31.136 +        return;
  31.137 +    }
  31.138 +
  31.139 +    if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL )
  31.140      {
  31.141          connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
  31.142          return;
  31.143      }
  31.144  
  31.145      prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
  31.146 -    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
  31.147 -                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
  31.148 -                                    prot, domid);
  31.149 +    error  = direct_remap_area_pages(&init_mm, 
  31.150 +                                     VMALLOC_VMADDR(vma->addr),
  31.151 +                                     tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
  31.152 +                                     prot, domid);
  31.153 +    error |= direct_remap_area_pages(&init_mm, 
  31.154 +                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
  31.155 +                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
  31.156 +                                     prot, domid);
  31.157      if ( error != 0 )
  31.158      {
  31.159          if ( error == -ENOMEM )
  31.160 @@ -179,21 +221,39 @@ void netif_connect(netif_be_connect_t *c
  31.161          return;
  31.162      }
  31.163  
  31.164 -    if ( netif->status != DISCONNECTED )
  31.165 +    netif->evtchn         = evtchn;
  31.166 +    netif->irq            = bind_evtchn_to_irq(evtchn);
  31.167 +    netif->tx_shmem_frame = tx_shmem_frame;
  31.168 +    netif->rx_shmem_frame = rx_shmem_frame;
  31.169 +    netif->tx             = 
  31.170 +        (netif_tx_interface_t *)vma->addr;
  31.171 +    netif->rx             = 
  31.172 +        (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
  31.173 +    netif->status         = CONNECTED;
  31.174 +    netif_get(netif);
  31.175 +
  31.176 +    rtnl_lock();
  31.177 +
  31.178 +    (void)dev_open(netif->dev);
  31.179 +    (void)br_add_if(bridge_br, netif->dev);
  31.180 +
  31.181 +    /*
  31.182 +     * The default config is a very simple binding to eth0.
  31.183 +     * If eth0 is being used as an IP interface by this OS then someone
  31.184 +     * must add eth0's IP address to nbe-br, and change the routing table
  31.185 +     * to refer to nbe-br instead of eth0.
  31.186 +     */
  31.187 +    (void)dev_open(bridge_dev);
  31.188 +    if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
  31.189      {
  31.190 -        connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
  31.191 -        vfree(vma->addr);
  31.192 -        return;
  31.193 +        (void)dev_open(eth0_dev);
  31.194 +        (void)br_add_if(bridge_br, eth0_dev);
  31.195      }
  31.196  
  31.197 -    netif->evtchn        = evtchn;
  31.198 -    netif->irq           = bind_evtchn_to_irq(evtchn);
  31.199 -    netif->shmem_frame   = shmem_frame;
  31.200 -    netif->net_ring_base = (netif_ring_t *)vma->addr;
  31.201 -    netif->status        = CONNECTED;
  31.202 -    netif_get(netif);
  31.203 +    rtnl_unlock();
  31.204  
  31.205 -    request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif);
  31.206 +    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
  31.207 +    netif_start_queue(netif->dev);
  31.208  
  31.209      connect->status = NETIF_BE_STATUS_OKAY;
  31.210  }
  31.211 @@ -218,6 +278,7 @@ int netif_disconnect(netif_be_disconnect
  31.212          netif->status = DISCONNECTING;
  31.213          netif->disconnect_rspid = rsp_id;
  31.214          wmb(); /* Let other CPUs see the status change. */
  31.215 +        netif_stop_queue(netif->dev);
  31.216          free_irq(netif->irq, NULL);
  31.217          netif_deschedule(netif);
  31.218          netif_put(netif);
  31.219 @@ -226,105 +287,14 @@ int netif_disconnect(netif_be_disconnect
  31.220      return 0; /* Caller should not send response message. */
  31.221  }
  31.222  
  31.223 -net_vif_t *create_net_vif(domid_t dom)
  31.224 -{
  31.225 -    unsigned int idx;
  31.226 -    net_vif_t *new_vif = NULL;
  31.227 -    net_ring_t *new_ring = NULL;
  31.228 -    struct task_struct *p = NULL;
  31.229 -    unsigned long flags, vmac_hash;
  31.230 -    unsigned char vmac_key[ETH_ALEN + 2 + MAX_DOMAIN_NAME];
  31.231 -
  31.232 -    if ( (p = find_domain_by_id(dom)) == NULL )
  31.233 -        return NULL;
  31.234 -    
  31.235 -    write_lock_irqsave(&tasklist_lock, flags);
  31.236 -
  31.237 -    for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ )
  31.238 -        if ( p->net_vif_list[idx] == NULL )
  31.239 -            break;
  31.240 -    if ( idx == MAX_DOMAIN_VIFS )
  31.241 -        goto fail;
  31.242 -
  31.243 -    if ( (new_vif = kmem_cache_alloc(net_vif_cache, GFP_KERNEL)) == NULL )
  31.244 -        goto fail;
  31.245 -
  31.246 -    memset(new_vif, 0, sizeof(*new_vif));
  31.247 -    
  31.248 -    if ( sizeof(net_ring_t) > PAGE_SIZE )
  31.249 -        BUG();
  31.250 -    new_ring = (net_ring_t *)get_free_page(GFP_KERNEL);
  31.251 -    clear_page(new_ring);
  31.252 -    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
  31.253 -
  31.254 -    /*
  31.255 -     * Fill in the new vif struct. Note that, while the vif's refcnt is
  31.256 -     * non-zero, we hold a reference to the task structure.
  31.257 -     */
  31.258 -    atomic_set(&new_vif->refcnt, 1);
  31.259 -    new_vif->shared_rings = new_ring;
  31.260 -    new_vif->shared_idxs  = &p->shared_info->net_idx[idx];
  31.261 -    new_vif->domain       = p;
  31.262 -    new_vif->idx          = idx;
  31.263 -    new_vif->list.next    = NULL;
  31.264 -    spin_lock_init(&new_vif->rx_lock);
  31.265 -    spin_lock_init(&new_vif->tx_lock);
  31.266 -
  31.267 -    new_vif->credit_bytes = new_vif->remaining_credit = ~0UL;
  31.268 -    new_vif->credit_usec  = 0UL;
  31.269 -    init_ac_timer(&new_vif->credit_timeout);
  31.270 -
  31.271 -    if ( (p->domain == 0) && (idx == 0) )
  31.272 -    {
  31.273 -        /*
  31.274 -         * DOM0/VIF0 gets the real physical MAC address, so that users can
  31.275 -         * easily get a Xen-based machine up and running by using an existing
  31.276 -         * DHCP entry.
  31.277 -         */
  31.278 -        memcpy(new_vif->vmac, the_dev->dev_addr, ETH_ALEN);
  31.279 -    }
  31.280 -    else
  31.281 -    {
  31.282 -        /*
  31.283 -         * Most VIFs get a random MAC address with a "special" vendor id.
  31.284 -         * We try to get MAC addresses to be unique across multiple servers
  31.285 -         * by including the physical MAC address in the hash. The hash also
  31.286 -         * includes the vif index and the domain's name.
  31.287 -         * 
  31.288 -         * NB. The vendor is currently an "obsolete" one that used to belong
  31.289 -         * to DEC (AA-00-00). Using it is probably a bit rude :-)
  31.290 -         * 
  31.291 -         * NB2. The first bit of the first random octet is set to zero for
  31.292 -         * all dynamic MAC addresses. This may allow us to manually specify
  31.293 -         * MAC addresses for some VIFs with no fear of clashes.
  31.294 -         */
  31.295 -        memcpy(&vmac_key[0], the_dev->dev_addr, ETH_ALEN);
  31.296 -        *(__u16 *)(&vmac_key[ETH_ALEN]) = htons(idx);
  31.297 -        strcpy(&vmac_key[ETH_ALEN+2], p->name);
  31.298 -        vmac_hash = hash(vmac_key, ETH_ALEN + 2 + strlen(p->name));
  31.299 -        memcpy(new_vif->vmac, "\xaa\x00\x00", 3);
  31.300 -        new_vif->vmac[3] = (vmac_hash >> 16) & 0xef; /* First bit is zero. */
  31.301 -        new_vif->vmac[4] = (vmac_hash >>  8) & 0xff;
  31.302 -        new_vif->vmac[5] = (vmac_hash >>  0) & 0xff;
  31.303 -    }
  31.304 -
  31.305 -    p->net_vif_list[idx] = new_vif;
  31.306 -    
  31.307 -    write_unlock_irqrestore(&tasklist_lock, flags);
  31.308 -    return new_vif;
  31.309 -    
  31.310 - fail:
  31.311 -    write_unlock_irqrestore(&tasklist_lock, flags);
  31.312 -    if ( new_vif != NULL )
  31.313 -        kmem_cache_free(net_vif_cache, new_vif);
  31.314 -    if ( p != NULL )
  31.315 -        put_task_struct(p);
  31.316 -    return NULL;
  31.317 -}
  31.318 -
  31.319  void netif_interface_init(void)
  31.320  {
  31.321 -    netif_cachep = kmem_cache_create("netif_cache", sizeof(netif_t), 
  31.322 -                                     0, 0, NULL, NULL);
  31.323      memset(netif_hash, 0, sizeof(netif_hash));
  31.324 +    if ( br_add_bridge("nbe-br") != 0 )
  31.325 +        BUG();
  31.326 +    bridge_dev = __dev_get_by_name("nbe-br");
  31.327 +    bridge_br  = (struct net_bridge *)bridge_dev->priv;
  31.328 +    bridge_br->bridge_hello_time = bridge_br->hello_time = 0;
  31.329 +    bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
  31.330 +    bridge_br->stp_enabled = 0;
  31.331  }
    32.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Tue May 11 14:57:44 2004 +0000
    32.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Tue May 11 15:02:26 2004 +0000
    32.3 @@ -11,7 +11,10 @@
    32.4   */
    32.5  
    32.6  #include "common.h"
    32.7 +#include <asm/hypervisor-ifs/dom_mem_ops.h>
    32.8  
    32.9 +static void net_tx_action(unsigned long unused);
   32.10 +static void netif_page_release(struct page *page);
   32.11  static void make_tx_response(netif_t *netif, 
   32.12                               u16      id,
   32.13                               s8       st);
   32.14 @@ -21,38 +24,131 @@ static void make_rx_response(netif_t    
   32.15                               netif_addr_t addr,
   32.16                               u16          size);
   32.17  
   32.18 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
   32.19 +
   32.20  /* Don't currently gate addition of an interface to the tx scheduling list. */
   32.21  #define tx_work_exists(_if) (1)
   32.22  
   32.23  #define MAX_PENDING_REQS 256
   32.24 -static struct vm_struct *mmap_vma;
   32.25 -#define MMAP_VADDR(_req) ((unsigned long)mmap_vma->addr + ((_req) * PAGE_SIZE))
   32.26 +static unsigned long mmap_vstart;
   32.27 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
   32.28  
   32.29 -/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/
   32.30 +#define PKT_PROT_LEN (ETH_HLEN + 20)
   32.31 +
   32.32 +static u16 pending_id[MAX_PENDING_REQS];
   32.33 +static netif_t *pending_netif[MAX_PENDING_REQS];
   32.34  static u16 pending_ring[MAX_PENDING_REQS];
   32.35  static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
   32.36 -/* NB. We use a different index type to differentiate from shared blk rings. */
   32.37  typedef unsigned int PEND_RING_IDX;
   32.38  #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
   32.39  static PEND_RING_IDX pending_prod, pending_cons;
   32.40  #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
   32.41  
   32.42 +static struct list_head net_schedule_list;
   32.43 +static spinlock_t net_schedule_list_lock;
   32.44 +
   32.45 +#define MAX_MFN_ALLOC 64
   32.46 +static unsigned long mfn_list[MAX_MFN_ALLOC];
   32.47 +static unsigned int alloc_index = 0;
   32.48 +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
   32.49 +static void __refresh_mfn_list(void)
   32.50 +{
   32.51 +    int ret;
   32.52 +    dom_mem_op_t op;
   32.53 +    op.op = MEMOP_RESERVATION_INCREASE;
   32.54 +    op.u.increase.size  = MAX_MFN_ALLOC;
   32.55 +    op.u.increase.pages = mfn_list;
   32.56 +    if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC )
   32.57 +    {
   32.58 +        printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret);
   32.59 +        BUG();
   32.60 +    }
   32.61 +    alloc_index = MAX_MFN_ALLOC;
   32.62 +}
   32.63 +static unsigned long get_new_mfn(void)
   32.64 +{
   32.65 +    unsigned long mfn, flags;
   32.66 +    spin_lock_irqsave(&mfn_lock, flags);
   32.67 +    if ( alloc_index == 0 )
   32.68 +        __refresh_mfn_list();
   32.69 +    mfn = mfn_list[--alloc_index];
   32.70 +    spin_unlock_irqrestore(&mfn_lock, flags);
   32.71 +    return mfn;
   32.72 +}
   32.73 +static void dealloc_mfn(unsigned long mfn)
   32.74 +{
   32.75 +    unsigned long flags;
   32.76 +    spin_lock_irqsave(&mfn_lock, flags);
   32.77 +    mfn_list[alloc_index++] = mfn;
   32.78 +    spin_unlock_irqrestore(&mfn_lock, flags);
   32.79 +}
   32.80 +
   32.81 +static inline void maybe_schedule_tx_action(void)
   32.82 +{
   32.83 +    smp_mb();
   32.84 +    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
   32.85 +         !list_empty(&net_schedule_list) )
   32.86 +        tasklet_schedule(&net_tx_tasklet);
   32.87 +}
   32.88 +
   32.89  /*
   32.90   * This is the primary RECEIVE function for a network interface.
   32.91   * Note that, from the p.o.v. of /this/ OS it looks like a transmit.
   32.92   */
   32.93 -static void netif_start_xmit(struct sk_buff *skb, struct net_device *dev)
   32.94 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
   32.95  {
   32.96      netif_t *netif = (netif_t *)dev->priv;
   32.97 -    s8 status = BLKIF_RSP_OKAY;
   32.98 -    u16 size;
   32.99 -    mmu_update_t mmu[4];
  32.100 +    s8 status = NETIF_RSP_OKAY;
  32.101 +    u16 size=0, id;
  32.102 +    mmu_update_t mmu[6];
  32.103 +    pgd_t *pgd; pmd_t *pmd; pte_t *pte;
  32.104 +    unsigned long vdata, mdata=0, new_mfn;
  32.105 +
  32.106 +    /* Drop the packet if the target domain has no receive buffers. */
  32.107 +    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
  32.108 +         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
  32.109 +    {
  32.110 +        dev_kfree_skb(skb);
  32.111 +        return 0;
  32.112 +    }
  32.113  
  32.114 -    memcpy(skb->mac.ethernet->h_dest, netif->vmac, ETH_ALEN);
  32.115 -    if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
  32.116 -        memcpy(skb->nh.raw + 18, netif->vmac, ETH_ALEN);
  32.117 +    id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id;
  32.118 + 
  32.119 +    /*
  32.120 +     * We do not copy the packet unless:
  32.121 +     *  1. It is fragmented; or
  32.122 +     *  2. It spans a page boundary; or
  32.123 +     *  3. We cannot be sure the whole data page is allocated.
  32.124 +     * The copying method is taken from skb_copy().
  32.125 +     */
  32.126 +    if ( (skb_shinfo(skb)->nr_frags != 0) ||
  32.127 +         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
  32.128 +         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
  32.129 +    {
  32.130 +        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
  32.131 +        int hlen = skb->data - skb->head;
  32.132 +        if ( unlikely(nskb == NULL) )
  32.133 +        {
  32.134 +            DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid);
  32.135 +            status = NETIF_RSP_ERROR;
  32.136 +            goto out;
  32.137 +        }
  32.138 +        skb_reserve(nskb, hlen);
  32.139 +        __skb_put(nskb, skb->len);
  32.140 +        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
  32.141 +        dev_kfree_skb(skb);
  32.142 +        skb = nskb;
  32.143 +    }
  32.144  
  32.145 -    spin_lock(&netif->rx_lock);
  32.146 +    vdata = (unsigned long)skb->data;
  32.147 +    mdata = virt_to_machine(vdata);
  32.148 +    size  = skb->tail - skb->data;
  32.149 +
  32.150 +    new_mfn = get_new_mfn();
  32.151 +
  32.152 +    pgd = pgd_offset_k(   (vdata & PAGE_MASK));
  32.153 +    pmd = pmd_offset(pgd, (vdata & PAGE_MASK));
  32.154 +    pte = pte_offset(pmd, (vdata & PAGE_MASK));
  32.155  
  32.156      mmu[0].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
  32.157      mmu[0].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
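
pending_prod and pending_cons are free-running counters over a power-of-two ring of free slot indices: consuming an entry claims a slot for an in-flight page, producing one returns it, and NR_PENDING_REQS is the number currently in use. A runnable sketch of that accounting, initialised the same way the blkif backend's init_module fills its identical ring:

    #include <stdio.h>

    #define MAX_PENDING_REQS 256
    typedef unsigned int PEND_RING_IDX;
    #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))

    static unsigned short pending_ring[MAX_PENDING_REQS];
    static PEND_RING_IDX pending_prod, pending_cons;
    #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

    int main(void)
    {
        unsigned int i;
        unsigned short idx;

        /* All slots start free: prod leads cons by the full ring size. */
        pending_cons = 0;
        pending_prod = MAX_PENDING_REQS;
        for ( i = 0; i < MAX_PENDING_REQS; i++ )
            pending_ring[i] = (unsigned short)i;

        /* Claim a slot for an in-flight packet page... */
        idx = pending_ring[MASK_PEND_IDX(pending_cons)];
        pending_cons++;
        printf("in use: %u (slot %u)\n", (unsigned)NR_PENDING_REQS, idx);

        /* ...and release it when netif_page_release fires. */
        pending_ring[MASK_PEND_IDX(pending_prod)] = idx;
        pending_prod++;
        printf("in use: %u\n", (unsigned)NR_PENDING_REQS);
        return 0;
    }
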
  32.158 @@ -63,49 +159,44 @@ static void netif_start_xmit(struct sk_b
  32.159      mmu[1].ptr |= MMU_EXTENDED_COMMAND;
  32.160      mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H;
  32.161  
  32.162 -    mmu[2].ptr  = ptr | MMU_EXTENDED_COMMAND;
  32.163 +    mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
  32.164      mmu[2].val  = MMUEXT_REASSIGN_PAGE;
  32.165  
  32.166 -    mmu[3].ptr  = ppte;
  32.167 -    mmu[3].val  = newpage;
  32.168 +    mmu[3].ptr  = MMU_EXTENDED_COMMAND;
  32.169 +    mmu[3].val  = MMUEXT_RESET_SUBJECTDOM;
  32.170 +
  32.171 +    mmu[4].ptr  = virt_to_machine(pte);
  32.172 +    mmu[4].val  = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
  32.173  
  32.174 -    if ( unlikely(HYPERVISOR_mmu_update(mmu, 4) < 0) )
  32.175 +    mmu[5].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
  32.176 +    mmu[5].val  = __pa(vdata) >> PAGE_SHIFT;
  32.177 +
  32.178 +    if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) )
  32.179      {
  32.180 -        status = BLKIF_RSP_ERROR;
  32.181 +        DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid);
  32.182 +        dealloc_mfn(new_mfn);
  32.183 +        status = NETIF_RSP_ERROR;
  32.184          goto out;
  32.185      }
  32.186  
  32.187 -    /* Record this so they can be billed. */
  32.188 -    netif->total_packets_received++;
  32.189 -    netif->total_bytes_received += size;
  32.190 +    phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn;
  32.191 +
  32.192 +    netif->stats.rx_bytes += size;
  32.193 +    netif->stats.rx_packets++;
  32.194  
  32.195   out:
  32.196 -    make_rx_response(netif, rx->id, status, addr, size);
  32.197 +    spin_lock(&netif->rx_lock);
  32.198 +    make_rx_response(netif, id, status, mdata, size);
  32.199      spin_unlock(&netif->rx_lock);    
  32.200      dev_kfree_skb(skb);
  32.201 +    return 0;
  32.202  }
  32.203  
  32.204 -
  32.205 -/*************************************************************
  32.206 - * NEW TRANSMIT SCHEDULER
  32.207 - * 
  32.208 - * NB. We ought also to only send a limited number of bytes to the NIC
  32.209 - * for transmission at any one time (to avoid head-of-line blocking).
  32.210 - * However, driver rings are small enough that they provide a reasonable
  32.211 - * limit.
  32.212 - * 
  32.213 - * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
  32.214 - *     e1000 has 256 descriptors == 128 packets, at 1000Mbps
  32.215 - *     tg3 has 512 descriptors == 256 packets, at 1000Mbps
  32.216 - * 
  32.217 - * So, worst case is tg3 with 256 1500-bytes packets == 375kB.
  32.218 - * This would take 3ms, and represents our worst-case HoL blocking cost.
  32.219 - * 
  32.220 - * We think this is reasonable.
  32.221 - */
  32.222 -
  32.223 -struct list_head net_schedule_list;
  32.224 -spinlock_t net_schedule_list_lock;
  32.225 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
  32.226 +{
  32.227 +    netif_t *netif = dev->priv;
  32.228 +    return &netif->stats;
  32.229 +}
  32.230  
  32.231  static int __on_net_schedule_list(netif_t *netif)
  32.232  {
  32.233 @@ -128,7 +219,7 @@ static void add_to_net_schedule_list_tai
  32.234          return;
  32.235  
  32.236      spin_lock(&net_schedule_list_lock);
  32.237 -    if ( likely(!__on_net_schedule_list(netif)) )
  32.238 +    if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) )
  32.239      {
  32.240          list_add_tail(&netif->list, &net_schedule_list);
  32.241          netif_get(netif);
  32.242 @@ -136,46 +227,29 @@ static void add_to_net_schedule_list_tai
  32.243      spin_unlock(&net_schedule_list_lock);
  32.244  }
  32.245  
  32.246 -
  32.247 -static void tx_skb_release(struct sk_buff *skb);
  32.248 -    
  32.249 -static inline int init_tx_header(netif_t *netif, u8 *data, 
  32.250 -                                 unsigned int len, struct net_device *dev)
  32.251 +static inline void netif_schedule_work(netif_t *netif)
  32.252  {
  32.253 -    int proto = ntohs(*(unsigned short *)(data + 12));
  32.254 -
  32.255 -    memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
  32.256 -        
  32.257 -    switch ( proto )
  32.258 +    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
  32.259 +         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
  32.260      {
  32.261 -    case ETH_P_ARP:
  32.262 -        if ( len < 42 ) break;
  32.263 -        memcpy(data + 22, dev->dev_addr, ETH_ALEN);
  32.264 -        break;
  32.265 -    case ETH_P_IP:
  32.266 -        break;
  32.267 -    default:
  32.268 -        /* Unsupported protocols are onyl allowed to/from NETIF0/0. */
  32.269 -        if ( (netif->domain->domain != 0) || (netif->idx != 0) )
  32.270 -            proto = 0;
  32.271 -        break;
  32.272 +        add_to_net_schedule_list_tail(netif);
  32.273 +        maybe_schedule_tx_action();
  32.274      }
  32.275 -    return proto;
  32.276  }
  32.277  
  32.278 +void netif_deschedule(netif_t *netif)
  32.279 +{
  32.280 +    remove_from_net_schedule_list(netif);
  32.281 +}
  32.282  
  32.283 +#if 0
  32.284  static void tx_credit_callback(unsigned long data)
  32.285  {
  32.286      netif_t *netif = (netif_t *)data;
  32.287 -
  32.288      netif->remaining_credit = netif->credit_bytes;
  32.289 -
  32.290 -    if ( tx_work_exists(netif) )
  32.291 -    {
  32.292 -        add_to_net_schedule_list_tail(netif);
  32.293 -        maybe_schedule_tx_action();
  32.294 -    }    
  32.295 +    netif_schedule_work(netif);
  32.296  }
  32.297 +#endif
  32.298  
  32.299  static void net_tx_action(unsigned long unused)
  32.300  {
  32.301 @@ -184,7 +258,9 @@ static void net_tx_action(unsigned long 
  32.302      netif_t *netif;
  32.303      netif_tx_request_t txreq;
  32.304      u16 pending_idx;
  32.305 +    NETIF_RING_IDX i;
  32.306      pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);
  32.307 +    struct page *page;
  32.308  
  32.309      while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
  32.310              !list_empty(&net_schedule_list) )
  32.311 @@ -197,7 +273,7 @@ static void net_tx_action(unsigned long 
  32.312  
  32.313          /* Work to do? */
  32.314          i = netif->tx_req_cons;
  32.315 -        if ( (i == shared_idxs->tx_req_prod) && 
  32.316 +        if ( (i == netif->tx->req_prod) ||
  32.317               ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
  32.318          {
  32.319              netif_put(netif);
  32.320 @@ -232,7 +308,7 @@ static void net_tx_action(unsigned long 
  32.321          netif->remaining_credit -= tx.size;
  32.322  #endif
  32.323  
  32.324 -        add_to_net_schedule_list_tail(netif);
  32.325 +        netif_schedule_work(netif);
  32.326  
  32.327          if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
  32.328               unlikely(txreq.size > ETH_FRAME_LEN) )
  32.329 @@ -246,7 +322,7 @@ static void net_tx_action(unsigned long 
  32.330          /* No crossing a page boundary as the payload mustn't fragment. */
  32.331          if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) 
  32.332          {
  32.333 -            DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 
  32.334 +            DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", 
  32.335                      txreq.addr, txreq.size, 
  32.336                      (txreq.addr &~PAGE_MASK) + txreq.size);
  32.337              make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  32.338 @@ -262,102 +338,88 @@ static void net_tx_action(unsigned long 
  32.339                                       PAGE_SIZE, prot, netif->domid) != 0 )
  32.340          {
  32.341              DPRINTK("Bad page frame\n");
  32.342 -            make_tx_response(netif, tx.id, NETIF_RSP_ERROR);
  32.343 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  32.344              netif_put(netif);
  32.345              continue;
  32.346          }
  32.347 -            
  32.348 +        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
  32.349 +            txreq.addr >> PAGE_SHIFT;
  32.350 +
  32.351          if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) )
  32.352          {
  32.353 -            make_tx_response(netif, tx.id, BLKIF_RSP_ERROR);
  32.354 +            DPRINTK("Can't allocate a skb in start_xmit.\n");
  32.355 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  32.356              netif_put(netif);
  32.357              vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
  32.358              break;
  32.359          }
  32.360          
  32.361 -        __skb_put(PKT_PROT_LEN);
  32.362 -        memcpy(skb->data, src, PKT_PROT_LEN);
  32.363 -        protocol = __constant_htons(
  32.364 -            init_tx_header(netif, g_data, tx.size, the_dev));
  32.365 -        if ( protocol == 0 )
  32.366 -        {
  32.367 -            make_tx_response(netif, tx.id, NETIF_RSP_ERROR);
  32.368 -            netif_put(netif);
  32.369 -            dev_kfree_skb(skb);
  32.370 -            goto cleanup_and_continue;
  32.371 -        }
  32.372 +        __skb_put(skb, PKT_PROT_LEN);
  32.373 +        memcpy(skb->data, 
  32.374 +               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
  32.375 +               PKT_PROT_LEN);
  32.376  
  32.377 -        skb->dev        = netif->dev;
  32.378 -        skb->protocol   = eth_type_trans(skb, skb->dev);
  32.379 -        
  32.380 +        page = virt_to_page(MMAP_VADDR(pending_idx));
  32.381 +
  32.382          /* Append the packet payload as a fragment. */
  32.383 -        skb_shinfo(skb)->frags[0].page        = 
  32.384 -          &mem_map[txreq.addr >> PAGE_SHIFT];
  32.385 +        skb_shinfo(skb)->frags[0].page        = page;
  32.386          skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
  32.387          skb_shinfo(skb)->frags[0].page_offset = 
  32.388              (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
  32.389          skb_shinfo(skb)->nr_frags = 1;
  32.390 -        skb->data_len  = tx->size - PKT_PROT_LEN;
  32.391 +        skb->data_len  = txreq.size - PKT_PROT_LEN;
  32.392          skb->len      += skb->data_len;
  32.393  
  32.394 +        skb->dev      = netif->dev;
  32.395 +        skb->protocol = eth_type_trans(skb, skb->dev);
  32.396 +
  32.397          /* Destructor information. */
  32.398 -        skb->destructor = tx_skb_release;
  32.399 -        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif;
  32.400 -        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx;
  32.401 +        atomic_set(&page->count, 1);
  32.402 +        page->mapping = (struct address_space *)netif_page_release;
  32.403 +        pending_id[pending_idx] = txreq.id;
  32.404 +        pending_netif[pending_idx] = netif;
  32.405  
  32.406 -        /* Record the transmission so they can be billed. */
  32.407 -        netif->total_packets_sent++;
  32.408 -        netif->total_bytes_sent += tx->size;
  32.409 +        netif->stats.tx_bytes += txreq.size;
  32.410 +        netif->stats.tx_packets++;
  32.411  
  32.412          pending_cons++;
  32.413 +
  32.414          netif_rx(skb);
  32.415          netif->dev->last_rx = jiffies;
  32.416      }
  32.417  }
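
Note on the rewritten transmit path above: the backend now pulls one netif_tx_request_t at a time from the shared ring, maps the guest's frame at MMAP_VADDR(pending_idx), and builds a mostly zero-copy skb -- only the protocol headers are copied into the linear area, while the payload stays in the guest page and is attached as a fragment. A minimal sketch of that header-pull pattern, using the names from the hunk (PKT_PROT_LEN is assumed to cover the headers the stack needs to inspect):

    /*
     * Sketch only: copy the first PKT_PROT_LEN bytes out of the mapped
     * guest page, then reference the remainder as frag 0 so the bulk of
     * the packet is never copied.
     */
    static struct sk_buff *build_tx_skb(unsigned long mapped_vaddr,
                                        netif_tx_request_t *txreq,
                                        struct page *page)
    {
        struct sk_buff *skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC);
        if ( skb == NULL )
            return NULL;
        __skb_put(skb, PKT_PROT_LEN);
        memcpy(skb->data,
               (void *)(mapped_vaddr | (txreq->addr & ~PAGE_MASK)),
               PKT_PROT_LEN);
        skb_shinfo(skb)->frags[0].page        = page;
        skb_shinfo(skb)->frags[0].page_offset =
            (txreq->addr + PKT_PROT_LEN) & ~PAGE_MASK;
        skb_shinfo(skb)->frags[0].size        = txreq->size - PKT_PROT_LEN;
        skb_shinfo(skb)->nr_frags = 1;
        skb->data_len  = txreq->size - PKT_PROT_LEN;
        skb->len      += skb->data_len;
        return skb;
    }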
  32.418  
  32.419 -DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
  32.420 -
  32.421 -
  32.422 -static inline void maybe_schedule_tx_action(void)
  32.423 +static void netif_page_release(struct page *page)
  32.424  {
  32.425 -    smp_mb();
  32.426 -    if ( !netif_queue_stopped(the_dev) &&
  32.427 -         !list_empty(&net_schedule_list) )
  32.428 -        tasklet_schedule(&net_tx_tasklet);
  32.429 -}
  32.430 -
  32.431 +    unsigned long flags;
  32.432 +    netif_t *netif;
  32.433 +    u16 pending_idx;
  32.434  
  32.435 -/* Destructor function for tx skbs. */
  32.436 -static void tx_skb_release(struct sk_buff *skb)
  32.437 -{
  32.438 -    int i;
  32.439 -    netif_t *netif = (netif_t)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page;
  32.440 -    u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size;
  32.441 +    pending_idx = page - virt_to_page(mmap_vstart);
  32.442 +
  32.443 +    netif = pending_netif[pending_idx];
  32.444  
  32.445      vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
  32.446 -    
  32.447 -    skb_shinfo(skb)->nr_frags = 0; 
  32.448 -    
  32.449 +        
  32.450      spin_lock(&netif->tx_lock);
  32.451 -    make_tx_response(netif, skb->guest_id, NETIF_RSP_OKAY);
  32.452 +    make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
  32.453      spin_unlock(&netif->tx_lock);
  32.454 -    
  32.455 +
  32.456      /*
  32.457 -     * Checks below must happen after the above response is posted. This avoids
  32.458 -     * a possible race with a guest OS on another CPU.
  32.459 +     * Scheduling checks must happen after the above response is posted.
  32.460 +     * This avoids a possible race with a guest OS on another CPU.
  32.461       */
  32.462      mb();
  32.463 -    
  32.464 -    if ( tx_work_exists(netif) )
  32.465 -    {
  32.466 -        add_to_net_schedule_list_tail(netif);
  32.467 -        maybe_schedule_tx_action();        
  32.468 -    }
  32.469 -    
  32.470 +    netif_schedule_work(netif);
  32.471 +
  32.472      netif_put(netif);
  32.473 + 
  32.474 +    spin_lock_irqsave(&pend_prod_lock, flags);
  32.475 +    pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  32.476 +    spin_unlock_irqrestore(&pend_prod_lock, flags);
  32.477  }
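
netif_page_release() replaces the old per-skb destructor: the backend takes a single reference on the mapped page and stores the release function in page->mapping, so when the network stack drops the last reference the free path can invoke the callback and the frame can be returned to the guest. This presumably pairs with a hook in the page allocator's free path (not shown in this hunk); a hypothetical sketch of what such a hook would look like:

    /*
     * Hypothetical allocator-side hook (assumption -- the real change
     * would live in mm/page_alloc.c): if a driver stashed a callback in
     * page->mapping, hand the page to it instead of freeing normally.
     */
    static int invoke_page_release_hook(struct page *page)
    {
        void (*release)(struct page *) =
            (void (*)(struct page *))page->mapping;
        if ( release == NULL )
            return 0;             /* free normally */
        page->mapping = NULL;
        release(page);            /* e.g. netif_page_release() above */
        return 1;                 /* callback now owns the page */
    }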
  32.478  
  32.479 -
  32.480 +#if 0
  32.481  long flush_bufs_for_netif(netif_t *netif)
  32.482  {
  32.483      NET_RING_IDX i;
  32.484 @@ -395,6 +457,7 @@ long flush_bufs_for_netif(netif_t *netif
  32.485  
  32.486      return 0;
  32.487  }
  32.488 +#endif
  32.489  
  32.490  void netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
  32.491  {
  32.492 @@ -424,7 +487,6 @@ static void make_tx_response(netif_t *ne
  32.493          notify_via_evtchn(netif->evtchn);
  32.494  }
  32.495  
  32.496 -
  32.497  static void make_rx_response(netif_t     *netif, 
  32.498                               u16          id, 
  32.499                               s8           st,
  32.500 @@ -448,28 +510,35 @@ static void make_rx_response(netif_t    
  32.501          notify_via_evtchn(netif->evtchn);
  32.502  }
  32.503  
  32.504 -
  32.505  static int __init init_module(void)
  32.506  {
  32.507 +    int i;
  32.508 +
  32.509 +    if ( !(start_info.flags & SIF_INITDOMAIN) )
  32.510 +        return 0;
  32.511 +
  32.512      netif_interface_init();
  32.513  
  32.514 -    if ( (mmap_vma = get_vm_area(MAX_PENDING_REQS * PAGE_SIZE, 
  32.515 -                                 VM_IOREMAP)) == NULL )
  32.516 -    {
  32.517 -        printk(KERN_WARNING "Could not allocate VMA for netif backend.\n");
  32.518 -        return -ENOMEM;
  32.519 -    }
  32.520 +    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
  32.521 +        BUG();
  32.522 +
  32.523 +    pending_cons = 0;
  32.524 +    pending_prod = MAX_PENDING_REQS;
  32.525 +    for ( i = 0; i < MAX_PENDING_REQS; i++ )
  32.526 +        pending_ring[i] = i;
  32.527 +
  32.528 +    spin_lock_init(&net_schedule_list_lock);
  32.529 +    INIT_LIST_HEAD(&net_schedule_list);
  32.530  
  32.531      netif_ctrlif_init();
  32.532  
  32.533      return 0;
  32.534  }
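
The init code above sets pending_ring up as a full free-list ring: pending_prod starts at MAX_PENDING_REQS with entries 0..MAX_PENDING_REQS-1, so every pending_idx is initially available. Claiming and returning an index then reduce to ring operations; a sketch of the discipline implied by the init code, assuming MASK_PEND_IDX masks an index to the ring size:

    /* Sketch: caller has already checked NR_PENDING_REQS < MAX_PENDING_REQS. */
    static inline u16 claim_pending_idx(void)
    {
        return pending_ring[MASK_PEND_IDX(pending_cons++)];
    }

    static inline void return_pending_idx(u16 pending_idx)
    {
        unsigned long flags;
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
    }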
  32.535  
  32.536 -
  32.537  static void cleanup_module(void)
  32.538  {
  32.539 +    BUG();
  32.540  }
  32.541  
  32.542 -
  32.543  module_init(init_module);
  32.544  module_exit(cleanup_module);
    33.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Tue May 11 14:57:44 2004 +0000
    33.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Tue May 11 15:02:26 2004 +0000
    33.3 @@ -25,16 +25,18 @@
    33.4  #include <net/sock.h>
    33.5  #include <net/pkt_sched.h>
    33.6  
    33.7 +#include <asm/evtchn.h>
    33.8 +#include <asm/ctrl_if.h>
    33.9 +#include <asm/hypervisor-ifs/dom_mem_ops.h>
   33.10 +
   33.11 +#include "../netif.h"
   33.12 +
   33.13  #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
   33.14  
   33.15 -static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
   33.16  static void network_tx_buf_gc(struct net_device *dev);
   33.17  static void network_alloc_rx_buffers(struct net_device *dev);
   33.18  static void cleanup_module(void);
   33.19  
   33.20 -/* Dynamically-mapped IRQs. */
   33.21 -static int network_irq, debug_irq;
   33.22 -
   33.23  static struct list_head dev_list;
   33.24  
   33.25  struct net_private
   33.26 @@ -43,26 +45,30 @@ struct net_private
   33.27      struct net_device *dev;
   33.28  
   33.29      struct net_device_stats stats;
   33.30 -    NET_RING_IDX rx_resp_cons, tx_resp_cons;
   33.31 -    unsigned int net_ring_fixmap_idx, tx_full;
   33.32 -    net_ring_t  *net_ring;
   33.33 -    net_idx_t   *net_idx;
   33.34 +    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
   33.35 +    unsigned int tx_full;
   33.36 +    
   33.37 +    netif_tx_interface_t *tx;
   33.38 +    netif_rx_interface_t *rx;
   33.39 +
   33.40      spinlock_t   tx_lock;
   33.41 -    unsigned int idx; /* Domain-specific index of this VIF. */
   33.42  
   33.43 -    unsigned int rx_bufs_to_notify;
   33.44 +    unsigned int handle;
   33.45 +    unsigned int evtchn;
   33.46 +    unsigned int irq;
   33.47  
   33.48 -#define STATE_ACTIVE    0
   33.49 -#define STATE_SUSPENDED 1
   33.50 -#define STATE_CLOSED    2
   33.51 +#define NETIF_STATE_CLOSED       0
   33.52 +#define NETIF_STATE_DISCONNECTED 1
   33.53 +#define NETIF_STATE_CONNECTED    2
   33.54 +#define NETIF_STATE_ACTIVE       3
   33.55      unsigned int state;
   33.56  
   33.57      /*
   33.58       * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
   33.59       * array is an index into a chain of free entries.
   33.60       */
   33.61 -    struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1];
   33.62 -    struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1];
   33.63 +    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
   33.64 +    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
   33.65  };
   33.66  
   33.67  /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
   33.68 @@ -75,86 +81,43 @@ struct net_private
   33.69      (unsigned short)_id; })
   33.70  
   33.71  
   33.72 -static void _dbg_network_int(struct net_device *dev)
   33.73 -{
   33.74 -    struct net_private *np = dev->priv;
   33.75 -
   33.76 -    if ( np->state == STATE_CLOSED )
   33.77 -        return;
   33.78 -    
   33.79 -    printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x,"
   33.80 -           " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x,"
   33.81 -           " tx_event=0x%08x, state=%d\n",
   33.82 -           np->tx_full, np->tx_resp_cons, 
   33.83 -           np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, 
   33.84 -           np->net_idx->tx_event,
   33.85 -           test_bit(__LINK_STATE_XOFF, &dev->state));
   33.86 -    printk(KERN_ALERT "net: rx_resp_cons=0x%08x,"
   33.87 -           " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n",
   33.88 -           np->rx_resp_cons, np->net_idx->rx_req_prod,
   33.89 -           np->net_idx->rx_resp_prod, np->net_idx->rx_event);
   33.90 -}
   33.91 -
   33.92 -
   33.93 -static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs)
   33.94 +static struct net_device *find_dev_by_handle(unsigned int handle)
   33.95  {
   33.96      struct list_head *ent;
   33.97      struct net_private *np;
   33.98      list_for_each ( ent, &dev_list )
   33.99      {
  33.100          np = list_entry(ent, struct net_private, list);
  33.101 -        _dbg_network_int(np->dev);
  33.102 +        if ( np->handle == handle )
  33.103 +            return np->dev;
  33.104      }
  33.105 +    return NULL;
  33.106  }
  33.107  
  33.108  
  33.109  static int network_open(struct net_device *dev)
  33.110  {
  33.111      struct net_private *np = dev->priv;
  33.112 -    netop_t netop;
  33.113 -    int i, ret;
  33.114 -
  33.115 -    netop.cmd = NETOP_RESET_RINGS;
  33.116 -    netop.vif = np->idx;
  33.117 -    if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 )
  33.118 -    {
  33.119 -        printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n");
  33.120 -        return ret;
  33.121 -    }
  33.122 +    int i;
  33.123  
  33.124 -    netop.cmd = NETOP_GET_VIF_INFO;
  33.125 -    netop.vif = np->idx;
  33.126 -    if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 )
  33.127 -    {
  33.128 -        printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx);
  33.129 -        return ret;
  33.130 -    }
  33.131 +    if ( np->state != NETIF_STATE_CONNECTED )
  33.132 +        return -EINVAL;
  33.133  
  33.134 -    memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);
  33.135 -
  33.136 -    set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx, 
  33.137 -               netop.u.get_vif_info.ring_mfn << PAGE_SHIFT);
  33.138 -    np->net_ring = (net_ring_t *)fix_to_virt(
  33.139 -        FIX_NETRING0_BASE + np->net_ring_fixmap_idx);
  33.140 -    np->net_idx  = &HYPERVISOR_shared_info->net_idx[np->idx];
  33.141 -
  33.142 -    np->rx_bufs_to_notify = 0;
  33.143      np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
  33.144      memset(&np->stats, 0, sizeof(np->stats));
  33.145      spin_lock_init(&np->tx_lock);
  33.146 -    memset(np->net_ring, 0, sizeof(*np->net_ring));
  33.147 -    memset(np->net_idx, 0, sizeof(*np->net_idx));
  33.148  
  33.149      /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
  33.150 -    for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ )
  33.151 +    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
  33.152          np->tx_skbs[i] = (void *)(i+1);
  33.153 -    for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ )
  33.154 +    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
  33.155          np->rx_skbs[i] = (void *)(i+1);
  33.156  
  33.157      wmb();
  33.158 -    np->state = STATE_ACTIVE;
  33.159 +    np->state = NETIF_STATE_ACTIVE;
  33.160  
  33.161      network_alloc_rx_buffers(dev);
  33.162 +    np->rx->event = np->rx_resp_cons + 1;
  33.163  
  33.164      netif_start_queue(dev);
  33.165  
  33.166 @@ -166,18 +129,17 @@ static int network_open(struct net_devic
  33.167  
  33.168  static void network_tx_buf_gc(struct net_device *dev)
  33.169  {
  33.170 -    NET_RING_IDX i, prod;
  33.171 +    NETIF_RING_IDX i, prod;
  33.172      unsigned short id;
  33.173      struct net_private *np = dev->priv;
  33.174      struct sk_buff *skb;
  33.175 -    tx_entry_t *tx_ring = np->net_ring->tx_ring;
  33.176  
  33.177      do {
  33.178 -        prod = np->net_idx->tx_resp_prod;
  33.179 +        prod = np->tx->resp_prod;
  33.180  
  33.181          for ( i = np->tx_resp_cons; i != prod; i++ )
  33.182          {
  33.183 -            id  = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
  33.184 +            id  = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id;
  33.185              skb = np->tx_skbs[id];
  33.186              ADD_ID_TO_FREELIST(np->tx_skbs, id);
  33.187              dev_kfree_skb_any(skb);
  33.188 @@ -193,17 +155,17 @@ static void network_tx_buf_gc(struct net
  33.189           * in such cases notification from Xen is likely to be the only kick
  33.190           * that we'll get.
  33.191           */
  33.192 -        np->net_idx->tx_event = 
  33.193 -            prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
  33.194 +        np->tx->event = 
  33.195 +            prod + ((np->tx->req_prod - prod) >> 1) + 1;
  33.196          mb();
  33.197      }
  33.198 -    while ( prod != np->net_idx->tx_resp_prod );
  33.199 +    while ( prod != np->tx->resp_prod );
  33.200  
  33.201      if ( np->tx_full && 
  33.202 -         ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) )
  33.203 +         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
  33.204      {
  33.205          np->tx_full = 0;
  33.206 -        if ( np->state == STATE_ACTIVE )
  33.207 +        if ( np->state == NETIF_STATE_ACTIVE )
  33.208              netif_wake_queue(dev);
  33.209      }
  33.210  }
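
The tx->event update in the garbage-collect loop above throttles completion interrupts: rather than requesting an event per response, the frontend asks to be woken once roughly half of the currently outstanding requests have completed. A worked example with hypothetical index values:

    /*
     * Example: resp_prod == 100 and req_prod == 120 when the GC runs.
     *   tx->event = 100 + ((120 - 100) >> 1) + 1 = 111
     * so the backend next raises an event after ~10 more responses,
     * not after every single one.
     */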
  33.211 @@ -224,11 +186,15 @@ static void network_alloc_rx_buffers(str
  33.212      unsigned short id;
  33.213      struct net_private *np = dev->priv;
  33.214      struct sk_buff *skb;
  33.215 -    netop_t netop;
  33.216 -    NET_RING_IDX i = np->net_idx->rx_req_prod;
  33.217 +    NETIF_RING_IDX i = np->rx->req_prod;
  33.218 +    dom_mem_op_t op;
  33.219 +    unsigned long pfn_array[NETIF_RX_RING_SIZE];
  33.220 +    int ret, nr_pfns = 0;
  33.221 +    pte_t *pte;
  33.222  
  33.223 -    if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || 
  33.224 -         unlikely(np->state != STATE_ACTIVE) )
  33.225 +    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
  33.226 +    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
  33.227 +         unlikely(np->state != NETIF_STATE_ACTIVE) )
  33.228          return;
  33.229  
  33.230      do {
  33.231 @@ -244,13 +210,13 @@ static void network_alloc_rx_buffers(str
  33.232          id = GET_ID_FROM_FREELIST(np->rx_skbs);
  33.233          np->rx_skbs[id] = skb;
  33.234  
  33.235 -        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id   = id;
  33.236 -        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 
  33.237 -            virt_to_machine(get_ppte(skb->head));
  33.238 -
  33.239 -        np->rx_bufs_to_notify++;
  33.240 +        np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;
  33.241 +        
  33.242 +        pte = get_ppte(skb->head);
  33.243 +        pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT;
  33.244 +        queue_l1_entry_update(pte, 0);
  33.245      }
  33.246 -    while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
  33.247 +    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
  33.248  
  33.249      /*
  33.250       * We may have allocated buffers which have entries outstanding in the page
  33.251 @@ -258,17 +224,16 @@ static void network_alloc_rx_buffers(str
  33.252       */
  33.253      flush_page_update_queue();
  33.254  
  33.255 -    np->net_idx->rx_req_prod = i;
  33.256 -    np->net_idx->rx_event    = np->rx_resp_cons + 1;
  33.257 -        
  33.258 -    /* Batch Xen notifications. */
  33.259 -    if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
  33.260 +    op.op = MEMOP_RESERVATION_DECREASE;
  33.261 +    op.u.decrease.size  = nr_pfns;
  33.262 +    op.u.decrease.pages = pfn_array;
  33.263 +    if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns )
  33.264      {
  33.265 -        netop.cmd = NETOP_PUSH_BUFFERS;
  33.266 -        netop.vif = np->idx;
  33.267 -        (void)HYPERVISOR_net_io_op(&netop);
  33.268 -        np->rx_bufs_to_notify = 0;
  33.269 +        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
  33.270 +        BUG();
  33.271      }
  33.272 +
  33.273 +    np->rx->req_prod = i;
  33.274  }
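
The refill path above works by page transfer rather than copying: for each fresh skb the frontend records the machine frame backing skb->head, unmaps it with a queued pte update, and returns the whole batch to Xen via MEMOP_RESERVATION_DECREASE; the backend later hands back frames already full of packet data. The core of the loop, condensed (names as in the hunk):

    pte = get_ppte(skb->head);                         /* pte backing buffer */
    pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT; /* machine frame      */
    queue_l1_entry_update(pte, 0);                     /* unmap from guest   */
    /* ... after the loop, surrender the batch ... */
    op.op               = MEMOP_RESERVATION_DECREASE;
    op.u.decrease.size  = nr_pfns;
    op.u.decrease.pages = pfn_array;
    if ( HYPERVISOR_dom_mem_op(&op) != nr_pfns )
        BUG();                      /* rx pool is unrecoverable without this */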
  33.275  
  33.276  
  33.277 @@ -276,9 +241,8 @@ static int network_start_xmit(struct sk_
  33.278  {
  33.279      unsigned short id;
  33.280      struct net_private *np = (struct net_private *)dev->priv;
  33.281 -    tx_req_entry_t *tx;
  33.282 -    netop_t netop;
  33.283 -    NET_RING_IDX i;
  33.284 +    netif_tx_request_t *tx;
  33.285 +    NETIF_RING_IDX i;
  33.286  
  33.287      if ( unlikely(np->tx_full) )
  33.288      {
  33.289 @@ -297,27 +261,27 @@ static int network_start_xmit(struct sk_
  33.290          memcpy(new_skb->data, skb->data, skb->len);
  33.291          dev_kfree_skb(skb);
  33.292          skb = new_skb;
  33.293 -    }   
  33.294 +    }
  33.295      
  33.296      spin_lock_irq(&np->tx_lock);
  33.297  
  33.298 -    i = np->net_idx->tx_req_prod;
  33.299 +    i = np->tx->req_prod;
  33.300  
  33.301      id = GET_ID_FROM_FREELIST(np->tx_skbs);
  33.302      np->tx_skbs[id] = skb;
  33.303  
  33.304 -    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
  33.305 +    tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req;
  33.306  
  33.307      tx->id   = id;
  33.308 -    tx->addr = phys_to_machine(virt_to_phys(skb->data));
  33.309 +    tx->addr = virt_to_machine(skb->data);
  33.310      tx->size = skb->len;
  33.311  
  33.312      wmb();
  33.313 -    np->net_idx->tx_req_prod = i + 1;
  33.314 +    np->tx->req_prod = i + 1;
  33.315  
  33.316      network_tx_buf_gc(dev);
  33.317  
  33.318 -    if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
  33.319 +    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
  33.320      {
  33.321          np->tx_full = 1;
  33.322          netif_stop_queue(dev);
  33.323 @@ -330,49 +294,56 @@ static int network_start_xmit(struct sk_
  33.324  
  33.325      /* Only notify Xen if there are no outstanding responses. */
  33.326      mb();
  33.327 -    if ( np->net_idx->tx_resp_prod == i )
  33.328 -    {
  33.329 -        netop.cmd = NETOP_PUSH_BUFFERS;
  33.330 -        netop.vif = np->idx;
  33.331 -        (void)HYPERVISOR_net_io_op(&netop);
  33.332 -    }
  33.333 +    if ( np->tx->resp_prod == i )
  33.334 +        notify_via_evtchn(np->evtchn);
  33.335  
  33.336      return 0;
  33.337  }
  33.338  
  33.339  
  33.340 -static inline void _network_interrupt(struct net_device *dev)
  33.341 +static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
  33.342  {
  33.343 +    struct net_device *dev = dev_id;
  33.344      struct net_private *np = dev->priv;
  33.345      unsigned long flags;
  33.346      struct sk_buff *skb;
  33.347 -    rx_resp_entry_t *rx;
  33.348 -    NET_RING_IDX i;
  33.349 +    netif_rx_response_t *rx;
  33.350 +    NETIF_RING_IDX i;
  33.351 +    mmu_update_t mmu[2];
  33.352 +    pte_t *pte;
  33.353  
  33.354 -    if ( unlikely(np->state == STATE_CLOSED) )
  33.355 -        return;
  33.356 -    
  33.357      spin_lock_irqsave(&np->tx_lock, flags);
  33.358      network_tx_buf_gc(dev);
  33.359      spin_unlock_irqrestore(&np->tx_lock, flags);
  33.360  
  33.361   again:
  33.362 -    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
  33.363 +    for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ )
  33.364      {
  33.365 -        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
  33.366 +        rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;
  33.367  
  33.368          skb = np->rx_skbs[rx->id];
  33.369          ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
  33.370  
  33.371 -        if ( unlikely(rx->status != RING_STATUS_OK) )
  33.372 +        if ( unlikely(rx->status <= 0) )
  33.373          {
  33.374              /* Gate this error. We get a (valid) slew of them on suspend. */
  33.375 -            if ( np->state == STATE_ACTIVE )
  33.376 +            if ( np->state == NETIF_STATE_ACTIVE )
  33.377                  printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
  33.378              dev_kfree_skb_any(skb);
  33.379              continue;
  33.380          }
  33.381  
  33.382 +        /* Remap the page. */
  33.383 +        pte = get_ppte(skb->head);
  33.384 +        mmu[0].ptr  = virt_to_machine(pte);
  33.385 +        mmu[0].val  = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
  33.386 +        mmu[1].ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
  33.387 +        mmu[1].val  = __pa(skb->head) >> PAGE_SHIFT;
  33.388 +        if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
  33.389 +            BUG();
  33.390 +        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
  33.391 +            rx->addr >> PAGE_SHIFT;
  33.392 +
  33.393          /*
   33.394           * Set up shinfo -- from alloc_skb. This was particularly nasty: the
  33.395           * shared info is hidden at the back of the data area (presumably so it
  33.396 @@ -385,13 +356,13 @@ static inline void _network_interrupt(st
  33.397          phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
  33.398              (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
  33.399  
  33.400 -        skb->data = skb->tail = skb->head + rx->offset;
  33.401 -        skb_put(skb, rx->size);
  33.402 +        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
  33.403 +        skb_put(skb, rx->status);
  33.404          skb->protocol = eth_type_trans(skb, dev);
  33.405  
  33.406          np->stats.rx_packets++;
  33.407  
  33.408 -        np->stats.rx_bytes += rx->size;
  33.409 +        np->stats.rx_bytes += rx->status;
  33.410          netif_rx(skb);
  33.411          dev->last_rx = jiffies;
  33.412      }
  33.413 @@ -399,42 +370,23 @@ static inline void _network_interrupt(st
  33.414      np->rx_resp_cons = i;
  33.415  
  33.416      network_alloc_rx_buffers(dev);
  33.417 +    np->rx->event = np->rx_resp_cons + 1;
  33.418      
  33.419      /* Deal with hypervisor racing our resetting of rx_event. */
  33.420      mb();
  33.421 -    if ( np->net_idx->rx_resp_prod != i )
  33.422 +    if ( np->rx->resp_prod != i )
  33.423          goto again;
  33.424  }
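
Receiving a packet is the reverse transfer: the backend has already pointed rx->addr at a machine frame containing the data, and the frontend splices that frame into the empty slot behind skb->head with a single two-element hypercall batch -- one update rewrites the pte, the other updates Xen's machine-to-physical table so __pa()/__va() stay consistent. A condensed view of the batch built above:

    mmu_update_t mmu[2];
    mmu[0].ptr = virt_to_machine(pte);                   /* which pte        */
    mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; /* new frame, perms */
    mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
    mmu[1].val = __pa(skb->head) >> PAGE_SHIFT;          /* its pseudo-phys pfn */
    if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
        BUG();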
  33.425  
  33.426  
  33.427 -static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs)
  33.428 -{
  33.429 -    struct list_head *ent;
  33.430 -    struct net_private *np;
  33.431 -    list_for_each ( ent, &dev_list )
  33.432 -    {
  33.433 -        np = list_entry(ent, struct net_private, list);
  33.434 -        _network_interrupt(np->dev);
  33.435 -    }
  33.436 -}
  33.437 -
  33.438 -
  33.439  static int network_close(struct net_device *dev)
  33.440  {
  33.441      struct net_private *np = dev->priv;
  33.442 -    netop_t netop;
  33.443 -
  33.444 -    np->state = STATE_SUSPENDED;
  33.445 -    wmb();
  33.446  
  33.447      netif_stop_queue(np->dev);
  33.448  
  33.449 -    netop.cmd = NETOP_FLUSH_BUFFERS;
  33.450 -    netop.vif = np->idx;
  33.451 -    (void)HYPERVISOR_net_io_op(&netop);
  33.452 -
  33.453 -    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
  33.454 -            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
  33.455 +    while ( (np->rx_resp_cons != np->rx->req_prod) ||
  33.456 +            (np->tx_resp_cons != np->tx->req_prod) )
  33.457      {
  33.458          barrier();
  33.459          current->state = TASK_INTERRUPTIBLE;
  33.460 @@ -442,12 +394,9 @@ static int network_close(struct net_devi
  33.461      }
  33.462  
  33.463      wmb();
  33.464 -    np->state = STATE_CLOSED;
  33.465 +    np->state = NETIF_STATE_CONNECTED;
  33.466      wmb();
  33.467  
  33.468 -    /* Now no longer safe to take interrupts for this device. */
  33.469 -    clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx);
  33.470 -
  33.471      MOD_DEC_USE_COUNT;
  33.472  
  33.473      return 0;
  33.474 @@ -461,72 +410,164 @@ static struct net_device_stats *network_
  33.475  }
  33.476  
  33.477  
  33.478 +static void netif_status_change(netif_fe_interface_status_changed_t *status)
  33.479 +{
  33.480 +    ctrl_msg_t                   cmsg;
  33.481 +    netif_fe_interface_connect_t up;
  33.482 +    struct net_device *dev;
  33.483 +    struct net_private *np;
  33.484 +    
  33.485 +    if ( status->handle != 0 )
  33.486 +    {
  33.487 +        printk(KERN_WARNING "Status change on unsupported netif %d\n",
  33.488 +               status->handle);
  33.489 +        return;
  33.490 +    }
  33.491 +
  33.492 +    dev = find_dev_by_handle(0);
  33.493 +    np  = dev->priv;
  33.494 +    
  33.495 +    switch ( status->status )
  33.496 +    {
  33.497 +    case NETIF_INTERFACE_STATUS_DESTROYED:
  33.498 +        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
  33.499 +               np->state);
  33.500 +        break;
  33.501 +
  33.502 +    case NETIF_INTERFACE_STATUS_DISCONNECTED:
  33.503 +        if ( np->state != NETIF_STATE_CLOSED )
  33.504 +        {
  33.505 +            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
  33.506 +                   " in state %d\n", np->state);
  33.507 +            break;
  33.508 +        }
  33.509 +
  33.510 +        /* Move from CLOSED to DISCONNECTED state. */
  33.511 +        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
  33.512 +        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
  33.513 +        memset(np->tx, 0, PAGE_SIZE);
  33.514 +        memset(np->rx, 0, PAGE_SIZE);
  33.515 +        np->state  = NETIF_STATE_DISCONNECTED;
  33.516 +
  33.517 +        /* Construct an interface-CONNECT message for the domain controller. */
  33.518 +        cmsg.type      = CMSG_NETIF_FE;
  33.519 +        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
  33.520 +        cmsg.length    = sizeof(netif_fe_interface_connect_t);
  33.521 +        up.handle      = 0;
  33.522 +        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
  33.523 +        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
  33.524 +        memcpy(cmsg.msg, &up, sizeof(up));
  33.525 +        
  33.526 +        /* Tell the controller to bring up the interface. */
  33.527 +        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  33.528 +        break;
  33.529 +
  33.530 +    case NETIF_INTERFACE_STATUS_CONNECTED:
  33.531 +        if ( np->state == NETIF_STATE_CLOSED )
  33.532 +        {
  33.533 +            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
  33.534 +                   " in state %d\n", np->state);
  33.535 +            break;
  33.536 +        }
  33.537 +
  33.538 +        memcpy(dev->dev_addr, status->mac, ETH_ALEN);
  33.539 +
  33.540 +        np->evtchn = status->evtchn;
  33.541 +        np->irq = bind_evtchn_to_irq(np->evtchn);
  33.542 +        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
  33.543 +                      dev->name, dev);
  33.544 +        
  33.545 +        np->state = NETIF_STATE_CONNECTED;
  33.546 +        break;
  33.547 +
  33.548 +    default:
  33.549 +        printk(KERN_WARNING "Status change to unknown value %d\n", 
  33.550 +               status->status);
  33.551 +        break;
  33.552 +    }
  33.553 +}
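
The handler above drives a simple state machine: CLOSED -> (DISCONNECTED message) allocate the two shared ring pages and offer them to the backend -> (CONNECTED message) learn the MAC and event channel, bind the channel to an IRQ, and become CONNECTED; network_open() then takes the device to ACTIVE. The essential connect steps, condensed from the two arms above:

    /* DISCONNECTED: share one page per ring with the backend. */
    up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
    up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
    /* ... send CMSG_NETIF_FE_INTERFACE_CONNECT via the control channel ... */

    /* CONNECTED: wire the backend's event channel to a Linux IRQ. */
    np->evtchn = status->evtchn;
    np->irq    = bind_evtchn_to_irq(np->evtchn);
    (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev);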
  33.554 +
  33.555 +
  33.556 +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
  33.557 +{
  33.558 +    switch ( msg->subtype )
  33.559 +    {
  33.560 +    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
  33.561 +        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
  33.562 +            goto parse_error;
  33.563 +        netif_status_change((netif_fe_interface_status_changed_t *)
  33.564 +                            &msg->msg[0]);
  33.565 +        break;
  33.566 +    default:
  33.567 +        goto parse_error;
  33.568 +    }
  33.569 +
  33.570 +    ctrl_if_send_response(msg);
  33.571 +    return;
  33.572 +
  33.573 + parse_error:
  33.574 +    msg->length = 0;
  33.575 +    ctrl_if_send_response(msg);
  33.576 +}
  33.577 +
  33.578 +
  33.579  static int __init init_module(void)
  33.580  {
  33.581 -#if 0
  33.582 -    int i, fixmap_idx=-1, err;
  33.583 +    ctrl_msg_t                       cmsg;
  33.584 +    netif_fe_driver_status_changed_t st;
  33.585 +    int err;
  33.586      struct net_device *dev;
  33.587      struct net_private *np;
  33.588 -    netop_t netop;
  33.589 +
  33.590 +    if ( start_info.flags & SIF_INITDOMAIN )
  33.591 +        return 0;
  33.592  
  33.593      INIT_LIST_HEAD(&dev_list);
  33.594  
  33.595 -    network_irq = bind_virq_to_irq(VIRQ_NET);
  33.596 -    debug_irq   = bind_virq_to_irq(VIRQ_DEBUG);
  33.597 +    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
  33.598 +    {
  33.599 +        err = -ENOMEM;
  33.600 +        goto fail;
  33.601 +    }
  33.602  
  33.603 -    err = request_irq(network_irq, network_interrupt, 
  33.604 -                      SA_SAMPLE_RANDOM, "network", NULL);
  33.605 -    if ( err )
  33.606 +    np = dev->priv;
  33.607 +    np->state  = NETIF_STATE_CLOSED;
  33.608 +    np->handle = 0;
  33.609 +
  33.610 +    dev->open            = network_open;
  33.611 +    dev->hard_start_xmit = network_start_xmit;
  33.612 +    dev->stop            = network_close;
  33.613 +    dev->get_stats       = network_get_stats;
  33.614 +    
  33.615 +    if ( (err = register_netdev(dev)) != 0 )
  33.616      {
  33.617 -        printk(KERN_WARNING "Could not allocate network interrupt\n");
  33.618 +        kfree(dev);
  33.619          goto fail;
  33.620      }
  33.621      
  33.622 -    err = request_irq(debug_irq, dbg_network_int, 
  33.623 -                      SA_SHIRQ, "net_dbg", &dbg_network_int);
  33.624 -    if ( err )
  33.625 -        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
  33.626 +    np->dev = dev;
  33.627 +    list_add(&np->list, &dev_list);
  33.628  
  33.629 -    for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
  33.630 -    {
  33.631 -        /* If the VIF is invalid then the query hypercall will fail. */
  33.632 -        netop.cmd = NETOP_GET_VIF_INFO;
  33.633 -        netop.vif = i;
  33.634 -        if ( HYPERVISOR_net_io_op(&netop) != 0 )
  33.635 -            continue;
  33.636 -
  33.637 -        /* We actually only support up to 4 vifs right now. */
  33.638 -        if ( ++fixmap_idx == 4 )
  33.639 -            break;
  33.640 +    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
  33.641 +                                    CALLBACK_IN_BLOCKING_CONTEXT);
  33.642  
  33.643 -        dev = alloc_etherdev(sizeof(struct net_private));
  33.644 -        if ( dev == NULL )
  33.645 -        {
  33.646 -            err = -ENOMEM;
  33.647 -            goto fail;
  33.648 -        }
  33.649 -
  33.650 -        np = dev->priv;
  33.651 -        np->state               = STATE_CLOSED;
  33.652 -        np->net_ring_fixmap_idx = fixmap_idx;
  33.653 -        np->idx                 = i;
  33.654 +    /* Send a driver-UP notification to the domain controller. */
  33.655 +    cmsg.type      = CMSG_NETIF_FE;
  33.656 +    cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
  33.657 +    cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
  33.658 +    st.status      = NETIF_DRIVER_STATUS_UP;
  33.659 +    memcpy(cmsg.msg, &st, sizeof(st));
  33.660 +    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
  33.661  
  33.662 -        SET_MODULE_OWNER(dev);
  33.663 -        dev->open            = network_open;
  33.664 -        dev->hard_start_xmit = network_start_xmit;
  33.665 -        dev->stop            = network_close;
  33.666 -        dev->get_stats       = network_get_stats;
  33.667 -
  33.668 -        memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);
  33.669 -
  33.670 -        if ( (err = register_netdev(dev)) != 0 )
  33.671 -        {
  33.672 -            kfree(dev);
  33.673 -            goto fail;
  33.674 -        }
  33.675 -
  33.676 -        np->dev = dev;
  33.677 -        list_add(&np->list, &dev_list);
  33.678 +    /*
  33.679 +     * We should read 'nr_interfaces' from response message and wait
  33.680 +     * for notifications before proceeding. For now we assume that we
  33.681 +     * will be notified of exactly one interface.
  33.682 +     */
  33.683 +    while ( np->state != NETIF_STATE_CONNECTED )
  33.684 +    {
  33.685 +        set_current_state(TASK_INTERRUPTIBLE);
  33.686 +        schedule_timeout(1);
  33.687      }
  33.688  
  33.689      return 0;
  33.690 @@ -534,30 +575,13 @@ static int __init init_module(void)
  33.691   fail:
  33.692      cleanup_module();
  33.693      return err;
  33.694 -#endif
  33.695 -    return 0;
  33.696  }
  33.697  
  33.698  
  33.699  static void cleanup_module(void)
  33.700  {
  33.701 -    struct net_private *np;
  33.702 -    struct net_device *dev;
  33.703 -
  33.704 -    while ( !list_empty(&dev_list) )
  33.705 -    {
  33.706 -        np = list_entry(dev_list.next, struct net_private, list);
  33.707 -        list_del(&np->list);
  33.708 -        dev = np->dev;
  33.709 -        unregister_netdev(dev);
  33.710 -        kfree(dev);
  33.711 -    }
  33.712 -
  33.713 -    free_irq(network_irq, NULL);
  33.714 -    free_irq(debug_irq, NULL);
  33.715 -
  33.716 -    unbind_virq_from_irq(VIRQ_NET);
  33.717 -    unbind_virq_from_irq(VIRQ_DEBUG);
  33.718 +    /* XXX FIXME */
  33.719 +    BUG();
  33.720  }
  33.721  
  33.722  
    34.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c	Tue May 11 14:57:44 2004 +0000
    34.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c	Tue May 11 15:02:26 2004 +0000
    34.3 @@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_acti
    34.4  static CONTROL_RING_IDX ctrl_if_tx_resp_cons;
    34.5  static CONTROL_RING_IDX ctrl_if_rx_req_cons;
    34.6  
    34.7 -/* Incoming message requests: primary message type -> message handler. */
    34.8 +/* Incoming message requests. */
    34.9 +    /* Primary message type -> message handler. */
   34.10  static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256];
   34.11 +    /* Primary message type -> callback in process context? */
   34.12 +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)];
   34.13 +    /* Is it late enough during bootstrap to use schedule_task()? */
   34.14 +static int safe_to_schedule_task;
   34.15 +    /* Passed to schedule_task(). */
   34.16 +static struct tq_struct ctrl_if_rxmsg_deferred_tq;
   34.17 +    /* Queue up messages to be handled in process context. */
   34.18 +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE];
   34.19 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod;
   34.20 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons;
   34.21  
   34.22  /* Incoming message responses: message identifier -> message handler/id. */
   34.23  static struct {
   34.24 @@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigne
   34.25      }
   34.26  }
   34.27  
   34.28 +static void __ctrl_if_rxmsg_deferred(void *unused)
   34.29 +{
   34.30 +    ctrl_msg_t *msg;
   34.31 +
   34.32 +    while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
   34.33 +    {
   34.34 +        msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
   34.35 +            ctrl_if_rxmsg_deferred_cons++)];
   34.36 +        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
   34.37 +    }
   34.38 +}
   34.39 +
   34.40  static void __ctrl_if_rx_tasklet(unsigned long data)
   34.41  {
   34.42      control_if_t *ctrl_if = get_ctrl_if();
   34.43 -    ctrl_msg_t   *msg;
   34.44 +    ctrl_msg_t    msg, *pmsg;
   34.45  
   34.46      while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
   34.47      {
   34.48 -        /*
   34.49 -         * We need no locking or barriers here. There will be one and only one
   34.50 -         * response as a result of each callback, so the callback handler
   34.51 -         * doesn't need to worry about the 'msg' being overwritten until:
   34.52 -         *  1. It returns (if the message must persist then it must be copied).
   34.53 -         *  2. A response is sent (the response may overwrite the request).
   34.54 -         */
   34.55 -        msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
   34.56 -        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
   34.57 +        pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
   34.58 +        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
   34.59 +        if ( msg.length != 0 )
   34.60 +            memcpy(msg.msg, pmsg->msg, msg.length);
   34.61 +        if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) )
   34.62 +        {
   34.63 +            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
   34.64 +                ctrl_if_rxmsg_deferred_prod++)];
   34.65 +            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
   34.66 +            schedule_task(&ctrl_if_rxmsg_deferred_tq);
   34.67 +        }
   34.68 +        else
   34.69 +        {
   34.70 +            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
   34.71 +        }
   34.72      }
   34.73  }
   34.74  
   34.75 @@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *m
   34.76      ctrl_if_notify_controller();
   34.77  }
   34.78  
   34.79 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd)
   34.80 +int ctrl_if_register_receiver(
   34.81 +    u8 type, 
   34.82 +    ctrl_msg_handler_t hnd, 
   34.83 +    unsigned int flags)
   34.84  {
   34.85 -    unsigned long flags;
   34.86 +    unsigned long _flags;
   34.87      int inuse;
   34.88  
   34.89 -    spin_lock_irqsave(&ctrl_if_lock, flags);
   34.90 +    spin_lock_irqsave(&ctrl_if_lock, _flags);
   34.91  
   34.92      inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler);
   34.93  
   34.94      if ( inuse )
   34.95 +    {
   34.96          printk(KERN_INFO "Receiver %p already established for control "
   34.97                 "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type);
   34.98 +    }
   34.99      else
  34.100 +    {
  34.101          ctrl_if_rxmsg_handler[type] = hnd;
  34.102 +        clear_bit(type, &ctrl_if_rxmsg_blocking_context);
  34.103 +        if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
  34.104 +        {
  34.105 +            set_bit(type, &ctrl_if_rxmsg_blocking_context);
  34.106 +            if ( !safe_to_schedule_task )
  34.107 +                BUG();
  34.108 +        }
  34.109 +    }
  34.110  
  34.111 -    spin_unlock_irqrestore(&ctrl_if_lock, flags);
  34.112 +    spin_unlock_irqrestore(&ctrl_if_lock, _flags);
  34.113  
  34.114      return !inuse;
  34.115  }
  34.116 @@ -326,6 +369,7 @@ void __init ctrl_if_init(void)
  34.117  
  34.118      for ( i = 0; i < 256; i++ )
  34.119          ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler;
  34.120 +    ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred;
  34.121  
  34.122      spin_lock_init(&ctrl_if_lock);
  34.123  
  34.124 @@ -333,6 +377,15 @@ void __init ctrl_if_init(void)
  34.125  }
  34.126  
  34.127  
  34.128 +/* This is called after it is safe to call schedule_task(). */
  34.129 +static int __init ctrl_if_late_setup(void)
  34.130 +{
  34.131 +    safe_to_schedule_task = 1;
  34.132 +    return 0;
  34.133 +}
  34.134 +__initcall(ctrl_if_late_setup);
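
schedule_task() needs keventd, which only exists once initcalls have run, so this late initcall flips safe_to_schedule_task and ctrl_if_register_receiver() refuses blocking-context registrations made any earlier. The guard, condensed from the registration path above:

    if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
    {
        set_bit(type, &ctrl_if_rxmsg_blocking_context);
        if ( !safe_to_schedule_task )
            BUG();    /* too early in boot for schedule_task() */
    }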
  34.135 +
  34.136 +
  34.137  /*
  34.138   * !! The following are DANGEROUS FUNCTIONS !!
  34.139   * Use with care [for example, see xencons_force_flush()].
    35.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c	Tue May 11 14:57:44 2004 +0000
    35.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c	Tue May 11 15:02:26 2004 +0000
    35.3 @@ -8,7 +8,10 @@
    35.4  
    35.5  #include <linux/config.h>
    35.6  #include <linux/sched.h>
    35.7 +#include <linux/mm.h>
    35.8 +#include <linux/vmalloc.h>
    35.9  #include <asm/hypervisor.h>
   35.10 +#include <asm/hypervisor-ifs/dom_mem_ops.h>
   35.11  #include <asm/page.h>
   35.12  #include <asm/pgtable.h>
   35.13  #include <asm/multicall.h>
   35.14 @@ -244,3 +247,105 @@ void queue_set_ldt(unsigned long ptr, un
   35.15      increment_index();
   35.16      spin_unlock_irqrestore(&update_lock, flags);
   35.17  }
   35.18 +
   35.19 +void queue_machphys_update(unsigned long mfn, unsigned long pfn)
   35.20 +{
   35.21 +    unsigned long flags;
   35.22 +    spin_lock_irqsave(&update_lock, flags);
   35.23 +    update_queue[idx].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
   35.24 +    update_queue[idx].val = pfn;
   35.25 +    increment_index();
   35.26 +    spin_unlock_irqrestore(&update_lock, flags);
   35.27 +}
   35.28 +
   35.29 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS
   35.30 +
   35.31 +unsigned long allocate_empty_lowmem_region(unsigned long pages)
   35.32 +{
   35.33 +    pgd_t         *pgd; 
   35.34 +    pmd_t         *pmd;
   35.35 +    pte_t         *pte;
   35.36 +    unsigned long *pfn_array;
   35.37 +    unsigned long  vstart;
   35.38 +    unsigned long  i;
   35.39 +    int            ret;
   35.40 +    unsigned int   order = get_order(pages*PAGE_SIZE);
   35.41 +    dom_mem_op_t   dom_mem_op;
   35.42 +
   35.43 +    vstart = __get_free_pages(GFP_KERNEL, order);
   35.44 +    if ( vstart == 0 )
   35.45 +        return 0UL;
   35.46 +
   35.47 +    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
   35.48 +    if ( pfn_array == NULL )
   35.49 +        BUG();
   35.50 +
   35.51 +    for ( i = 0; i < (1<<order); i++ )
   35.52 +    {
   35.53 +        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
   35.54 +        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
   35.55 +        pte = pte_offset(pmd, (vstart + (i*PAGE_SIZE))); 
   35.56 +        pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
   35.57 +        queue_l1_entry_update(pte, 0);
   35.58 +        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
   35.59 +    }
   35.60 +
   35.61 +    flush_page_update_queue();
   35.62 +
   35.63 +    dom_mem_op.op = MEMOP_RESERVATION_DECREASE;
   35.64 +    dom_mem_op.u.decrease.size  = 1<<order;
   35.65 +    dom_mem_op.u.decrease.pages = pfn_array;
   35.66 +    if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) )
   35.67 +    {
   35.68 +        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
   35.69 +        BUG();
   35.70 +    }
   35.71 +
   35.72 +    vfree(pfn_array);
   35.73 +
   35.74 +    return vstart;
   35.75 +}
   35.76 +
   35.77 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
   35.78 +{
   35.79 +    pgd_t         *pgd; 
   35.80 +    pmd_t         *pmd;
   35.81 +    pte_t         *pte;
   35.82 +    unsigned long *pfn_array;
   35.83 +    unsigned long  i;
   35.84 +    int            ret;
   35.85 +    unsigned int   order = get_order(pages*PAGE_SIZE);
   35.86 +    dom_mem_op_t   dom_mem_op;
   35.87 +
   35.88 +    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
   35.89 +    if ( pfn_array == NULL )
   35.90 +        BUG();
   35.91 +
   35.92 +    dom_mem_op.op = MEMOP_RESERVATION_INCREASE;
   35.93 +    dom_mem_op.u.increase.size  = 1<<order;
   35.94 +    dom_mem_op.u.increase.pages = pfn_array;
   35.95 +    if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) )
   35.96 +    {
   35.97 +        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
   35.98 +               ret);
   35.99 +        BUG();
  35.100 +    }
  35.101 +
  35.102 +    for ( i = 0; i < (1<<order); i++ )
  35.103 +    {
  35.104 +        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
  35.105 +        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
  35.106 +        pte = pte_offset(pmd, (vstart + (i*PAGE_SIZE)));
  35.107 +        queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
  35.108 +        queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT);
  35.109 +        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i];
  35.110 +    }
  35.111 +
  35.112 +    flush_page_update_queue();
  35.113 +
  35.114 +    vfree(pfn_array);
  35.115 +
  35.116 +    free_pages(vstart, order);
  35.117 +}
  35.118 +
  35.119 +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
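
These two routines are the backend's analogue of ballooning: allocate_empty_lowmem_region() takes a block of kernel pages, records their machine frames, unmaps them, and returns the frames to Xen, leaving a contiguous virtual window that can later be pointed at other domains' frames; deallocate_lowmem_region() reverses this by reclaiming frames and rewiring the ptes and machphys table. A usage sketch, modelled on the netif backend's init path:

    /* Carve out a window for mapping foreign pages (sketch). */
    unsigned long vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS);
    if ( vstart == 0 )
        BUG();
    /* ... map guest frames at vstart + i*PAGE_SIZE while in use ... */
    deallocate_lowmem_region(vstart, MAX_PENDING_REQS);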
    36.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c	Tue May 11 14:57:44 2004 +0000
    36.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c	Tue May 11 15:02:26 2004 +0000
    36.3 @@ -202,10 +202,6 @@ void * __ioremap(unsigned long machine_a
    36.4      unsigned long offset, last_addr;
    36.5      pgprot_t prot;
    36.6  
    36.7 -    /* Only privileged Xenolinux can make unchecked pagetable updates. */
    36.8 -    if ( !(start_info.flags & SIF_PRIVILEGED) )
    36.9 -        return NULL;
   36.10 -
   36.11      /* Don't allow wraparound or zero size */
   36.12      last_addr = machine_addr + size - 1;
   36.13      if (!size || last_addr < machine_addr)
    37.1 --- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c	Tue May 11 14:57:44 2004 +0000
    37.2 +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c	Tue May 11 15:02:26 2004 +0000
    37.3 @@ -1626,7 +1626,7 @@ int __init blk_dev_init(void)
    37.4  	jsfd_init();
    37.5  #endif
    37.6  
    37.7 -#ifdef CONFIG_XEN_VBD
    37.8 +#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO)
    37.9      xlblk_init();
   37.10  #endif
   37.11  
    38.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h	Tue May 11 14:57:44 2004 +0000
    38.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h	Tue May 11 15:02:26 2004 +0000
    38.3 @@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *m
    38.4   * Register a receiver for typed messages from the domain controller. The 
    38.5   * handler (@hnd) is called for every received message of specified @type.
    38.6   * Returns TRUE (non-zero) if the handler was successfully registered.
    38.7 + * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
    38.8 + * occur in a context in which it is safe to yield (i.e., process context).
    38.9   */
   38.10 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd);
   38.11 +#define CALLBACK_IN_BLOCKING_CONTEXT 1
   38.12 +int ctrl_if_register_receiver(
   38.13 +    u8 type, 
   38.14 +    ctrl_msg_handler_t hnd,
   38.15 +    unsigned int flags);
   38.16  
   38.17  /*
   38.18   * Unregister a receiver for typed messages from the domain controller. The 
    39.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h	Tue May 11 14:57:44 2004 +0000
    39.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h	Tue May 11 15:02:26 2004 +0000
    39.3 @@ -44,6 +44,7 @@ void queue_pgd_unpin(unsigned long ptr);
    39.4  void queue_pte_pin(unsigned long ptr);
    39.5  void queue_pte_unpin(unsigned long ptr);
    39.6  void queue_set_ldt(unsigned long ptr, unsigned long bytes);
    39.7 +void queue_machphys_update(unsigned long mfn, unsigned long pfn);
    39.8  #define MMU_UPDATE_DEBUG 0
    39.9  
   39.10  #if MMU_UPDATE_DEBUG > 0
   39.11 @@ -137,6 +138,12 @@ static inline int flush_page_update_queu
   39.12  #define XEN_flush_page_update_queue() (_flush_page_update_queue())
   39.13  void MULTICALL_flush_page_update_queue(void);
   39.14  
   39.15 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS
   39.16 +/* Allocate a contiguous empty region of low memory. Return virtual start. */
   39.17 +unsigned long allocate_empty_lowmem_region(unsigned long pages);
   39.18 +/* Deallocate a contiguous region of low memory. Return it to the allocator. */
   39.19 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages);
   39.20 +#endif
   39.21  
   39.22  /*
   39.23   * Assembler stubs for hyper-calls.
    40.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h	Tue May 11 14:57:44 2004 +0000
    40.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h	Tue May 11 15:02:26 2004 +0000
    40.3 @@ -159,46 +159,11 @@ extern void iounmap(void *addr);
    40.4  extern void *bt_ioremap(unsigned long offset, unsigned long size);
    40.5  extern void bt_iounmap(void *addr, unsigned long size);
    40.6  
    40.7 -#ifdef CONFIG_XEN_PHYSDEV_ACCESS
    40.8 -
    40.9 -#ifdef CONFIG_HIGHMEM
   40.10 -#error "Highmem is not yet compatible with physical device access"
   40.11 -#endif
   40.12 -
   40.13 -/*
   40.14 - * The bus translation macros need special care if we are executing device
   40.15 - * accesses to/from other domains' memory. In these cases the virtual address
   40.16 - * is actually a temporary mapping in the 'vmalloc' space. The physical
   40.17 - * address will therefore be >max_low_pfn, and will not have a valid entry
   40.18 - * in the phys_to_mach mapping table.
   40.19 - */
   40.20 -static inline unsigned long phys_to_bus(unsigned long phys)
   40.21 -{
   40.22 -    extern unsigned long max_pfn;
   40.23 -    pgd_t *pgd; pmd_t *pmd; pte_t *pte;
   40.24 -    void *addr;
   40.25 -    unsigned long bus;
   40.26 -    if ( (phys >> PAGE_SHIFT) < max_pfn )
   40.27 -        return phys_to_machine(phys);
   40.28 -    addr = phys_to_virt(phys);
   40.29 -    pgd = pgd_offset_k(   (unsigned long)addr);
   40.30 -    pmd = pmd_offset(pgd, (unsigned long)addr);
   40.31 -    pte = pte_offset(pmd, (unsigned long)addr);
   40.32 -    bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK);
   40.33 -    return bus;
   40.34 -}
   40.35 -
   40.36 -#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x))
   40.37 -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
   40.38 -#define page_to_bus(_x) phys_to_bus(page_to_phys(_x))
   40.39 -
   40.40 -#else
   40.41 -
   40.42  #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x))
   40.43  #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
   40.44  #define page_to_bus(_x) phys_to_machine(page_to_phys(_x))
   40.45 -
   40.46 -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
   40.47 +#define bus_to_phys(_x) machine_to_phys(_x)
   40.48 +#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT))
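
With the privileged-domain special case removed, bus addresses are simply machine addresses and the pfn<->mfn tables do all the work. A round-trip sketch, assuming a directly mapped kernel address:

    /* Sketch: virt -> machine -> virt should be the identity. */
    static inline int bus_round_trip_ok(void *p)
    {
        unsigned long bus = virt_to_bus(p);   /* pfn -> mfn via the table */
        return bus_to_virt(bus) == p;         /* mfn -> pfn via the table */
    }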
   40.49  
   40.50  /*
   40.51   * readX/writeX() are used to access memory mapped devices. On some
    41.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    41.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h	Tue May 11 15:02:26 2004 +0000
    41.3 @@ -0,0 +1,283 @@
    41.4 +#ifndef __i386_PCI_H
    41.5 +#define __i386_PCI_H
    41.6 +
    41.7 +#include <linux/config.h>
    41.8 +
    41.9 +#ifdef __KERNEL__
   41.10 +
   41.11 +/* Can be used to override the logic in pci_scan_bus for skipping
   41.12 +   already-configured bus numbers - to be used for buggy BIOSes
   41.13 +   or architectures with incomplete PCI setup by the loader */
   41.14 +
   41.15 +#ifdef CONFIG_PCI
   41.16 +extern unsigned int pcibios_assign_all_busses(void);
   41.17 +#else
   41.18 +#define pcibios_assign_all_busses()	0
   41.19 +#endif
   41.20 +#define pcibios_scan_all_fns()		0
   41.21 +
   41.22 +extern unsigned long pci_mem_start;
   41.23 +#define PCIBIOS_MIN_IO		0x1000
   41.24 +#define PCIBIOS_MIN_MEM		(pci_mem_start)
   41.25 +
   41.26 +void pcibios_config_init(void);
   41.27 +struct pci_bus * pcibios_scan_root(int bus);
   41.28 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
   41.29 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
   41.30 +
   41.31 +void pcibios_set_master(struct pci_dev *dev);
   41.32 +void pcibios_penalize_isa_irq(int irq);
   41.33 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
   41.34 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
   41.35 +
   41.36 +/* Dynamic DMA mapping stuff.
   41.37 + * i386 has everything mapped statically.
   41.38 + */
   41.39 +
   41.40 +#include <linux/types.h>
   41.41 +#include <linux/slab.h>
   41.42 +#include <asm/scatterlist.h>
   41.43 +#include <linux/string.h>
   41.44 +#include <asm/io.h>
   41.45 +
   41.46 +struct pci_dev;
   41.47 +
   41.48 +/* The networking and block device layers use this boolean for bounce
   41.49 + * buffer decisions.
   41.50 + */
   41.51 +#define PCI_DMA_BUS_IS_PHYS	(0)
   41.52 +
   41.53 +/* Allocate and map kernel buffer using consistent mode DMA for a device.
   41.54 + * hwdev should be valid struct pci_dev pointer for PCI devices,
   41.55 + * NULL for PCI-like buses (ISA, EISA).
   41.56 + * Returns non-NULL cpu-view pointer to the buffer if successful and
   41.57 + * sets *dma_addrp to the pci side dma address as well, else *dma_addrp
   41.58 + * is undefined.
   41.59 + */
   41.60 +extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
   41.61 +				  dma_addr_t *dma_handle);
   41.62 +
   41.63 +/* Free and unmap a consistent DMA buffer.
   41.64 + * cpu_addr is what was returned from pci_alloc_consistent,
   41.65 + * size must be the same as what as passed into pci_alloc_consistent,
   41.66 + * and likewise dma_addr must be the same as what *dma_addrp was set to.
   41.67 + *
   41.68 + * References to the memory and mappings associated with cpu_addr/dma_addr
   41.69 + * past this call are illegal.
   41.70 + */
   41.71 +extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
   41.72 +				void *vaddr, dma_addr_t dma_handle);
   41.73 +
   41.74 +/* Map a single buffer of the indicated size for DMA in streaming mode.
   41.75 + * The 32-bit bus address to use is returned.
   41.76 + *
   41.77 + * Once the device is given the dma address, the device owns this memory
   41.78 + * until either pci_unmap_single or pci_dma_sync_single is performed.
   41.79 + */
   41.80 +static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
   41.81 +					size_t size, int direction)
   41.82 +{
   41.83 +	if (direction == PCI_DMA_NONE)
   41.84 +		out_of_line_bug();
   41.85 +	flush_write_buffers();
   41.86 +	return virt_to_bus(ptr);
   41.87 +}
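
Because every kernel page already has a machine mapping, "mapping" a buffer for streaming DMA is just the address translation above plus a write-buffer flush. A hypothetical caller (pdev, buf, and len are assumptions for illustration):

    /* Usage sketch with an assumed device and buffer. */
    dma_addr_t h = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
    /* ... hand 'h' (a machine address) to the device ... */
    pci_unmap_single(pdev, h, len, PCI_DMA_TODEVICE);   /* nothing to undo */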
   41.88 +
   41.89 +/* Unmap a single streaming mode DMA translation.  The dma_addr and size
   41.90 + * must match what was provided for in a previous pci_map_single call.  All
   41.91 + * other usages are undefined.
   41.92 + *
   41.93 + * After this call, reads by the cpu to the buffer are guaranteed to see
   41.94 + * whatever the device wrote there.
   41.95 + */
   41.96 +static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
   41.97 +				    size_t size, int direction)
   41.98 +{
   41.99 +	if (direction == PCI_DMA_NONE)
  41.100 +		out_of_line_bug();
  41.101 +	/* Nothing to do */
  41.102 +}
  41.103 +
  41.104 +/*
  41.105 + * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t; identical
  41.106 + * to pci_map_single, but takes a struct page instead of a virtual address
  41.107 + */
  41.108 +static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
  41.109 +				      unsigned long offset, size_t size, int direction)
  41.110 +{
  41.111 +	if (direction == PCI_DMA_NONE)
  41.112 +		out_of_line_bug();
  41.113 +
  41.114 +	return page_to_bus(page) + offset;
  41.115 +}
  41.116 +
  41.117 +static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
  41.118 +				  size_t size, int direction)
  41.119 +{
  41.120 +	if (direction == PCI_DMA_NONE)
  41.121 +		out_of_line_bug();
  41.122 +	/* Nothing to do */
  41.123 +}
  41.124 +
   41.125 +/* pci_unmap_{page,single} are nops, so... */
  41.126 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
  41.127 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
  41.128 +#define pci_unmap_addr(PTR, ADDR_NAME)		(0)
  41.129 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)	do { } while (0)
  41.130 +#define pci_unmap_len(PTR, LEN_NAME)		(0)
  41.131 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
  41.132 +
   41.133 +/* Map a set of buffers described by scatterlist in streaming
   41.134 + * mode for DMA.  This is the scatter-gather version of the
   41.135 + * above pci_map_single interface.  Here the scatter-gather list
   41.136 + * elements are each tagged with the appropriate dma address
   41.137 + * and length.  They are obtained via sg_dma_{address,len}(SG).
  41.138 + *
  41.139 + * NOTE: An implementation may be able to use a smaller number of
  41.140 + *       DMA address/length pairs than there are SG table elements.
  41.141 + *       (for example via virtual mapping capabilities)
  41.142 + *       The routine returns the number of addr/length pairs actually
  41.143 + *       used, at most nents.
  41.144 + *
  41.145 + * Device ownership issues as mentioned above for pci_map_single are
  41.146 + * the same here.
  41.147 + */
  41.148 +static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
  41.149 +			     int nents, int direction)
  41.150 +{
  41.151 +	int i;
  41.152 +
  41.153 +	if (direction == PCI_DMA_NONE)
  41.154 +		out_of_line_bug();
   41.155 +
   41.156 +	/*
   41.157 +	 * temporary 2.4 hack
   41.158 +	 */
   41.159 +	for (i = 0; i < nents; i++) {
   41.160 +		if (sg[i].address && sg[i].page)
   41.161 +			out_of_line_bug();
   41.162 +		else if (!sg[i].address && !sg[i].page)
   41.163 +			out_of_line_bug();
   41.164 +
   41.165 +		if (sg[i].address)
   41.166 +			sg[i].dma_address = virt_to_bus(sg[i].address);
   41.167 +		else
   41.168 +			sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
   41.169 +	}
   41.170 +
  41.171 +	flush_write_buffers();
  41.172 +	return nents;
  41.173 +}
  41.174 +
  41.175 +/* Unmap a set of streaming mode DMA translations.
  41.176 + * Again, cpu read rules concerning calls here are the same as for
  41.177 + * pci_unmap_single() above.
  41.178 + */
  41.179 +static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
  41.180 +				int nents, int direction)
  41.181 +{
  41.182 +	if (direction == PCI_DMA_NONE)
  41.183 +		out_of_line_bug();
  41.184 +	/* Nothing to do */
  41.185 +}
  41.186 +
  41.187 +/* Make physical memory consistent for a single
  41.188 + * streaming mode DMA translation after a transfer.
  41.189 + *
   41.190 + * If you perform a pci_map_single() but wish to interrogate the
   41.191 + * buffer using the cpu, yet do not wish to tear down the PCI dma
   41.192 + * mapping, you must call this function before doing so.  As soon as
   41.193 + * you give the PCI dma address back to the card, the device again
   41.194 + * owns the buffer.
  41.195 + */
  41.196 +static inline void pci_dma_sync_single(struct pci_dev *hwdev,
  41.197 +				       dma_addr_t dma_handle,
  41.198 +				       size_t size, int direction)
  41.199 +{
  41.200 +	if (direction == PCI_DMA_NONE)
  41.201 +		out_of_line_bug();
  41.202 +	flush_write_buffers();
  41.203 +}
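
A sketch (editorial addition) of the pattern the comment above describes: peek at a still-mapped buffer without tearing the mapping down. peek_status() and BUF_LEN are assumed; handle comes from an earlier pci_map_single() call:

	static unsigned char peek_status(struct pci_dev *hwdev, void *buf,
					 dma_addr_t handle)
	{
		pci_dma_sync_single(hwdev, handle, BUF_LEN, PCI_DMA_FROMDEVICE);
		/* cpu reads of buf are coherent here; the device regains
		 * ownership once 'handle' is handed back to it */
		return *(unsigned char *) buf;
	}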
  41.204 +
  41.205 +/* Make physical memory consistent for a set of streaming
  41.206 + * mode DMA translations after a transfer.
  41.207 + *
  41.208 + * The same as pci_dma_sync_single but for a scatter-gather list,
  41.209 + * same rules and usage.
  41.210 + */
  41.211 +static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
  41.212 +				   struct scatterlist *sg,
  41.213 +				   int nelems, int direction)
  41.214 +{
  41.215 +	if (direction == PCI_DMA_NONE)
  41.216 +		out_of_line_bug();
  41.217 +	flush_write_buffers();
  41.218 +}
  41.219 +
  41.220 +/* Return whether the given PCI device DMA address mask can
  41.221 + * be supported properly.  For example, if your device can
  41.222 + * only drive the low 24-bits during PCI bus mastering, then
  41.223 + * you would pass 0x00ffffff as the mask to this function.
  41.224 + */
  41.225 +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
  41.226 +{
   41.227 +	/*
   41.228 +	 * We fall back to GFP_DMA when the mask isn't all 1s,
   41.229 +	 * so we can't guarantee allocations that must be
   41.230 +	 * within a tighter range than GFP_DMA.
   41.231 +	 */
   41.232 +	if (mask < 0x00ffffff)
   41.233 +		return 0;
  41.234 +
  41.235 +	return 1;
  41.236 +}
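
A sketch (not in the patch) matching the 24-bit example in the comment above, as it might appear in a probe routine; pdev is assumed:

	if (!pci_dma_supported(pdev, 0x00ffffff))
		return -EIO;	/* platform cannot confine DMA below 16MB */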
  41.237 +
  41.238 +/* This is always fine. */
  41.239 +#define pci_dac_dma_supported(pci_dev, mask)	(1)
  41.240 +
  41.241 +static __inline__ dma64_addr_t
  41.242 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
  41.243 +{
  41.244 +	return ((dma64_addr_t) page_to_bus(page) +
  41.245 +		(dma64_addr_t) offset);
  41.246 +}
  41.247 +
  41.248 +static __inline__ struct page *
  41.249 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
  41.250 +{
  41.251 +	return bus_to_page(dma_addr);
  41.252 +}
  41.253 +
  41.254 +static __inline__ unsigned long
  41.255 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
  41.256 +{
  41.257 +	return (dma_addr & ~PAGE_MASK);
  41.258 +}
  41.259 +
  41.260 +static __inline__ void
  41.261 +pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
  41.262 +{
  41.263 +	flush_write_buffers();
  41.264 +}
  41.265 +
  41.266 +/* These macros should be used after a pci_map_sg call has been done
  41.267 + * to get bus addresses of each of the SG entries and their lengths.
  41.268 + * You should only work with the number of sg entries pci_map_sg
  41.269 + * returns.
  41.270 + */
  41.271 +#define sg_dma_address(sg)	((sg)->dma_address)
  41.272 +#define sg_dma_len(sg)		((sg)->length)
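
A usage sketch (editorial addition): program one descriptor per *mapped* entry, honouring the possibly smaller count returned by pci_map_sg(). example_tx() and setup_descriptor() are hypothetical:

	static int example_tx(struct pci_dev *hwdev, struct scatterlist *sg,
			      int nents)
	{
		int i, count;

		count = pci_map_sg(hwdev, sg, nents, PCI_DMA_TODEVICE);
		for (i = 0; i < count; i++)	/* count, not nents */
			setup_descriptor(sg_dma_address(&sg[i]),
					 sg_dma_len(&sg[i]));
		/* ... run the transfer ... */
		pci_unmap_sg(hwdev, sg, nents, PCI_DMA_TODEVICE);
		return count;
	}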
  41.273 +
  41.274 +/* Return the index of the PCI controller for device. */
  41.275 +static inline int pci_controller_num(struct pci_dev *dev)
  41.276 +{
  41.277 +	return 0;
  41.278 +}
  41.279 +
  41.280 +#define HAVE_PCI_MMAP
  41.281 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
  41.282 +			       enum pci_mmap_state mmap_state, int write_combine);
  41.283 +
  41.284 +#endif /* __KERNEL__ */
  41.285 +
  41.286 +#endif /* __i386_PCI_H */
    42.1 --- a/xenolinux-2.4.26-sparse/mkbuildtree	Tue May 11 14:57:44 2004 +0000
    42.2 +++ b/xenolinux-2.4.26-sparse/mkbuildtree	Tue May 11 15:02:26 2004 +0000
    42.3 @@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h
    42.4  ln -sf ../asm-i386/namei.h 
    42.5  ln -sf ../asm-i386/param.h 
    42.6  ln -sf ../asm-i386/parport.h 
    42.7 -ln -sf ../asm-i386/pci.h
    42.8  ln -sf ../asm-i386/pgtable-3level.h 
    42.9  ln -sf ../asm-i386/poll.h 
   42.10  ln -sf ../asm-i386/posix_types.h 
    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c	Tue May 11 15:02:26 2004 +0000
    43.3 @@ -0,0 +1,930 @@
    43.4 +/*
    43.5 + *  linux/mm/page_alloc.c
    43.6 + *
    43.7 + *  Manages the free list, the system allocates free pages here.
    43.8 + *  Note that kmalloc() lives in slab.c
    43.9 + *
   43.10 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   43.11 + *  Swap reorganised 29.12.95, Stephen Tweedie
   43.12 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   43.13 + *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   43.14 + *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   43.15 + *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   43.16 + */
   43.17 +
   43.18 +#include <linux/config.h>
   43.19 +#include <linux/mm.h>
   43.20 +#include <linux/swap.h>
   43.21 +#include <linux/swapctl.h>
   43.22 +#include <linux/interrupt.h>
   43.23 +#include <linux/pagemap.h>
   43.24 +#include <linux/bootmem.h>
   43.25 +#include <linux/slab.h>
   43.26 +#include <linux/module.h>
   43.27 +
   43.28 +int nr_swap_pages;
   43.29 +int nr_active_pages;
   43.30 +int nr_inactive_pages;
   43.31 +LIST_HEAD(inactive_list);
   43.32 +LIST_HEAD(active_list);
   43.33 +pg_data_t *pgdat_list;
   43.34 +
    43.35 +/*
    43.36 + * The zone_table array is used to look up the address of the
    43.37 + * struct zone corresponding to a given zone number (ZONE_DMA,
    43.38 + * ZONE_NORMAL, or ZONE_HIGHMEM).
    43.39 + */
   43.41 +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
   43.42 +EXPORT_SYMBOL(zone_table);
   43.43 +
   43.44 +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
   43.45 +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
    43.46 +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
    43.47 +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
   43.48 +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
   43.49 +
   43.50 +int vm_gfp_debug = 0;
   43.51 +
   43.52 +/*
   43.53 + * Temporary debugging check.
   43.54 + */
   43.55 +#define BAD_RANGE(zone, page)						\
   43.56 +(									\
   43.57 +	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
   43.58 +	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
   43.59 +	|| ((zone) != page_zone(page))					\
   43.60 +)
   43.61 +
   43.62 +/*
   43.63 + * Freeing function for a buddy system allocator.
   43.64 + * Contrary to prior comments, this is *NOT* hairy, and there
   43.65 + * is no reason for anyone not to understand it.
   43.66 + *
   43.67 + * The concept of a buddy system is to maintain direct-mapped tables
   43.68 + * (containing bit values) for memory blocks of various "orders".
   43.69 + * The bottom level table contains the map for the smallest allocatable
   43.70 + * units of memory (here, pages), and each level above it describes
   43.71 + * pairs of units from the levels below, hence, "buddies".
   43.72 + * At a high level, all that happens here is marking the table entry
   43.73 + * at the bottom level available, and propagating the changes upward
   43.74 + * as necessary, plus some accounting needed to play nicely with other
   43.75 + * parts of the VM system.
   43.76 + * At each level, we keep one bit for each pair of blocks, which
   43.77 + * is set to 1 iff only one of the pair is allocated.  So when we
   43.78 + * are allocating or freeing one, we can derive the state of the
   43.79 + * other.  That is, if we allocate a small block, and both were   
   43.80 + * free, the remainder of the region must be split into blocks.   
   43.81 + * If a block is freed, and its buddy is also free, then this
   43.82 + * triggers coalescing into a block of larger size.            
   43.83 + *
   43.84 + * -- wli
   43.85 + */
   43.86 +
   43.87 +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
   43.88 +static void __free_pages_ok (struct page *page, unsigned int order)
   43.89 +{
   43.90 +	unsigned long index, page_idx, mask, flags;
   43.91 +	free_area_t *area;
   43.92 +	struct page *base;
   43.93 +	zone_t *zone;
   43.94 +
   43.95 +	/*
   43.96 +	 * Yes, think what happens when other parts of the kernel take 
   43.97 +	 * a reference to a page in order to pin it for io. -ben
   43.98 +	 */
   43.99 +	if (PageLRU(page)) {
  43.100 +		if (unlikely(in_interrupt()))
  43.101 +			BUG();
  43.102 +		lru_cache_del(page);
  43.103 +	}
  43.104 +
  43.105 +	if (page->buffers)
  43.106 +		BUG();
  43.107 +	if (page->mapping)
  43.108 +		return (*(void(*)(struct page *))page->mapping)(page);
  43.109 +	if (!VALID_PAGE(page))
  43.110 +		BUG();
  43.111 +	if (PageLocked(page))
  43.112 +		BUG();
  43.113 +	if (PageActive(page))
  43.114 +		BUG();
  43.115 +	ClearPageReferenced(page);
  43.116 +	ClearPageDirty(page);
  43.117 +
  43.118 +	if (current->flags & PF_FREE_PAGES)
  43.119 +		goto local_freelist;
  43.120 + back_local_freelist:
  43.121 +
  43.122 +	zone = page_zone(page);
  43.123 +
  43.124 +	mask = (~0UL) << order;
  43.125 +	base = zone->zone_mem_map;
  43.126 +	page_idx = page - base;
  43.127 +	if (page_idx & ~mask)
  43.128 +		BUG();
  43.129 +	index = page_idx >> (1 + order);
  43.130 +
  43.131 +	area = zone->free_area + order;
  43.132 +
  43.133 +	spin_lock_irqsave(&zone->lock, flags);
  43.134 +
  43.135 +	zone->free_pages -= mask;
  43.136 +
  43.137 +	while (mask + (1 << (MAX_ORDER-1))) {
  43.138 +		struct page *buddy1, *buddy2;
  43.139 +
  43.140 +		if (area >= zone->free_area + MAX_ORDER)
  43.141 +			BUG();
  43.142 +		if (!__test_and_change_bit(index, area->map))
  43.143 +			/*
  43.144 +			 * the buddy page is still allocated.
  43.145 +			 */
  43.146 +			break;
  43.147 +		/*
  43.148 +		 * Move the buddy up one level.
  43.149 +		 * This code is taking advantage of the identity:
  43.150 +		 * 	-mask = 1+~mask
  43.151 +		 */
  43.152 +		buddy1 = base + (page_idx ^ -mask);
  43.153 +		buddy2 = base + page_idx;
  43.154 +		if (BAD_RANGE(zone,buddy1))
  43.155 +			BUG();
  43.156 +		if (BAD_RANGE(zone,buddy2))
  43.157 +			BUG();
  43.158 +
  43.159 +		list_del(&buddy1->list);
  43.160 +		mask <<= 1;
  43.161 +		area++;
  43.162 +		index >>= 1;
  43.163 +		page_idx &= mask;
  43.164 +	}
  43.165 +	list_add(&(base + page_idx)->list, &area->free_list);
  43.166 +
  43.167 +	spin_unlock_irqrestore(&zone->lock, flags);
  43.168 +	return;
  43.169 +
  43.170 + local_freelist:
  43.171 +	if (current->nr_local_pages)
  43.172 +		goto back_local_freelist;
  43.173 +	if (in_interrupt())
  43.174 +		goto back_local_freelist;		
  43.175 +
  43.176 +	list_add(&page->list, &current->local_pages);
  43.177 +	page->index = order;
  43.178 +	current->nr_local_pages++;
  43.179 +}
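
A worked example (editorial addition) of the coalescing arithmetic in __free_pages_ok():

	/*
	 * For order 2: mask = ~0UL << 2, so -mask == 1 + ~mask == 4.
	 * The buddy is found by flipping a single bit:
	 *
	 *	page_idx = 24:  buddy1 = base + (24 ^ 4) = base + 28
	 *	page_idx = 28:  buddy1 = base + (28 ^ 4) = base + 24
	 *
	 * After a merge, mask <<= 1 moves the test up to order 3, where
	 * the combined block at index 24 pairs with the one at index 16.
	 */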
  43.180 +
  43.181 +#define MARK_USED(index, order, area) \
  43.182 +	__change_bit((index) >> (1+(order)), (area)->map)
  43.183 +
  43.184 +static inline struct page * expand (zone_t *zone, struct page *page,
  43.185 +	 unsigned long index, int low, int high, free_area_t * area)
  43.186 +{
  43.187 +	unsigned long size = 1 << high;
  43.188 +
  43.189 +	while (high > low) {
  43.190 +		if (BAD_RANGE(zone,page))
  43.191 +			BUG();
  43.192 +		area--;
  43.193 +		high--;
  43.194 +		size >>= 1;
  43.195 +		list_add(&(page)->list, &(area)->free_list);
  43.196 +		MARK_USED(index, high, area);
  43.197 +		index += size;
  43.198 +		page += size;
  43.199 +	}
  43.200 +	if (BAD_RANGE(zone,page))
  43.201 +		BUG();
  43.202 +	return page;
  43.203 +}
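
A worked example (editorial addition) of the split loop in expand(), carving an order-0 page (low = 0) out of an order-2 block (high = 2) at index 0:

	/*
	 * iteration 1: the order-1 half at index 0 goes back on the
	 *              order-1 free list; advance to index 2
	 * iteration 2: the order-0 page at index 2 goes back on the
	 *              order-0 free list; advance to index 3
	 * the page at index 3 is returned; MARK_USED() keeps the buddy
	 * bitmaps consistent at each level
	 */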
  43.204 +
  43.205 +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
  43.206 +static struct page * rmqueue(zone_t *zone, unsigned int order)
  43.207 +{
  43.208 +	free_area_t * area = zone->free_area + order;
  43.209 +	unsigned int curr_order = order;
  43.210 +	struct list_head *head, *curr;
  43.211 +	unsigned long flags;
  43.212 +	struct page *page;
  43.213 +
  43.214 +	spin_lock_irqsave(&zone->lock, flags);
  43.215 +	do {
  43.216 +		head = &area->free_list;
  43.217 +		curr = head->next;
  43.218 +
  43.219 +		if (curr != head) {
  43.220 +			unsigned int index;
  43.221 +
  43.222 +			page = list_entry(curr, struct page, list);
  43.223 +			if (BAD_RANGE(zone,page))
  43.224 +				BUG();
  43.225 +			list_del(curr);
  43.226 +			index = page - zone->zone_mem_map;
  43.227 +			if (curr_order != MAX_ORDER-1)
  43.228 +				MARK_USED(index, curr_order, area);
  43.229 +			zone->free_pages -= 1UL << order;
  43.230 +
  43.231 +			page = expand(zone, page, index, order, curr_order, area);
  43.232 +			spin_unlock_irqrestore(&zone->lock, flags);
  43.233 +
  43.234 +			set_page_count(page, 1);
  43.235 +			if (BAD_RANGE(zone,page))
  43.236 +				BUG();
  43.237 +			if (PageLRU(page))
  43.238 +				BUG();
  43.239 +			if (PageActive(page))
  43.240 +				BUG();
  43.241 +			return page;	
  43.242 +		}
  43.243 +		curr_order++;
  43.244 +		area++;
  43.245 +	} while (curr_order < MAX_ORDER);
  43.246 +	spin_unlock_irqrestore(&zone->lock, flags);
  43.247 +
  43.248 +	return NULL;
  43.249 +}
  43.250 +
  43.251 +#ifndef CONFIG_DISCONTIGMEM
  43.252 +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
  43.253 +{
  43.254 +	return __alloc_pages(gfp_mask, order,
  43.255 +		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
  43.256 +}
  43.257 +#endif
  43.258 +
  43.259 +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
  43.260 +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
  43.261 +{
  43.262 +	struct page * page = NULL;
  43.263 +	int __freed;
  43.264 +
  43.265 +	if (in_interrupt())
  43.266 +		BUG();
  43.267 +
  43.268 +	current->allocation_order = order;
  43.269 +	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
  43.270 +
  43.271 +	__freed = try_to_free_pages_zone(classzone, gfp_mask);
  43.272 +
  43.273 +	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
  43.274 +
  43.275 +	if (current->nr_local_pages) {
  43.276 +		struct list_head * entry, * local_pages;
  43.277 +		struct page * tmp;
  43.278 +		int nr_pages;
  43.279 +
  43.280 +		local_pages = &current->local_pages;
  43.281 +
  43.282 +		if (likely(__freed)) {
  43.283 +			/* pick from the last inserted so we're lifo */
  43.284 +			entry = local_pages->next;
  43.285 +			do {
  43.286 +				tmp = list_entry(entry, struct page, list);
  43.287 +				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
  43.288 +					list_del(entry);
  43.289 +					current->nr_local_pages--;
  43.290 +					set_page_count(tmp, 1);
  43.291 +					page = tmp;
  43.292 +
  43.293 +					if (page->buffers)
  43.294 +						BUG();
  43.295 +					if (page->mapping)
  43.296 +						BUG();
  43.297 +					if (!VALID_PAGE(page))
  43.298 +						BUG();
  43.299 +					if (PageLocked(page))
  43.300 +						BUG();
  43.301 +					if (PageLRU(page))
  43.302 +						BUG();
  43.303 +					if (PageActive(page))
  43.304 +						BUG();
  43.305 +					if (PageDirty(page))
  43.306 +						BUG();
  43.307 +
  43.308 +					break;
  43.309 +				}
  43.310 +			} while ((entry = entry->next) != local_pages);
  43.311 +		}
  43.312 +
  43.313 +		nr_pages = current->nr_local_pages;
  43.314 +		/* free in reverse order so that the global order will be lifo */
  43.315 +		while ((entry = local_pages->prev) != local_pages) {
  43.316 +			list_del(entry);
  43.317 +			tmp = list_entry(entry, struct page, list);
  43.318 +			__free_pages_ok(tmp, tmp->index);
  43.319 +			if (!nr_pages--)
  43.320 +				BUG();
  43.321 +		}
  43.322 +		current->nr_local_pages = 0;
  43.323 +	}
  43.324 +
  43.325 +	*freed = __freed;
  43.326 +	return page;
  43.327 +}
  43.328 +
  43.329 +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
  43.330 +{
  43.331 +	long free = zone->free_pages - (1UL << order);
  43.332 +	return free >= 0 ? free : 0;
  43.333 +}
  43.334 +
  43.335 +/*
  43.336 + * This is the 'heart' of the zoned buddy allocator:
  43.337 + */
  43.338 +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
  43.339 +{
  43.340 +	zone_t **zone, * classzone;
  43.341 +	struct page * page;
  43.342 +	int freed, class_idx;
  43.343 +
  43.344 +	zone = zonelist->zones;
  43.345 +	classzone = *zone;
  43.346 +	class_idx = zone_idx(classzone);
  43.347 +
  43.348 +	for (;;) {
  43.349 +		zone_t *z = *(zone++);
  43.350 +		if (!z)
  43.351 +			break;
  43.352 +
  43.353 +		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
  43.354 +			page = rmqueue(z, order);
  43.355 +			if (page)
  43.356 +				return page;
  43.357 +		}
  43.358 +	}
  43.359 +
  43.360 +	classzone->need_balance = 1;
  43.361 +	mb();
  43.362 +	if (waitqueue_active(&kswapd_wait))
  43.363 +		wake_up_interruptible(&kswapd_wait);
  43.364 +
  43.365 +	zone = zonelist->zones;
  43.366 +	for (;;) {
  43.367 +		unsigned long min;
  43.368 +		zone_t *z = *(zone++);
  43.369 +		if (!z)
  43.370 +			break;
  43.371 +
  43.372 +		min = z->watermarks[class_idx].min;
  43.373 +		if (!(gfp_mask & __GFP_WAIT))
  43.374 +			min >>= 2;
  43.375 +		if (zone_free_pages(z, order) > min) {
  43.376 +			page = rmqueue(z, order);
  43.377 +			if (page)
  43.378 +				return page;
  43.379 +		}
  43.380 +	}
  43.381 +
   43.382 +	/* here we're in the low-on-memory slow path */
  43.383 +
  43.384 +	if ((current->flags & PF_MEMALLOC) && 
  43.385 +			(!in_interrupt() || (current->flags & PF_MEMDIE))) {
  43.386 +		zone = zonelist->zones;
  43.387 +		for (;;) {
  43.388 +			zone_t *z = *(zone++);
  43.389 +			if (!z)
  43.390 +				break;
  43.391 +
  43.392 +			page = rmqueue(z, order);
  43.393 +			if (page)
  43.394 +				return page;
  43.395 +		}
  43.396 +		return NULL;
  43.397 +	}
  43.398 +
  43.399 +	/* Atomic allocations - we can't balance anything */
  43.400 +	if (!(gfp_mask & __GFP_WAIT))
  43.401 +		goto out;
  43.402 +
  43.403 + rebalance:
  43.404 +	page = balance_classzone(classzone, gfp_mask, order, &freed);
  43.405 +	if (page)
  43.406 +		return page;
  43.407 +
  43.408 +	zone = zonelist->zones;
  43.409 +	if (likely(freed)) {
  43.410 +		for (;;) {
  43.411 +			zone_t *z = *(zone++);
  43.412 +			if (!z)
  43.413 +				break;
  43.414 +
  43.415 +			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
  43.416 +				page = rmqueue(z, order);
  43.417 +				if (page)
  43.418 +					return page;
  43.419 +			}
  43.420 +		}
  43.421 +		goto rebalance;
  43.422 +	} else {
   43.423 +		/*
   43.424 +		 * Check whether another task has been killed in the
   43.425 +		 * meantime; in that case we can succeed the allocation.
   43.426 +		 */
  43.427 +		for (;;) {
  43.428 +			zone_t *z = *(zone++);
  43.429 +			if (!z)
  43.430 +				break;
  43.431 +
  43.432 +			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
  43.433 +				page = rmqueue(z, order);
  43.434 +				if (page)
  43.435 +					return page;
  43.436 +			}
  43.437 +		}
  43.438 +	}
  43.439 +
  43.440 + out:
  43.441 +	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
  43.442 +	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
  43.443 +	if (unlikely(vm_gfp_debug))
  43.444 +		dump_stack();
  43.445 +	return NULL;
  43.446 +}
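
An editorial summary (not in the source) of the fall-through logic above:

	/*
	 * 1. fast path: take from the first zone above its 'low' mark;
	 * 2. wake kswapd, retry against 'min' (quartered for atomic,
	 *    i.e. !__GFP_WAIT, callers);
	 * 3. PF_MEMALLOC callers (outside interrupts, or marked
	 *    PF_MEMDIE) may drain the remaining reserves outright;
	 * 4. otherwise balance_classzone() reclaims: retry against
	 *    'min' on progress, against 'high' if another task may
	 *    have been killed, and finally fail with the printk above.
	 */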
  43.447 +
  43.448 +/*
  43.449 + * Common helper functions.
  43.450 + */
  43.451 +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
  43.452 +{
  43.453 +	struct page * page;
  43.454 +
  43.455 +	page = alloc_pages(gfp_mask, order);
  43.456 +	if (!page)
  43.457 +		return 0;
  43.458 +	return (unsigned long) page_address(page);
  43.459 +}
  43.460 +
  43.461 +unsigned long get_zeroed_page(unsigned int gfp_mask)
  43.462 +{
  43.463 +	struct page * page;
  43.464 +
  43.465 +	page = alloc_pages(gfp_mask, 0);
  43.466 +	if (page) {
  43.467 +		void *address = page_address(page);
  43.468 +		clear_page(address);
  43.469 +		return (unsigned long) address;
  43.470 +	}
  43.471 +	return 0;
  43.472 +}
  43.473 +
  43.474 +void __free_pages(struct page *page, unsigned int order)
  43.475 +{
  43.476 +	if (!PageReserved(page) && put_page_testzero(page))
  43.477 +		__free_pages_ok(page, order);
  43.478 +}
  43.479 +
  43.480 +void free_pages(unsigned long addr, unsigned int order)
  43.481 +{
  43.482 +	if (addr != 0)
  43.483 +		__free_pages(virt_to_page(addr), order);
  43.484 +}
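
A small usage sketch (editorial addition) pairing the helpers above:

	unsigned long addr;

	addr = __get_free_pages(GFP_KERNEL, 1);	/* two contiguous pages */
	if (addr) {
		/* ... use the 2 * PAGE_SIZE region at addr ... */
		free_pages(addr, 1);		/* order must match */
	}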
  43.485 +
  43.486 +/*
  43.487 + * Total amount of free (allocatable) RAM:
  43.488 + */
  43.489 +unsigned int nr_free_pages (void)
  43.490 +{
  43.491 +	unsigned int sum = 0;
  43.492 +	zone_t *zone;
  43.493 +
  43.494 +	for_each_zone(zone)
  43.495 +		sum += zone->free_pages;
  43.496 +
  43.497 +	return sum;
  43.498 +}
  43.499 +
  43.500 +/*
  43.501 + * Amount of free RAM allocatable as buffer memory:
  43.502 + */
  43.503 +unsigned int nr_free_buffer_pages (void)
  43.504 +{
  43.505 +	pg_data_t *pgdat;
  43.506 +	unsigned int sum = 0;
  43.507 +	zonelist_t *zonelist;
  43.508 +	zone_t **zonep, *zone;
  43.509 +
  43.510 +	for_each_pgdat(pgdat) {
  43.511 +		int class_idx;
  43.512 +		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
  43.513 +		zonep = zonelist->zones;
  43.514 +		zone = *zonep;
  43.515 +		class_idx = zone_idx(zone);
  43.516 +
  43.517 +		sum += zone->nr_cache_pages;
  43.518 +		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
  43.519 +			int free = zone->free_pages - zone->watermarks[class_idx].high;
  43.520 +			if (free <= 0)
  43.521 +				continue;
  43.522 +			sum += free;
  43.523 +		}
  43.524 +	}
  43.525 +
  43.526 +	return sum;
  43.527 +}
  43.528 +
  43.529 +#if CONFIG_HIGHMEM
  43.530 +unsigned int nr_free_highpages (void)
  43.531 +{
  43.532 +	pg_data_t *pgdat;
  43.533 +	unsigned int pages = 0;
  43.534 +
  43.535 +	for_each_pgdat(pgdat)
  43.536 +		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
  43.537 +
  43.538 +	return pages;
  43.539 +}
  43.540 +
  43.541 +unsigned int freeable_lowmem(void)
  43.542 +{
  43.543 +	unsigned int pages = 0;
  43.544 +	pg_data_t *pgdat;
  43.545 +
  43.546 +	for_each_pgdat(pgdat) {
  43.547 +		pages += pgdat->node_zones[ZONE_DMA].free_pages;
  43.548 +		pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
  43.549 +		pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
  43.550 +		pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
  43.551 +		pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
  43.552 +		pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
  43.553 +	}
  43.554 +
  43.555 +	return pages;
  43.556 +}
  43.557 +#endif
  43.558 +
  43.559 +#define K(x) ((x) << (PAGE_SHIFT-10))
  43.560 +
  43.561 +/*
  43.562 + * Show free area list (used inside shift_scroll-lock stuff)
  43.563 + * We also calculate the percentage fragmentation. We do this by counting the
  43.564 + * memory on each free list with the exception of the first item on the list.
  43.565 + */
  43.566 +void show_free_areas_core(pg_data_t *pgdat)
  43.567 +{
  43.568 + 	unsigned int order;
  43.569 +	unsigned type;
  43.570 +	pg_data_t *tmpdat = pgdat;
  43.571 +
  43.572 +	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
  43.573 +		K(nr_free_pages()),
  43.574 +		K(nr_free_highpages()));
  43.575 +
  43.576 +	while (tmpdat) {
  43.577 +		zone_t *zone;
  43.578 +		for (zone = tmpdat->node_zones;
  43.579 +			       	zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
  43.580 +			printk("Zone:%s freepages:%6lukB\n", 
  43.581 +					zone->name,
  43.582 +					K(zone->free_pages));
  43.583 +			
  43.584 +		tmpdat = tmpdat->node_next;
  43.585 +	}
  43.586 +
  43.587 +	printk("( Active: %d, inactive: %d, free: %d )\n",
  43.588 +	       nr_active_pages,
  43.589 +	       nr_inactive_pages,
  43.590 +	       nr_free_pages());
  43.591 +
  43.592 +	for (type = 0; type < MAX_NR_ZONES; type++) {
  43.593 +		struct list_head *head, *curr;
  43.594 +		zone_t *zone = pgdat->node_zones + type;
  43.595 + 		unsigned long nr, total, flags;
  43.596 +
  43.597 +		total = 0;
  43.598 +		if (zone->size) {
  43.599 +			spin_lock_irqsave(&zone->lock, flags);
  43.600 +		 	for (order = 0; order < MAX_ORDER; order++) {
  43.601 +				head = &(zone->free_area + order)->free_list;
  43.602 +				curr = head;
  43.603 +				nr = 0;
  43.604 +				for (;;) {
  43.605 +					if ((curr = curr->next) == head)
  43.606 +						break;
  43.607 +					nr++;
  43.608 +				}
  43.609 +				total += nr * (1 << order);
  43.610 +				printk("%lu*%lukB ", nr, K(1UL) << order);
  43.611 +			}
  43.612 +			spin_unlock_irqrestore(&zone->lock, flags);
  43.613 +		}
  43.614 +		printk("= %lukB)\n", K(total));
  43.615 +	}
  43.616 +
  43.617 +#ifdef SWAP_CACHE_INFO
  43.618 +	show_swap_cache_info();
  43.619 +#endif	
  43.620 +}
  43.621 +
  43.622 +void show_free_areas(void)
  43.623 +{
  43.624 +	show_free_areas_core(pgdat_list);
  43.625 +}
  43.626 +
  43.627 +/*
  43.628 + * Builds allocation fallback zone lists.
  43.629 + */
  43.630 +static inline void build_zonelists(pg_data_t *pgdat)
  43.631 +{
  43.632 +	int i, j, k;
  43.633 +
  43.634 +	for (i = 0; i <= GFP_ZONEMASK; i++) {
  43.635 +		zonelist_t *zonelist;
  43.636 +		zone_t *zone;
  43.637 +
  43.638 +		zonelist = pgdat->node_zonelists + i;
  43.639 +		memset(zonelist, 0, sizeof(*zonelist));
  43.640 +
  43.641 +		j = 0;
  43.642 +		k = ZONE_NORMAL;
  43.643 +		if (i & __GFP_HIGHMEM)
  43.644 +			k = ZONE_HIGHMEM;
  43.645 +		if (i & __GFP_DMA)
  43.646 +			k = ZONE_DMA;
  43.647 +
  43.648 +		switch (k) {
  43.649 +			default:
  43.650 +				BUG();
  43.651 +			/*
  43.652 +			 * fallthrough:
  43.653 +			 */
  43.654 +			case ZONE_HIGHMEM:
  43.655 +				zone = pgdat->node_zones + ZONE_HIGHMEM;
  43.656 +				if (zone->size) {
  43.657 +#ifndef CONFIG_HIGHMEM
  43.658 +					BUG();
  43.659 +#endif
  43.660 +					zonelist->zones[j++] = zone;
  43.661 +				}
  43.662 +			case ZONE_NORMAL:
  43.663 +				zone = pgdat->node_zones + ZONE_NORMAL;
  43.664 +				if (zone->size)
  43.665 +					zonelist->zones[j++] = zone;
  43.666 +			case ZONE_DMA:
  43.667 +				zone = pgdat->node_zones + ZONE_DMA;
  43.668 +				if (zone->size)
  43.669 +					zonelist->zones[j++] = zone;
  43.670 +		}
  43.671 +		zonelist->zones[j++] = NULL;
  43.672 +	} 
  43.673 +}
  43.674 +
  43.675 +/*
  43.676 + * Helper functions to size the waitqueue hash table.
   43.677 + * Essentially these want to choose hash table sizes sufficiently
   43.678 + * large that collisions trying to wait on pages are rare.
   43.679 + * But in fact, the number of active page waitqueues on typical
   43.680 + * systems is ridiculously low, less than 200, so even this is
   43.681 + * conservative, although it seems large.
  43.682 + *
  43.683 + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
  43.684 + * waitqueues, i.e. the size of the waitq table given the number of pages.
  43.685 + */
  43.686 +#define PAGES_PER_WAITQUEUE	256
  43.687 +
  43.688 +static inline unsigned long wait_table_size(unsigned long pages)
  43.689 +{
  43.690 +	unsigned long size = 1;
  43.691 +
  43.692 +	pages /= PAGES_PER_WAITQUEUE;
  43.693 +
  43.694 +	while (size < pages)
  43.695 +		size <<= 1;
  43.696 +
  43.697 +	/*
  43.698 +	 * Once we have dozens or even hundreds of threads sleeping
  43.699 +	 * on IO we've got bigger problems than wait queue collision.
  43.700 +	 * Limit the size of the wait table to a reasonable size.
  43.701 +	 */
  43.702 +	size = min(size, 4096UL);
  43.703 +
  43.704 +	return size;
  43.705 +}
  43.706 +
  43.707 +/*
  43.708 + * This is an integer logarithm so that shifts can be used later
  43.709 + * to extract the more random high bits from the multiplicative
  43.710 + * hash function before the remainder is taken.
  43.711 + */
  43.712 +static inline unsigned long wait_table_bits(unsigned long size)
  43.713 +{
  43.714 +	return ffz(~size);
  43.715 +}
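
Worked numbers (editorial addition) for the two helpers above:

	/*
	 * A 512MB zone of 4KB pages holds 131072 pages:
	 * 131072 / PAGES_PER_WAITQUEUE = 512, already a power of two,
	 * so wait_table_size() returns 512 (below the 4096 cap), and
	 * wait_table_bits(512) == ffz(~512) == 9 == log2(512).
	 */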
  43.716 +
  43.717 +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
  43.718 +
  43.719 +/*
  43.720 + * Set up the zone data structures:
  43.721 + *   - mark all pages reserved
  43.722 + *   - mark all memory queues empty
  43.723 + *   - clear the memory bitmaps
  43.724 + */
  43.725 +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
  43.726 +	unsigned long *zones_size, unsigned long zone_start_paddr, 
  43.727 +	unsigned long *zholes_size, struct page *lmem_map)
  43.728 +{
  43.729 +	unsigned long i, j;
  43.730 +	unsigned long map_size;
  43.731 +	unsigned long totalpages, offset, realtotalpages;
  43.732 +	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
  43.733 +
  43.734 +	if (zone_start_paddr & ~PAGE_MASK)
  43.735 +		BUG();
  43.736 +
  43.737 +	totalpages = 0;
  43.738 +	for (i = 0; i < MAX_NR_ZONES; i++) {
  43.739 +		unsigned long size = zones_size[i];
  43.740 +		totalpages += size;
  43.741 +	}
  43.742 +	realtotalpages = totalpages;
  43.743 +	if (zholes_size)
  43.744 +		for (i = 0; i < MAX_NR_ZONES; i++)
  43.745 +			realtotalpages -= zholes_size[i];
  43.746 +			
  43.747 +	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
  43.748 +
  43.749 +	/*
    43.750 +	 * Some architectures (with lots of mem and discontiguous memory
  43.751 +	 * maps) have to search for a good mem_map area:
  43.752 +	 * For discontigmem, the conceptual mem map array starts from 
  43.753 +	 * PAGE_OFFSET, we need to align the actual array onto a mem map 
  43.754 +	 * boundary, so that MAP_NR works.
  43.755 +	 */
  43.756 +	map_size = (totalpages + 1)*sizeof(struct page);
  43.757 +	if (lmem_map == (struct page *)0) {
  43.758 +		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
  43.759 +		lmem_map = (struct page *)(PAGE_OFFSET + 
  43.760 +			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
  43.761 +	}
  43.762 +	*gmap = pgdat->node_mem_map = lmem_map;
  43.763 +	pgdat->node_size = totalpages;
  43.764 +	pgdat->node_start_paddr = zone_start_paddr;
  43.765 +	pgdat->node_start_mapnr = (lmem_map - mem_map);
  43.766 +	pgdat->nr_zones = 0;
  43.767 +
  43.768 +	offset = lmem_map - mem_map;	
  43.769 +	for (j = 0; j < MAX_NR_ZONES; j++) {
  43.770 +		zone_t *zone = pgdat->node_zones + j;
  43.771 +		unsigned long mask;
  43.772 +		unsigned long size, realsize;
  43.773 +		int idx;
  43.774 +
  43.775 +		zone_table[nid * MAX_NR_ZONES + j] = zone;
  43.776 +		realsize = size = zones_size[j];
  43.777 +		if (zholes_size)
  43.778 +			realsize -= zholes_size[j];
  43.779 +
  43.780 +		printk("zone(%lu): %lu pages.\n", j, size);
  43.781 +		zone->size = size;
  43.782 +		zone->realsize = realsize;
  43.783 +		zone->name = zone_names[j];
  43.784 +		zone->lock = SPIN_LOCK_UNLOCKED;
  43.785 +		zone->zone_pgdat = pgdat;
  43.786 +		zone->free_pages = 0;
  43.787 +		zone->need_balance = 0;
    43.788 +		zone->nr_active_pages = zone->nr_inactive_pages = 0;
    43.789 +
  43.791 +		if (!size)
  43.792 +			continue;
  43.793 +
  43.794 +		/*
  43.795 +		 * The per-page waitqueue mechanism uses hashed waitqueues
  43.796 +		 * per zone.
  43.797 +		 */
  43.798 +		zone->wait_table_size = wait_table_size(size);
  43.799 +		zone->wait_table_shift =
  43.800 +			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
  43.801 +		zone->wait_table = (wait_queue_head_t *)
  43.802 +			alloc_bootmem_node(pgdat, zone->wait_table_size
  43.803 +						* sizeof(wait_queue_head_t));
  43.804 +
  43.805 +		for(i = 0; i < zone->wait_table_size; ++i)
  43.806 +			init_waitqueue_head(zone->wait_table + i);
  43.807 +
  43.808 +		pgdat->nr_zones = j+1;
  43.809 +
  43.810 +		mask = (realsize / zone_balance_ratio[j]);
  43.811 +		if (mask < zone_balance_min[j])
  43.812 +			mask = zone_balance_min[j];
  43.813 +		else if (mask > zone_balance_max[j])
  43.814 +			mask = zone_balance_max[j];
  43.815 +		zone->watermarks[j].min = mask;
  43.816 +		zone->watermarks[j].low = mask*2;
  43.817 +		zone->watermarks[j].high = mask*3;
  43.818 +		/* now set the watermarks of the lower zones in the "j" classzone */
  43.819 +		for (idx = j-1; idx >= 0; idx--) {
  43.820 +			zone_t * lower_zone = pgdat->node_zones + idx;
  43.821 +			unsigned long lower_zone_reserve;
  43.822 +			if (!lower_zone->size)
  43.823 +				continue;
  43.824 +
  43.825 +			mask = lower_zone->watermarks[idx].min;
  43.826 +			lower_zone->watermarks[j].min = mask;
  43.827 +			lower_zone->watermarks[j].low = mask*2;
  43.828 +			lower_zone->watermarks[j].high = mask*3;
  43.829 +
    43.830 +			/* now the trickier part: the lower zone reserve */
  43.831 +			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
  43.832 +			lower_zone->watermarks[j].min += lower_zone_reserve;
  43.833 +			lower_zone->watermarks[j].low += lower_zone_reserve;
  43.834 +			lower_zone->watermarks[j].high += lower_zone_reserve;
  43.835 +
  43.836 +			realsize += lower_zone->realsize;
  43.837 +		}
  43.838 +
  43.839 +		zone->zone_mem_map = mem_map + offset;
  43.840 +		zone->zone_start_mapnr = offset;
  43.841 +		zone->zone_start_paddr = zone_start_paddr;
  43.842 +
  43.843 +		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
  43.844 +			printk("BUG: wrong zone alignment, it will crash\n");
  43.845 +
  43.846 +		/*
  43.847 +		 * Initially all pages are reserved - free ones are freed
  43.848 +		 * up by free_all_bootmem() once the early boot process is
  43.849 +		 * done. Non-atomic initialization, single-pass.
  43.850 +		 */
  43.851 +		for (i = 0; i < size; i++) {
  43.852 +			struct page *page = mem_map + offset + i;
  43.853 +			set_page_zone(page, nid * MAX_NR_ZONES + j);
  43.854 +			set_page_count(page, 0);
  43.855 +			SetPageReserved(page);
  43.856 +			INIT_LIST_HEAD(&page->list);
  43.857 +			if (j != ZONE_HIGHMEM)
  43.858 +				set_page_address(page, __va(zone_start_paddr));
  43.859 +			zone_start_paddr += PAGE_SIZE;
  43.860 +		}
  43.861 +
  43.862 +		offset += size;
  43.863 +		for (i = 0; ; i++) {
  43.864 +			unsigned long bitmap_size;
  43.865 +
  43.866 +			INIT_LIST_HEAD(&zone->free_area[i].free_list);
  43.867 +			if (i == MAX_ORDER-1) {
  43.868 +				zone->free_area[i].map = NULL;
  43.869 +				break;
  43.870 +			}
  43.871 +
  43.872 +			/*
  43.873 +			 * Page buddy system uses "index >> (i+1)",
  43.874 +			 * where "index" is at most "size-1".
  43.875 +			 *
  43.876 +			 * The extra "+3" is to round down to byte
  43.877 +			 * size (8 bits per byte assumption). Thus
  43.878 +			 * we get "(size-1) >> (i+4)" as the last byte
  43.879 +			 * we can access.
  43.880 +			 *
  43.881 +			 * The "+1" is because we want to round the
  43.882 +			 * byte allocation up rather than down. So
  43.883 +			 * we should have had a "+7" before we shifted
  43.884 +			 * down by three. Also, we have to add one as
  43.885 +			 * we actually _use_ the last bit (it's [0,n]
  43.886 +			 * inclusive, not [0,n[).
  43.887 +			 *
  43.888 +			 * So we actually had +7+1 before we shift
  43.889 +			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
  43.890 +			 * (modulo overflows, which we do not have).
  43.891 +			 *
  43.892 +			 * Finally, we LONG_ALIGN because all bitmap
  43.893 +			 * operations are on longs.
  43.894 +			 */
  43.895 +			bitmap_size = (size-1) >> (i+4);
  43.896 +			bitmap_size = LONG_ALIGN(bitmap_size+1);
  43.897 +			zone->free_area[i].map = 
  43.898 +			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
  43.899 +		}
  43.900 +	}
  43.901 +	build_zonelists(pgdat);
  43.902 +}
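
A worked instance (editorial addition) of the bitmap sizing inside the loop above:

	/*
	 * A zone of size = 4096 pages at order i = 0 needs one bit per
	 * buddy pair, i.e. 4096 >> 1 = 2048 bits.  The code computes
	 *	bitmap_size = (4096 - 1) >> (0 + 4) = 255
	 *	LONG_ALIGN(255 + 1) = 256 bytes = 2048 bits,
	 * exactly enough, the last usable index being 4095 >> 1 = 2047.
	 */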
  43.903 +
  43.904 +void __init free_area_init(unsigned long *zones_size)
  43.905 +{
  43.906 +	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
  43.907 +}
  43.908 +
  43.909 +static int __init setup_mem_frac(char *str)
  43.910 +{
  43.911 +	int j = 0;
  43.912 +
  43.913 +	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
  43.914 +	printk("setup_mem_frac: ");
  43.915 +	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
  43.916 +	printk("\n");
  43.917 +	return 1;
  43.918 +}
  43.919 +
  43.920 +__setup("memfrac=", setup_mem_frac);
  43.921 +
  43.922 +static int __init setup_lower_zone_reserve(char *str)
  43.923 +{
  43.924 +	int j = 0;
  43.925 +
  43.926 +	while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
  43.927 +	printk("setup_lower_zone_reserve: ");
  43.928 +	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
  43.929 +	printk("\n");
  43.930 +	return 1;
  43.931 +}
  43.932 +
  43.933 +__setup("lower_zone_reserve=", setup_lower_zone_reserve);
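
For reference (an editorial note): both __setup() hooks above consume comma-separated integers from the kernel command line, one value per zone (or per lower zone). An illustrative invocation, with values chosen only for the example:

	memfrac=32,64,128 lower_zone_reserve=256,32

This would override zone_balance_ratio[] and lower_zone_reserve_ratio[] at boot; each handler echoes the parsed values via the printk calls shown.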