ia64/xen-unstable
changeset 1360:0fab6364d23b
bitkeeper revision 1.897 (40a0eb02jGwqt6POLmCY0eC1hpHfvw)
trivial merge
line diff
1.1 --- a/.rootkeys Tue May 11 14:57:44 2004 +0000 1.2 +++ b/.rootkeys Tue May 11 15:02:26 2004 +0000 1.3 @@ -107,6 +107,7 @@ 4055ad97wMLUj0BZT0e_T0EwQN0Bvw tools/xen 1.4 4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h 1.5 4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py 1.6 4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py 1.7 +409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py 1.8 40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c 1.9 4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py 1.10 4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend 1.11 @@ -735,6 +736,7 @@ 3e5a4e678ddsQOpbSiRdy1GRcDc9WA xenolinux 1.12 3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h 1.13 3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h 1.14 3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h 1.15 +409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h 1.16 3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h 1.17 3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h 1.18 3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h 1.19 @@ -762,6 +764,7 @@ 406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux 1.20 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c 1.21 3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c 1.22 3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c 1.23 +409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c 1.24 3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c 1.25 3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c 1.26 407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
2.1 --- a/tools/examples/xc_dom_create.py Tue May 11 14:57:44 2004 +0000 2.2 +++ b/tools/examples/xc_dom_create.py Tue May 11 15:02:26 2004 +0000 2.3 @@ -333,7 +333,18 @@ def make_domain(): 2.4 xc.domain_destroy ( dom=id ) 2.5 sys.exit() 2.6 2.7 - if not new_io_world: 2.8 + if new_io_world: 2.9 + cmsg = 'new_network_interface(dom='+str(id)+')' 2.10 + xend_response = xenctl.utils.xend_control_message(cmsg) 2.11 + if not xend_response['success']: 2.12 + print "Error creating network interface" 2.13 + print "Error type: " + xend_response['error_type'] 2.14 + if xend_response['error_type'] == 'exception': 2.15 + print "Exception type: " + xend_response['exception_type'] 2.16 + print "Exception val: " + xend_response['exception_value'] 2.17 + xc.domain_destroy ( dom=id ) 2.18 + sys.exit() 2.19 + else: 2.20 # setup virtual firewall rules for all aliases 2.21 for ip in vfr_ipaddr: 2.22 xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
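The new_io_world branch above relies on the response-dictionary convention of xend management calls: xenctl.utils.xend_control_message() (the real helper used in the hunk) returns a dict whose 'success', 'error_type', 'exception_type' and 'exception_value' keys the caller inspects before deciding whether to tear the half-built domain down. A minimal standalone sketch of that pattern follows; the wrapper name send_xend_command and the CleanupNeeded exception are hypothetical, introduced only for illustration.

    class CleanupNeeded(Exception):
        """Hypothetical: xend reported failure, caller must destroy the domain."""
        pass

    def send_xend_command(cmsg, xend_control_message):
        """Send one management command string and interpret the response dict."""
        response = xend_control_message(cmsg)
        if response['success']:
            return response
        # Failure path (per the error responses built in manager.py below):
        # 'error_type' is set, and exception details are filled in only when
        # error_type == 'exception'.
        print("Error executing: " + cmsg)
        print("Error type: " + response['error_type'])
        if response['error_type'] == 'exception':
            print("Exception type: " + response['exception_type'])
            print("Exception val: " + response['exception_value'])
        raise CleanupNeeded()

    # Usage, mirroring the hunk above (id is the newly created domain):
    #   try:
    #       send_xend_command('new_network_interface(dom=%d)' % id,
    #                         xenctl.utils.xend_control_message)
    #   except CleanupNeeded:
    #       xc.domain_destroy(dom=id)
    #       sys.exit()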
3.1 --- a/tools/xc/py/Xc.c Tue May 11 14:57:44 2004 +0000 3.2 +++ b/tools/xc/py/Xc.c Tue May 11 15:02:26 2004 +0000 3.3 @@ -13,6 +13,7 @@ 3.4 #include <sys/types.h> 3.5 #include <sys/socket.h> 3.6 #include <netdb.h> 3.7 +#include <arpa/inet.h> 3.8 3.9 /* Needed for Python versions earlier than 2.3. */ 3.10 #ifndef PyMODINIT_FUNC 3.11 @@ -202,13 +203,13 @@ static PyObject *pyxc_linux_save(PyObjec 3.12 if (progress) flags |= XCFLAGS_VERBOSE; 3.13 if (live) flags |= XCFLAGS_LIVE; 3.14 3.15 - if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0) 3.16 + if ( strncmp(state_file,"tcp:", strlen("tcp:")) == 0 ) 3.17 { 3.18 #define max_namelen 64 3.19 char server[max_namelen]; 3.20 char *port_s; 3.21 int port=777; 3.22 - int sd = 0; 3.23 + int sd = -1; 3.24 struct hostent *h; 3.25 struct sockaddr_in s; 3.26 int sockbufsize; 3.27 @@ -216,19 +217,18 @@ static PyObject *pyxc_linux_save(PyObjec 3.28 int writerfn(void *fd, const void *buf, size_t count) 3.29 { 3.30 int tot = 0, rc; 3.31 - do 3.32 - { 3.33 + do { 3.34 rc = write( (int) fd, ((char*)buf)+tot, count-tot ); 3.35 - if (rc<0) { perror("WRITE"); return rc; }; 3.36 + if ( rc < 0 ) { perror("WRITE"); return rc; }; 3.37 tot += rc; 3.38 } 3.39 - while(tot<count); 3.40 + while ( tot < count ); 3.41 return 0; 3.42 } 3.43 3.44 strncpy( server, state_file+strlen("tcp://"), max_namelen); 3.45 server[max_namelen-1]='\0'; 3.46 - if( port_s = strchr(server,':') ) 3.47 + if ( (port_s = strchr(server,':')) != NULL ) 3.48 { 3.49 *port_s = '\0'; 3.50 port = atoi(port_s+1); 3.51 @@ -238,36 +238,36 @@ static PyObject *pyxc_linux_save(PyObjec 3.52 3.53 h = gethostbyname(server); 3.54 sd = socket (AF_INET,SOCK_STREAM,0); 3.55 - if(sd<0) goto serr; 3.56 + if ( sd < 0 ) 3.57 + goto serr; 3.58 s.sin_family = AF_INET; 3.59 bcopy ( h->h_addr, &(s.sin_addr.s_addr), h->h_length); 3.60 s.sin_port = htons(port); 3.61 - if( connect(sd, (struct sockaddr *) &s, sizeof(s)) ) 3.62 + if ( connect(sd, (struct sockaddr *) &s, sizeof(s)) ) 3.63 goto serr; 3.64 3.65 sockbufsize=128*1024; 3.66 - if (setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, sizeof sockbufsize) < 0) 3.67 - { 3.68 + if ( setsockopt(sd, SOL_SOCKET, SO_SNDBUF, 3.69 + &sockbufsize, sizeof sockbufsize) < 0 ) 3.70 goto serr; 3.71 - } 3.72 3.73 - if ( xc_linux_save(xc->xc_handle, dom, flags, writerfn, (void*)sd) == 0 ) 3.74 + if ( xc_linux_save(xc->xc_handle, dom, flags, 3.75 + writerfn, (void*)sd) == 0 ) 3.76 { 3.77 close(sd); 3.78 Py_INCREF(zero); 3.79 return zero; 3.80 } 3.81 3.82 - serr: 3.83 - 3.84 + serr: 3.85 PyErr_SetFromErrno(xc_error); 3.86 - if(sd)close(sd); 3.87 + if ( sd >= 0 ) close(sd); 3.88 return NULL; 3.89 } 3.90 else 3.91 { 3.92 - int fd; 3.93 - gzFile gfd; 3.94 + int fd = -1; 3.95 + gzFile gfd = NULL; 3.96 3.97 int writerfn(void *fd, const void *buf, size_t count) 3.98 { 3.99 @@ -311,10 +311,11 @@ static PyObject *pyxc_linux_save(PyObjec 3.100 3.101 err: 3.102 PyErr_SetFromErrno(xc_error); 3.103 - if(gfd)gzclose(gfd); 3.104 - if(fd)close(fd); 3.105 + if ( gfd != NULL ) 3.106 + gzclose(gfd); 3.107 + if ( fd >= 0 ) 3.108 + close(fd); 3.109 unlink(state_file); 3.110 - 3.111 return NULL; 3.112 } 3.113 3.114 @@ -337,15 +338,16 @@ static PyObject *pyxc_linux_restore(PyOb 3.115 &dom, &state_file, &progress) ) 3.116 return NULL; 3.117 3.118 - if (progress) flags |= XCFLAGS_VERBOSE; 3.119 + if ( progress ) 3.120 + flags |= XCFLAGS_VERBOSE; 3.121 3.122 - if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0) 3.123 + if ( strncmp(state_file,"tcp:", strlen("tcp:")) == 0 ) 3.124 { 3.125 #define max_namelen 64 
3.126 char server[max_namelen]; 3.127 char *port_s; 3.128 int port=777; 3.129 - int ld = 0, sd = 0; 3.130 + int ld = -1, sd = -1; 3.131 struct hostent *h; 3.132 struct sockaddr_in s, d, p; 3.133 socklen_t dlen, plen; 3.134 @@ -357,20 +359,16 @@ static PyObject *pyxc_linux_restore(PyOb 3.135 int rc, tot = 0; 3.136 do { 3.137 rc = read( (int) fd, ((char*)buf)+tot, count-tot ); 3.138 - if (rc<0) 3.139 - { 3.140 - perror("READ"); 3.141 - return rc; 3.142 - } 3.143 + if ( rc < 0 ) { perror("READ"); return rc; } 3.144 tot += rc; 3.145 - } while( tot<count ); 3.146 - 3.147 + } 3.148 + while ( tot < count ); 3.149 return 0; 3.150 } 3.151 3.152 strncpy( server, state_file+strlen("tcp://"), max_namelen); 3.153 server[max_namelen-1]='\0'; 3.154 - if( port_s = strchr(server,':') ) 3.155 + if ( (port_s = strchr(server,':')) != NULL ) 3.156 { 3.157 *port_s = '\0'; 3.158 port = atoi(port_s+1); 3.159 @@ -380,58 +378,55 @@ static PyObject *pyxc_linux_restore(PyOb 3.160 3.161 h = gethostbyname(server); 3.162 ld = socket (AF_INET,SOCK_STREAM,0); 3.163 - if(ld<0) goto serr; 3.164 + if ( ld < 0 ) goto serr; 3.165 s.sin_family = AF_INET; 3.166 //bcopy ( h->h_addr, &(s.sin_addr.s_addr), h->h_length); 3.167 s.sin_addr.s_addr = htonl(INADDR_ANY); 3.168 s.sin_port = htons(port); 3.169 3.170 - if (setsockopt(ld, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) < 0) 3.171 + if ( setsockopt(ld, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) < 0 ) 3.172 goto serr; 3.173 3.174 - if( bind(ld, (struct sockaddr *) &s, sizeof(s)) ) 3.175 + if ( bind(ld, (struct sockaddr *) &s, sizeof(s)) ) 3.176 goto serr; 3.177 3.178 - if( listen(ld, 1) ) 3.179 + if ( listen(ld, 1) ) 3.180 goto serr; 3.181 3.182 dlen=sizeof(struct sockaddr); 3.183 - if( (sd = accept(ld, (struct sockaddr *) &d, &dlen )) < 0 ) 3.184 + if ( (sd = accept(ld, (struct sockaddr *) &d, &dlen )) < 0 ) 3.185 goto serr; 3.186 3.187 plen = sizeof(p); 3.188 - if (getpeername(sd, (struct sockaddr_in *) &p, 3.189 - &plen) < 0) { 3.190 + if ( getpeername(sd, (struct sockaddr_in *) &p, 3.191 + &plen) < 0 ) 3.192 goto serr; 3.193 - } 3.194 3.195 - printf("Accepted connection from %s\n", 3.196 - inet_ntoa(p.sin_addr)); 3.197 + printf("Accepted connection from %s\n", inet_ntoa(p.sin_addr)); 3.198 3.199 sockbufsize=128*1024; 3.200 - if (setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, sizeof sockbufsize) < 0) 3.201 - { 3.202 + if ( setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &sockbufsize, 3.203 + sizeof sockbufsize) < 0 ) 3.204 goto serr; 3.205 - } 3.206 3.207 - if ( xc_linux_restore(xc->xc_handle, dom, flags, readerfn, (void*)sd, &dom) == 0 ) 3.208 + if ( xc_linux_restore(xc->xc_handle, dom, flags, 3.209 + readerfn, (void*)sd, &dom) == 0 ) 3.210 { 3.211 close(sd); 3.212 Py_INCREF(zero); 3.213 return zero; 3.214 } 3.215 3.216 - serr: 3.217 - 3.218 + serr: 3.219 PyErr_SetFromErrno(xc_error); 3.220 - if(ld)close(ld); 3.221 - if(sd)close(sd); 3.222 + if ( ld >= 0 ) close(ld); 3.223 + if ( sd >= 0 ) close(sd); 3.224 return NULL; 3.225 } 3.226 else 3.227 { 3.228 - int fd; 3.229 - gzFile gfd; 3.230 + int fd = -1; 3.231 + gzFile gfd = NULL; 3.232 3.233 int readerfn(void *fd, void *buf, size_t count) 3.234 { 3.235 @@ -442,7 +437,7 @@ static PyObject *pyxc_linux_restore(PyOb 3.236 return ! 
(rc == count); 3.237 } 3.238 3.239 - if (strncmp(state_file,"file:",strlen("file:")) == 0) 3.240 + if ( strncmp(state_file,"file:",strlen("file:")) == 0 ) 3.241 state_file += strlen("file:"); 3.242 3.243 if ( (fd = open(state_file, O_RDONLY)) == -1 ) 3.244 @@ -464,7 +459,8 @@ static PyObject *pyxc_linux_restore(PyOb 3.245 } 3.246 3.247 3.248 - if ( xc_linux_restore(xc->xc_handle, dom, flags, readerfn, gfd, &dom) == 0 ) 3.249 + if ( xc_linux_restore(xc->xc_handle, dom, flags, 3.250 + readerfn, gfd, &dom) == 0 ) 3.251 { 3.252 gzclose(gfd); 3.253 close(fd); 3.254 @@ -475,8 +471,8 @@ static PyObject *pyxc_linux_restore(PyOb 3.255 3.256 err: 3.257 PyErr_SetFromErrno(xc_error); 3.258 - if(gfd)gzclose(gfd); 3.259 - if(fd)close(fd); 3.260 + if ( gfd != NULL ) gzclose(gfd); 3.261 + if ( fd >= 0 ) close(fd); 3.262 return NULL; 3.263 } 3.264
4.1 --- a/tools/xenctl/lib/utils.py Tue May 11 14:57:44 2004 +0000 4.2 +++ b/tools/xenctl/lib/utils.py Tue May 11 15:02:26 2004 +0000 4.3 @@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'): 4.4 return m.group(1) 4.5 return None 4.6 4.7 -def get_current_ipgw(dev='eth0'): 4.8 - """Return a string containing the IP gateway for the given 4.9 - network interface (default 'eth0'). 4.10 - """ 4.11 +def get_current_ipgw(): 4.12 + """Return a string containing the default IP gateway.""" 4.13 fd = os.popen( '/sbin/route -n' ) 4.14 lines = fd.readlines() 4.15 for line in lines: 4.16 - m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' + 4.17 - '\s+\S+\s+\S*G.*' + dev + '.*', line ) 4.18 + m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' + 4.19 + '\s+0.0.0.0+\s+\S*G.*', line ) 4.20 if m: 4.21 return m.group(1) 4.22 return None
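The rewritten get_current_ipgw() no longer greps for a particular device; it keys on the all-zeros destination and genmask plus the G(ateway) flag in /sbin/route -n output. A quick standalone check against a typical default-route line (sample addresses invented):

    import re

    sample = "0.0.0.0         192.168.0.1     0.0.0.0         UG    0      0        0 eth0"
    m = re.search(r'^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)'
                  r'\s+0.0.0.0+\s+\S*G.*', sample)
    assert m and m.group(1) == '192.168.0.1'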
5.1 --- a/tools/xend/lib/domain_controller.h Tue May 11 14:57:44 2004 +0000 5.2 +++ b/tools/xend/lib/domain_controller.h Tue May 11 15:02:26 2004 +0000 5.3 @@ -342,6 +342,7 @@ typedef struct { 5.4 unsigned int handle; 5.5 unsigned int status; 5.6 unsigned int evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */ 5.7 + u8 mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */ 5.8 } netif_fe_interface_status_changed_t; 5.9 5.10 /* 5.11 @@ -373,7 +374,8 @@ typedef struct { 5.12 */ 5.13 typedef struct { 5.14 unsigned int handle; 5.15 - unsigned long shmem_frame; 5.16 + unsigned long tx_shmem_frame; 5.17 + unsigned long rx_shmem_frame; 5.18 } netif_fe_interface_connect_t; 5.19 5.20 /* 5.21 @@ -434,6 +436,7 @@ typedef struct { 5.22 /* IN */ 5.23 domid_t domid; /* Domain attached to new interface. */ 5.24 unsigned int netif_handle; /* Domain-specific interface handle. */ 5.25 + u8 mac[6]; 5.26 /* OUT */ 5.27 unsigned int status; 5.28 } netif_be_create_t; 5.29 @@ -463,7 +466,8 @@ typedef struct { 5.30 domid_t domid; /* Domain attached to new interface. */ 5.31 unsigned int netif_handle; /* Domain-specific interface handle. */ 5.32 unsigned int evtchn; /* Event channel for notifications. */ 5.33 - unsigned long shmem_frame; /* Page cont. shared comms window. */ 5.34 + unsigned long tx_shmem_frame; /* Page cont. tx shared comms window. */ 5.35 + unsigned long rx_shmem_frame; /* Page cont. rx shared comms window. */ 5.36 /* OUT */ 5.37 unsigned int status; 5.38 } netif_be_connect_t;
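These structure changes (a per-interface MAC, and separate tx/rx shared-memory frames in place of the single shmem_frame) are what the struct.pack() format strings in the new tools/xend/lib/netif.py (added below) encode. A standalone sketch of the correspondence, assuming the native 32-bit x86 sizes of this era; the helper names are hypothetical, the format strings are the ones used in netif.py, and the trailing pad entries keep the Python layout aligned with the C structs:

    import struct

    # netif_fe_interface_status_changed_t: handle, status, evtchn, mac[6] (+2 pad)
    FE_STATUS_FMT = "IIIBBBBBBBB"
    # netif_be_connect_t: domid, handle, evtchn, tx_shmem_frame, rx_shmem_frame, status
    BE_CONNECT_FMT = "QIILLI"

    def fe_status_payload(handle, status, evtchn, mac):
        """Build a frontend status-changed payload carrying the new mac[] field."""
        return struct.pack(FE_STATUS_FMT, handle, status, evtchn,
                           mac[0], mac[1], mac[2], mac[3], mac[4], mac[5], 0, 0)

    def parse_be_connect(payload):
        """Split a backend CONNECT payload; tx and rx frames are now distinct."""
        return struct.unpack(BE_CONNECT_FMT, payload)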
6.1 --- a/tools/xend/lib/main.py Tue May 11 14:57:44 2004 +0000 6.2 +++ b/tools/xend/lib/main.py Tue May 11 15:02:26 2004 +0000 6.3 @@ -5,7 +5,7 @@ 6.4 ########################################################### 6.5 6.6 import errno, re, os, pwd, select, signal, socket, struct, sys, time 6.7 -import xend.blkif, xend.console, xend.manager, xend.utils, Xc 6.8 +import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc 6.9 6.10 6.11 # The following parameters could be placed in a configuration file. 6.12 @@ -19,6 +19,8 @@ UNIX_SOCK = 'management_sock' # relat 6.13 CMSG_CONSOLE = 0 6.14 CMSG_BLKIF_BE = 1 6.15 CMSG_BLKIF_FE = 2 6.16 +CMSG_NETIF_BE = 3 6.17 +CMSG_NETIF_FE = 4 6.18 6.19 6.20 def port_from_dom(dom): 6.21 @@ -162,6 +164,10 @@ def daemon_loop(): 6.22 if xend.blkif.interface.list.has_key(idx): 6.23 blk_if = xend.blkif.interface.list[idx] 6.24 6.25 + net_if = False 6.26 + if xend.netif.interface.list.has_key(idx): 6.27 + net_if = xend.netif.interface.list[idx] 6.28 + 6.29 # If we pick up a disconnect notification then we do any necessary 6.30 # cleanup. 6.31 if type == notifier.EXCEPTION: 6.32 @@ -175,6 +181,9 @@ def daemon_loop(): 6.33 if blk_if: 6.34 blk_if.destroy() 6.35 del blk_if 6.36 + if net_if: 6.37 + net_if.destroy() 6.38 + del net_if 6.39 continue 6.40 6.41 # Process incoming requests. 6.42 @@ -188,6 +197,10 @@ def daemon_loop(): 6.43 blk_if.ctrlif_rx_req(port, msg) 6.44 elif type == CMSG_BLKIF_BE and port == dom0_port: 6.45 xend.blkif.backend_rx_req(port, msg) 6.46 + elif type == CMSG_NETIF_FE and net_if: 6.47 + net_if.ctrlif_rx_req(port, msg) 6.48 + elif type == CMSG_NETIF_BE and port == dom0_port: 6.49 + xend.netif.backend_rx_req(port, msg) 6.50 else: 6.51 port.write_response(msg) 6.52 6.53 @@ -198,6 +211,8 @@ def daemon_loop(): 6.54 type = (msg.get_header())['type'] 6.55 if type == CMSG_BLKIF_BE and port == dom0_port: 6.56 xend.blkif.backend_rx_rsp(port, msg) 6.57 + elif type == CMSG_NETIF_BE and port == dom0_port: 6.58 + xend.netif.backend_rx_rsp(port, msg) 6.59 6.60 # Send console data. 6.61 if con_if and con_if.ctrlif_transmit_work(port): 6.62 @@ -207,10 +222,18 @@ def daemon_loop(): 6.63 if blk_if and blk_if.ctrlif_transmit_work(port): 6.64 work_done = True 6.65 6.66 + # Send netif messages. 6.67 + if net_if and net_if.ctrlif_transmit_work(port): 6.68 + work_done = True 6.69 + 6.70 # Back-end block-device work. 6.71 if port == dom0_port and xend.blkif.backend_do_work(port): 6.72 work_done = True 6.73 6.74 + # Back-end network-device work. 6.75 + if port == dom0_port and xend.netif.backend_do_work(port): 6.76 + work_done = True 6.77 + 6.78 # Finally, notify the remote end of any work that we did. 6.79 if work_done: 6.80 port.notify()
7.1 --- a/tools/xend/lib/manager.py Tue May 11 14:57:44 2004 +0000 7.2 +++ b/tools/xend/lib/manager.py Tue May 11 15:02:26 2004 +0000 7.3 @@ -4,7 +4,7 @@ 7.4 ## Copyright (c) 2004, K A Fraser (University of Cambridge) 7.5 ############################################################# 7.6 7.7 -import xend.blkif, xend.console, xend.main, xend.utils 7.8 +import xend.blkif, xend.netif, xend.console, xend.main, xend.utils 7.9 7.10 7.11 ## 7.12 @@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, 7.13 7.14 # Response is deferred until back-end driver sends acknowledgement. 7.15 return None 7.16 + 7.17 + 7.18 +## 7.19 +## new_network_interface: 7.20 +## Create a new network interface for the specified domain @dom. 7.21 +## 7.22 +def new_network_interface(dom, handle=-1): 7.23 + # By default we create an interface with handle zero. 7.24 + if handle < 0: 7.25 + handle = 0 7.26 + 7.27 + # We only support one interface per domain, which must have handle zero. 7.28 + if handle != 0: 7.29 + response = { 'success': False } 7.30 + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ 7.31 + 'is supported)' % handle 7.32 + return response 7.33 + 7.34 + # Find local event-channel port associated with the specified domain. 7.35 + port = xend.main.port_from_dom(dom) 7.36 + if not port: 7.37 + response = { 'success': False } 7.38 + response['error_type'] = 'Unknown domain %d' % dom 7.39 + return response 7.40 + 7.41 + # The interface must not already exist. 7.42 + if xend.netif.interface.list.has_key(port.local_port): 7.43 + response = { 'success': False } 7.44 + response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \ 7.45 + 'exists' % (dom, handle) 7.46 + return response 7.47 + 7.48 + # Create the new interface. Initially no virtual devices are attached. 7.49 + xend.netif.interface(dom, port.local_port) 7.50 + 7.51 + # Response is deferred until back-end driver sends acknowledgement. 7.52 + return None
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/tools/xend/lib/netif.py Tue May 11 15:02:26 2004 +0000 8.3 @@ -0,0 +1,144 @@ 8.4 + 8.5 +################################################################### 8.6 +## xend/netif.py -- Network-interface management functions for Xend 8.7 +## Copyright (c) 2004, K A Fraser (University of Cambridge) 8.8 +################################################################### 8.9 + 8.10 +import errno, random, re, os, select, signal, socket, struct, sys 8.11 +import xend.main, xend.console, xend.manager, xend.utils, Xc 8.12 + 8.13 +CMSG_NETIF_BE = 3 8.14 +CMSG_NETIF_FE = 4 8.15 +CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED = 0 8.16 +CMSG_NETIF_FE_DRIVER_STATUS_CHANGED = 32 8.17 +CMSG_NETIF_FE_INTERFACE_CONNECT = 33 8.18 +CMSG_NETIF_FE_INTERFACE_DISCONNECT = 34 8.19 +CMSG_NETIF_BE_CREATE = 0 8.20 +CMSG_NETIF_BE_DESTROY = 1 8.21 +CMSG_NETIF_BE_CONNECT = 2 8.22 +CMSG_NETIF_BE_DISCONNECT = 3 8.23 + 8.24 +pendmsg = None 8.25 +pendaddr = None 8.26 + 8.27 +def backend_tx_req(msg): 8.28 + port = xend.main.dom0_port 8.29 + if port.space_to_write_request(): 8.30 + port.write_request(msg) 8.31 + port.notify() 8.32 + else: 8.33 + xend.netif.pendmsg = msg 8.34 + 8.35 +def backend_rx_req(port, msg): 8.36 + port.write_response(msg) 8.37 + 8.38 +def backend_rx_rsp(port, msg): 8.39 + subtype = (msg.get_header())['subtype'] 8.40 + print "Received netif-be response, subtype %d" % subtype 8.41 + if subtype == CMSG_NETIF_BE_CREATE: 8.42 + rsp = { 'success': True } 8.43 + xend.main.send_management_response(rsp, xend.netif.pendaddr) 8.44 + elif subtype == CMSG_NETIF_BE_CONNECT: 8.45 + (dom,hnd,evtchn,tx_frame,rx_frame,st) = \ 8.46 + struct.unpack("QIILLI", msg.get_payload()) 8.47 + netif = interface.list[xend.main.port_from_dom(dom).local_port] 8.48 + msg = xend.utils.message(CMSG_NETIF_FE, \ 8.49 + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) 8.50 + msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \ 8.51 + netif.evtchn['port2'], \ 8.52 + netif.mac[0],netif.mac[1], \ 8.53 + netif.mac[2],netif.mac[3], \ 8.54 + netif.mac[4],netif.mac[5], \ 8.55 + 0,0)) 8.56 + netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg) 8.57 + 8.58 +def backend_do_work(port): 8.59 + global pendmsg 8.60 + if pendmsg and port.space_to_write_request(): 8.61 + port.write_request(pendmsg) 8.62 + pendmsg = None 8.63 + return True 8.64 + return False 8.65 + 8.66 + 8.67 +class interface: 8.68 + 8.69 + # Dictionary of all network-device interfaces. 8.70 + list = {} 8.71 + 8.72 + 8.73 + # NB. 'key' is an opaque value that has no meaning in this class. 8.74 + def __init__(self, dom, key): 8.75 + self.dom = dom 8.76 + self.key = key 8.77 + self.pendmsg = None 8.78 + 8.79 + # VIFs get a random MAC address with a "special" vendor id. 8.80 + # 8.81 + # NB. The vendor is currently an "obsolete" one that used to belong 8.82 + # to DEC (AA-00-00). Using it is probably a bit rude :-) 8.83 + # 8.84 + # NB2. The first bit of the first random octet is set to zero for 8.85 + # all dynamic MAC addresses. This may allow us to manually specify 8.86 + # MAC addresses for some VIFs with no fear of clashes. 
8.87 + self.mac = [ 0xaa, 0x00, 0x00 ] 8.88 + self.mac.append(int(random.random()*128)) 8.89 + self.mac.append(int(random.random()*256)) 8.90 + self.mac.append(int(random.random()*256)) 8.91 + 8.92 + interface.list[key] = self 8.93 + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0) 8.94 + msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \ 8.95 + self.mac[0],self.mac[1], \ 8.96 + self.mac[2],self.mac[3], \ 8.97 + self.mac[4],self.mac[5], \ 8.98 + 0,0,0)) 8.99 + xend.netif.pendaddr = xend.main.mgmt_req_addr 8.100 + backend_tx_req(msg) 8.101 + 8.102 + 8.103 + # Completely destroy this interface. 8.104 + def destroy(self): 8.105 + del interface.list[self.key] 8.106 + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0) 8.107 + msg.append_payload(struct.pack("QII",self.dom,0,0)) 8.108 + backend_tx_req(msg) 8.109 + 8.110 + 8.111 + # The parameter @port is the control-interface event channel. This method 8.112 + # returns True if messages were written to the control interface. 8.113 + def ctrlif_transmit_work(self, port): 8.114 + if self.pendmsg and port.space_to_write_request(): 8.115 + port.write_request(self.pendmsg) 8.116 + self.pendmsg = None 8.117 + return True 8.118 + return False 8.119 + 8.120 + def ctrlif_tx_req(self, port, msg): 8.121 + if port.space_to_write_request(): 8.122 + port.write_request(msg) 8.123 + port.notify() 8.124 + else: 8.125 + self.pendmsg = msg 8.126 + 8.127 + def ctrlif_rx_req(self, port, msg): 8.128 + port.write_response(msg) 8.129 + subtype = (msg.get_header())['subtype'] 8.130 + if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED: 8.131 + msg = xend.utils.message(CMSG_NETIF_FE, \ 8.132 + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) 8.133 + msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \ 8.134 + self.mac[1],self.mac[2], \ 8.135 + self.mac[3],self.mac[4], \ 8.136 + self.mac[5],0,0)) 8.137 + self.ctrlif_tx_req(port, msg) 8.138 + elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT: 8.139 + (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload()) 8.140 + xc = Xc.new() 8.141 + self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom) 8.142 + msg = xend.utils.message(CMSG_NETIF_BE, \ 8.143 + CMSG_NETIF_BE_CONNECT, 0) 8.144 + msg.append_payload(struct.pack("QIILLI",self.dom,0, \ 8.145 + self.evtchn['port1'],tx_frame, \ 8.146 + rx_frame,0)) 8.147 + backend_tx_req(msg)
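The MAC scheme introduced above gives every VIF an AA:00:00 vendor prefix and keeps the top bit of the first random octet clear, leaving addresses with that bit set free for manual assignment without fear of clashes (per the comment in the constructor). A standalone sketch, equivalent to the three random.random() calls in netif.py:

    import random

    def random_vif_mac():
        """Generate a VIF MAC in the same aa:00:00:xx:xx:xx space as netif.py."""
        return [0xaa, 0x00, 0x00,
                random.randint(0, 127),   # top bit clear on all generated MACs
                random.randint(0, 255),
                random.randint(0, 255)]

    def format_mac(mac):
        return ':'.join('%02x' % octet for octet in mac)

    # e.g. format_mac(random_vif_mac()) -> 'aa:00:00:5b:17:c2' (example output)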
9.1 --- a/xen/arch/i386/entry.S Tue May 11 14:57:44 2004 +0000 9.2 +++ b/xen/arch/i386/entry.S Tue May 11 15:02:26 2004 +0000 9.3 @@ -145,16 +145,13 @@ NT_MASK = 0x00004000 9.4 pushl %ecx; \ 9.5 pushl %ebx; \ 9.6 9.7 -#define SAVE_ALL_NOSTI \ 9.8 +#define SAVE_ALL \ 9.9 SAVE_ALL_NOSEGREGS \ 9.10 movl $(__HYPERVISOR_DS),%edx; \ 9.11 movl %edx,%ds; \ 9.12 movl %edx,%es; \ 9.13 movl %edx,%fs; \ 9.14 movl %edx,%gs; 9.15 - 9.16 -#define SAVE_ALL \ 9.17 - SAVE_ALL_NOSTI \ 9.18 sti; 9.19 9.20 #define GET_CURRENT(reg) \ 9.21 @@ -406,7 +403,11 @@ create_bounce_frame: 9.22 jz 1f /* jump if returning to an existing ring-1 activation */ 9.23 /* obtain ss/esp from TSS -- no current ring-1 activations */ 9.24 movzwl PROCESSOR(%ebx),%eax 9.25 - shll $8,%eax /* multiply by 256 */ 9.26 + /* next 4 lines multiply %eax by 8320, which is sizeof(tss_struct) */ 9.27 + movl %eax, %ecx 9.28 + shll $7, %ecx 9.29 + shll $13, %eax 9.30 + addl %ecx,%eax 9.31 addl $init_tss + 12,%eax 9.32 movl (%eax),%esi /* tss->esp1 */ 9.33 FAULT6: movl 4(%eax),%ds /* tss->ss1 */ 9.34 @@ -529,12 +530,18 @@ error_code: 9.35 movl GS(%esp), %edi # get the function address 9.36 movl %eax, ORIG_EAX(%esp) 9.37 movl %ecx, GS(%esp) 9.38 - movl %esp,%edx 9.39 - pushl %esi # push the error code 9.40 - pushl %edx # push the pt_regs pointer 9.41 movl $(__HYPERVISOR_DS),%edx 9.42 movl %edx,%ds 9.43 movl %edx,%es 9.44 + movl %edx,%fs 9.45 + movl %edx,%gs 9.46 + movl EFLAGS(%esp),%edx 9.47 + testl $0x200,%edx # Is IF asserted in saved EFLAGS? 9.48 + jz 1f # Don't STI if it isn't. 9.49 + sti 9.50 +1: movl %esp,%edx 9.51 + pushl %esi # push the error code 9.52 + pushl %edx # push the pt_regs pointer 9.53 GET_CURRENT(%ebx) 9.54 call *%edi 9.55 addl $8,%esp
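The create_bounce_frame change above replaces a single 'shll $8' with a two-shift-and-add sequence because the per-CPU TSS stride is 8320 bytes (0x2080, the figure the processor.h comment below is corrected to), and 8320 = 2^13 + 2^7. A standalone arithmetic check:

    SIZEOF_TSS = 0x2080                      # sizeof(struct tss_struct)

    assert SIZEOF_TSS == 8320 == (1 << 13) + (1 << 7)

    def tss_offset(cpu):
        """Byte offset of this CPU's entry in init_tss[], as entry.S computes it."""
        return (cpu << 13) + (cpu << 7)      # shll $13 / shll $7 / addl

    assert all(tss_offset(cpu) == cpu * SIZEOF_TSS for cpu in range(32))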
10.1 --- a/xen/common/dom_mem_ops.c Tue May 11 14:57:44 2004 +0000 10.2 +++ b/xen/common/dom_mem_ops.c Tue May 11 15:02:26 2004 +0000 10.3 @@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_st 10.4 { 10.5 /* Leave some slack pages; e.g., for the network. */ 10.6 if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 10.7 - (PAGE_SHIFT-10))) ) 10.8 + (PAGE_SHIFT-10))) ) 10.9 + { 10.10 + DPRINTK("Not enough slack: %u %u\n", 10.11 + free_pfns, 10.12 + SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10)); 10.13 break; 10.14 + } 10.15 10.16 /* NB. 'alloc_domain_page' does limit checking on pages per domain. */ 10.17 if ( unlikely((page = alloc_domain_page(p)) == NULL) ) 10.18 + { 10.19 + DPRINTK("Could not allocate a frame\n"); 10.20 break; 10.21 - 10.22 + } 10.23 + 10.24 /* Inform the domain of the new page's machine address. */ 10.25 mpfn = (unsigned long)(page - frame_table); 10.26 copy_to_user(op.pages, &mpfn, sizeof(mpfn));
11.1 --- a/xen/common/domain.c Tue May 11 14:57:44 2004 +0000 11.2 +++ b/xen/common/domain.c Tue May 11 15:02:26 2004 +0000 11.3 @@ -340,6 +340,8 @@ struct pfn_info *alloc_domain_page(struc 11.4 spin_lock(&p->page_list_lock); 11.5 if ( unlikely(p->tot_pages >= p->max_pages) ) 11.6 { 11.7 + DPRINTK("Over-allocation for domain %llu: %u >= %u\n", 11.8 + p->domain, p->tot_pages, p->max_pages); 11.9 spin_unlock(&p->page_list_lock); 11.10 goto free_and_exit; 11.11 } 11.12 @@ -894,7 +896,7 @@ int construct_dom0(struct task_struct *p 11.13 page->type_and_flags = 0; 11.14 page->count_and_flags = PGC_allocated | 1; 11.15 list_add_tail(&page->list, &p->page_list); 11.16 - p->tot_pages++; 11.17 + p->tot_pages++; p->max_pages++; 11.18 } 11.19 11.20 mpt_alloc = (vpt_start - v_start) + alloc_start;
12.1 --- a/xen/common/kernel.c Tue May 11 14:57:44 2004 +0000 12.2 +++ b/xen/common/kernel.c Tue May 11 15:02:26 2004 +0000 12.3 @@ -105,7 +105,6 @@ static struct { 12.4 void cmain(unsigned long magic, multiboot_info_t *mbi) 12.5 { 12.6 struct task_struct *new_dom; 12.7 - dom0_createdomain_t dom0_params; 12.8 unsigned long max_page; 12.9 unsigned char *cmdline; 12.10 module_t *mod = (module_t *)__va(mbi->mods_addr); 12.11 @@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboo 12.12 task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task; 12.13 12.14 /* Create initial domain 0. */ 12.15 - dom0_params.memory_kb = opt_dom0_mem; 12.16 new_dom = do_createdomain(0, 0); 12.17 if ( new_dom == NULL ) 12.18 panic("Error creating domain 0\n");
13.1 --- a/xen/common/memory.c Tue May 11 14:57:44 2004 +0000 13.2 +++ b/xen/common/memory.c Tue May 11 15:02:26 2004 +0000 13.3 @@ -415,6 +415,7 @@ static int get_page_from_l1e(l1_pgentry_ 13.4 { 13.5 unsigned long l1v = l1_pgentry_val(l1e); 13.6 unsigned long pfn = l1_pgentry_to_pagenr(l1e); 13.7 + extern int domain_iomem_in_pfn(struct task_struct *p, unsigned long pfn); 13.8 13.9 if ( !(l1v & _PAGE_PRESENT) ) 13.10 return 1; 13.11 @@ -428,7 +429,11 @@ static int get_page_from_l1e(l1_pgentry_ 13.12 if ( unlikely(!pfn_is_ram(pfn)) ) 13.13 { 13.14 if ( IS_PRIV(current) ) 13.15 - return 1; 13.16 + return 1; 13.17 + 13.18 + if ( IS_CAPABLE_PHYSDEV(current) ) 13.19 + return domain_iomem_in_pfn(current, pfn); 13.20 + 13.21 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); 13.22 return 0; 13.23 } 13.24 @@ -915,7 +920,8 @@ static int do_extended_command(unsigned 13.25 break; 13.26 13.27 case MMUEXT_SET_SUBJECTDOM_H: 13.28 - percpu_info[cpu].subject_id |= ((domid_t)((ptr&~0xFFFF)|(val>>16)))<<32; 13.29 + percpu_info[cpu].subject_id |= 13.30 + ((domid_t)((ptr&~0xFFFF)|(val>>16)))<<32; 13.31 13.32 if ( !IS_PRIV(current) ) 13.33 { 13.34 @@ -939,6 +945,33 @@ static int do_extended_command(unsigned 13.35 } 13.36 break; 13.37 13.38 + /* XXX This function is racey! */ 13.39 + case MMUEXT_REASSIGN_PAGE: 13.40 + if ( unlikely(!IS_PRIV(current)) ) 13.41 + { 13.42 + MEM_LOG("Dom %llu has no privilege to reassign page ownership", 13.43 + current->domain); 13.44 + okay = 0; 13.45 + } 13.46 + else if ( likely(percpu_info[cpu].gps != NULL) ) 13.47 + { 13.48 + current->tot_pages--; 13.49 + percpu_info[cpu].gps->tot_pages++; 13.50 + page->u.domain = percpu_info[cpu].gps; 13.51 + } 13.52 + else 13.53 + { 13.54 + MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn); 13.55 + okay = 0; 13.56 + } 13.57 + break; 13.58 + 13.59 + case MMUEXT_RESET_SUBJECTDOM: 13.60 + if ( percpu_info[cpu].gps != NULL ) 13.61 + put_task_struct(percpu_info[cpu].gps); 13.62 + percpu_info[cpu].gps = percpu_info[cpu].pts = NULL; 13.63 + break; 13.64 + 13.65 default: 13.66 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 13.67 okay = 0;
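The physdev-capable path added to get_page_from_l1e() defers to the new domain_iomem_in_pfn() (defined in the physdev.c hunk below), which walks the domain's PCI devices and applies a page-granularity interval test to every IORESOURCE_MEM resource. A standalone sketch of that test, assuming the i386 PAGE_SHIFT of 12:

    PAGE_SHIFT = 12

    def resource_covers_pfn(res_start, res_end, pfn):
        """True if the memory resource [res_start, res_end] touches page frame pfn
        (mirrors the start/end/interior checks in domain_iomem_in_pfn)."""
        return ((res_start >> PAGE_SHIFT) == pfn or
                (res_end >> PAGE_SHIFT) == pfn or
                ((res_start >> PAGE_SHIFT) < pfn < (res_end >> PAGE_SHIFT)))

    # A BAR at 0xfebf0000-0xfebf0fff lies entirely within pfn 0xfebf0:
    assert resource_covers_pfn(0xfebf0000, 0xfebf0fff, 0xfebf0)
    assert not resource_covers_pfn(0xfebf0000, 0xfebf0fff, 0xfebf1)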
14.1 --- a/xen/common/physdev.c Tue May 11 14:57:44 2004 +0000 14.2 +++ b/xen/common/physdev.c Tue May 11 15:02:26 2004 +0000 14.3 @@ -202,22 +202,55 @@ int physdev_pci_access_modify( 14.4 &p->io_bitmap_sel); 14.5 } 14.6 } 14.7 - else if ( r->flags & IORESOURCE_MEM ) 14.8 - { 14.9 - /* allow domain to map IO memory for this device */ 14.10 - INFO("Giving domain %llu memory resources (%lx - %lx) " 14.11 - "for device %s\n", dom, r->start, r->end, pdev->slot_name); 14.12 - for ( j = r->start; j < r->end + 1; j += PAGE_SIZE ) 14.13 - SHARE_PFN_WITH_DOMAIN(frame_table + (j >> PAGE_SHIFT), p); 14.14 - } 14.15 - } 14.16 14.17 - 14.18 + /* rights to IO memory regions are checked when the domain maps them */ 14.19 + } 14.20 out: 14.21 put_task_struct(p); 14.22 return rc; 14.23 } 14.24 14.25 +/* Check if a domain controls a device with IO memory within frame @pfn. 14.26 + * Returns: 1 if the domain should be allowed to map @pfn, 0 otherwise. */ 14.27 +int domain_iomem_in_pfn(struct task_struct *p, unsigned long pfn) 14.28 +{ 14.29 + int ret = 0; 14.30 + struct list_head *l; 14.31 + 14.32 + VERBOSE_INFO("Checking if physdev-capable domain %llu needs access to " 14.33 + "pfn %08lx\n", p->domain, pfn); 14.34 + 14.35 + spin_lock(&p->pcidev_lock); 14.36 + 14.37 + list_for_each(l, &p->pcidev_list) 14.38 + { 14.39 + int i; 14.40 + phys_dev_t *phys_dev = list_entry(l, phys_dev_t, node); 14.41 + struct pci_dev *pci_dev = phys_dev->dev; 14.42 + 14.43 + for ( i = 0; (i < DEVICE_COUNT_RESOURCE) && (ret == 0); i++ ) 14.44 + { 14.45 + struct resource *r = &pci_dev->resource[i]; 14.46 + 14.47 + if ( r->flags & IORESOURCE_MEM ) 14.48 + if ( (r->start >> PAGE_SHIFT) == pfn 14.49 + || (r->end >> PAGE_SHIFT) == pfn 14.50 + || ((r->start >> PAGE_SHIFT < pfn) 14.51 + && (r->end >> PAGE_SHIFT > pfn)) ) 14.52 + ret = 1; 14.53 + } 14.54 + 14.55 + if ( ret != 0 ) break; 14.56 + } 14.57 + 14.58 + spin_unlock(&p->pcidev_lock); 14.59 + 14.60 + VERBOSE_INFO("Domain %llu %s mapping of pfn %08lx\n", 14.61 + p->domain, ret ? "allowed" : "disallowed", pfn); 14.62 + 14.63 + return ret; 14.64 +} 14.65 + 14.66 /* check if a domain has general access to a device */ 14.67 inline static int check_dev_acc (struct task_struct *p, 14.68 int bus, int dev, int func, 14.69 @@ -235,8 +268,7 @@ inline static int check_dev_acc (struct 14.70 if ( bus > PCI_BUSMAX || dev > PCI_DEVMAX || func > PCI_FUNCMAX ) 14.71 return -EINVAL; 14.72 14.73 - VERBOSE_INFO("a=%c b=%x d=%x f=%x ", (acc == ACC_READ) ? 'R' : 'W', 14.74 - mask, bus, dev, func); 14.75 + VERBOSE_INFO("b=%x d=%x f=%x ", bus, dev, func); 14.76 14.77 /* check target device */ 14.78 target_devfn = PCI_DEVFN(dev, func); 14.79 @@ -296,8 +328,8 @@ static int do_base_address_access(phys_d 14.80 /* We could set *val to some value but the guest may well be in trouble 14.81 * anyway if this write fails. Hopefully the printk will give us a 14.82 * clue what went wrong. */ 14.83 - printk("Guest attempting sub-dword %s to BASE_ADDRESS %d\n", 14.84 - (acc == ACC_READ) ? "read" : "write", idx); 14.85 + printk("Guest %llu attempting sub-dword %s to BASE_ADDRESS %d\n", 14.86 + pdev->owner->domain, (acc == ACC_READ) ? 
"read" : "write", idx); 14.87 14.88 return -EPERM; 14.89 } 14.90 @@ -328,7 +360,7 @@ static int do_base_address_access(phys_d 14.91 } 14.92 } 14.93 VERBOSE_INFO("fixed pci write: %02x:%02x:%02x reg=0x%02x len=0x%02x" 14.94 - " val=0x%08x %lx\n", 14.95 + " val=0x%08x %x\n", 14.96 dev->bus->number, PCI_SLOT(dev->devfn), 14.97 PCI_FUNC(dev->devfn), reg, len, *val, pdev->state); 14.98 } 14.99 @@ -365,7 +397,7 @@ static int do_base_address_access(phys_d 14.100 } 14.101 } 14.102 VERBOSE_INFO("fixed pci read: %02x:%02x:%02x reg=0x%02x len=0x%02x" 14.103 - " val=0x%08x %lx\n", 14.104 + " val=0x%08x %x\n", 14.105 dev->bus->number, PCI_SLOT(dev->devfn), 14.106 PCI_FUNC(dev->devfn), reg, len, *val, pdev->state); 14.107 } 14.108 @@ -422,9 +454,9 @@ static int do_rom_address_access(phys_de 14.109 } 14.110 } 14.111 VERBOSE_INFO("fixed pci write: %02x:%02x:%02x reg=0x%02x len=0x%02x" 14.112 - " val=0x%08x %lx\n", 14.113 + " val=0x%08x %x\n", 14.114 dev->bus->number, PCI_SLOT(dev->devfn), 14.115 - PCI_FUNC(dev->devfn), reg, len, *val, pdev->state); 14.116 + PCI_FUNC(dev->devfn), PCI_ROM_ADDRESS, len, *val, pdev->state); 14.117 } 14.118 else if ( acc == ACC_READ ) 14.119 { 14.120 @@ -442,9 +474,9 @@ static int do_rom_address_access(phys_de 14.121 *val = *val | (orig_val & 0x1); 14.122 } 14.123 VERBOSE_INFO("fixed pci read: %02x:%02x:%02x reg=0x%02x len=0x%02x" 14.124 - " val=0x%08x %lx\n", 14.125 + " val=0x%08x %x\n", 14.126 dev->bus->number, PCI_SLOT(dev->devfn), 14.127 - PCI_FUNC(dev->devfn), reg, len, *val, pdev->state); 14.128 + PCI_FUNC(dev->devfn), PCI_ROM_ADDRESS, len, *val, pdev->state); 14.129 } 14.130 14.131 return ret;
15.1 --- a/xen/include/asm-i386/processor.h Tue May 11 14:57:44 2004 +0000 15.2 +++ b/xen/include/asm-i386/processor.h Tue May 11 15:02:26 2004 +0000 15.3 @@ -375,7 +375,7 @@ struct tss_struct { 15.4 unsigned short trace, bitmap; 15.5 unsigned long io_bitmap[IO_BITMAP_SIZE+1]; 15.6 /* 15.7 - * pads the TSS to be cacheline-aligned (size is 0x100) 15.8 + * pads the TSS to be cacheline-aligned (total size is 0x2080) 15.9 */ 15.10 unsigned long __cacheline_filler[5]; 15.11 };
16.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h Tue May 11 14:57:44 2004 +0000 16.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h Tue May 11 15:02:26 2004 +0000 16.3 @@ -127,6 +127,12 @@ 16.4 * (ptr[31:15],val[31:15]) -- dom[63:32] 16.5 * NB. This command must be immediately preceded by SET_SUBJECTDOM_L. 16.6 * 16.7 + * val[7:0] == MMUEXT_REASSIGN_PAGE: 16.8 + * ptr[:2] -- machine address within page to be reassigned to the GPS. 16.9 + * 16.10 + * val[7:0] == MMUEXT_RESET_SUBJECTDOM: 16.11 + * Resets both the GPS and the PTS to their defaults (i.e., calling domain). 16.12 + * 16.13 * Notes on constraints on the above arguments: 16.14 * [1] The page frame containing the machine address must belong to the PTS. 16.15 * [2] If the PTE is valid (i.e., bit 0 is set) then the specified page frame 16.16 @@ -151,6 +157,8 @@ 16.17 #define MMUEXT_SET_SUBJECTDOM_L 9 /* (ptr[31:15],val[31:15]) = dom[31:0] */ 16.18 #define MMUEXT_SET_SUBJECTDOM_H 10 /* (ptr[31:15],val[31:15]) = dom[63:32] */ 16.19 #define SET_PAGETABLE_SUBJECTDOM (1<<14) /* OR into 'val' arg of SUBJECTDOM_H*/ 16.20 +#define MMUEXT_REASSIGN_PAGE 11 16.21 +#define MMUEXT_RESET_SUBJECTDOM 12 16.22 #define MMUEXT_CMD_MASK 255 16.23 #define MMUEXT_CMD_SHIFT 8 16.24
17.1 --- a/xenolinux-2.4.26-sparse/arch/xen/config.in Tue May 11 14:57:44 2004 +0000 17.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/config.in Tue May 11 15:02:26 2004 +0000 17.3 @@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then 17.4 bool 'HIGHMEM I/O support' CONFIG_HIGHIO 17.5 fi 17.6 17.7 +define_int CONFIG_FORCE_MAX_ZONEORDER 12 17.8 + 17.9 #bool 'Symmetric multi-processing support' CONFIG_SMP 17.10 #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then 17.11 # define_bool CONFIG_HAVE_DEC_LOCK y
18.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig Tue May 11 14:57:44 2004 +0000 18.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig Tue May 11 15:02:26 2004 +0000 18.3 @@ -50,6 +50,7 @@ CONFIG_X86_TSC=y 18.4 CONFIG_X86_L1_CACHE_SHIFT=5 18.5 CONFIG_NOHIGHMEM=y 18.6 # CONFIG_HIGHMEM4G is not set 18.7 +CONFIG_FORCE_MAX_ZONEORDER=12 18.8 18.9 # 18.10 # General setup 18.11 @@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y 18.12 # Network testing 18.13 # 18.14 # CONFIG_NET_PKTGEN is not set 18.15 +CONFIG_NETDEVICES=y 18.16 18.17 # 18.18 # Block devices
19.1 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev Tue May 11 14:57:44 2004 +0000 19.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev Tue May 11 15:02:26 2004 +0000 19.3 @@ -51,6 +51,7 @@ CONFIG_X86_TSC=y 19.4 CONFIG_X86_L1_CACHE_SHIFT=5 19.5 CONFIG_NOHIGHMEM=y 19.6 # CONFIG_HIGHMEM4G is not set 19.7 +CONFIG_FORCE_MAX_ZONEORDER=12 19.8 19.9 # 19.10 # General setup 19.11 @@ -89,19 +90,7 @@ CONFIG_BINFMT_ELF=y 19.12 # 19.13 # Parallel port support 19.14 # 19.15 -CONFIG_PARPORT=y 19.16 -CONFIG_PARPORT_PC=y 19.17 -# CONFIG_PARPORT_PC_FIFO is not set 19.18 -# CONFIG_PARPORT_PC_SUPERIO is not set 19.19 -# CONFIG_PARPORT_PC_PCMCIA is not set 19.20 -# CONFIG_PARPORT_AMIGA is not set 19.21 -# CONFIG_PARPORT_MFC3 is not set 19.22 -# CONFIG_PARPORT_ATARI is not set 19.23 -# CONFIG_PARPORT_GSC is not set 19.24 -# CONFIG_PARPORT_SUNBPP is not set 19.25 -# CONFIG_PARPORT_IP22 is not set 19.26 -# CONFIG_PARPORT_OTHER is not set 19.27 -CONFIG_PARPORT_1284=y 19.28 +# CONFIG_PARPORT is not set 19.29 19.30 # 19.31 # Plug and Play configuration 19.32 @@ -112,7 +101,7 @@ CONFIG_PNP=y 19.33 # 19.34 # Block devices 19.35 # 19.36 -CONFIG_BLK_DEV_FD=y 19.37 +# CONFIG_BLK_DEV_FD is not set 19.38 # CONFIG_BLK_DEV_XD is not set 19.39 # CONFIG_PARIDE is not set 19.40 # CONFIG_BLK_CPQ_DA is not set 19.41 @@ -131,14 +120,14 @@ CONFIG_BLK_DEV_INITRD=y 19.42 # 19.43 # Multi-device support (RAID and LVM) 19.44 # 19.45 -CONFIG_MD=y 19.46 -CONFIG_BLK_DEV_MD=y 19.47 -CONFIG_MD_LINEAR=y 19.48 -CONFIG_MD_RAID0=y 19.49 -CONFIG_MD_RAID1=y 19.50 -CONFIG_MD_RAID5=y 19.51 -CONFIG_MD_MULTIPATH=y 19.52 -CONFIG_BLK_DEV_LVM=y 19.53 +# CONFIG_MD is not set 19.54 +# CONFIG_BLK_DEV_MD is not set 19.55 +# CONFIG_MD_LINEAR is not set 19.56 +# CONFIG_MD_RAID0 is not set 19.57 +# CONFIG_MD_RAID1 is not set 19.58 +# CONFIG_MD_RAID5 is not set 19.59 +# CONFIG_MD_MULTIPATH is not set 19.60 +# CONFIG_BLK_DEV_LVM is not set 19.61 19.62 # 19.63 # Networking options 19.64 @@ -234,7 +223,7 @@ CONFIG_IP_NF_TARGET_ULOG=y 19.65 # 19.66 # CONFIG_DEV_APPLETALK is not set 19.67 # CONFIG_DECNET is not set 19.68 -# CONFIG_BRIDGE is not set 19.69 +CONFIG_BRIDGE=y 19.70 # CONFIG_X25 is not set 19.71 # CONFIG_LAPB is not set 19.72 # CONFIG_LLC is not set 19.73 @@ -380,14 +369,7 @@ CONFIG_CHR_DEV_SG=y 19.74 # CONFIG_SCSI_AHA1740 is not set 19.75 CONFIG_SCSI_AACRAID=y 19.76 # CONFIG_SCSI_AIC7XXX is not set 19.77 -CONFIG_SCSI_AIC79XX=y 19.78 -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 19.79 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 19.80 -# CONFIG_AIC79XX_BUILD_FIRMWARE is not set 19.81 -# CONFIG_AIC79XX_ENABLE_RD_STRM is not set 19.82 -CONFIG_AIC79XX_DEBUG_ENABLE=y 19.83 -CONFIG_AIC79XX_DEBUG_MASK=0 19.84 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set 19.85 +# CONFIG_SCSI_AIC79XX is not set 19.86 # CONFIG_SCSI_AIC7XXX_OLD is not set 19.87 # CONFIG_SCSI_DPT_I2O is not set 19.88 # CONFIG_SCSI_ADVANSYS is not set 19.89 @@ -397,9 +379,9 @@ CONFIG_SCSI_MEGARAID=y 19.90 # CONFIG_SCSI_MEGARAID2 is not set 19.91 CONFIG_SCSI_BUSLOGIC=y 19.92 # CONFIG_SCSI_OMIT_FLASHPOINT is not set 19.93 -CONFIG_SCSI_CPQFCTS=y 19.94 +# CONFIG_SCSI_CPQFCTS is not set 19.95 # CONFIG_SCSI_DMX3191D is not set 19.96 -CONFIG_SCSI_DTC3280=y 19.97 +# CONFIG_SCSI_DTC3280 is not set 19.98 # CONFIG_SCSI_EATA is not set 19.99 # CONFIG_SCSI_EATA_DMA is not set 19.100 # CONFIG_SCSI_EATA_PIO is not set 19.101 @@ -409,15 +391,11 @@ CONFIG_SCSI_DTC3280=y 19.102 # CONFIG_SCSI_IPS is not set 19.103 # CONFIG_SCSI_INITIO is not set 19.104 # CONFIG_SCSI_INIA100 is not set 19.105 -# CONFIG_SCSI_PPA is not 
set 19.106 -# CONFIG_SCSI_IMM is not set 19.107 # CONFIG_SCSI_NCR53C406A is not set 19.108 # CONFIG_SCSI_NCR53C7xx is not set 19.109 -CONFIG_SCSI_SYM53C8XX_2=y 19.110 -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 19.111 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 19.112 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 19.113 -# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set 19.114 +# CONFIG_SCSI_SYM53C8XX_2 is not set 19.115 +# CONFIG_SCSI_NCR53C8XX is not set 19.116 +# CONFIG_SCSI_SYM53C8XX is not set 19.117 # CONFIG_SCSI_PAS16 is not set 19.118 # CONFIG_SCSI_PCI2000 is not set 19.119 # CONFIG_SCSI_PCI2220I is not set 19.120 @@ -510,9 +488,7 @@ CONFIG_PCNET32=y 19.121 # CONFIG_APRICOT is not set 19.122 # CONFIG_B44 is not set 19.123 # CONFIG_CS89x0 is not set 19.124 -CONFIG_TULIP=y 19.125 -# CONFIG_TULIP_MWI is not set 19.126 -# CONFIG_TULIP_MMIO is not set 19.127 +# CONFIG_TULIP is not set 19.128 # CONFIG_DE4X5 is not set 19.129 # CONFIG_DGRS is not set 19.130 # CONFIG_DM9102 is not set 19.131 @@ -545,8 +521,7 @@ CONFIG_TULIP=y 19.132 # 19.133 # Ethernet (1000 Mbit) 19.134 # 19.135 -CONFIG_ACENIC=y 19.136 -# CONFIG_ACENIC_OMIT_TIGON_I is not set 19.137 +# CONFIG_ACENIC is not set 19.138 # CONFIG_DL2K is not set 19.139 CONFIG_E1000=y 19.140 # CONFIG_E1000_NAPI is not set 19.141 @@ -621,9 +596,6 @@ CONFIG_VT_CONSOLE=y 19.142 # CONFIG_SERIAL_NONSTANDARD is not set 19.143 CONFIG_UNIX98_PTYS=y 19.144 CONFIG_UNIX98_PTY_COUNT=256 19.145 -# CONFIG_PRINTER is not set 19.146 -# CONFIG_PPDEV is not set 19.147 -# CONFIG_TIPAR is not set 19.148 19.149 # 19.150 # I2C support 19.151 @@ -869,107 +841,7 @@ CONFIG_DUMMY_CONSOLE=y 19.152 # 19.153 # USB support 19.154 # 19.155 -CONFIG_USB=y 19.156 -CONFIG_USB_DEBUG=y 19.157 - 19.158 -# 19.159 -# Miscellaneous USB options 19.160 -# 19.161 -# CONFIG_USB_DEVICEFS is not set 19.162 -# CONFIG_USB_BANDWIDTH is not set 19.163 - 19.164 -# 19.165 -# USB Host Controller Drivers 19.166 -# 19.167 -# CONFIG_USB_EHCI_HCD is not set 19.168 -CONFIG_USB_UHCI=y 19.169 -# CONFIG_USB_UHCI_ALT is not set 19.170 -CONFIG_USB_OHCI=y 19.171 -# CONFIG_USB_SL811HS_ALT is not set 19.172 -# CONFIG_USB_SL811HS is not set 19.173 - 19.174 -# 19.175 -# USB Device Class drivers 19.176 -# 19.177 -# CONFIG_USB_AUDIO is not set 19.178 -# CONFIG_USB_EMI26 is not set 19.179 -# CONFIG_USB_BLUETOOTH is not set 19.180 -# CONFIG_USB_MIDI is not set 19.181 -# CONFIG_USB_STORAGE is not set 19.182 -# CONFIG_USB_STORAGE_DEBUG is not set 19.183 -# CONFIG_USB_STORAGE_DATAFAB is not set 19.184 -# CONFIG_USB_STORAGE_FREECOM is not set 19.185 -# CONFIG_USB_STORAGE_ISD200 is not set 19.186 -# CONFIG_USB_STORAGE_DPCM is not set 19.187 -# CONFIG_USB_STORAGE_HP8200e is not set 19.188 -# CONFIG_USB_STORAGE_SDDR09 is not set 19.189 -# CONFIG_USB_STORAGE_SDDR55 is not set 19.190 -# CONFIG_USB_STORAGE_JUMPSHOT is not set 19.191 -# CONFIG_USB_ACM is not set 19.192 -# CONFIG_USB_PRINTER is not set 19.193 - 19.194 -# 19.195 -# USB Human Interface Devices (HID) 19.196 -# 19.197 -# CONFIG_USB_HID is not set 19.198 - 19.199 -# 19.200 -# Input core support is needed for USB HID input layer or HIDBP support 19.201 -# 19.202 -# CONFIG_USB_HIDINPUT is not set 19.203 -# CONFIG_USB_HIDDEV is not set 19.204 -# CONFIG_USB_KBD is not set 19.205 -# CONFIG_USB_MOUSE is not set 19.206 -# CONFIG_USB_AIPTEK is not set 19.207 -# CONFIG_USB_WACOM is not set 19.208 -# CONFIG_USB_KBTAB is not set 19.209 -# CONFIG_USB_POWERMATE is not set 19.210 - 19.211 -# 19.212 -# USB Imaging devices 19.213 -# 19.214 -# CONFIG_USB_DC2XX is not set 19.215 -# CONFIG_USB_MDC800 is 
not set 19.216 -# CONFIG_USB_SCANNER is not set 19.217 -# CONFIG_USB_MICROTEK is not set 19.218 -# CONFIG_USB_HPUSBSCSI is not set 19.219 - 19.220 -# 19.221 -# USB Multimedia devices 19.222 -# 19.223 - 19.224 -# 19.225 -# Video4Linux support is needed for USB Multimedia device support 19.226 -# 19.227 - 19.228 -# 19.229 -# USB Network adaptors 19.230 -# 19.231 -# CONFIG_USB_PEGASUS is not set 19.232 -# CONFIG_USB_RTL8150 is not set 19.233 -# CONFIG_USB_KAWETH is not set 19.234 -# CONFIG_USB_CATC is not set 19.235 -# CONFIG_USB_CDCETHER is not set 19.236 -# CONFIG_USB_USBNET is not set 19.237 - 19.238 -# 19.239 -# USB port drivers 19.240 -# 19.241 -# CONFIG_USB_USS720 is not set 19.242 - 19.243 -# 19.244 -# USB Serial Converter support 19.245 -# 19.246 -# CONFIG_USB_SERIAL is not set 19.247 - 19.248 -# 19.249 -# USB Miscellaneous drivers 19.250 -# 19.251 -# CONFIG_USB_RIO500 is not set 19.252 -# CONFIG_USB_AUERSWALD is not set 19.253 -# CONFIG_USB_TIGL is not set 19.254 -# CONFIG_USB_BRLVGER is not set 19.255 -# CONFIG_USB_LCD is not set 19.256 +# CONFIG_USB is not set 19.257 19.258 # 19.259 # Support for USB gadgets
20.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h Tue May 11 14:57:44 2004 +0000 20.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h Tue May 11 15:02:26 2004 +0000 20.3 @@ -10,6 +10,7 @@ 20.4 #include <linux/rbtree.h> 20.5 #include <linux/interrupt.h> 20.6 #include <linux/slab.h> 20.7 +#include <linux/blkdev.h> 20.8 #include <asm/ctrl_if.h> 20.9 #include <asm/io.h> 20.10 #include "../blkif.h"
21.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c Tue May 11 14:57:44 2004 +0000 21.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c Tue May 11 15:02:26 2004 +0000 21.3 @@ -74,7 +74,8 @@ void blkif_ctrlif_init(void) 21.4 ctrl_msg_t cmsg; 21.5 blkif_be_driver_status_changed_t st; 21.6 21.7 - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx); 21.8 + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 21.9 + CALLBACK_IN_BLOCKING_CONTEXT); 21.10 21.11 /* Send a driver-UP notification to the domain controller. */ 21.12 cmsg.type = CMSG_BLKIF_BE;
22.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c Tue May 11 14:57:44 2004 +0000 22.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c Tue May 11 15:02:26 2004 +0000 22.3 @@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *cre 22.4 unsigned int handle = create->blkif_handle; 22.5 blkif_t **pblkif, *blkif; 22.6 22.7 - if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL ) 22.8 + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) 22.9 { 22.10 DPRINTK("Could not create blkif: out of memory\n"); 22.11 create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
23.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c Tue May 11 14:57:44 2004 +0000 23.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c Tue May 11 15:02:26 2004 +0000 23.3 @@ -24,17 +24,15 @@ 23.4 #define MAX_PENDING_REQS 64 23.5 #define BATCH_PER_DOMAIN 16 23.6 23.7 -static struct vm_struct *mmap_vma; 23.8 -#define MMAP_PAGES_PER_SEGMENT \ 23.9 - ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1) 23.10 +static unsigned long mmap_vstart; 23.11 #define MMAP_PAGES_PER_REQUEST \ 23.12 - (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT) 23.13 + (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) 23.14 #define MMAP_PAGES \ 23.15 (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) 23.16 -#define MMAP_VADDR(_req,_seg) \ 23.17 - ((unsigned long)mmap_vma->addr + \ 23.18 +#define MMAP_VADDR(_req,_seg) \ 23.19 + (mmap_vstart + \ 23.20 ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ 23.21 - ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE)) 23.22 + ((_seg) * PAGE_SIZE)) 23.23 23.24 /* 23.25 * Each outstanding request that we've passed to the lower device layers has a 23.26 @@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blki 23.27 prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW); 23.28 for ( i = 0; i < req->nr_segments; i++ ) 23.29 { 23.30 - if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) ) 23.31 + /* Make sure the buffer is page-sized. */ 23.32 + if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) || 23.33 + (blkif_last_sect(req->frame_and_sects[i]) != 7) ) 23.34 goto bad_descriptor; 23.35 rc = direct_remap_area_pages(&init_mm, 23.36 MMAP_VADDR(pending_idx, i), 23.37 - req->buffer_and_sects[i] & PAGE_MASK, 23.38 + req->frame_and_sects[i] & PAGE_MASK, 23.39 PAGE_SIZE, prot, blkif->domid); 23.40 if ( rc != 0 ) 23.41 goto bad_descriptor; 23.42 @@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t 23.43 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 23.44 struct buffer_head *bh; 23.45 int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; 23.46 - unsigned short nr_sects; 23.47 - unsigned long buffer; 23.48 + short nr_sects; 23.49 + unsigned long buffer, fas; 23.50 int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; 23.51 pending_req_t *pending_req; 23.52 pgprot_t prot; 23.53 23.54 /* We map virtual scatter/gather segments to physical segments. */ 23.55 int new_segs, nr_psegs = 0; 23.56 - phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; 23.57 + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; 23.58 23.59 /* Check that number of segments is sane. 
*/ 23.60 if ( unlikely(req->nr_segments == 0) || 23.61 @@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t 23.62 */ 23.63 for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) 23.64 { 23.65 - buffer = req->buffer_and_sects[i] & ~0x1FF; 23.66 - nr_sects = req->buffer_and_sects[i] & 0x1FF; 23.67 - 23.68 - if ( unlikely(nr_sects == 0) ) 23.69 - continue; 23.70 + fas = req->frame_and_sects[i]; 23.71 + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); 23.72 + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; 23.73 23.74 - if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) ) 23.75 - { 23.76 - DPRINTK("Too many sectors in segment\n"); 23.77 + if ( nr_sects <= 0 ) 23.78 goto bad_descriptor; 23.79 - } 23.80 23.81 phys_seg[nr_psegs].dev = req->device; 23.82 phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; 23.83 @@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t 23.84 } 23.85 23.86 nr_psegs += new_segs; 23.87 - ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2); 23.88 + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); 23.89 } 23.90 23.91 /* Nonsensical zero-sized request? */ 23.92 @@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t 23.93 23.94 for ( i = 0; i < nr_psegs; i++ ) 23.95 { 23.96 - unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 23.97 - (phys_seg[i].nr_sects << 9) + 23.98 - (PAGE_SIZE - 1)) & PAGE_MASK; 23.99 int rc = direct_remap_area_pages(&init_mm, 23.100 MMAP_VADDR(pending_idx, i), 23.101 phys_seg[i].buffer & PAGE_MASK, 23.102 - sz, prot, blkif->domid); 23.103 + PAGE_SIZE, prot, blkif->domid); 23.104 if ( rc != 0 ) 23.105 { 23.106 DPRINTK("invalid buffer\n"); 23.107 @@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t 23.108 MMAP_PAGES_PER_REQUEST * PAGE_SIZE); 23.109 goto bad_descriptor; 23.110 } 23.111 + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = 23.112 + phys_seg[i].buffer >> PAGE_SHIFT; 23.113 } 23.114 23.115 pending_req = &pending_reqs[pending_idx]; 23.116 @@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t 23.117 bh->b_rsector = (unsigned long)phys_seg[i].sector_number; 23.118 bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + 23.119 (phys_seg[i].buffer & ~PAGE_MASK); 23.120 +// bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); 23.121 bh->b_end_io = end_block_io_op; 23.122 bh->b_private = pending_req; 23.123 23.124 @@ -456,13 +451,13 @@ static int __init init_module(void) 23.125 { 23.126 int i; 23.127 23.128 + if ( !(start_info.flags & SIF_INITDOMAIN) ) 23.129 + return 0; 23.130 + 23.131 blkif_interface_init(); 23.132 23.133 - if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL ) 23.134 - { 23.135 - printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n"); 23.136 - return -ENOMEM; 23.137 - } 23.138 + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) 23.139 + BUG(); 23.140 23.141 pending_cons = 0; 23.142 pending_prod = MAX_PENDING_REQS; 23.143 @@ -484,6 +479,7 @@ static int __init init_module(void) 23.144 23.145 static void cleanup_module(void) 23.146 { 23.147 + BUG(); 23.148 } 23.149 23.150 module_init(init_module);
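With the switch from get_vm_area() to allocate_empty_lowmem_region(), the backend's mapping area becomes a flat run of MMAP_PAGES pages and MMAP_VADDR() reduces to simple offset arithmetic: one page per segment, MMAP_PAGES_PER_REQUEST (= BLKIF_MAX_SEGMENTS_PER_REQUEST + 1 = 12) pages per pending request. A standalone sketch of the layout (the mmap_vstart value is invented; in the driver it is whatever the allocator returns):

    PAGE_SIZE = 4096
    MAX_PENDING_REQS = 64
    MMAP_PAGES_PER_REQUEST = 11 + 1          # BLKIF_MAX_SEGMENTS_PER_REQUEST + 1
    MMAP_PAGES = MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST

    def mmap_vaddr(mmap_vstart, req, seg):
        """Virtual address backing segment 'seg' of pending request 'req'."""
        return mmap_vstart + (req * MMAP_PAGES_PER_REQUEST + seg) * PAGE_SIZE

    mmap_vstart = 0xc8000000                 # illustrative value only
    assert mmap_vaddr(mmap_vstart, 0, 0) == mmap_vstart
    assert (mmap_vaddr(mmap_vstart, MAX_PENDING_REQS - 1, MMAP_PAGES_PER_REQUEST - 1)
            == mmap_vstart + (MMAP_PAGES - 1) * PAGE_SIZE)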
24.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c Tue May 11 14:57:44 2004 +0000 24.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c Tue May 11 15:02:26 2004 +0000 24.3 @@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *c 24.4 } 24.5 } 24.6 24.7 - if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) ) 24.8 + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) 24.9 { 24.10 DPRINTK("vbd_create: out of memory\n"); 24.11 create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; 24.12 @@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow) 24.13 } 24.14 24.15 if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 24.16 - GFP_ATOMIC)) == NULL) ) 24.17 + GFP_KERNEL)) == NULL) ) 24.18 { 24.19 DPRINTK("vbd_grow: out of memory\n"); 24.20 grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
25.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h Tue May 11 14:57:44 2004 +0000 25.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h Tue May 11 15:02:26 2004 +0000 25.3 @@ -26,19 +26,22 @@ 25.4 */ 25.5 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 25.6 25.7 -#define BLKIF_MAX_SECTORS_PER_SEGMENT 16 25.8 - 25.9 typedef struct { 25.10 u8 operation; /* BLKIF_OP_??? */ 25.11 u8 nr_segments; /* number of segments */ 25.12 blkif_vdev_t device; /* only for read/write requests */ 25.13 unsigned long id; /* private guest value, echoed in resp */ 25.14 blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */ 25.15 - /* Least 9 bits is 'nr_sects'. High 23 bits is the address. */ 25.16 - /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */ 25.17 - unsigned long buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 25.18 + /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */ 25.19 + /* @first_sect: first sector in frame to transfer (inclusive). */ 25.20 + /* @last_sect: last sector in frame to transfer (inclusive). */ 25.21 + /* @frame: machine page frame number. */ 25.22 + unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 25.23 } blkif_request_t; 25.24 25.25 +#define blkif_first_sect(_fas) (((_fas)>>3)&7) 25.26 +#define blkif_last_sect(_fas) ((_fas)&7) 25.27 + 25.28 typedef struct { 25.29 unsigned long id; /* copied from request */ 25.30 u8 operation; /* copied from request */ 25.31 @@ -79,8 +82,8 @@ typedef struct { 25.32 * @device == unused (zero) 25.33 * @id == any value (echoed in response message) 25.34 * @sector_num == unused (zero) 25.35 - * @buffer_and_sects == list of page-aligned, page-sized buffers. 25.36 - * (i.e., nr_sects == 8). 25.37 + * @frame_and_sects == list of page-sized buffers. 25.38 + * (i.e., @first_sect == 0, @last_sect == 7). 25.39 * 25.40 * The response is a list of vdisk_t elements copied into the out-of-band 25.41 * probe buffer. On success the response status field contains the number
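The new frame_and_sects[] layout above packs a whole segment descriptor into one word: bits [2:0] hold last_sect, bits [5:3] hold first_sect, and the high bits hold the machine page frame. A standalone sketch of both directions, assuming 4 KB pages and 512-byte sectors; encode_fas mirrors blkif_queue_request in the frontend driver and decode_fas mirrors dispatch_rw_block_io in the backend:

    PAGE_SHIFT = 12
    PAGE_MASK = ~((1 << PAGE_SHIFT) - 1)
    SECTOR_SHIFT = 9                          # 512-byte sectors, 8 per page

    def encode_fas(buffer_ma, nr_sectors):
        """Frontend: pack a sector-aligned machine buffer into one entry."""
        fsect = (buffer_ma & ~PAGE_MASK) >> SECTOR_SHIFT
        lsect = fsect + nr_sectors - 1
        assert buffer_ma % (1 << SECTOR_SHIFT) == 0 and lsect <= 7
        return (buffer_ma & PAGE_MASK) | (fsect << 3) | lsect

    def decode_fas(fas):
        """Backend: recover the buffer address and sector count."""
        first_sect = (fas >> 3) & 7           # blkif_first_sect()
        last_sect = fas & 7                   # blkif_last_sect()
        buffer = (fas & PAGE_MASK) | (first_sect << SECTOR_SHIFT)
        return buffer, last_sect - first_sect + 1

    # A 3-sector transfer starting one sector into machine page 0x1234:
    fas = encode_fas((0x1234 << PAGE_SHIFT) | 0x200, 3)
    assert decode_fas(fas) == ((0x1234 << PAGE_SHIFT) | 0x200, 3)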
26.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c Tue May 11 14:57:44 2004 +0000 26.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c Tue May 11 15:02:26 2004 +0000 26.3 @@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linu 26.4 static unsigned int blkif_state = BLKIF_STATE_CLOSED; 26.5 static unsigned int blkif_evtchn, blkif_irq; 26.6 26.7 -static struct tq_struct blkif_statechange_tq; 26.8 - 26.9 static int blkif_control_rsp_valid; 26.10 static blkif_response_t blkif_control_rsp; 26.11 26.12 @@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned 26.13 struct gendisk *gd; 26.14 blkif_request_t *req; 26.15 struct buffer_head *bh; 26.16 + unsigned int fsect, lsect; 26.17 26.18 - if ( unlikely(nr_sectors >= (1<<9)) ) 26.19 - BUG(); 26.20 + fsect = (buffer_ma & ~PAGE_MASK) >> 9; 26.21 + lsect = fsect + nr_sectors - 1; 26.22 + 26.23 + /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ 26.24 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) 26.25 BUG(); 26.26 + if ( lsect > 7 ) 26.27 + BUG(); 26.28 + 26.29 + buffer_ma &= PAGE_MASK; 26.30 26.31 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) 26.32 return 1; 26.33 @@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned 26.34 bh = (struct buffer_head *)id; 26.35 bh->b_reqnext = (struct buffer_head *)req->id; 26.36 req->id = id; 26.37 - req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; 26.38 - if ( ++req->nr_segments < MAX_BLK_SEGS ) 26.39 + req->frame_and_sects[req->nr_segments] = 26.40 + buffer_ma | (fsect<<3) | lsect; 26.41 + if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) 26.42 sg_next_sect += nr_sectors; 26.43 else 26.44 DISABLE_SCATTERGATHER(); 26.45 @@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned 26.46 req->sector_number = (blkif_sector_t)sector_number; 26.47 req->device = device; 26.48 req->nr_segments = 1; 26.49 - req->buffer_and_sects[0] = buffer_ma | nr_sectors; 26.50 + req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; 26.51 req_prod++; 26.52 26.53 return 0; 26.54 @@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t 26.55 } 26.56 26.57 26.58 -static void blkif_bringup_phase1(void *unused) 26.59 +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) 26.60 { 26.61 ctrl_msg_t cmsg; 26.62 blkif_fe_interface_connect_t up; 26.63 26.64 - /* Move from CLOSED to DISCONNECTED state. */ 26.65 - blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); 26.66 - blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; 26.67 - blkif_state = BLKIF_STATE_DISCONNECTED; 26.68 - 26.69 - /* Construct an interface-CONNECT message for the domain controller. */ 26.70 - cmsg.type = CMSG_BLKIF_FE; 26.71 - cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; 26.72 - cmsg.length = sizeof(blkif_fe_interface_connect_t); 26.73 - up.handle = 0; 26.74 - up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; 26.75 - memcpy(cmsg.msg, &up, sizeof(up)); 26.76 - 26.77 - /* Tell the controller to bring up the interface. */ 26.78 - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 26.79 -} 26.80 - 26.81 -static void blkif_bringup_phase2(void *unused) 26.82 -{ 26.83 - blkif_irq = bind_evtchn_to_irq(blkif_evtchn); 26.84 - (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); 26.85 - 26.86 - /* Probe for discs that are attached to the interface. */ 26.87 - xlvbd_init(); 26.88 - 26.89 - blkif_state = BLKIF_STATE_CONNECTED; 26.90 - 26.91 - /* Kick pending requests. 
*/ 26.92 - spin_lock_irq(&io_request_lock); 26.93 - kick_pending_request_queues(); 26.94 - spin_unlock_irq(&io_request_lock); 26.95 -} 26.96 - 26.97 -static void blkif_status_change(blkif_fe_interface_status_changed_t *status) 26.98 -{ 26.99 if ( status->handle != 0 ) 26.100 { 26.101 printk(KERN_WARNING "Status change on unsupported blkif %d\n", 26.102 @@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe 26.103 " in state %d\n", blkif_state); 26.104 break; 26.105 } 26.106 - blkif_statechange_tq.routine = blkif_bringup_phase1; 26.107 - schedule_task(&blkif_statechange_tq); 26.108 + 26.109 + /* Move from CLOSED to DISCONNECTED state. */ 26.110 + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); 26.111 + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; 26.112 + blkif_state = BLKIF_STATE_DISCONNECTED; 26.113 + 26.114 + /* Construct an interface-CONNECT message for the domain controller. */ 26.115 + cmsg.type = CMSG_BLKIF_FE; 26.116 + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; 26.117 + cmsg.length = sizeof(blkif_fe_interface_connect_t); 26.118 + up.handle = 0; 26.119 + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; 26.120 + memcpy(cmsg.msg, &up, sizeof(up)); 26.121 + 26.122 + /* Tell the controller to bring up the interface. */ 26.123 + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 26.124 break; 26.125 26.126 case BLKIF_INTERFACE_STATUS_CONNECTED: 26.127 @@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe 26.128 " in state %d\n", blkif_state); 26.129 break; 26.130 } 26.131 + 26.132 blkif_evtchn = status->evtchn; 26.133 - blkif_statechange_tq.routine = blkif_bringup_phase2; 26.134 - schedule_task(&blkif_statechange_tq); 26.135 + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); 26.136 + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); 26.137 + 26.138 + /* Probe for discs that are attached to the interface. */ 26.139 + xlvbd_init(); 26.140 + 26.141 + blkif_state = BLKIF_STATE_CONNECTED; 26.142 + 26.143 + /* Kick pending requests. */ 26.144 + spin_lock_irq(&io_request_lock); 26.145 + kick_pending_request_queues(); 26.146 + spin_unlock_irq(&io_request_lock); 26.147 break; 26.148 26.149 default: 26.150 @@ -675,7 +671,11 @@ int __init xlblk_init(void) 26.151 ctrl_msg_t cmsg; 26.152 blkif_fe_driver_status_changed_t st; 26.153 26.154 - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx); 26.155 + if ( start_info.flags & SIF_INITDOMAIN ) 26.156 + return 0; 26.157 + 26.158 + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, 26.159 + CALLBACK_IN_BLOCKING_CONTEXT); 26.160 26.161 /* Send a driver-UP notification to the domain controller. */ 26.162 cmsg.type = CMSG_BLKIF_FE;
27.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c Tue May 11 14:57:44 2004 +0000 27.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c Tue May 11 15:02:26 2004 +0000 27.3 @@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *d 27.4 memset(&req, 0, sizeof(req)); 27.5 req.operation = BLKIF_OP_PROBE; 27.6 req.nr_segments = 1; 27.7 - req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512); 27.8 + req.frame_and_sects[0] = virt_to_machine(buf) | 7; 27.9 27.10 blkif_control_send(&req, &rsp); 27.11
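The probe request above reads back a full page of vdisk_t records: under the new inclusive encoding, ORing with 7 means first_sect == 0 and last_sect == 7, i.e. all eight 512-byte sectors of the probe buffer, replacing the old nr_sects == PAGE_SIZE/512 form. Spelled out with the field positions from blkif.h (illustrative only, equivalent to the line in the hunk; buf is page-aligned here):

    req.frame_and_sects[0] = (virt_to_machine(buf) & PAGE_MASK)  /* frame         */
                             | (0 << 3)                          /* first_sect: 0 */
                             | 7;                                /* last_sect:  7 */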
28.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c Tue May 11 14:57:44 2004 +0000 28.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c Tue May 11 15:02:26 2004 +0000 28.3 @@ -512,7 +512,7 @@ static int __init xencons_init(void) 28.4 } 28.5 else 28.6 { 28.7 - (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx); 28.8 + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); 28.9 } 28.10 28.11 printk("Xen virtual console successfully installed\n");
29.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/common.h Tue May 11 14:57:44 2004 +0000 29.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/common.h Tue May 11 15:02:26 2004 +0000 29.3 @@ -16,6 +16,7 @@ 29.4 #include <asm/ctrl_if.h> 29.5 #include <asm/io.h> 29.6 #include "../netif.h" 29.7 +#include "../../../../../net/bridge/br_private.h" 29.8 29.9 #ifndef NDEBUG 29.10 #define ASSERT(_p) \ 29.11 @@ -28,7 +29,7 @@ 29.12 #define DPRINTK(_f, _a...) ((void)0) 29.13 #endif 29.14 29.15 -typedef struct { 29.16 +typedef struct netif_st { 29.17 /* Unique identifier for this interface. */ 29.18 domid_t domid; 29.19 unsigned int handle; 29.20 @@ -49,13 +50,7 @@ typedef struct { 29.21 NETIF_RING_IDX tx_req_cons; 29.22 NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ 29.23 29.24 - /* Usage accounting */ 29.25 - long long total_bytes_sent; 29.26 - long long total_bytes_received; 29.27 - long long total_packets_sent; 29.28 - long long total_packets_received; 29.29 - 29.30 - /* Trasnmit shaping: allow 'credit_bytes' every 'credit_usec'. */ 29.31 + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ 29.32 unsigned long credit_bytes; 29.33 unsigned long credit_usec; 29.34 unsigned long remaining_credit; 29.35 @@ -72,7 +67,8 @@ typedef struct { 29.36 struct list_head list; /* scheduling list */ 29.37 atomic_t refcnt; 29.38 spinlock_t rx_lock, tx_lock; 29.39 - unsigned char vmac[ETH_ALEN]; 29.40 + struct net_device *dev; 29.41 + struct net_device_stats stats; 29.42 } netif_t; 29.43 29.44 void netif_create(netif_be_create_t *create); 29.45 @@ -93,6 +89,8 @@ void netif_ctrlif_init(void); 29.46 29.47 void netif_deschedule(netif_t *netif); 29.48 29.49 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); 29.50 +struct net_device_stats *netif_be_get_stats(struct net_device *dev); 29.51 void netif_be_int(int irq, void *dev_id, struct pt_regs *regs); 29.52 29.53 #endif /* __NETIF__BACKEND__COMMON_H__ */
30.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c Tue May 11 14:57:44 2004 +0000 30.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c Tue May 11 15:02:26 2004 +0000 30.3 @@ -10,8 +10,6 @@ 30.4 30.5 static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) 30.6 { 30.7 - DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype); 30.8 - 30.9 switch ( msg->subtype ) 30.10 { 30.11 case CMSG_NETIF_BE_CREATE: 30.12 @@ -54,7 +52,8 @@ void netif_ctrlif_init(void) 30.13 ctrl_msg_t cmsg; 30.14 netif_be_driver_status_changed_t st; 30.15 30.16 - (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx); 30.17 + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, 30.18 + CALLBACK_IN_BLOCKING_CONTEXT); 30.19 30.20 /* Send a driver-UP notification to the domain controller. */ 30.21 cmsg.type = CMSG_NETIF_BE;
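Both block and net drivers now register their control-interface receivers with CALLBACK_IN_BLOCKING_CONTEXT, while the console hunk above passes 0 and keeps its handler in the default (tasklet) context. The ctrl_if.c hunk at the end of this changeset supplies the machinery: a per-message-type flag array plus a deferred-message ring drained via schedule_task(). The dispatch decision this implies is sketched below; the actual test lives in the truncated part of that hunk, so this is a hedged reconstruction using the names declared there, not a quote:

/* Sketch: deliver an incoming request directly, or via the process-context queue. */
static void dispatch_rx_request(ctrl_msg_t *msg)
{
    if ( test_bit(msg->type, ctrl_if_rxmsg_blocking_context) )
    {
        /* Handler may block: copy the message and defer to keventd. */
        ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(ctrl_if_rxmsg_deferred_prod++)] = *msg;
        schedule_task(&ctrl_if_rxmsg_deferred_tq);
    }
    else
    {
        /* Handler is interrupt-safe: call it straight from the rx tasklet. */
        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
    }
}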
31.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c Tue May 11 14:57:44 2004 +0000 31.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c Tue May 11 15:02:26 2004 +0000 31.3 @@ -7,13 +7,15 @@ 31.4 */ 31.5 31.6 #include "common.h" 31.7 +#include <linux/rtnetlink.h> 31.8 31.9 #define NETIF_HASHSZ 1024 31.10 #define NETIF_HASH(_d,_h) \ 31.11 (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(NETIF_HASHSZ-1)) 31.12 31.13 -static kmem_cache_t *netif_cachep; 31.14 -static netif_t *netif_hash[NETIF_HASHSZ]; 31.15 +static netif_t *netif_hash[NETIF_HASHSZ]; 31.16 +static struct net_device *bridge_dev; 31.17 +static struct net_bridge *bridge_br; 31.18 31.19 netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) 31.20 { 31.21 @@ -35,7 +37,11 @@ void __netif_disconnect_complete(netif_t 31.22 * must still be notified to the remote driver. 31.23 */ 31.24 unbind_evtchn_from_irq(netif->evtchn); 31.25 - vfree(netif->net_ring_base); 31.26 + vfree(netif->tx); /* Frees netif->rx as well. */ 31.27 + rtnl_lock(); 31.28 + (void)br_del_if(bridge_br, netif->dev); 31.29 + (void)dev_close(netif->dev); 31.30 + rtnl_unlock(); 31.31 31.32 /* Construct the deferred response message. */ 31.33 cmsg.type = CMSG_NETIF_BE; 31.34 @@ -66,24 +72,32 @@ void __netif_disconnect_complete(netif_t 31.35 31.36 void netif_create(netif_be_create_t *create) 31.37 { 31.38 - domid_t domid = create->domid; 31.39 - unsigned int handle = create->netif_handle; 31.40 - netif_t **pnetif, *netif; 31.41 + domid_t domid = create->domid; 31.42 + unsigned int handle = create->netif_handle; 31.43 + struct net_device *dev; 31.44 + netif_t **pnetif, *netif; 31.45 31.46 - if ( (netif = kmem_cache_alloc(netif_cachep, GFP_ATOMIC)) == NULL ) 31.47 + dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup); 31.48 + if ( dev == NULL ) 31.49 { 31.50 DPRINTK("Could not create netif: out of memory\n"); 31.51 create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; 31.52 return; 31.53 } 31.54 31.55 + netif = dev->priv; 31.56 memset(netif, 0, sizeof(*netif)); 31.57 netif->domid = domid; 31.58 netif->handle = handle; 31.59 netif->status = DISCONNECTED; 31.60 - spin_lock_init(&netif->vbd_lock); 31.61 - spin_lock_init(&netif->net_ring_lock); 31.62 + spin_lock_init(&netif->rx_lock); 31.63 + spin_lock_init(&netif->tx_lock); 31.64 atomic_set(&netif->refcnt, 0); 31.65 + netif->dev = dev; 31.66 + 31.67 + netif->credit_bytes = netif->remaining_credit = ~0UL; 31.68 + netif->credit_usec = 0UL; 31.69 + /*init_ac_timer(&new_vif->credit_timeout);*/ 31.70 31.71 pnetif = &netif_hash[NETIF_HASH(domid, handle)]; 31.72 while ( *pnetif != NULL ) 31.73 @@ -92,12 +106,27 @@ void netif_create(netif_be_create_t *cre 31.74 { 31.75 DPRINTK("Could not create netif: already exists\n"); 31.76 create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; 31.77 - kmem_cache_free(netif_cachep, netif); 31.78 + kfree(dev); 31.79 return; 31.80 } 31.81 pnetif = &(*pnetif)->hash_next; 31.82 } 31.83 31.84 + dev->hard_start_xmit = netif_be_start_xmit; 31.85 + dev->get_stats = netif_be_get_stats; 31.86 + memcpy(dev->dev_addr, create->mac, ETH_ALEN); 31.87 + 31.88 + /* XXX In bridge mode we should force a different MAC from remote end. 
*/ 31.89 + dev->dev_addr[2] ^= 1; 31.90 + 31.91 + if ( register_netdev(dev) != 0 ) 31.92 + { 31.93 + DPRINTK("Could not register new net device\n"); 31.94 + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; 31.95 + kfree(dev); 31.96 + return; 31.97 + } 31.98 + 31.99 netif->hash_next = *pnetif; 31.100 *pnetif = netif; 31.101 31.102 @@ -132,8 +161,8 @@ void netif_destroy(netif_be_destroy_t *d 31.103 31.104 destroy: 31.105 *pnetif = netif->hash_next; 31.106 - destroy_all_vbds(netif); 31.107 - kmem_cache_free(netif_cachep, netif); 31.108 + unregister_netdev(netif->dev); 31.109 + kfree(netif->dev); 31.110 destroy->status = NETIF_BE_STATUS_OKAY; 31.111 } 31.112 31.113 @@ -142,11 +171,13 @@ void netif_connect(netif_be_connect_t *c 31.114 domid_t domid = connect->domid; 31.115 unsigned int handle = connect->netif_handle; 31.116 unsigned int evtchn = connect->evtchn; 31.117 - unsigned long shmem_frame = connect->shmem_frame; 31.118 + unsigned long tx_shmem_frame = connect->tx_shmem_frame; 31.119 + unsigned long rx_shmem_frame = connect->rx_shmem_frame; 31.120 struct vm_struct *vma; 31.121 pgprot_t prot; 31.122 int error; 31.123 netif_t *netif; 31.124 + struct net_device *eth0_dev; 31.125 31.126 netif = netif_find_by_handle(domid, handle); 31.127 if ( unlikely(netif == NULL) ) 31.128 @@ -157,16 +188,27 @@ void netif_connect(netif_be_connect_t *c 31.129 return; 31.130 } 31.131 31.132 - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) 31.133 + if ( netif->status != DISCONNECTED ) 31.134 + { 31.135 + connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; 31.136 + return; 31.137 + } 31.138 + 31.139 + if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) 31.140 { 31.141 connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; 31.142 return; 31.143 } 31.144 31.145 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); 31.146 - error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), 31.147 - shmem_frame<<PAGE_SHIFT, PAGE_SIZE, 31.148 - prot, domid); 31.149 + error = direct_remap_area_pages(&init_mm, 31.150 + VMALLOC_VMADDR(vma->addr), 31.151 + tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, 31.152 + prot, domid); 31.153 + error |= direct_remap_area_pages(&init_mm, 31.154 + VMALLOC_VMADDR(vma->addr) + PAGE_SIZE, 31.155 + rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, 31.156 + prot, domid); 31.157 if ( error != 0 ) 31.158 { 31.159 if ( error == -ENOMEM ) 31.160 @@ -179,21 +221,39 @@ void netif_connect(netif_be_connect_t *c 31.161 return; 31.162 } 31.163 31.164 - if ( netif->status != DISCONNECTED ) 31.165 + netif->evtchn = evtchn; 31.166 + netif->irq = bind_evtchn_to_irq(evtchn); 31.167 + netif->tx_shmem_frame = tx_shmem_frame; 31.168 + netif->rx_shmem_frame = rx_shmem_frame; 31.169 + netif->tx = 31.170 + (netif_tx_interface_t *)vma->addr; 31.171 + netif->rx = 31.172 + (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); 31.173 + netif->status = CONNECTED; 31.174 + netif_get(netif); 31.175 + 31.176 + rtnl_lock(); 31.177 + 31.178 + (void)dev_open(netif->dev); 31.179 + (void)br_add_if(bridge_br, netif->dev); 31.180 + 31.181 + /* 31.182 + * The default config is a very simple binding to eth0. 31.183 + * If eth0 is being used as an IP interface by this OS then someone 31.184 + * must add eth0's IP address to nbe-br, and change the routing table 31.185 + * to refer to nbe-br instead of eth0. 
31.186 + */ 31.187 + (void)dev_open(bridge_dev); 31.188 + if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL ) 31.189 { 31.190 - connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; 31.191 - vfree(vma->addr); 31.192 - return; 31.193 + (void)dev_open(eth0_dev); 31.194 + (void)br_add_if(bridge_br, eth0_dev); 31.195 } 31.196 31.197 - netif->evtchn = evtchn; 31.198 - netif->irq = bind_evtchn_to_irq(evtchn); 31.199 - netif->shmem_frame = shmem_frame; 31.200 - netif->net_ring_base = (netif_ring_t *)vma->addr; 31.201 - netif->status = CONNECTED; 31.202 - netif_get(netif); 31.203 + rtnl_unlock(); 31.204 31.205 - request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif); 31.206 + (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif); 31.207 + netif_start_queue(netif->dev); 31.208 31.209 connect->status = NETIF_BE_STATUS_OKAY; 31.210 } 31.211 @@ -218,6 +278,7 @@ int netif_disconnect(netif_be_disconnect 31.212 netif->status = DISCONNECTING; 31.213 netif->disconnect_rspid = rsp_id; 31.214 wmb(); /* Let other CPUs see the status change. */ 31.215 + netif_stop_queue(netif->dev); 31.216 free_irq(netif->irq, NULL); 31.217 netif_deschedule(netif); 31.218 netif_put(netif); 31.219 @@ -226,105 +287,14 @@ int netif_disconnect(netif_be_disconnect 31.220 return 0; /* Caller should not send response message. */ 31.221 } 31.222 31.223 -net_vif_t *create_net_vif(domid_t dom) 31.224 -{ 31.225 - unsigned int idx; 31.226 - net_vif_t *new_vif = NULL; 31.227 - net_ring_t *new_ring = NULL; 31.228 - struct task_struct *p = NULL; 31.229 - unsigned long flags, vmac_hash; 31.230 - unsigned char vmac_key[ETH_ALEN + 2 + MAX_DOMAIN_NAME]; 31.231 - 31.232 - if ( (p = find_domain_by_id(dom)) == NULL ) 31.233 - return NULL; 31.234 - 31.235 - write_lock_irqsave(&tasklist_lock, flags); 31.236 - 31.237 - for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ ) 31.238 - if ( p->net_vif_list[idx] == NULL ) 31.239 - break; 31.240 - if ( idx == MAX_DOMAIN_VIFS ) 31.241 - goto fail; 31.242 - 31.243 - if ( (new_vif = kmem_cache_alloc(net_vif_cache, GFP_KERNEL)) == NULL ) 31.244 - goto fail; 31.245 - 31.246 - memset(new_vif, 0, sizeof(*new_vif)); 31.247 - 31.248 - if ( sizeof(net_ring_t) > PAGE_SIZE ) 31.249 - BUG(); 31.250 - new_ring = (net_ring_t *)get_free_page(GFP_KERNEL); 31.251 - clear_page(new_ring); 31.252 - SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p); 31.253 - 31.254 - /* 31.255 - * Fill in the new vif struct. Note that, while the vif's refcnt is 31.256 - * non-zero, we hold a reference to the task structure. 31.257 - */ 31.258 - atomic_set(&new_vif->refcnt, 1); 31.259 - new_vif->shared_rings = new_ring; 31.260 - new_vif->shared_idxs = &p->shared_info->net_idx[idx]; 31.261 - new_vif->domain = p; 31.262 - new_vif->idx = idx; 31.263 - new_vif->list.next = NULL; 31.264 - spin_lock_init(&new_vif->rx_lock); 31.265 - spin_lock_init(&new_vif->tx_lock); 31.266 - 31.267 - new_vif->credit_bytes = new_vif->remaining_credit = ~0UL; 31.268 - new_vif->credit_usec = 0UL; 31.269 - init_ac_timer(&new_vif->credit_timeout); 31.270 - 31.271 - if ( (p->domain == 0) && (idx == 0) ) 31.272 - { 31.273 - /* 31.274 - * DOM0/VIF0 gets the real physical MAC address, so that users can 31.275 - * easily get a Xen-based machine up and running by using an existing 31.276 - * DHCP entry. 31.277 - */ 31.278 - memcpy(new_vif->vmac, the_dev->dev_addr, ETH_ALEN); 31.279 - } 31.280 - else 31.281 - { 31.282 - /* 31.283 - * Most VIFs get a random MAC address with a "special" vendor id. 
31.284 - * We try to get MAC addresses to be unique across multiple servers 31.285 - * by including the physical MAC address in the hash. The hash also 31.286 - * includes the vif index and the domain's name. 31.287 - * 31.288 - * NB. The vendor is currently an "obsolete" one that used to belong 31.289 - * to DEC (AA-00-00). Using it is probably a bit rude :-) 31.290 - * 31.291 - * NB2. The first bit of the first random octet is set to zero for 31.292 - * all dynamic MAC addresses. This may allow us to manually specify 31.293 - * MAC addresses for some VIFs with no fear of clashes. 31.294 - */ 31.295 - memcpy(&vmac_key[0], the_dev->dev_addr, ETH_ALEN); 31.296 - *(__u16 *)(&vmac_key[ETH_ALEN]) = htons(idx); 31.297 - strcpy(&vmac_key[ETH_ALEN+2], p->name); 31.298 - vmac_hash = hash(vmac_key, ETH_ALEN + 2 + strlen(p->name)); 31.299 - memcpy(new_vif->vmac, "\xaa\x00\x00", 3); 31.300 - new_vif->vmac[3] = (vmac_hash >> 16) & 0xef; /* First bit is zero. */ 31.301 - new_vif->vmac[4] = (vmac_hash >> 8) & 0xff; 31.302 - new_vif->vmac[5] = (vmac_hash >> 0) & 0xff; 31.303 - } 31.304 - 31.305 - p->net_vif_list[idx] = new_vif; 31.306 - 31.307 - write_unlock_irqrestore(&tasklist_lock, flags); 31.308 - return new_vif; 31.309 - 31.310 - fail: 31.311 - write_unlock_irqrestore(&tasklist_lock, flags); 31.312 - if ( new_vif != NULL ) 31.313 - kmem_cache_free(net_vif_cache, new_vif); 31.314 - if ( p != NULL ) 31.315 - put_task_struct(p); 31.316 - return NULL; 31.317 -} 31.318 - 31.319 void netif_interface_init(void) 31.320 { 31.321 - netif_cachep = kmem_cache_create("netif_cache", sizeof(netif_t), 31.322 - 0, 0, NULL, NULL); 31.323 memset(netif_hash, 0, sizeof(netif_hash)); 31.324 + if ( br_add_bridge("nbe-br") != 0 ) 31.325 + BUG(); 31.326 + bridge_dev = __dev_get_by_name("nbe-br"); 31.327 + bridge_br = (struct net_bridge *)bridge_dev->priv; 31.328 + bridge_br->bridge_hello_time = bridge_br->hello_time = 0; 31.329 + bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0; 31.330 + bridge_br->stp_enabled = 0; 31.331 }
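The connect path above maps the two frames supplied by the frontend back-to-back into a single 2*PAGE_SIZE VM area, so the tx ring occupies the first page and the rx ring the second. For orientation, the lines below juxtapose the backend view from this hunk with the matching frontend side that appears later in this changeset, where each frame is just the machine frame number of an ordinary kernel page:

    /* Backend, netif_connect() (from the hunk above): */
    netif->tx = (netif_tx_interface_t *)vma->addr;                       /* tx_shmem_frame */
    netif->rx = (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); /* rx_shmem_frame */

    /* Frontend, netif/frontend/main.c (see below): */
    up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
    up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;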
32.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c Tue May 11 14:57:44 2004 +0000 32.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c Tue May 11 15:02:26 2004 +0000 32.3 @@ -11,7 +11,10 @@ 32.4 */ 32.5 32.6 #include "common.h" 32.7 +#include <asm/hypervisor-ifs/dom_mem_ops.h> 32.8 32.9 +static void net_tx_action(unsigned long unused); 32.10 +static void netif_page_release(struct page *page); 32.11 static void make_tx_response(netif_t *netif, 32.12 u16 id, 32.13 s8 st); 32.14 @@ -21,38 +24,131 @@ static void make_rx_response(netif_t 32.15 netif_addr_t addr, 32.16 u16 size); 32.17 32.18 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); 32.19 + 32.20 /* Don't currently gate addition of an interface to the tx scheduling list. */ 32.21 #define tx_work_exists(_if) (1) 32.22 32.23 #define MAX_PENDING_REQS 256 32.24 -static struct vm_struct *mmap_vma; 32.25 -#define MMAP_VADDR(_req) ((unsigned long)mmap_vma->addr + ((_req) * PAGE_SIZE)) 32.26 +static unsigned long mmap_vstart; 32.27 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) 32.28 32.29 -/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/ 32.30 +#define PKT_PROT_LEN (ETH_HLEN + 20) 32.31 + 32.32 +static u16 pending_id[MAX_PENDING_REQS]; 32.33 +static netif_t *pending_netif[MAX_PENDING_REQS]; 32.34 static u16 pending_ring[MAX_PENDING_REQS]; 32.35 static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; 32.36 -/* NB. We use a different index type to differentiate from shared blk rings. */ 32.37 typedef unsigned int PEND_RING_IDX; 32.38 #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) 32.39 static PEND_RING_IDX pending_prod, pending_cons; 32.40 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) 32.41 32.42 +static struct list_head net_schedule_list; 32.43 +static spinlock_t net_schedule_list_lock; 32.44 + 32.45 +#define MAX_MFN_ALLOC 64 32.46 +static unsigned long mfn_list[MAX_MFN_ALLOC]; 32.47 +static unsigned int alloc_index = 0; 32.48 +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; 32.49 +static void __refresh_mfn_list(void) 32.50 +{ 32.51 + int ret; 32.52 + dom_mem_op_t op; 32.53 + op.op = MEMOP_RESERVATION_INCREASE; 32.54 + op.u.increase.size = MAX_MFN_ALLOC; 32.55 + op.u.increase.pages = mfn_list; 32.56 + if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC ) 32.57 + { 32.58 + printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret); 32.59 + BUG(); 32.60 + } 32.61 + alloc_index = MAX_MFN_ALLOC; 32.62 +} 32.63 +static unsigned long get_new_mfn(void) 32.64 +{ 32.65 + unsigned long mfn, flags; 32.66 + spin_lock_irqsave(&mfn_lock, flags); 32.67 + if ( alloc_index == 0 ) 32.68 + __refresh_mfn_list(); 32.69 + mfn = mfn_list[--alloc_index]; 32.70 + spin_unlock_irqrestore(&mfn_lock, flags); 32.71 + return mfn; 32.72 +} 32.73 +static void dealloc_mfn(unsigned long mfn) 32.74 +{ 32.75 + unsigned long flags; 32.76 + spin_lock_irqsave(&mfn_lock, flags); 32.77 + mfn_list[alloc_index++] = mfn; 32.78 + spin_unlock_irqrestore(&mfn_lock, flags); 32.79 +} 32.80 + 32.81 +static inline void maybe_schedule_tx_action(void) 32.82 +{ 32.83 + smp_mb(); 32.84 + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && 32.85 + !list_empty(&net_schedule_list) ) 32.86 + tasklet_schedule(&net_tx_tasklet); 32.87 +} 32.88 + 32.89 /* 32.90 * This is the primary RECEIVE function for a network interface. 32.91 * Note that, from the p.o.v. of /this/ OS it looks like a transmit. 
32.92 */ 32.93 -static void netif_start_xmit(struct sk_buff *skb, struct net_device *dev) 32.94 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) 32.95 { 32.96 netif_t *netif = (netif_t *)dev->priv; 32.97 - s8 status = BLKIF_RSP_OKAY; 32.98 - u16 size; 32.99 - mmu_update_t mmu[4]; 32.100 + s8 status = NETIF_RSP_OKAY; 32.101 + u16 size=0, id; 32.102 + mmu_update_t mmu[6]; 32.103 + pgd_t *pgd; pmd_t *pmd; pte_t *pte; 32.104 + unsigned long vdata, mdata=0, new_mfn; 32.105 + 32.106 + /* Drop the packet if the target domain has no receive buffers. */ 32.107 + if ( (netif->rx_req_cons == netif->rx->req_prod) || 32.108 + ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) 32.109 + { 32.110 + dev_kfree_skb(skb); 32.111 + return 0; 32.112 + } 32.113 32.114 - memcpy(skb->mac.ethernet->h_dest, netif->vmac, ETH_ALEN); 32.115 - if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) 32.116 - memcpy(skb->nh.raw + 18, netif->vmac, ETH_ALEN); 32.117 + id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id; 32.118 + 32.119 + /* 32.120 + * We do not copy the packet unless: 32.121 + * 1. It is fragmented; or 32.122 + * 2. It spans a page boundary; or 32.123 + * 3. We cannot be sure the whole data page is allocated. 32.124 + * The copying method is taken from skb_copy(). 32.125 + */ 32.126 + if ( (skb_shinfo(skb)->nr_frags != 0) || 32.127 + (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) || 32.128 + ((skb->end - skb->head) < (PAGE_SIZE/2)) ) 32.129 + { 32.130 + struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC); 32.131 + int hlen = skb->data - skb->head; 32.132 + if ( unlikely(nskb == NULL) ) 32.133 + { 32.134 + DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid); 32.135 + status = NETIF_RSP_ERROR; 32.136 + goto out; 32.137 + } 32.138 + skb_reserve(nskb, hlen); 32.139 + __skb_put(nskb, skb->len); 32.140 + (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len); 32.141 + dev_kfree_skb(skb); 32.142 + skb = nskb; 32.143 + } 32.144 32.145 - spin_lock(&netif->rx_lock); 32.146 + vdata = (unsigned long)skb->data; 32.147 + mdata = virt_to_machine(vdata); 32.148 + size = skb->tail - skb->data; 32.149 + 32.150 + new_mfn = get_new_mfn(); 32.151 + 32.152 + pgd = pgd_offset_k( (vdata & PAGE_MASK)); 32.153 + pmd = pmd_offset(pgd, (vdata & PAGE_MASK)); 32.154 + pte = pte_offset(pmd, (vdata & PAGE_MASK)); 32.155 32.156 mmu[0].val = (unsigned long)(netif->domid<<16) & ~0xFFFFUL; 32.157 mmu[0].ptr = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL; 32.158 @@ -63,49 +159,44 @@ static void netif_start_xmit(struct sk_b 32.159 mmu[1].ptr |= MMU_EXTENDED_COMMAND; 32.160 mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H; 32.161 32.162 - mmu[2].ptr = ptr | MMU_EXTENDED_COMMAND; 32.163 + mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; 32.164 mmu[2].val = MMUEXT_REASSIGN_PAGE; 32.165 32.166 - mmu[3].ptr = ppte; 32.167 - mmu[3].val = newpage; 32.168 + mmu[3].ptr = MMU_EXTENDED_COMMAND; 32.169 + mmu[3].val = MMUEXT_RESET_SUBJECTDOM; 32.170 + 32.171 + mmu[4].ptr = virt_to_machine(pte); 32.172 + mmu[4].val = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; 32.173 32.174 - if ( unlikely(HYPERVISOR_mmu_update(mmu, 4) < 0) ) 32.175 + mmu[5].ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; 32.176 + mmu[5].val = __pa(vdata) >> PAGE_SHIFT; 32.177 + 32.178 + if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) ) 32.179 { 32.180 - status = BLKIF_RSP_ERROR; 32.181 + DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid); 32.182 + dealloc_mfn(new_mfn); 
32.183 + status = NETIF_RSP_ERROR; 32.184 goto out; 32.185 } 32.186 32.187 - /* Record this so they can be billed. */ 32.188 - netif->total_packets_received++; 32.189 - netif->total_bytes_received += size; 32.190 + phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn; 32.191 + 32.192 + netif->stats.rx_bytes += size; 32.193 + netif->stats.rx_packets++; 32.194 32.195 out: 32.196 - make_rx_response(netif, rx->id, status, addr, size); 32.197 + spin_lock(&netif->rx_lock); 32.198 + make_rx_response(netif, id, status, mdata, size); 32.199 spin_unlock(&netif->rx_lock); 32.200 dev_kfree_skb(skb); 32.201 + return 0; 32.202 } 32.203 32.204 - 32.205 -/************************************************************* 32.206 - * NEW TRANSMIT SCHEDULER 32.207 - * 32.208 - * NB. We ought also to only send a limited number of bytes to the NIC 32.209 - * for transmission at any one time (to avoid head-of-line blocking). 32.210 - * However, driver rings are small enough that they provide a reasonable 32.211 - * limit. 32.212 - * 32.213 - * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps 32.214 - * e1000 has 256 descriptors == 128 packets, at 1000Mbps 32.215 - * tg3 has 512 descriptors == 256 packets, at 1000Mbps 32.216 - * 32.217 - * So, worst case is tg3 with 256 1500-bytes packets == 375kB. 32.218 - * This would take 3ms, and represents our worst-case HoL blocking cost. 32.219 - * 32.220 - * We think this is reasonable. 32.221 - */ 32.222 - 32.223 -struct list_head net_schedule_list; 32.224 -spinlock_t net_schedule_list_lock; 32.225 +struct net_device_stats *netif_be_get_stats(struct net_device *dev) 32.226 +{ 32.227 + netif_t *netif = dev->priv; 32.228 + return &netif->stats; 32.229 +} 32.230 32.231 static int __on_net_schedule_list(netif_t *netif) 32.232 { 32.233 @@ -128,7 +219,7 @@ static void add_to_net_schedule_list_tai 32.234 return; 32.235 32.236 spin_lock(&net_schedule_list_lock); 32.237 - if ( likely(!__on_net_schedule_list(netif)) ) 32.238 + if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) ) 32.239 { 32.240 list_add_tail(&netif->list, &net_schedule_list); 32.241 netif_get(netif); 32.242 @@ -136,46 +227,29 @@ static void add_to_net_schedule_list_tai 32.243 spin_unlock(&net_schedule_list_lock); 32.244 } 32.245 32.246 - 32.247 -static void tx_skb_release(struct sk_buff *skb); 32.248 - 32.249 -static inline int init_tx_header(netif_t *netif, u8 *data, 32.250 - unsigned int len, struct net_device *dev) 32.251 +static inline void netif_schedule_work(netif_t *netif) 32.252 { 32.253 - int proto = ntohs(*(unsigned short *)(data + 12)); 32.254 - 32.255 - memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN); 32.256 - 32.257 - switch ( proto ) 32.258 + if ( (netif->tx_req_cons != netif->tx->req_prod) && 32.259 + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) 32.260 { 32.261 - case ETH_P_ARP: 32.262 - if ( len < 42 ) break; 32.263 - memcpy(data + 22, dev->dev_addr, ETH_ALEN); 32.264 - break; 32.265 - case ETH_P_IP: 32.266 - break; 32.267 - default: 32.268 - /* Unsupported protocols are onyl allowed to/from NETIF0/0. 
*/ 32.269 - if ( (netif->domain->domain != 0) || (netif->idx != 0) ) 32.270 - proto = 0; 32.271 - break; 32.272 + add_to_net_schedule_list_tail(netif); 32.273 + maybe_schedule_tx_action(); 32.274 } 32.275 - return proto; 32.276 } 32.277 32.278 +void netif_deschedule(netif_t *netif) 32.279 +{ 32.280 + remove_from_net_schedule_list(netif); 32.281 +} 32.282 32.283 +#if 0 32.284 static void tx_credit_callback(unsigned long data) 32.285 { 32.286 netif_t *netif = (netif_t *)data; 32.287 - 32.288 netif->remaining_credit = netif->credit_bytes; 32.289 - 32.290 - if ( tx_work_exists(netif) ) 32.291 - { 32.292 - add_to_net_schedule_list_tail(netif); 32.293 - maybe_schedule_tx_action(); 32.294 - } 32.295 + netif_schedule_work(netif); 32.296 } 32.297 +#endif 32.298 32.299 static void net_tx_action(unsigned long unused) 32.300 { 32.301 @@ -184,7 +258,9 @@ static void net_tx_action(unsigned long 32.302 netif_t *netif; 32.303 netif_tx_request_t txreq; 32.304 u16 pending_idx; 32.305 + NETIF_RING_IDX i; 32.306 pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED); 32.307 + struct page *page; 32.308 32.309 while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && 32.310 !list_empty(&net_schedule_list) ) 32.311 @@ -197,7 +273,7 @@ static void net_tx_action(unsigned long 32.312 32.313 /* Work to do? */ 32.314 i = netif->tx_req_cons; 32.315 - if ( (i == shared_idxs->tx_req_prod) && 32.316 + if ( (i == netif->tx->req_prod) || 32.317 ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) 32.318 { 32.319 netif_put(netif); 32.320 @@ -232,7 +308,7 @@ static void net_tx_action(unsigned long 32.321 netif->remaining_credit -= tx.size; 32.322 #endif 32.323 32.324 - add_to_net_schedule_list_tail(netif); 32.325 + netif_schedule_work(netif); 32.326 32.327 if ( unlikely(txreq.size <= PKT_PROT_LEN) || 32.328 unlikely(txreq.size > ETH_FRAME_LEN) ) 32.329 @@ -246,7 +322,7 @@ static void net_tx_action(unsigned long 32.330 /* No crossing a page boundary as the payload mustn't fragment. 
*/ 32.331 if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) 32.332 { 32.333 - DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 32.334 + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", 32.335 txreq.addr, txreq.size, 32.336 (txreq.addr &~PAGE_MASK) + txreq.size); 32.337 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); 32.338 @@ -262,102 +338,88 @@ static void net_tx_action(unsigned long 32.339 PAGE_SIZE, prot, netif->domid) != 0 ) 32.340 { 32.341 DPRINTK("Bad page frame\n"); 32.342 - make_tx_response(netif, tx.id, NETIF_RSP_ERROR); 32.343 + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); 32.344 netif_put(netif); 32.345 continue; 32.346 } 32.347 - 32.348 + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = 32.349 + txreq.addr >> PAGE_SHIFT; 32.350 + 32.351 if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) ) 32.352 { 32.353 - make_tx_response(netif, tx.id, BLKIF_RSP_ERROR); 32.354 + DPRINTK("Can't allocate a skb in start_xmit.\n"); 32.355 + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); 32.356 netif_put(netif); 32.357 vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); 32.358 break; 32.359 } 32.360 32.361 - __skb_put(PKT_PROT_LEN); 32.362 - memcpy(skb->data, src, PKT_PROT_LEN); 32.363 - protocol = __constant_htons( 32.364 - init_tx_header(netif, g_data, tx.size, the_dev)); 32.365 - if ( protocol == 0 ) 32.366 - { 32.367 - make_tx_response(netif, tx.id, NETIF_RSP_ERROR); 32.368 - netif_put(netif); 32.369 - dev_kfree_skb(skb); 32.370 - goto cleanup_and_continue; 32.371 - } 32.372 + __skb_put(skb, PKT_PROT_LEN); 32.373 + memcpy(skb->data, 32.374 + (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), 32.375 + PKT_PROT_LEN); 32.376 32.377 - skb->dev = netif->dev; 32.378 - skb->protocol = eth_type_trans(skb, skb->dev); 32.379 - 32.380 + page = virt_to_page(MMAP_VADDR(pending_idx)); 32.381 + 32.382 /* Append the packet payload as a fragment. */ 32.383 - skb_shinfo(skb)->frags[0].page = 32.384 - &mem_map[txreq.addr >> PAGE_SHIFT]; 32.385 + skb_shinfo(skb)->frags[0].page = page; 32.386 skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN; 32.387 skb_shinfo(skb)->frags[0].page_offset = 32.388 (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK; 32.389 skb_shinfo(skb)->nr_frags = 1; 32.390 - skb->data_len = tx->size - PKT_PROT_LEN; 32.391 + skb->data_len = txreq.size - PKT_PROT_LEN; 32.392 skb->len += skb->data_len; 32.393 32.394 + skb->dev = netif->dev; 32.395 + skb->protocol = eth_type_trans(skb, skb->dev); 32.396 + 32.397 /* Destructor information. */ 32.398 - skb->destructor = tx_skb_release; 32.399 - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif; 32.400 - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx; 32.401 + atomic_set(&page->count, 1); 32.402 + page->mapping = (struct address_space *)netif_page_release; 32.403 + pending_id[pending_idx] = txreq.id; 32.404 + pending_netif[pending_idx] = netif; 32.405 32.406 - /* Record the transmission so they can be billed. 
*/ 32.407 - netif->total_packets_sent++; 32.408 - netif->total_bytes_sent += tx->size; 32.409 + netif->stats.tx_bytes += txreq.size; 32.410 + netif->stats.tx_packets++; 32.411 32.412 pending_cons++; 32.413 + 32.414 netif_rx(skb); 32.415 netif->dev->last_rx = jiffies; 32.416 } 32.417 } 32.418 32.419 -DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); 32.420 - 32.421 - 32.422 -static inline void maybe_schedule_tx_action(void) 32.423 +static void netif_page_release(struct page *page) 32.424 { 32.425 - smp_mb(); 32.426 - if ( !netif_queue_stopped(the_dev) && 32.427 - !list_empty(&net_schedule_list) ) 32.428 - tasklet_schedule(&net_tx_tasklet); 32.429 -} 32.430 - 32.431 + unsigned long flags; 32.432 + netif_t *netif; 32.433 + u16 pending_idx; 32.434 32.435 -/* Destructor function for tx skbs. */ 32.436 -static void tx_skb_release(struct sk_buff *skb) 32.437 -{ 32.438 - int i; 32.439 - netif_t *netif = (netif_t)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page; 32.440 - u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size; 32.441 + pending_idx = page - virt_to_page(mmap_vstart); 32.442 + 32.443 + netif = pending_netif[pending_idx]; 32.444 32.445 vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); 32.446 - 32.447 - skb_shinfo(skb)->nr_frags = 0; 32.448 - 32.449 + 32.450 spin_lock(&netif->tx_lock); 32.451 - make_tx_response(netif, skb->guest_id, NETIF_RSP_OKAY); 32.452 + make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY); 32.453 spin_unlock(&netif->tx_lock); 32.454 - 32.455 + 32.456 /* 32.457 - * Checks below must happen after the above response is posted. This avoids 32.458 - * a possible race with a guest OS on another CPU. 32.459 + * Scheduling checks must happen after the above response is posted. 32.460 + * This avoids a possible race with a guest OS on another CPU. 
32.461 */ 32.462 mb(); 32.463 - 32.464 - if ( tx_work_exists(netif) ) 32.465 - { 32.466 - add_to_net_schedule_list_tail(netif); 32.467 - maybe_schedule_tx_action(); 32.468 - } 32.469 - 32.470 + netif_schedule_work(netif); 32.471 + 32.472 netif_put(netif); 32.473 + 32.474 + spin_lock_irqsave(&pend_prod_lock, flags); 32.475 + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; 32.476 + spin_unlock_irqrestore(&pend_prod_lock, flags); 32.477 } 32.478 32.479 - 32.480 +#if 0 32.481 long flush_bufs_for_netif(netif_t *netif) 32.482 { 32.483 NET_RING_IDX i; 32.484 @@ -395,6 +457,7 @@ long flush_bufs_for_netif(netif_t *netif 32.485 32.486 return 0; 32.487 } 32.488 +#endif 32.489 32.490 void netif_be_int(int irq, void *dev_id, struct pt_regs *regs) 32.491 { 32.492 @@ -424,7 +487,6 @@ static void make_tx_response(netif_t *ne 32.493 notify_via_evtchn(netif->evtchn); 32.494 } 32.495 32.496 - 32.497 static void make_rx_response(netif_t *netif, 32.498 u16 id, 32.499 s8 st, 32.500 @@ -448,28 +510,35 @@ static void make_rx_response(netif_t 32.501 notify_via_evtchn(netif->evtchn); 32.502 } 32.503 32.504 - 32.505 static int __init init_module(void) 32.506 { 32.507 + int i; 32.508 + 32.509 + if ( !(start_info.flags & SIF_INITDOMAIN) ) 32.510 + return 0; 32.511 + 32.512 netif_interface_init(); 32.513 32.514 - if ( (mmap_vma = get_vm_area(MAX_PENDING_REQS * PAGE_SIZE, 32.515 - VM_IOREMAP)) == NULL ) 32.516 - { 32.517 - printk(KERN_WARNING "Could not allocate VMA for netif backend.\n"); 32.518 - return -ENOMEM; 32.519 - } 32.520 + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) 32.521 + BUG(); 32.522 + 32.523 + pending_cons = 0; 32.524 + pending_prod = MAX_PENDING_REQS; 32.525 + for ( i = 0; i < MAX_PENDING_REQS; i++ ) 32.526 + pending_ring[i] = i; 32.527 + 32.528 + spin_lock_init(&net_schedule_list_lock); 32.529 + INIT_LIST_HEAD(&net_schedule_list); 32.530 32.531 netif_ctrlif_init(); 32.532 32.533 return 0; 32.534 } 32.535 32.536 - 32.537 static void cleanup_module(void) 32.538 { 32.539 + BUG(); 32.540 } 32.541 32.542 - 32.543 module_init(init_module); 32.544 module_exit(cleanup_module);
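A note on the pending-request bookkeeping introduced above: pending_ring[] is a free list of slot indexes, pending_cons advances when net_tx_action claims a slot and pending_prod advances when netif_page_release returns one, so NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) counts slots currently tied up in in-flight skbs. The claim step itself falls in unshown context lines, so the first line below is a sketch rather than a quote:

    u16 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; /* claim a free slot (sketch) */
    pending_cons++;                                              /* NR_PENDING_REQS: 0 -> 1    */
    /* ... packet handed to netif_rx(); slot stays busy until the page destructor runs ...     */
    pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;   /* release: NR back to 0      */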
33.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c Tue May 11 14:57:44 2004 +0000 33.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c Tue May 11 15:02:26 2004 +0000 33.3 @@ -25,16 +25,18 @@ 33.4 #include <net/sock.h> 33.5 #include <net/pkt_sched.h> 33.6 33.7 +#include <asm/evtchn.h> 33.8 +#include <asm/ctrl_if.h> 33.9 +#include <asm/hypervisor-ifs/dom_mem_ops.h> 33.10 + 33.11 +#include "../netif.h" 33.12 + 33.13 #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ 33.14 33.15 -static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); 33.16 static void network_tx_buf_gc(struct net_device *dev); 33.17 static void network_alloc_rx_buffers(struct net_device *dev); 33.18 static void cleanup_module(void); 33.19 33.20 -/* Dynamically-mapped IRQs. */ 33.21 -static int network_irq, debug_irq; 33.22 - 33.23 static struct list_head dev_list; 33.24 33.25 struct net_private 33.26 @@ -43,26 +45,30 @@ struct net_private 33.27 struct net_device *dev; 33.28 33.29 struct net_device_stats stats; 33.30 - NET_RING_IDX rx_resp_cons, tx_resp_cons; 33.31 - unsigned int net_ring_fixmap_idx, tx_full; 33.32 - net_ring_t *net_ring; 33.33 - net_idx_t *net_idx; 33.34 + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; 33.35 + unsigned int tx_full; 33.36 + 33.37 + netif_tx_interface_t *tx; 33.38 + netif_rx_interface_t *rx; 33.39 + 33.40 spinlock_t tx_lock; 33.41 - unsigned int idx; /* Domain-specific index of this VIF. */ 33.42 33.43 - unsigned int rx_bufs_to_notify; 33.44 + unsigned int handle; 33.45 + unsigned int evtchn; 33.46 + unsigned int irq; 33.47 33.48 -#define STATE_ACTIVE 0 33.49 -#define STATE_SUSPENDED 1 33.50 -#define STATE_CLOSED 2 33.51 +#define NETIF_STATE_CLOSED 0 33.52 +#define NETIF_STATE_DISCONNECTED 1 33.53 +#define NETIF_STATE_CONNECTED 2 33.54 +#define NETIF_STATE_ACTIVE 3 33.55 unsigned int state; 33.56 33.57 /* 33.58 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each 33.59 * array is an index into a chain of free entries. 33.60 */ 33.61 - struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1]; 33.62 - struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1]; 33.63 + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; 33.64 + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; 33.65 }; 33.66 33.67 /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. 
*/ 33.68 @@ -75,86 +81,43 @@ struct net_private 33.69 (unsigned short)_id; }) 33.70 33.71 33.72 -static void _dbg_network_int(struct net_device *dev) 33.73 -{ 33.74 - struct net_private *np = dev->priv; 33.75 - 33.76 - if ( np->state == STATE_CLOSED ) 33.77 - return; 33.78 - 33.79 - printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," 33.80 - " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," 33.81 - " tx_event=0x%08x, state=%d\n", 33.82 - np->tx_full, np->tx_resp_cons, 33.83 - np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, 33.84 - np->net_idx->tx_event, 33.85 - test_bit(__LINK_STATE_XOFF, &dev->state)); 33.86 - printk(KERN_ALERT "net: rx_resp_cons=0x%08x," 33.87 - " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", 33.88 - np->rx_resp_cons, np->net_idx->rx_req_prod, 33.89 - np->net_idx->rx_resp_prod, np->net_idx->rx_event); 33.90 -} 33.91 - 33.92 - 33.93 -static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs) 33.94 +static struct net_device *find_dev_by_handle(unsigned int handle) 33.95 { 33.96 struct list_head *ent; 33.97 struct net_private *np; 33.98 list_for_each ( ent, &dev_list ) 33.99 { 33.100 np = list_entry(ent, struct net_private, list); 33.101 - _dbg_network_int(np->dev); 33.102 + if ( np->handle == handle ) 33.103 + return np->dev; 33.104 } 33.105 + return NULL; 33.106 } 33.107 33.108 33.109 static int network_open(struct net_device *dev) 33.110 { 33.111 struct net_private *np = dev->priv; 33.112 - netop_t netop; 33.113 - int i, ret; 33.114 - 33.115 - netop.cmd = NETOP_RESET_RINGS; 33.116 - netop.vif = np->idx; 33.117 - if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) 33.118 - { 33.119 - printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n"); 33.120 - return ret; 33.121 - } 33.122 + int i; 33.123 33.124 - netop.cmd = NETOP_GET_VIF_INFO; 33.125 - netop.vif = np->idx; 33.126 - if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) 33.127 - { 33.128 - printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx); 33.129 - return ret; 33.130 - } 33.131 + if ( np->state != NETIF_STATE_CONNECTED ) 33.132 + return -EINVAL; 33.133 33.134 - memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); 33.135 - 33.136 - set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx, 33.137 - netop.u.get_vif_info.ring_mfn << PAGE_SHIFT); 33.138 - np->net_ring = (net_ring_t *)fix_to_virt( 33.139 - FIX_NETRING0_BASE + np->net_ring_fixmap_idx); 33.140 - np->net_idx = &HYPERVISOR_shared_info->net_idx[np->idx]; 33.141 - 33.142 - np->rx_bufs_to_notify = 0; 33.143 np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; 33.144 memset(&np->stats, 0, sizeof(np->stats)); 33.145 spin_lock_init(&np->tx_lock); 33.146 - memset(np->net_ring, 0, sizeof(*np->net_ring)); 33.147 - memset(np->net_idx, 0, sizeof(*np->net_idx)); 33.148 33.149 /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/ 33.150 - for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ ) 33.151 + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) 33.152 np->tx_skbs[i] = (void *)(i+1); 33.153 - for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ ) 33.154 + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) 33.155 np->rx_skbs[i] = (void *)(i+1); 33.156 33.157 wmb(); 33.158 - np->state = STATE_ACTIVE; 33.159 + np->state = NETIF_STATE_ACTIVE; 33.160 33.161 network_alloc_rx_buffers(dev); 33.162 + np->rx->event = np->rx_resp_cons + 1; 33.163 33.164 netif_start_queue(dev); 33.165 33.166 @@ -166,18 +129,17 @@ static int network_open(struct net_devic 33.167 33.168 static void network_tx_buf_gc(struct net_device *dev) 33.169 { 33.170 - NET_RING_IDX i, prod; 33.171 + NETIF_RING_IDX i, prod; 33.172 unsigned short id; 33.173 struct net_private *np = dev->priv; 33.174 struct sk_buff *skb; 33.175 - tx_entry_t *tx_ring = np->net_ring->tx_ring; 33.176 33.177 do { 33.178 - prod = np->net_idx->tx_resp_prod; 33.179 + prod = np->tx->resp_prod; 33.180 33.181 for ( i = np->tx_resp_cons; i != prod; i++ ) 33.182 { 33.183 - id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; 33.184 + id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id; 33.185 skb = np->tx_skbs[id]; 33.186 ADD_ID_TO_FREELIST(np->tx_skbs, id); 33.187 dev_kfree_skb_any(skb); 33.188 @@ -193,17 +155,17 @@ static void network_tx_buf_gc(struct net 33.189 * in such cases notification from Xen is likely to be the only kick 33.190 * that we'll get. 33.191 */ 33.192 - np->net_idx->tx_event = 33.193 - prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; 33.194 + np->tx->event = 33.195 + prod + ((np->tx->req_prod - prod) >> 1) + 1; 33.196 mb(); 33.197 } 33.198 - while ( prod != np->net_idx->tx_resp_prod ); 33.199 + while ( prod != np->tx->resp_prod ); 33.200 33.201 if ( np->tx_full && 33.202 - ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) ) 33.203 + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) 33.204 { 33.205 np->tx_full = 0; 33.206 - if ( np->state == STATE_ACTIVE ) 33.207 + if ( np->state == NETIF_STATE_ACTIVE ) 33.208 netif_wake_queue(dev); 33.209 } 33.210 } 33.211 @@ -224,11 +186,15 @@ static void network_alloc_rx_buffers(str 33.212 unsigned short id; 33.213 struct net_private *np = dev->priv; 33.214 struct sk_buff *skb; 33.215 - netop_t netop; 33.216 - NET_RING_IDX i = np->net_idx->rx_req_prod; 33.217 + NETIF_RING_IDX i = np->rx->req_prod; 33.218 + dom_mem_op_t op; 33.219 + unsigned long pfn_array[NETIF_RX_RING_SIZE]; 33.220 + int ret, nr_pfns = 0; 33.221 + pte_t *pte; 33.222 33.223 - if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || 33.224 - unlikely(np->state != STATE_ACTIVE) ) 33.225 + /* Make sure the batch is large enough to be worthwhile (1/2 ring). 
*/ 33.226 + if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 33.227 + unlikely(np->state != NETIF_STATE_ACTIVE) ) 33.228 return; 33.229 33.230 do { 33.231 @@ -244,13 +210,13 @@ static void network_alloc_rx_buffers(str 33.232 id = GET_ID_FROM_FREELIST(np->rx_skbs); 33.233 np->rx_skbs[id] = skb; 33.234 33.235 - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; 33.236 - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 33.237 - virt_to_machine(get_ppte(skb->head)); 33.238 - 33.239 - np->rx_bufs_to_notify++; 33.240 + np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id; 33.241 + 33.242 + pte = get_ppte(skb->head); 33.243 + pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT; 33.244 + queue_l1_entry_update(pte, 0); 33.245 } 33.246 - while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); 33.247 + while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE ); 33.248 33.249 /* 33.250 * We may have allocated buffers which have entries outstanding in the page 33.251 @@ -258,17 +224,16 @@ static void network_alloc_rx_buffers(str 33.252 */ 33.253 flush_page_update_queue(); 33.254 33.255 - np->net_idx->rx_req_prod = i; 33.256 - np->net_idx->rx_event = np->rx_resp_cons + 1; 33.257 - 33.258 - /* Batch Xen notifications. */ 33.259 - if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) ) 33.260 + op.op = MEMOP_RESERVATION_DECREASE; 33.261 + op.u.decrease.size = nr_pfns; 33.262 + op.u.decrease.pages = pfn_array; 33.263 + if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns ) 33.264 { 33.265 - netop.cmd = NETOP_PUSH_BUFFERS; 33.266 - netop.vif = np->idx; 33.267 - (void)HYPERVISOR_net_io_op(&netop); 33.268 - np->rx_bufs_to_notify = 0; 33.269 + printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret); 33.270 + BUG(); 33.271 } 33.272 + 33.273 + np->rx->req_prod = i; 33.274 } 33.275 33.276 33.277 @@ -276,9 +241,8 @@ static int network_start_xmit(struct sk_ 33.278 { 33.279 unsigned short id; 33.280 struct net_private *np = (struct net_private *)dev->priv; 33.281 - tx_req_entry_t *tx; 33.282 - netop_t netop; 33.283 - NET_RING_IDX i; 33.284 + netif_tx_request_t *tx; 33.285 + NETIF_RING_IDX i; 33.286 33.287 if ( unlikely(np->tx_full) ) 33.288 { 33.289 @@ -297,27 +261,27 @@ static int network_start_xmit(struct sk_ 33.290 memcpy(new_skb->data, skb->data, skb->len); 33.291 dev_kfree_skb(skb); 33.292 skb = new_skb; 33.293 - } 33.294 + } 33.295 33.296 spin_lock_irq(&np->tx_lock); 33.297 33.298 - i = np->net_idx->tx_req_prod; 33.299 + i = np->tx->req_prod; 33.300 33.301 id = GET_ID_FROM_FREELIST(np->tx_skbs); 33.302 np->tx_skbs[id] = skb; 33.303 33.304 - tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req; 33.305 + tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req; 33.306 33.307 tx->id = id; 33.308 - tx->addr = phys_to_machine(virt_to_phys(skb->data)); 33.309 + tx->addr = virt_to_machine(skb->data); 33.310 tx->size = skb->len; 33.311 33.312 wmb(); 33.313 - np->net_idx->tx_req_prod = i + 1; 33.314 + np->tx->req_prod = i + 1; 33.315 33.316 network_tx_buf_gc(dev); 33.317 33.318 - if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) ) 33.319 + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) 33.320 { 33.321 np->tx_full = 1; 33.322 netif_stop_queue(dev); 33.323 @@ -330,49 +294,56 @@ static int network_start_xmit(struct sk_ 33.324 33.325 /* Only notify Xen if there are no outstanding responses. 
*/ 33.326 mb(); 33.327 - if ( np->net_idx->tx_resp_prod == i ) 33.328 - { 33.329 - netop.cmd = NETOP_PUSH_BUFFERS; 33.330 - netop.vif = np->idx; 33.331 - (void)HYPERVISOR_net_io_op(&netop); 33.332 - } 33.333 + if ( np->tx->resp_prod == i ) 33.334 + notify_via_evtchn(np->evtchn); 33.335 33.336 return 0; 33.337 } 33.338 33.339 33.340 -static inline void _network_interrupt(struct net_device *dev) 33.341 +static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) 33.342 { 33.343 + struct net_device *dev = dev_id; 33.344 struct net_private *np = dev->priv; 33.345 unsigned long flags; 33.346 struct sk_buff *skb; 33.347 - rx_resp_entry_t *rx; 33.348 - NET_RING_IDX i; 33.349 + netif_rx_response_t *rx; 33.350 + NETIF_RING_IDX i; 33.351 + mmu_update_t mmu[2]; 33.352 + pte_t *pte; 33.353 33.354 - if ( unlikely(np->state == STATE_CLOSED) ) 33.355 - return; 33.356 - 33.357 spin_lock_irqsave(&np->tx_lock, flags); 33.358 network_tx_buf_gc(dev); 33.359 spin_unlock_irqrestore(&np->tx_lock, flags); 33.360 33.361 again: 33.362 - for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ ) 33.363 + for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ ) 33.364 { 33.365 - rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp; 33.366 + rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp; 33.367 33.368 skb = np->rx_skbs[rx->id]; 33.369 ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); 33.370 33.371 - if ( unlikely(rx->status != RING_STATUS_OK) ) 33.372 + if ( unlikely(rx->status <= 0) ) 33.373 { 33.374 /* Gate this error. We get a (valid) slew of them on suspend. */ 33.375 - if ( np->state == STATE_ACTIVE ) 33.376 + if ( np->state == NETIF_STATE_ACTIVE ) 33.377 printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status); 33.378 dev_kfree_skb_any(skb); 33.379 continue; 33.380 } 33.381 33.382 + /* Remap the page. */ 33.383 + pte = get_ppte(skb->head); 33.384 + mmu[0].ptr = virt_to_machine(pte); 33.385 + mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; 33.386 + mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; 33.387 + mmu[1].val = __pa(skb->head) >> PAGE_SHIFT; 33.388 + if ( HYPERVISOR_mmu_update(mmu, 2) != 0 ) 33.389 + BUG(); 33.390 + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 33.391 + rx->addr >> PAGE_SHIFT; 33.392 + 33.393 /* 33.394 * Set up shinfo -- from alloc_skb This was particularily nasty: the 33.395 * shared info is hidden at the back of the data area (presumably so it 33.396 @@ -385,13 +356,13 @@ static inline void _network_interrupt(st 33.397 phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = 33.398 (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; 33.399 33.400 - skb->data = skb->tail = skb->head + rx->offset; 33.401 - skb_put(skb, rx->size); 33.402 + skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK); 33.403 + skb_put(skb, rx->status); 33.404 skb->protocol = eth_type_trans(skb, dev); 33.405 33.406 np->stats.rx_packets++; 33.407 33.408 - np->stats.rx_bytes += rx->size; 33.409 + np->stats.rx_bytes += rx->status; 33.410 netif_rx(skb); 33.411 dev->last_rx = jiffies; 33.412 } 33.413 @@ -399,42 +370,23 @@ static inline void _network_interrupt(st 33.414 np->rx_resp_cons = i; 33.415 33.416 network_alloc_rx_buffers(dev); 33.417 + np->rx->event = np->rx_resp_cons + 1; 33.418 33.419 /* Deal with hypervisor racing our resetting of rx_event. 
*/ 33.420 mb(); 33.421 - if ( np->net_idx->rx_resp_prod != i ) 33.422 + if ( np->rx->resp_prod != i ) 33.423 goto again; 33.424 } 33.425 33.426 33.427 -static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs) 33.428 -{ 33.429 - struct list_head *ent; 33.430 - struct net_private *np; 33.431 - list_for_each ( ent, &dev_list ) 33.432 - { 33.433 - np = list_entry(ent, struct net_private, list); 33.434 - _network_interrupt(np->dev); 33.435 - } 33.436 -} 33.437 - 33.438 - 33.439 static int network_close(struct net_device *dev) 33.440 { 33.441 struct net_private *np = dev->priv; 33.442 - netop_t netop; 33.443 - 33.444 - np->state = STATE_SUSPENDED; 33.445 - wmb(); 33.446 33.447 netif_stop_queue(np->dev); 33.448 33.449 - netop.cmd = NETOP_FLUSH_BUFFERS; 33.450 - netop.vif = np->idx; 33.451 - (void)HYPERVISOR_net_io_op(&netop); 33.452 - 33.453 - while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) || 33.454 - (np->tx_resp_cons != np->net_idx->tx_req_prod) ) 33.455 + while ( (np->rx_resp_cons != np->rx->req_prod) || 33.456 + (np->tx_resp_cons != np->tx->req_prod) ) 33.457 { 33.458 barrier(); 33.459 current->state = TASK_INTERRUPTIBLE; 33.460 @@ -442,12 +394,9 @@ static int network_close(struct net_devi 33.461 } 33.462 33.463 wmb(); 33.464 - np->state = STATE_CLOSED; 33.465 + np->state = NETIF_STATE_CONNECTED; 33.466 wmb(); 33.467 33.468 - /* Now no longer safe to take interrupts for this device. */ 33.469 - clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx); 33.470 - 33.471 MOD_DEC_USE_COUNT; 33.472 33.473 return 0; 33.474 @@ -461,72 +410,164 @@ static struct net_device_stats *network_ 33.475 } 33.476 33.477 33.478 +static void netif_status_change(netif_fe_interface_status_changed_t *status) 33.479 +{ 33.480 + ctrl_msg_t cmsg; 33.481 + netif_fe_interface_connect_t up; 33.482 + struct net_device *dev; 33.483 + struct net_private *np; 33.484 + 33.485 + if ( status->handle != 0 ) 33.486 + { 33.487 + printk(KERN_WARNING "Status change on unsupported netif %d\n", 33.488 + status->handle); 33.489 + return; 33.490 + } 33.491 + 33.492 + dev = find_dev_by_handle(0); 33.493 + np = dev->priv; 33.494 + 33.495 + switch ( status->status ) 33.496 + { 33.497 + case NETIF_INTERFACE_STATUS_DESTROYED: 33.498 + printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n", 33.499 + np->state); 33.500 + break; 33.501 + 33.502 + case NETIF_INTERFACE_STATUS_DISCONNECTED: 33.503 + if ( np->state != NETIF_STATE_CLOSED ) 33.504 + { 33.505 + printk(KERN_WARNING "Unexpected netif-DISCONNECTED message" 33.506 + " in state %d\n", np->state); 33.507 + break; 33.508 + } 33.509 + 33.510 + /* Move from CLOSED to DISCONNECTED state. */ 33.511 + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); 33.512 + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); 33.513 + memset(np->tx, 0, PAGE_SIZE); 33.514 + memset(np->rx, 0, PAGE_SIZE); 33.515 + np->state = NETIF_STATE_DISCONNECTED; 33.516 + 33.517 + /* Construct an interface-CONNECT message for the domain controller. */ 33.518 + cmsg.type = CMSG_NETIF_FE; 33.519 + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; 33.520 + cmsg.length = sizeof(netif_fe_interface_connect_t); 33.521 + up.handle = 0; 33.522 + up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; 33.523 + up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; 33.524 + memcpy(cmsg.msg, &up, sizeof(up)); 33.525 + 33.526 + /* Tell the controller to bring up the interface. 
*/ 33.527 + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 33.528 + break; 33.529 + 33.530 + case NETIF_INTERFACE_STATUS_CONNECTED: 33.531 + if ( np->state == NETIF_STATE_CLOSED ) 33.532 + { 33.533 + printk(KERN_WARNING "Unexpected netif-CONNECTED message" 33.534 + " in state %d\n", np->state); 33.535 + break; 33.536 + } 33.537 + 33.538 + memcpy(dev->dev_addr, status->mac, ETH_ALEN); 33.539 + 33.540 + np->evtchn = status->evtchn; 33.541 + np->irq = bind_evtchn_to_irq(np->evtchn); 33.542 + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 33.543 + dev->name, dev); 33.544 + 33.545 + np->state = NETIF_STATE_CONNECTED; 33.546 + break; 33.547 + 33.548 + default: 33.549 + printk(KERN_WARNING "Status change to unknown value %d\n", 33.550 + status->status); 33.551 + break; 33.552 + } 33.553 +} 33.554 + 33.555 + 33.556 +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) 33.557 +{ 33.558 + switch ( msg->subtype ) 33.559 + { 33.560 + case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED: 33.561 + if ( msg->length != sizeof(netif_fe_interface_status_changed_t) ) 33.562 + goto parse_error; 33.563 + netif_status_change((netif_fe_interface_status_changed_t *) 33.564 + &msg->msg[0]); 33.565 + break; 33.566 + default: 33.567 + goto parse_error; 33.568 + } 33.569 + 33.570 + ctrl_if_send_response(msg); 33.571 + return; 33.572 + 33.573 + parse_error: 33.574 + msg->length = 0; 33.575 + ctrl_if_send_response(msg); 33.576 +} 33.577 + 33.578 + 33.579 static int __init init_module(void) 33.580 { 33.581 -#if 0 33.582 - int i, fixmap_idx=-1, err; 33.583 + ctrl_msg_t cmsg; 33.584 + netif_fe_driver_status_changed_t st; 33.585 + int err; 33.586 struct net_device *dev; 33.587 struct net_private *np; 33.588 - netop_t netop; 33.589 + 33.590 + if ( start_info.flags & SIF_INITDOMAIN ) 33.591 + return 0; 33.592 33.593 INIT_LIST_HEAD(&dev_list); 33.594 33.595 - network_irq = bind_virq_to_irq(VIRQ_NET); 33.596 - debug_irq = bind_virq_to_irq(VIRQ_DEBUG); 33.597 + if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) 33.598 + { 33.599 + err = -ENOMEM; 33.600 + goto fail; 33.601 + } 33.602 33.603 - err = request_irq(network_irq, network_interrupt, 33.604 - SA_SAMPLE_RANDOM, "network", NULL); 33.605 - if ( err ) 33.606 + np = dev->priv; 33.607 + np->state = NETIF_STATE_CLOSED; 33.608 + np->handle = 0; 33.609 + 33.610 + dev->open = network_open; 33.611 + dev->hard_start_xmit = network_start_xmit; 33.612 + dev->stop = network_close; 33.613 + dev->get_stats = network_get_stats; 33.614 + 33.615 + if ( (err = register_netdev(dev)) != 0 ) 33.616 { 33.617 - printk(KERN_WARNING "Could not allocate network interrupt\n"); 33.618 + kfree(dev); 33.619 goto fail; 33.620 } 33.621 33.622 - err = request_irq(debug_irq, dbg_network_int, 33.623 - SA_SHIRQ, "net_dbg", &dbg_network_int); 33.624 - if ( err ) 33.625 - printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); 33.626 + np->dev = dev; 33.627 + list_add(&np->list, &dev_list); 33.628 33.629 - for ( i = 0; i < MAX_DOMAIN_VIFS; i++ ) 33.630 - { 33.631 - /* If the VIF is invalid then the query hypercall will fail. */ 33.632 - netop.cmd = NETOP_GET_VIF_INFO; 33.633 - netop.vif = i; 33.634 - if ( HYPERVISOR_net_io_op(&netop) != 0 ) 33.635 - continue; 33.636 - 33.637 - /* We actually only support up to 4 vifs right now. 
*/ 33.638 - if ( ++fixmap_idx == 4 ) 33.639 - break; 33.640 + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, 33.641 + CALLBACK_IN_BLOCKING_CONTEXT); 33.642 33.643 - dev = alloc_etherdev(sizeof(struct net_private)); 33.644 - if ( dev == NULL ) 33.645 - { 33.646 - err = -ENOMEM; 33.647 - goto fail; 33.648 - } 33.649 - 33.650 - np = dev->priv; 33.651 - np->state = STATE_CLOSED; 33.652 - np->net_ring_fixmap_idx = fixmap_idx; 33.653 - np->idx = i; 33.654 + /* Send a driver-UP notification to the domain controller. */ 33.655 + cmsg.type = CMSG_NETIF_FE; 33.656 + cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED; 33.657 + cmsg.length = sizeof(netif_fe_driver_status_changed_t); 33.658 + st.status = NETIF_DRIVER_STATUS_UP; 33.659 + memcpy(cmsg.msg, &st, sizeof(st)); 33.660 + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 33.661 33.662 - SET_MODULE_OWNER(dev); 33.663 - dev->open = network_open; 33.664 - dev->hard_start_xmit = network_start_xmit; 33.665 - dev->stop = network_close; 33.666 - dev->get_stats = network_get_stats; 33.667 - 33.668 - memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); 33.669 - 33.670 - if ( (err = register_netdev(dev)) != 0 ) 33.671 - { 33.672 - kfree(dev); 33.673 - goto fail; 33.674 - } 33.675 - 33.676 - np->dev = dev; 33.677 - list_add(&np->list, &dev_list); 33.678 + /* 33.679 + * We should read 'nr_interfaces' from response message and wait 33.680 + * for notifications before proceeding. For now we assume that we 33.681 + * will be notified of exactly one interface. 33.682 + */ 33.683 + while ( np->state != NETIF_STATE_CONNECTED ) 33.684 + { 33.685 + set_current_state(TASK_INTERRUPTIBLE); 33.686 + schedule_timeout(1); 33.687 } 33.688 33.689 return 0; 33.690 @@ -534,30 +575,13 @@ static int __init init_module(void) 33.691 fail: 33.692 cleanup_module(); 33.693 return err; 33.694 -#endif 33.695 - return 0; 33.696 } 33.697 33.698 33.699 static void cleanup_module(void) 33.700 { 33.701 - struct net_private *np; 33.702 - struct net_device *dev; 33.703 - 33.704 - while ( !list_empty(&dev_list) ) 33.705 - { 33.706 - np = list_entry(dev_list.next, struct net_private, list); 33.707 - list_del(&np->list); 33.708 - dev = np->dev; 33.709 - unregister_netdev(dev); 33.710 - kfree(dev); 33.711 - } 33.712 - 33.713 - free_irq(network_irq, NULL); 33.714 - free_irq(debug_irq, NULL); 33.715 - 33.716 - unbind_virq_from_irq(VIRQ_NET); 33.717 - unbind_virq_from_irq(VIRQ_DEBUG); 33.718 + /* XXX FIXME */ 33.719 + BUG(); 33.720 } 33.721 33.722
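The rewritten frontend above no longer issues NETOP_* hypercalls or binds VIRQ_NET; it negotiates with the backend through the domain-controller interface instead. The fragment below is a condensed sketch of that handshake, assembled from the init_module() and netif_status_change() paths in the hunk above; the structures, constants and calls are the ones the patch uses, only the ordering is summarised here.

/* Sketch: the frontend/controller handshake driven by the code above.      */

/* 1. Driver announces itself to the domain controller.                     */
cmsg.type    = CMSG_NETIF_FE;
cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
cmsg.length  = sizeof(netif_fe_driver_status_changed_t);
st.status    = NETIF_DRIVER_STATUS_UP;
memcpy(cmsg.msg, &st, sizeof(st));
ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);

/* 2. On the resulting STATUS_DISCONNECTED notification the frontend        */
/*    allocates the shared tx/rx ring pages and offers their machine        */
/*    frames back in an interface-CONNECT message.                          */
up.handle         = 0;
up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;

/* 3. On STATUS_CONNECTED the controller supplies the MAC address and an    */
/*    event channel, which replaces the old VIRQ_NET interrupt.             */
np->evtchn = status->evtchn;
np->irq    = bind_evtchn_to_irq(np->evtchn);
(void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev);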
34.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Tue May 11 14:57:44 2004 +0000 34.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Tue May 11 15:02:26 2004 +0000 34.3 @@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_acti 34.4 static CONTROL_RING_IDX ctrl_if_tx_resp_cons; 34.5 static CONTROL_RING_IDX ctrl_if_rx_req_cons; 34.6 34.7 -/* Incoming message requests: primary message type -> message handler. */ 34.8 +/* Incoming message requests. */ 34.9 + /* Primary message type -> message handler. */ 34.10 static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256]; 34.11 + /* Primary message type -> callback in process context? */ 34.12 +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)]; 34.13 + /* Is it late enough during bootstrap to use schedule_task()? */ 34.14 +static int safe_to_schedule_task; 34.15 + /* Passed to schedule_task(). */ 34.16 +static struct tq_struct ctrl_if_rxmsg_deferred_tq; 34.17 + /* Queue up messages to be handled in process context. */ 34.18 +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE]; 34.19 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod; 34.20 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons; 34.21 34.22 /* Incoming message responses: message identifier -> message handler/id. */ 34.23 static struct { 34.24 @@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigne 34.25 } 34.26 } 34.27 34.28 +static void __ctrl_if_rxmsg_deferred(void *unused) 34.29 +{ 34.30 + ctrl_msg_t *msg; 34.31 + 34.32 + while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod ) 34.33 + { 34.34 + msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( 34.35 + ctrl_if_rxmsg_deferred_cons++)]; 34.36 + (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); 34.37 + } 34.38 +} 34.39 + 34.40 static void __ctrl_if_rx_tasklet(unsigned long data) 34.41 { 34.42 control_if_t *ctrl_if = get_ctrl_if(); 34.43 - ctrl_msg_t *msg; 34.44 + ctrl_msg_t msg, *pmsg; 34.45 34.46 while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod ) 34.47 { 34.48 - /* 34.49 - * We need no locking or barriers here. There will be one and only one 34.50 - * response as a result of each callback, so the callback handler 34.51 - * doesn't need to worry about the 'msg' being overwritten until: 34.52 - * 1. It returns (if the message must persist then it must be copied). 34.53 - * 2. A response is sent (the response may overwrite the request). 
34.54 - */ 34.55 - msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; 34.56 - (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); 34.57 + pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; 34.58 + memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg)); 34.59 + if ( msg.length != 0 ) 34.60 + memcpy(msg.msg, pmsg->msg, msg.length); 34.61 + if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) ) 34.62 + { 34.63 + pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( 34.64 + ctrl_if_rxmsg_deferred_prod++)]; 34.65 + memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length); 34.66 + schedule_task(&ctrl_if_rxmsg_deferred_tq); 34.67 + } 34.68 + else 34.69 + { 34.70 + (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0); 34.71 + } 34.72 } 34.73 } 34.74 34.75 @@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *m 34.76 ctrl_if_notify_controller(); 34.77 } 34.78 34.79 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd) 34.80 +int ctrl_if_register_receiver( 34.81 + u8 type, 34.82 + ctrl_msg_handler_t hnd, 34.83 + unsigned int flags) 34.84 { 34.85 - unsigned long flags; 34.86 + unsigned long _flags; 34.87 int inuse; 34.88 34.89 - spin_lock_irqsave(&ctrl_if_lock, flags); 34.90 + spin_lock_irqsave(&ctrl_if_lock, _flags); 34.91 34.92 inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler); 34.93 34.94 if ( inuse ) 34.95 + { 34.96 printk(KERN_INFO "Receiver %p already established for control " 34.97 "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type); 34.98 + } 34.99 else 34.100 + { 34.101 ctrl_if_rxmsg_handler[type] = hnd; 34.102 + clear_bit(type, &ctrl_if_rxmsg_blocking_context); 34.103 + if ( flags == CALLBACK_IN_BLOCKING_CONTEXT ) 34.104 + { 34.105 + set_bit(type, &ctrl_if_rxmsg_blocking_context); 34.106 + if ( !safe_to_schedule_task ) 34.107 + BUG(); 34.108 + } 34.109 + } 34.110 34.111 - spin_unlock_irqrestore(&ctrl_if_lock, flags); 34.112 + spin_unlock_irqrestore(&ctrl_if_lock, _flags); 34.113 34.114 return !inuse; 34.115 } 34.116 @@ -326,6 +369,7 @@ void __init ctrl_if_init(void) 34.117 34.118 for ( i = 0; i < 256; i++ ) 34.119 ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler; 34.120 + ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred; 34.121 34.122 spin_lock_init(&ctrl_if_lock); 34.123 34.124 @@ -333,6 +377,15 @@ void __init ctrl_if_init(void) 34.125 } 34.126 34.127 34.128 +/* This is called after it is safe to call schedule_task(). */ 34.129 +static int __init ctrl_if_late_setup(void) 34.130 +{ 34.131 + safe_to_schedule_task = 1; 34.132 + return 0; 34.133 +} 34.134 +__initcall(ctrl_if_late_setup); 34.135 + 34.136 + 34.137 /* 34.138 * !! The following are DANGEROUS FUNCTIONS !! 34.139 * Use with care [for example, see xencons_force_flush()].
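The rx path above now copies each request out of the shared ring before dispatching it, because handlers registered with CALLBACK_IN_BLOCKING_CONTEXT run later, from schedule_task(), by which time the ring slot may have been reused. The deferred queue relies on free-running producer/consumer indices that are only masked when used as array indices. The standalone program below illustrates that indexing discipline; RING_SIZE and MASK_IDX() are assumptions standing in for the real CONTROL_RING_SIZE and MASK_CONTROL_IDX() definitions in hypervisor-ifs.

#include <stdio.h>

#define RING_SIZE   8                      /* assumed power of two             */
#define MASK_IDX(i) ((i) & (RING_SIZE-1))  /* stand-in for MASK_CONTROL_IDX()  */

int main(void)
{
    unsigned int prod = 0, cons = 0;       /* free-running, never wrapped      */
    int ring[RING_SIZE];

    for (int msg = 0; msg < 12; msg++) {
        ring[MASK_IDX(prod++)] = msg;      /* tasklet side: queue a copied msg */
        if (prod - cons == 3) {            /* process context: drain later     */
            while (cons != prod)
                printf("deferred handler ran for message %d\n",
                       ring[MASK_IDX(cons++)]);
        }
    }
    return 0;
}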
35.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c Tue May 11 14:57:44 2004 +0000 35.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c Tue May 11 15:02:26 2004 +0000 35.3 @@ -8,7 +8,10 @@ 35.4 35.5 #include <linux/config.h> 35.6 #include <linux/sched.h> 35.7 +#include <linux/mm.h> 35.8 +#include <linux/vmalloc.h> 35.9 #include <asm/hypervisor.h> 35.10 +#include <asm/hypervisor-ifs/dom_mem_ops.h> 35.11 #include <asm/page.h> 35.12 #include <asm/pgtable.h> 35.13 #include <asm/multicall.h> 35.14 @@ -244,3 +247,105 @@ void queue_set_ldt(unsigned long ptr, un 35.15 increment_index(); 35.16 spin_unlock_irqrestore(&update_lock, flags); 35.17 } 35.18 + 35.19 +void queue_machphys_update(unsigned long mfn, unsigned long pfn) 35.20 +{ 35.21 + unsigned long flags; 35.22 + spin_lock_irqsave(&update_lock, flags); 35.23 + update_queue[idx].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; 35.24 + update_queue[idx].val = pfn; 35.25 + increment_index(); 35.26 + spin_unlock_irqrestore(&update_lock, flags); 35.27 +} 35.28 + 35.29 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS 35.30 + 35.31 +unsigned long allocate_empty_lowmem_region(unsigned long pages) 35.32 +{ 35.33 + pgd_t *pgd; 35.34 + pmd_t *pmd; 35.35 + pte_t *pte; 35.36 + unsigned long *pfn_array; 35.37 + unsigned long vstart; 35.38 + unsigned long i; 35.39 + int ret; 35.40 + unsigned int order = get_order(pages*PAGE_SIZE); 35.41 + dom_mem_op_t dom_mem_op; 35.42 + 35.43 + vstart = __get_free_pages(GFP_KERNEL, order); 35.44 + if ( vstart == 0 ) 35.45 + return 0UL; 35.46 + 35.47 + pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); 35.48 + if ( pfn_array == NULL ) 35.49 + BUG(); 35.50 + 35.51 + for ( i = 0; i < (1<<order); i++ ) 35.52 + { 35.53 + pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); 35.54 + pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE))); 35.55 + pte = pte_offset(pmd, (vstart + (i*PAGE_SIZE))); 35.56 + pfn_array[i] = pte->pte_low >> PAGE_SHIFT; 35.57 + queue_l1_entry_update(pte, 0); 35.58 + phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef; 35.59 + } 35.60 + 35.61 + flush_page_update_queue(); 35.62 + 35.63 + dom_mem_op.op = MEMOP_RESERVATION_DECREASE; 35.64 + dom_mem_op.u.decrease.size = 1<<order; 35.65 + dom_mem_op.u.decrease.pages = pfn_array; 35.66 + if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) ) 35.67 + { 35.68 + printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret); 35.69 + BUG(); 35.70 + } 35.71 + 35.72 + vfree(pfn_array); 35.73 + 35.74 + return vstart; 35.75 +} 35.76 + 35.77 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages) 35.78 +{ 35.79 + pgd_t *pgd; 35.80 + pmd_t *pmd; 35.81 + pte_t *pte; 35.82 + unsigned long *pfn_array; 35.83 + unsigned long i; 35.84 + int ret; 35.85 + unsigned int order = get_order(pages*PAGE_SIZE); 35.86 + dom_mem_op_t dom_mem_op; 35.87 + 35.88 + pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); 35.89 + if ( pfn_array == NULL ) 35.90 + BUG(); 35.91 + 35.92 + dom_mem_op.op = MEMOP_RESERVATION_INCREASE; 35.93 + dom_mem_op.u.increase.size = 1<<order; 35.94 + dom_mem_op.u.increase.pages = pfn_array; 35.95 + if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) ) 35.96 + { 35.97 + printk(KERN_WARNING "Unable to increase memory reservation (%d)\n", 35.98 + ret); 35.99 + BUG(); 35.100 + } 35.101 + 35.102 + for ( i = 0; i < (1<<order); i++ ) 35.103 + { 35.104 + pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); 35.105 + pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE))); 35.106 + pte = pte_offset(pmd, (vstart + (i*PAGE_SIZE))); 35.107 + 
queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL); 35.108 + queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT); 35.109 + phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i]; 35.110 + } 35.111 + 35.112 + flush_page_update_queue(); 35.113 + 35.114 + vfree(pfn_array); 35.115 + 35.116 + free_pages(vstart, order); 35.117 +} 35.118 + 35.119 +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
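allocate_empty_lowmem_region() leaves a machine-backed hole: it grabs a 2^order page block, zaps each PTE, records the underlying machine frames and hands them back to Xen with a MEMOP_RESERVATION_DECREASE; deallocate_lowmem_region() reverses every step. A minimal usage sketch, assuming a CONFIG_XEN_PHYSDEV_ACCESS kernel and a caller that wants a 16-page window for temporarily mapping foreign machine frames (error handling and the foreign mappings themselves are omitted):

unsigned long vstart = allocate_empty_lowmem_region(16);  /* rounded to 2^order pages */
if ( vstart == 0 )
    return -ENOMEM;

/* ... install mappings of another domain's machine frames into              */
/* [vstart, vstart + 16*PAGE_SIZE) and use them ...                          */

deallocate_lowmem_region(vstart, 16);  /* repopulate with fresh frames, then free */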
36.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c Tue May 11 14:57:44 2004 +0000 36.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c Tue May 11 15:02:26 2004 +0000 36.3 @@ -202,10 +202,6 @@ void * __ioremap(unsigned long machine_a 36.4 unsigned long offset, last_addr; 36.5 pgprot_t prot; 36.6 36.7 - /* Only privileged Xenolinux can make unchecked pagetable updates. */ 36.8 - if ( !(start_info.flags & SIF_PRIVILEGED) ) 36.9 - return NULL; 36.10 - 36.11 /* Don't allow wraparound or zero size */ 36.12 last_addr = machine_addr + size - 1; 36.13 if (!size || last_addr < machine_addr)
37.1 --- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c Tue May 11 14:57:44 2004 +0000 37.2 +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c Tue May 11 15:02:26 2004 +0000 37.3 @@ -1626,7 +1626,7 @@ int __init blk_dev_init(void) 37.4 jsfd_init(); 37.5 #endif 37.6 37.7 -#ifdef CONFIG_XEN_VBD 37.8 +#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO) 37.9 xlblk_init(); 37.10 #endif 37.11
38.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Tue May 11 14:57:44 2004 +0000 38.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Tue May 11 15:02:26 2004 +0000 38.3 @@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *m 38.4 * Register a receiver for typed messages from the domain controller. The 38.5 * handler (@hnd) is called for every received message of specified @type. 38.6 * Returns TRUE (non-zero) if the handler was successfully registered. 38.7 + * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will 38.8 + * occur in a context in which it is safe to yield (i.e., process context). 38.9 */ 38.10 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd); 38.11 +#define CALLBACK_IN_BLOCKING_CONTEXT 1 38.12 +int ctrl_if_register_receiver( 38.13 + u8 type, 38.14 + ctrl_msg_handler_t hnd, 38.15 + unsigned int flags); 38.16 38.17 /* 38.18 * Unregister a receiver for typed messages from the domain controller. The
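A receiver registered under the new three-argument signature is expected to consume the request and generate exactly one response (compare netif_ctrlif_rx() in the netfront hunk above). A minimal sketch for a hypothetical message class CMSG_MYDRIVER, whose name is illustrative only:

static void mydriver_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    /* ... decode msg->subtype / msg->msg; may block, see the flag below ... */
    ctrl_if_send_response(msg);            /* one response per request        */
}

/* Registration; a non-zero return means the type was free and is now ours.  */
if ( !ctrl_if_register_receiver(CMSG_MYDRIVER, mydriver_ctrlif_rx,
                                CALLBACK_IN_BLOCKING_CONTEXT) )
    printk(KERN_WARNING "CMSG_MYDRIVER already has a receiver\n");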
39.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h Tue May 11 14:57:44 2004 +0000 39.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h Tue May 11 15:02:26 2004 +0000 39.3 @@ -44,6 +44,7 @@ void queue_pgd_unpin(unsigned long ptr); 39.4 void queue_pte_pin(unsigned long ptr); 39.5 void queue_pte_unpin(unsigned long ptr); 39.6 void queue_set_ldt(unsigned long ptr, unsigned long bytes); 39.7 +void queue_machphys_update(unsigned long mfn, unsigned long pfn); 39.8 #define MMU_UPDATE_DEBUG 0 39.9 39.10 #if MMU_UPDATE_DEBUG > 0 39.11 @@ -137,6 +138,12 @@ static inline int flush_page_update_queu 39.12 #define XEN_flush_page_update_queue() (_flush_page_update_queue()) 39.13 void MULTICALL_flush_page_update_queue(void); 39.14 39.15 +#ifdef CONFIG_XEN_PHYSDEV_ACCESS 39.16 +/* Allocate a contiguous empty region of low memory. Return virtual start. */ 39.17 +unsigned long allocate_empty_lowmem_region(unsigned long pages); 39.18 +/* Deallocate a contiguous region of low memory. Return it to the allocator. */ 39.19 +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages); 39.20 +#endif 39.21 39.22 /* 39.23 * Assembler stubs for hyper-calls.
40.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h Tue May 11 14:57:44 2004 +0000 40.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h Tue May 11 15:02:26 2004 +0000 40.3 @@ -159,46 +159,11 @@ extern void iounmap(void *addr); 40.4 extern void *bt_ioremap(unsigned long offset, unsigned long size); 40.5 extern void bt_iounmap(void *addr, unsigned long size); 40.6 40.7 -#ifdef CONFIG_XEN_PHYSDEV_ACCESS 40.8 - 40.9 -#ifdef CONFIG_HIGHMEM 40.10 -#error "Highmem is not yet compatible with physical device access" 40.11 -#endif 40.12 - 40.13 -/* 40.14 - * The bus translation macros need special care if we are executing device 40.15 - * accesses to/from other domains' memory. In these cases the virtual address 40.16 - * is actually a temporary mapping in the 'vmalloc' space. The physical 40.17 - * address will therefore be >max_low_pfn, and will not have a valid entry 40.18 - * in the phys_to_mach mapping table. 40.19 - */ 40.20 -static inline unsigned long phys_to_bus(unsigned long phys) 40.21 -{ 40.22 - extern unsigned long max_pfn; 40.23 - pgd_t *pgd; pmd_t *pmd; pte_t *pte; 40.24 - void *addr; 40.25 - unsigned long bus; 40.26 - if ( (phys >> PAGE_SHIFT) < max_pfn ) 40.27 - return phys_to_machine(phys); 40.28 - addr = phys_to_virt(phys); 40.29 - pgd = pgd_offset_k( (unsigned long)addr); 40.30 - pmd = pmd_offset(pgd, (unsigned long)addr); 40.31 - pte = pte_offset(pmd, (unsigned long)addr); 40.32 - bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK); 40.33 - return bus; 40.34 -} 40.35 - 40.36 -#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x)) 40.37 -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) 40.38 -#define page_to_bus(_x) phys_to_bus(page_to_phys(_x)) 40.39 - 40.40 -#else 40.41 - 40.42 #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x)) 40.43 #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) 40.44 #define page_to_bus(_x) phys_to_machine(page_to_phys(_x)) 40.45 - 40.46 -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ 40.47 +#define bus_to_phys(_x) machine_to_phys(_x) 40.48 +#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT)) 40.49 40.50 /* 40.51 * readX/writeX() are used to access memory mapped devices. On some
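With the special-cased foreign-memory path removed, a bus (DMA) address is simply the machine address of the buffer, and the new bus_to_phys/bus_to_page macros give the inverse mapping. The fragment below only restates what the macros expand to, assuming an ordinary directly-mapped kernel buffer:

char         buf[256];                      /* any directly-mapped kernel buffer   */
dma_addr_t   bus = virt_to_bus(buf);        /* phys_to_machine(virt_to_phys(buf))  */

/* Inverse direction, e.g. when only the bus address was recorded:                 */
void        *same_buf  = bus_to_virt(bus);  /* back to the kernel-virtual address  */
struct page *same_page = bus_to_page(bus);  /* the corresponding mem_map entry     */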
41.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 41.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h Tue May 11 15:02:26 2004 +0000 41.3 @@ -0,0 +1,283 @@ 41.4 +#ifndef __i386_PCI_H 41.5 +#define __i386_PCI_H 41.6 + 41.7 +#include <linux/config.h> 41.8 + 41.9 +#ifdef __KERNEL__ 41.10 + 41.11 +/* Can be used to override the logic in pci_scan_bus for skipping 41.12 + already-configured bus numbers - to be used for buggy BIOSes 41.13 + or architectures with incomplete PCI setup by the loader */ 41.14 + 41.15 +#ifdef CONFIG_PCI 41.16 +extern unsigned int pcibios_assign_all_busses(void); 41.17 +#else 41.18 +#define pcibios_assign_all_busses() 0 41.19 +#endif 41.20 +#define pcibios_scan_all_fns() 0 41.21 + 41.22 +extern unsigned long pci_mem_start; 41.23 +#define PCIBIOS_MIN_IO 0x1000 41.24 +#define PCIBIOS_MIN_MEM (pci_mem_start) 41.25 + 41.26 +void pcibios_config_init(void); 41.27 +struct pci_bus * pcibios_scan_root(int bus); 41.28 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); 41.29 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); 41.30 + 41.31 +void pcibios_set_master(struct pci_dev *dev); 41.32 +void pcibios_penalize_isa_irq(int irq); 41.33 +struct irq_routing_table *pcibios_get_irq_routing_table(void); 41.34 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); 41.35 + 41.36 +/* Dynamic DMA mapping stuff. 41.37 + * i386 has everything mapped statically. 41.38 + */ 41.39 + 41.40 +#include <linux/types.h> 41.41 +#include <linux/slab.h> 41.42 +#include <asm/scatterlist.h> 41.43 +#include <linux/string.h> 41.44 +#include <asm/io.h> 41.45 + 41.46 +struct pci_dev; 41.47 + 41.48 +/* The networking and block device layers use this boolean for bounce 41.49 + * buffer decisions. 41.50 + */ 41.51 +#define PCI_DMA_BUS_IS_PHYS (0) 41.52 + 41.53 +/* Allocate and map kernel buffer using consistent mode DMA for a device. 41.54 + * hwdev should be valid struct pci_dev pointer for PCI devices, 41.55 + * NULL for PCI-like buses (ISA, EISA). 41.56 + * Returns non-NULL cpu-view pointer to the buffer if successful and 41.57 + * sets *dma_addrp to the pci side dma address as well, else *dma_addrp 41.58 + * is undefined. 41.59 + */ 41.60 +extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, 41.61 + dma_addr_t *dma_handle); 41.62 + 41.63 +/* Free and unmap a consistent DMA buffer. 41.64 + * cpu_addr is what was returned from pci_alloc_consistent, 41.65 + * size must be the same as what as passed into pci_alloc_consistent, 41.66 + * and likewise dma_addr must be the same as what *dma_addrp was set to. 41.67 + * 41.68 + * References to the memory and mappings associated with cpu_addr/dma_addr 41.69 + * past this call are illegal. 41.70 + */ 41.71 +extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, 41.72 + void *vaddr, dma_addr_t dma_handle); 41.73 + 41.74 +/* Map a single buffer of the indicated size for DMA in streaming mode. 41.75 + * The 32-bit bus address to use is returned. 41.76 + * 41.77 + * Once the device is given the dma address, the device owns this memory 41.78 + * until either pci_unmap_single or pci_dma_sync_single is performed. 
41.79 + */ 41.80 +static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, 41.81 + size_t size, int direction) 41.82 +{ 41.83 + if (direction == PCI_DMA_NONE) 41.84 + out_of_line_bug(); 41.85 + flush_write_buffers(); 41.86 + return virt_to_bus(ptr); 41.87 +} 41.88 + 41.89 +/* Unmap a single streaming mode DMA translation. The dma_addr and size 41.90 + * must match what was provided for in a previous pci_map_single call. All 41.91 + * other usages are undefined. 41.92 + * 41.93 + * After this call, reads by the cpu to the buffer are guarenteed to see 41.94 + * whatever the device wrote there. 41.95 + */ 41.96 +static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, 41.97 + size_t size, int direction) 41.98 +{ 41.99 + if (direction == PCI_DMA_NONE) 41.100 + out_of_line_bug(); 41.101 + /* Nothing to do */ 41.102 +} 41.103 + 41.104 +/* 41.105 + * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. identical 41.106 + * to pci_map_single, but takes a struct page instead of a virtual address 41.107 + */ 41.108 +static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, 41.109 + unsigned long offset, size_t size, int direction) 41.110 +{ 41.111 + if (direction == PCI_DMA_NONE) 41.112 + out_of_line_bug(); 41.113 + 41.114 + return page_to_bus(page) + offset; 41.115 +} 41.116 + 41.117 +static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, 41.118 + size_t size, int direction) 41.119 +{ 41.120 + if (direction == PCI_DMA_NONE) 41.121 + out_of_line_bug(); 41.122 + /* Nothing to do */ 41.123 +} 41.124 + 41.125 +/* pci_unmap_{page,single} is a nop so... */ 41.126 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) 41.127 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) 41.128 +#define pci_unmap_addr(PTR, ADDR_NAME) (0) 41.129 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) 41.130 +#define pci_unmap_len(PTR, LEN_NAME) (0) 41.131 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) 41.132 + 41.133 +/* Map a set of buffers described by scatterlist in streaming 41.134 + * mode for DMA. This is the scather-gather version of the 41.135 + * above pci_map_single interface. Here the scatter gather list 41.136 + * elements are each tagged with the appropriate dma address 41.137 + * and length. They are obtained via sg_dma_{address,length}(SG). 41.138 + * 41.139 + * NOTE: An implementation may be able to use a smaller number of 41.140 + * DMA address/length pairs than there are SG table elements. 41.141 + * (for example via virtual mapping capabilities) 41.142 + * The routine returns the number of addr/length pairs actually 41.143 + * used, at most nents. 41.144 + * 41.145 + * Device ownership issues as mentioned above for pci_map_single are 41.146 + * the same here. 
41.147 + */ 41.148 +static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, 41.149 + int nents, int direction) 41.150 +{ 41.151 + int i; 41.152 + 41.153 + if (direction == PCI_DMA_NONE) 41.154 + out_of_line_bug(); 41.155 + 41.156 + /* 41.157 + * temporary 2.4 hack 41.158 + */ 41.159 + for (i = 0; i < nents; i++ ) { 41.160 + if (sg[i].address && sg[i].page) 41.161 + out_of_line_bug(); 41.162 + else if (!sg[i].address && !sg[i].page) 41.163 + out_of_line_bug(); 41.164 + 41.165 + if (sg[i].address) 41.166 + sg[i].dma_address = virt_to_bus(sg[i].address); 41.167 + else 41.168 + sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset; 41.169 + } 41.170 + 41.171 + flush_write_buffers(); 41.172 + return nents; 41.173 +} 41.174 + 41.175 +/* Unmap a set of streaming mode DMA translations. 41.176 + * Again, cpu read rules concerning calls here are the same as for 41.177 + * pci_unmap_single() above. 41.178 + */ 41.179 +static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, 41.180 + int nents, int direction) 41.181 +{ 41.182 + if (direction == PCI_DMA_NONE) 41.183 + out_of_line_bug(); 41.184 + /* Nothing to do */ 41.185 +} 41.186 + 41.187 +/* Make physical memory consistent for a single 41.188 + * streaming mode DMA translation after a transfer. 41.189 + * 41.190 + * If you perform a pci_map_single() but wish to interrogate the 41.191 + * buffer using the cpu, yet do not wish to teardown the PCI dma 41.192 + * mapping, you must call this function before doing so. At the 41.193 + * next point you give the PCI dma address back to the card, the 41.194 + * device again owns the buffer. 41.195 + */ 41.196 +static inline void pci_dma_sync_single(struct pci_dev *hwdev, 41.197 + dma_addr_t dma_handle, 41.198 + size_t size, int direction) 41.199 +{ 41.200 + if (direction == PCI_DMA_NONE) 41.201 + out_of_line_bug(); 41.202 + flush_write_buffers(); 41.203 +} 41.204 + 41.205 +/* Make physical memory consistent for a set of streaming 41.206 + * mode DMA translations after a transfer. 41.207 + * 41.208 + * The same as pci_dma_sync_single but for a scatter-gather list, 41.209 + * same rules and usage. 41.210 + */ 41.211 +static inline void pci_dma_sync_sg(struct pci_dev *hwdev, 41.212 + struct scatterlist *sg, 41.213 + int nelems, int direction) 41.214 +{ 41.215 + if (direction == PCI_DMA_NONE) 41.216 + out_of_line_bug(); 41.217 + flush_write_buffers(); 41.218 +} 41.219 + 41.220 +/* Return whether the given PCI device DMA address mask can 41.221 + * be supported properly. For example, if your device can 41.222 + * only drive the low 24-bits during PCI bus mastering, then 41.223 + * you would pass 0x00ffffff as the mask to this function. 41.224 + */ 41.225 +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) 41.226 +{ 41.227 + /* 41.228 + * we fall back to GFP_DMA when the mask isn't all 1s, 41.229 + * so we can't guarantee allocations that must be 41.230 + * within a tighter range than GFP_DMA.. 41.231 + */ 41.232 + if(mask < 0x00ffffff) 41.233 + return 0; 41.234 + 41.235 + return 1; 41.236 +} 41.237 + 41.238 +/* This is always fine. 
*/ 41.239 +#define pci_dac_dma_supported(pci_dev, mask) (1) 41.240 + 41.241 +static __inline__ dma64_addr_t 41.242 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) 41.243 +{ 41.244 + return ((dma64_addr_t) page_to_bus(page) + 41.245 + (dma64_addr_t) offset); 41.246 +} 41.247 + 41.248 +static __inline__ struct page * 41.249 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) 41.250 +{ 41.251 + return bus_to_page(dma_addr); 41.252 +} 41.253 + 41.254 +static __inline__ unsigned long 41.255 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) 41.256 +{ 41.257 + return (dma_addr & ~PAGE_MASK); 41.258 +} 41.259 + 41.260 +static __inline__ void 41.261 +pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) 41.262 +{ 41.263 + flush_write_buffers(); 41.264 +} 41.265 + 41.266 +/* These macros should be used after a pci_map_sg call has been done 41.267 + * to get bus addresses of each of the SG entries and their lengths. 41.268 + * You should only work with the number of sg entries pci_map_sg 41.269 + * returns. 41.270 + */ 41.271 +#define sg_dma_address(sg) ((sg)->dma_address) 41.272 +#define sg_dma_len(sg) ((sg)->length) 41.273 + 41.274 +/* Return the index of the PCI controller for device. */ 41.275 +static inline int pci_controller_num(struct pci_dev *dev) 41.276 +{ 41.277 + return 0; 41.278 +} 41.279 + 41.280 +#define HAVE_PCI_MMAP 41.281 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 41.282 + enum pci_mmap_state mmap_state, int write_combine); 41.283 + 41.284 +#endif /* __KERNEL__ */ 41.285 + 41.286 +#endif /* __i386_PCI_H */
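The streaming-DMA half of this header follows the usual i386 calling convention: map, hand the bus address to the device, unmap (or sync) when the device is done. A hypothetical driver fragment, with launch_device_dma() standing in for whatever programs the hardware:

/* Sketch of the pci_map_single()/pci_unmap_single() discipline defined above. */
static void start_tx(struct pci_dev *pdev, void *buf, size_t len)
{
    dma_addr_t bus = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
    launch_device_dma(pdev, bus, len);       /* hypothetical hardware access      */
}

static void tx_complete(struct pci_dev *pdev, dma_addr_t bus, size_t len)
{
    pci_unmap_single(pdev, bus, len, PCI_DMA_TODEVICE);  /* a no-op here, but part */
                                                         /* of the portable API    */
}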
42.1 --- a/xenolinux-2.4.26-sparse/mkbuildtree Tue May 11 14:57:44 2004 +0000 42.2 +++ b/xenolinux-2.4.26-sparse/mkbuildtree Tue May 11 15:02:26 2004 +0000 42.3 @@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h 42.4 ln -sf ../asm-i386/namei.h 42.5 ln -sf ../asm-i386/param.h 42.6 ln -sf ../asm-i386/parport.h 42.7 -ln -sf ../asm-i386/pci.h 42.8 ln -sf ../asm-i386/pgtable-3level.h 42.9 ln -sf ../asm-i386/poll.h 42.10 ln -sf ../asm-i386/posix_types.h
43.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 43.2 +++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c Tue May 11 15:02:26 2004 +0000 43.3 @@ -0,0 +1,930 @@ 43.4 +/* 43.5 + * linux/mm/page_alloc.c 43.6 + * 43.7 + * Manages the free list, the system allocates free pages here. 43.8 + * Note that kmalloc() lives in slab.c 43.9 + * 43.10 + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 43.11 + * Swap reorganised 29.12.95, Stephen Tweedie 43.12 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 43.13 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 43.14 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 43.15 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 43.16 + */ 43.17 + 43.18 +#include <linux/config.h> 43.19 +#include <linux/mm.h> 43.20 +#include <linux/swap.h> 43.21 +#include <linux/swapctl.h> 43.22 +#include <linux/interrupt.h> 43.23 +#include <linux/pagemap.h> 43.24 +#include <linux/bootmem.h> 43.25 +#include <linux/slab.h> 43.26 +#include <linux/module.h> 43.27 + 43.28 +int nr_swap_pages; 43.29 +int nr_active_pages; 43.30 +int nr_inactive_pages; 43.31 +LIST_HEAD(inactive_list); 43.32 +LIST_HEAD(active_list); 43.33 +pg_data_t *pgdat_list; 43.34 + 43.35 +/* 43.36 + * 43.37 + * The zone_table array is used to look up the address of the 43.38 + * struct zone corresponding to a given zone number (ZONE_DMA, 43.39 + * ZONE_NORMAL, or ZONE_HIGHMEM). 43.40 + */ 43.41 +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; 43.42 +EXPORT_SYMBOL(zone_table); 43.43 + 43.44 +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 43.45 +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; 43.46 +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; 43.47 +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; 43.48 +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 43.49 + 43.50 +int vm_gfp_debug = 0; 43.51 + 43.52 +/* 43.53 + * Temporary debugging check. 43.54 + */ 43.55 +#define BAD_RANGE(zone, page) \ 43.56 +( \ 43.57 + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ 43.58 + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ 43.59 + || ((zone) != page_zone(page)) \ 43.60 +) 43.61 + 43.62 +/* 43.63 + * Freeing function for a buddy system allocator. 43.64 + * Contrary to prior comments, this is *NOT* hairy, and there 43.65 + * is no reason for anyone not to understand it. 43.66 + * 43.67 + * The concept of a buddy system is to maintain direct-mapped tables 43.68 + * (containing bit values) for memory blocks of various "orders". 43.69 + * The bottom level table contains the map for the smallest allocatable 43.70 + * units of memory (here, pages), and each level above it describes 43.71 + * pairs of units from the levels below, hence, "buddies". 43.72 + * At a high level, all that happens here is marking the table entry 43.73 + * at the bottom level available, and propagating the changes upward 43.74 + * as necessary, plus some accounting needed to play nicely with other 43.75 + * parts of the VM system. 43.76 + * At each level, we keep one bit for each pair of blocks, which 43.77 + * is set to 1 iff only one of the pair is allocated. So when we 43.78 + * are allocating or freeing one, we can derive the state of the 43.79 + * other. That is, if we allocate a small block, and both were 43.80 + * free, the remainder of the region must be split into blocks. 
43.81 + * If a block is freed, and its buddy is also free, then this 43.82 + * triggers coalescing into a block of larger size. 43.83 + * 43.84 + * -- wli 43.85 + */ 43.86 + 43.87 +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); 43.88 +static void __free_pages_ok (struct page *page, unsigned int order) 43.89 +{ 43.90 + unsigned long index, page_idx, mask, flags; 43.91 + free_area_t *area; 43.92 + struct page *base; 43.93 + zone_t *zone; 43.94 + 43.95 + /* 43.96 + * Yes, think what happens when other parts of the kernel take 43.97 + * a reference to a page in order to pin it for io. -ben 43.98 + */ 43.99 + if (PageLRU(page)) { 43.100 + if (unlikely(in_interrupt())) 43.101 + BUG(); 43.102 + lru_cache_del(page); 43.103 + } 43.104 + 43.105 + if (page->buffers) 43.106 + BUG(); 43.107 + if (page->mapping) 43.108 + return (*(void(*)(struct page *))page->mapping)(page); 43.109 + if (!VALID_PAGE(page)) 43.110 + BUG(); 43.111 + if (PageLocked(page)) 43.112 + BUG(); 43.113 + if (PageActive(page)) 43.114 + BUG(); 43.115 + ClearPageReferenced(page); 43.116 + ClearPageDirty(page); 43.117 + 43.118 + if (current->flags & PF_FREE_PAGES) 43.119 + goto local_freelist; 43.120 + back_local_freelist: 43.121 + 43.122 + zone = page_zone(page); 43.123 + 43.124 + mask = (~0UL) << order; 43.125 + base = zone->zone_mem_map; 43.126 + page_idx = page - base; 43.127 + if (page_idx & ~mask) 43.128 + BUG(); 43.129 + index = page_idx >> (1 + order); 43.130 + 43.131 + area = zone->free_area + order; 43.132 + 43.133 + spin_lock_irqsave(&zone->lock, flags); 43.134 + 43.135 + zone->free_pages -= mask; 43.136 + 43.137 + while (mask + (1 << (MAX_ORDER-1))) { 43.138 + struct page *buddy1, *buddy2; 43.139 + 43.140 + if (area >= zone->free_area + MAX_ORDER) 43.141 + BUG(); 43.142 + if (!__test_and_change_bit(index, area->map)) 43.143 + /* 43.144 + * the buddy page is still allocated. 43.145 + */ 43.146 + break; 43.147 + /* 43.148 + * Move the buddy up one level. 
43.149 + * This code is taking advantage of the identity: 43.150 + * -mask = 1+~mask 43.151 + */ 43.152 + buddy1 = base + (page_idx ^ -mask); 43.153 + buddy2 = base + page_idx; 43.154 + if (BAD_RANGE(zone,buddy1)) 43.155 + BUG(); 43.156 + if (BAD_RANGE(zone,buddy2)) 43.157 + BUG(); 43.158 + 43.159 + list_del(&buddy1->list); 43.160 + mask <<= 1; 43.161 + area++; 43.162 + index >>= 1; 43.163 + page_idx &= mask; 43.164 + } 43.165 + list_add(&(base + page_idx)->list, &area->free_list); 43.166 + 43.167 + spin_unlock_irqrestore(&zone->lock, flags); 43.168 + return; 43.169 + 43.170 + local_freelist: 43.171 + if (current->nr_local_pages) 43.172 + goto back_local_freelist; 43.173 + if (in_interrupt()) 43.174 + goto back_local_freelist; 43.175 + 43.176 + list_add(&page->list, ¤t->local_pages); 43.177 + page->index = order; 43.178 + current->nr_local_pages++; 43.179 +} 43.180 + 43.181 +#define MARK_USED(index, order, area) \ 43.182 + __change_bit((index) >> (1+(order)), (area)->map) 43.183 + 43.184 +static inline struct page * expand (zone_t *zone, struct page *page, 43.185 + unsigned long index, int low, int high, free_area_t * area) 43.186 +{ 43.187 + unsigned long size = 1 << high; 43.188 + 43.189 + while (high > low) { 43.190 + if (BAD_RANGE(zone,page)) 43.191 + BUG(); 43.192 + area--; 43.193 + high--; 43.194 + size >>= 1; 43.195 + list_add(&(page)->list, &(area)->free_list); 43.196 + MARK_USED(index, high, area); 43.197 + index += size; 43.198 + page += size; 43.199 + } 43.200 + if (BAD_RANGE(zone,page)) 43.201 + BUG(); 43.202 + return page; 43.203 +} 43.204 + 43.205 +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); 43.206 +static struct page * rmqueue(zone_t *zone, unsigned int order) 43.207 +{ 43.208 + free_area_t * area = zone->free_area + order; 43.209 + unsigned int curr_order = order; 43.210 + struct list_head *head, *curr; 43.211 + unsigned long flags; 43.212 + struct page *page; 43.213 + 43.214 + spin_lock_irqsave(&zone->lock, flags); 43.215 + do { 43.216 + head = &area->free_list; 43.217 + curr = head->next; 43.218 + 43.219 + if (curr != head) { 43.220 + unsigned int index; 43.221 + 43.222 + page = list_entry(curr, struct page, list); 43.223 + if (BAD_RANGE(zone,page)) 43.224 + BUG(); 43.225 + list_del(curr); 43.226 + index = page - zone->zone_mem_map; 43.227 + if (curr_order != MAX_ORDER-1) 43.228 + MARK_USED(index, curr_order, area); 43.229 + zone->free_pages -= 1UL << order; 43.230 + 43.231 + page = expand(zone, page, index, order, curr_order, area); 43.232 + spin_unlock_irqrestore(&zone->lock, flags); 43.233 + 43.234 + set_page_count(page, 1); 43.235 + if (BAD_RANGE(zone,page)) 43.236 + BUG(); 43.237 + if (PageLRU(page)) 43.238 + BUG(); 43.239 + if (PageActive(page)) 43.240 + BUG(); 43.241 + return page; 43.242 + } 43.243 + curr_order++; 43.244 + area++; 43.245 + } while (curr_order < MAX_ORDER); 43.246 + spin_unlock_irqrestore(&zone->lock, flags); 43.247 + 43.248 + return NULL; 43.249 +} 43.250 + 43.251 +#ifndef CONFIG_DISCONTIGMEM 43.252 +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) 43.253 +{ 43.254 + return __alloc_pages(gfp_mask, order, 43.255 + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); 43.256 +} 43.257 +#endif 43.258 + 43.259 +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); 43.260 +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) 43.261 +{ 43.262 + struct page * page = NULL; 43.263 + int __freed; 
43.264 + 43.265 + if (in_interrupt()) 43.266 + BUG(); 43.267 + 43.268 + current->allocation_order = order; 43.269 + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; 43.270 + 43.271 + __freed = try_to_free_pages_zone(classzone, gfp_mask); 43.272 + 43.273 + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); 43.274 + 43.275 + if (current->nr_local_pages) { 43.276 + struct list_head * entry, * local_pages; 43.277 + struct page * tmp; 43.278 + int nr_pages; 43.279 + 43.280 + local_pages = ¤t->local_pages; 43.281 + 43.282 + if (likely(__freed)) { 43.283 + /* pick from the last inserted so we're lifo */ 43.284 + entry = local_pages->next; 43.285 + do { 43.286 + tmp = list_entry(entry, struct page, list); 43.287 + if (tmp->index == order && memclass(page_zone(tmp), classzone)) { 43.288 + list_del(entry); 43.289 + current->nr_local_pages--; 43.290 + set_page_count(tmp, 1); 43.291 + page = tmp; 43.292 + 43.293 + if (page->buffers) 43.294 + BUG(); 43.295 + if (page->mapping) 43.296 + BUG(); 43.297 + if (!VALID_PAGE(page)) 43.298 + BUG(); 43.299 + if (PageLocked(page)) 43.300 + BUG(); 43.301 + if (PageLRU(page)) 43.302 + BUG(); 43.303 + if (PageActive(page)) 43.304 + BUG(); 43.305 + if (PageDirty(page)) 43.306 + BUG(); 43.307 + 43.308 + break; 43.309 + } 43.310 + } while ((entry = entry->next) != local_pages); 43.311 + } 43.312 + 43.313 + nr_pages = current->nr_local_pages; 43.314 + /* free in reverse order so that the global order will be lifo */ 43.315 + while ((entry = local_pages->prev) != local_pages) { 43.316 + list_del(entry); 43.317 + tmp = list_entry(entry, struct page, list); 43.318 + __free_pages_ok(tmp, tmp->index); 43.319 + if (!nr_pages--) 43.320 + BUG(); 43.321 + } 43.322 + current->nr_local_pages = 0; 43.323 + } 43.324 + 43.325 + *freed = __freed; 43.326 + return page; 43.327 +} 43.328 + 43.329 +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) 43.330 +{ 43.331 + long free = zone->free_pages - (1UL << order); 43.332 + return free >= 0 ? 
free : 0; 43.333 +} 43.334 + 43.335 +/* 43.336 + * This is the 'heart' of the zoned buddy allocator: 43.337 + */ 43.338 +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) 43.339 +{ 43.340 + zone_t **zone, * classzone; 43.341 + struct page * page; 43.342 + int freed, class_idx; 43.343 + 43.344 + zone = zonelist->zones; 43.345 + classzone = *zone; 43.346 + class_idx = zone_idx(classzone); 43.347 + 43.348 + for (;;) { 43.349 + zone_t *z = *(zone++); 43.350 + if (!z) 43.351 + break; 43.352 + 43.353 + if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { 43.354 + page = rmqueue(z, order); 43.355 + if (page) 43.356 + return page; 43.357 + } 43.358 + } 43.359 + 43.360 + classzone->need_balance = 1; 43.361 + mb(); 43.362 + if (waitqueue_active(&kswapd_wait)) 43.363 + wake_up_interruptible(&kswapd_wait); 43.364 + 43.365 + zone = zonelist->zones; 43.366 + for (;;) { 43.367 + unsigned long min; 43.368 + zone_t *z = *(zone++); 43.369 + if (!z) 43.370 + break; 43.371 + 43.372 + min = z->watermarks[class_idx].min; 43.373 + if (!(gfp_mask & __GFP_WAIT)) 43.374 + min >>= 2; 43.375 + if (zone_free_pages(z, order) > min) { 43.376 + page = rmqueue(z, order); 43.377 + if (page) 43.378 + return page; 43.379 + } 43.380 + } 43.381 + 43.382 + /* here we're in the low on memory slow path */ 43.383 + 43.384 + if ((current->flags & PF_MEMALLOC) && 43.385 + (!in_interrupt() || (current->flags & PF_MEMDIE))) { 43.386 + zone = zonelist->zones; 43.387 + for (;;) { 43.388 + zone_t *z = *(zone++); 43.389 + if (!z) 43.390 + break; 43.391 + 43.392 + page = rmqueue(z, order); 43.393 + if (page) 43.394 + return page; 43.395 + } 43.396 + return NULL; 43.397 + } 43.398 + 43.399 + /* Atomic allocations - we can't balance anything */ 43.400 + if (!(gfp_mask & __GFP_WAIT)) 43.401 + goto out; 43.402 + 43.403 + rebalance: 43.404 + page = balance_classzone(classzone, gfp_mask, order, &freed); 43.405 + if (page) 43.406 + return page; 43.407 + 43.408 + zone = zonelist->zones; 43.409 + if (likely(freed)) { 43.410 + for (;;) { 43.411 + zone_t *z = *(zone++); 43.412 + if (!z) 43.413 + break; 43.414 + 43.415 + if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { 43.416 + page = rmqueue(z, order); 43.417 + if (page) 43.418 + return page; 43.419 + } 43.420 + } 43.421 + goto rebalance; 43.422 + } else { 43.423 + /* 43.424 + * Check that no other task is been killed meanwhile, 43.425 + * in such a case we can succeed the allocation. 43.426 + */ 43.427 + for (;;) { 43.428 + zone_t *z = *(zone++); 43.429 + if (!z) 43.430 + break; 43.431 + 43.432 + if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { 43.433 + page = rmqueue(z, order); 43.434 + if (page) 43.435 + return page; 43.436 + } 43.437 + } 43.438 + } 43.439 + 43.440 + out: 43.441 + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", 43.442 + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); 43.443 + if (unlikely(vm_gfp_debug)) 43.444 + dump_stack(); 43.445 + return NULL; 43.446 +} 43.447 + 43.448 +/* 43.449 + * Common helper functions. 
43.450 + */ 43.451 +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) 43.452 +{ 43.453 + struct page * page; 43.454 + 43.455 + page = alloc_pages(gfp_mask, order); 43.456 + if (!page) 43.457 + return 0; 43.458 + return (unsigned long) page_address(page); 43.459 +} 43.460 + 43.461 +unsigned long get_zeroed_page(unsigned int gfp_mask) 43.462 +{ 43.463 + struct page * page; 43.464 + 43.465 + page = alloc_pages(gfp_mask, 0); 43.466 + if (page) { 43.467 + void *address = page_address(page); 43.468 + clear_page(address); 43.469 + return (unsigned long) address; 43.470 + } 43.471 + return 0; 43.472 +} 43.473 + 43.474 +void __free_pages(struct page *page, unsigned int order) 43.475 +{ 43.476 + if (!PageReserved(page) && put_page_testzero(page)) 43.477 + __free_pages_ok(page, order); 43.478 +} 43.479 + 43.480 +void free_pages(unsigned long addr, unsigned int order) 43.481 +{ 43.482 + if (addr != 0) 43.483 + __free_pages(virt_to_page(addr), order); 43.484 +} 43.485 + 43.486 +/* 43.487 + * Total amount of free (allocatable) RAM: 43.488 + */ 43.489 +unsigned int nr_free_pages (void) 43.490 +{ 43.491 + unsigned int sum = 0; 43.492 + zone_t *zone; 43.493 + 43.494 + for_each_zone(zone) 43.495 + sum += zone->free_pages; 43.496 + 43.497 + return sum; 43.498 +} 43.499 + 43.500 +/* 43.501 + * Amount of free RAM allocatable as buffer memory: 43.502 + */ 43.503 +unsigned int nr_free_buffer_pages (void) 43.504 +{ 43.505 + pg_data_t *pgdat; 43.506 + unsigned int sum = 0; 43.507 + zonelist_t *zonelist; 43.508 + zone_t **zonep, *zone; 43.509 + 43.510 + for_each_pgdat(pgdat) { 43.511 + int class_idx; 43.512 + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); 43.513 + zonep = zonelist->zones; 43.514 + zone = *zonep; 43.515 + class_idx = zone_idx(zone); 43.516 + 43.517 + sum += zone->nr_cache_pages; 43.518 + for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { 43.519 + int free = zone->free_pages - zone->watermarks[class_idx].high; 43.520 + if (free <= 0) 43.521 + continue; 43.522 + sum += free; 43.523 + } 43.524 + } 43.525 + 43.526 + return sum; 43.527 +} 43.528 + 43.529 +#if CONFIG_HIGHMEM 43.530 +unsigned int nr_free_highpages (void) 43.531 +{ 43.532 + pg_data_t *pgdat; 43.533 + unsigned int pages = 0; 43.534 + 43.535 + for_each_pgdat(pgdat) 43.536 + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 43.537 + 43.538 + return pages; 43.539 +} 43.540 + 43.541 +unsigned int freeable_lowmem(void) 43.542 +{ 43.543 + unsigned int pages = 0; 43.544 + pg_data_t *pgdat; 43.545 + 43.546 + for_each_pgdat(pgdat) { 43.547 + pages += pgdat->node_zones[ZONE_DMA].free_pages; 43.548 + pages += pgdat->node_zones[ZONE_DMA].nr_active_pages; 43.549 + pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages; 43.550 + pages += pgdat->node_zones[ZONE_NORMAL].free_pages; 43.551 + pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages; 43.552 + pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages; 43.553 + } 43.554 + 43.555 + return pages; 43.556 +} 43.557 +#endif 43.558 + 43.559 +#define K(x) ((x) << (PAGE_SHIFT-10)) 43.560 + 43.561 +/* 43.562 + * Show free area list (used inside shift_scroll-lock stuff) 43.563 + * We also calculate the percentage fragmentation. We do this by counting the 43.564 + * memory on each free list with the exception of the first item on the list. 
43.565 + */ 43.566 +void show_free_areas_core(pg_data_t *pgdat) 43.567 +{ 43.568 + unsigned int order; 43.569 + unsigned type; 43.570 + pg_data_t *tmpdat = pgdat; 43.571 + 43.572 + printk("Free pages: %6dkB (%6dkB HighMem)\n", 43.573 + K(nr_free_pages()), 43.574 + K(nr_free_highpages())); 43.575 + 43.576 + while (tmpdat) { 43.577 + zone_t *zone; 43.578 + for (zone = tmpdat->node_zones; 43.579 + zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) 43.580 + printk("Zone:%s freepages:%6lukB\n", 43.581 + zone->name, 43.582 + K(zone->free_pages)); 43.583 + 43.584 + tmpdat = tmpdat->node_next; 43.585 + } 43.586 + 43.587 + printk("( Active: %d, inactive: %d, free: %d )\n", 43.588 + nr_active_pages, 43.589 + nr_inactive_pages, 43.590 + nr_free_pages()); 43.591 + 43.592 + for (type = 0; type < MAX_NR_ZONES; type++) { 43.593 + struct list_head *head, *curr; 43.594 + zone_t *zone = pgdat->node_zones + type; 43.595 + unsigned long nr, total, flags; 43.596 + 43.597 + total = 0; 43.598 + if (zone->size) { 43.599 + spin_lock_irqsave(&zone->lock, flags); 43.600 + for (order = 0; order < MAX_ORDER; order++) { 43.601 + head = &(zone->free_area + order)->free_list; 43.602 + curr = head; 43.603 + nr = 0; 43.604 + for (;;) { 43.605 + if ((curr = curr->next) == head) 43.606 + break; 43.607 + nr++; 43.608 + } 43.609 + total += nr * (1 << order); 43.610 + printk("%lu*%lukB ", nr, K(1UL) << order); 43.611 + } 43.612 + spin_unlock_irqrestore(&zone->lock, flags); 43.613 + } 43.614 + printk("= %lukB)\n", K(total)); 43.615 + } 43.616 + 43.617 +#ifdef SWAP_CACHE_INFO 43.618 + show_swap_cache_info(); 43.619 +#endif 43.620 +} 43.621 + 43.622 +void show_free_areas(void) 43.623 +{ 43.624 + show_free_areas_core(pgdat_list); 43.625 +} 43.626 + 43.627 +/* 43.628 + * Builds allocation fallback zone lists. 43.629 + */ 43.630 +static inline void build_zonelists(pg_data_t *pgdat) 43.631 +{ 43.632 + int i, j, k; 43.633 + 43.634 + for (i = 0; i <= GFP_ZONEMASK; i++) { 43.635 + zonelist_t *zonelist; 43.636 + zone_t *zone; 43.637 + 43.638 + zonelist = pgdat->node_zonelists + i; 43.639 + memset(zonelist, 0, sizeof(*zonelist)); 43.640 + 43.641 + j = 0; 43.642 + k = ZONE_NORMAL; 43.643 + if (i & __GFP_HIGHMEM) 43.644 + k = ZONE_HIGHMEM; 43.645 + if (i & __GFP_DMA) 43.646 + k = ZONE_DMA; 43.647 + 43.648 + switch (k) { 43.649 + default: 43.650 + BUG(); 43.651 + /* 43.652 + * fallthrough: 43.653 + */ 43.654 + case ZONE_HIGHMEM: 43.655 + zone = pgdat->node_zones + ZONE_HIGHMEM; 43.656 + if (zone->size) { 43.657 +#ifndef CONFIG_HIGHMEM 43.658 + BUG(); 43.659 +#endif 43.660 + zonelist->zones[j++] = zone; 43.661 + } 43.662 + case ZONE_NORMAL: 43.663 + zone = pgdat->node_zones + ZONE_NORMAL; 43.664 + if (zone->size) 43.665 + zonelist->zones[j++] = zone; 43.666 + case ZONE_DMA: 43.667 + zone = pgdat->node_zones + ZONE_DMA; 43.668 + if (zone->size) 43.669 + zonelist->zones[j++] = zone; 43.670 + } 43.671 + zonelist->zones[j++] = NULL; 43.672 + } 43.673 +} 43.674 + 43.675 +/* 43.676 + * Helper functions to size the waitqueue hash table. 43.677 + * Essentially these want to choose hash table sizes sufficiently 43.678 + * large so that collisions trying to wait on pages are rare. 43.679 + * But in fact, the number of active page waitqueues on typical 43.680 + * systems is ridiculously low, less than 200. So this is even 43.681 + * conservative, even though it seems large. 43.682 + * 43.683 + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 43.684 + * waitqueues, i.e. the size of the waitq table given the number of pages. 
43.685 + */ 43.686 +#define PAGES_PER_WAITQUEUE 256 43.687 + 43.688 +static inline unsigned long wait_table_size(unsigned long pages) 43.689 +{ 43.690 + unsigned long size = 1; 43.691 + 43.692 + pages /= PAGES_PER_WAITQUEUE; 43.693 + 43.694 + while (size < pages) 43.695 + size <<= 1; 43.696 + 43.697 + /* 43.698 + * Once we have dozens or even hundreds of threads sleeping 43.699 + * on IO we've got bigger problems than wait queue collision. 43.700 + * Limit the size of the wait table to a reasonable size. 43.701 + */ 43.702 + size = min(size, 4096UL); 43.703 + 43.704 + return size; 43.705 +} 43.706 + 43.707 +/* 43.708 + * This is an integer logarithm so that shifts can be used later 43.709 + * to extract the more random high bits from the multiplicative 43.710 + * hash function before the remainder is taken. 43.711 + */ 43.712 +static inline unsigned long wait_table_bits(unsigned long size) 43.713 +{ 43.714 + return ffz(~size); 43.715 +} 43.716 + 43.717 +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 43.718 + 43.719 +/* 43.720 + * Set up the zone data structures: 43.721 + * - mark all pages reserved 43.722 + * - mark all memory queues empty 43.723 + * - clear the memory bitmaps 43.724 + */ 43.725 +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, 43.726 + unsigned long *zones_size, unsigned long zone_start_paddr, 43.727 + unsigned long *zholes_size, struct page *lmem_map) 43.728 +{ 43.729 + unsigned long i, j; 43.730 + unsigned long map_size; 43.731 + unsigned long totalpages, offset, realtotalpages; 43.732 + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); 43.733 + 43.734 + if (zone_start_paddr & ~PAGE_MASK) 43.735 + BUG(); 43.736 + 43.737 + totalpages = 0; 43.738 + for (i = 0; i < MAX_NR_ZONES; i++) { 43.739 + unsigned long size = zones_size[i]; 43.740 + totalpages += size; 43.741 + } 43.742 + realtotalpages = totalpages; 43.743 + if (zholes_size) 43.744 + for (i = 0; i < MAX_NR_ZONES; i++) 43.745 + realtotalpages -= zholes_size[i]; 43.746 + 43.747 + printk("On node %d totalpages: %lu\n", nid, realtotalpages); 43.748 + 43.749 + /* 43.750 + * Some architectures (with lots of mem and discontinous memory 43.751 + * maps) have to search for a good mem_map area: 43.752 + * For discontigmem, the conceptual mem map array starts from 43.753 + * PAGE_OFFSET, we need to align the actual array onto a mem map 43.754 + * boundary, so that MAP_NR works. 
43.755 + */ 43.756 + map_size = (totalpages + 1)*sizeof(struct page); 43.757 + if (lmem_map == (struct page *)0) { 43.758 + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); 43.759 + lmem_map = (struct page *)(PAGE_OFFSET + 43.760 + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); 43.761 + } 43.762 + *gmap = pgdat->node_mem_map = lmem_map; 43.763 + pgdat->node_size = totalpages; 43.764 + pgdat->node_start_paddr = zone_start_paddr; 43.765 + pgdat->node_start_mapnr = (lmem_map - mem_map); 43.766 + pgdat->nr_zones = 0; 43.767 + 43.768 + offset = lmem_map - mem_map; 43.769 + for (j = 0; j < MAX_NR_ZONES; j++) { 43.770 + zone_t *zone = pgdat->node_zones + j; 43.771 + unsigned long mask; 43.772 + unsigned long size, realsize; 43.773 + int idx; 43.774 + 43.775 + zone_table[nid * MAX_NR_ZONES + j] = zone; 43.776 + realsize = size = zones_size[j]; 43.777 + if (zholes_size) 43.778 + realsize -= zholes_size[j]; 43.779 + 43.780 + printk("zone(%lu): %lu pages.\n", j, size); 43.781 + zone->size = size; 43.782 + zone->realsize = realsize; 43.783 + zone->name = zone_names[j]; 43.784 + zone->lock = SPIN_LOCK_UNLOCKED; 43.785 + zone->zone_pgdat = pgdat; 43.786 + zone->free_pages = 0; 43.787 + zone->need_balance = 0; 43.788 + zone->nr_active_pages = zone->nr_inactive_pages = 0; 43.789 + 43.790 + 43.791 + if (!size) 43.792 + continue; 43.793 + 43.794 + /* 43.795 + * The per-page waitqueue mechanism uses hashed waitqueues 43.796 + * per zone. 43.797 + */ 43.798 + zone->wait_table_size = wait_table_size(size); 43.799 + zone->wait_table_shift = 43.800 + BITS_PER_LONG - wait_table_bits(zone->wait_table_size); 43.801 + zone->wait_table = (wait_queue_head_t *) 43.802 + alloc_bootmem_node(pgdat, zone->wait_table_size 43.803 + * sizeof(wait_queue_head_t)); 43.804 + 43.805 + for(i = 0; i < zone->wait_table_size; ++i) 43.806 + init_waitqueue_head(zone->wait_table + i); 43.807 + 43.808 + pgdat->nr_zones = j+1; 43.809 + 43.810 + mask = (realsize / zone_balance_ratio[j]); 43.811 + if (mask < zone_balance_min[j]) 43.812 + mask = zone_balance_min[j]; 43.813 + else if (mask > zone_balance_max[j]) 43.814 + mask = zone_balance_max[j]; 43.815 + zone->watermarks[j].min = mask; 43.816 + zone->watermarks[j].low = mask*2; 43.817 + zone->watermarks[j].high = mask*3; 43.818 + /* now set the watermarks of the lower zones in the "j" classzone */ 43.819 + for (idx = j-1; idx >= 0; idx--) { 43.820 + zone_t * lower_zone = pgdat->node_zones + idx; 43.821 + unsigned long lower_zone_reserve; 43.822 + if (!lower_zone->size) 43.823 + continue; 43.824 + 43.825 + mask = lower_zone->watermarks[idx].min; 43.826 + lower_zone->watermarks[j].min = mask; 43.827 + lower_zone->watermarks[j].low = mask*2; 43.828 + lower_zone->watermarks[j].high = mask*3; 43.829 + 43.830 + /* now the brainer part */ 43.831 + lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; 43.832 + lower_zone->watermarks[j].min += lower_zone_reserve; 43.833 + lower_zone->watermarks[j].low += lower_zone_reserve; 43.834 + lower_zone->watermarks[j].high += lower_zone_reserve; 43.835 + 43.836 + realsize += lower_zone->realsize; 43.837 + } 43.838 + 43.839 + zone->zone_mem_map = mem_map + offset; 43.840 + zone->zone_start_mapnr = offset; 43.841 + zone->zone_start_paddr = zone_start_paddr; 43.842 + 43.843 + if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) 43.844 + printk("BUG: wrong zone alignment, it will crash\n"); 43.845 + 43.846 + /* 43.847 + * Initially all pages are reserved - free ones are freed 43.848 + * up by free_all_bootmem() once the 
early boot process is 43.849 + * done. Non-atomic initialization, single-pass. 43.850 + */ 43.851 + for (i = 0; i < size; i++) { 43.852 + struct page *page = mem_map + offset + i; 43.853 + set_page_zone(page, nid * MAX_NR_ZONES + j); 43.854 + set_page_count(page, 0); 43.855 + SetPageReserved(page); 43.856 + INIT_LIST_HEAD(&page->list); 43.857 + if (j != ZONE_HIGHMEM) 43.858 + set_page_address(page, __va(zone_start_paddr)); 43.859 + zone_start_paddr += PAGE_SIZE; 43.860 + } 43.861 + 43.862 + offset += size; 43.863 + for (i = 0; ; i++) { 43.864 + unsigned long bitmap_size; 43.865 + 43.866 + INIT_LIST_HEAD(&zone->free_area[i].free_list); 43.867 + if (i == MAX_ORDER-1) { 43.868 + zone->free_area[i].map = NULL; 43.869 + break; 43.870 + } 43.871 + 43.872 + /* 43.873 + * Page buddy system uses "index >> (i+1)", 43.874 + * where "index" is at most "size-1". 43.875 + * 43.876 + * The extra "+3" is to round down to byte 43.877 + * size (8 bits per byte assumption). Thus 43.878 + * we get "(size-1) >> (i+4)" as the last byte 43.879 + * we can access. 43.880 + * 43.881 + * The "+1" is because we want to round the 43.882 + * byte allocation up rather than down. So 43.883 + * we should have had a "+7" before we shifted 43.884 + * down by three. Also, we have to add one as 43.885 + * we actually _use_ the last bit (it's [0,n] 43.886 + * inclusive, not [0,n[). 43.887 + * 43.888 + * So we actually had +7+1 before we shift 43.889 + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 43.890 + * (modulo overflows, which we do not have). 43.891 + * 43.892 + * Finally, we LONG_ALIGN because all bitmap 43.893 + * operations are on longs. 43.894 + */ 43.895 + bitmap_size = (size-1) >> (i+4); 43.896 + bitmap_size = LONG_ALIGN(bitmap_size+1); 43.897 + zone->free_area[i].map = 43.898 + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); 43.899 + } 43.900 + } 43.901 + build_zonelists(pgdat); 43.902 +} 43.903 + 43.904 +void __init free_area_init(unsigned long *zones_size) 43.905 +{ 43.906 + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); 43.907 +} 43.908 + 43.909 +static int __init setup_mem_frac(char *str) 43.910 +{ 43.911 + int j = 0; 43.912 + 43.913 + while (get_option(&str, &zone_balance_ratio[j++]) == 2); 43.914 + printk("setup_mem_frac: "); 43.915 + for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); 43.916 + printk("\n"); 43.917 + return 1; 43.918 +} 43.919 + 43.920 +__setup("memfrac=", setup_mem_frac); 43.921 + 43.922 +static int __init setup_lower_zone_reserve(char *str) 43.923 +{ 43.924 + int j = 0; 43.925 + 43.926 + while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); 43.927 + printk("setup_lower_zone_reserve: "); 43.928 + for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); 43.929 + printk("\n"); 43.930 + return 1; 43.931 +} 43.932 + 43.933 +__setup("lower_zone_reserve=", setup_lower_zone_reserve);
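The only non-obvious arithmetic in __free_pages_ok() above is the buddy computation: with mask = (~0UL) << order, the identity -mask == 1UL << order means base + (page_idx ^ -mask) is the block whose index differs only in bit 'order'. The standalone program below just prints that pairing for a block at index 8:

#include <stdio.h>

int main(void)
{
    unsigned long page_idx = 8;                  /* block aligned to each order used */
    for (unsigned int order = 0; order <= 3; order++) {
        unsigned long mask  = (~0UL) << order;
        unsigned long buddy = page_idx ^ -mask;  /* -mask == 1UL << order            */
        printf("order %u: block %lu <-> buddy %lu\n", order, page_idx, buddy);
    }
    return 0;
}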