ia64/xen-unstable
changeset 1354:00059c1948cf
bitkeeper revision 1.891.1.5 (409ba2e8A6F60eP06BqyZUGapsn8XA)
Network interface for the new IO model is now complete.
--- a/.rootkeys Thu May 06 14:53:19 2004 +0000
+++ b/.rootkeys Fri May 07 14:53:28 2004 +0000
@@ -107,6 +107,7 @@ 4055ad97wMLUj0BZT0e_T0EwQN0Bvw tools/xen
 4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h
 4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py
 4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py
+409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py
 40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c
 4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py
 4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend
@@ -735,6 +736,7 @@ 3e5a4e678ddsQOpbSiRdy1GRcDc9WA xenolinux
 3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h
 3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h
 3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h
+409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h
 3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
 3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
 3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h
@@ -762,6 +764,7 @@ 406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux
 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c
 3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c
 3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c
+409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c
 3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c
 3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c
 407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
--- a/tools/examples/xc_dom_create.py Thu May 06 14:53:19 2004 +0000
+++ b/tools/examples/xc_dom_create.py Fri May 07 14:53:28 2004 +0000
@@ -333,7 +333,18 @@ def make_domain():
         xc.domain_destroy ( dom=id )
         sys.exit()
 
-    if not new_io_world:
+    if new_io_world:
+        cmsg = 'new_network_interface(dom='+str(id)+')'
+        xend_response = xenctl.utils.xend_control_message(cmsg)
+        if not xend_response['success']:
+            print "Error creating network interface"
+            print "Error type: " + xend_response['error_type']
+            if xend_response['error_type'] == 'exception':
+                print "Exception type: " + xend_response['exception_type']
+                print "Exception val: " + xend_response['exception_value']
+            xc.domain_destroy ( dom=id )
+            sys.exit()
+    else:
         # setup virtual firewall rules for all aliases
         for ip in vfr_ipaddr:
             xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
--- a/tools/xenctl/lib/utils.py Thu May 06 14:53:19 2004 +0000
+++ b/tools/xenctl/lib/utils.py Fri May 07 14:53:28 2004 +0000
@@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'):
             return m.group(1)
     return None
 
-def get_current_ipgw(dev='eth0'):
-    """Return a string containing the IP gateway for the given
-    network interface (default 'eth0').
-    """
+def get_current_ipgw():
+    """Return a string containing the default IP gateway."""
     fd = os.popen( '/sbin/route -n' )
     lines = fd.readlines()
     for line in lines:
-        m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
-                       '\s+\S+\s+\S*G.*' + dev + '.*', line )
+        m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
+                       '\s+0.0.0.0+\s+\S*G.*', line )
         if m:
             return m.group(1)
     return None
--- a/tools/xend/lib/domain_controller.h Thu May 06 14:53:19 2004 +0000
+++ b/tools/xend/lib/domain_controller.h Fri May 07 14:53:28 2004 +0000
@@ -468,7 +468,6 @@ typedef struct {
     unsigned int   evtchn;         /* Event channel for notifications.   */
     unsigned long  tx_shmem_frame; /* Page cont. tx shared comms window. */
     unsigned long  rx_shmem_frame; /* Page cont. rx shared comms window. */
-    unsigned long  shmem_frame;
     /* OUT */
     unsigned int   status;
 } netif_be_connect_t;
5.1 --- a/tools/xend/lib/main.py Thu May 06 14:53:19 2004 +0000 5.2 +++ b/tools/xend/lib/main.py Fri May 07 14:53:28 2004 +0000 5.3 @@ -5,7 +5,7 @@ 5.4 ########################################################### 5.5 5.6 import errno, re, os, pwd, select, signal, socket, struct, sys, time 5.7 -import xend.blkif, xend.console, xend.manager, xend.utils, Xc 5.8 +import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc 5.9 5.10 5.11 # The following parameters could be placed in a configuration file. 5.12 @@ -19,6 +19,8 @@ UNIX_SOCK = 'management_sock' # relat 5.13 CMSG_CONSOLE = 0 5.14 CMSG_BLKIF_BE = 1 5.15 CMSG_BLKIF_FE = 2 5.16 +CMSG_NETIF_BE = 3 5.17 +CMSG_NETIF_FE = 4 5.18 5.19 5.20 def port_from_dom(dom): 5.21 @@ -162,6 +164,10 @@ def daemon_loop(): 5.22 if xend.blkif.interface.list.has_key(idx): 5.23 blk_if = xend.blkif.interface.list[idx] 5.24 5.25 + net_if = False 5.26 + if xend.netif.interface.list.has_key(idx): 5.27 + net_if = xend.netif.interface.list[idx] 5.28 + 5.29 # If we pick up a disconnect notification then we do any necessary 5.30 # cleanup. 5.31 if type == notifier.EXCEPTION: 5.32 @@ -175,6 +181,9 @@ def daemon_loop(): 5.33 if blk_if: 5.34 blk_if.destroy() 5.35 del blk_if 5.36 + if net_if: 5.37 + net_if.destroy() 5.38 + del net_if 5.39 continue 5.40 5.41 # Process incoming requests. 5.42 @@ -188,6 +197,10 @@ def daemon_loop(): 5.43 blk_if.ctrlif_rx_req(port, msg) 5.44 elif type == CMSG_BLKIF_BE and port == dom0_port: 5.45 xend.blkif.backend_rx_req(port, msg) 5.46 + elif type == CMSG_NETIF_FE and net_if: 5.47 + net_if.ctrlif_rx_req(port, msg) 5.48 + elif type == CMSG_NETIF_BE and port == dom0_port: 5.49 + xend.netif.backend_rx_req(port, msg) 5.50 else: 5.51 port.write_response(msg) 5.52 5.53 @@ -198,6 +211,8 @@ def daemon_loop(): 5.54 type = (msg.get_header())['type'] 5.55 if type == CMSG_BLKIF_BE and port == dom0_port: 5.56 xend.blkif.backend_rx_rsp(port, msg) 5.57 + elif type == CMSG_NETIF_BE and port == dom0_port: 5.58 + xend.netif.backend_rx_rsp(port, msg) 5.59 5.60 # Send console data. 5.61 if con_if and con_if.ctrlif_transmit_work(port): 5.62 @@ -207,10 +222,18 @@ def daemon_loop(): 5.63 if blk_if and blk_if.ctrlif_transmit_work(port): 5.64 work_done = True 5.65 5.66 + # Send netif messages. 5.67 + if net_if and net_if.ctrlif_transmit_work(port): 5.68 + work_done = True 5.69 + 5.70 # Back-end block-device work. 5.71 if port == dom0_port and xend.blkif.backend_do_work(port): 5.72 work_done = True 5.73 5.74 + # Back-end network-device work. 5.75 + if port == dom0_port and xend.netif.backend_do_work(port): 5.76 + work_done = True 5.77 + 5.78 # Finally, notify the remote end of any work that we did. 5.79 if work_done: 5.80 port.notify()
6.1 --- a/tools/xend/lib/manager.py Thu May 06 14:53:19 2004 +0000 6.2 +++ b/tools/xend/lib/manager.py Fri May 07 14:53:28 2004 +0000 6.3 @@ -4,7 +4,7 @@ 6.4 ## Copyright (c) 2004, K A Fraser (University of Cambridge) 6.5 ############################################################# 6.6 6.7 -import xend.blkif, xend.console, xend.main, xend.utils 6.8 +import xend.blkif, xend.netif, xend.console, xend.main, xend.utils 6.9 6.10 6.11 ## 6.12 @@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, 6.13 6.14 # Response is deferred until back-end driver sends acknowledgement. 6.15 return None 6.16 + 6.17 + 6.18 +## 6.19 +## new_network_interface: 6.20 +## Create a new network interface for the specified domain @dom. 6.21 +## 6.22 +def new_network_interface(dom, handle=-1): 6.23 + # By default we create an interface with handle zero. 6.24 + if handle < 0: 6.25 + handle = 0 6.26 + 6.27 + # We only support one interface per domain, which must have handle zero. 6.28 + if handle != 0: 6.29 + response = { 'success': False } 6.30 + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ 6.31 + 'is supported)' % handle 6.32 + return response 6.33 + 6.34 + # Find local event-channel port associated with the specified domain. 6.35 + port = xend.main.port_from_dom(dom) 6.36 + if not port: 6.37 + response = { 'success': False } 6.38 + response['error_type'] = 'Unknown domain %d' % dom 6.39 + return response 6.40 + 6.41 + # The interface must not already exist. 6.42 + if xend.netif.interface.list.has_key(port.local_port): 6.43 + response = { 'success': False } 6.44 + response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \ 6.45 + 'exists' % (dom, handle) 6.46 + return response 6.47 + 6.48 + # Create the new interface. Initially no virtual devices are attached. 6.49 + xend.netif.interface(dom, port.local_port) 6.50 + 6.51 + # Response is deferred until back-end driver sends acknowledgement. 6.52 + return None
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/tools/xend/lib/netif.py Fri May 07 14:53:28 2004 +0000 7.3 @@ -0,0 +1,144 @@ 7.4 + 7.5 +################################################################### 7.6 +## xend/netif.py -- Network-interface management functions for Xend 7.7 +## Copyright (c) 2004, K A Fraser (University of Cambridge) 7.8 +################################################################### 7.9 + 7.10 +import errno, random, re, os, select, signal, socket, struct, sys 7.11 +import xend.main, xend.console, xend.manager, xend.utils, Xc 7.12 + 7.13 +CMSG_NETIF_BE = 3 7.14 +CMSG_NETIF_FE = 4 7.15 +CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED = 0 7.16 +CMSG_NETIF_FE_DRIVER_STATUS_CHANGED = 32 7.17 +CMSG_NETIF_FE_INTERFACE_CONNECT = 33 7.18 +CMSG_NETIF_FE_INTERFACE_DISCONNECT = 34 7.19 +CMSG_NETIF_BE_CREATE = 0 7.20 +CMSG_NETIF_BE_DESTROY = 1 7.21 +CMSG_NETIF_BE_CONNECT = 2 7.22 +CMSG_NETIF_BE_DISCONNECT = 3 7.23 + 7.24 +pendmsg = None 7.25 +pendaddr = None 7.26 + 7.27 +def backend_tx_req(msg): 7.28 + port = xend.main.dom0_port 7.29 + if port.space_to_write_request(): 7.30 + port.write_request(msg) 7.31 + port.notify() 7.32 + else: 7.33 + xend.netif.pendmsg = msg 7.34 + 7.35 +def backend_rx_req(port, msg): 7.36 + port.write_response(msg) 7.37 + 7.38 +def backend_rx_rsp(port, msg): 7.39 + subtype = (msg.get_header())['subtype'] 7.40 + print "Received netif-be response, subtype %d" % subtype 7.41 + if subtype == CMSG_NETIF_BE_CREATE: 7.42 + rsp = { 'success': True } 7.43 + xend.main.send_management_response(rsp, xend.netif.pendaddr) 7.44 + elif subtype == CMSG_NETIF_BE_CONNECT: 7.45 + (dom,hnd,evtchn,tx_frame,rx_frame,st) = \ 7.46 + struct.unpack("QIILLI", msg.get_payload()) 7.47 + netif = interface.list[xend.main.port_from_dom(dom).local_port] 7.48 + msg = xend.utils.message(CMSG_NETIF_FE, \ 7.49 + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) 7.50 + msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \ 7.51 + netif.evtchn['port2'], \ 7.52 + netif.mac[0],netif.mac[1], \ 7.53 + netif.mac[2],netif.mac[3], \ 7.54 + netif.mac[4],netif.mac[5], \ 7.55 + 0,0)) 7.56 + netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg) 7.57 + 7.58 +def backend_do_work(port): 7.59 + global pendmsg 7.60 + if pendmsg and port.space_to_write_request(): 7.61 + port.write_request(pendmsg) 7.62 + pendmsg = None 7.63 + return True 7.64 + return False 7.65 + 7.66 + 7.67 +class interface: 7.68 + 7.69 + # Dictionary of all network-device interfaces. 7.70 + list = {} 7.71 + 7.72 + 7.73 + # NB. 'key' is an opaque value that has no meaning in this class. 7.74 + def __init__(self, dom, key): 7.75 + self.dom = dom 7.76 + self.key = key 7.77 + self.pendmsg = None 7.78 + 7.79 + # VIFs get a random MAC address with a "special" vendor id. 7.80 + # 7.81 + # NB. The vendor is currently an "obsolete" one that used to belong 7.82 + # to DEC (AA-00-00). Using it is probably a bit rude :-) 7.83 + # 7.84 + # NB2. The first bit of the first random octet is set to zero for 7.85 + # all dynamic MAC addresses. This may allow us to manually specify 7.86 + # MAC addresses for some VIFs with no fear of clashes. 
7.87 + self.mac = [ 0xaa, 0x00, 0x00 ] 7.88 + self.mac.append(int(random.random()*128)) 7.89 + self.mac.append(int(random.random()*256)) 7.90 + self.mac.append(int(random.random()*256)) 7.91 + 7.92 + interface.list[key] = self 7.93 + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0) 7.94 + msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \ 7.95 + self.mac[0],self.mac[1], \ 7.96 + self.mac[2],self.mac[3], \ 7.97 + self.mac[4],self.mac[5], \ 7.98 + 0,0,0)) 7.99 + xend.netif.pendaddr = xend.main.mgmt_req_addr 7.100 + backend_tx_req(msg) 7.101 + 7.102 + 7.103 + # Completely destroy this interface. 7.104 + def destroy(self): 7.105 + del interface.list[self.key] 7.106 + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0) 7.107 + msg.append_payload(struct.pack("QII",self.dom,0,0)) 7.108 + backend_tx_req(msg) 7.109 + 7.110 + 7.111 + # The parameter @port is the control-interface event channel. This method 7.112 + # returns True if messages were written to the control interface. 7.113 + def ctrlif_transmit_work(self, port): 7.114 + if self.pendmsg and port.space_to_write_request(): 7.115 + port.write_request(self.pendmsg) 7.116 + self.pendmsg = None 7.117 + return True 7.118 + return False 7.119 + 7.120 + def ctrlif_tx_req(self, port, msg): 7.121 + if port.space_to_write_request(): 7.122 + port.write_request(msg) 7.123 + port.notify() 7.124 + else: 7.125 + self.pendmsg = msg 7.126 + 7.127 + def ctrlif_rx_req(self, port, msg): 7.128 + port.write_response(msg) 7.129 + subtype = (msg.get_header())['subtype'] 7.130 + if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED: 7.131 + msg = xend.utils.message(CMSG_NETIF_FE, \ 7.132 + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) 7.133 + msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \ 7.134 + self.mac[1],self.mac[2], \ 7.135 + self.mac[3],self.mac[4], \ 7.136 + self.mac[5],0,0)) 7.137 + self.ctrlif_tx_req(port, msg) 7.138 + elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT: 7.139 + (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload()) 7.140 + xc = Xc.new() 7.141 + self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom) 7.142 + msg = xend.utils.message(CMSG_NETIF_BE, \ 7.143 + CMSG_NETIF_BE_CONNECT, 0) 7.144 + msg.append_payload(struct.pack("QIILLI",self.dom,0, \ 7.145 + self.evtchn['port1'],tx_frame, \ 7.146 + rx_frame,0)) 7.147 + backend_tx_req(msg)
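The comment block in netif.py above describes the VIF MAC scheme: an AA:00:00 vendor prefix, with the top bit of the first random octet kept clear so that manually specified addresses need not clash with dynamically generated ones. A minimal standalone sketch of that scheme, for illustration only (it is not part of the changeset, and the RNG seeding is a placeholder):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
    /* AA:00:00 vendor prefix, as in tools/xend/lib/netif.py. */
    unsigned char mac[6] = { 0xaa, 0x00, 0x00, 0, 0, 0 };

    srand((unsigned)time(NULL));
    mac[3] = rand() % 128;   /* 0-127: top bit clear for dynamic MACs */
    mac[4] = rand() % 256;
    mac[5] = rand() % 256;

    printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
           mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
    return 0;
}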
--- a/xen/common/dom_mem_ops.c Thu May 06 14:53:19 2004 +0000
+++ b/xen/common/dom_mem_ops.c Fri May 07 14:53:28 2004 +0000
@@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_st
     {
         /* Leave some slack pages; e.g., for the network. */
         if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
-                                   (PAGE_SHIFT-10))) )
+                                   (PAGE_SHIFT-10))) )
+        {
+            DPRINTK("Not enough slack: %u %u\n",
+                    free_pfns,
+                    SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10));
             break;
+        }
 
         /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
         if ( unlikely((page = alloc_domain_page(p)) == NULL) )
+        {
+            DPRINTK("Could not allocate a frame\n");
             break;
-
+        }
+
         /* Inform the domain of the new page's machine address. */
         mpfn = (unsigned long)(page - frame_table);
         copy_to_user(op.pages, &mpfn, sizeof(mpfn));
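A quick aside on the units in the slack check above: free_pfns counts page frames while SLACK_DOMAIN_MEM_KILOBYTES is in kilobytes, so the right-shift by (PAGE_SHIFT-10) converts kilobytes into frames. A minimal sketch of that arithmetic, assuming 4 kB frames (PAGE_SHIFT == 12); the 2048 kB figure is purely illustrative:

#include <stdio.h>

#define PAGE_SHIFT 12  /* assumption: 4 kB page frames */

int main(void)
{
    unsigned int slack_kb = 2048;  /* hypothetical SLACK_DOMAIN_MEM_KILOBYTES */
    /* >> (PAGE_SHIFT-10) divides kilobytes by the frame size in kB (here 4). */
    printf("%u kB = %u page frames\n", slack_kb, slack_kb >> (PAGE_SHIFT - 10));
    return 0;
}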
--- a/xen/common/domain.c Thu May 06 14:53:19 2004 +0000
+++ b/xen/common/domain.c Fri May 07 14:53:28 2004 +0000
@@ -334,6 +334,8 @@ struct pfn_info *alloc_domain_page(struc
     spin_lock(&p->page_list_lock);
     if ( unlikely(p->tot_pages >= p->max_pages) )
     {
+        DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
+                p->domain, p->tot_pages, p->max_pages);
         spin_unlock(&p->page_list_lock);
         goto free_and_exit;
     }
@@ -884,7 +886,7 @@ int construct_dom0(struct task_struct *p
         page->type_and_flags = 0;
         page->count_and_flags = PGC_allocated | 1;
         list_add_tail(&page->list, &p->page_list);
-        p->tot_pages++;
+        p->tot_pages++; p->max_pages++;
     }
 
     mpt_alloc = (vpt_start - v_start) + alloc_start;
--- a/xen/common/kernel.c Thu May 06 14:53:19 2004 +0000
+++ b/xen/common/kernel.c Fri May 07 14:53:28 2004 +0000
@@ -105,7 +105,6 @@ static struct {
 void cmain(unsigned long magic, multiboot_info_t *mbi)
 {
     struct task_struct *new_dom;
-    dom0_createdomain_t dom0_params;
     unsigned long max_page;
     unsigned char *cmdline;
     module_t *mod = (module_t *)__va(mbi->mods_addr);
@@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboo
     task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
 
     /* Create initial domain 0. */
-    dom0_params.memory_kb = opt_dom0_mem;
     new_dom = do_createdomain(0, 0);
     if ( new_dom == NULL )
         panic("Error creating domain 0\n");
--- a/xen/common/memory.c Thu May 06 14:53:19 2004 +0000
+++ b/xen/common/memory.c Fri May 07 14:53:28 2004 +0000
@@ -940,17 +940,25 @@ static int do_extended_command(unsigned
         }
         break;
 
+        /* XXX This function is racey! */
     case MMUEXT_REASSIGN_PAGE:
-        if ( !IS_PRIV(current) )
+        if ( unlikely(!IS_PRIV(current)) )
         {
             MEM_LOG("Dom %llu has no privilege to reassign page ownership",
                     current->domain);
             okay = 0;
         }
-        else if ( percpu_info[cpu].gps != NULL )
+        else if ( likely(percpu_info[cpu].gps != NULL) )
         {
+            current->tot_pages--;
+            percpu_info[cpu].gps->tot_pages++;
             page->u.domain = percpu_info[cpu].gps;
         }
+        else
+        {
+            MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn);
+            okay = 0;
+        }
         break;
 
     case MMUEXT_RESET_SUBJECTDOM:
--- a/xenolinux-2.4.26-sparse/arch/xen/config.in Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/config.in Fri May 07 14:53:28 2004 +0000
@@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
    bool 'HIGHMEM I/O support' CONFIG_HIGHIO
 fi
 
+define_int CONFIG_FORCE_MAX_ZONEORDER 12
+
 #bool 'Symmetric multi-processing support' CONFIG_SMP
 #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
 #   define_bool CONFIG_HAVE_DEC_LOCK y
--- a/xenolinux-2.4.26-sparse/arch/xen/defconfig Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig Fri May 07 14:53:28 2004 +0000
@@ -50,6 +50,7 @@ CONFIG_X86_TSC=y
 CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
 
 #
 # General setup
@@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
+CONFIG_NETDEVICES=y
 
 #
 # Block devices
--- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev Fri May 07 14:53:28 2004 +0000
@@ -51,6 +51,7 @@ CONFIG_X86_TSC=y
 CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
 
 #
 # General setup
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h Fri May 07 14:53:28 2004 +0000
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <asm/ctrl_if.h>
 #include <asm/io.h>
 #include "../blkif.h"
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c Fri May 07 14:53:28 2004 +0000
@@ -74,7 +74,8 @@ void blkif_ctrlif_init(void)
     ctrl_msg_t cmsg;
     blkif_be_driver_status_changed_t st;
 
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type = CMSG_BLKIF_BE;
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c Fri May 07 14:53:28 2004 +0000
@@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *cre
     unsigned int handle = create->blkif_handle;
     blkif_t **pblkif, *blkif;
 
-    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL )
+    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
     {
         DPRINTK("Could not create blkif: out of memory\n");
         create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
18.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c Thu May 06 14:53:19 2004 +0000 18.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c Fri May 07 14:53:28 2004 +0000 18.3 @@ -24,17 +24,15 @@ 18.4 #define MAX_PENDING_REQS 64 18.5 #define BATCH_PER_DOMAIN 16 18.6 18.7 -static struct vm_struct *mmap_vma; 18.8 -#define MMAP_PAGES_PER_SEGMENT \ 18.9 - ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1) 18.10 +static unsigned long mmap_vstart; 18.11 #define MMAP_PAGES_PER_REQUEST \ 18.12 - (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT) 18.13 + (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) 18.14 #define MMAP_PAGES \ 18.15 (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) 18.16 -#define MMAP_VADDR(_req,_seg) \ 18.17 - ((unsigned long)mmap_vma->addr + \ 18.18 +#define MMAP_VADDR(_req,_seg) \ 18.19 + (mmap_vstart + \ 18.20 ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ 18.21 - ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE)) 18.22 + ((_seg) * PAGE_SIZE)) 18.23 18.24 /* 18.25 * Each outstanding request that we've passed to the lower device layers has a 18.26 @@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blki 18.27 prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW); 18.28 for ( i = 0; i < req->nr_segments; i++ ) 18.29 { 18.30 - if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) ) 18.31 + /* Make sure the buffer is page-sized. */ 18.32 + if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) || 18.33 + (blkif_last_sect(req->frame_and_sects[i]) != 7) ) 18.34 goto bad_descriptor; 18.35 rc = direct_remap_area_pages(&init_mm, 18.36 MMAP_VADDR(pending_idx, i), 18.37 - req->buffer_and_sects[i] & PAGE_MASK, 18.38 + req->frame_and_sects[i] & PAGE_MASK, 18.39 PAGE_SIZE, prot, blkif->domid); 18.40 if ( rc != 0 ) 18.41 goto bad_descriptor; 18.42 @@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t 18.43 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 18.44 struct buffer_head *bh; 18.45 int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; 18.46 - unsigned short nr_sects; 18.47 - unsigned long buffer; 18.48 + short nr_sects; 18.49 + unsigned long buffer, fas; 18.50 int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; 18.51 pending_req_t *pending_req; 18.52 pgprot_t prot; 18.53 18.54 /* We map virtual scatter/gather segments to physical segments. */ 18.55 int new_segs, nr_psegs = 0; 18.56 - phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; 18.57 + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; 18.58 18.59 /* Check that number of segments is sane. 
*/ 18.60 if ( unlikely(req->nr_segments == 0) || 18.61 @@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t 18.62 */ 18.63 for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) 18.64 { 18.65 - buffer = req->buffer_and_sects[i] & ~0x1FF; 18.66 - nr_sects = req->buffer_and_sects[i] & 0x1FF; 18.67 - 18.68 - if ( unlikely(nr_sects == 0) ) 18.69 - continue; 18.70 + fas = req->frame_and_sects[i]; 18.71 + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); 18.72 + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; 18.73 18.74 - if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) ) 18.75 - { 18.76 - DPRINTK("Too many sectors in segment\n"); 18.77 + if ( nr_sects <= 0 ) 18.78 goto bad_descriptor; 18.79 - } 18.80 18.81 phys_seg[nr_psegs].dev = req->device; 18.82 phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; 18.83 @@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t 18.84 } 18.85 18.86 nr_psegs += new_segs; 18.87 - ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2); 18.88 + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); 18.89 } 18.90 18.91 /* Nonsensical zero-sized request? */ 18.92 @@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t 18.93 18.94 for ( i = 0; i < nr_psegs; i++ ) 18.95 { 18.96 - unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 18.97 - (phys_seg[i].nr_sects << 9) + 18.98 - (PAGE_SIZE - 1)) & PAGE_MASK; 18.99 int rc = direct_remap_area_pages(&init_mm, 18.100 MMAP_VADDR(pending_idx, i), 18.101 phys_seg[i].buffer & PAGE_MASK, 18.102 - sz, prot, blkif->domid); 18.103 + PAGE_SIZE, prot, blkif->domid); 18.104 if ( rc != 0 ) 18.105 { 18.106 DPRINTK("invalid buffer\n"); 18.107 @@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t 18.108 MMAP_PAGES_PER_REQUEST * PAGE_SIZE); 18.109 goto bad_descriptor; 18.110 } 18.111 + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = 18.112 + phys_seg[i].buffer >> PAGE_SHIFT; 18.113 } 18.114 18.115 pending_req = &pending_reqs[pending_idx]; 18.116 @@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t 18.117 bh->b_rsector = (unsigned long)phys_seg[i].sector_number; 18.118 bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + 18.119 (phys_seg[i].buffer & ~PAGE_MASK); 18.120 +// bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); 18.121 bh->b_end_io = end_block_io_op; 18.122 bh->b_private = pending_req; 18.123 18.124 @@ -456,13 +451,13 @@ static int __init init_module(void) 18.125 { 18.126 int i; 18.127 18.128 + if ( !(start_info.flags & SIF_INITDOMAIN) ) 18.129 + return 0; 18.130 + 18.131 blkif_interface_init(); 18.132 18.133 - if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL ) 18.134 - { 18.135 - printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n"); 18.136 - return -ENOMEM; 18.137 - } 18.138 + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) 18.139 + BUG(); 18.140 18.141 pending_cons = 0; 18.142 pending_prod = MAX_PENDING_REQS; 18.143 @@ -484,6 +479,7 @@ static int __init init_module(void) 18.144 18.145 static void cleanup_module(void) 18.146 { 18.147 + BUG(); 18.148 } 18.149 18.150 module_init(init_module);
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c Fri May 07 14:53:28 2004 +0000
@@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *c
         }
     }
 
-    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) )
+    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_create: out of memory\n");
         create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
@@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow)
     }
 
     if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t),
-                               GFP_ATOMIC)) == NULL) )
+                               GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_grow: out of memory\n");
         grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h Fri May 07 14:53:28 2004 +0000
@@ -26,19 +26,22 @@
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
 
-#define BLKIF_MAX_SECTORS_PER_SEGMENT  16
-
 typedef struct {
     u8             operation;     /* BLKIF_OP_???                         */
     u8             nr_segments;   /* number of segments                   */
     blkif_vdev_t   device;        /* only for read/write requests         */
     unsigned long  id;            /* private guest value, echoed in resp  */
     blkif_sector_t sector_number; /* start sector idx on disk (r/w only)  */
-    /* Least 9 bits is 'nr_sects'. High 23 bits is the address.           */
-    /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'.     */
-    unsigned long  buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */
+    /* @first_sect: first sector in frame to transfer (inclusive).         */
+    /* @last_sect: last sector in frame to transfer (inclusive).           */
+    /* @frame: machine page frame number.                                  */
+    unsigned long  frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } blkif_request_t;
 
+#define blkif_first_sect(_fas) (((_fas)>>3)&7)
+#define blkif_last_sect(_fas)  ((_fas)&7)
+
 typedef struct {
     unsigned long   id;              /* copied from request */
     u8              operation;       /* copied from request */
@@ -79,8 +82,8 @@ typedef struct {
  *   @device           == unused (zero)
 *   @id               == any value (echoed in response message)
 *   @sector_num       == unused (zero)
- *   @buffer_and_sects == list of page-aligned, page-sized buffers.
- *                        (i.e., nr_sects == 8).
+ *   @frame_and_sects  == list of page-sized buffers.
+ *                        (i.e., @first_sect == 0, @last_sect == 7).
 *
 * The response is a list of vdisk_t elements copied into the out-of-band
 * probe buffer. On success the response status field contains the number
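The new frame_and_sects encoding above packs the machine frame and the first/last sector numbers into one word per segment. A minimal sketch of packing a whole-page segment the way the frontend code in this changeset does (buffer_ma | (fsect<<3) | lsect) and unpacking it with the blkif.h macros; the buffer address and the 0xfff mask (which assumes 4 kB pages in place of PAGE_MASK) are illustrative assumptions, not values from the changeset:

#include <stdio.h>

/* Macros as defined in blkif.h above. */
#define blkif_first_sect(_fas) (((_fas)>>3)&7)
#define blkif_last_sect(_fas)  ((_fas)&7)

int main(void)
{
    /* Hypothetical machine address of a page-aligned, page-sized buffer. */
    unsigned long buffer_ma  = 0x12345000UL;
    unsigned long first_sect = 0;   /* first 512-byte sector in the frame */
    unsigned long last_sect  = 7;   /* last sector: the whole 4 kB page   */

    /* Pack: frame | (first_sect<<3) | last_sect. */
    unsigned long fas = (buffer_ma & ~0xfffUL) | (first_sect << 3) | last_sect;

    /* Unpack with the blkif.h macros. */
    printf("frame=%#lx first=%lu last=%lu\n",
           fas & ~0xfffUL, blkif_first_sect(fas), blkif_last_sect(fas));
    return 0;
}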
21.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c Thu May 06 14:53:19 2004 +0000 21.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c Fri May 07 14:53:28 2004 +0000 21.3 @@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linu 21.4 static unsigned int blkif_state = BLKIF_STATE_CLOSED; 21.5 static unsigned int blkif_evtchn, blkif_irq; 21.6 21.7 -static struct tq_struct blkif_statechange_tq; 21.8 - 21.9 static int blkif_control_rsp_valid; 21.10 static blkif_response_t blkif_control_rsp; 21.11 21.12 @@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned 21.13 struct gendisk *gd; 21.14 blkif_request_t *req; 21.15 struct buffer_head *bh; 21.16 + unsigned int fsect, lsect; 21.17 21.18 - if ( unlikely(nr_sectors >= (1<<9)) ) 21.19 - BUG(); 21.20 + fsect = (buffer_ma & ~PAGE_MASK) >> 9; 21.21 + lsect = fsect + nr_sectors - 1; 21.22 + 21.23 + /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ 21.24 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) 21.25 BUG(); 21.26 + if ( lsect > 7 ) 21.27 + BUG(); 21.28 + 21.29 + buffer_ma &= PAGE_MASK; 21.30 21.31 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) 21.32 return 1; 21.33 @@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned 21.34 bh = (struct buffer_head *)id; 21.35 bh->b_reqnext = (struct buffer_head *)req->id; 21.36 req->id = id; 21.37 - req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; 21.38 - if ( ++req->nr_segments < MAX_BLK_SEGS ) 21.39 + req->frame_and_sects[req->nr_segments] = 21.40 + buffer_ma | (fsect<<3) | lsect; 21.41 + if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) 21.42 sg_next_sect += nr_sectors; 21.43 else 21.44 DISABLE_SCATTERGATHER(); 21.45 @@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned 21.46 req->sector_number = (blkif_sector_t)sector_number; 21.47 req->device = device; 21.48 req->nr_segments = 1; 21.49 - req->buffer_and_sects[0] = buffer_ma | nr_sectors; 21.50 + req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; 21.51 req_prod++; 21.52 21.53 return 0; 21.54 @@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t 21.55 } 21.56 21.57 21.58 -static void blkif_bringup_phase1(void *unused) 21.59 +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) 21.60 { 21.61 ctrl_msg_t cmsg; 21.62 blkif_fe_interface_connect_t up; 21.63 21.64 - /* Move from CLOSED to DISCONNECTED state. */ 21.65 - blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); 21.66 - blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; 21.67 - blkif_state = BLKIF_STATE_DISCONNECTED; 21.68 - 21.69 - /* Construct an interface-CONNECT message for the domain controller. */ 21.70 - cmsg.type = CMSG_BLKIF_FE; 21.71 - cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; 21.72 - cmsg.length = sizeof(blkif_fe_interface_connect_t); 21.73 - up.handle = 0; 21.74 - up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; 21.75 - memcpy(cmsg.msg, &up, sizeof(up)); 21.76 - 21.77 - /* Tell the controller to bring up the interface. */ 21.78 - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 21.79 -} 21.80 - 21.81 -static void blkif_bringup_phase2(void *unused) 21.82 -{ 21.83 - blkif_irq = bind_evtchn_to_irq(blkif_evtchn); 21.84 - (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); 21.85 - 21.86 - /* Probe for discs that are attached to the interface. */ 21.87 - xlvbd_init(); 21.88 - 21.89 - blkif_state = BLKIF_STATE_CONNECTED; 21.90 - 21.91 - /* Kick pending requests. 
*/ 21.92 - spin_lock_irq(&io_request_lock); 21.93 - kick_pending_request_queues(); 21.94 - spin_unlock_irq(&io_request_lock); 21.95 -} 21.96 - 21.97 -static void blkif_status_change(blkif_fe_interface_status_changed_t *status) 21.98 -{ 21.99 if ( status->handle != 0 ) 21.100 { 21.101 printk(KERN_WARNING "Status change on unsupported blkif %d\n", 21.102 @@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe 21.103 " in state %d\n", blkif_state); 21.104 break; 21.105 } 21.106 - blkif_statechange_tq.routine = blkif_bringup_phase1; 21.107 - schedule_task(&blkif_statechange_tq); 21.108 + 21.109 + /* Move from CLOSED to DISCONNECTED state. */ 21.110 + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); 21.111 + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; 21.112 + blkif_state = BLKIF_STATE_DISCONNECTED; 21.113 + 21.114 + /* Construct an interface-CONNECT message for the domain controller. */ 21.115 + cmsg.type = CMSG_BLKIF_FE; 21.116 + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; 21.117 + cmsg.length = sizeof(blkif_fe_interface_connect_t); 21.118 + up.handle = 0; 21.119 + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; 21.120 + memcpy(cmsg.msg, &up, sizeof(up)); 21.121 + 21.122 + /* Tell the controller to bring up the interface. */ 21.123 + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 21.124 break; 21.125 21.126 case BLKIF_INTERFACE_STATUS_CONNECTED: 21.127 @@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe 21.128 " in state %d\n", blkif_state); 21.129 break; 21.130 } 21.131 + 21.132 blkif_evtchn = status->evtchn; 21.133 - blkif_statechange_tq.routine = blkif_bringup_phase2; 21.134 - schedule_task(&blkif_statechange_tq); 21.135 + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); 21.136 + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); 21.137 + 21.138 + /* Probe for discs that are attached to the interface. */ 21.139 + xlvbd_init(); 21.140 + 21.141 + blkif_state = BLKIF_STATE_CONNECTED; 21.142 + 21.143 + /* Kick pending requests. */ 21.144 + spin_lock_irq(&io_request_lock); 21.145 + kick_pending_request_queues(); 21.146 + spin_unlock_irq(&io_request_lock); 21.147 break; 21.148 21.149 default: 21.150 @@ -675,7 +671,11 @@ int __init xlblk_init(void) 21.151 ctrl_msg_t cmsg; 21.152 blkif_fe_driver_status_changed_t st; 21.153 21.154 - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx); 21.155 + if ( start_info.flags & SIF_INITDOMAIN ) 21.156 + return 0; 21.157 + 21.158 + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, 21.159 + CALLBACK_IN_BLOCKING_CONTEXT); 21.160 21.161 /* Send a driver-UP notification to the domain controller. */ 21.162 cmsg.type = CMSG_BLKIF_FE;
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c Fri May 07 14:53:28 2004 +0000
@@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *d
     memset(&req, 0, sizeof(req));
     req.operation = BLKIF_OP_PROBE;
     req.nr_segments = 1;
-    req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
+    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
 
     blkif_control_send(&req, &rsp);
 
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c Fri May 07 14:53:28 2004 +0000
@@ -513,7 +513,7 @@ static int __init xencons_init(void)
     }
     else
     {
-        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx);
+        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0);
     }
 
     printk("Xen virtual console successfully installed\n");
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c Thu May 06 14:53:19 2004 +0000
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c Fri May 07 14:53:28 2004 +0000
@@ -10,8 +10,6 @@
 
 static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
 {
-    DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype);
-
     switch ( msg->subtype )
     {
     case CMSG_NETIF_BE_CREATE:
@@ -54,7 +52,8 @@ void netif_ctrlif_init(void)
     ctrl_msg_t cmsg;
     netif_be_driver_status_changed_t st;
 
-    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx);
+    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type = CMSG_NETIF_BE;
25.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c Thu May 06 14:53:19 2004 +0000 25.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c Fri May 07 14:53:28 2004 +0000 25.3 @@ -7,6 +7,7 @@ 25.4 */ 25.5 25.6 #include "common.h" 25.7 +#include <linux/rtnetlink.h> 25.8 25.9 #define NETIF_HASHSZ 1024 25.10 #define NETIF_HASH(_d,_h) \ 25.11 @@ -14,6 +15,7 @@ 25.12 25.13 static netif_t *netif_hash[NETIF_HASHSZ]; 25.14 static struct net_device *bridge_dev; 25.15 +static struct net_bridge *bridge_br; 25.16 25.17 netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) 25.18 { 25.19 @@ -36,8 +38,10 @@ void __netif_disconnect_complete(netif_t 25.20 */ 25.21 unbind_evtchn_from_irq(netif->evtchn); 25.22 vfree(netif->tx); /* Frees netif->rx as well. */ 25.23 - (void)br_del_if((struct net_bridge *)bridge_dev->priv, netif->dev); 25.24 + rtnl_lock(); 25.25 + (void)br_del_if(bridge_br, netif->dev); 25.26 (void)dev_close(netif->dev); 25.27 + rtnl_unlock(); 25.28 25.29 /* Construct the deferred response message. */ 25.30 cmsg.type = CMSG_NETIF_BE; 25.31 @@ -73,7 +77,7 @@ void netif_create(netif_be_create_t *cre 25.32 struct net_device *dev; 25.33 netif_t **pnetif, *netif; 25.34 25.35 - dev = alloc_netdev(sizeof(netif_t), "netif-be-%d", ether_setup); 25.36 + dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup); 25.37 if ( dev == NULL ) 25.38 { 25.39 DPRINTK("Could not create netif: out of memory\n"); 25.40 @@ -111,7 +115,10 @@ void netif_create(netif_be_create_t *cre 25.41 dev->hard_start_xmit = netif_be_start_xmit; 25.42 dev->get_stats = netif_be_get_stats; 25.43 memcpy(dev->dev_addr, create->mac, ETH_ALEN); 25.44 - 25.45 + 25.46 + /* XXX In bridge mode we should force a different MAC from remote end. */ 25.47 + dev->dev_addr[2] ^= 1; 25.48 + 25.49 if ( register_netdev(dev) != 0 ) 25.50 { 25.51 DPRINTK("Could not register new net device\n"); 25.52 @@ -225,15 +232,27 @@ void netif_connect(netif_be_connect_t *c 25.53 netif->status = CONNECTED; 25.54 netif_get(netif); 25.55 25.56 + rtnl_lock(); 25.57 + 25.58 (void)dev_open(netif->dev); 25.59 - (void)br_add_if((struct net_bridge *)bridge_dev->priv, netif->dev); 25.60 - /* At this point we try to ensure that eth0 is attached to the bridge. */ 25.61 + (void)br_add_if(bridge_br, netif->dev); 25.62 + 25.63 + /* 25.64 + * The default config is a very simple binding to eth0. 25.65 + * If eth0 is being used as an IP interface by this OS then someone 25.66 + * must add eth0's IP address to nbe-br, and change the routing table 25.67 + * to refer to nbe-br instead of eth0. 
25.68 + */ 25.69 + (void)dev_open(bridge_dev); 25.70 if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL ) 25.71 { 25.72 (void)dev_open(eth0_dev); 25.73 - (void)br_add_if((struct net_bridge *)bridge_dev->priv, eth0_dev); 25.74 + (void)br_add_if(bridge_br, eth0_dev); 25.75 } 25.76 - (void)request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif); 25.77 + 25.78 + rtnl_unlock(); 25.79 + 25.80 + (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif); 25.81 netif_start_queue(netif->dev); 25.82 25.83 connect->status = NETIF_BE_STATUS_OKAY; 25.84 @@ -271,8 +290,11 @@ int netif_disconnect(netif_be_disconnect 25.85 void netif_interface_init(void) 25.86 { 25.87 memset(netif_hash, 0, sizeof(netif_hash)); 25.88 - if ( br_add_bridge("netif-backend") != 0 ) 25.89 + if ( br_add_bridge("nbe-br") != 0 ) 25.90 BUG(); 25.91 - bridge_dev = __dev_get_by_name("netif-be-bridge"); 25.92 - (void)dev_open(bridge_dev); 25.93 + bridge_dev = __dev_get_by_name("nbe-br"); 25.94 + bridge_br = (struct net_bridge *)bridge_dev->priv; 25.95 + bridge_br->bridge_hello_time = bridge_br->hello_time = 0; 25.96 + bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0; 25.97 + bridge_br->stp_enabled = 0; 25.98 }
26.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c Thu May 06 14:53:19 2004 +0000 26.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c Fri May 07 14:53:28 2004 +0000 26.3 @@ -14,7 +14,7 @@ 26.4 #include <asm/hypervisor-ifs/dom_mem_ops.h> 26.5 26.6 static void net_tx_action(unsigned long unused); 26.7 -static void tx_skb_release(struct sk_buff *skb); 26.8 +static void netif_page_release(struct page *page); 26.9 static void make_tx_response(netif_t *netif, 26.10 u16 id, 26.11 s8 st); 26.12 @@ -30,13 +30,13 @@ static DECLARE_TASKLET(net_tx_tasklet, n 26.13 #define tx_work_exists(_if) (1) 26.14 26.15 #define MAX_PENDING_REQS 256 26.16 -unsigned long mmap_vstart; 26.17 +static unsigned long mmap_vstart; 26.18 #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) 26.19 26.20 #define PKT_PROT_LEN (ETH_HLEN + 20) 26.21 26.22 -/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/ 26.23 static u16 pending_id[MAX_PENDING_REQS]; 26.24 +static netif_t *pending_netif[MAX_PENDING_REQS]; 26.25 static u16 pending_ring[MAX_PENDING_REQS]; 26.26 static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; 26.27 typedef unsigned int PEND_RING_IDX; 26.28 @@ -60,8 +60,7 @@ static void __refresh_mfn_list(void) 26.29 op.u.increase.pages = mfn_list; 26.30 if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC ) 26.31 { 26.32 - printk(KERN_WARNING "Unable to increase memory reservation (%d)\n", 26.33 - ret); 26.34 + printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret); 26.35 BUG(); 26.36 } 26.37 alloc_index = MAX_MFN_ALLOC; 26.38 @@ -100,10 +99,10 @@ int netif_be_start_xmit(struct sk_buff * 26.39 { 26.40 netif_t *netif = (netif_t *)dev->priv; 26.41 s8 status = NETIF_RSP_OKAY; 26.42 - u16 size, id; 26.43 + u16 size=0, id; 26.44 mmu_update_t mmu[6]; 26.45 pgd_t *pgd; pmd_t *pmd; pte_t *pte; 26.46 - unsigned long vdata, new_mfn; 26.47 + unsigned long vdata, mdata=0, new_mfn; 26.48 26.49 /* Drop the packet if the target domain has no receive buffers. 
*/ 26.50 if ( (netif->rx_req_cons == netif->rx->req_prod) || 26.51 @@ -126,16 +125,23 @@ int netif_be_start_xmit(struct sk_buff * 26.52 (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) || 26.53 ((skb->end - skb->head) < (PAGE_SIZE/2)) ) 26.54 { 26.55 - struct sk_buff *nskb = dev_alloc_skb(PAGE_SIZE-1024); 26.56 + struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC); 26.57 int hlen = skb->data - skb->head; 26.58 + if ( unlikely(nskb == NULL) ) 26.59 + { 26.60 + DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid); 26.61 + status = NETIF_RSP_ERROR; 26.62 + goto out; 26.63 + } 26.64 skb_reserve(nskb, hlen); 26.65 - skb_put(nskb, skb->len); 26.66 + __skb_put(nskb, skb->len); 26.67 (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len); 26.68 dev_kfree_skb(skb); 26.69 skb = nskb; 26.70 } 26.71 26.72 vdata = (unsigned long)skb->data; 26.73 + mdata = virt_to_machine(vdata); 26.74 size = skb->tail - skb->data; 26.75 26.76 new_mfn = get_new_mfn(); 26.77 @@ -153,7 +159,7 @@ int netif_be_start_xmit(struct sk_buff * 26.78 mmu[1].ptr |= MMU_EXTENDED_COMMAND; 26.79 mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H; 26.80 26.81 - mmu[2].ptr = virt_to_machine(vdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; 26.82 + mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; 26.83 mmu[2].val = MMUEXT_REASSIGN_PAGE; 26.84 26.85 mmu[3].ptr = MMU_EXTENDED_COMMAND; 26.86 @@ -167,6 +173,7 @@ int netif_be_start_xmit(struct sk_buff * 26.87 26.88 if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) ) 26.89 { 26.90 + DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid); 26.91 dealloc_mfn(new_mfn); 26.92 status = NETIF_RSP_ERROR; 26.93 goto out; 26.94 @@ -174,12 +181,12 @@ int netif_be_start_xmit(struct sk_buff * 26.95 26.96 phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn; 26.97 26.98 - netif->stats.tx_bytes += size; 26.99 - netif->stats.tx_packets++; 26.100 + netif->stats.rx_bytes += size; 26.101 + netif->stats.rx_packets++; 26.102 26.103 out: 26.104 spin_lock(&netif->rx_lock); 26.105 - make_rx_response(netif, id, status, virt_to_machine(vdata), size); 26.106 + make_rx_response(netif, id, status, mdata, size); 26.107 spin_unlock(&netif->rx_lock); 26.108 dev_kfree_skb(skb); 26.109 return 0; 26.110 @@ -220,6 +227,16 @@ static void add_to_net_schedule_list_tai 26.111 spin_unlock(&net_schedule_list_lock); 26.112 } 26.113 26.114 +static inline void netif_schedule_work(netif_t *netif) 26.115 +{ 26.116 + if ( (netif->tx_req_cons != netif->tx->req_prod) && 26.117 + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) 26.118 + { 26.119 + add_to_net_schedule_list_tail(netif); 26.120 + maybe_schedule_tx_action(); 26.121 + } 26.122 +} 26.123 + 26.124 void netif_deschedule(netif_t *netif) 26.125 { 26.126 remove_from_net_schedule_list(netif); 26.127 @@ -229,14 +246,8 @@ void netif_deschedule(netif_t *netif) 26.128 static void tx_credit_callback(unsigned long data) 26.129 { 26.130 netif_t *netif = (netif_t *)data; 26.131 - 26.132 netif->remaining_credit = netif->credit_bytes; 26.133 - 26.134 - if ( tx_work_exists(netif) ) 26.135 - { 26.136 - add_to_net_schedule_list_tail(netif); 26.137 - maybe_schedule_tx_action(); 26.138 - } 26.139 + netif_schedule_work(netif); 26.140 } 26.141 #endif 26.142 26.143 @@ -249,6 +260,7 @@ static void net_tx_action(unsigned long 26.144 u16 pending_idx; 26.145 NETIF_RING_IDX i; 26.146 pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED); 26.147 + struct page *page; 26.148 26.149 while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && 
26.150 !list_empty(&net_schedule_list) ) 26.151 @@ -261,7 +273,7 @@ static void net_tx_action(unsigned long 26.152 26.153 /* Work to do? */ 26.154 i = netif->tx_req_cons; 26.155 - if ( (i == netif->tx->req_prod) && 26.156 + if ( (i == netif->tx->req_prod) || 26.157 ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) 26.158 { 26.159 netif_put(netif); 26.160 @@ -296,7 +308,7 @@ static void net_tx_action(unsigned long 26.161 netif->remaining_credit -= tx.size; 26.162 #endif 26.163 26.164 - add_to_net_schedule_list_tail(netif); 26.165 + netif_schedule_work(netif); 26.166 26.167 if ( unlikely(txreq.size <= PKT_PROT_LEN) || 26.168 unlikely(txreq.size > ETH_FRAME_LEN) ) 26.169 @@ -335,6 +347,7 @@ static void net_tx_action(unsigned long 26.170 26.171 if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) ) 26.172 { 26.173 + DPRINTK("Can't allocate a skb in start_xmit.\n"); 26.174 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); 26.175 netif_put(netif); 26.176 vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); 26.177 @@ -346,29 +359,29 @@ static void net_tx_action(unsigned long 26.178 (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), 26.179 PKT_PROT_LEN); 26.180 26.181 - skb->dev = netif->dev; 26.182 - skb->protocol = eth_type_trans(skb, skb->dev); 26.183 - 26.184 + page = virt_to_page(MMAP_VADDR(pending_idx)); 26.185 + 26.186 /* Append the packet payload as a fragment. */ 26.187 - skb_shinfo(skb)->frags[0].page = 26.188 - virt_to_page(MMAP_VADDR(pending_idx)); 26.189 - skb_shinfo(skb)->frags[0].size = 26.190 - txreq.size - PKT_PROT_LEN; 26.191 + skb_shinfo(skb)->frags[0].page = page; 26.192 + skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN; 26.193 skb_shinfo(skb)->frags[0].page_offset = 26.194 (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK; 26.195 skb_shinfo(skb)->nr_frags = 1; 26.196 skb->data_len = txreq.size - PKT_PROT_LEN; 26.197 skb->len += skb->data_len; 26.198 26.199 - /* Destructor information. */ 26.200 - skb->destructor = tx_skb_release; 26.201 - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif; 26.202 - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx; 26.203 + skb->dev = netif->dev; 26.204 + skb->protocol = eth_type_trans(skb, skb->dev); 26.205 26.206 - netif->stats.rx_bytes += txreq.size; 26.207 - netif->stats.rx_packets++; 26.208 + /* Destructor information. */ 26.209 + atomic_set(&page->count, 1); 26.210 + page->mapping = (struct address_space *)netif_page_release; 26.211 + pending_id[pending_idx] = txreq.id; 26.212 + pending_netif[pending_idx] = netif; 26.213 26.214 - pending_id[pending_idx] = txreq.id; 26.215 + netif->stats.tx_bytes += txreq.size; 26.216 + netif->stats.tx_packets++; 26.217 + 26.218 pending_cons++; 26.219 26.220 netif_rx(skb); 26.221 @@ -376,28 +389,34 @@ static void net_tx_action(unsigned long 26.222 } 26.223 } 26.224 26.225 -/* Destructor function for tx skbs. 
*/ 26.226 -static void tx_skb_release(struct sk_buff *skb) 26.227 +static void netif_page_release(struct page *page) 26.228 { 26.229 unsigned long flags; 26.230 - netif_t *netif = (netif_t *)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page; 26.231 - u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size; 26.232 + netif_t *netif; 26.233 + u16 pending_idx; 26.234 + 26.235 + pending_idx = page - virt_to_page(mmap_vstart); 26.236 + 26.237 + netif = pending_netif[pending_idx]; 26.238 26.239 vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); 26.240 - 26.241 - skb_shinfo(skb)->nr_frags = 0; 26.242 - 26.243 + 26.244 spin_lock(&netif->tx_lock); 26.245 make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY); 26.246 spin_unlock(&netif->tx_lock); 26.247 - 26.248 + 26.249 + /* 26.250 + * Scheduling checks must happen after the above response is posted. 26.251 + * This avoids a possible race with a guest OS on another CPU. 26.252 + */ 26.253 + mb(); 26.254 + netif_schedule_work(netif); 26.255 + 26.256 netif_put(netif); 26.257 26.258 spin_lock_irqsave(&pend_prod_lock, flags); 26.259 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; 26.260 spin_unlock_irqrestore(&pend_prod_lock, flags); 26.261 - 26.262 - maybe_schedule_tx_action(); 26.263 } 26.264 26.265 #if 0 26.266 @@ -493,9 +512,26 @@ static void make_rx_response(netif_t 26.267 26.268 static int __init init_module(void) 26.269 { 26.270 + int i; 26.271 + 26.272 + if ( !(start_info.flags & SIF_INITDOMAIN) ) 26.273 + return 0; 26.274 + 26.275 netif_interface_init(); 26.276 - mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS); 26.277 + 26.278 + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) 26.279 + BUG(); 26.280 + 26.281 + pending_cons = 0; 26.282 + pending_prod = MAX_PENDING_REQS; 26.283 + for ( i = 0; i < MAX_PENDING_REQS; i++ ) 26.284 + pending_ring[i] = i; 26.285 + 26.286 + spin_lock_init(&net_schedule_list_lock); 26.287 + INIT_LIST_HEAD(&net_schedule_list); 26.288 + 26.289 netif_ctrlif_init(); 26.290 + 26.291 return 0; 26.292 } 26.293
27.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c Thu May 06 14:53:19 2004 +0000 27.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c Fri May 07 14:53:28 2004 +0000 27.3 @@ -25,20 +25,18 @@ 27.4 #include <net/sock.h> 27.5 #include <net/pkt_sched.h> 27.6 27.7 -#include "../netif.h" 27.8 +#include <asm/evtchn.h> 27.9 +#include <asm/ctrl_if.h> 27.10 +#include <asm/hypervisor-ifs/dom_mem_ops.h> 27.11 27.12 -static struct tq_struct netif_statechange_tq; 27.13 +#include "../netif.h" 27.14 27.15 #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ 27.16 27.17 -static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); 27.18 static void network_tx_buf_gc(struct net_device *dev); 27.19 static void network_alloc_rx_buffers(struct net_device *dev); 27.20 static void cleanup_module(void); 27.21 27.22 -/* Dynamically-mapped IRQs. */ 27.23 -static int network_irq, debug_irq; 27.24 - 27.25 static struct list_head dev_list; 27.26 27.27 struct net_private 27.28 @@ -47,7 +45,7 @@ struct net_private 27.29 struct net_device *dev; 27.30 27.31 struct net_device_stats stats; 27.32 - NET_RING_IDX rx_resp_cons, tx_resp_cons; 27.33 + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; 27.34 unsigned int tx_full; 27.35 27.36 netif_tx_interface_t *tx; 27.37 @@ -69,8 +67,8 @@ struct net_private 27.38 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each 27.39 * array is an index into a chain of free entries. 27.40 */ 27.41 - struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1]; 27.42 - struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1]; 27.43 + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; 27.44 + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; 27.45 }; 27.46 27.47 /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ 27.48 @@ -91,7 +89,7 @@ static struct net_device *find_dev_by_ha 27.49 { 27.50 np = list_entry(ent, struct net_private, list); 27.51 if ( np->handle == handle ) 27.52 - return np; 27.53 + return np->dev; 27.54 } 27.55 return NULL; 27.56 } 27.57 @@ -100,8 +98,7 @@ static struct net_device *find_dev_by_ha 27.58 static int network_open(struct net_device *dev) 27.59 { 27.60 struct net_private *np = dev->priv; 27.61 - netop_t netop; 27.62 - int i, ret; 27.63 + int i; 27.64 27.65 if ( np->state != NETIF_STATE_CONNECTED ) 27.66 return -EINVAL; 27.67 @@ -111,15 +108,16 @@ static int network_open(struct net_devic 27.68 spin_lock_init(&np->tx_lock); 27.69 27.70 /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/ 27.71 - for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ ) 27.72 + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) 27.73 np->tx_skbs[i] = (void *)(i+1); 27.74 - for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ ) 27.75 + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) 27.76 np->rx_skbs[i] = (void *)(i+1); 27.77 27.78 wmb(); 27.79 np->state = NETIF_STATE_ACTIVE; 27.80 27.81 network_alloc_rx_buffers(dev); 27.82 + np->rx->event = np->rx_resp_cons + 1; 27.83 27.84 netif_start_queue(dev); 27.85 27.86 @@ -131,18 +129,17 @@ static int network_open(struct net_devic 27.87 27.88 static void network_tx_buf_gc(struct net_device *dev) 27.89 { 27.90 - NET_RING_IDX i, prod; 27.91 + NETIF_RING_IDX i, prod; 27.92 unsigned short id; 27.93 struct net_private *np = dev->priv; 27.94 struct sk_buff *skb; 27.95 - tx_entry_t *tx_ring = np->net_ring->tx_ring; 27.96 27.97 do { 27.98 - prod = np->net_idx->tx_resp_prod; 27.99 + prod = np->tx->resp_prod; 27.100 27.101 for ( i = np->tx_resp_cons; i != prod; i++ ) 27.102 { 27.103 - id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; 27.104 + id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id; 27.105 skb = np->tx_skbs[id]; 27.106 ADD_ID_TO_FREELIST(np->tx_skbs, id); 27.107 dev_kfree_skb_any(skb); 27.108 @@ -158,14 +155,14 @@ static void network_tx_buf_gc(struct net 27.109 * in such cases notification from Xen is likely to be the only kick 27.110 * that we'll get. 27.111 */ 27.112 - np->net_idx->tx_event = 27.113 - prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; 27.114 + np->tx->event = 27.115 + prod + ((np->tx->req_prod - prod) >> 1) + 1; 27.116 mb(); 27.117 } 27.118 - while ( prod != np->net_idx->tx_resp_prod ); 27.119 + while ( prod != np->tx->resp_prod ); 27.120 27.121 if ( np->tx_full && 27.122 - ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) ) 27.123 + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) 27.124 { 27.125 np->tx_full = 0; 27.126 if ( np->state == NETIF_STATE_ACTIVE ) 27.127 @@ -189,10 +186,14 @@ static void network_alloc_rx_buffers(str 27.128 unsigned short id; 27.129 struct net_private *np = dev->priv; 27.130 struct sk_buff *skb; 27.131 - netop_t netop; 27.132 - NET_RING_IDX i = np->net_idx->rx_req_prod; 27.133 + NETIF_RING_IDX i = np->rx->req_prod; 27.134 + dom_mem_op_t op; 27.135 + unsigned long pfn_array[NETIF_RX_RING_SIZE]; 27.136 + int ret, nr_pfns = 0; 27.137 + pte_t *pte; 27.138 27.139 - if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || 27.140 + /* Make sure the batch is large enough to be worthwhile (1/2 ring). 
*/ 27.141 + if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 27.142 unlikely(np->state != NETIF_STATE_ACTIVE) ) 27.143 return; 27.144 27.145 @@ -209,13 +210,13 @@ static void network_alloc_rx_buffers(str 27.146 id = GET_ID_FROM_FREELIST(np->rx_skbs); 27.147 np->rx_skbs[id] = skb; 27.148 27.149 - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; 27.150 - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 27.151 - virt_to_machine(get_ppte(skb->head)); 27.152 - 27.153 - np->rx_bufs_to_notify++; 27.154 + np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id; 27.155 + 27.156 + pte = get_ppte(skb->head); 27.157 + pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT; 27.158 + queue_l1_entry_update(pte, 0); 27.159 } 27.160 - while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); 27.161 + while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE ); 27.162 27.163 /* 27.164 * We may have allocated buffers which have entries outstanding in the page 27.165 @@ -223,17 +224,16 @@ static void network_alloc_rx_buffers(str 27.166 */ 27.167 flush_page_update_queue(); 27.168 27.169 - np->net_idx->rx_req_prod = i; 27.170 - np->net_idx->rx_event = np->rx_resp_cons + 1; 27.171 - 27.172 - /* Batch Xen notifications. */ 27.173 - if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) ) 27.174 + op.op = MEMOP_RESERVATION_DECREASE; 27.175 + op.u.decrease.size = nr_pfns; 27.176 + op.u.decrease.pages = pfn_array; 27.177 + if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns ) 27.178 { 27.179 - netop.cmd = NETOP_PUSH_BUFFERS; 27.180 - netop.vif = np->idx; 27.181 - (void)HYPERVISOR_net_io_op(&netop); 27.182 - np->rx_bufs_to_notify = 0; 27.183 + printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret); 27.184 + BUG(); 27.185 } 27.186 + 27.187 + np->rx->req_prod = i; 27.188 } 27.189 27.190 27.191 @@ -241,9 +241,8 @@ static int network_start_xmit(struct sk_ 27.192 { 27.193 unsigned short id; 27.194 struct net_private *np = (struct net_private *)dev->priv; 27.195 - tx_req_entry_t *tx; 27.196 - netop_t netop; 27.197 - NET_RING_IDX i; 27.198 + netif_tx_request_t *tx; 27.199 + NETIF_RING_IDX i; 27.200 27.201 if ( unlikely(np->tx_full) ) 27.202 { 27.203 @@ -262,27 +261,27 @@ static int network_start_xmit(struct sk_ 27.204 memcpy(new_skb->data, skb->data, skb->len); 27.205 dev_kfree_skb(skb); 27.206 skb = new_skb; 27.207 - } 27.208 + } 27.209 27.210 spin_lock_irq(&np->tx_lock); 27.211 27.212 - i = np->net_idx->tx_req_prod; 27.213 + i = np->tx->req_prod; 27.214 27.215 id = GET_ID_FROM_FREELIST(np->tx_skbs); 27.216 np->tx_skbs[id] = skb; 27.217 27.218 - tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req; 27.219 + tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req; 27.220 27.221 tx->id = id; 27.222 - tx->addr = phys_to_machine(virt_to_phys(skb->data)); 27.223 + tx->addr = virt_to_machine(skb->data); 27.224 tx->size = skb->len; 27.225 27.226 wmb(); 27.227 - np->net_idx->tx_req_prod = i + 1; 27.228 + np->tx->req_prod = i + 1; 27.229 27.230 network_tx_buf_gc(dev); 27.231 27.232 - if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) ) 27.233 + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) 27.234 { 27.235 np->tx_full = 1; 27.236 netif_stop_queue(dev); 27.237 @@ -295,12 +294,8 @@ static int network_start_xmit(struct sk_ 27.238 27.239 /* Only notify Xen if there are no outstanding responses. 
*/ 27.240 mb(); 27.241 - if ( np->net_idx->tx_resp_prod == i ) 27.242 - { 27.243 - netop.cmd = NETOP_PUSH_BUFFERS; 27.244 - netop.vif = np->idx; 27.245 - (void)HYPERVISOR_net_io_op(&netop); 27.246 - } 27.247 + if ( np->tx->resp_prod == i ) 27.248 + notify_via_evtchn(np->evtchn); 27.249 27.250 return 0; 27.251 } 27.252 @@ -312,22 +307,24 @@ static void netif_int(int irq, void *dev 27.253 struct net_private *np = dev->priv; 27.254 unsigned long flags; 27.255 struct sk_buff *skb; 27.256 - rx_resp_entry_t *rx; 27.257 - NET_RING_IDX i; 27.258 + netif_rx_response_t *rx; 27.259 + NETIF_RING_IDX i; 27.260 + mmu_update_t mmu[2]; 27.261 + pte_t *pte; 27.262 27.263 spin_lock_irqsave(&np->tx_lock, flags); 27.264 network_tx_buf_gc(dev); 27.265 spin_unlock_irqrestore(&np->tx_lock, flags); 27.266 27.267 again: 27.268 - for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ ) 27.269 + for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ ) 27.270 { 27.271 - rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp; 27.272 + rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp; 27.273 27.274 skb = np->rx_skbs[rx->id]; 27.275 ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); 27.276 27.277 - if ( unlikely(rx->status != RING_STATUS_OK) ) 27.278 + if ( unlikely(rx->status <= 0) ) 27.279 { 27.280 /* Gate this error. We get a (valid) slew of them on suspend. */ 27.281 if ( np->state == NETIF_STATE_ACTIVE ) 27.282 @@ -336,6 +333,17 @@ static void netif_int(int irq, void *dev 27.283 continue; 27.284 } 27.285 27.286 + /* Remap the page. */ 27.287 + pte = get_ppte(skb->head); 27.288 + mmu[0].ptr = virt_to_machine(pte); 27.289 + mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; 27.290 + mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; 27.291 + mmu[1].val = __pa(skb->head) >> PAGE_SHIFT; 27.292 + if ( HYPERVISOR_mmu_update(mmu, 2) != 0 ) 27.293 + BUG(); 27.294 + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 27.295 + rx->addr >> PAGE_SHIFT; 27.296 + 27.297 /* 27.298 * Set up shinfo -- from alloc_skb This was particularily nasty: the 27.299 * shared info is hidden at the back of the data area (presumably so it 27.300 @@ -348,13 +356,13 @@ static void netif_int(int irq, void *dev 27.301 phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = 27.302 (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; 27.303 27.304 - skb->data = skb->tail = skb->head + rx->offset; 27.305 - skb_put(skb, rx->size); 27.306 + skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK); 27.307 + skb_put(skb, rx->status); 27.308 skb->protocol = eth_type_trans(skb, dev); 27.309 27.310 np->stats.rx_packets++; 27.311 27.312 - np->stats.rx_bytes += rx->size; 27.313 + np->stats.rx_bytes += rx->status; 27.314 netif_rx(skb); 27.315 dev->last_rx = jiffies; 27.316 } 27.317 @@ -362,10 +370,11 @@ static void netif_int(int irq, void *dev 27.318 np->rx_resp_cons = i; 27.319 27.320 network_alloc_rx_buffers(dev); 27.321 + np->rx->event = np->rx_resp_cons + 1; 27.322 27.323 /* Deal with hypervisor racing our resetting of rx_event. 
*/ 27.324 mb(); 27.325 - if ( np->net_idx->rx_resp_prod != i ) 27.326 + if ( np->rx->resp_prod != i ) 27.327 goto again; 27.328 } 27.329 27.330 @@ -373,16 +382,11 @@ static void netif_int(int irq, void *dev 27.331 static int network_close(struct net_device *dev) 27.332 { 27.333 struct net_private *np = dev->priv; 27.334 - netop_t netop; 27.335 27.336 netif_stop_queue(np->dev); 27.337 27.338 - netop.cmd = NETOP_FLUSH_BUFFERS; 27.339 - netop.vif = np->idx; 27.340 - (void)HYPERVISOR_net_io_op(&netop); 27.341 - 27.342 - while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) || 27.343 - (np->tx_resp_cons != np->net_idx->tx_req_prod) ) 27.344 + while ( (np->rx_resp_cons != np->rx->req_prod) || 27.345 + (np->tx_resp_cons != np->tx->req_prod) ) 27.346 { 27.347 barrier(); 27.348 current->state = TASK_INTERRUPTIBLE; 27.349 @@ -406,55 +410,12 @@ static struct net_device_stats *network_ 27.350 } 27.351 27.352 27.353 -static void netif_bringup_phase1(void *unused) 27.354 +static void netif_status_change(netif_fe_interface_status_changed_t *status) 27.355 { 27.356 ctrl_msg_t cmsg; 27.357 netif_fe_interface_connect_t up; 27.358 struct net_device *dev; 27.359 struct net_private *np; 27.360 - 27.361 - dev = find_dev_by_handle(0); 27.362 - np = dev->priv; 27.363 - 27.364 - /* Move from CLOSED to DISCONNECTED state. */ 27.365 - np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); 27.366 - np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); 27.367 - memset(np->tx, 0, PAGE_SIZE); 27.368 - memset(np->rx, 0, PAGE_SIZE); 27.369 - np->state = NETIF_STATE_DISCONNECTED; 27.370 - 27.371 - /* Construct an interface-CONNECT message for the domain controller. */ 27.372 - cmsg.type = CMSG_NETIF_FE; 27.373 - cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; 27.374 - cmsg.length = sizeof(netif_fe_interface_connect_t); 27.375 - up.handle = 0; 27.376 - up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; 27.377 - up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; 27.378 - memcpy(cmsg.msg, &up, sizeof(up)); 27.379 - 27.380 - /* Tell the controller to bring up the interface. */ 27.381 - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 27.382 -} 27.383 - 27.384 -static void netif_bringup_phase2(void *unused) 27.385 -{ 27.386 - struct net_device *dev; 27.387 - struct net_private *np; 27.388 - 27.389 - dev = find_dev_by_handle(0); 27.390 - np = dev->priv; 27.391 - 27.392 - np->irq = bind_evtchn_to_irq(np->evtchn); 27.393 - (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 27.394 - "netif", dev); 27.395 - 27.396 - np->state = NETIF_STATE_CONNECTED; 27.397 -} 27.398 - 27.399 -static void netif_status_change(netif_fe_interface_status_changed_t *status) 27.400 -{ 27.401 - struct net_device *dev; 27.402 - struct net_private *np; 27.403 27.404 if ( status->handle != 0 ) 27.405 { 27.406 @@ -470,31 +431,53 @@ static void netif_status_change(netif_fe 27.407 { 27.408 case NETIF_INTERFACE_STATUS_DESTROYED: 27.409 printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n", 27.410 - netif_state); 27.411 + np->state); 27.412 break; 27.413 27.414 case NETIF_INTERFACE_STATUS_DISCONNECTED: 27.415 if ( np->state != NETIF_STATE_CLOSED ) 27.416 { 27.417 printk(KERN_WARNING "Unexpected netif-DISCONNECTED message" 27.418 - " in state %d\n", netif_state); 27.419 + " in state %d\n", np->state); 27.420 break; 27.421 } 27.422 - netif_statechange_tq.routine = netif_bringup_phase1; 27.423 - schedule_task(&netif_statechange_tq); 27.424 + 27.425 + /* Move from CLOSED to DISCONNECTED state. 
*/ 27.426 + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); 27.427 + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); 27.428 + memset(np->tx, 0, PAGE_SIZE); 27.429 + memset(np->rx, 0, PAGE_SIZE); 27.430 + np->state = NETIF_STATE_DISCONNECTED; 27.431 + 27.432 + /* Construct an interface-CONNECT message for the domain controller. */ 27.433 + cmsg.type = CMSG_NETIF_FE; 27.434 + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; 27.435 + cmsg.length = sizeof(netif_fe_interface_connect_t); 27.436 + up.handle = 0; 27.437 + up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; 27.438 + up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; 27.439 + memcpy(cmsg.msg, &up, sizeof(up)); 27.440 + 27.441 + /* Tell the controller to bring up the interface. */ 27.442 + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 27.443 break; 27.444 27.445 case NETIF_INTERFACE_STATUS_CONNECTED: 27.446 if ( np->state == NETIF_STATE_CLOSED ) 27.447 { 27.448 printk(KERN_WARNING "Unexpected netif-CONNECTED message" 27.449 - " in state %d\n", netif_state); 27.450 + " in state %d\n", np->state); 27.451 break; 27.452 } 27.453 - np->evtchn = status->evtchn; 27.454 + 27.455 memcpy(dev->dev_addr, status->mac, ETH_ALEN); 27.456 - netif_statechange_tq.routine = netif_bringup_phase2; 27.457 - schedule_task(&netif_statechange_tq); 27.458 + 27.459 + np->evtchn = status->evtchn; 27.460 + np->irq = bind_evtchn_to_irq(np->evtchn); 27.461 + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 27.462 + dev->name, dev); 27.463 + 27.464 + np->state = NETIF_STATE_CONNECTED; 27.465 break; 27.466 27.467 default: 27.468 @@ -532,10 +515,13 @@ static int __init init_module(void) 27.469 { 27.470 ctrl_msg_t cmsg; 27.471 netif_fe_driver_status_changed_t st; 27.472 - int i, err; 27.473 + int err; 27.474 struct net_device *dev; 27.475 struct net_private *np; 27.476 27.477 + if ( start_info.flags & SIF_INITDOMAIN ) 27.478 + return 0; 27.479 + 27.480 INIT_LIST_HEAD(&dev_list); 27.481 27.482 if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) 27.483 @@ -562,7 +548,8 @@ static int __init init_module(void) 27.484 np->dev = dev; 27.485 list_add(&np->list, &dev_list); 27.486 27.487 - (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); 27.488 + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, 27.489 + CALLBACK_IN_BLOCKING_CONTEXT); 27.490 27.491 /* Send a driver-UP notification to the domain controller. */ 27.492 cmsg.type = CMSG_NETIF_FE;
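Note on the frontend changes above: the old netop/NET_RING interface is replaced by the new shared netif rings, event-channel notification, and page flipping on receive. Free rx pages are handed back to Xen with a MEMOP_RESERVATION_DECREASE, and each incoming packet's machine frame is mapped back in with a two-element mmu_update batch. A condensed sketch of that exchange follows; the helper names are hypothetical and the code assumes the headers referenced in the hunk (../netif.h, <asm/hypervisor-ifs/dom_mem_ops.h>).

    /* Give an empty rx buffer's frame back to Xen (batched in
     * network_alloc_rx_buffers). */
    static void give_rx_frame_to_xen(struct sk_buff *skb,
                                     unsigned long *pfn_array, int *nr_pfns)
    {
        pte_t *pte = get_ppte(skb->head);
        /* Record the frame for the reservation-decrease batch... */
        pfn_array[(*nr_pfns)++] = pte->pte_low >> PAGE_SHIFT;
        /* ...and unmap it from our page tables so Xen can hand it elsewhere. */
        queue_l1_entry_update(pte, 0);
    }

    /* On receive, map the machine frame Xen filled back into the skb's slot. */
    static void take_rx_frame_from_xen(struct sk_buff *skb, netif_rx_response_t *rx)
    {
        mmu_update_t mmu[2];
        pte_t *pte = get_ppte(skb->head);

        mmu[0].ptr = virt_to_machine(pte);                         /* rewrite our PTE  */
        mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
        mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; /* fix M2P entry    */
        mmu[1].val = __pa(skb->head) >> PAGE_SHIFT;
        if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
            BUG();
        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
            rx->addr >> PAGE_SHIFT;
    }

Per the hunk, the reservation-decrease batch is only issued once at least half a ring's worth of buffers has accumulated, and notify_via_evtchn() is called only when no responses are outstanding, keeping hypercalls and event-channel kicks off the per-packet fast path.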
28.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Thu May 06 14:53:19 2004 +0000 28.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Fri May 07 14:53:28 2004 +0000 28.3 @@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_acti 28.4 static CONTROL_RING_IDX ctrl_if_tx_resp_cons; 28.5 static CONTROL_RING_IDX ctrl_if_rx_req_cons; 28.6 28.7 -/* Incoming message requests: primary message type -> message handler. */ 28.8 +/* Incoming message requests. */ 28.9 + /* Primary message type -> message handler. */ 28.10 static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256]; 28.11 + /* Primary message type -> callback in process context? */ 28.12 +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)]; 28.13 + /* Is it late enough during bootstrap to use schedule_task()? */ 28.14 +static int safe_to_schedule_task; 28.15 + /* Passed to schedule_task(). */ 28.16 +static struct tq_struct ctrl_if_rxmsg_deferred_tq; 28.17 + /* Queue up messages to be handled in process context. */ 28.18 +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE]; 28.19 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod; 28.20 +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons; 28.21 28.22 /* Incoming message responses: message identifier -> message handler/id. */ 28.23 static struct { 28.24 @@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigne 28.25 } 28.26 } 28.27 28.28 +static void __ctrl_if_rxmsg_deferred(void *unused) 28.29 +{ 28.30 + ctrl_msg_t *msg; 28.31 + 28.32 + while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod ) 28.33 + { 28.34 + msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( 28.35 + ctrl_if_rxmsg_deferred_cons++)]; 28.36 + (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); 28.37 + } 28.38 +} 28.39 + 28.40 static void __ctrl_if_rx_tasklet(unsigned long data) 28.41 { 28.42 control_if_t *ctrl_if = get_ctrl_if(); 28.43 - ctrl_msg_t *msg; 28.44 + ctrl_msg_t msg, *pmsg; 28.45 28.46 while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod ) 28.47 { 28.48 - /* 28.49 - * We need no locking or barriers here. There will be one and only one 28.50 - * response as a result of each callback, so the callback handler 28.51 - * doesn't need to worry about the 'msg' being overwritten until: 28.52 - * 1. It returns (if the message must persist then it must be copied). 28.53 - * 2. A response is sent (the response may overwrite the request). 
28.54 - */ 28.55 - msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; 28.56 - (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); 28.57 + pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; 28.58 + memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg)); 28.59 + if ( msg.length != 0 ) 28.60 + memcpy(msg.msg, pmsg->msg, msg.length); 28.61 + if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) ) 28.62 + { 28.63 + pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( 28.64 + ctrl_if_rxmsg_deferred_prod++)]; 28.65 + memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length); 28.66 + schedule_task(&ctrl_if_rxmsg_deferred_tq); 28.67 + } 28.68 + else 28.69 + { 28.70 + (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0); 28.71 + } 28.72 } 28.73 } 28.74 28.75 @@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *m 28.76 ctrl_if_notify_controller(); 28.77 } 28.78 28.79 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd) 28.80 +int ctrl_if_register_receiver( 28.81 + u8 type, 28.82 + ctrl_msg_handler_t hnd, 28.83 + unsigned int flags) 28.84 { 28.85 - unsigned long flags; 28.86 + unsigned long _flags; 28.87 int inuse; 28.88 28.89 - spin_lock_irqsave(&ctrl_if_lock, flags); 28.90 + spin_lock_irqsave(&ctrl_if_lock, _flags); 28.91 28.92 inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler); 28.93 28.94 if ( inuse ) 28.95 + { 28.96 printk(KERN_INFO "Receiver %p already established for control " 28.97 "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type); 28.98 + } 28.99 else 28.100 + { 28.101 ctrl_if_rxmsg_handler[type] = hnd; 28.102 + clear_bit(type, &ctrl_if_rxmsg_blocking_context); 28.103 + if ( flags == CALLBACK_IN_BLOCKING_CONTEXT ) 28.104 + { 28.105 + set_bit(type, &ctrl_if_rxmsg_blocking_context); 28.106 + if ( !safe_to_schedule_task ) 28.107 + BUG(); 28.108 + } 28.109 + } 28.110 28.111 - spin_unlock_irqrestore(&ctrl_if_lock, flags); 28.112 + spin_unlock_irqrestore(&ctrl_if_lock, _flags); 28.113 28.114 return !inuse; 28.115 } 28.116 @@ -326,6 +369,7 @@ void __init ctrl_if_init(void) 28.117 28.118 for ( i = 0; i < 256; i++ ) 28.119 ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler; 28.120 + ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred; 28.121 28.122 spin_lock_init(&ctrl_if_lock); 28.123 28.124 @@ -333,6 +377,15 @@ void __init ctrl_if_init(void) 28.125 } 28.126 28.127 28.128 +/* This is called after it is safe to call schedule_task(). */ 28.129 +static int __init ctrl_if_late_setup(void) 28.130 +{ 28.131 + safe_to_schedule_task = 1; 28.132 + return 0; 28.133 +} 28.134 +__initcall(ctrl_if_late_setup); 28.135 + 28.136 + 28.137 /* 28.138 * !! The following are DANGEROUS FUNCTIONS !! 28.139 * Use with care [for example, see xencons_force_flush()].
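Note on the ctrl_if.c rework above: each incoming message is now copied out of the shared ring before dispatch, and message types registered with CALLBACK_IN_BLOCKING_CONTEXT are deferred to keventd via schedule_task() so their handlers may sleep (as the netif frontend's CONNECT handshake now does). The per-message decision in the rx tasklet reduces to the sketch below; the function name is hypothetical, and the file-scope state (ctrl_if_rx_req_cons, ctrl_if_rxmsg_deferred, etc.) is as declared in the hunk.

    static void dispatch_rx_message(control_if_t *ctrl_if)
    {
        ctrl_msg_t msg, *pmsg;

        /* Copy header and payload out of the shared ring slot, so the slot can
         * be reused by the controller immediately. */
        pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
        if ( msg.length != 0 )
            memcpy(msg.msg, pmsg->msg, msg.length);

        if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) )
        {
            /* Handler may sleep: stash in the private ring and kick keventd. */
            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
                ctrl_if_rxmsg_deferred_prod++)];
            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
            schedule_task(&ctrl_if_rxmsg_deferred_tq);
        }
        else
        {
            /* Handler is interrupt-safe: call it directly from the tasklet. */
            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
        }
    }

The deferred queue is the same free-running producer/consumer idiom (masked with MASK_CONTROL_IDX) used by the shared rings themselves, and registering a blocking-context receiver before schedule_task() is usable trips a BUG(), hence the __initcall that sets safe_to_schedule_task.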
29.1 --- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c Thu May 06 14:53:19 2004 +0000 29.2 +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c Fri May 07 14:53:28 2004 +0000 29.3 @@ -1626,7 +1626,7 @@ int __init blk_dev_init(void) 29.4 jsfd_init(); 29.5 #endif 29.6 29.7 -#ifdef CONFIG_XEN_VBD 29.8 +#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO) 29.9 xlblk_init(); 29.10 #endif 29.11
30.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Thu May 06 14:53:19 2004 +0000 30.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Fri May 07 14:53:28 2004 +0000 30.3 @@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *m 30.4 * Register a receiver for typed messages from the domain controller. The 30.5 * handler (@hnd) is called for every received message of specified @type. 30.6 * Returns TRUE (non-zero) if the handler was successfully registered. 30.7 + * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will 30.8 + * occur in a context in which it is safe to yield (i.e., process context). 30.9 */ 30.10 -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd); 30.11 +#define CALLBACK_IN_BLOCKING_CONTEXT 1 30.12 +int ctrl_if_register_receiver( 30.13 + u8 type, 30.14 + ctrl_msg_handler_t hnd, 30.15 + unsigned int flags); 30.16 30.17 /* 30.18 * Unregister a receiver for typed messages from the domain controller. The
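Registration now carries a flags argument. A minimal usage sketch, in the same style as the netif frontend's call above; the driver, handler, and message type are hypothetical (only one receiver may be registered per type):

    /* Handler may sleep, e.g. to send a blocking CONNECT message, because it is
     * registered with CALLBACK_IN_BLOCKING_CONTEXT and so runs via keventd. */
    static void my_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
    {
        /* ... switch on msg->subtype ... */
        ctrl_if_send_response(msg);
    }

    static int __init my_driver_init(void)
    {
        if ( !ctrl_if_register_receiver(CMSG_MYDRIVER_FE /* hypothetical type */,
                                        my_ctrlif_rx,
                                        CALLBACK_IN_BLOCKING_CONTEXT) )
            printk(KERN_WARNING "my_driver: message type already claimed\n");
        return 0;
    }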
31.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h Thu May 06 14:53:19 2004 +0000 31.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h Fri May 07 14:53:28 2004 +0000 31.3 @@ -159,46 +159,11 @@ extern void iounmap(void *addr); 31.4 extern void *bt_ioremap(unsigned long offset, unsigned long size); 31.5 extern void bt_iounmap(void *addr, unsigned long size); 31.6 31.7 -#ifdef CONFIG_XEN_PHYSDEV_ACCESS 31.8 - 31.9 -#ifdef CONFIG_HIGHMEM 31.10 -#error "Highmem is not yet compatible with physical device access" 31.11 -#endif 31.12 - 31.13 -/* 31.14 - * The bus translation macros need special care if we are executing device 31.15 - * accesses to/from other domains' memory. In these cases the virtual address 31.16 - * is actually a temporary mapping in the 'vmalloc' space. The physical 31.17 - * address will therefore be >max_low_pfn, and will not have a valid entry 31.18 - * in the phys_to_mach mapping table. 31.19 - */ 31.20 -static inline unsigned long phys_to_bus(unsigned long phys) 31.21 -{ 31.22 - extern unsigned long max_pfn; 31.23 - pgd_t *pgd; pmd_t *pmd; pte_t *pte; 31.24 - void *addr; 31.25 - unsigned long bus; 31.26 - if ( (phys >> PAGE_SHIFT) < max_pfn ) 31.27 - return phys_to_machine(phys); 31.28 - addr = phys_to_virt(phys); 31.29 - pgd = pgd_offset_k( (unsigned long)addr); 31.30 - pmd = pmd_offset(pgd, (unsigned long)addr); 31.31 - pte = pte_offset(pmd, (unsigned long)addr); 31.32 - bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK); 31.33 - return bus; 31.34 -} 31.35 - 31.36 -#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x)) 31.37 -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) 31.38 -#define page_to_bus(_x) phys_to_bus(page_to_phys(_x)) 31.39 - 31.40 -#else 31.41 - 31.42 #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x)) 31.43 #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) 31.44 #define page_to_bus(_x) phys_to_machine(page_to_phys(_x)) 31.45 - 31.46 -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ 31.47 +#define bus_to_phys(_x) machine_to_phys(_x) 31.48 +#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT)) 31.49 31.50 /* 31.51 * readX/writeX() are used to access memory mapped devices. On some
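With the CONFIG_XEN_PHYSDEV_ACCESS special case removed, the bus-address macros collapse to the pseudo-physical/machine translation tables, and the new reverse helpers bus_to_phys()/bus_to_page() exist for the PCI header added below. A small illustrative round-trip check (assumes a directly mapped lowmem buffer; the function is purely a sketch):

    static void check_bus_roundtrip(void *cpu_ptr)
    {
        dma_addr_t bus = virt_to_bus(cpu_ptr);            /* virt -> phys -> machine */

        if ( bus_to_virt(bus) != cpu_ptr )                /* machine -> phys -> virt */
            BUG();
        if ( bus_to_page(bus) != virt_to_page(cpu_ptr) )  /* machine -> struct page  */
            BUG();
    }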
32.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 32.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h Fri May 07 14:53:28 2004 +0000 32.3 @@ -0,0 +1,283 @@ 32.4 +#ifndef __i386_PCI_H 32.5 +#define __i386_PCI_H 32.6 + 32.7 +#include <linux/config.h> 32.8 + 32.9 +#ifdef __KERNEL__ 32.10 + 32.11 +/* Can be used to override the logic in pci_scan_bus for skipping 32.12 + already-configured bus numbers - to be used for buggy BIOSes 32.13 + or architectures with incomplete PCI setup by the loader */ 32.14 + 32.15 +#ifdef CONFIG_PCI 32.16 +extern unsigned int pcibios_assign_all_busses(void); 32.17 +#else 32.18 +#define pcibios_assign_all_busses() 0 32.19 +#endif 32.20 +#define pcibios_scan_all_fns() 0 32.21 + 32.22 +extern unsigned long pci_mem_start; 32.23 +#define PCIBIOS_MIN_IO 0x1000 32.24 +#define PCIBIOS_MIN_MEM (pci_mem_start) 32.25 + 32.26 +void pcibios_config_init(void); 32.27 +struct pci_bus * pcibios_scan_root(int bus); 32.28 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); 32.29 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); 32.30 + 32.31 +void pcibios_set_master(struct pci_dev *dev); 32.32 +void pcibios_penalize_isa_irq(int irq); 32.33 +struct irq_routing_table *pcibios_get_irq_routing_table(void); 32.34 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); 32.35 + 32.36 +/* Dynamic DMA mapping stuff. 32.37 + * i386 has everything mapped statically. 32.38 + */ 32.39 + 32.40 +#include <linux/types.h> 32.41 +#include <linux/slab.h> 32.42 +#include <asm/scatterlist.h> 32.43 +#include <linux/string.h> 32.44 +#include <asm/io.h> 32.45 + 32.46 +struct pci_dev; 32.47 + 32.48 +/* The networking and block device layers use this boolean for bounce 32.49 + * buffer decisions. 32.50 + */ 32.51 +#define PCI_DMA_BUS_IS_PHYS (0) 32.52 + 32.53 +/* Allocate and map kernel buffer using consistent mode DMA for a device. 32.54 + * hwdev should be valid struct pci_dev pointer for PCI devices, 32.55 + * NULL for PCI-like buses (ISA, EISA). 32.56 + * Returns non-NULL cpu-view pointer to the buffer if successful and 32.57 + * sets *dma_addrp to the pci side dma address as well, else *dma_addrp 32.58 + * is undefined. 32.59 + */ 32.60 +extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, 32.61 + dma_addr_t *dma_handle); 32.62 + 32.63 +/* Free and unmap a consistent DMA buffer. 32.64 + * cpu_addr is what was returned from pci_alloc_consistent, 32.65 + * size must be the same as what as passed into pci_alloc_consistent, 32.66 + * and likewise dma_addr must be the same as what *dma_addrp was set to. 32.67 + * 32.68 + * References to the memory and mappings associated with cpu_addr/dma_addr 32.69 + * past this call are illegal. 32.70 + */ 32.71 +extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, 32.72 + void *vaddr, dma_addr_t dma_handle); 32.73 + 32.74 +/* Map a single buffer of the indicated size for DMA in streaming mode. 32.75 + * The 32-bit bus address to use is returned. 32.76 + * 32.77 + * Once the device is given the dma address, the device owns this memory 32.78 + * until either pci_unmap_single or pci_dma_sync_single is performed. 
32.79 + */ 32.80 +static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, 32.81 + size_t size, int direction) 32.82 +{ 32.83 + if (direction == PCI_DMA_NONE) 32.84 + out_of_line_bug(); 32.85 + flush_write_buffers(); 32.86 + return virt_to_bus(ptr); 32.87 +} 32.88 + 32.89 +/* Unmap a single streaming mode DMA translation. The dma_addr and size 32.90 + * must match what was provided for in a previous pci_map_single call. All 32.91 + * other usages are undefined. 32.92 + * 32.93 + * After this call, reads by the cpu to the buffer are guarenteed to see 32.94 + * whatever the device wrote there. 32.95 + */ 32.96 +static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, 32.97 + size_t size, int direction) 32.98 +{ 32.99 + if (direction == PCI_DMA_NONE) 32.100 + out_of_line_bug(); 32.101 + /* Nothing to do */ 32.102 +} 32.103 + 32.104 +/* 32.105 + * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. identical 32.106 + * to pci_map_single, but takes a struct page instead of a virtual address 32.107 + */ 32.108 +static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, 32.109 + unsigned long offset, size_t size, int direction) 32.110 +{ 32.111 + if (direction == PCI_DMA_NONE) 32.112 + out_of_line_bug(); 32.113 + 32.114 + return page_to_bus(page) + offset; 32.115 +} 32.116 + 32.117 +static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, 32.118 + size_t size, int direction) 32.119 +{ 32.120 + if (direction == PCI_DMA_NONE) 32.121 + out_of_line_bug(); 32.122 + /* Nothing to do */ 32.123 +} 32.124 + 32.125 +/* pci_unmap_{page,single} is a nop so... */ 32.126 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) 32.127 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) 32.128 +#define pci_unmap_addr(PTR, ADDR_NAME) (0) 32.129 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) 32.130 +#define pci_unmap_len(PTR, LEN_NAME) (0) 32.131 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) 32.132 + 32.133 +/* Map a set of buffers described by scatterlist in streaming 32.134 + * mode for DMA. This is the scather-gather version of the 32.135 + * above pci_map_single interface. Here the scatter gather list 32.136 + * elements are each tagged with the appropriate dma address 32.137 + * and length. They are obtained via sg_dma_{address,length}(SG). 32.138 + * 32.139 + * NOTE: An implementation may be able to use a smaller number of 32.140 + * DMA address/length pairs than there are SG table elements. 32.141 + * (for example via virtual mapping capabilities) 32.142 + * The routine returns the number of addr/length pairs actually 32.143 + * used, at most nents. 32.144 + * 32.145 + * Device ownership issues as mentioned above for pci_map_single are 32.146 + * the same here. 
32.147 + */ 32.148 +static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, 32.149 + int nents, int direction) 32.150 +{ 32.151 + int i; 32.152 + 32.153 + if (direction == PCI_DMA_NONE) 32.154 + out_of_line_bug(); 32.155 + 32.156 + /* 32.157 + * temporary 2.4 hack 32.158 + */ 32.159 + for (i = 0; i < nents; i++ ) { 32.160 + if (sg[i].address && sg[i].page) 32.161 + out_of_line_bug(); 32.162 + else if (!sg[i].address && !sg[i].page) 32.163 + out_of_line_bug(); 32.164 + 32.165 + if (sg[i].address) 32.166 + sg[i].dma_address = virt_to_bus(sg[i].address); 32.167 + else 32.168 + sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset; 32.169 + } 32.170 + 32.171 + flush_write_buffers(); 32.172 + return nents; 32.173 +} 32.174 + 32.175 +/* Unmap a set of streaming mode DMA translations. 32.176 + * Again, cpu read rules concerning calls here are the same as for 32.177 + * pci_unmap_single() above. 32.178 + */ 32.179 +static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, 32.180 + int nents, int direction) 32.181 +{ 32.182 + if (direction == PCI_DMA_NONE) 32.183 + out_of_line_bug(); 32.184 + /* Nothing to do */ 32.185 +} 32.186 + 32.187 +/* Make physical memory consistent for a single 32.188 + * streaming mode DMA translation after a transfer. 32.189 + * 32.190 + * If you perform a pci_map_single() but wish to interrogate the 32.191 + * buffer using the cpu, yet do not wish to teardown the PCI dma 32.192 + * mapping, you must call this function before doing so. At the 32.193 + * next point you give the PCI dma address back to the card, the 32.194 + * device again owns the buffer. 32.195 + */ 32.196 +static inline void pci_dma_sync_single(struct pci_dev *hwdev, 32.197 + dma_addr_t dma_handle, 32.198 + size_t size, int direction) 32.199 +{ 32.200 + if (direction == PCI_DMA_NONE) 32.201 + out_of_line_bug(); 32.202 + flush_write_buffers(); 32.203 +} 32.204 + 32.205 +/* Make physical memory consistent for a set of streaming 32.206 + * mode DMA translations after a transfer. 32.207 + * 32.208 + * The same as pci_dma_sync_single but for a scatter-gather list, 32.209 + * same rules and usage. 32.210 + */ 32.211 +static inline void pci_dma_sync_sg(struct pci_dev *hwdev, 32.212 + struct scatterlist *sg, 32.213 + int nelems, int direction) 32.214 +{ 32.215 + if (direction == PCI_DMA_NONE) 32.216 + out_of_line_bug(); 32.217 + flush_write_buffers(); 32.218 +} 32.219 + 32.220 +/* Return whether the given PCI device DMA address mask can 32.221 + * be supported properly. For example, if your device can 32.222 + * only drive the low 24-bits during PCI bus mastering, then 32.223 + * you would pass 0x00ffffff as the mask to this function. 32.224 + */ 32.225 +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) 32.226 +{ 32.227 + /* 32.228 + * we fall back to GFP_DMA when the mask isn't all 1s, 32.229 + * so we can't guarantee allocations that must be 32.230 + * within a tighter range than GFP_DMA.. 32.231 + */ 32.232 + if(mask < 0x00ffffff) 32.233 + return 0; 32.234 + 32.235 + return 1; 32.236 +} 32.237 + 32.238 +/* This is always fine. 
*/ 32.239 +#define pci_dac_dma_supported(pci_dev, mask) (1) 32.240 + 32.241 +static __inline__ dma64_addr_t 32.242 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) 32.243 +{ 32.244 + return ((dma64_addr_t) page_to_bus(page) + 32.245 + (dma64_addr_t) offset); 32.246 +} 32.247 + 32.248 +static __inline__ struct page * 32.249 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) 32.250 +{ 32.251 + return bus_to_page(dma_addr); 32.252 +} 32.253 + 32.254 +static __inline__ unsigned long 32.255 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) 32.256 +{ 32.257 + return (dma_addr & ~PAGE_MASK); 32.258 +} 32.259 + 32.260 +static __inline__ void 32.261 +pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) 32.262 +{ 32.263 + flush_write_buffers(); 32.264 +} 32.265 + 32.266 +/* These macros should be used after a pci_map_sg call has been done 32.267 + * to get bus addresses of each of the SG entries and their lengths. 32.268 + * You should only work with the number of sg entries pci_map_sg 32.269 + * returns. 32.270 + */ 32.271 +#define sg_dma_address(sg) ((sg)->dma_address) 32.272 +#define sg_dma_len(sg) ((sg)->length) 32.273 + 32.274 +/* Return the index of the PCI controller for device. */ 32.275 +static inline int pci_controller_num(struct pci_dev *dev) 32.276 +{ 32.277 + return 0; 32.278 +} 32.279 + 32.280 +#define HAVE_PCI_MMAP 32.281 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 32.282 + enum pci_mmap_state mmap_state, int write_combine); 32.283 + 32.284 +#endif /* __KERNEL__ */ 32.285 + 32.286 +#endif /* __i386_PCI_H */
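The new asm-xen/pci.h replaces the asm-i386 symlink (see the mkbuildtree change below) so that the streaming-DMA helpers resolve through the virt_to_bus()/page_to_bus() macros above; devices are therefore always handed machine addresses. Standard 2.4 PCI DMA usage is unchanged. A brief sketch with a hypothetical device and buffer (assumes <linux/pci.h> for PCI_DMA_TODEVICE):

    static void dma_to_device(struct pci_dev *pdev, void *buf, size_t len)
    {
        /* On this port the returned handle is the buffer's machine address. */
        dma_addr_t handle = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);

        /* ... program the device with 'handle' and wait for completion ... */

        pci_unmap_single(pdev, handle, len, PCI_DMA_TODEVICE);  /* no-op here */
    }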
33.1 --- a/xenolinux-2.4.26-sparse/mkbuildtree Thu May 06 14:53:19 2004 +0000 33.2 +++ b/xenolinux-2.4.26-sparse/mkbuildtree Fri May 07 14:53:28 2004 +0000 33.3 @@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h 33.4 ln -sf ../asm-i386/namei.h 33.5 ln -sf ../asm-i386/param.h 33.6 ln -sf ../asm-i386/parport.h 33.7 -ln -sf ../asm-i386/pci.h 33.8 ln -sf ../asm-i386/pgtable-3level.h 33.9 ln -sf ../asm-i386/poll.h 33.10 ln -sf ../asm-i386/posix_types.h
34.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 34.2 +++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c Fri May 07 14:53:28 2004 +0000 34.3 @@ -0,0 +1,930 @@ 34.4 +/* 34.5 + * linux/mm/page_alloc.c 34.6 + * 34.7 + * Manages the free list, the system allocates free pages here. 34.8 + * Note that kmalloc() lives in slab.c 34.9 + * 34.10 + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 34.11 + * Swap reorganised 29.12.95, Stephen Tweedie 34.12 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 34.13 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 34.14 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 34.15 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 34.16 + */ 34.17 + 34.18 +#include <linux/config.h> 34.19 +#include <linux/mm.h> 34.20 +#include <linux/swap.h> 34.21 +#include <linux/swapctl.h> 34.22 +#include <linux/interrupt.h> 34.23 +#include <linux/pagemap.h> 34.24 +#include <linux/bootmem.h> 34.25 +#include <linux/slab.h> 34.26 +#include <linux/module.h> 34.27 + 34.28 +int nr_swap_pages; 34.29 +int nr_active_pages; 34.30 +int nr_inactive_pages; 34.31 +LIST_HEAD(inactive_list); 34.32 +LIST_HEAD(active_list); 34.33 +pg_data_t *pgdat_list; 34.34 + 34.35 +/* 34.36 + * 34.37 + * The zone_table array is used to look up the address of the 34.38 + * struct zone corresponding to a given zone number (ZONE_DMA, 34.39 + * ZONE_NORMAL, or ZONE_HIGHMEM). 34.40 + */ 34.41 +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; 34.42 +EXPORT_SYMBOL(zone_table); 34.43 + 34.44 +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 34.45 +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; 34.46 +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; 34.47 +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; 34.48 +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 34.49 + 34.50 +int vm_gfp_debug = 0; 34.51 + 34.52 +/* 34.53 + * Temporary debugging check. 34.54 + */ 34.55 +#define BAD_RANGE(zone, page) \ 34.56 +( \ 34.57 + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ 34.58 + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ 34.59 + || ((zone) != page_zone(page)) \ 34.60 +) 34.61 + 34.62 +/* 34.63 + * Freeing function for a buddy system allocator. 34.64 + * Contrary to prior comments, this is *NOT* hairy, and there 34.65 + * is no reason for anyone not to understand it. 34.66 + * 34.67 + * The concept of a buddy system is to maintain direct-mapped tables 34.68 + * (containing bit values) for memory blocks of various "orders". 34.69 + * The bottom level table contains the map for the smallest allocatable 34.70 + * units of memory (here, pages), and each level above it describes 34.71 + * pairs of units from the levels below, hence, "buddies". 34.72 + * At a high level, all that happens here is marking the table entry 34.73 + * at the bottom level available, and propagating the changes upward 34.74 + * as necessary, plus some accounting needed to play nicely with other 34.75 + * parts of the VM system. 34.76 + * At each level, we keep one bit for each pair of blocks, which 34.77 + * is set to 1 iff only one of the pair is allocated. So when we 34.78 + * are allocating or freeing one, we can derive the state of the 34.79 + * other. That is, if we allocate a small block, and both were 34.80 + * free, the remainder of the region must be split into blocks. 
34.81 + * If a block is freed, and its buddy is also free, then this 34.82 + * triggers coalescing into a block of larger size. 34.83 + * 34.84 + * -- wli 34.85 + */ 34.86 + 34.87 +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); 34.88 +static void __free_pages_ok (struct page *page, unsigned int order) 34.89 +{ 34.90 + unsigned long index, page_idx, mask, flags; 34.91 + free_area_t *area; 34.92 + struct page *base; 34.93 + zone_t *zone; 34.94 + 34.95 + /* 34.96 + * Yes, think what happens when other parts of the kernel take 34.97 + * a reference to a page in order to pin it for io. -ben 34.98 + */ 34.99 + if (PageLRU(page)) { 34.100 + if (unlikely(in_interrupt())) 34.101 + BUG(); 34.102 + lru_cache_del(page); 34.103 + } 34.104 + 34.105 + if (page->buffers) 34.106 + BUG(); 34.107 + if (page->mapping) 34.108 + return (*(void(*)(struct page *))page->mapping)(page); 34.109 + if (!VALID_PAGE(page)) 34.110 + BUG(); 34.111 + if (PageLocked(page)) 34.112 + BUG(); 34.113 + if (PageActive(page)) 34.114 + BUG(); 34.115 + ClearPageReferenced(page); 34.116 + ClearPageDirty(page); 34.117 + 34.118 + if (current->flags & PF_FREE_PAGES) 34.119 + goto local_freelist; 34.120 + back_local_freelist: 34.121 + 34.122 + zone = page_zone(page); 34.123 + 34.124 + mask = (~0UL) << order; 34.125 + base = zone->zone_mem_map; 34.126 + page_idx = page - base; 34.127 + if (page_idx & ~mask) 34.128 + BUG(); 34.129 + index = page_idx >> (1 + order); 34.130 + 34.131 + area = zone->free_area + order; 34.132 + 34.133 + spin_lock_irqsave(&zone->lock, flags); 34.134 + 34.135 + zone->free_pages -= mask; 34.136 + 34.137 + while (mask + (1 << (MAX_ORDER-1))) { 34.138 + struct page *buddy1, *buddy2; 34.139 + 34.140 + if (area >= zone->free_area + MAX_ORDER) 34.141 + BUG(); 34.142 + if (!__test_and_change_bit(index, area->map)) 34.143 + /* 34.144 + * the buddy page is still allocated. 34.145 + */ 34.146 + break; 34.147 + /* 34.148 + * Move the buddy up one level. 
34.149 + * This code is taking advantage of the identity: 34.150 + * -mask = 1+~mask 34.151 + */ 34.152 + buddy1 = base + (page_idx ^ -mask); 34.153 + buddy2 = base + page_idx; 34.154 + if (BAD_RANGE(zone,buddy1)) 34.155 + BUG(); 34.156 + if (BAD_RANGE(zone,buddy2)) 34.157 + BUG(); 34.158 + 34.159 + list_del(&buddy1->list); 34.160 + mask <<= 1; 34.161 + area++; 34.162 + index >>= 1; 34.163 + page_idx &= mask; 34.164 + } 34.165 + list_add(&(base + page_idx)->list, &area->free_list); 34.166 + 34.167 + spin_unlock_irqrestore(&zone->lock, flags); 34.168 + return; 34.169 + 34.170 + local_freelist: 34.171 + if (current->nr_local_pages) 34.172 + goto back_local_freelist; 34.173 + if (in_interrupt()) 34.174 + goto back_local_freelist; 34.175 + 34.176 + list_add(&page->list, ¤t->local_pages); 34.177 + page->index = order; 34.178 + current->nr_local_pages++; 34.179 +} 34.180 + 34.181 +#define MARK_USED(index, order, area) \ 34.182 + __change_bit((index) >> (1+(order)), (area)->map) 34.183 + 34.184 +static inline struct page * expand (zone_t *zone, struct page *page, 34.185 + unsigned long index, int low, int high, free_area_t * area) 34.186 +{ 34.187 + unsigned long size = 1 << high; 34.188 + 34.189 + while (high > low) { 34.190 + if (BAD_RANGE(zone,page)) 34.191 + BUG(); 34.192 + area--; 34.193 + high--; 34.194 + size >>= 1; 34.195 + list_add(&(page)->list, &(area)->free_list); 34.196 + MARK_USED(index, high, area); 34.197 + index += size; 34.198 + page += size; 34.199 + } 34.200 + if (BAD_RANGE(zone,page)) 34.201 + BUG(); 34.202 + return page; 34.203 +} 34.204 + 34.205 +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); 34.206 +static struct page * rmqueue(zone_t *zone, unsigned int order) 34.207 +{ 34.208 + free_area_t * area = zone->free_area + order; 34.209 + unsigned int curr_order = order; 34.210 + struct list_head *head, *curr; 34.211 + unsigned long flags; 34.212 + struct page *page; 34.213 + 34.214 + spin_lock_irqsave(&zone->lock, flags); 34.215 + do { 34.216 + head = &area->free_list; 34.217 + curr = head->next; 34.218 + 34.219 + if (curr != head) { 34.220 + unsigned int index; 34.221 + 34.222 + page = list_entry(curr, struct page, list); 34.223 + if (BAD_RANGE(zone,page)) 34.224 + BUG(); 34.225 + list_del(curr); 34.226 + index = page - zone->zone_mem_map; 34.227 + if (curr_order != MAX_ORDER-1) 34.228 + MARK_USED(index, curr_order, area); 34.229 + zone->free_pages -= 1UL << order; 34.230 + 34.231 + page = expand(zone, page, index, order, curr_order, area); 34.232 + spin_unlock_irqrestore(&zone->lock, flags); 34.233 + 34.234 + set_page_count(page, 1); 34.235 + if (BAD_RANGE(zone,page)) 34.236 + BUG(); 34.237 + if (PageLRU(page)) 34.238 + BUG(); 34.239 + if (PageActive(page)) 34.240 + BUG(); 34.241 + return page; 34.242 + } 34.243 + curr_order++; 34.244 + area++; 34.245 + } while (curr_order < MAX_ORDER); 34.246 + spin_unlock_irqrestore(&zone->lock, flags); 34.247 + 34.248 + return NULL; 34.249 +} 34.250 + 34.251 +#ifndef CONFIG_DISCONTIGMEM 34.252 +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) 34.253 +{ 34.254 + return __alloc_pages(gfp_mask, order, 34.255 + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); 34.256 +} 34.257 +#endif 34.258 + 34.259 +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); 34.260 +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) 34.261 +{ 34.262 + struct page * page = NULL; 34.263 + int __freed; 
34.264 + 34.265 + if (in_interrupt()) 34.266 + BUG(); 34.267 + 34.268 + current->allocation_order = order; 34.269 + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; 34.270 + 34.271 + __freed = try_to_free_pages_zone(classzone, gfp_mask); 34.272 + 34.273 + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); 34.274 + 34.275 + if (current->nr_local_pages) { 34.276 + struct list_head * entry, * local_pages; 34.277 + struct page * tmp; 34.278 + int nr_pages; 34.279 + 34.280 + local_pages = ¤t->local_pages; 34.281 + 34.282 + if (likely(__freed)) { 34.283 + /* pick from the last inserted so we're lifo */ 34.284 + entry = local_pages->next; 34.285 + do { 34.286 + tmp = list_entry(entry, struct page, list); 34.287 + if (tmp->index == order && memclass(page_zone(tmp), classzone)) { 34.288 + list_del(entry); 34.289 + current->nr_local_pages--; 34.290 + set_page_count(tmp, 1); 34.291 + page = tmp; 34.292 + 34.293 + if (page->buffers) 34.294 + BUG(); 34.295 + if (page->mapping) 34.296 + BUG(); 34.297 + if (!VALID_PAGE(page)) 34.298 + BUG(); 34.299 + if (PageLocked(page)) 34.300 + BUG(); 34.301 + if (PageLRU(page)) 34.302 + BUG(); 34.303 + if (PageActive(page)) 34.304 + BUG(); 34.305 + if (PageDirty(page)) 34.306 + BUG(); 34.307 + 34.308 + break; 34.309 + } 34.310 + } while ((entry = entry->next) != local_pages); 34.311 + } 34.312 + 34.313 + nr_pages = current->nr_local_pages; 34.314 + /* free in reverse order so that the global order will be lifo */ 34.315 + while ((entry = local_pages->prev) != local_pages) { 34.316 + list_del(entry); 34.317 + tmp = list_entry(entry, struct page, list); 34.318 + __free_pages_ok(tmp, tmp->index); 34.319 + if (!nr_pages--) 34.320 + BUG(); 34.321 + } 34.322 + current->nr_local_pages = 0; 34.323 + } 34.324 + 34.325 + *freed = __freed; 34.326 + return page; 34.327 +} 34.328 + 34.329 +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) 34.330 +{ 34.331 + long free = zone->free_pages - (1UL << order); 34.332 + return free >= 0 ? 
free : 0; 34.333 +} 34.334 + 34.335 +/* 34.336 + * This is the 'heart' of the zoned buddy allocator: 34.337 + */ 34.338 +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) 34.339 +{ 34.340 + zone_t **zone, * classzone; 34.341 + struct page * page; 34.342 + int freed, class_idx; 34.343 + 34.344 + zone = zonelist->zones; 34.345 + classzone = *zone; 34.346 + class_idx = zone_idx(classzone); 34.347 + 34.348 + for (;;) { 34.349 + zone_t *z = *(zone++); 34.350 + if (!z) 34.351 + break; 34.352 + 34.353 + if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { 34.354 + page = rmqueue(z, order); 34.355 + if (page) 34.356 + return page; 34.357 + } 34.358 + } 34.359 + 34.360 + classzone->need_balance = 1; 34.361 + mb(); 34.362 + if (waitqueue_active(&kswapd_wait)) 34.363 + wake_up_interruptible(&kswapd_wait); 34.364 + 34.365 + zone = zonelist->zones; 34.366 + for (;;) { 34.367 + unsigned long min; 34.368 + zone_t *z = *(zone++); 34.369 + if (!z) 34.370 + break; 34.371 + 34.372 + min = z->watermarks[class_idx].min; 34.373 + if (!(gfp_mask & __GFP_WAIT)) 34.374 + min >>= 2; 34.375 + if (zone_free_pages(z, order) > min) { 34.376 + page = rmqueue(z, order); 34.377 + if (page) 34.378 + return page; 34.379 + } 34.380 + } 34.381 + 34.382 + /* here we're in the low on memory slow path */ 34.383 + 34.384 + if ((current->flags & PF_MEMALLOC) && 34.385 + (!in_interrupt() || (current->flags & PF_MEMDIE))) { 34.386 + zone = zonelist->zones; 34.387 + for (;;) { 34.388 + zone_t *z = *(zone++); 34.389 + if (!z) 34.390 + break; 34.391 + 34.392 + page = rmqueue(z, order); 34.393 + if (page) 34.394 + return page; 34.395 + } 34.396 + return NULL; 34.397 + } 34.398 + 34.399 + /* Atomic allocations - we can't balance anything */ 34.400 + if (!(gfp_mask & __GFP_WAIT)) 34.401 + goto out; 34.402 + 34.403 + rebalance: 34.404 + page = balance_classzone(classzone, gfp_mask, order, &freed); 34.405 + if (page) 34.406 + return page; 34.407 + 34.408 + zone = zonelist->zones; 34.409 + if (likely(freed)) { 34.410 + for (;;) { 34.411 + zone_t *z = *(zone++); 34.412 + if (!z) 34.413 + break; 34.414 + 34.415 + if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { 34.416 + page = rmqueue(z, order); 34.417 + if (page) 34.418 + return page; 34.419 + } 34.420 + } 34.421 + goto rebalance; 34.422 + } else { 34.423 + /* 34.424 + * Check that no other task is been killed meanwhile, 34.425 + * in such a case we can succeed the allocation. 34.426 + */ 34.427 + for (;;) { 34.428 + zone_t *z = *(zone++); 34.429 + if (!z) 34.430 + break; 34.431 + 34.432 + if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { 34.433 + page = rmqueue(z, order); 34.434 + if (page) 34.435 + return page; 34.436 + } 34.437 + } 34.438 + } 34.439 + 34.440 + out: 34.441 + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", 34.442 + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); 34.443 + if (unlikely(vm_gfp_debug)) 34.444 + dump_stack(); 34.445 + return NULL; 34.446 +} 34.447 + 34.448 +/* 34.449 + * Common helper functions. 
34.450 + */ 34.451 +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) 34.452 +{ 34.453 + struct page * page; 34.454 + 34.455 + page = alloc_pages(gfp_mask, order); 34.456 + if (!page) 34.457 + return 0; 34.458 + return (unsigned long) page_address(page); 34.459 +} 34.460 + 34.461 +unsigned long get_zeroed_page(unsigned int gfp_mask) 34.462 +{ 34.463 + struct page * page; 34.464 + 34.465 + page = alloc_pages(gfp_mask, 0); 34.466 + if (page) { 34.467 + void *address = page_address(page); 34.468 + clear_page(address); 34.469 + return (unsigned long) address; 34.470 + } 34.471 + return 0; 34.472 +} 34.473 + 34.474 +void __free_pages(struct page *page, unsigned int order) 34.475 +{ 34.476 + if (!PageReserved(page) && put_page_testzero(page)) 34.477 + __free_pages_ok(page, order); 34.478 +} 34.479 + 34.480 +void free_pages(unsigned long addr, unsigned int order) 34.481 +{ 34.482 + if (addr != 0) 34.483 + __free_pages(virt_to_page(addr), order); 34.484 +} 34.485 + 34.486 +/* 34.487 + * Total amount of free (allocatable) RAM: 34.488 + */ 34.489 +unsigned int nr_free_pages (void) 34.490 +{ 34.491 + unsigned int sum = 0; 34.492 + zone_t *zone; 34.493 + 34.494 + for_each_zone(zone) 34.495 + sum += zone->free_pages; 34.496 + 34.497 + return sum; 34.498 +} 34.499 + 34.500 +/* 34.501 + * Amount of free RAM allocatable as buffer memory: 34.502 + */ 34.503 +unsigned int nr_free_buffer_pages (void) 34.504 +{ 34.505 + pg_data_t *pgdat; 34.506 + unsigned int sum = 0; 34.507 + zonelist_t *zonelist; 34.508 + zone_t **zonep, *zone; 34.509 + 34.510 + for_each_pgdat(pgdat) { 34.511 + int class_idx; 34.512 + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); 34.513 + zonep = zonelist->zones; 34.514 + zone = *zonep; 34.515 + class_idx = zone_idx(zone); 34.516 + 34.517 + sum += zone->nr_cache_pages; 34.518 + for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { 34.519 + int free = zone->free_pages - zone->watermarks[class_idx].high; 34.520 + if (free <= 0) 34.521 + continue; 34.522 + sum += free; 34.523 + } 34.524 + } 34.525 + 34.526 + return sum; 34.527 +} 34.528 + 34.529 +#if CONFIG_HIGHMEM 34.530 +unsigned int nr_free_highpages (void) 34.531 +{ 34.532 + pg_data_t *pgdat; 34.533 + unsigned int pages = 0; 34.534 + 34.535 + for_each_pgdat(pgdat) 34.536 + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 34.537 + 34.538 + return pages; 34.539 +} 34.540 + 34.541 +unsigned int freeable_lowmem(void) 34.542 +{ 34.543 + unsigned int pages = 0; 34.544 + pg_data_t *pgdat; 34.545 + 34.546 + for_each_pgdat(pgdat) { 34.547 + pages += pgdat->node_zones[ZONE_DMA].free_pages; 34.548 + pages += pgdat->node_zones[ZONE_DMA].nr_active_pages; 34.549 + pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages; 34.550 + pages += pgdat->node_zones[ZONE_NORMAL].free_pages; 34.551 + pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages; 34.552 + pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages; 34.553 + } 34.554 + 34.555 + return pages; 34.556 +} 34.557 +#endif 34.558 + 34.559 +#define K(x) ((x) << (PAGE_SHIFT-10)) 34.560 + 34.561 +/* 34.562 + * Show free area list (used inside shift_scroll-lock stuff) 34.563 + * We also calculate the percentage fragmentation. We do this by counting the 34.564 + * memory on each free list with the exception of the first item on the list. 
34.565 + */ 34.566 +void show_free_areas_core(pg_data_t *pgdat) 34.567 +{ 34.568 + unsigned int order; 34.569 + unsigned type; 34.570 + pg_data_t *tmpdat = pgdat; 34.571 + 34.572 + printk("Free pages: %6dkB (%6dkB HighMem)\n", 34.573 + K(nr_free_pages()), 34.574 + K(nr_free_highpages())); 34.575 + 34.576 + while (tmpdat) { 34.577 + zone_t *zone; 34.578 + for (zone = tmpdat->node_zones; 34.579 + zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) 34.580 + printk("Zone:%s freepages:%6lukB\n", 34.581 + zone->name, 34.582 + K(zone->free_pages)); 34.583 + 34.584 + tmpdat = tmpdat->node_next; 34.585 + } 34.586 + 34.587 + printk("( Active: %d, inactive: %d, free: %d )\n", 34.588 + nr_active_pages, 34.589 + nr_inactive_pages, 34.590 + nr_free_pages()); 34.591 + 34.592 + for (type = 0; type < MAX_NR_ZONES; type++) { 34.593 + struct list_head *head, *curr; 34.594 + zone_t *zone = pgdat->node_zones + type; 34.595 + unsigned long nr, total, flags; 34.596 + 34.597 + total = 0; 34.598 + if (zone->size) { 34.599 + spin_lock_irqsave(&zone->lock, flags); 34.600 + for (order = 0; order < MAX_ORDER; order++) { 34.601 + head = &(zone->free_area + order)->free_list; 34.602 + curr = head; 34.603 + nr = 0; 34.604 + for (;;) { 34.605 + if ((curr = curr->next) == head) 34.606 + break; 34.607 + nr++; 34.608 + } 34.609 + total += nr * (1 << order); 34.610 + printk("%lu*%lukB ", nr, K(1UL) << order); 34.611 + } 34.612 + spin_unlock_irqrestore(&zone->lock, flags); 34.613 + } 34.614 + printk("= %lukB)\n", K(total)); 34.615 + } 34.616 + 34.617 +#ifdef SWAP_CACHE_INFO 34.618 + show_swap_cache_info(); 34.619 +#endif 34.620 +} 34.621 + 34.622 +void show_free_areas(void) 34.623 +{ 34.624 + show_free_areas_core(pgdat_list); 34.625 +} 34.626 + 34.627 +/* 34.628 + * Builds allocation fallback zone lists. 34.629 + */ 34.630 +static inline void build_zonelists(pg_data_t *pgdat) 34.631 +{ 34.632 + int i, j, k; 34.633 + 34.634 + for (i = 0; i <= GFP_ZONEMASK; i++) { 34.635 + zonelist_t *zonelist; 34.636 + zone_t *zone; 34.637 + 34.638 + zonelist = pgdat->node_zonelists + i; 34.639 + memset(zonelist, 0, sizeof(*zonelist)); 34.640 + 34.641 + j = 0; 34.642 + k = ZONE_NORMAL; 34.643 + if (i & __GFP_HIGHMEM) 34.644 + k = ZONE_HIGHMEM; 34.645 + if (i & __GFP_DMA) 34.646 + k = ZONE_DMA; 34.647 + 34.648 + switch (k) { 34.649 + default: 34.650 + BUG(); 34.651 + /* 34.652 + * fallthrough: 34.653 + */ 34.654 + case ZONE_HIGHMEM: 34.655 + zone = pgdat->node_zones + ZONE_HIGHMEM; 34.656 + if (zone->size) { 34.657 +#ifndef CONFIG_HIGHMEM 34.658 + BUG(); 34.659 +#endif 34.660 + zonelist->zones[j++] = zone; 34.661 + } 34.662 + case ZONE_NORMAL: 34.663 + zone = pgdat->node_zones + ZONE_NORMAL; 34.664 + if (zone->size) 34.665 + zonelist->zones[j++] = zone; 34.666 + case ZONE_DMA: 34.667 + zone = pgdat->node_zones + ZONE_DMA; 34.668 + if (zone->size) 34.669 + zonelist->zones[j++] = zone; 34.670 + } 34.671 + zonelist->zones[j++] = NULL; 34.672 + } 34.673 +} 34.674 + 34.675 +/* 34.676 + * Helper functions to size the waitqueue hash table. 34.677 + * Essentially these want to choose hash table sizes sufficiently 34.678 + * large so that collisions trying to wait on pages are rare. 34.679 + * But in fact, the number of active page waitqueues on typical 34.680 + * systems is ridiculously low, less than 200. So this is even 34.681 + * conservative, even though it seems large. 34.682 + * 34.683 + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 34.684 + * waitqueues, i.e. the size of the waitq table given the number of pages. 
34.685 + */ 34.686 +#define PAGES_PER_WAITQUEUE 256 34.687 + 34.688 +static inline unsigned long wait_table_size(unsigned long pages) 34.689 +{ 34.690 + unsigned long size = 1; 34.691 + 34.692 + pages /= PAGES_PER_WAITQUEUE; 34.693 + 34.694 + while (size < pages) 34.695 + size <<= 1; 34.696 + 34.697 + /* 34.698 + * Once we have dozens or even hundreds of threads sleeping 34.699 + * on IO we've got bigger problems than wait queue collision. 34.700 + * Limit the size of the wait table to a reasonable size. 34.701 + */ 34.702 + size = min(size, 4096UL); 34.703 + 34.704 + return size; 34.705 +} 34.706 + 34.707 +/* 34.708 + * This is an integer logarithm so that shifts can be used later 34.709 + * to extract the more random high bits from the multiplicative 34.710 + * hash function before the remainder is taken. 34.711 + */ 34.712 +static inline unsigned long wait_table_bits(unsigned long size) 34.713 +{ 34.714 + return ffz(~size); 34.715 +} 34.716 + 34.717 +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 34.718 + 34.719 +/* 34.720 + * Set up the zone data structures: 34.721 + * - mark all pages reserved 34.722 + * - mark all memory queues empty 34.723 + * - clear the memory bitmaps 34.724 + */ 34.725 +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, 34.726 + unsigned long *zones_size, unsigned long zone_start_paddr, 34.727 + unsigned long *zholes_size, struct page *lmem_map) 34.728 +{ 34.729 + unsigned long i, j; 34.730 + unsigned long map_size; 34.731 + unsigned long totalpages, offset, realtotalpages; 34.732 + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); 34.733 + 34.734 + if (zone_start_paddr & ~PAGE_MASK) 34.735 + BUG(); 34.736 + 34.737 + totalpages = 0; 34.738 + for (i = 0; i < MAX_NR_ZONES; i++) { 34.739 + unsigned long size = zones_size[i]; 34.740 + totalpages += size; 34.741 + } 34.742 + realtotalpages = totalpages; 34.743 + if (zholes_size) 34.744 + for (i = 0; i < MAX_NR_ZONES; i++) 34.745 + realtotalpages -= zholes_size[i]; 34.746 + 34.747 + printk("On node %d totalpages: %lu\n", nid, realtotalpages); 34.748 + 34.749 + /* 34.750 + * Some architectures (with lots of mem and discontinous memory 34.751 + * maps) have to search for a good mem_map area: 34.752 + * For discontigmem, the conceptual mem map array starts from 34.753 + * PAGE_OFFSET, we need to align the actual array onto a mem map 34.754 + * boundary, so that MAP_NR works. 
34.755 + */ 34.756 + map_size = (totalpages + 1)*sizeof(struct page); 34.757 + if (lmem_map == (struct page *)0) { 34.758 + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); 34.759 + lmem_map = (struct page *)(PAGE_OFFSET + 34.760 + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); 34.761 + } 34.762 + *gmap = pgdat->node_mem_map = lmem_map; 34.763 + pgdat->node_size = totalpages; 34.764 + pgdat->node_start_paddr = zone_start_paddr; 34.765 + pgdat->node_start_mapnr = (lmem_map - mem_map); 34.766 + pgdat->nr_zones = 0; 34.767 + 34.768 + offset = lmem_map - mem_map; 34.769 + for (j = 0; j < MAX_NR_ZONES; j++) { 34.770 + zone_t *zone = pgdat->node_zones + j; 34.771 + unsigned long mask; 34.772 + unsigned long size, realsize; 34.773 + int idx; 34.774 + 34.775 + zone_table[nid * MAX_NR_ZONES + j] = zone; 34.776 + realsize = size = zones_size[j]; 34.777 + if (zholes_size) 34.778 + realsize -= zholes_size[j]; 34.779 + 34.780 + printk("zone(%lu): %lu pages.\n", j, size); 34.781 + zone->size = size; 34.782 + zone->realsize = realsize; 34.783 + zone->name = zone_names[j]; 34.784 + zone->lock = SPIN_LOCK_UNLOCKED; 34.785 + zone->zone_pgdat = pgdat; 34.786 + zone->free_pages = 0; 34.787 + zone->need_balance = 0; 34.788 + zone->nr_active_pages = zone->nr_inactive_pages = 0; 34.789 + 34.790 + 34.791 + if (!size) 34.792 + continue; 34.793 + 34.794 + /* 34.795 + * The per-page waitqueue mechanism uses hashed waitqueues 34.796 + * per zone. 34.797 + */ 34.798 + zone->wait_table_size = wait_table_size(size); 34.799 + zone->wait_table_shift = 34.800 + BITS_PER_LONG - wait_table_bits(zone->wait_table_size); 34.801 + zone->wait_table = (wait_queue_head_t *) 34.802 + alloc_bootmem_node(pgdat, zone->wait_table_size 34.803 + * sizeof(wait_queue_head_t)); 34.804 + 34.805 + for(i = 0; i < zone->wait_table_size; ++i) 34.806 + init_waitqueue_head(zone->wait_table + i); 34.807 + 34.808 + pgdat->nr_zones = j+1; 34.809 + 34.810 + mask = (realsize / zone_balance_ratio[j]); 34.811 + if (mask < zone_balance_min[j]) 34.812 + mask = zone_balance_min[j]; 34.813 + else if (mask > zone_balance_max[j]) 34.814 + mask = zone_balance_max[j]; 34.815 + zone->watermarks[j].min = mask; 34.816 + zone->watermarks[j].low = mask*2; 34.817 + zone->watermarks[j].high = mask*3; 34.818 + /* now set the watermarks of the lower zones in the "j" classzone */ 34.819 + for (idx = j-1; idx >= 0; idx--) { 34.820 + zone_t * lower_zone = pgdat->node_zones + idx; 34.821 + unsigned long lower_zone_reserve; 34.822 + if (!lower_zone->size) 34.823 + continue; 34.824 + 34.825 + mask = lower_zone->watermarks[idx].min; 34.826 + lower_zone->watermarks[j].min = mask; 34.827 + lower_zone->watermarks[j].low = mask*2; 34.828 + lower_zone->watermarks[j].high = mask*3; 34.829 + 34.830 + /* now the brainer part */ 34.831 + lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; 34.832 + lower_zone->watermarks[j].min += lower_zone_reserve; 34.833 + lower_zone->watermarks[j].low += lower_zone_reserve; 34.834 + lower_zone->watermarks[j].high += lower_zone_reserve; 34.835 + 34.836 + realsize += lower_zone->realsize; 34.837 + } 34.838 + 34.839 + zone->zone_mem_map = mem_map + offset; 34.840 + zone->zone_start_mapnr = offset; 34.841 + zone->zone_start_paddr = zone_start_paddr; 34.842 + 34.843 + if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) 34.844 + printk("BUG: wrong zone alignment, it will crash\n"); 34.845 + 34.846 + /* 34.847 + * Initially all pages are reserved - free ones are freed 34.848 + * up by free_all_bootmem() once the 
early boot process is 34.849 + * done. Non-atomic initialization, single-pass. 34.850 + */ 34.851 + for (i = 0; i < size; i++) { 34.852 + struct page *page = mem_map + offset + i; 34.853 + set_page_zone(page, nid * MAX_NR_ZONES + j); 34.854 + set_page_count(page, 0); 34.855 + SetPageReserved(page); 34.856 + INIT_LIST_HEAD(&page->list); 34.857 + if (j != ZONE_HIGHMEM) 34.858 + set_page_address(page, __va(zone_start_paddr)); 34.859 + zone_start_paddr += PAGE_SIZE; 34.860 + } 34.861 + 34.862 + offset += size; 34.863 + for (i = 0; ; i++) { 34.864 + unsigned long bitmap_size; 34.865 + 34.866 + INIT_LIST_HEAD(&zone->free_area[i].free_list); 34.867 + if (i == MAX_ORDER-1) { 34.868 + zone->free_area[i].map = NULL; 34.869 + break; 34.870 + } 34.871 + 34.872 + /* 34.873 + * Page buddy system uses "index >> (i+1)", 34.874 + * where "index" is at most "size-1". 34.875 + * 34.876 + * The extra "+3" is to round down to byte 34.877 + * size (8 bits per byte assumption). Thus 34.878 + * we get "(size-1) >> (i+4)" as the last byte 34.879 + * we can access. 34.880 + * 34.881 + * The "+1" is because we want to round the 34.882 + * byte allocation up rather than down. So 34.883 + * we should have had a "+7" before we shifted 34.884 + * down by three. Also, we have to add one as 34.885 + * we actually _use_ the last bit (it's [0,n] 34.886 + * inclusive, not [0,n[). 34.887 + * 34.888 + * So we actually had +7+1 before we shift 34.889 + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 34.890 + * (modulo overflows, which we do not have). 34.891 + * 34.892 + * Finally, we LONG_ALIGN because all bitmap 34.893 + * operations are on longs. 34.894 + */ 34.895 + bitmap_size = (size-1) >> (i+4); 34.896 + bitmap_size = LONG_ALIGN(bitmap_size+1); 34.897 + zone->free_area[i].map = 34.898 + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); 34.899 + } 34.900 + } 34.901 + build_zonelists(pgdat); 34.902 +} 34.903 + 34.904 +void __init free_area_init(unsigned long *zones_size) 34.905 +{ 34.906 + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); 34.907 +} 34.908 + 34.909 +static int __init setup_mem_frac(char *str) 34.910 +{ 34.911 + int j = 0; 34.912 + 34.913 + while (get_option(&str, &zone_balance_ratio[j++]) == 2); 34.914 + printk("setup_mem_frac: "); 34.915 + for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); 34.916 + printk("\n"); 34.917 + return 1; 34.918 +} 34.919 + 34.920 +__setup("memfrac=", setup_mem_frac); 34.921 + 34.922 +static int __init setup_lower_zone_reserve(char *str) 34.923 +{ 34.924 + int j = 0; 34.925 + 34.926 + while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); 34.927 + printk("setup_lower_zone_reserve: "); 34.928 + for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); 34.929 + printk("\n"); 34.930 + return 1; 34.931 +} 34.932 + 34.933 +__setup("lower_zone_reserve=", setup_lower_zone_reserve);
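Note on the sparse-tree page_alloc.c above: the key local modification is in __free_pages_ok(), which no longer treats a freed page with a non-NULL page->mapping as a bug but instead calls through page->mapping as a destructor. This lets a driver reclaim pages it has loaned out (presumably frames granted to other domains by the network backend introduced in this changeset) as soon as their last reference is dropped. One plausible usage pattern, with hypothetical names:

    /* Destructor invoked by __free_pages_ok() above when the page's last
     * reference is dropped. */
    static void my_page_release(struct page *page)
    {
        page->mapping = NULL;        /* detach the hook                    */
        set_page_count(page, 1);     /* re-take ownership of the page      */
        /* ... return the page to the driver's private free pool ... */
    }

    /* Loan a driver-owned page to code that will eventually __free_pages() it. */
    static void loan_page(struct page *page)
    {
        page->mapping = (struct address_space *)my_page_release;
    }

Note that the destructor receives only the struct page, not the order, so the hook is really only meaningful for order-0 allocations.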