ia64/xen-unstable

changeset 269:f51eab080fa1

bitkeeper revision 1.115 (3e6ba94627SF_Dv66Al7guNkgaK_xg)

Many files:
Add scatter/gather to the Xen blkdev interface. Our write speeds are now comparable with Linux. Also fixed a few bugs.
author kaf24@labyrinth.cl.cam.ac.uk
date Sun Mar 09 20:51:18 2003 +0000 (2003-03-09)
parents 6501c2dbec48
children 565a9104c380
files xen/common/dom_mem_ops.c xen/common/domain.c xen/common/memory.c xen/drivers/block/ll_rw_blk.c xen/drivers/block/xen_block.c xen/drivers/block/xen_segment.c xen/drivers/ide/ide-dma.c xen/include/hypervisor-ifs/block.h xen/include/xeno/blkdev.h xen/include/xeno/sched.h xen/include/xeno/segment.h xen/net/dev.c xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c
line diff
     1.1 --- a/xen/common/dom_mem_ops.c	Fri Mar 07 18:26:52 2003 +0000
     1.2 +++ b/xen/common/dom_mem_ops.c	Sun Mar 09 20:51:18 2003 +0000
     1.3 @@ -37,6 +37,7 @@ static long alloc_dom_mem(struct task_st
     1.4          return -ENOMEM;
     1.5      
     1.6      spin_lock_irqsave(&free_list_lock, flags);
     1.7 +    spin_lock(&p->page_lock);
     1.8      
     1.9      temp = free_list.next;
    1.10      for ( i = 0; i < bop.size; i++ )
    1.11 @@ -63,6 +64,7 @@ static long alloc_dom_mem(struct task_st
    1.12          unmap_domain_mem(va);
    1.13      }
    1.14  
    1.15 +    spin_unlock(&p->page_lock);
    1.16      spin_unlock_irqrestore(&free_list_lock, flags);
    1.17      
    1.18      return bop.size;
    1.19 @@ -78,7 +80,8 @@ static long free_dom_mem(struct task_str
    1.20      long              rc = 0;
    1.21  
    1.22      spin_lock_irqsave(&free_list_lock, flags);
    1.23 -    
    1.24 +    spin_lock(&p->page_lock);
    1.25 +
    1.26      temp = free_list.next;
    1.27      for ( i = 0; i < bop.size; i++ )
    1.28      {
    1.29 @@ -94,7 +97,7 @@ static long free_dom_mem(struct task_str
    1.30  
    1.31          pf = &frame_table[mpfn];
    1.32          if ( (pf->type_count != 0) || 
    1.33 -             (pf->type_count != 0) ||
    1.34 +             (pf->tot_count != 0) ||
    1.35               (pf->flags != p->domain) )
    1.36          {
    1.37              DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
    1.38 @@ -113,6 +116,7 @@ static long free_dom_mem(struct task_str
    1.39      }
    1.40  
    1.41   out:
    1.42 +    spin_unlock(&p->page_lock);
    1.43      spin_unlock_irqrestore(&free_list_lock, flags);
    1.44      
    1.45      return rc ? rc : bop.size;
     2.1 --- a/xen/common/domain.c	Fri Mar 07 18:26:52 2003 +0000
     2.2 +++ b/xen/common/domain.c	Sun Mar 09 20:51:18 2003 +0000
     2.3 @@ -46,6 +46,7 @@ struct task_struct *do_newdomain(unsigne
     2.4      p->processor = cpu;
     2.5  
     2.6      spin_lock_init(&p->blk_ring_lock);
     2.7 +    spin_lock_init(&p->page_lock);
     2.8  
     2.9      p->shared_info = (void *)get_free_page(GFP_KERNEL);
    2.10      memset(p->shared_info, 0, PAGE_SIZE);
     3.1 --- a/xen/common/memory.c	Fri Mar 07 18:26:52 2003 +0000
     3.2 +++ b/xen/common/memory.c	Sun Mar 09 20:51:18 2003 +0000
     3.3 @@ -726,6 +726,7 @@ int do_process_page_updates(page_update_
     3.4          err = 1;
     3.5  
     3.6          /* Least significant bits of 'ptr' demux the operation type. */
     3.7 +        spin_lock_irq(&current->page_lock);
     3.8          switch ( req.ptr & (sizeof(l1_pgentry_t)-1) )
     3.9          {
    3.10              /*
    3.11 @@ -799,6 +800,7 @@ int do_process_page_updates(page_update_
    3.12              MEM_LOG("Invalid page update command %08lx", req.ptr);
    3.13              break;
    3.14          }
    3.15 +        spin_unlock_irq(&current->page_lock);
    3.16  
    3.17          if ( err )
    3.18          {
     4.1 --- a/xen/drivers/block/ll_rw_blk.c	Fri Mar 07 18:26:52 2003 +0000
     4.2 +++ b/xen/drivers/block/ll_rw_blk.c	Sun Mar 09 20:51:18 2003 +0000
     4.3 @@ -1224,7 +1224,7 @@ void ll_rw_block(int rw, int nr, struct 
     4.4  			continue;
     4.5  
     4.6  		/* We have the buffer lock */
     4.7 -		atomic_inc(&bh->b_count);
     4.8 +		/*atomic_inc(&bh->b_count);*/
     4.9  
    4.10  		switch(rw) {
    4.11  		case WRITE:
     5.1 --- a/xen/drivers/block/xen_block.c	Fri Mar 07 18:26:52 2003 +0000
     5.2 +++ b/xen/drivers/block/xen_block.c	Sun Mar 09 20:51:18 2003 +0000
     5.3 @@ -18,7 +18,7 @@
     5.4  #include <xeno/interrupt.h>
     5.5  #include <xeno/segment.h>
     5.6  
     5.7 -#if 0
     5.8 +#if 1
     5.9  #define DPRINTK(_f, _a...) printk( _f , ## _a )
    5.10  #else
    5.11  #define DPRINTK(_f, _a...) ((void)0)
    5.12 @@ -28,12 +28,30 @@
    5.13   * These are rather arbitrary. They are fairly large because adjacent
    5.14   * requests pulled from a communication ring are quite likely to end
    5.15   * up being part of the same scatter/gather request at the disc.
    5.16 - * It might be a good idea to add scatter/gather support explicitly to
    5.17 - * the scatter/gather ring (eg. each request has an array of N pointers);
    5.18 - * then these values would better reflect real costs at the disc.
    5.19 + * 
    5.20 + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
    5.21 + * This will increase the chances of being able to write whole tracks.
    5.22 + * '64' should be enough to keep us competitive with Linux.
    5.23   */
    5.24 -#define MAX_PENDING_REQS 32
    5.25 -#define BATCH_PER_DOMAIN 8
    5.26 +#define MAX_PENDING_REQS 64
    5.27 +#define BATCH_PER_DOMAIN 16
    5.28 +
    5.29 +/*
    5.30 + * Each outstanding request which we've passed to the lower device layers
    5.31 + * has a 'pending_req' allocated to it. Each buffer_head that completes
    5.32 + * decrements the pendcnt towards zero. When it hits zero, the specified
    5.33 + * domain has a response queued for it, with the saved 'id' passed back.
    5.34 + * 
    5.35 + * We can't allocate pending_req's in order, since they may complete out
    5.36 + * of order. We therefore maintain an allocation ring. This ring also 
    5.37 + * indicates when enough work has been passed down -- at that point the
    5.38 + * allocation ring will be empty.
    5.39 + */
    5.40 +static pending_req_t pending_reqs[MAX_PENDING_REQS];
    5.41 +static unsigned char pending_ring[MAX_PENDING_REQS];
    5.42 +static unsigned int pending_prod, pending_cons;
    5.43 +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
    5.44 +#define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))
    5.45  
    5.46  static kmem_cache_t *buffer_head_cachep;
    5.47  static atomic_t nr_pending;
    5.48 @@ -65,6 +83,18 @@ static kdev_t scsi_devs[NR_SCSI_DEVS] = 
    5.49      MKDEV(SCSI_DISK0_MAJOR, 224), MKDEV(SCSI_DISK0_MAJOR, 240), /* sdo, sdp */
    5.50  };
    5.51  
    5.52 +static int __buffer_is_valid(struct task_struct *p, 
    5.53 +                             unsigned long buffer, 
    5.54 +                             unsigned short size,
    5.55 +                             int writeable_buffer);
    5.56 +static void __lock_buffer(unsigned long buffer,
    5.57 +                          unsigned short size,
    5.58 +                          int writeable_buffer);
    5.59 +static void unlock_buffer(struct task_struct *p,
    5.60 +                          unsigned long buffer,
    5.61 +                          unsigned short size,
    5.62 +                          int writeable_buffer);
    5.63 +
    5.64  static void io_schedule(unsigned long unused);
    5.65  static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
    5.66  static void dispatch_rw_block_io(struct task_struct *p, int index);
    5.67 @@ -73,8 +103,8 @@ static void dispatch_probe_seg(struct ta
    5.68  static void dispatch_debug_block_io(struct task_struct *p, int index);
    5.69  static void dispatch_create_segment(struct task_struct *p, int index);
    5.70  static void dispatch_delete_segment(struct task_struct *p, int index);
    5.71 -static void make_response(struct task_struct *p, void *id, int op, 
    5.72 -			  unsigned long st);
    5.73 +static void make_response(struct task_struct *p, unsigned long id, 
    5.74 +                          unsigned short op, unsigned long st);
    5.75  
    5.76  
    5.77  /******************************************************************
    5.78 @@ -165,28 +195,27 @@ static void maybe_trigger_io_schedule(vo
    5.79  
    5.80  static void end_block_io_op(struct buffer_head *bh, int uptodate)
    5.81  {
    5.82 -    struct pfn_info *page;
    5.83 -    unsigned long pfn;
    5.84 +    unsigned long flags;
    5.85 +    pending_req_t *pending_req = bh->pending_req;
    5.86 +
    5.87 +    unlock_buffer(pending_req->domain, 
    5.88 +                  virt_to_phys(bh->b_data), 
    5.89 +                  bh->b_size, 
    5.90 +                  (pending_req->operation==READ));
    5.91  
    5.92 -    for ( pfn = virt_to_phys(bh->b_data) >> PAGE_SHIFT; 
    5.93 -          pfn < ((virt_to_phys(bh->b_data) + bh->b_size + PAGE_SIZE - 1) >> 
    5.94 -                 PAGE_SHIFT);
    5.95 -          pfn++ )
    5.96 +    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    5.97      {
    5.98 -        page = frame_table + pfn;
    5.99 -        if ( ((bh->b_state & (1 << BH_Read)) != 0) &&
   5.100 -             (put_page_type(page) == 0) )
   5.101 -            page->flags &= ~PG_type_mask;
   5.102 -        put_page_tot(page);
   5.103 +        make_response(pending_req->domain, pending_req->id,
   5.104 +                      pending_req->operation, uptodate ? 0 : 1);
   5.105 +        spin_lock_irqsave(&pend_prod_lock, flags);
   5.106 +        pending_ring[pending_prod] = pending_req - pending_reqs;
   5.107 +        PENDREQ_IDX_INC(pending_prod);
   5.108 +        spin_unlock_irqrestore(&pend_prod_lock, flags);
   5.109 +        atomic_dec(&nr_pending);
   5.110 +        maybe_trigger_io_schedule();
   5.111      }
   5.112  
   5.113 -    atomic_dec(&nr_pending);
   5.114 -    make_response(bh->b_xen_domain, bh->b_xen_id, 
   5.115 -		  XEN_BLOCK_READ, uptodate ? 0 : 1);
   5.116 -
   5.117      kmem_cache_free(buffer_head_cachep, bh);
   5.118 -
   5.119 -    maybe_trigger_io_schedule();
   5.120  }
   5.121  
   5.122  
   5.123 @@ -208,16 +237,105 @@ long do_block_io_op(void)
   5.124   * DOWNWARD CALLS -- These interface with the block-device layer proper.
   5.125   */
   5.126  
   5.127 -static int do_block_io_op_domain(struct task_struct* p, int max_to_do)
   5.128 +static int __buffer_is_valid(struct task_struct *p, 
   5.129 +                             unsigned long buffer, 
   5.130 +                             unsigned short size,
   5.131 +                             int writeable_buffer)
   5.132 +{
   5.133 +    unsigned long    pfn;
   5.134 +    struct pfn_info *page;
   5.135 +    int rc = 0;
   5.136 +
   5.137 +    /* A request may span multiple page frames. Each must be checked. */
   5.138 +    for ( pfn = buffer >> PAGE_SHIFT; 
   5.139 +          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   5.140 +          pfn++ )
   5.141 +    {
   5.142 +        /* Each frame must be within bounds of machine memory. */
   5.143 +        if ( pfn >= max_page )
   5.144 +        {
   5.145 +            DPRINTK("pfn out of range: %08lx\n", pfn);
   5.146 +            goto out;
   5.147 +        }
   5.148 +
   5.149 +        page = frame_table + pfn;
   5.150 +
   5.151 +        /* Each frame must belong to the requesting domain. */
   5.152 +        if ( (page->flags & PG_domain_mask) != p->domain )
   5.153 +        {
   5.154 +            DPRINTK("bad domain: expected %d, got %ld\n", 
   5.155 +                    p->domain, page->flags & PG_domain_mask);
   5.156 +            goto out;
   5.157 +        }
   5.158 +
   5.159 +        /* If reading into the frame, the frame must be writeable. */
   5.160 +        if ( writeable_buffer &&
   5.161 +             ((page->flags & PG_type_mask) != PGT_writeable_page) )
   5.162 +        {
   5.163 +            DPRINTK("non-writeable page passed for block read\n");
   5.164 +            goto out;
   5.165 +        }
   5.166 +    }    
   5.167 +
   5.168 +    rc = 1;
   5.169 + out:
   5.170 +    return rc;
   5.171 +}
   5.172 +
   5.173 +static void __lock_buffer(unsigned long buffer,
   5.174 +                          unsigned short size,
   5.175 +                          int writeable_buffer)
   5.176 +{
   5.177 +    unsigned long    pfn;
   5.178 +    struct pfn_info *page;
   5.179 +
   5.180 +    for ( pfn = buffer >> PAGE_SHIFT; 
   5.181 +          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   5.182 +          pfn++ )
   5.183 +    {
   5.184 +        page = frame_table + pfn;
   5.185 +        if ( writeable_buffer ) get_page_type(page);
   5.186 +        get_page_tot(page);
   5.187 +    }
   5.188 +}
   5.189 +
   5.190 +static void unlock_buffer(struct task_struct *p,
   5.191 +                          unsigned long buffer,
   5.192 +                          unsigned short size,
   5.193 +                          int writeable_buffer)
   5.194 +{
   5.195 +    unsigned long    pfn, flags;
   5.196 +    struct pfn_info *page;
   5.197 +
   5.198 +    spin_lock_irqsave(&p->page_lock, flags);
   5.199 +    for ( pfn = buffer >> PAGE_SHIFT; 
   5.200 +          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   5.201 +          pfn++ )
   5.202 +    {
   5.203 +        page = frame_table + pfn;
   5.204 +        if ( writeable_buffer && (put_page_type(page) == 0) )
   5.205 +            page->flags &= ~PG_type_mask;
   5.206 +        put_page_tot(page);
   5.207 +    }
   5.208 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.209 +}
   5.210 +
   5.211 +static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
   5.212  {
   5.213      blk_ring_t *blk_ring = p->blk_ring_base;
   5.214      int i, more_to_do = 0;
   5.215  
   5.216 +    /*
   5.217 +     * Take items off the comms ring, taking care not to catch up
   5.218 +     * with the response-producer index.
   5.219 +     */
   5.220      for ( i = p->blk_req_cons; 
   5.221 -	  i != blk_ring->req_prod; 
   5.222 +	  (i != blk_ring->req_prod) &&
   5.223 +              (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1); 
   5.224  	  i = BLK_RING_INC(i) ) 
   5.225      {
   5.226 -        if ( max_to_do-- == 0 )
   5.227 +        if ( (max_to_do-- == 0) || 
   5.228 +             (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
   5.229          {
   5.230              more_to_do = 1;
   5.231              break;
   5.232 @@ -251,8 +369,11 @@ static int do_block_io_op_domain(struct 
   5.233  	    break;
   5.234  
   5.235  	default:
   5.236 -	    panic("error: unknown block io operation [%d]\n",
   5.237 -                  blk_ring->ring[i].req.operation);
   5.238 +            DPRINTK("error: unknown block io operation [%d]\n",
   5.239 +                    blk_ring->ring[i].req.operation);
   5.240 +            make_response(p, blk_ring->ring[i].req.id, 
   5.241 +                          blk_ring->ring[i].req.operation, 1);
   5.242 +            break;
   5.243  	}
   5.244      }
   5.245  
   5.246 @@ -268,23 +389,38 @@ static void dispatch_debug_block_io(stru
   5.247  static void dispatch_create_segment(struct task_struct *p, int index)
   5.248  {
   5.249      blk_ring_t *blk_ring = p->blk_ring_base;
   5.250 +    unsigned long flags, buffer;
   5.251      xv_disk_t *xvd;
   5.252      int result;
   5.253  
   5.254 -    if (p->domain != 0)
   5.255 +    if ( p->domain != 0 )
   5.256      {
   5.257          DPRINTK("dispatch_create_segment called by dom%d\n", p->domain);
   5.258 -        make_response(p, blk_ring->ring[index].req.id, 
   5.259 -                      XEN_BLOCK_SEG_CREATE, 1); 
   5.260 -        return;
   5.261 +        result = 1;
   5.262 +        goto out;
   5.263      }
   5.264  
   5.265 -    xvd = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);    
   5.266 +    buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
   5.267 +
   5.268 +    spin_lock_irqsave(&p->page_lock, flags);
   5.269 +    if ( !__buffer_is_valid(p, buffer, sizeof(xv_disk_t), 1) )
   5.270 +    {
   5.271 +        DPRINTK("Bad buffer in dispatch_create_segment\n");
   5.272 +        spin_unlock_irqrestore(&p->page_lock, flags);
   5.273 +        result = 1;
   5.274 +        goto out;
   5.275 +    }
   5.276 +    __lock_buffer(buffer, sizeof(xv_disk_t), 1);
   5.277 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.278 +
   5.279 +    xvd = phys_to_virt(buffer);
   5.280      result = xen_segment_create(xvd);
   5.281  
   5.282 +    unlock_buffer(p, buffer, sizeof(xv_disk_t), 1);    
   5.283 +
   5.284 + out:
   5.285      make_response(p, blk_ring->ring[index].req.id, 
   5.286                    XEN_BLOCK_SEG_CREATE, result); 
   5.287 -    return;
   5.288  }
   5.289  
   5.290  static void dispatch_delete_segment(struct task_struct *p, int index)
   5.291 @@ -299,13 +435,30 @@ static void dispatch_probe_blk(struct ta
   5.292  
   5.293      blk_ring_t *blk_ring = p->blk_ring_base;
   5.294      xen_disk_info_t *xdi;
   5.295 +    unsigned long flags, buffer;
   5.296 +    int rc = 0;
   5.297 +    
   5.298 +    buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
   5.299  
   5.300 -    xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);    
   5.301 +    spin_lock_irqsave(&p->page_lock, flags);
   5.302 +    if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
   5.303 +    {
   5.304 +        DPRINTK("Bad buffer in dispatch_probe_blk\n");
   5.305 +        spin_unlock_irqrestore(&p->page_lock, flags);
   5.306 +        rc = 1;
   5.307 +        goto out;
   5.308 +    }
   5.309 +    __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
   5.310 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.311  
   5.312 +    xdi = phys_to_virt(buffer);
   5.313      ide_probe_devices(xdi);
   5.314      scsi_probe_devices(xdi);
   5.315  
   5.316 -    make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, 0);
   5.317 +    unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
   5.318 +
   5.319 + out:
   5.320 +    make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, rc);
   5.321  }
   5.322  
   5.323  static void dispatch_probe_seg(struct task_struct *p, int index)
   5.324 @@ -313,175 +466,147 @@ static void dispatch_probe_seg(struct ta
   5.325      extern void xen_segment_probe(xen_disk_info_t *xdi);
   5.326      blk_ring_t *blk_ring = p->blk_ring_base;
   5.327      xen_disk_info_t *xdi;
   5.328 +    unsigned long flags, buffer;
   5.329 +    int rc = 0;
   5.330  
   5.331 -    xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);    
   5.332 +    buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
   5.333 +
   5.334 +    spin_lock_irqsave(&p->page_lock, flags);
   5.335 +    if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
   5.336 +    {
   5.337 +        DPRINTK("Bad buffer in dispatch_probe_seg\n");
   5.338 +        spin_unlock_irqrestore(&p->page_lock, flags);
   5.339 +        rc = 1;
   5.340 +        goto out;
   5.341 +    }
   5.342 +    __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
   5.343 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.344 +
   5.345 +    xdi = phys_to_virt(buffer);
   5.346      xen_segment_probe(xdi);
   5.347  
   5.348 -    make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, 0);
   5.349 +    unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
   5.350 +
   5.351 + out:
   5.352 +    make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, rc);
   5.353  }
   5.354  
   5.355  static void dispatch_rw_block_io(struct task_struct *p, int index)
   5.356  {
   5.357      extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
   5.358      blk_ring_t *blk_ring = p->blk_ring_base;
   5.359 +    blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
   5.360      struct buffer_head *bh;
   5.361 -    int operation;
   5.362 -    unsigned short size;
   5.363 -    unsigned long  block_number = 0L;
   5.364 -    unsigned long  sector_number = 0L;
   5.365 -    unsigned long buffer, pfn;
   5.366 -    struct pfn_info *page;
   5.367 -    int s, xen_device, phys_device = 0;
   5.368 -
   5.369 -    operation = (blk_ring->ring[index].req.operation == XEN_BLOCK_WRITE) ? 
   5.370 -        WRITE : READ;
   5.371 +    int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
   5.372 +    unsigned short nr_sects;
   5.373 +    unsigned long buffer, flags;
   5.374 +    int i, tot_sects;
   5.375 +    pending_req_t *pending_req;
   5.376  
   5.377 -    /* Sectors are 512 bytes. Make sure request size is a multiple. */
   5.378 -    size = blk_ring->ring[index].req.block_size; 
   5.379 -    if ( (size == 0) || (size & (0x200 - 1)) != 0 )
   5.380 +    /* We map virtual scatter/gather segments to physical segments. */
   5.381 +    int new_segs, nr_psegs = 0;
   5.382 +    phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
   5.383 +
   5.384 +    spin_lock_irqsave(&p->page_lock, flags);
   5.385 +
   5.386 +    /* Check that number of segments is sane. */
   5.387 +    if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
   5.388      {
   5.389 -	DPRINTK("dodgy block size: %d\n", 
   5.390 -                blk_ring->ring[index].req.block_size);
   5.391 -        goto bad_descriptor;
   5.392 -    }
   5.393 -
   5.394 -    /* Buffer address should be sector aligned. */
   5.395 -    buffer = (unsigned long)blk_ring->ring[index].req.buffer;
   5.396 -    if ( (buffer & (0x200 - 1)) != 0 )
   5.397 -    {
   5.398 -        DPRINTK("unaligned buffer %08lx\n", buffer);
   5.399 +        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
   5.400          goto bad_descriptor;
   5.401      }
   5.402  
   5.403 -    /* A request may span multiple page frames. Each must be checked. */
   5.404 -    for ( pfn = buffer >> PAGE_SHIFT; 
   5.405 -          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   5.406 -          pfn++ )
   5.407 +    /*
   5.408 +     * Check each address/size pair is sane, and convert into a
   5.409 +     * physical device and block offset. Note that if the offset and size
   5.410 +     * crosses a virtual extent boundary, we may end up with more
   5.411 +     * physical scatter/gather segments than virtual segments.
   5.412 +     */
   5.413 +    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
   5.414      {
   5.415 -        /* Each frame must be within bounds of machine memory. */
   5.416 -        if ( pfn >= max_page )
   5.417 +        buffer   = req->buffer_and_sects[i] & ~0x1FF;
   5.418 +        nr_sects = req->buffer_and_sects[i] &  0x1FF;
   5.419 +
   5.420 +        if ( nr_sects == 0 )
   5.421          {
   5.422 -            DPRINTK("pfn out of range: %08lx\n", pfn);
   5.423 -            goto bad_descriptor_free_frames;
   5.424 +            DPRINTK("zero-sized data request\n");
   5.425 +            goto bad_descriptor;
   5.426          }
   5.427  
   5.428 -        page = frame_table + pfn;
   5.429 -
   5.430 -        /* Each frame must belong to the requesting domain. */
   5.431 -        if ( (page->flags & PG_domain_mask) != p->domain )
   5.432 -        {
   5.433 -            DPRINTK("bad domain: expected %d, got %ld\n", 
   5.434 -                    p->domain, page->flags & PG_domain_mask);
   5.435 -            goto bad_descriptor_free_frames;
   5.436 -        }
   5.437 +        if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
   5.438 +            goto bad_descriptor;
   5.439  
   5.440 -        /* If reading into the frame, the frame must be writeable. */
   5.441 -        if ( operation == READ )
   5.442 +        /* Get the physical device and block index. */
   5.443 +        if ( (req->device & XENDEV_TYPE_MASK) == XENDEV_VIRTUAL )
   5.444 +        {
   5.445 +            new_segs = xen_segment_map_request(
   5.446 +                &phys_seg[nr_psegs], p, operation,
   5.447 +                req->device, 
   5.448 +                req->sector_number + tot_sects,
   5.449 +                buffer, nr_sects);
   5.450 +            if ( new_segs <= 0 ) goto bad_descriptor;
   5.451 +        }
   5.452 +        else
   5.453          {
   5.454 -            if ( (page->flags & PG_type_mask) != PGT_writeable_page )
   5.455 -            {
   5.456 -                DPRINTK("non-writeable page passed for block read\n");
   5.457 -                goto bad_descriptor_free_frames;
   5.458 -            }
   5.459 -            get_page_type(page);
   5.460 +            phys_seg[nr_psegs].dev           = xendev_to_physdev(req->device);
   5.461 +            phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
   5.462 +            phys_seg[nr_psegs].buffer        = buffer;
   5.463 +            phys_seg[nr_psegs].nr_sects      = nr_sects;
   5.464 +            if ( phys_seg[nr_psegs].dev == 0 ) goto bad_descriptor;
   5.465 +            new_segs = 1;
   5.466          }
   5.467 +        
   5.468 +        nr_psegs += new_segs;
   5.469 +        if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
   5.470 +    }
   5.471  
   5.472 -        /* Xen holds a frame reference until the operation is complete. */
   5.473 -        get_page_tot(page);
   5.474 -    }
   5.475 +    /* Lock pages associated with each buffer head. */
   5.476 +    for ( i = 0; i < nr_psegs; i++ )
   5.477 +        __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 
   5.478 +                      (operation==READ));
   5.479 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.480  
   5.481      atomic_inc(&nr_pending);
   5.482 -    bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
   5.483 -    if ( bh == NULL ) panic("bh is null\n");
   5.484 -
   5.485 -    /* set just the important bits of the buffer header */
   5.486 -    memset (bh, 0, sizeof (struct buffer_head));
   5.487 -
   5.488 -    xen_device = blk_ring->ring[index].req.device;
   5.489 -
   5.490 - again:
   5.491 -    switch ( (xen_device & XENDEV_TYPE_MASK) )
   5.492 -    {
   5.493 -    case XENDEV_IDE:
   5.494 -        xen_device &= XENDEV_IDX_MASK;
   5.495 -        if ( xen_device >= NR_IDE_DEVS )
   5.496 -        {
   5.497 -            DPRINTK("IDE device number out of range %d\n", xen_device);
   5.498 -            goto bad_descriptor_free_frames;
   5.499 -        }
   5.500 -        phys_device   = ide_devs[xen_device];
   5.501 -        block_number  = blk_ring->ring[index].req.block_number;
   5.502 -        sector_number = blk_ring->ring[index].req.sector_number;
   5.503 -        break;
   5.504 -
   5.505 -    case XENDEV_SCSI:
   5.506 -        xen_device &= XENDEV_IDX_MASK;
   5.507 -        if ( xen_device >= NR_SCSI_DEVS )
   5.508 -        {
   5.509 -            DPRINTK("SCSI device number out of range %d\n", xen_device);
   5.510 -            goto bad_descriptor_free_frames;
   5.511 -        }
   5.512 -        phys_device   = scsi_devs[xen_device];
   5.513 -        block_number  = blk_ring->ring[index].req.block_number;
   5.514 -        sector_number = blk_ring->ring[index].req.sector_number;
   5.515 -        break;
   5.516 +    pending_req = pending_reqs + pending_ring[pending_cons];
   5.517 +    PENDREQ_IDX_INC(pending_cons);
   5.518 +    pending_req->domain    = p;
   5.519 +    pending_req->id        = req->id;
   5.520 +    pending_req->operation = operation;
   5.521 +    atomic_set(&pending_req->pendcnt, nr_psegs);
   5.522  
   5.523 -    case XENDEV_VIRTUAL:
   5.524 -        xen_device &= XENDEV_IDX_MASK;
   5.525 -        s = xen_segment_map_request(
   5.526 -            &xen_device, &block_number, &sector_number,
   5.527 -            p, operation, xen_device,
   5.528 -            blk_ring->ring[index].req.block_number,
   5.529 -            blk_ring->ring[index].req.sector_number);
   5.530 -        if ( s != 0 )
   5.531 -        {
   5.532 -            DPRINTK("xen_seg_map_request status: %d\n", s);
   5.533 -            goto bad_descriptor_free_frames;
   5.534 -        }
   5.535 -        goto again; /* Loop round to convert the virt IDE/SCSI identifier. */
   5.536 -
   5.537 -    default:
   5.538 -        DPRINTK("dispatch_rw_block_io: unknown device %d\n", xen_device);
   5.539 -        goto bad_descriptor_free_frames;
   5.540 -    }
   5.541 +    /* Now we pass each segment down to the real blkdev layer. */
   5.542 +    for ( i = 0; i < nr_psegs; i++ )
   5.543 +    {
   5.544 +        bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
   5.545 +        if ( bh == NULL ) panic("bh is null\n");
   5.546 +        memset (bh, 0, sizeof (struct buffer_head));
   5.547      
   5.548 -    bh->b_blocknr       = block_number;
   5.549 -    bh->b_size          = size;
   5.550 -    bh->b_dev           = phys_device;
   5.551 -    bh->b_rsector       = sector_number;
   5.552 -    bh->b_data          = phys_to_virt(buffer);
   5.553 -    bh->b_count.counter = 1;
   5.554 -    bh->b_end_io        = end_block_io_op;
   5.555 +        bh->b_size          = phys_seg[i].nr_sects << 9;
   5.556 +        bh->b_dev           = phys_seg[i].dev;
   5.557 +        bh->b_rsector       = phys_seg[i].sector_number;
   5.558 +        bh->b_data          = phys_to_virt(phys_seg[i].buffer);
   5.559 +        bh->b_end_io        = end_block_io_op;
   5.560 +        bh->pending_req     = pending_req;
   5.561  
   5.562 -    /* Save meta data about request. */
   5.563 -    bh->b_xen_domain    = p;
   5.564 -    bh->b_xen_id        = blk_ring->ring[index].req.id;
   5.565 +        if ( operation == WRITE )
   5.566 +        {
   5.567 +            bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
   5.568 +                (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
   5.569 +        } 
   5.570 +        else
   5.571 +        {
   5.572 +            bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
   5.573 +        }
   5.574  
   5.575 -    if ( operation == WRITE )
   5.576 -    {
   5.577 -	bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
   5.578 -            (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
   5.579 -    } 
   5.580 -    else
   5.581 -    {
   5.582 -	bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
   5.583 +        /* Dispatch a single request. We'll flush it to disc later. */
   5.584 +        ll_rw_block(operation, 1, &bh);
   5.585      }
   5.586  
   5.587 -    /* Dispatch a single request. We'll flush it to disc later. */
   5.588 -    ll_rw_block(operation, 1, &bh);
   5.589      return;
   5.590  
   5.591 - bad_descriptor_free_frames:
   5.592 -    while ( pfn > (buffer >> PAGE_SHIFT) )
   5.593 -    {
   5.594 -        page = frame_table + --pfn;
   5.595 -        put_page_tot(page);
   5.596 -        if ( operation == READ ) put_page_type(page);
   5.597 -    }
   5.598 -
   5.599 - bad_descriptor: 
   5.600 -    DPRINTK("dispatch rw blockio bad descriptor\n");
   5.601 -    make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_READ, 1);
   5.602 + bad_descriptor:
   5.603 +    spin_unlock_irqrestore(&p->page_lock, flags);
   5.604 +    make_response(p, req->id, req->operation, 1);
   5.605  } 
   5.606  
   5.607  
   5.608 @@ -490,8 +615,38 @@ static void dispatch_rw_block_io(struct 
   5.609   * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
   5.610   */
   5.611  
   5.612 -static void make_response(struct task_struct *p, void *id, 
   5.613 -			  int op, unsigned long st)
   5.614 +kdev_t xendev_to_physdev(unsigned short xendev)
   5.615 +{
   5.616 +    switch ( (xendev & XENDEV_TYPE_MASK) )
   5.617 +    {
   5.618 +    case XENDEV_IDE:
   5.619 +        xendev &= XENDEV_IDX_MASK;
   5.620 +        if ( xendev >= NR_IDE_DEVS )
   5.621 +        {
   5.622 +            DPRINTK("IDE device number out of range %d\n", xendev);
   5.623 +            goto fail;
   5.624 +        }
   5.625 +        return ide_devs[xendev];
   5.626 +        
   5.627 +    case XENDEV_SCSI:
   5.628 +        xendev &= XENDEV_IDX_MASK;
   5.629 +        if ( xendev >= NR_SCSI_DEVS )
   5.630 +        {
   5.631 +            DPRINTK("SCSI device number out of range %d\n", xendev);
   5.632 +            goto fail;
   5.633 +        }
   5.634 +        return scsi_devs[xendev];
   5.635 +        
   5.636 +    default:
   5.637 +        DPRINTK("xendev_to_physdev: unknown device %d\n", xendev);
   5.638 +    }
   5.639 +
   5.640 + fail:
   5.641 +    return (kdev_t)0;
   5.642 +}
   5.643 +
   5.644 +static void make_response(struct task_struct *p, unsigned long id, 
   5.645 +			  unsigned short op, unsigned long st)
   5.646  {
   5.647      unsigned long cpu_mask, flags;
   5.648      int position;
   5.649 @@ -500,11 +655,11 @@ static void make_response(struct task_st
   5.650      /* Place on the response ring for the relevant domain. */ 
   5.651      spin_lock_irqsave(&p->blk_ring_lock, flags);
   5.652      blk_ring = p->blk_ring_base;
   5.653 -    position = blk_ring->resp_prod;
   5.654 +    position = p->blk_resp_prod;
   5.655      blk_ring->ring[position].resp.id        = id;
   5.656      blk_ring->ring[position].resp.operation = op;
   5.657      blk_ring->ring[position].resp.status    = st;
   5.658 -    blk_ring->resp_prod = BLK_RING_INC(position);
   5.659 +    p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
   5.660      spin_unlock_irqrestore(&p->blk_ring_lock, flags);
   5.661      
   5.662      /* Kick the relevant domain. */
   5.663 @@ -517,18 +672,22 @@ static void dump_blockq(u_char key, void
   5.664      struct task_struct *p;
   5.665      blk_ring_t *blk_ring ;
   5.666  
   5.667 -    printk("Dumping block queue stats: nr_pending = %d\n",
   5.668 -           atomic_read(&nr_pending));
   5.669 +    printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
   5.670 +           atomic_read(&nr_pending), pending_prod, pending_cons);
   5.671  
   5.672      p = current->next_task;
   5.673      do
   5.674      {
   5.675 -        printk (KERN_ALERT "Domain: %d\n", p->domain);
   5.676 -        blk_ring = p->blk_ring_base;
   5.677 -
   5.678 -        printk("  req_prod:%d, resp_prod:%d, req_cons:%d\n",
   5.679 -               blk_ring->req_prod, blk_ring->resp_prod, p->blk_req_cons);
   5.680 -
   5.681 +        if ( !is_idle_task(p) )
   5.682 +        {
   5.683 +            printk("Domain: %d\n", p->domain);
   5.684 +            blk_ring = p->blk_ring_base;
   5.685 +            
   5.686 +            printk("  req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
   5.687 +                   blk_ring->req_prod, p->blk_req_cons,
   5.688 +                   blk_ring->resp_prod, p->blk_resp_prod,
   5.689 +                   __on_blkdev_list(p));
   5.690 +        }
   5.691          p = p->next_task;
   5.692      } while (p != current);
   5.693  }
   5.694 @@ -545,7 +704,8 @@ void init_blkdev_info(struct task_struct
   5.695      memset(p->segment_list, 0, sizeof(p->segment_list));
   5.696      p->segment_count = 0;
   5.697  
   5.698 -    xen_refresh_segment_list(p);      /* get any previously created segments */
   5.699 +    /* Get any previously created segments. */
   5.700 +    xen_refresh_segment_list(p);
   5.701  }
   5.702  
   5.703  /* End-of-day teardown for a domain. XXX Outstanding requests? */
   5.704 @@ -558,7 +718,12 @@ void destroy_blkdev_info(struct task_str
   5.705  
   5.706  void initialize_block_io ()
   5.707  {
   5.708 +    int i;
   5.709 +
   5.710      atomic_set(&nr_pending, 0);
   5.711 +    pending_prod = pending_cons = 0;
   5.712 +    memset(pending_reqs, 0, sizeof(pending_reqs));
   5.713 +    for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
   5.714  
   5.715      spin_lock_init(&io_schedule_list_lock);
   5.716      INIT_LIST_HEAD(&io_schedule_list);
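
Editor's note: the comment added near the top of xen_block.c above describes the new 'pending_req' allocation ring: indices of free request slots sit in a small ring, the dispatcher consumes an index when it issues work, and the completion path produces the index back once every buffer_head for that request has finished. A minimal standalone sketch of that free-index ring pattern follows (names are hypothetical; the real code additionally serialises the producer with pend_prod_lock and tracks outstanding work in nr_pending):

    /*
     * Sketch of a free-index allocation ring, assuming a power-of-two size.
     * The ring holds indices of free slots; consume to allocate, produce to free.
     */
    #define NR_SLOTS 64                        /* must be a power of two */
    #define IDX_INC(_i) ((_i) = ((_i)+1) & (NR_SLOTS-1))

    static unsigned char free_ring[NR_SLOTS];  /* indices of free slots */
    static unsigned int  ring_prod, ring_cons; /* producer/consumer positions */

    static void ring_init(void)
    {
        int i;
        ring_prod = ring_cons = 0;
        for ( i = 0; i < NR_SLOTS; i++ ) free_ring[i] = i; /* all slots free */
    }

    /* Caller must know a slot is available (cf. the nr_pending count). */
    static int alloc_slot(void)
    {
        int idx = free_ring[ring_cons];
        IDX_INC(ring_cons);
        return idx;
    }

    /* Return a completed slot; caller serialises (cf. pend_prod_lock). */
    static void free_slot(int idx)
    {
        free_ring[ring_prod] = idx;
        IDX_INC(ring_prod);
    }
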
     6.1 --- a/xen/drivers/block/xen_segment.c	Fri Mar 07 18:26:52 2003 +0000
     6.2 +++ b/xen/drivers/block/xen_segment.c	Sun Mar 09 20:51:18 2003 +0000
     6.3 @@ -23,70 +23,73 @@ segment_t xsegments[XEN_MAX_SEGMENTS];
     6.4   * xen_device must be a valid device.
     6.5   */
     6.6  
     6.7 +/*
     6.8 + * NB. All offsets and sizes here are in sector units.
     6.9 + * eg. 'size == 1' means an actual size of 512 bytes.
    6.10 + */
    6.11  int xen_segment_map_request(
    6.12 -    int *phys_device,                         /* out */
    6.13 -    unsigned long *block_number,              /* out */
    6.14 -    unsigned long *sector_number,             /* out */
    6.15 -    struct task_struct *domain,
    6.16 -    int operation,
    6.17 -    int segment_number,
    6.18 -    int xen_block_number,
    6.19 -    int xen_sector_number)
    6.20 +    phys_seg_t *pseg, struct task_struct *p, int operation,
    6.21 +    unsigned short segment_number,
    6.22 +    unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects)
    6.23  {
    6.24      segment_t *seg;
    6.25 -    int sum; 
    6.26 -    int loop;
    6.27 +    extent_t  *ext;
    6.28 +    int sum, i;
    6.29  
    6.30 -    if ( segment_number >= XEN_MAX_SEGMENTS )
    6.31 -    {
    6.32 -        /* No VHD. */
    6.33 -        return 1;
    6.34 -    }
    6.35 +    if ( segment_number >= XEN_MAX_SEGMENTS ) goto fail;
    6.36  
    6.37 -    seg = domain->segment_list[segment_number];
    6.38 -    
    6.39 -    if (seg == NULL)
    6.40 -    {
    6.41 -        /* oops.  no vhd exists! */
    6.42 -        return 1;
    6.43 -    }
    6.44 +    seg = p->segment_list[segment_number];
    6.45 +    if ( seg == NULL ) goto fail;
    6.46  
    6.47      /* check domain permissions */
    6.48 -    if (seg->domain != domain->domain)
    6.49 -    {
    6.50 -        /* domain doesn't own segment */
    6.51 -        return 2;
    6.52 -    }
    6.53 +    if ( seg->domain != p->domain ) goto fail;
    6.54  
    6.55      /* check rw access */
    6.56      if ((operation == WRITE && seg->mode != XEN_SEGMENT_RW) ||
    6.57          (operation == READ  && seg->mode == XEN_SEGMENT_UNUSED))
    6.58 -    {
    6.59 -        /* access violation */
    6.60 -        return 3;
    6.61 -    }
    6.62 +        goto fail;
    6.63  
    6.64      /* find extent, check size */
    6.65      sum = 0; 
    6.66 -    loop = 0;
    6.67 -    while (loop < seg->num_extents && sum <= xen_block_number)
    6.68 -    {
    6.69 -        sum += seg->extents[loop++].size;
    6.70 -    }
    6.71 -    sum -= seg->extents[--loop].size;
    6.72 -
    6.73 -    if (sum + seg->extents[loop].size <= xen_block_number)
    6.74 +    i = 0;
    6.75 +    ext = seg->extents;
    6.76 +    while ( (i < seg->num_extents) && ((sum + ext->size) <= sect_nr) )
    6.77      {
    6.78 -        /* tried to read past the end of the segment */
    6.79 -        return 4;
    6.80 +        sum += ext->size;
    6.81 +        ext++; i++;
    6.82      }
    6.83 -    *block_number = xen_block_number - sum + seg->extents[loop].offset;
    6.84 -    *sector_number = xen_sector_number - sum + seg->extents[loop].offset;;
    6.85 +
    6.86 +    if ( (sum + ext->size) <= sect_nr ) goto fail;
    6.87 +
    6.88 +    pseg->sector_number = sect_nr + ext->offset - sum;
    6.89 +    pseg->buffer        = buffer;
    6.90 +    pseg->nr_sects      = nr_sects;
    6.91 +    pseg->dev           = xendev_to_physdev(ext->disk);
    6.92 +    if ( pseg->dev == 0 ) goto fail;
    6.93 +
    6.94 +    /* We're finished if the virtual extent didn't overrun the phys extent. */
    6.95 +    if ( (sum + ext->size) >= (sect_nr + nr_sects) )
    6.96 +        return 1; /* Just one more physical extent. */
    6.97 +
    6.98 +    /* Hmmm... make sure there's another extent to overrun onto! */
    6.99 +    if ( (i+1) == seg->num_extents ) goto fail;
   6.100  
   6.101 -    /* This actually needs to be passed thru one more indirection :-) */
   6.102 -    *phys_device = seg->extents[loop].disk;
   6.103 +    pseg[1].nr_sects = (sect_nr + nr_sects) - (sum + ext->size);
   6.104 +    pseg[0].nr_sects = sum + ext->size - sect_nr;
   6.105 +    pseg[1].buffer = buffer + (pseg->nr_sects << 9);
   6.106 +    pseg[1].sector_number = ext[1].offset;
   6.107 +    pseg[1].dev = xendev_to_physdev(ext[1].disk);
   6.108 +    if ( pseg[1].dev == 0 ) goto fail;
   6.109  
   6.110 -    return 0;
   6.111 +    /* We don't allow overrun onto a third physical extent. */
   6.112 +    if ( (sum + ext[0].size + ext[1].size) < 
   6.113 +         (pseg[1].sector_number + pseg[1].nr_sects) )
   6.114 +        goto fail;    
   6.115 +
   6.116 +    return 2; /* We overran onto a second physical extent. */
   6.117 +
   6.118 + fail:
   6.119 +    return -1;
   6.120  }
   6.121  
   6.122  /*
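
Editor's note: to illustrate the extent mapping performed by xen_segment_map_request() above, here is a small worked example with hypothetical numbers (all values in 512-byte sector units):

    /*
     * Worked example (hypothetical numbers). Segment with two extents:
     *     ext[0] = { .offset =   0, .size = 100 }
     *     ext[1] = { .offset = 200, .size = 200 }
     * Guest request: sect_nr = 90, nr_sects = 20  (crosses the extent boundary).
     *
     * The scan stops at ext[0] with sum == 0, and the request overruns it, so:
     *     pseg[0].sector_number = 90 + 0 - 0    =  90;  pseg[0].nr_sects = 100 - 90  = 10;
     *     pseg[1].sector_number = ext[1].offset = 200;  pseg[1].nr_sects = 110 - 100 = 10;
     *     pseg[1].buffer        = buffer + (10 << 9);
     * The function returns 2: one virtual segment became two physical segments.
     */
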
     7.1 --- a/xen/drivers/ide/ide-dma.c	Fri Mar 07 18:26:52 2003 +0000
     7.2 +++ b/xen/drivers/ide/ide-dma.c	Sun Mar 09 20:51:18 2003 +0000
     7.3 @@ -271,7 +271,7 @@ static int ide_build_sglist (ide_hwif_t 
     7.4  		/*
     7.5  		 * continue segment from before?
     7.6  		 */
     7.7 -		if (bh_phys(bh) == lastdataend) {
     7.8 +		if (virt_to_phys(bh->b_data) == lastdataend) {
     7.9  			sg[nents - 1].length += bh->b_size;
    7.10  			lastdataend += bh->b_size;
    7.11  			continue;
    7.12 @@ -285,25 +285,9 @@ static int ide_build_sglist (ide_hwif_t 
    7.13  
    7.14  		sge = &sg[nents];
    7.15  		memset(sge, 0, sizeof(*sge));
    7.16 -
    7.17 -		if (bh->b_page) {
    7.18 -			sge->page = bh->b_page;
    7.19 -			sge->offset = bh_offset(bh);
    7.20 -		} else {
    7.21 -
    7.22 -		   
    7.23 -#if 0 
    7.24 -		    /* below is wrong for xen since b_data is actually
    7.25 -		       a 'physical / virtual' thingy. Ask KAF. */
    7.26 -			if (((unsigned long) bh->b_data) < PAGE_SIZE)
    7.27 -				BUG();
    7.28 -#endif
    7.29 -
    7.30 -			sge->address = bh->b_data;
    7.31 -		}
    7.32 -
    7.33 +		sge->address = bh->b_data;
    7.34  		sge->length = bh->b_size;
    7.35 -		lastdataend = bh_phys(bh) + bh->b_size;
    7.36 +		lastdataend = virt_to_phys(bh->b_data) + bh->b_size;
    7.37  		nents++;
    7.38  	} while ((bh = bh->b_reqnext) != NULL);
    7.39  
     8.1 --- a/xen/include/hypervisor-ifs/block.h	Fri Mar 07 18:26:52 2003 +0000
     8.2 +++ b/xen/include/hypervisor-ifs/block.h	Sun Mar 09 20:51:18 2003 +0000
     8.3 @@ -34,37 +34,42 @@
     8.4   */
     8.5  
     8.6  /* the first four definitions match fs.h */
     8.7 -#define XEN_BLOCK_READ  0
     8.8 -#define XEN_BLOCK_WRITE 1
     8.9 -#define XEN_BLOCK_READA 2                                /* currently unused */
    8.10 -#define XEN_BLOCK_SPECIAL 4                              /* currently unused */
    8.11 -#define XEN_BLOCK_PROBE_BLK  8             /* get xhd config from hypervisor */
    8.12 -#define XEN_BLOCK_DEBUG      16                                     /* debug */
    8.13 -#define XEN_BLOCK_SEG_CREATE 32                      /* create segment (vhd) */
    8.14 -#define XEN_BLOCK_SEG_DELETE 64                      /* delete segment (vhd) */
    8.15 -#define XEN_BLOCK_PROBE_SEG  128           /* get vhd config from hypervisor */
    8.16 +#define XEN_BLOCK_READ         0
    8.17 +#define XEN_BLOCK_WRITE        1
    8.18 +#define XEN_BLOCK_READA        2
    8.19 +#define XEN_BLOCK_SPECIAL      4
    8.20 +#define XEN_BLOCK_PROBE_BLK    5  /* get xhd config from hypervisor */
    8.21 +#define XEN_BLOCK_DEBUG        6  /* debug */
    8.22 +#define XEN_BLOCK_SEG_CREATE   7  /* create segment (vhd) */
    8.23 +#define XEN_BLOCK_SEG_DELETE   8  /* delete segment (vhd) */
    8.24 +#define XEN_BLOCK_PROBE_SEG    9  /* get vhd config from hypervisor */
    8.25  
    8.26 -#define BLK_RING_SIZE        128
    8.27 -#define BLK_RING_MAX_ENTRIES (BLK_RING_SIZE - 2)
    8.28 +/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */
    8.29 +#define BLK_RING_SIZE        64
    8.30  #define BLK_RING_INC(_i)     (((_i)+1) & (BLK_RING_SIZE-1))
    8.31 -#define BLK_RING_ADD(_i,_j)  (((_i)+(_j)) & (BLK_RING_SIZE-1))
    8.32 +
    8.33 +/*
    8.34 + * Maximum scatter/gather segments per request.
    8.35 + * This is carefully chosen so that sizeof(blk_ring_t) <= PAGE_SIZE.
    8.36 + */
    8.37 +#define MAX_BLK_SEGS 12
    8.38  
    8.39  typedef struct blk_ring_req_entry 
    8.40  {
    8.41 -    void *          id;                /* for guest os use */
    8.42 -    int             operation;         /* from above */
    8.43 -    char *          buffer;
    8.44 -    unsigned long   block_number;      /* block number */
    8.45 -    unsigned short  block_size;        /* block size */
    8.46 -    unsigned short  device;
    8.47 -    unsigned long   sector_number;     /* real buffer location on disk */
    8.48 +    unsigned long  id;                     /* private guest os value       */
    8.49 +    unsigned long  sector_number;          /* start sector idx on disk     */
    8.50 +    unsigned short device;                 /* XENDEV_??? + idx             */
    8.51 +    unsigned char  operation;              /* XEN_BLOCK_???                */
    8.52 +    unsigned char  nr_segments;            /* number of segments           */
    8.53 +    /* Least 9 bits is 'nr_sects'. High 23 bits are the address.           */
    8.54 +    unsigned long  buffer_and_sects[MAX_BLK_SEGS];
    8.55  } blk_ring_req_entry_t;
    8.56  
    8.57  typedef struct blk_ring_resp_entry
    8.58  {
    8.59 -    void *          id;                                  /* for guest os use */
    8.60 -    int             operation;                                 /* from above */
    8.61 -    unsigned long   status;
    8.62 +    unsigned long   id;                   /* copied from request          */
    8.63 +    unsigned short  operation;            /* copied from request          */
    8.64 +    unsigned long   status;               /* currently boolean good/bad   */
    8.65  } blk_ring_resp_entry_t;
    8.66  
    8.67  typedef struct blk_ring_st 
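
Editor's note: the new request format above packs each data segment into a single word, with the low 9 bits carrying 'nr_sects' and the remaining bits the sector-aligned buffer address. A minimal sketch of how such an entry could be packed by a guest and unpacked by the backend, mirroring the '& ~0x1FF' / '& 0x1FF' masks used in xen_block.c (helper names are hypothetical):

    /* Pack one scatter/gather entry: buffer must be 512-byte aligned and
     * nr_sects must fit in 9 bits (fewer than 512 sectors). */
    static inline unsigned long mk_buffer_and_sects(unsigned long buffer,
                                                    unsigned short nr_sects)
    {
        return (buffer & ~0x1FFUL) | (nr_sects & 0x1FF);
    }

    /* Unpack an entry, as dispatch_rw_block_io() does on the Xen side. */
    static inline void split_buffer_and_sects(unsigned long word,
                                              unsigned long *buffer,
                                              unsigned short *nr_sects)
    {
        *buffer   = word & ~0x1FFUL;
        *nr_sects = word &  0x1FF;
    }
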
     9.1 --- a/xen/include/xeno/blkdev.h	Fri Mar 07 18:26:52 2003 +0000
     9.2 +++ b/xen/include/xeno/blkdev.h	Sun Mar 09 20:51:18 2003 +0000
     9.3 @@ -15,6 +15,15 @@
     9.4  #define BLOCK_SIZE_BITS 10
     9.5  #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
     9.6  
     9.7 +typedef struct {
     9.8 +    struct task_struct *domain;
     9.9 +    unsigned long       id;
    9.10 +    atomic_t            pendcnt;
    9.11 +    unsigned short      operation;
    9.12 +} pending_req_t;
    9.13 +
    9.14 +extern kdev_t xendev_to_physdev(unsigned short xendev);
    9.15 +
    9.16  extern void init_blkdev_info(struct task_struct *);
    9.17  extern void destroy_blkdev_info(struct task_struct *);
    9.18  
    9.19 @@ -61,27 +70,17 @@ enum bh_state_bits {
    9.20  };
    9.21  
    9.22  struct buffer_head {
    9.23 -        unsigned long b_blocknr;        /* block number */
    9.24 +        unsigned long b_rsector;        /* Real buffer location on disk */
    9.25          unsigned short b_size;          /* block size */
    9.26 -        unsigned short b_list;          /* List that this buffer appears */
    9.27          kdev_t b_dev;                   /* device (B_FREE = free) */
    9.28 -
    9.29 -        atomic_t b_count;               /* users using this block */
    9.30 -        kdev_t b_rdev;                  /* Real device */
    9.31          unsigned long b_state;          /* buffer state bitmap (see above) */
    9.32 -
    9.33          struct buffer_head *b_reqnext;  /* request queue */
    9.34 +        char *b_data;                  /* pointer to data block */
    9.35 +        void (*b_end_io)(struct buffer_head *bh, int uptodate);
    9.36 +        pending_req_t *pending_req;
    9.37 +};
    9.38  
    9.39 -        char * b_data;                  /* pointer to data block */
    9.40 -        struct pfn_info *b_page;            /* the page this bh is mapped to */
    9.41 -        void (*b_end_io)(struct buffer_head *bh, int uptodate);
    9.42 -
    9.43 -        unsigned long b_rsector;        /* Real buffer location on disk */
    9.44 -
    9.45 -        /* Both used by b_end_io function in xen_block.c */
    9.46 -        void *b_xen_domain;
    9.47 -        void *b_xen_id;
    9.48 -};
    9.49 +#define b_rdev b_dev /* In Xen, there's no device layering (eg. s/w RAID). */
    9.50  
    9.51  typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
    9.52  void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
    9.53 @@ -101,8 +100,6 @@ void init_buffer(struct buffer_head *, b
    9.54  
    9.55  extern void set_bh_page(struct buffer_head *bh, struct pfn_info *page, unsigned long offset);
    9.56  
    9.57 -#define touch_buffer(bh)        mark_page_accessed(bh->b_page)
    9.58 -
    9.59  #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
    9.60  
    9.61  static inline void __mark_buffer_clean(struct buffer_head *bh)
    9.62 @@ -261,8 +258,6 @@ struct request_queue
    9.63  #endif
    9.64  };
    9.65  
    9.66 -#define bh_phys(bh)            (page_to_phys((bh)->b_page) + bh_offset((bh)))
    9.67 -
    9.68  struct blk_dev_struct {
    9.69  	/*
    9.70  	 * queue_proc has to be atomic
    10.1 --- a/xen/include/xeno/sched.h	Fri Mar 07 18:26:52 2003 +0000
    10.2 +++ b/xen/include/xeno/sched.h	Sun Mar 09 20:51:18 2003 +0000
    10.3 @@ -78,7 +78,8 @@ struct task_struct {
    10.4  
    10.5      /* Block I/O */
    10.6      blk_ring_t *blk_ring_base;
    10.7 -    unsigned int blk_req_cons; /* request consumer */
    10.8 +    unsigned int blk_req_cons;  /* request consumer */
    10.9 +    unsigned int blk_resp_prod; /* (private version of) response producer */
   10.10      struct list_head blkdev_list;
   10.11      spinlock_t blk_ring_lock;
   10.12      segment_t *segment_list[XEN_MAX_SEGMENTS];                        /* vhd */
   10.13 @@ -89,6 +90,8 @@ struct task_struct {
   10.14      struct list_head run_list;
   10.15      
   10.16      struct mm_struct mm;
   10.17 +    /* We need this lock to check page types and frob reference counts. */
   10.18 +    spinlock_t page_lock;
   10.19  
   10.20      mm_segment_t addr_limit;        /* thread address space:
   10.21                                         0-0xBFFFFFFF for user-thead
    11.1 --- a/xen/include/xeno/segment.h	Fri Mar 07 18:26:52 2003 +0000
    11.2 +++ b/xen/include/xeno/segment.h	Sun Mar 09 20:51:18 2003 +0000
    11.3 @@ -3,18 +3,21 @@
    11.4  
    11.5  #include <hypervisor-ifs/block.h>
    11.6  
    11.7 +/* Describes a physical disk extent. */
    11.8 +typedef struct {
    11.9 +    unsigned short dev;
   11.10 +    unsigned short nr_sects;
   11.11 +    unsigned long  sector_number;
   11.12 +    unsigned long  buffer;
   11.13 +} phys_seg_t;
   11.14 +
   11.15  void xen_segment_initialize(void);
   11.16  void xen_refresh_segment_list (struct task_struct *p);
   11.17  int xen_segment_create(xv_disk_t *xvd);
   11.18  int xen_segment_map_request(
   11.19 -    int *phys_device,                         /* out */
   11.20 -    unsigned long *block_number,              /* out */
   11.21 -    unsigned long *sector_number,             /* out */
   11.22 -    struct task_struct *domain,
   11.23 -    int operation,
   11.24 -    int segment_number,
   11.25 -    int xen_block_number,
   11.26 -    int xen_sector_number);
   11.27 +    phys_seg_t *pseg, struct task_struct *p, int operation,
   11.28 +    unsigned short segment_number,
   11.29 +    unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects);
   11.30  
   11.31  #define XEN_MAX_SEGMENTS 100     /* total number of segments across all doms */
   11.32  
    12.1 --- a/xen/net/dev.c	Fri Mar 07 18:26:52 2003 +0000
    12.2 +++ b/xen/net/dev.c	Sun Mar 09 20:51:18 2003 +0000
    12.3 @@ -489,6 +489,7 @@ void deliver_packet(struct sk_buff *skb,
    12.4      unsigned long *g_pte; 
    12.5      struct pfn_info *g_pfn, *h_pfn;
    12.6      unsigned int i; 
    12.7 +    unsigned long flags;
    12.8  
    12.9      memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
   12.10      if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
   12.11 @@ -508,6 +509,8 @@ void deliver_packet(struct sk_buff *skb,
   12.12      if ( (skb->len + ETH_HLEN) < rx->size )
   12.13          rx->size = skb->len + ETH_HLEN;
   12.14              
   12.15 +    spin_lock_irqsave(&vif->domain->page_lock, flags);
   12.16 +
   12.17      g_pte = map_domain_mem(rx->addr);
   12.18  
   12.19      g_pfn = frame_table + (*g_pte >> PAGE_SHIFT);
   12.20 @@ -526,9 +529,11 @@ void deliver_packet(struct sk_buff *skb,
   12.21      *g_pte = (*g_pte & ~PAGE_MASK) 
   12.22          | (((h_pfn - frame_table) << PAGE_SHIFT) & PAGE_MASK);
   12.23      *g_pte |= _PAGE_PRESENT;
   12.24 -        
   12.25 +    
   12.26      unmap_domain_mem(g_pte);
   12.27  
   12.28 +    spin_unlock_irqrestore(&vif->domain->page_lock, flags);
   12.29 +    
   12.30      /* Our skbuff now points at the guest's old frame. */
   12.31      skb->pf = g_pfn;
   12.32  
   12.33 @@ -661,10 +666,12 @@ static void tx_skb_release(struct sk_buf
   12.34      net_vif_t *vif = sys_vif_list[skb->src_vif];
   12.35      unsigned int idx;
   12.36      tx_shadow_entry_t *tx;
   12.37 -    unsigned long cpu_mask;
   12.38 +    unsigned long cpu_mask, flags;
   12.39      
   12.40 +    spin_lock_irqsave(&vif->domain->page_lock, flags);
   12.41      for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
   12.42          put_page_tot(skb_shinfo(skb)->frags[i].page);
   12.43 +    spin_unlock_irqrestore(&vif->domain->page_lock, flags);
   12.44  
   12.45      if ( skb->skb_type == SKB_NODATA )
   12.46          kmem_cache_free(net_header_cachep, skb->head);
   12.47 @@ -713,8 +720,7 @@ static void tx_skb_release(struct sk_buf
   12.48      /* Send a transmit event if requested. */
   12.49      if ( send )
   12.50      {
   12.51 -        cpu_mask = mark_guest_event(
   12.52 -            sys_vif_list[skb->src_vif]->domain, _EVENT_NET_TX);
   12.53 +        cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX);
   12.54          guest_event_notify(cpu_mask);
   12.55      }
   12.56  }
   12.57 @@ -1870,10 +1876,12 @@ long do_net_update(void)
   12.58  
   12.59              pfn  = tx.addr >> PAGE_SHIFT;
   12.60              page = frame_table + pfn;
   12.61 +            spin_lock_irq(&current->page_lock);
   12.62              if ( (pfn >= max_page) || 
   12.63                   ((page->flags & PG_domain_mask) != current->domain) ) 
   12.64              {
   12.65                  DPRINTK("Bad page frame\n");
   12.66 +                spin_unlock_irq(&current->page_lock);
   12.67                  continue;
   12.68              }
   12.69              
   12.70 @@ -1882,7 +1890,7 @@ long do_net_update(void)
   12.71              protocol = __constant_htons(
   12.72                  init_tx_header(g_data, tx.size, the_dev));
   12.73              if ( protocol == 0 )
   12.74 -                goto unmap_and_continue;
   12.75 +                goto tx_unmap_and_continue;
   12.76  
   12.77              target = __net_get_target_vif(g_data, tx.size, current_vif->id);
   12.78  
   12.79 @@ -1890,7 +1898,7 @@ long do_net_update(void)
   12.80              {
   12.81                  /* Local delivery */
   12.82                  if ( (skb = dev_alloc_skb(tx.size)) == NULL ) 
   12.83 -                    goto unmap_and_continue;
   12.84 +                    goto tx_unmap_and_continue;
   12.85                  
   12.86                  skb->destructor = tx_skb_release;
   12.87  
   12.88 @@ -1915,15 +1923,16 @@ long do_net_update(void)
   12.89                  shadow_ring->tx_ring[i].header = 
   12.90                      kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
   12.91                  if ( shadow_ring->tx_ring[i].header == NULL ) 
   12.92 -                    goto unmap_and_continue;
   12.93 +                    goto tx_unmap_and_continue;
   12.94                  memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN);
   12.95                  shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN;
   12.96                  shadow_ring->tx_ring[i].status = RING_STATUS_OK;
   12.97                  get_page_tot(page);
   12.98              }
   12.99  
  12.100 -        unmap_and_continue:
  12.101 +        tx_unmap_and_continue:
  12.102              unmap_domain_mem(g_data);
  12.103 +            spin_unlock_irq(&current->page_lock);
  12.104          }
  12.105  
  12.106          if ( shadow_ring->tx_prod != i )
  12.107 @@ -1966,10 +1975,12 @@ long do_net_update(void)
  12.108              
  12.109              shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE;
  12.110              
  12.111 +            spin_lock_irq(&current->page_lock);
  12.112              if ( (pfn >= max_page) || 
  12.113                   (page->flags != (PGT_l1_page_table | current->domain)) ) 
  12.114              {
  12.115                  DPRINTK("Bad page frame containing ppte\n");
  12.116 +                spin_unlock_irq(&current->page_lock);
  12.117                  continue;
  12.118              }
  12.119              
  12.120 @@ -1978,8 +1989,7 @@ long do_net_update(void)
  12.121              if (!(*g_pte & _PAGE_PRESENT))
  12.122              {
   12.123                  DPRINTK("Invalid PTE passed down (not present)\n");
  12.124 -                unmap_domain_mem(g_pte);
  12.125 -                continue;
  12.126 +                goto rx_unmap_and_continue;
  12.127              }
  12.128              
  12.129              page = (*g_pte >> PAGE_SHIFT) + frame_table;
  12.130 @@ -1987,8 +1997,7 @@ long do_net_update(void)
  12.131              if (page->tot_count != 1) 
  12.132              {
  12.133                  DPRINTK("An rx page must be mapped exactly once\n");
  12.134 -                unmap_domain_mem(g_pte);
  12.135 -                continue;
  12.136 +                goto rx_unmap_and_continue;
  12.137              }
  12.138              
  12.139              /* The pte they passed was good, so take it away from them. */
  12.140 @@ -1997,7 +2006,9 @@ long do_net_update(void)
  12.141              page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf;
  12.142              rx->flush_count = tlb_flush_count[smp_processor_id()];
  12.143              
  12.144 +        rx_unmap_and_continue:
  12.145              unmap_domain_mem(g_pte);
  12.146 +            spin_unlock_irq(&current->page_lock);
  12.147          }
  12.148  
  12.149          if ( shadow_ring->rx_prod != i )
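
The net transmit and receive paths above now bracket all frame_table inspection with the owning domain's page_lock: the lock is taken before the pfn and page->flags checks and only dropped once the shadow-ring entry has been filled and the reference count adjusted, with the error paths funnelled through tx_unmap_and_continue / rx_unmap_and_continue so the unmap and unlock happen exactly once. The same lock now surrounds put_page_tot() in tx_skb_release(), which is what makes the check-then-pin sequence safe against a concurrent page free. A minimal sketch of the transmit-side shape, assuming the Xen build context above (validate_and_pin_tx_frame() is a hypothetical helper, not part of the changeset; the real code is inlined in the tx loop):

    /*
     * Sketch only: the check-then-pin pattern used in do_net_update() above.
     */
    static int validate_and_pin_tx_frame(unsigned long pfn)
    {
        int rc = -1;

        spin_lock_irq(&current->page_lock);

        if ( (pfn >= max_page) ||
             ((frame_table[pfn].flags & PG_domain_mask) != current->domain) )
            goto out;                        /* bad frame: drop the request */

        /* ... copy the protocol header and fill the shadow tx entry ... */

        get_page_tot(frame_table + pfn);     /* pinned until tx_skb_release() */
        rc = 0;

     out:
        spin_unlock_irq(&current->page_lock);
        return rc;
    }
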
    13.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c	Fri Mar 07 18:26:52 2003 +0000
    13.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c	Sun Mar 09 20:51:18 2003 +0000
    13.3 @@ -19,10 +19,11 @@ typedef unsigned char byte; /* from linu
    13.4  
    13.5  static blk_ring_t *blk_ring;
    13.6  static unsigned int resp_cons; /* Response consumer for comms ring. */
    13.7 +static unsigned int req_prod;  /* Private request producer.         */
    13.8  static xen_disk_info_t xlblk_disk_info;
    13.9  static int xlblk_control_msg_pending;
   13.10  
   13.11 -#define RING_FULL (BLK_RING_INC(blk_ring->req_prod) == resp_cons)
   13.12 +#define RING_FULL (BLK_RING_INC(req_prod) == resp_cons)
   13.13  
   13.14  /*
   13.15   * Request queues with outstanding work, but ring is currently full.
   13.16 @@ -33,6 +34,18 @@ static int xlblk_control_msg_pending;
   13.17  static request_queue_t *pending_queues[MAX_PENDING];
   13.18  static int nr_pending;
   13.19  
   13.20 +static kdev_t        sg_dev;
   13.21 +static int           sg_operation = -1;
   13.22 +static unsigned long sg_next_sect;
   13.23 +#define DISABLE_SCATTERGATHER() (sg_operation = -1)
   13.24 +
   13.25 +static inline void signal_requests_to_xen(void)
   13.26 +{
   13.27 +    DISABLE_SCATTERGATHER();
   13.28 +    blk_ring->req_prod = req_prod;
   13.29 +    HYPERVISOR_block_io_op();
   13.30 +}
   13.31 +
   13.32  /* Convert from a XenoLinux major device to the Xen-level 'physical' device */
   13.33  static inline unsigned short xldev_to_physdev(kdev_t xldev) 
   13.34  {
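
The shared ring's request producer is no longer advanced as each request is built. Instead the driver keeps a private req_prod, and signal_requests_to_xen() copies it into blk_ring->req_prod and issues the hypercall in one step, also tearing down the scatter-gather state so a published entry is never extended afterwards. RING_FULL and the queue-kick test in the response handler are both computed from the private producer. A small sketch of those ring-occupancy tests, assuming the definitions above (the helper names are hypothetical):

    /*
     * Sketch: ring occupancy as the driver sees it.  Entries between
     * blk_ring->req_prod and the private req_prod have been built (possibly
     * still gathering segments) but are not yet visible to Xen.
     */
    static inline int ring_slots_in_use(void)
    {
        return (req_prod - resp_cons) & (BLK_RING_SIZE - 1);
    }

    static inline int ring_is_full(void)
    {
        return BLK_RING_INC(req_prod) == resp_cons;   /* same test as RING_FULL */
    }
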
   13.35 @@ -253,31 +266,22 @@ int xenolinux_block_revalidate(kdev_t de
   13.36   * operation: XEN_BLOCK_{READ,WRITE,PROBE*,SEG*}
   13.37   * buffer: buffer to read/write into. this should be a
   13.38   *   virtual address in the guest os.
   13.39 - * block_number:  block to read
   13.40 - * block_size:  size of each block
   13.41 - * device:  xhd*, ksd*, xvd*, ...
   13.42   */
   13.43 -static int hypervisor_request(void *          id,
   13.44 +static int hypervisor_request(unsigned long   id,
   13.45                                int             operation,
   13.46                                char *          buffer,
   13.47 -                              unsigned long   block_number,
   13.48 -                              unsigned short  block_size,
   13.49 +                              unsigned long   sector_number,
   13.50 +                              unsigned short  nr_sectors,
   13.51                                kdev_t          device)
   13.52  {
   13.53 -    int position;
   13.54 -    void *buffer_ma; 
   13.55 +    unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); 
   13.56      kdev_t phys_device = (kdev_t) 0;
   13.57 -    unsigned long sector_number = 0;
   13.58      struct gendisk *gd;
   13.59 - 
   13.60 -    /*
   13.61 -     * Bail if there's no room in the request communication ring. This may be 
   13.62 -     * because we have a whole bunch of outstanding responses to process. No 
   13.63 -     * matter, as the response handler will kick the request queue.
   13.64 -     */
   13.65 -    if ( RING_FULL ) return 1;
   13.66 +    blk_ring_req_entry_t *req;
   13.67 +    struct buffer_head *bh;
   13.68  
   13.69 -    buffer_ma = (void *)phys_to_machine(virt_to_phys(buffer)); 
   13.70 +    if ( nr_sectors >= (1<<9) ) BUG();
   13.71 +    if ( (buffer_ma & ((1<<9)-1)) != 0 ) BUG();
   13.72  
   13.73      switch ( operation )
   13.74      {
   13.75 @@ -285,17 +289,42 @@ static int hypervisor_request(void *    
   13.76      case XEN_BLOCK_SEG_DELETE:
   13.77      case XEN_BLOCK_PROBE_BLK:
   13.78      case XEN_BLOCK_PROBE_SEG:
   13.79 +        if ( RING_FULL ) return 1;
   13.80  	phys_device = (kdev_t) 0;
   13.81  	sector_number = 0;
   13.82 +        DISABLE_SCATTERGATHER();
   13.83          break;
   13.84  
   13.85      case XEN_BLOCK_READ:
   13.86      case XEN_BLOCK_WRITE:
   13.87          phys_device = xldev_to_physdev(device);
   13.88 -	/* Compute real buffer location on disk */
   13.89 -	sector_number = block_number;
   13.90  	gd = xldev_to_gendisk(device); 
   13.91  	sector_number += gd->part[MINOR(device)].start_sect;
   13.92 +        if ( (sg_operation == operation) &&
   13.93 +             (sg_dev == phys_device) &&
   13.94 +             (sg_next_sect == sector_number) )
   13.95 +        {
   13.96 +            req = &blk_ring->ring[(req_prod-1)&(BLK_RING_SIZE-1)].req;
   13.97 +            bh = (struct buffer_head *)id;
   13.98 +            bh->b_reqnext = (struct buffer_head *)req->id;
   13.99 +            req->id = id;
  13.100 +            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
  13.101 +            if ( ++req->nr_segments < MAX_BLK_SEGS )
  13.102 +                sg_next_sect += nr_sectors;
  13.103 +            else
  13.104 +                DISABLE_SCATTERGATHER();
  13.105 +            return 0;
  13.106 +        }
  13.107 +        else if ( RING_FULL )
  13.108 +        {
  13.109 +            return 1;
  13.110 +        }
  13.111 +        else
  13.112 +        {
  13.113 +            sg_operation = operation;
  13.114 +            sg_dev       = phys_device;
  13.115 +            sg_next_sect = sector_number + nr_sectors;
  13.116 +        }
  13.117          break;
  13.118  
  13.119      default:
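
The read/write case above is where scatter-gather happens: if the new buffer head continues the previous one (same operation, same physical device, and its start sector is exactly sg_next_sect), it is folded into the most recently built ring entry as an extra (buffer, sector-count) segment and chained onto that entry's id via b_reqnext, rather than consuming a new ring slot. Once an entry holds MAX_BLK_SEGS segments, DISABLE_SCATTERGATHER() clears sg_operation so the next buffer head starts a fresh entry. A sketch of the merge test, assuming the static state declared earlier (the helper name is hypothetical):

    /*
     * Sketch: the conditions under which a new read/write is appended to the
     * previous ring entry instead of starting a new one.
     */
    static inline int sg_can_merge(int operation, kdev_t phys_device,
                                   unsigned long sector_number)
    {
        return (sg_operation == operation) &&    /* same READ/WRITE direction */
               (sg_dev       == phys_device) &&  /* same underlying device    */
               (sg_next_sect == sector_number);  /* sector-contiguous         */
    }
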
  13.120 @@ -303,16 +332,14 @@ static int hypervisor_request(void *    
  13.121      }
  13.122  
  13.123      /* Fill out a communications ring structure. */
  13.124 -    position = blk_ring->req_prod;
  13.125 -    blk_ring->ring[position].req.id            = id;
  13.126 -    blk_ring->ring[position].req.operation     = operation;
  13.127 -    blk_ring->ring[position].req.buffer        = buffer_ma;
  13.128 -    blk_ring->ring[position].req.block_number  = block_number;
  13.129 -    blk_ring->ring[position].req.block_size    = block_size;
  13.130 -    blk_ring->ring[position].req.device        = phys_device;
  13.131 -    blk_ring->ring[position].req.sector_number = sector_number;
  13.132 -
  13.133 -    blk_ring->req_prod = BLK_RING_INC(position);
  13.134 +    req = &blk_ring->ring[req_prod].req;
  13.135 +    req->id            = id;
  13.136 +    req->operation     = operation;
  13.137 +    req->sector_number = sector_number;
  13.138 +    req->device        = phys_device;
  13.139 +    req->nr_segments   = 1;
  13.140 +    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
  13.141 +    req_prod = BLK_RING_INC(req_prod);
  13.142  
  13.143      return 0;
  13.144  }
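
Each ring entry now describes up to MAX_BLK_SEGS segments instead of a single block: id, operation, device and sector_number are per request, while buffer_and_sects[] carries one word per segment. The two BUG() checks earlier (nr_sectors below 1<<9, buffer machine address 512-byte aligned) are what allow a segment's machine address and its sector count to share one word, with the count in the low nine bits. A sketch of that packing, assuming those invariants hold (the helper names are hypothetical, not part of the interface):

    /*
     * Sketch: the buffer_and_sects[] encoding implied by the alignment and
     * size checks in hypervisor_request().
     */
    static inline unsigned long pack_segment(unsigned long buffer_ma,
                                             unsigned short nr_sectors)
    {
        return buffer_ma | nr_sectors;             /* low 9 bits: sector count */
    }

    static inline unsigned long segment_buffer_ma(unsigned long seg)
    {
        return seg & ~((1UL << 9) - 1);            /* sector-aligned address */
    }

    static inline unsigned short segment_nr_sects(unsigned long seg)
    {
        return (unsigned short)(seg & ((1UL << 9) - 1));
    }
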
  13.145 @@ -325,7 +352,7 @@ static int hypervisor_request(void *    
  13.146  void do_xlblk_request(request_queue_t *rq)
  13.147  {
  13.148      struct request *req;
  13.149 -    struct buffer_head *bh;
  13.150 +    struct buffer_head *bh, *next_bh;
  13.151      int rw, nsect, full, queued = 0;
  13.152      
  13.153      DPRINTK("xlblk.c::do_xlblk_request for '%s'\n", DEVICE_NAME); 
  13.154 @@ -349,12 +376,17 @@ void do_xlblk_request(request_queue_t *r
  13.155          bh = req->bh;
  13.156          while ( bh != NULL )
  13.157  	{
  13.158 +            next_bh = bh->b_reqnext;
  13.159 +            bh->b_reqnext = NULL;
  13.160 +
  13.161              full = hypervisor_request(
  13.162 -                bh, (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, 
  13.163 -                bh->b_data, bh->b_rsector, bh->b_size, bh->b_dev);
  13.164 +                (unsigned long)bh,
  13.165 +                (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, 
  13.166 +                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
  13.167  
  13.168              if ( full )
  13.169              {
  13.170 +                bh->b_reqnext = next_bh;
  13.171                  pending_queues[nr_pending++] = rq;
  13.172                  if ( nr_pending >= MAX_PENDING ) BUG();
  13.173                  goto out;
  13.174 @@ -364,9 +396,7 @@ void do_xlblk_request(request_queue_t *r
  13.175  
  13.176              /* Dequeue the buffer head from the request. */
  13.177              nsect = bh->b_size >> 9;
  13.178 -            req->bh = bh->b_reqnext;
  13.179 -            bh->b_reqnext = NULL;
  13.180 -            bh = req->bh;
  13.181 +            bh = req->bh = next_bh;
  13.182              
  13.183              if ( bh != NULL )
  13.184              {
  13.185 @@ -389,7 +419,7 @@ void do_xlblk_request(request_queue_t *r
  13.186      }
  13.187  
  13.188   out:
  13.189 -    if ( queued != 0 ) HYPERVISOR_block_io_op();
  13.190 +    if ( queued != 0 ) signal_requests_to_xen();
  13.191  }
  13.192  
  13.193  
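
In the request loop above, each buffer head is detached from its request (b_reqnext saved in next_bh and cleared) before being handed to hypervisor_request(), because a coalesced request reuses b_reqnext to chain all of an entry's buffer heads behind req->id. If the ring turns out to be full, the link is restored and the whole queue is parked in pending_queues[] for the response interrupt to kick later; otherwise the request simply advances to next_bh, and a single signal_requests_to_xen() at the end covers everything queued. A compressed sketch of that step, assuming the surrounding do_xlblk_request() context (submit_bh_to_ring() is a hypothetical stand-in for the hypervisor_request() call):

    /*
     * Sketch: detach, submit, or park one buffer head.  Returns non-zero if
     * the ring was full and the queue has been parked.
     */
    static int queue_or_park(request_queue_t *rq, struct request *req, int rw)
    {
        struct buffer_head *bh = req->bh;
        struct buffer_head *next_bh = bh->b_reqnext;

        bh->b_reqnext = NULL;              /* may be reused to chain segments */

        if ( submit_bh_to_ring(bh, rw) )   /* non-zero => ring full */
        {
            bh->b_reqnext = next_bh;       /* undo the detach */
            pending_queues[nr_pending++] = rq;
            if ( nr_pending >= MAX_PENDING ) BUG();
            return 1;
        }

        req->bh = next_bh;                 /* advance to the next buffer head */
        return 0;
    }
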
  13.194 @@ -397,7 +427,7 @@ static void xlblk_response_int(int irq, 
  13.195  {
  13.196      int i; 
  13.197      unsigned long flags; 
  13.198 -    struct buffer_head *bh;
  13.199 +    struct buffer_head *bh, *next_bh;
  13.200      
  13.201      spin_lock_irqsave(&io_request_lock, flags);	    
  13.202  
  13.203 @@ -410,7 +440,14 @@ static void xlblk_response_int(int irq, 
  13.204  	{
  13.205          case XEN_BLOCK_READ:
  13.206          case XEN_BLOCK_WRITE:
  13.207 -	    if ( (bh = bret->id) != NULL ) bh->b_end_io(bh, 1);
  13.208 +            for ( bh = (struct buffer_head *)bret->id; 
  13.209 +                  bh != NULL; 
  13.210 +                  bh = next_bh )
  13.211 +            {
  13.212 +                next_bh = bh->b_reqnext;
  13.213 +                bh->b_reqnext = NULL;
  13.214 +                bh->b_end_io(bh, 1);
  13.215 +            }
  13.216  	    break;
  13.217  	    
  13.218          case XEN_BLOCK_SEG_CREATE:
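
On completion, a single response now stands for an entire chain of buffer heads: bret->id points at the first one and the rest hang off b_reqnext. The handler above saves the next pointer before calling b_end_io(), since completion may free or requeue the buffer head. The same walk, pulled out as a sketch (the helper name is hypothetical):

    /*
     * Sketch: complete every buffer head chained to one block-ring response.
     */
    static void end_bh_chain(struct buffer_head *bh, int uptodate)
    {
        struct buffer_head *next_bh;

        for ( ; bh != NULL; bh = next_bh )
        {
            next_bh = bh->b_reqnext;       /* read before b_end_io() frees bh */
            bh->b_reqnext = NULL;
            bh->b_end_io(bh, uptodate);
        }
    }
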
  13.219 @@ -429,7 +466,7 @@ static void xlblk_response_int(int irq, 
  13.220  
  13.221      /* We kick pending request queues if the ring is reasonably empty. */
  13.222      if ( (nr_pending != 0) && 
  13.223 -         (((blk_ring->req_prod - resp_cons) & (BLK_RING_SIZE - 1)) < 
  13.224 +         (((req_prod - resp_cons) & (BLK_RING_SIZE - 1)) < 
  13.225            (BLK_RING_SIZE >> 1)) )
  13.226      {
  13.227          /* Attempt to drain the queue, but bail if the ring becomes full. */
  13.228 @@ -445,13 +482,27 @@ static void xlblk_response_int(int irq, 
  13.229  
  13.230  
  13.231  /* Send a synchronous message to Xen. */
  13.232 -int xenolinux_control_msg(int operation, char *buffer)
  13.233 +int xenolinux_control_msg(int operation, char *buffer, int size)
  13.234  {
  13.235 -    xlblk_control_msg_pending = 1; barrier();
  13.236 -    if ( hypervisor_request(NULL, operation, buffer, 0, 0, 0) )
  13.237 +    unsigned long flags;
  13.238 +    char *aligned_buf;
  13.239 +
  13.240 +    /* We copy from an aligned buffer, as interface needs sector alignment. */
  13.241 +    aligned_buf = get_free_page(GFP_KERNEL);
  13.242 +    if ( aligned_buf == NULL ) BUG();
  13.243 +
  13.244 +    xlblk_control_msg_pending = 1;
  13.245 +    spin_lock_irqsave(&io_request_lock, flags);
  13.246 +    /* Note that size gets rounded up to a sector-sized boundary. */
  13.247 +    if ( hypervisor_request(0, operation, aligned_buf, 0, (size+511)/512, 0) )
  13.248          return -EAGAIN;
  13.249 -    HYPERVISOR_block_io_op();
  13.250 -    while ( xlblk_control_msg_pending ) barrier();    
  13.251 +    signal_requests_to_xen();
  13.252 +    spin_unlock_irqrestore(&io_request_lock, flags);
  13.253 +    while ( xlblk_control_msg_pending ) barrier();
  13.254 +
  13.255 +    memcpy(buffer, aligned_buf, size);
  13.256 +    free_page(aligned_buf);
  13.257 +    
  13.258      return 0;
  13.259  }
  13.260  
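
xenolinux_control_msg() is now sector-aware: the caller's buffer is bounced through a freshly allocated, page-aligned copy (the interface requires sector-aligned buffers), the size is rounded up to whole 512-byte sectors, and the request is pushed out with signal_requests_to_xen() under io_request_lock before spinning on xlblk_control_msg_pending, which the response interrupt clears. A sketch of a typical caller, mirroring the probe in xlblk_init() further down (the wrapper name is hypothetical):

    /*
     * Sketch: probing disk information through the synchronous control path.
     */
    static int probe_block_disks(xen_disk_info_t *xdi)
    {
        memset(xdi, 0, sizeof(*xdi));
        /* Size is rounded up to a whole number of sectors internally. */
        return xenolinux_control_msg(XEN_BLOCK_PROBE_BLK, (char *)xdi,
                                     sizeof(*xdi));
    }
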
  13.261 @@ -465,7 +516,7 @@ int __init xlblk_init(void)
  13.262  
  13.263      /* This mapping was created early at boot time. */
  13.264      blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE);
  13.265 -    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = 0;
  13.266 +    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
  13.267      
  13.268      error = request_irq(XLBLK_RESPONSE_IRQ, xlblk_response_int, 0, 
  13.269  			"xlblk-response", NULL);
  13.270 @@ -478,7 +529,8 @@ int __init xlblk_init(void)
  13.271      /* Probe for disk information. */
  13.272      memset(&xlblk_disk_info, 0, sizeof(xlblk_disk_info));
  13.273      error = xenolinux_control_msg(XEN_BLOCK_PROBE_BLK, 
  13.274 -                                  (char *)&xlblk_disk_info);
  13.275 +                                  (char *)&xlblk_disk_info,
  13.276 +                                  sizeof(xen_disk_info_t));
  13.277      if ( error )
  13.278      {
  13.279          printk(KERN_ALERT "Could not probe disks (%d)\n", error);
    14.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h	Fri Mar 07 18:26:52 2003 +0000
    14.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h	Sun Mar 09 20:51:18 2003 +0000
    14.3 @@ -46,7 +46,7 @@ typedef struct xl_disk {
    14.4  } xl_disk_t;
    14.5  
    14.6  /* Generic layer. */
     14.7 -extern int xenolinux_control_msg(int operation, char *buffer);
     14.8 +extern int xenolinux_control_msg(int operation, char *buffer, int size);
    14.9  extern int xenolinux_block_open(struct inode *inode, struct file *filep);
   14.10  extern int xenolinux_block_release(struct inode *inode, struct file *filep);
   14.11  extern int xenolinux_block_ioctl(struct inode *inode, struct file *filep,
    15.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c	Fri Mar 07 18:26:52 2003 +0000
    15.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c	Sun Mar 09 20:51:18 2003 +0000
    15.3 @@ -51,7 +51,7 @@ int __init xlseg_init(void)
    15.4  
    15.5      /* Probe for disk information. */
    15.6      memset(xdi, 0, sizeof(*xdi));
    15.7 -    xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi);
    15.8 +    xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi, sizeof(*xdi));
    15.9  
   15.10      DPRINTK("vhd block device probe:\n");
   15.11      for ( i = 0; i < xdi->count; i++ )
    16.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c	Fri Mar 07 18:26:52 2003 +0000
    16.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c	Sun Mar 09 20:51:18 2003 +0000
    16.3 @@ -210,7 +210,7 @@ static int proc_write_vhd(struct file *f
    16.4          xvd.extents[loop].size =  to_number(string);
    16.5      }
    16.6  
    16.7 -    xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd);
    16.8 +    xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd, sizeof(xvd));
    16.9  
   16.10      return count;
   16.11  }