extras/mini-os/blkfront.c @ 17058:c1003b9e0bb8 (ia64/xen-unstable)

stubdom: optimize block I/O completion polling by not polling all the
time, only when some requests have actually completed.

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
Author: Keir Fraser <keir.fraser@citrix.com>
Date:   Thu Feb 14 09:23:14 2008 +0000
/* Minimal block driver for Mini-OS.
 * Copyright (c) 2007-2008 Samuel Thibault.
 * Based on netfront.c.
 */

#include <os.h>
#include <xenbus.h>
#include <events.h>
#include <errno.h>
#include <xen/io/blkif.h>
#include <gnttab.h>
#include <xmalloc.h>
#include <time.h>
#include <blkfront.h>
#include <lib.h>
#include <fcntl.h>

#ifndef HAVE_LIBC
#define strtoul simple_strtoul
#endif

/* Note: we generally don't need to disable IRQs since we hardly do anything
 * in the interrupt handler. */

/* Note: we really assume non-preemptive threads. */

DECLARE_WAIT_QUEUE_HEAD(blkfront_queue);

#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
#define GRANT_INVALID_REF 0

struct blk_buffer {
    void* page;
    grant_ref_t gref;
};

struct blkfront_dev {
    domid_t dom;

    struct blkif_front_ring ring;
    grant_ref_t ring_ref;
    evtchn_port_t evtchn, local_port;
    blkif_vdev_t handle;

    char *nodename;
    char *backend;
    unsigned sector_size;
    unsigned sectors;
    int mode;
    int barrier;
    int flush;

#ifdef HAVE_LIBC
    int fd;
#endif
};
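
/* Mask a ring index down to a slot number (currently unused in this driver). */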
static inline int xenblk_rxidx(RING_IDX idx)
{
    return idx & (BLK_RING_SIZE - 1);
}
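
/* Event-channel upcall: when built with libc, flag the device's fd as having
 * data ready, then wake any thread sleeping on blkfront_queue. */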
void blkfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
{
#ifdef HAVE_LIBC
    struct blkfront_dev *dev = data;
    int fd = dev->fd;

    files[fd].read = 1;
#endif
    wake_up(&blkfront_queue);
}
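
/* Connect to the block backend designated by nodename (default
 * "device/vbd/768"): allocate the shared ring and event channel, publish them
 * in xenstore, wait for the backend to reach the connected state, and report
 * the disk geometry and access mode back to the caller. */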
struct blkfront_dev *init_blkfront(char *nodename, uint64_t *sectors, unsigned *sector_size, int *mode)
{
    xenbus_transaction_t xbt;
    char* err;
    char* message=NULL;
    struct blkif_sring *s;
    int retry=0;
    char* msg;
    char* c;

    struct blkfront_dev *dev;

    if (!nodename)
        nodename = "device/vbd/768";

    char path[strlen(nodename) + 1 + 10 + 1];
95 printk("******************* BLKFRONT for %s **********\n\n\n", nodename);
97 dev = malloc(sizeof(*dev));
98 dev->nodename = strdup(nodename);
100 evtchn_alloc_unbound_t op;
101 op.dom = DOMID_SELF;
102 snprintf(path, sizeof(path), "%s/backend-id", nodename);
103 dev->dom = op.remote_dom = xenbus_read_integer(path);
104 HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
105 clear_evtchn(op.port); /* Without, handler gets invoked now! */
106 dev->local_port = bind_evtchn(op.port, blkfront_handler, dev);
107 dev->evtchn=op.port;
109 s = (struct blkif_sring*) alloc_page();
110 memset(s,0,PAGE_SIZE);
113 SHARED_RING_INIT(s);
114 FRONT_RING_INIT(&dev->ring, s, PAGE_SIZE);
116 dev->ring_ref = gnttab_grant_access(dev->dom,virt_to_mfn(s),0);

    // FIXME: proper frees on failures
again:
    err = xenbus_transaction_start(&xbt);
    if (err) {
        printk("starting transaction\n");
    }

    err = xenbus_printf(xbt, nodename, "ring-ref", "%u",
                dev->ring_ref);
    if (err) {
        message = "writing ring-ref";
        goto abort_transaction;
    }
    err = xenbus_printf(xbt, nodename,
                "event-channel", "%u", dev->evtchn);
    if (err) {
        message = "writing event-channel";
        goto abort_transaction;
    }

    err = xenbus_printf(xbt, nodename, "state", "%u",
            4); /* connected */

    err = xenbus_transaction_end(xbt, 0, &retry);
    if (retry) {
        printk("completing transaction\n");
        goto again;
    }

    goto done;

abort_transaction:
    xenbus_transaction_end(xbt, 1, &retry);
    return NULL;

done:

    snprintf(path, sizeof(path), "%s/backend", nodename);
    msg = xenbus_read(XBT_NIL, path, &dev->backend);
    if (msg) {
        printk("Error %s when reading the backend path %s\n", msg, path);
        return NULL;
    }

    printk("backend at %s\n", dev->backend);

    dev->handle = strtoul(strrchr(nodename, '/') + 1, NULL, 0);

    {
        char path[strlen(dev->backend) + 1 + 19 + 1];
        snprintf(path, sizeof(path), "%s/mode", dev->backend);
        msg = xenbus_read(XBT_NIL, path, &c);
        if (msg) {
            printk("Error %s when reading the mode\n", msg);
            return NULL;
        }
        if (*c == 'w')
            *mode = dev->mode = O_RDWR;
        else
            *mode = dev->mode = O_RDONLY;
        free(c);

        snprintf(path, sizeof(path), "%s/state", dev->backend);

        xenbus_watch_path(XBT_NIL, path);

        /* Wait for the backend to reach state 4 (connected). */
        xenbus_wait_for_value(path, "4");

        xenbus_unwatch_path(XBT_NIL, path);

        snprintf(path, sizeof(path), "%s/sectors", dev->backend);
        // FIXME: read_integer returns an int, so the disk size is limited to 1TB for now
        *sectors = dev->sectors = xenbus_read_integer(path);

        snprintf(path, sizeof(path), "%s/sector-size", dev->backend);
        *sector_size = dev->sector_size = xenbus_read_integer(path);

        snprintf(path, sizeof(path), "%s/feature-barrier", dev->backend);
        dev->barrier = xenbus_read_integer(path);

        snprintf(path, sizeof(path), "%s/feature-flush-cache", dev->backend);
        dev->flush = xenbus_read_integer(path);
    }

    printk("%u sectors of %u bytes\n", dev->sectors, dev->sector_size);
    printk("**************************\n");

    return dev;
}
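
/* Disconnect from the backend: drain outstanding requests, then walk the
 * frontend state through closing (5) and closed (6), waiting for the backend
 * to follow, before releasing the event channel and memory. */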
void shutdown_blkfront(struct blkfront_dev *dev)
{
    char* err;
    char *nodename = dev->nodename;

    char path[strlen(dev->backend) + 1 + 5 + 1];

    blkfront_sync(dev);

    printk("close blk: backend at %s\n", dev->backend);

    snprintf(path, sizeof(path), "%s/state", dev->backend);
    err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 5); /* closing */
    xenbus_wait_for_value(path, "5");

    err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 6); /* closed */
    xenbus_wait_for_value(path, "6");

    unbind_evtchn(dev->local_port);

    free(nodename);
    free(dev->backend);
    free(dev);
}
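
/* Block the calling thread until the request ring has a free slot, polling
 * for completions while waiting. */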
static void blkfront_wait_slot(struct blkfront_dev *dev)
{
    /* Wait for a slot */
    if (RING_FULL(&dev->ring)) {
        unsigned long flags;
        DEFINE_WAIT(w);
        local_irq_save(flags);
        while (1) {
            blkfront_aio_poll(dev);
            if (!RING_FULL(&dev->ring))
                break;
            /* Really no slot, go to sleep. */
            add_waiter(w, blkfront_queue);
            local_irq_restore(flags);
            schedule();
            local_irq_save(flags);
        }
        remove_waiter(w);
        local_irq_restore(flags);
    }
}

/* Issue an asynchronous read or write described by aiocbp; completion is
 * reported through aiocbp->aio_cb once blkfront_aio_poll() sees the
 * response. */
void blkfront_aio(struct blkfront_aiocb *aiocbp, int write)
{
    struct blkfront_dev *dev = aiocbp->aio_dev;
    struct blkif_request *req;
    RING_IDX i;
    int notify;
    int n, j;
    uintptr_t start, end;

    // Can't do I/O at a non-sector-aligned location
    ASSERT(!(aiocbp->aio_offset & (dev->sector_size - 1)));
    // Can't do I/O in non-sector-sized amounts
    ASSERT(!(aiocbp->aio_nbytes & (dev->sector_size - 1)));
    // Can't do I/O on a non-sector-aligned buffer
    ASSERT(!((uintptr_t) aiocbp->aio_buf & (dev->sector_size - 1)));

    start = (uintptr_t)aiocbp->aio_buf & PAGE_MASK;
    end = ((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes + PAGE_SIZE - 1) & PAGE_MASK;
    aiocbp->n = n = (end - start) / PAGE_SIZE;

    /* qemu's IDE max multsect is 16 (8KB) and SCSI max DMA was set to 32KB,
     * so a request of more than 44KB (BLKIF_MAX_SEGMENTS_PER_REQUEST pages)
     * can't happen */
    ASSERT(n <= BLKIF_MAX_SEGMENTS_PER_REQUEST);

    blkfront_wait_slot(dev);
    i = dev->ring.req_prod_pvt;
    req = RING_GET_REQUEST(&dev->ring, i);

    req->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
    req->nr_segments = n;
    req->handle = dev->handle;
    req->id = (uintptr_t) aiocbp;
    req->sector_number = aiocbp->aio_offset / dev->sector_size;

    for (j = 0; j < n; j++) {
        uintptr_t data = start + j * PAGE_SIZE;
        if (!write) {
            /* Trigger CoW if needed */
            *(char*)data = 0;
            barrier();
        }
        aiocbp->gref[j] = req->seg[j].gref =
            gnttab_grant_access(dev->dom, virtual_to_mfn(data), write);
        req->seg[j].first_sect = 0;
        req->seg[j].last_sect = PAGE_SIZE / dev->sector_size - 1;
    }
    req->seg[0].first_sect = ((uintptr_t)aiocbp->aio_buf & ~PAGE_MASK) / dev->sector_size;
    req->seg[n-1].last_sect = (((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes - 1) & ~PAGE_MASK) / dev->sector_size;

    dev->ring.req_prod_pvt = i + 1;

    wmb();
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);

    if (notify) notify_remote_via_evtchn(dev->evtchn);
}

void blkfront_aio_write(struct blkfront_aiocb *aiocbp)
{
    blkfront_aio(aiocbp, 1);
}

void blkfront_aio_read(struct blkfront_aiocb *aiocbp)
{
    blkfront_aio(aiocbp, 0);
}
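
/* Consume any responses pending on the ring: end grant access for each
 * completed transfer, call the request's completion callback, and return the
 * number of responses handled. */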
int blkfront_aio_poll(struct blkfront_dev *dev)
{
    RING_IDX rp, cons;
    struct blkif_response *rsp;

moretodo:
#ifdef HAVE_LIBC
    files[dev->fd].read = 0;
    mb(); /* Make sure to let the handler set 'read' to 1 before we start looking at the ring */
#endif

    rp = dev->ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */
    cons = dev->ring.rsp_cons;

    int nr_consumed = 0;
    while (cons != rp)
    {
        rsp = RING_GET_RESPONSE(&dev->ring, cons);

        if (rsp->status != BLKIF_RSP_OKAY)
            printk("block error %d for op %d\n", rsp->status, rsp->operation);

        switch (rsp->operation) {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
        {
            struct blkfront_aiocb *aiocbp = (void*) (uintptr_t) rsp->id;
            int j;

            for (j = 0; j < aiocbp->n; j++)
                gnttab_end_access(aiocbp->gref[j]);

            /* Note: the callback frees aiocbp itself */
            aiocbp->aio_cb(aiocbp, rsp->status ? -EIO : 0);
            break;
        }
        case BLKIF_OP_WRITE_BARRIER:
        case BLKIF_OP_FLUSH_DISKCACHE:
            break;
        default:
            printk("unrecognized block operation %d response\n", rsp->operation);
            break;
        }

        nr_consumed++;
        ++cons;
    }
    dev->ring.rsp_cons = cons;

    int more;
    RING_FINAL_CHECK_FOR_RESPONSES(&dev->ring, more);
    if (more) goto moretodo;

    return nr_consumed;
}
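
/* Queue a zero-segment control request (write barrier or cache flush) on the
 * ring and kick the backend. */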
static void blkfront_push_operation(struct blkfront_dev *dev, uint8_t op)
{
    int i;
    struct blkif_request *req;
    int notify;

    blkfront_wait_slot(dev);
    i = dev->ring.req_prod_pvt;
    req = RING_GET_REQUEST(&dev->ring, i);
    req->operation = op;
    req->nr_segments = 0;
    req->handle = dev->handle;
    /* Not used */
    req->id = 0;
    /* Not needed anyway, but the backend will check it */
    req->sector_number = 0;
    dev->ring.req_prod_pvt = i + 1;
    wmb();
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
    if (notify) notify_remote_via_evtchn(dev->evtchn);
}
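
/* Flush outstanding I/O: issue a write barrier or cache flush when the
 * backend advertises one (and the device is writable), then poll until every
 * request on the ring has completed. */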
void blkfront_sync(struct blkfront_dev *dev)
{
    unsigned long flags;

    if (dev->mode == O_RDWR) {
        if (dev->barrier == 1)
            blkfront_push_operation(dev, BLKIF_OP_WRITE_BARRIER);

        if (dev->flush == 1)
            blkfront_push_operation(dev, BLKIF_OP_FLUSH_DISKCACHE);
    }

    /* Note: This won't finish if another thread enqueues requests. */
    local_irq_save(flags);
    DEFINE_WAIT(w);
    while (1) {
        blkfront_aio_poll(dev);
        if (RING_FREE_REQUESTS(&dev->ring) == RING_SIZE(&dev->ring))
            break;

        add_waiter(w, blkfront_queue);
        local_irq_restore(flags);
        schedule();
        local_irq_save(flags);
    }
    remove_waiter(w);
    local_irq_restore(flags);
}
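
/* With libc support, expose the device as a Mini-OS file descriptor of type
 * FTYPE_BLK. */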
#ifdef HAVE_LIBC
int blkfront_open(struct blkfront_dev *dev)
{
    dev->fd = alloc_fd(FTYPE_BLK);
    printk("blk_open(%s) -> %d\n", dev->nodename, dev->fd);
    files[dev->fd].blk.dev = dev;
    return dev->fd;
}
#endif