ia64/linux-2.6.18-xen.hg

view drivers/block/loop.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation, and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, in which case it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, creating temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
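
A minimal standalone sketch of the retry policy described in the
changeset message (illustrative only, not the driver's code;
current_pages, target_pages and increase_reservation() are stand-ins
for the driver's state and its request to Xen for more pages):

/*
 * Sketch: keep a partial allocation and retry later instead of
 * recording a "hard limit". In the driver a timer re-runs this work.
 */
#include <stdio.h>

static long current_pages = 1000;
static long target_pages  = 2000;

/* Stand-in: returns how many of the 'nr' requested pages were granted. */
static long increase_reservation(long nr)
{
        return nr / 2;  /* simulate host memory pressure: partial success */
}

int main(void)
{
        while (current_pages < target_pages) {
                long want = target_pages - current_pages;
                long got  = increase_reservation(want);

                current_pages += got;   /* keep whatever we were given */
                if (got < want) {
                        /*
                         * Old behaviour: give up and record a hard limit.
                         * New behaviour: back off and retry, just as when
                         * decreasing the reservation.
                         */
                        printf("short by %ld pages, will retry\n", want - got);
                        break;
                }
        }
        printf("reservation now %ld of %ld pages\n",
               current_pages, target_pages);
        return 0;
}
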
line source
/*
 * linux/drivers/block/loop.c
 *
 * Written by Theodore Ts'o, 3/29/93
 *
 * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
 * permitted under the GNU General Public License.
 *
 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
 *
 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
 *
 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
 *
 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
 *
 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
 *
 * Loadable modules and other fixes by AK, 1998
 *
 * Make real block number available to downstream transfer functions, enables
 * CBC (and relatives) mode encryption requiring unique IVs per data block.
 * Reed H. Petty, rhp@draper.net
 *
 * Maximum number of loop devices now dynamic via max_loop module parameter.
 * Russell Kroll <rkroll@exploits.org> 19990701
 *
 * Maximum number of loop devices when compiled-in now selectable by passing
 * max_loop=<1-255> to the kernel on boot.
 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
 *
 * Completely rewrite request handling to be make_request_fn style and
 * non blocking, pushing work to a helper thread. Lots of fixes from
 * Al Viro too.
 * Jens Axboe <axboe@suse.de>, Nov 2000
 *
 * Support up to 256 loop devices
 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
 *
 * Support for falling back on the write file operation when the address space
 * operations prepare_write and/or commit_write are not available on the
 * backing filesystem.
 * Anton Altaparmakov, 16 Feb 2005
 *
 * Still To Fix:
 * - Advisory locking is ignored here.
 * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
 *
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/loop.h>
#include <linux/suspend.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/gfp.h>

#include <asm/uaccess.h>
static int max_loop = 8;
static struct loop_device *loop_dev;
static struct gendisk **disks;
/*
 * Transfer functions
 */
static int transfer_none(struct loop_device *lo, int cmd,
                         struct page *raw_page, unsigned raw_off,
                         struct page *loop_page, unsigned loop_off,
                         int size, sector_t real_block)
{
        char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
        char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;

        if (cmd == READ)
                memcpy(loop_buf, raw_buf, size);
        else
                memcpy(raw_buf, loop_buf, size);

        kunmap_atomic(raw_buf, KM_USER0);
        kunmap_atomic(loop_buf, KM_USER1);
        cond_resched();
        return 0;
}

static int transfer_xor(struct loop_device *lo, int cmd,
                        struct page *raw_page, unsigned raw_off,
                        struct page *loop_page, unsigned loop_off,
                        int size, sector_t real_block)
{
        char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
        char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
        char *in, *out, *key;
        int i, keysize;

        if (cmd == READ) {
                in = raw_buf;
                out = loop_buf;
        } else {
                in = loop_buf;
                out = raw_buf;
        }

        key = lo->lo_encrypt_key;
        keysize = lo->lo_encrypt_key_size;
        for (i = 0; i < size; i++)
                *out++ = *in++ ^ key[(i & 511) % keysize];

        kunmap_atomic(raw_buf, KM_USER0);
        kunmap_atomic(loop_buf, KM_USER1);
        cond_resched();
        return 0;
}

static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
{
        if (unlikely(info->lo_encrypt_key_size <= 0))
                return -EINVAL;
        return 0;
}

static struct loop_func_table none_funcs = {
        .number = LO_CRYPT_NONE,
        .transfer = transfer_none,
};

static struct loop_func_table xor_funcs = {
        .number = LO_CRYPT_XOR,
        .transfer = transfer_xor,
        .init = xor_init
};

/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
        &none_funcs,
        &xor_funcs
};
static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
        loff_t size, offset, loopsize;

        /* Compute loopsize in bytes */
        size = i_size_read(file->f_mapping->host);
        offset = lo->lo_offset;
        loopsize = size - offset;
        if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
                loopsize = lo->lo_sizelimit;

        /*
         * Unfortunately, if we want to do I/O on the device,
         * the number of 512-byte sectors has to fit into a sector_t.
         */
        return loopsize >> 9;
}

static int
figure_loop_size(struct loop_device *lo)
{
        loff_t size = get_loop_size(lo, lo->lo_backing_file);
        sector_t x = (sector_t)size;

        if (unlikely((loff_t)x != size))
                return -EFBIG;

        set_capacity(disks[lo->lo_number], x);
        return 0;
}
static inline int
lo_do_transfer(struct loop_device *lo, int cmd,
               struct page *rpage, unsigned roffs,
               struct page *lpage, unsigned loffs,
               int size, sector_t rblock)
{
        if (unlikely(!lo->transfer))
                return 0;

        return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
}
/**
 * do_lo_send_aops - helper for writing data to a loop device
 *
 * This is the fast version for backing filesystems which implement the address
 * space operations prepare_write and commit_write.
 */
static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
                int bsize, loff_t pos, struct page *page)
{
        struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        pgoff_t index;
        unsigned offset, bv_offs;
        int len, ret;

        mutex_lock(&mapping->host->i_mutex);
        index = pos >> PAGE_CACHE_SHIFT;
        offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
        bv_offs = bvec->bv_offset;
        len = bvec->bv_len;
        while (len > 0) {
                sector_t IV;
                unsigned size;
                int transfer_result;

                IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
                size = PAGE_CACHE_SIZE - offset;
                if (size > len)
                        size = len;
                page = grab_cache_page(mapping, index);
                if (unlikely(!page))
                        goto fail;
                ret = aops->prepare_write(file, page, offset,
                                          offset + size);
                if (unlikely(ret)) {
                        if (ret == AOP_TRUNCATED_PAGE) {
                                page_cache_release(page);
                                continue;
                        }
                        goto unlock;
                }
                transfer_result = lo_do_transfer(lo, WRITE, page, offset,
                                bvec->bv_page, bv_offs, size, IV);
                if (unlikely(transfer_result)) {
                        char *kaddr;

                        /*
                         * The transfer failed, but we still write the data to
                         * keep prepare/commit calls balanced.
                         */
                        printk(KERN_ERR "loop: transfer error block %llu\n",
                               (unsigned long long)index);
                        kaddr = kmap_atomic(page, KM_USER0);
                        memset(kaddr + offset, 0, size);
                        kunmap_atomic(kaddr, KM_USER0);
                }
                flush_dcache_page(page);
                ret = aops->commit_write(file, page, offset,
                                         offset + size);
                if (unlikely(ret)) {
                        if (ret == AOP_TRUNCATED_PAGE) {
                                page_cache_release(page);
                                continue;
                        }
                        goto unlock;
                }
                if (unlikely(transfer_result))
                        goto unlock;
                bv_offs += size;
                len -= size;
                offset = 0;
                index++;
                pos += size;
                unlock_page(page);
                page_cache_release(page);
        }
        ret = 0;
out:
        mutex_unlock(&mapping->host->i_mutex);
        return ret;
unlock:
        unlock_page(page);
        page_cache_release(page);
fail:
        ret = -1;
        goto out;
}
/**
 * __do_lo_send_write - helper for writing data to a loop device
 *
 * This helper just factors out common code between do_lo_send_direct_write()
 * and do_lo_send_write().
 */
static int __do_lo_send_write(struct file *file,
                u8 __user *buf, const int len, loff_t pos)
{
        ssize_t bw;
        mm_segment_t old_fs = get_fs();

        set_fs(get_ds());
        bw = file->f_op->write(file, buf, len, &pos);
        set_fs(old_fs);
        if (likely(bw == len))
                return 0;
        printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
                        (unsigned long long)pos, len);
        if (bw >= 0)
                bw = -EIO;
        return bw;
}
/**
 * do_lo_send_direct_write - helper for writing data to a loop device
 *
 * This is the fast, non-transforming version for backing filesystems which do
 * not implement the address space operations prepare_write and commit_write.
 * It uses the write file operation which should be present on all writeable
 * filesystems.
 */
static int do_lo_send_direct_write(struct loop_device *lo,
                struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
{
        ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
                        (u8 __user *)kmap(bvec->bv_page) + bvec->bv_offset,
                        bvec->bv_len, pos);
        kunmap(bvec->bv_page);
        cond_resched();
        return bw;
}
/**
 * do_lo_send_write - helper for writing data to a loop device
 *
 * This is the slow, transforming version for filesystems which do not
 * implement the address space operations prepare_write and commit_write. It
 * uses the write file operation which should be present on all writeable
 * filesystems.
 *
 * Using fops->write is slower than using aops->{prepare,commit}_write in the
 * transforming case because we need to double buffer the data as we cannot do
 * the transformations in place as we do not have direct access to the
 * destination pages of the backing file.
 */
static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
                int bsize, loff_t pos, struct page *page)
{
        int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
                        bvec->bv_offset, bvec->bv_len, pos >> 9);
        if (likely(!ret))
                return __do_lo_send_write(lo->lo_backing_file,
                                (u8 __user *)page_address(page), bvec->bv_len,
                                pos);
        printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
                        "length %i.\n", (unsigned long long)pos, bvec->bv_len);
        if (ret > 0)
                ret = -EIO;
        return ret;
}
static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
                loff_t pos)
{
        int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
                        struct page *page);
        struct bio_vec *bvec;
        struct page *page = NULL;
        int i, ret = 0;

        do_lo_send = do_lo_send_aops;
        if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
                do_lo_send = do_lo_send_direct_write;
                if (lo->transfer != transfer_none) {
                        page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                        if (unlikely(!page))
                                goto fail;
                        kmap(page);
                        do_lo_send = do_lo_send_write;
                }
        }
        bio_for_each_segment(bvec, bio, i) {
                ret = do_lo_send(lo, bvec, bsize, pos, page);
                if (ret < 0)
                        break;
                pos += bvec->bv_len;
        }
        if (page) {
                kunmap(page);
                __free_page(page);
        }
out:
        return ret;
fail:
        printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
        ret = -ENOMEM;
        goto out;
}
struct lo_read_data {
        struct loop_device *lo;
        struct page *page;
        unsigned offset;
        int bsize;
};

static int
lo_read_actor(read_descriptor_t *desc, struct page *page,
              unsigned long offset, unsigned long size)
{
        unsigned long count = desc->count;
        struct lo_read_data *p = desc->arg.data;
        struct loop_device *lo = p->lo;
        sector_t IV;

        IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);

        if (size > count)
                size = count;

        if (lo_do_transfer(lo, READ, page, offset, p->page, p->offset, size, IV)) {
                size = 0;
                printk(KERN_ERR "loop: transfer error block %ld\n",
                       page->index);
                desc->error = -EINVAL;
        }

        flush_dcache_page(p->page);

        desc->count = count - size;
        desc->written += size;
        p->offset += size;
        return size;
}

static int
do_lo_receive(struct loop_device *lo,
              struct bio_vec *bvec, int bsize, loff_t pos)
{
        struct lo_read_data cookie;
        struct file *file;
        int retval;

        cookie.lo = lo;
        cookie.page = bvec->bv_page;
        cookie.offset = bvec->bv_offset;
        cookie.bsize = bsize;
        file = lo->lo_backing_file;
        retval = file->f_op->sendfile(file, &pos, bvec->bv_len,
                        lo_read_actor, &cookie);
        return (retval < 0)? retval: 0;
}

static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
{
        struct bio_vec *bvec;
        int i, ret = 0;

        bio_for_each_segment(bvec, bio, i) {
                ret = do_lo_receive(lo, bvec, bsize, pos);
                if (ret < 0)
                        break;
                pos += bvec->bv_len;
        }
        return ret;
}
static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
{
        loff_t pos;
        int ret;

        pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
        if (bio_rw(bio) == WRITE)
                ret = lo_send(lo, bio, lo->lo_blocksize, pos);
        else
                ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
        return ret;
}
/*
 * Add bio to back of pending list
 */
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
        if (lo->lo_biotail) {
                lo->lo_biotail->bi_next = bio;
                lo->lo_biotail = bio;
        } else
                lo->lo_bio = lo->lo_biotail = bio;
}

/*
 * Grab first pending buffer
 */
static struct bio *loop_get_bio(struct loop_device *lo)
{
        struct bio *bio;

        if ((bio = lo->lo_bio)) {
                if (bio == lo->lo_biotail)
                        lo->lo_biotail = NULL;
                lo->lo_bio = bio->bi_next;
                bio->bi_next = NULL;
        }

        return bio;
}
static int loop_make_request(request_queue_t *q, struct bio *old_bio)
{
        struct loop_device *lo = q->queuedata;
        int rw = bio_rw(old_bio);

        if (rw == READA)
                rw = READ;

        BUG_ON(!lo || (rw != READ && rw != WRITE));

        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_state != Lo_bound)
                goto out;
        if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
                goto out;
        lo->lo_pending++;
        loop_add_bio(lo, old_bio);
        spin_unlock_irq(&lo->lo_lock);
        complete(&lo->lo_bh_done);
        return 0;

out:
        if (lo->lo_pending == 0)
                complete(&lo->lo_bh_done);
        spin_unlock_irq(&lo->lo_lock);
        bio_io_error(old_bio, old_bio->bi_size);
        return 0;
}
/*
 * kick off io on the underlying address space
 */
static void loop_unplug(request_queue_t *q)
{
        struct loop_device *lo = q->queuedata;

        clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
        blk_run_address_space(lo->lo_backing_file->f_mapping);
}
struct switch_request {
        struct file *file;
        struct completion wait;
};

static void do_loop_switch(struct loop_device *, struct switch_request *);

static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
        if (unlikely(!bio->bi_bdev)) {
                do_loop_switch(lo, bio->bi_private);
                bio_put(bio);
        } else {
                int ret = do_bio_filebacked(lo, bio);
                bio_endio(bio, bio->bi_size, ret);
        }
}
/*
 * worker thread that handles reads/writes to file backed loop devices,
 * to avoid blocking in our make_request_fn. it also does loop decrypting
 * on reads for block backed loop, as that is too heavy to do from
 * b_end_io context where irqs may be disabled.
 */
static int loop_thread(void *data)
{
        struct loop_device *lo = data;
        struct bio *bio;

        daemonize("loop%d", lo->lo_number);

        /*
         * loop can be used in an encrypted device,
         * hence, it mustn't be stopped at all
         * because it could be indirectly used during suspension
         */
        current->flags |= PF_NOFREEZE;

        set_user_nice(current, -20);

        lo->lo_state = Lo_bound;
        lo->lo_pending = 1;

        /*
         * complete it, we are running
         */
        complete(&lo->lo_done);

        for (;;) {
                int pending;

                if (wait_for_completion_interruptible(&lo->lo_bh_done))
                        continue;

                spin_lock_irq(&lo->lo_lock);

                /*
                 * could be completed because of tear-down, not pending work
                 */
                if (unlikely(!lo->lo_pending)) {
                        spin_unlock_irq(&lo->lo_lock);
                        break;
                }

                bio = loop_get_bio(lo);
                lo->lo_pending--;
                pending = lo->lo_pending;
                spin_unlock_irq(&lo->lo_lock);

                BUG_ON(!bio);
                loop_handle_bio(lo, bio);

                /*
                 * upped both for pending work and tear-down, lo_pending
                 * will hit zero then
                 */
                if (unlikely(!pending))
                        break;
        }

        complete(&lo->lo_done);
        return 0;
}
/*
 * loop_switch performs the hard work of switching a backing store.
 * First it needs to flush existing IO, it does this by sending a magic
 * BIO down the pipe. The completion of this BIO does the actual switch.
 */
static int loop_switch(struct loop_device *lo, struct file *file)
{
        struct switch_request w;
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);
        if (!bio)
                return -ENOMEM;
        init_completion(&w.wait);
        w.file = file;
        bio->bi_private = &w;
        bio->bi_bdev = NULL;
        loop_make_request(lo->lo_queue, bio);
        wait_for_completion(&w.wait);
        return 0;
}
/*
 * Do the actual switch; called from the BIO completion routine
 */
static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
        struct file *file = p->file;
        struct file *old_file = lo->lo_backing_file;
        struct address_space *mapping = file->f_mapping;

        mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
        lo->lo_backing_file = file;
        lo->lo_blocksize = mapping->host->i_blksize;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
        complete(&p->wait);
}
/*
 * loop_change_fd switches the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
 */
static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
                       struct block_device *bdev, unsigned int arg)
{
        struct file *file, *old_file;
        struct inode *inode;
        int error;

        error = -ENXIO;
        if (lo->lo_state != Lo_bound)
                goto out;

        /* the loop device has to be read-only */
        error = -EINVAL;
        if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
                goto out;

        error = -EBADF;
        file = fget(arg);
        if (!file)
                goto out;

        inode = file->f_mapping->host;
        old_file = lo->lo_backing_file;

        error = -EINVAL;

        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                goto out_putf;

        /* new backing store needs to support loop (eg sendfile) */
        if (!inode->i_fop->sendfile)
                goto out_putf;

        /* size of the new backing store needs to be the same */
        if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
                goto out_putf;

        /* and ... switch */
        error = loop_switch(lo, file);
        if (error)
                goto out_putf;

        fput(old_file);
        return 0;

out_putf:
        fput(file);
out:
        return error;
}
static inline int is_loop_device(struct file *file)
{
        struct inode *i = file->f_mapping->host;

        return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
                       struct block_device *bdev, unsigned int arg)
{
        struct file *file, *f;
        struct inode *inode;
        struct address_space *mapping;
        unsigned lo_blocksize;
        int lo_flags = 0;
        int error;
        loff_t size;

        /* This is safe, since we have a reference from open(). */
        __module_get(THIS_MODULE);

        error = -EBADF;
        file = fget(arg);
        if (!file)
                goto out;

        error = -EBUSY;
        if (lo->lo_state != Lo_unbound)
                goto out_putf;

        /* Avoid recursion */
        f = file;
        while (is_loop_device(f)) {
                struct loop_device *l;

                if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev)
                        goto out_putf;

                l = f->f_mapping->host->i_bdev->bd_disk->private_data;
                if (l->lo_state == Lo_unbound) {
                        error = -EINVAL;
                        goto out_putf;
                }
                f = l->lo_backing_file;
        }

        mapping = file->f_mapping;
        inode = mapping->host;

        if (!(file->f_mode & FMODE_WRITE))
                lo_flags |= LO_FLAGS_READ_ONLY;

        error = -EINVAL;
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                const struct address_space_operations *aops = mapping->a_ops;
                /*
                 * If we can't read - sorry. If we only can't write - well,
                 * it's going to be read-only.
                 */
                if (!file->f_op->sendfile)
                        goto out_putf;
                if (aops->prepare_write && aops->commit_write)
                        lo_flags |= LO_FLAGS_USE_AOPS;
                if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
                        lo_flags |= LO_FLAGS_READ_ONLY;

                lo_blocksize = inode->i_blksize;
                error = 0;
        } else {
                goto out_putf;
        }

        size = get_loop_size(lo, file);

        if ((loff_t)(sector_t)size != size) {
                error = -EFBIG;
                goto out_putf;
        }

        if (!(lo_file->f_mode & FMODE_WRITE))
                lo_flags |= LO_FLAGS_READ_ONLY;

        set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

        lo->lo_blocksize = lo_blocksize;
        lo->lo_device = bdev;
        lo->lo_flags = lo_flags;
        lo->lo_backing_file = file;
        lo->transfer = transfer_none;
        lo->ioctl = NULL;
        lo->lo_sizelimit = 0;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

        lo->lo_bio = lo->lo_biotail = NULL;

        /*
         * set queue make_request_fn, and add limits based on lower level
         * device
         */
        blk_queue_make_request(lo->lo_queue, loop_make_request);
        lo->lo_queue->queuedata = lo;
        lo->lo_queue->unplug_fn = loop_unplug;

        set_capacity(disks[lo->lo_number], size);
        bd_set_size(bdev, size << 9);

        set_blocksize(bdev, lo_blocksize);

        error = kernel_thread(loop_thread, lo, CLONE_KERNEL);
        if (error < 0)
                goto out_putf;
        wait_for_completion(&lo->lo_done);
        return 0;

out_putf:
        fput(file);
out:
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        return error;
}
static int
loop_release_xfer(struct loop_device *lo)
{
        int err = 0;
        struct loop_func_table *xfer = lo->lo_encryption;

        if (xfer) {
                if (xfer->release)
                        err = xfer->release(lo);
                lo->transfer = NULL;
                lo->lo_encryption = NULL;
                module_put(xfer->owner);
        }
        return err;
}

static int
loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
               const struct loop_info64 *i)
{
        int err = 0;

        if (xfer) {
                struct module *owner = xfer->owner;

                if (!try_module_get(owner))
                        return -EINVAL;
                if (xfer->init)
                        err = xfer->init(lo, i);
                if (err)
                        module_put(owner);
                else
                        lo->lo_encryption = xfer;
        }
        return err;
}
static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
{
        struct file *filp = lo->lo_backing_file;
        gfp_t gfp = lo->old_gfp_mask;

        if (lo->lo_state != Lo_bound)
                return -ENXIO;

        if (lo->lo_refcnt > 1)  /* we needed one fd for the ioctl */
                return -EBUSY;

        if (filp == NULL)
                return -EINVAL;

        spin_lock_irq(&lo->lo_lock);
        lo->lo_state = Lo_rundown;
        lo->lo_pending--;
        if (!lo->lo_pending)
                complete(&lo->lo_bh_done);
        spin_unlock_irq(&lo->lo_lock);

        wait_for_completion(&lo->lo_done);

        lo->lo_backing_file = NULL;

        loop_release_xfer(lo);
        lo->transfer = NULL;
        lo->ioctl = NULL;
        lo->lo_device = NULL;
        lo->lo_encryption = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
        lo->lo_encrypt_key_size = 0;
        lo->lo_flags = 0;
        memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
        memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
        memset(lo->lo_file_name, 0, LO_NAME_SIZE);
        invalidate_bdev(bdev, 0);
        set_capacity(disks[lo->lo_number], 0);
        bd_set_size(bdev, 0);
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        lo->lo_state = Lo_unbound;
        fput(filp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        return 0;
}
static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
        int err;
        struct loop_func_table *xfer;

        if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid &&
            !capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (lo->lo_state != Lo_bound)
                return -ENXIO;
        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;

        err = loop_release_xfer(lo);
        if (err)
                return err;

        if (info->lo_encrypt_type) {
                unsigned int type = info->lo_encrypt_type;

                if (type >= MAX_LO_CRYPT)
                        return -EINVAL;
                xfer = xfer_funcs[type];
                if (xfer == NULL)
                        return -EINVAL;
        } else
                xfer = NULL;

        err = loop_init_xfer(lo, xfer, info);
        if (err)
                return err;

        if (lo->lo_offset != info->lo_offset ||
            lo->lo_sizelimit != info->lo_sizelimit) {
                lo->lo_offset = info->lo_offset;
                lo->lo_sizelimit = info->lo_sizelimit;
                if (figure_loop_size(lo))
                        return -EFBIG;
        }

        memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
        memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
        lo->lo_file_name[LO_NAME_SIZE-1] = 0;
        lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;

        if (!xfer)
                xfer = &none_funcs;
        lo->transfer = xfer->transfer;
        lo->ioctl = xfer->ioctl;

        lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
        lo->lo_init[0] = info->lo_init[0];
        lo->lo_init[1] = info->lo_init[1];
        if (info->lo_encrypt_key_size) {
                memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
                       info->lo_encrypt_key_size);
                lo->lo_key_owner = current->uid;
        }

        return 0;
}
static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
        struct file *file = lo->lo_backing_file;
        struct kstat stat;
        int error;

        if (lo->lo_state != Lo_bound)
                return -ENXIO;
        error = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
        if (error)
                return error;
        memset(info, 0, sizeof(*info));
        info->lo_number = lo->lo_number;
        info->lo_device = huge_encode_dev(stat.dev);
        info->lo_inode = stat.ino;
        info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
        info->lo_offset = lo->lo_offset;
        info->lo_sizelimit = lo->lo_sizelimit;
        info->lo_flags = lo->lo_flags;
        memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
        memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
        info->lo_encrypt_type =
                lo->lo_encryption ? lo->lo_encryption->number : 0;
        if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
                info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
                memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
                       lo->lo_encrypt_key_size);
        }
        return 0;
}
static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
        memset(info64, 0, sizeof(*info64));
        info64->lo_number = info->lo_number;
        info64->lo_device = info->lo_device;
        info64->lo_inode = info->lo_inode;
        info64->lo_rdevice = info->lo_rdevice;
        info64->lo_offset = info->lo_offset;
        info64->lo_sizelimit = 0;
        info64->lo_encrypt_type = info->lo_encrypt_type;
        info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
        info64->lo_flags = info->lo_flags;
        info64->lo_init[0] = info->lo_init[0];
        info64->lo_init[1] = info->lo_init[1];
        if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
        else
                memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
        memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
}
static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
        memset(info, 0, sizeof(*info));
        info->lo_number = info64->lo_number;
        info->lo_device = info64->lo_device;
        info->lo_inode = info64->lo_inode;
        info->lo_rdevice = info64->lo_rdevice;
        info->lo_offset = info64->lo_offset;
        info->lo_encrypt_type = info64->lo_encrypt_type;
        info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info->lo_flags = info64->lo_flags;
        info->lo_init[0] = info64->lo_init[0];
        info->lo_init[1] = info64->lo_init[1];
        if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
        else
                memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
        memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);

        /* error in case values were truncated */
        if (info->lo_device != info64->lo_device ||
            info->lo_rdevice != info64->lo_rdevice ||
            info->lo_inode != info64->lo_inode ||
            info->lo_offset != info64->lo_offset)
                return -EOVERFLOW;

        return 0;
}
static int
loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
{
        struct loop_info info;
        struct loop_info64 info64;

        if (copy_from_user(&info, arg, sizeof (struct loop_info)))
                return -EFAULT;
        loop_info64_from_old(&info, &info64);
        return loop_set_status(lo, &info64);
}
static int
loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
{
        struct loop_info64 info64;

        if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
                return -EFAULT;
        return loop_set_status(lo, &info64);
}
static int
loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
        struct loop_info info;
        struct loop_info64 info64;
        int err = 0;

        if (!arg)
                err = -EINVAL;
        if (!err)
                err = loop_get_status(lo, &info64);
        if (!err)
                err = loop_info64_to_old(&info64, &info);
        if (!err && copy_to_user(arg, &info, sizeof(info)))
                err = -EFAULT;

        return err;
}
static int
loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
        struct loop_info64 info64;
        int err = 0;

        if (!arg)
                err = -EINVAL;
        if (!err)
                err = loop_get_status(lo, &info64);
        if (!err && copy_to_user(arg, &info64, sizeof(info64)))
                err = -EFAULT;

        return err;
}
static int lo_ioctl(struct inode * inode, struct file * file,
        unsigned int cmd, unsigned long arg)
{
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
        int err;

        mutex_lock(&lo->lo_ctl_mutex);
        switch (cmd) {
        case LOOP_SET_FD:
                err = loop_set_fd(lo, file, inode->i_bdev, arg);
                break;
        case LOOP_CHANGE_FD:
                err = loop_change_fd(lo, file, inode->i_bdev, arg);
                break;
        case LOOP_CLR_FD:
                err = loop_clr_fd(lo, inode->i_bdev);
                break;
        case LOOP_SET_STATUS:
                err = loop_set_status_old(lo, (struct loop_info __user *) arg);
                break;
        case LOOP_GET_STATUS:
                err = loop_get_status_old(lo, (struct loop_info __user *) arg);
                break;
        case LOOP_SET_STATUS64:
                err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
                break;
        case LOOP_GET_STATUS64:
                err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
                break;
        default:
                err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
        }
        mutex_unlock(&lo->lo_ctl_mutex);
        return err;
}
static int lo_open(struct inode *inode, struct file *file)
{
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;

        mutex_lock(&lo->lo_ctl_mutex);
        lo->lo_refcnt++;
        mutex_unlock(&lo->lo_ctl_mutex);

        return 0;
}
static int lo_release(struct inode *inode, struct file *file)
{
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;

        mutex_lock(&lo->lo_ctl_mutex);
        --lo->lo_refcnt;
        mutex_unlock(&lo->lo_ctl_mutex);

        return 0;
}
static struct block_device_operations lo_fops = {
        .owner =        THIS_MODULE,
        .open =         lo_open,
        .release =      lo_release,
        .ioctl =        lo_ioctl,
};
/*
 * And now the modules code and kernel interface.
 */
module_param(max_loop, int, 0);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
int loop_register_transfer(struct loop_func_table *funcs)
{
        unsigned int n = funcs->number;

        if (n >= MAX_LO_CRYPT || xfer_funcs[n])
                return -EINVAL;
        xfer_funcs[n] = funcs;
        return 0;
}
int loop_unregister_transfer(int number)
{
        unsigned int n = number;
        struct loop_device *lo;
        struct loop_func_table *xfer;

        if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
                return -EINVAL;

        xfer_funcs[n] = NULL;

        for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
                mutex_lock(&lo->lo_ctl_mutex);

                if (lo->lo_encryption == xfer)
                        loop_release_xfer(lo);

                mutex_unlock(&lo->lo_ctl_mutex);
        }

        return 0;
}
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
static int __init loop_init(void)
{
        int i;

        if (max_loop < 1 || max_loop > 256) {
                printk(KERN_WARNING "loop: invalid max_loop (must be between"
                                    " 1 and 256), using default (8)\n");
                max_loop = 8;
        }

        if (register_blkdev(LOOP_MAJOR, "loop"))
                return -EIO;

        loop_dev = kmalloc(max_loop * sizeof(struct loop_device), GFP_KERNEL);
        if (!loop_dev)
                goto out_mem1;
        memset(loop_dev, 0, max_loop * sizeof(struct loop_device));

        disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL);
        if (!disks)
                goto out_mem2;

        for (i = 0; i < max_loop; i++) {
                disks[i] = alloc_disk(1);
                if (!disks[i])
                        goto out_mem3;
        }

        for (i = 0; i < max_loop; i++) {
                struct loop_device *lo = &loop_dev[i];
                struct gendisk *disk = disks[i];

                memset(lo, 0, sizeof(*lo));
                lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
                if (!lo->lo_queue)
                        goto out_mem4;
                mutex_init(&lo->lo_ctl_mutex);
                init_completion(&lo->lo_done);
                init_completion(&lo->lo_bh_done);
                lo->lo_number = i;
                spin_lock_init(&lo->lo_lock);
                disk->major = LOOP_MAJOR;
                disk->first_minor = i;
                disk->fops = &lo_fops;
                sprintf(disk->disk_name, "loop%d", i);
                disk->private_data = lo;
                disk->queue = lo->lo_queue;
        }

        /* We cannot fail after we call this, so another loop!*/
        for (i = 0; i < max_loop; i++)
                add_disk(disks[i]);
        printk(KERN_INFO "loop: loaded (max %d devices)\n", max_loop);
        return 0;

out_mem4:
        while (i--)
                blk_cleanup_queue(loop_dev[i].lo_queue);
        i = max_loop;
out_mem3:
        while (i--)
                put_disk(disks[i]);
        kfree(disks);
out_mem2:
        kfree(loop_dev);
out_mem1:
        unregister_blkdev(LOOP_MAJOR, "loop");
        printk(KERN_ERR "loop: ran out of memory\n");
        return -ENOMEM;
}
static void loop_exit(void)
{
        int i;

        for (i = 0; i < max_loop; i++) {
                del_gendisk(disks[i]);
                blk_cleanup_queue(loop_dev[i].lo_queue);
                put_disk(disks[i]);
        }
        if (unregister_blkdev(LOOP_MAJOR, "loop"))
                printk(KERN_WARNING "loop: cannot unregister blkdev\n");

        kfree(disks);
        kfree(loop_dev);
}
module_init(loop_init);
module_exit(loop_exit);

#ifndef MODULE
static int __init max_loop_setup(char *str)
{
        max_loop = simple_strtol(str, NULL, 0);
        return 1;
}

__setup("max_loop=", max_loop_setup);
#endif
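
For illustration only (this snippet is not part of loop.c): a minimal
userspace program exercising the LOOP_SET_FD and LOOP_CLR_FD ioctls
dispatched by lo_ioctl() above. The device and backing-file paths are
examples, and error handling is kept minimal.

/*
 * Bind a backing file to /dev/loop0 via LOOP_SET_FD (handled by
 * loop_set_fd() above), then detach it again with LOOP_CLR_FD
 * (handled by loop_clr_fd()).
 */
#include <fcntl.h>
#include <linux/loop.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        int loop_fd = open("/dev/loop0", O_RDWR);
        int file_fd = open("/tmp/backing.img", O_RDWR);

        if (loop_fd < 0 || file_fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(loop_fd, LOOP_SET_FD, file_fd) < 0) {
                perror("LOOP_SET_FD");
                return 1;
        }
        /* ... /dev/loop0 is now usable as a block device ... */
        if (ioctl(loop_fd, LOOP_CLR_FD, 0) < 0)
                perror("LOOP_CLR_FD");
        close(file_fd);
        close(loop_fd);
        return 0;
}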