ia64/linux-2.6.18-xen.hg

view drivers/md/dm-log.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation, and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, in which case it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, creating temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
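
A minimal sketch of the retry behaviour described above -- not the patch
itself; balloon_process, increase_reservation, decrease_reservation,
current_target, current_pages and balloon_timer are assumed names for
illustration:

static void balloon_process(void *unused)
{
        long credit = current_target() - current_pages;

        if (credit > 0)
                increase_reservation(credit);   /* keep any pages we do get */
        if (credit < 0)
                decrease_reservation(-credit);

        /*
         * Still short of the target (e.g. transient host memory
         * pressure)?  Retry on a timer instead of recording a "hard
         * limit" and giving up until a new target is set.
         */
        if (current_target() != current_pages)
                mod_timer(&balloon_timer, jiffies + HZ);
}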
/*
 * Copyright (C) 2003 Sistina Software
 *
 * This file is released under the LGPL.
 */

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include "dm-log.h"
#include "dm-io.h"

#define DM_MSG_PREFIX "mirror log"

static LIST_HEAD(_log_types);
static DEFINE_SPINLOCK(_lock);

int dm_register_dirty_log_type(struct dirty_log_type *type)
{
        spin_lock(&_lock);
        type->use_count = 0;
        list_add(&type->list, &_log_types);
        spin_unlock(&_lock);

        return 0;
}

int dm_unregister_dirty_log_type(struct dirty_log_type *type)
{
        spin_lock(&_lock);

        if (type->use_count)
                DMWARN("Attempt to unregister a log type that is still in use");
        else
                list_del(&type->list);

        spin_unlock(&_lock);

        return 0;
}

static struct dirty_log_type *get_type(const char *type_name)
{
        struct dirty_log_type *type;

        spin_lock(&_lock);
        list_for_each_entry (type, &_log_types, list)
                if (!strcmp(type_name, type->name)) {
                        if (!type->use_count && !try_module_get(type->module)) {
                                spin_unlock(&_lock);
                                return NULL;
                        }
                        type->use_count++;
                        spin_unlock(&_lock);
                        return type;
                }

        spin_unlock(&_lock);
        return NULL;
}

static void put_type(struct dirty_log_type *type)
{
        spin_lock(&_lock);
        if (!--type->use_count)
                module_put(type->module);
        spin_unlock(&_lock);
}

struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
                                      unsigned int argc, char **argv)
{
        struct dirty_log_type *type;
        struct dirty_log *log;

        log = kmalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return NULL;

        type = get_type(type_name);
        if (!type) {
                kfree(log);
                return NULL;
        }

        log->type = type;
        if (type->ctr(log, ti, argc, argv)) {
                kfree(log);
                put_type(type);
                return NULL;
        }

        return log;
}

void dm_destroy_dirty_log(struct dirty_log *log)
{
        log->type->dtr(log);
        put_type(log->type);
        kfree(log);
}
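
/*
 * Hypothetical usage from a target constructor, showing how the
 * create/destroy pair above is meant to be driven.  The type name
 * "core", the region size "1024" and example_ctr itself are
 * illustrative assumptions, not part of this file.
 */
#if 0   /* example only */
static int example_ctr(struct dm_target *ti)
{
        char *log_argv[] = { "1024" };          /* region size in sectors */
        struct dirty_log *log;

        log = dm_create_dirty_log("core", ti, 1, log_argv);
        if (!log)
                return -EINVAL;

        /* ... drive the log via log->type->mark_region() etc. ... */

        dm_destroy_dirty_log(log);
        return 0;
}
#endif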

/*-----------------------------------------------------------------
 * Persistent and core logs share a lot of their implementation.
 * FIXME: need a reload method to be called from a resume
 *---------------------------------------------------------------*/
/*
 * Magic for persistent mirrors: "MiRr"
 */
#define MIRROR_MAGIC 0x4D695272

/*
 * The on-disk version of the metadata.
 */
#define MIRROR_DISK_VERSION 2
#define LOG_OFFSET 2

struct log_header {
        uint32_t magic;

        /*
         * Simple, incrementing version. no backward
         * compatibility.
         */
        uint32_t version;
        sector_t nr_regions;
};

struct log_c {
        struct dm_target *ti;
        int touched;
        uint32_t region_size;
        unsigned int region_count;
        region_t sync_count;

        unsigned bitset_uint32_count;
        uint32_t *clean_bits;
        uint32_t *sync_bits;
        uint32_t *recovering_bits;      /* FIXME: this seems excessive */

        int sync_search;

        /* Resync flag */
        enum sync {
                DEFAULTSYNC,    /* Synchronize if necessary */
                NOSYNC,         /* Devices known to be already in sync */
                FORCESYNC,      /* Force a sync to happen */
        } sync;

        /*
         * Disk log fields
         */
        struct dm_dev *log_dev;
        struct log_header header;

        struct io_region header_location;
        struct log_header *disk_header;
};

/*
 * The touched member needs to be updated every time we access
 * one of the bitsets.  The ext2_* bit operations are used because
 * they act on a little-endian bitmap regardless of host byte order,
 * which keeps the on-disk layout of the log architecture-independent.
 */
static inline int log_test_bit(uint32_t *bs, unsigned bit)
{
        return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
}

static inline void log_set_bit(struct log_c *l,
                               uint32_t *bs, unsigned bit)
{
        ext2_set_bit(bit, (unsigned long *) bs);
        l->touched = 1;
}

static inline void log_clear_bit(struct log_c *l,
                                 uint32_t *bs, unsigned bit)
{
        ext2_clear_bit(bit, (unsigned long *) bs);
        l->touched = 1;
}

/*----------------------------------------------------------------
 * Header IO
 *--------------------------------------------------------------*/
static void header_to_disk(struct log_header *core, struct log_header *disk)
{
        disk->magic = cpu_to_le32(core->magic);
        disk->version = cpu_to_le32(core->version);
        disk->nr_regions = cpu_to_le64(core->nr_regions);
}

static void header_from_disk(struct log_header *core, struct log_header *disk)
{
        core->magic = le32_to_cpu(disk->magic);
        core->version = le32_to_cpu(disk->version);
        core->nr_regions = le64_to_cpu(disk->nr_regions);
}

static int read_header(struct log_c *log)
{
        int r;
        unsigned long ebits;

        r = dm_io_sync_vm(1, &log->header_location, READ,
                          log->disk_header, &ebits);
        if (r)
                return r;

        header_from_disk(&log->header, log->disk_header);

        /* New log required? */
        if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
                log->header.magic = MIRROR_MAGIC;
                log->header.version = MIRROR_DISK_VERSION;
                log->header.nr_regions = 0;
        }

#ifdef __LITTLE_ENDIAN
        if (log->header.version == 1)
                log->header.version = 2;
#endif

        if (log->header.version != MIRROR_DISK_VERSION) {
                DMWARN("incompatible disk log version");
                return -EINVAL;
        }

        return 0;
}

static inline int write_header(struct log_c *log)
{
        unsigned long ebits;

        header_to_disk(&log->header, log->disk_header);
        return dm_io_sync_vm(1, &log->header_location, WRITE,
                             log->disk_header, &ebits);
}

/*----------------------------------------------------------------
 * core log constructor/destructor
 *
 * argv contains region_size followed optionally by [no]sync
 *--------------------------------------------------------------*/
#define BYTE_SHIFT 3
static int create_log_context(struct dirty_log *log, struct dm_target *ti,
                              unsigned int argc, char **argv,
                              struct dm_dev *dev)
{
        enum sync sync = DEFAULTSYNC;

        struct log_c *lc;
        uint32_t region_size;
        unsigned int region_count;
        size_t bitset_size, buf_size;

        if (argc < 1 || argc > 2) {
                DMWARN("wrong number of arguments to mirror log");
                return -EINVAL;
        }

        if (argc > 1) {
                if (!strcmp(argv[1], "sync"))
                        sync = FORCESYNC;
                else if (!strcmp(argv[1], "nosync"))
                        sync = NOSYNC;
                else {
                        DMWARN("unrecognised sync argument to mirror log: %s",
                               argv[1]);
                        return -EINVAL;
                }
        }

        if (sscanf(argv[0], "%u", &region_size) != 1) {
                DMWARN("invalid region size string");
                return -EINVAL;
        }

        region_count = dm_sector_div_up(ti->len, region_size);

        lc = kmalloc(sizeof(*lc), GFP_KERNEL);
        if (!lc) {
                DMWARN("couldn't allocate core log");
                return -ENOMEM;
        }

        lc->ti = ti;
        lc->touched = 0;
        lc->region_size = region_size;
        lc->region_count = region_count;
        lc->sync = sync;

        /*
         * Work out how many bytes we need for the bitset, rounded up
         * to a whole number of uint32_t words.
         */
        bitset_size = dm_round_up(region_count,
                                  sizeof(*lc->clean_bits) << BYTE_SHIFT);
        bitset_size >>= BYTE_SHIFT;

        lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);

        /*
         * Disk log?
         */
        if (!dev) {
                lc->clean_bits = vmalloc(bitset_size);
                if (!lc->clean_bits) {
                        DMWARN("couldn't allocate clean bitset");
                        kfree(lc);
                        return -ENOMEM;
                }
                lc->disk_header = NULL;
        } else {
                lc->log_dev = dev;
                lc->header_location.bdev = lc->log_dev->bdev;
                lc->header_location.sector = 0;

                /*
                 * Buffer holds both header and bitset.
                 */
                buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
                                       bitset_size, ti->limits.hardsect_size);
                lc->header_location.count = buf_size >> SECTOR_SHIFT;

                lc->disk_header = vmalloc(buf_size);
                if (!lc->disk_header) {
                        DMWARN("couldn't allocate disk log buffer");
                        kfree(lc);
                        return -ENOMEM;
                }

                lc->clean_bits = (void *)lc->disk_header +
                        (LOG_OFFSET << SECTOR_SHIFT);
        }

        memset(lc->clean_bits, -1, bitset_size);

        lc->sync_bits = vmalloc(bitset_size);
        if (!lc->sync_bits) {
                DMWARN("couldn't allocate sync bitset");
                if (!dev)
                        vfree(lc->clean_bits);
                vfree(lc->disk_header);
                kfree(lc);
                return -ENOMEM;
        }
        memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
        lc->sync_count = (sync == NOSYNC) ? region_count : 0;

        lc->recovering_bits = vmalloc(bitset_size);
        if (!lc->recovering_bits) {
                DMWARN("couldn't allocate recovering bitset");
                vfree(lc->sync_bits);
                if (!dev)
                        vfree(lc->clean_bits);
                vfree(lc->disk_header);
                kfree(lc);
                return -ENOMEM;
        }
        memset(lc->recovering_bits, 0, bitset_size);
        lc->sync_search = 0;
        log->context = lc;

        return 0;
}
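
/*
 * Worked example of the sizing arithmetic above, with an illustrative
 * region_count of 1000:
 *
 *   dm_round_up(1000, 32)  = 1024 bits    (round up to uint32_t words)
 *   1024 >> BYTE_SHIFT     = 128 bytes    (bitset_size)
 *   128 / sizeof(uint32_t) = 32 words     (bitset_uint32_count)
 *
 * Each of the three bitmaps (clean, sync, recovering) is therefore a
 * 128-byte allocation for a 1000-region mirror.
 */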

static int core_ctr(struct dirty_log *log, struct dm_target *ti,
                    unsigned int argc, char **argv)
{
        return create_log_context(log, ti, argc, argv, NULL);
}

static void destroy_log_context(struct log_c *lc)
{
        vfree(lc->sync_bits);
        vfree(lc->recovering_bits);
        kfree(lc);
}

static void core_dtr(struct dirty_log *log)
{
        struct log_c *lc = (struct log_c *) log->context;

        vfree(lc->clean_bits);
        destroy_log_context(lc);
}

/*----------------------------------------------------------------
 * disk log constructor/destructor
 *
 * argv contains log_device region_size followed optionally by [no]sync
 *--------------------------------------------------------------*/
static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
                    unsigned int argc, char **argv)
{
        int r;
        struct dm_dev *dev;

        if (argc < 2 || argc > 3) {
                DMWARN("wrong number of arguments to disk mirror log");
                return -EINVAL;
        }

        r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
                          FMODE_READ | FMODE_WRITE, &dev);
        if (r)
                return r;

        r = create_log_context(log, ti, argc - 1, argv + 1, dev);
        if (r) {
                dm_put_device(ti, dev);
                return r;
        }

        return 0;
}

static void disk_dtr(struct dirty_log *log)
{
        struct log_c *lc = (struct log_c *) log->context;

        dm_put_device(lc->ti, lc->log_dev);
        vfree(lc->disk_header);
        destroy_log_context(lc);
}

static int count_bits32(uint32_t *addr, unsigned size)
{
        int count = 0, i;

        for (i = 0; i < size; i++) {
                count += hweight32(*(addr+i));
        }
        return count;
}

static int disk_resume(struct dirty_log *log)
{
        int r;
        unsigned i;
        struct log_c *lc = (struct log_c *) log->context;
        size_t size = lc->bitset_uint32_count * sizeof(uint32_t);

        /* read the disk header */
        r = read_header(lc);
        if (r)
                return r;

        /* set or clear any new bits -- device has grown */
        if (lc->sync == NOSYNC)
                for (i = lc->header.nr_regions; i < lc->region_count; i++)
                        /* FIXME: amazingly inefficient */
                        log_set_bit(lc, lc->clean_bits, i);
        else
                for (i = lc->header.nr_regions; i < lc->region_count; i++)
                        /* FIXME: amazingly inefficient */
                        log_clear_bit(lc, lc->clean_bits, i);

        /* clear any old bits -- device has shrunk */
        for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
                log_clear_bit(lc, lc->clean_bits, i);

        /* copy clean across to sync */
        memcpy(lc->sync_bits, lc->clean_bits, size);
        lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);

        /* set the correct number of regions in the header */
        lc->header.nr_regions = lc->region_count;

        /* write the new header */
        return write_header(lc);
}

static uint32_t core_get_region_size(struct dirty_log *log)
{
        struct log_c *lc = (struct log_c *) log->context;
        return lc->region_size;
}

static int core_is_clean(struct dirty_log *log, region_t region)
{
        struct log_c *lc = (struct log_c *) log->context;
        return log_test_bit(lc->clean_bits, region);
}

static int core_in_sync(struct dirty_log *log, region_t region, int block)
{
        struct log_c *lc = (struct log_c *) log->context;
        return log_test_bit(lc->sync_bits, region);
}

static int core_flush(struct dirty_log *log)
{
        /* no op */
        return 0;
}

static int disk_flush(struct dirty_log *log)
{
        int r;
        struct log_c *lc = (struct log_c *) log->context;

        /* only write if the log has changed */
        if (!lc->touched)
                return 0;

        r = write_header(lc);
        if (!r)
                lc->touched = 0;

        return r;
}

static void core_mark_region(struct dirty_log *log, region_t region)
{
        struct log_c *lc = (struct log_c *) log->context;
        log_clear_bit(lc, lc->clean_bits, region);
}

static void core_clear_region(struct dirty_log *log, region_t region)
{
        struct log_c *lc = (struct log_c *) log->context;
        log_set_bit(lc, lc->clean_bits, region);
}

static int core_get_resync_work(struct dirty_log *log, region_t *region)
{
        struct log_c *lc = (struct log_c *) log->context;

        if (lc->sync_search >= lc->region_count)
                return 0;

        do {
                *region = ext2_find_next_zero_bit(
                        (unsigned long *) lc->sync_bits,
                        lc->region_count,
                        lc->sync_search);
                lc->sync_search = *region + 1;

                if (*region >= lc->region_count)
                        return 0;

        } while (log_test_bit(lc->recovering_bits, *region));

        log_set_bit(lc, lc->recovering_bits, *region);
        return 1;
}
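
/*
 * Sketch of how a caller (e.g. the mirror target's recovery path) might
 * drive get_resync_work()/complete_resync_work().  example_recovery and
 * do_recover() are hypothetical, and real callers complete the work
 * asynchronously rather than in a simple loop like this.
 */
#if 0   /* example only */
static void example_recovery(struct dirty_log *log)
{
        region_t region;

        while (log->type->get_resync_work(log, &region))
                log->type->complete_resync_work(log, region,
                                                do_recover(region) == 0);
}
#endif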

static void core_complete_resync_work(struct dirty_log *log, region_t region,
                                      int success)
{
        struct log_c *lc = (struct log_c *) log->context;

        log_clear_bit(lc, lc->recovering_bits, region);
        if (success) {
                log_set_bit(lc, lc->sync_bits, region);
                lc->sync_count++;
        }
}

static region_t core_get_sync_count(struct dirty_log *log)
{
        struct log_c *lc = (struct log_c *) log->context;

        return lc->sync_count;
}

#define DMEMIT_SYNC \
        if (lc->sync != DEFAULTSYNC) \
                DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")

static int core_status(struct dirty_log *log, status_type_t status,
                       char *result, unsigned int maxlen)
{
        int sz = 0;
        struct log_c *lc = log->context;

        switch(status) {
        case STATUSTYPE_INFO:
                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%s %u %u ", log->type->name,
                       lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
                DMEMIT_SYNC;
        }

        return sz;
}

static int disk_status(struct dirty_log *log, status_type_t status,
                       char *result, unsigned int maxlen)
{
        int sz = 0;
        char buffer[16];
        struct log_c *lc = log->context;

        switch(status) {
        case STATUSTYPE_INFO:
                break;

        case STATUSTYPE_TABLE:
                format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
                DMEMIT("%s %u %s %u ", log->type->name,
                       lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
                       lc->region_size);
                DMEMIT_SYNC;
        }

        return sz;
}

static struct dirty_log_type _core_type = {
        .name = "core",
        .module = THIS_MODULE,
        .ctr = core_ctr,
        .dtr = core_dtr,
        .get_region_size = core_get_region_size,
        .is_clean = core_is_clean,
        .in_sync = core_in_sync,
        .flush = core_flush,
        .mark_region = core_mark_region,
        .clear_region = core_clear_region,
        .get_resync_work = core_get_resync_work,
        .complete_resync_work = core_complete_resync_work,
        .get_sync_count = core_get_sync_count,
        .status = core_status,
};

static struct dirty_log_type _disk_type = {
        .name = "disk",
        .module = THIS_MODULE,
        .ctr = disk_ctr,
        .dtr = disk_dtr,
        .suspend = disk_flush,
        .resume = disk_resume,
        .get_region_size = core_get_region_size,
        .is_clean = core_is_clean,
        .in_sync = core_in_sync,
        .flush = disk_flush,
        .mark_region = core_mark_region,
        .clear_region = core_clear_region,
        .get_resync_work = core_get_resync_work,
        .complete_resync_work = core_complete_resync_work,
        .get_sync_count = core_get_sync_count,
        .status = disk_status,
};

int __init dm_dirty_log_init(void)
{
        int r;

        r = dm_register_dirty_log_type(&_core_type);
        if (r)
                DMWARN("couldn't register core log");

        r = dm_register_dirty_log_type(&_disk_type);
        if (r) {
                DMWARN("couldn't register disk log");
                dm_unregister_dirty_log_type(&_core_type);
        }

        return r;
}

void dm_dirty_log_exit(void)
{
        dm_unregister_dirty_log_type(&_disk_type);
        dm_unregister_dirty_log_type(&_core_type);
}

EXPORT_SYMBOL(dm_register_dirty_log_type);
EXPORT_SYMBOL(dm_unregister_dirty_log_type);
EXPORT_SYMBOL(dm_create_dirty_log);
EXPORT_SYMBOL(dm_destroy_dirty_log);
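
/*
 * Minimal sketch of an out-of-tree module registering its own log type
 * through the exported API above.  _example_type and its callbacks are
 * hypothetical; a real type must fill in every method dm-mirror uses.
 */
#if 0   /* example only */
static struct dirty_log_type _example_type = {
        .name = "example",
        .module = THIS_MODULE,
        .ctr = example_log_ctr,         /* hypothetical callbacks */
        .dtr = example_log_dtr,
        /* ... remaining methods elided ... */
};

static int __init example_log_init(void)
{
        return dm_register_dirty_log_type(&_example_type);
}

static void __exit example_log_exit(void)
{
        dm_unregister_dirty_log_type(&_example_type);
}

module_init(example_log_init);
module_exit(example_log_exit);
#endif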