ia64/linux-2.6.18-xen.hg

drivers/md/dm-exception-store.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, so there is temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
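
To illustrate the retry-on-timer behaviour described above, here is a minimal sketch under stated assumptions; the names balloon_retry_timer, balloon_current_pages, balloon_target_pages and increase_reservation are placeholders for illustration, not the actual Xen balloon driver symbols:

#include <linux/timer.h>
#include <linux/jiffies.h>

/* Hypothetical state; the real driver tracks these differently. */
static unsigned long balloon_current_pages;
static unsigned long balloon_target_pages;
static struct timer_list balloon_retry_timer;	/* assumed set up with balloon_retry() as handler */

/* Stand-in stub for the hypercall wrapper; returns how many pages were granted. */
static unsigned long increase_reservation(unsigned long nr_pages)
{
	return nr_pages;	/* pretend the full request succeeded */
}

static void balloon_retry(unsigned long unused)
{
	unsigned long wanted, got;

	if (balloon_current_pages >= balloon_target_pages)
		return;

	wanted = balloon_target_pages - balloon_current_pages;
	got = increase_reservation(wanted);

	/* Keep whatever pages were granted, even on partial success. */
	balloon_current_pages += got;

	/* No "hard limit": simply try again later if we fell short. */
	if (balloon_current_pages < balloon_target_pages)
		mod_timer(&balloon_retry_timer, jiffies + HZ);
}
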
/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "snapshots"

/*-----------------------------------------------------------------
 * Persistent snapshots: by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-size blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

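/*
 * Resulting layout of the COW device (each metadata chunk is
 * followed by the data chunks it describes):
 *
 *	chunk 0			: header
 *	chunk 1			: metadata area 0
 *	chunks 2 .. epa + 1	: copied data for area 0
 *	chunk epa + 2		: metadata area 1
 *	...
 *
 * where epa == exceptions_per_area.  This is why area_io() below maps
 * area index a to chunk 1 + (epa + 1) * a.
 */
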
/*
 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'chunk' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
};

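/*
 * PAGE_SIZE >> 9 is the number of 512-byte sectors per page, so this
 * converts a sector count into the equivalent number of pages.
 */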
static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;
	where.sector = ps->snap->chunk_size * chunk;
	where.count = ps->snap->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}
static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		chunk_size = le32_to_cpu(dh->chunk_size);
		if (ps->snap->chunk_size != chunk_size) {
			DMWARN("chunk size %llu in device metadata overrides "
			       "table chunk size of %llu.",
			       (unsigned long long)chunk_size,
			       (unsigned long long)ps->snap->chunk_size);

			/* We had a bogus chunk_size. Fix stuff up. */
			dm_io_put(sectors_to_pages(ps->snap->chunk_size));
			free_area(ps);

			ps->snap->chunk_size = chunk_size;
			ps->snap->chunk_mask = chunk_size - 1;
			ps->snap->chunk_shift = ffs(chunk_size) - 1;

			r = alloc_area(ps);
			if (r)
				return r;

			r = dm_io_get(sectors_to_pages(chunk_size));
			if (r)
				return r;
		}
	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE);
}
/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	if (index >= ps->exceptions_per_area)
		return NULL;

	return ((struct disk_exception *) ps->area) + index;
}

static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);

	return 0;
}

static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate if the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);

		if (r)
			return r;

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the first metadata area
		 * is, we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}
static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}
static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now that we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}
static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks.
	 */
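	/*
	 * Chunks whose index is congruent to 1 modulo
	 * (exceptions_per_area + 1) hold metadata (see area_io()
	 * above), so step over them when allocating data chunks.
	 */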
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

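/*
 * Commit a prepared exception: copy it into the in-core metadata
 * area, queue the caller's completion callback, and write the area
 * out once there are no more exceptions in flight or the area is
 * full.
 */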
static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}

	/*
	 * Have we completely filled the current area?
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		r = zero_area(ps, ps->current_area + 1);
		if (r)
			ps->valid = 0;
	}
}
static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

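/*
 * Construct a persistent exception store: reserve dm-io pages for
 * chunk-sized I/O, allocate and initialise the pstore, and wire the
 * store's method pointers to the persistent_* implementations above.
 */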
int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	int r;
	struct pstore *ps;

	r = dm_io_get(sectors_to_pages(chunk_size));
	if (r)
		return r;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;

 bad:
	dm_io_put(sectors_to_pages(chunk_size));
	if (ps && ps->area)
		free_area(ps);
	kfree(ps);
	return r;
}
/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;
	store->snap = s;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
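
For context, here is a minimal sketch of how a snapshot constructor might choose between the two stores exported above. It is not taken from dm-snap.c; the helper name init_exception_store and the s->store member are assumptions for illustration only:

/* Illustrative only: pick an exception store from the snapshot target's
 * persistent/transient flag ('P' or 'N' in the table line). */
static int init_exception_store(struct dm_snapshot *s, char persistent,
				uint32_t chunk_size, int blocksize)
{
	if (persistent == 'P' || persistent == 'p')
		return dm_create_persistent(&s->store, chunk_size);

	if (persistent == 'N' || persistent == 'n')
		return dm_create_transient(&s->store, s, blocksize);

	return -EINVAL;
}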