ia64/linux-2.6.18-xen.hg

drivers/md/kcopyd.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, creating temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer, in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Fri Jun 05 14:01:20 2009 +0100
parents  831230e53067
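The retry behaviour described above can be sketched roughly as follows.
This is an illustrative reconstruction, not the patch itself (the patch
modifies the Xen balloon driver, not the file shown below), and the names
current_pages, current_target(), increase_reservation(),
decrease_reservation() and balloon_timer are assumptions following the
balloon driver's usual style (an old-style workqueue handler, cf.
do_work() in the file below):

static void balloon_process(void *unused)
{
        long credit = current_target() - current_pages;

        if (credit > 0)
                /* May succeed only partially; keep whatever pages were granted. */
                increase_reservation(credit);
        else if (credit < 0)
                decrease_reservation(-credit);

        /*
         * Instead of recording a "hard limit" and giving up, re-arm a
         * timer so the attempt repeats while the target is still unmet,
         * e.g. while other guests are still ballooning down.
         */
        if (current_target() != current_pages)
                mod_timer(&balloon_timer, jiffies + HZ);
}

The source of drivers/md/kcopyd.c at revision 897:329ea0ccb344 follows.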
/*
 * Copyright (C) 2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 *
 * Kcopyd provides a simple interface for copying an area of one
 * block-device to one or more other block-devices, with an asynchronous
 * completion notification.
 */
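/*
 * Illustrative usage sketch (an addition for exposition, not part of the
 * original source): roughly how a client such as dm-mirror might drive
 * this interface.  The block devices (src_bdev, dst_bdev), the 512 KiB
 * region and the completion-based callback are assumptions made up for
 * the example; struct io_region's bdev/sector/count fields are as
 * declared in dm-io.h of this tree.
 *
 *      static void copy_done(int read_err, unsigned int write_err,
 *                            void *context)
 *      {
 *              complete((struct completion *) context);
 *      }
 *
 *      struct kcopyd_client *kc;
 *      struct io_region from, to;
 *      struct completion done;
 *
 *      kcopyd_client_create(32, &kc);    // 32-page private pool
 *                                        // (error handling elided)
 *      from.bdev = src_bdev;
 *      from.sector = 0;
 *      from.count = 1024;                // 1024 sectors = 512 KiB
 *      to = from;
 *      to.bdev = dst_bdev;
 *
 *      init_completion(&done);
 *      kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
 *      wait_for_completion(&done);
 *
 *      kcopyd_client_destroy(kc);
 */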
#include <asm/types.h>
#include <asm/atomic.h>

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>

#include "kcopyd.h"

static struct workqueue_struct *_kcopyd_wq;
static struct work_struct _kcopyd_work;

static inline void wake(void)
{
        queue_work(_kcopyd_wq, &_kcopyd_work);
}

/*-----------------------------------------------------------------
 * Each kcopyd client has its own little pool of preallocated
 * pages for kcopyd io.
 *---------------------------------------------------------------*/
struct kcopyd_client {
        struct list_head list;

        spinlock_t lock;
        struct page_list *pages;
        unsigned int nr_pages;
        unsigned int nr_free_pages;

        wait_queue_head_t destroyq;
        atomic_t nr_jobs;
};

static struct page_list *alloc_pl(void)
{
        struct page_list *pl;

        pl = kmalloc(sizeof(*pl), GFP_KERNEL);
        if (!pl)
                return NULL;

        pl->page = alloc_page(GFP_KERNEL);
        if (!pl->page) {
                kfree(pl);
                return NULL;
        }

        return pl;
}

static void free_pl(struct page_list *pl)
{
        __free_page(pl->page);
        kfree(pl);
}

static int kcopyd_get_pages(struct kcopyd_client *kc,
                            unsigned int nr, struct page_list **pages)
{
        struct page_list *pl;

        spin_lock(&kc->lock);
        if (kc->nr_free_pages < nr) {
                spin_unlock(&kc->lock);
                return -ENOMEM;
        }

        kc->nr_free_pages -= nr;
        for (*pages = pl = kc->pages; --nr; pl = pl->next)
                ;

        kc->pages = pl->next;
        pl->next = NULL;

        spin_unlock(&kc->lock);

        return 0;
}

static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl)
{
        struct page_list *cursor;

        spin_lock(&kc->lock);
        for (cursor = pl; cursor->next; cursor = cursor->next)
                kc->nr_free_pages++;

        kc->nr_free_pages++;
        cursor->next = kc->pages;
        kc->pages = pl;
        spin_unlock(&kc->lock);
}

/*
 * These three functions resize the page pool.
 */
static void drop_pages(struct page_list *pl)
{
        struct page_list *next;

        while (pl) {
                next = pl->next;
                free_pl(pl);
                pl = next;
        }
}

static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
{
        unsigned int i;
        struct page_list *pl = NULL, *next;

        for (i = 0; i < nr; i++) {
                next = alloc_pl();
                if (!next) {
                        if (pl)
                                drop_pages(pl);
                        return -ENOMEM;
                }
                next->next = pl;
                pl = next;
        }

        kcopyd_put_pages(kc, pl);
        kc->nr_pages += nr;
        return 0;
}

static void client_free_pages(struct kcopyd_client *kc)
{
        BUG_ON(kc->nr_free_pages != kc->nr_pages);
        drop_pages(kc->pages);
        kc->pages = NULL;
        kc->nr_free_pages = kc->nr_pages = 0;
}

/*-----------------------------------------------------------------
 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
 * for this reason we use a mempool to prevent the client from
 * ever having to do io (which could cause a deadlock).
 *---------------------------------------------------------------*/
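/*
 * (Added note, illustrative:) the typical caller is a device-mapper
 * target such as dm-mirror submitting copies from its own I/O path.
 * If allocating a job could itself require writeback to the device
 * being copied, the allocation might never complete; allocating from
 * _job_pool with GFP_NOIO instead falls back on the preallocated
 * reserve of MIN_JOBS entries rather than waiting on new I/O.
 */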
struct kcopyd_job {
        struct kcopyd_client *kc;
        struct list_head list;
        unsigned long flags;

        /*
         * Error state of the job.
         */
        int read_err;
        unsigned int write_err;

        /*
         * Either READ or WRITE
         */
        int rw;
        struct io_region source;

        /*
         * The destinations for the transfer.
         */
        unsigned int num_dests;
        struct io_region dests[KCOPYD_MAX_REGIONS];

        sector_t offset;
        unsigned int nr_pages;
        struct page_list *pages;

        /*
         * Set this to ensure you are notified when the job has
         * completed.  'context' is for callback to use.
         */
        kcopyd_notify_fn fn;
        void *context;

        /*
         * These fields are only used if the job has been split
         * into more manageable parts.
         */
        struct semaphore lock;
        atomic_t sub_jobs;
        sector_t progress;
};

/* FIXME: this should scale with the number of pages */
#define MIN_JOBS 512

static kmem_cache_t *_job_cache;
static mempool_t *_job_pool;

/*
 * We maintain three lists of jobs:
 *
 * i)   jobs waiting for pages
 * ii)  jobs that have pages, and are waiting for the io to be issued.
 * iii) jobs that have completed.
 *
 * All three of these are protected by job_lock.
 */
static DEFINE_SPINLOCK(_job_lock);

static LIST_HEAD(_complete_jobs);
static LIST_HEAD(_io_jobs);
static LIST_HEAD(_pages_jobs);

static int jobs_init(void)
{
        _job_cache = kmem_cache_create("kcopyd-jobs",
                                       sizeof(struct kcopyd_job),
                                       __alignof__(struct kcopyd_job),
                                       0, NULL, NULL);
        if (!_job_cache)
                return -ENOMEM;

        _job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
        if (!_job_pool) {
                kmem_cache_destroy(_job_cache);
                return -ENOMEM;
        }

        return 0;
}

static void jobs_exit(void)
{
        BUG_ON(!list_empty(&_complete_jobs));
        BUG_ON(!list_empty(&_io_jobs));
        BUG_ON(!list_empty(&_pages_jobs));

        mempool_destroy(_job_pool);
        kmem_cache_destroy(_job_cache);
        _job_pool = NULL;
        _job_cache = NULL;
}

/*
 * Functions to push and pop a job onto the head of a given job
 * list.
 */
static inline struct kcopyd_job *pop(struct list_head *jobs)
{
        struct kcopyd_job *job = NULL;
        unsigned long flags;

        spin_lock_irqsave(&_job_lock, flags);

        if (!list_empty(jobs)) {
                job = list_entry(jobs->next, struct kcopyd_job, list);
                list_del(&job->list);
        }
        spin_unlock_irqrestore(&_job_lock, flags);

        return job;
}

static inline void push(struct list_head *jobs, struct kcopyd_job *job)
{
        unsigned long flags;

        spin_lock_irqsave(&_job_lock, flags);
        list_add_tail(&job->list, jobs);
        spin_unlock_irqrestore(&_job_lock, flags);
}

/*
 * These three functions process 1 item from the corresponding
 * job list.
 *
 * They return:
 * < 0: error
 *   0: success
 * > 0: can't process yet.
 */
static int run_complete_job(struct kcopyd_job *job)
{
        void *context = job->context;
        int read_err = job->read_err;
        unsigned int write_err = job->write_err;
        kcopyd_notify_fn fn = job->fn;
        struct kcopyd_client *kc = job->kc;

        kcopyd_put_pages(kc, job->pages);
        mempool_free(job, _job_pool);
        fn(read_err, write_err, context);

        if (atomic_dec_and_test(&kc->nr_jobs))
                wake_up(&kc->destroyq);

        return 0;
}

static void complete_io(unsigned long error, void *context)
{
        struct kcopyd_job *job = (struct kcopyd_job *) context;

        if (error) {
                if (job->rw == WRITE)
                        job->write_err |= error;
                else
                        job->read_err = 1;

                if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
                        push(&_complete_jobs, job);
                        wake();
                        return;
                }
        }

        if (job->rw == WRITE)
                push(&_complete_jobs, job);

        else {
                job->rw = WRITE;
                push(&_io_jobs, job);
        }

        wake();
}

/*
 * Request io on as many buffer heads as we can currently get for
 * a particular job.
 */
static int run_io_job(struct kcopyd_job *job)
{
        int r;

        if (job->rw == READ)
                r = dm_io_async(1, &job->source, job->rw,
                                job->pages,
                                job->offset, complete_io, job);

        else
                r = dm_io_async(job->num_dests, job->dests, job->rw,
                                job->pages,
                                job->offset, complete_io, job);

        return r;
}

static int run_pages_job(struct kcopyd_job *job)
{
        int r;

        job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
                                  PAGE_SIZE >> 9);
        r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
        if (!r) {
                /* this job is ready for io */
                push(&_io_jobs, job);
                return 0;
        }

        if (r == -ENOMEM)
                /* can't complete now */
                return 1;

        return r;
}

/*
 * Run through a list for as long as possible.  Returns the count
 * of successful jobs.
 */
static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
{
        struct kcopyd_job *job;
        int r, count = 0;

        while ((job = pop(jobs))) {

                r = fn(job);

                if (r < 0) {
                        /* error this rogue job */
                        if (job->rw == WRITE)
                                job->write_err = (unsigned int) -1;
                        else
                                job->read_err = 1;
                        push(&_complete_jobs, job);
                        break;
                }

                if (r > 0) {
                        /*
                         * We couldn't service this job ATM, so
                         * push this job back onto the list.
                         */
                        push(jobs, job);
                        break;
                }

                count++;
        }

        return count;
}

/*
 * kcopyd does this every time it's woken up.
 */
static void do_work(void *ignored)
{
        /*
         * The order that these are called is *very* important.
         * complete jobs can free some pages for pages jobs.
         * Pages jobs when successful will jump onto the io jobs
         * list.  io jobs call wake when they complete and it all
         * starts again.
         */
        process_jobs(&_complete_jobs, run_complete_job);
        process_jobs(&_pages_jobs, run_pages_job);
        process_jobs(&_io_jobs, run_io_job);
}

/*
 * If we are copying a small region we just dispatch a single job
 * to do the copy, otherwise the io has to be split up into many
 * jobs.
 */
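/*
 * Worked example (added for illustration): a 1 MiB copy is 2048
 * 512-byte sectors.  kcopyd_copy() sees source.count >= SUB_JOB_SIZE
 * (128 sectors, i.e. 64 KiB) and calls split_job(), which seeds
 * SPLIT_COUNT (8) sub-jobs via segment_complete().  Each sub-job that
 * finishes grabs the next 64 KiB chunk until job->progress reaches
 * job->source.count, so at most 8 * 64 KiB of the copy is in flight
 * at any time.
 */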
static void dispatch_job(struct kcopyd_job *job)
{
        atomic_inc(&job->kc->nr_jobs);
        push(&_pages_jobs, job);
        wake();
}

#define SUB_JOB_SIZE 128
static void segment_complete(int read_err,
                             unsigned int write_err, void *context)
{
        /* FIXME: tidy this function */
        sector_t progress = 0;
        sector_t count = 0;
        struct kcopyd_job *job = (struct kcopyd_job *) context;

        down(&job->lock);

        /* update the error */
        if (read_err)
                job->read_err = 1;

        if (write_err)
                job->write_err |= write_err;

        /*
         * Only dispatch more work if there hasn't been an error.
         */
        if ((!job->read_err && !job->write_err) ||
            test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
                /* get the next chunk of work */
                progress = job->progress;
                count = job->source.count - progress;
                if (count) {
                        if (count > SUB_JOB_SIZE)
                                count = SUB_JOB_SIZE;

                        job->progress += count;
                }
        }
        up(&job->lock);

        if (count) {
                int i;
                struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);

                *sub_job = *job;
                sub_job->source.sector += progress;
                sub_job->source.count = count;

                for (i = 0; i < job->num_dests; i++) {
                        sub_job->dests[i].sector += progress;
                        sub_job->dests[i].count = count;
                }

                sub_job->fn = segment_complete;
                sub_job->context = job;
                dispatch_job(sub_job);

        } else if (atomic_dec_and_test(&job->sub_jobs)) {

                /*
                 * To avoid a race we must keep the job around
                 * until after the notify function has completed.
                 * Otherwise the client may try and stop the job
                 * after we've completed.
                 */
                job->fn(read_err, write_err, job->context);
                mempool_free(job, _job_pool);
        }
}

/*
 * Create some little jobs that will do the move between
 * them.
 */
#define SPLIT_COUNT 8
static void split_job(struct kcopyd_job *job)
{
        int i;

        atomic_set(&job->sub_jobs, SPLIT_COUNT);
        for (i = 0; i < SPLIT_COUNT; i++)
                segment_complete(0, 0u, job);
}

int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
                unsigned int num_dests, struct io_region *dests,
                unsigned int flags, kcopyd_notify_fn fn, void *context)
{
        struct kcopyd_job *job;

        /*
         * Allocate a new job.
         */
        job = mempool_alloc(_job_pool, GFP_NOIO);

        /*
         * set up for the read.
         */
        job->kc = kc;
        job->flags = flags;
        job->read_err = 0;
        job->write_err = 0;
        job->rw = READ;

        job->source = *from;

        job->num_dests = num_dests;
        memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

        job->offset = 0;
        job->nr_pages = 0;
        job->pages = NULL;

        job->fn = fn;
        job->context = context;

        if (job->source.count < SUB_JOB_SIZE)
                dispatch_job(job);

        else {
                init_MUTEX(&job->lock);
                job->progress = 0;
                split_job(job);
        }

        return 0;
}

/*
 * Cancels a kcopyd job, eg. someone might be deactivating a
 * mirror.
 */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
        /* FIXME: finish */
        return -1;
}
#endif  /*  0  */

/*-----------------------------------------------------------------
 * Unit setup
 *---------------------------------------------------------------*/
static DEFINE_MUTEX(_client_lock);
static LIST_HEAD(_clients);

static void client_add(struct kcopyd_client *kc)
{
        mutex_lock(&_client_lock);
        list_add(&kc->list, &_clients);
        mutex_unlock(&_client_lock);
}

static void client_del(struct kcopyd_client *kc)
{
        mutex_lock(&_client_lock);
        list_del(&kc->list);
        mutex_unlock(&_client_lock);
}

static DEFINE_MUTEX(kcopyd_init_lock);
static int kcopyd_clients = 0;

static int kcopyd_init(void)
{
        int r;

        mutex_lock(&kcopyd_init_lock);

        if (kcopyd_clients) {
                /* Already initialized. */
                kcopyd_clients++;
                mutex_unlock(&kcopyd_init_lock);
                return 0;
        }

        r = jobs_init();
        if (r) {
                mutex_unlock(&kcopyd_init_lock);
                return r;
        }

        _kcopyd_wq = create_singlethread_workqueue("kcopyd");
        if (!_kcopyd_wq) {
                jobs_exit();
                mutex_unlock(&kcopyd_init_lock);
                return -ENOMEM;
        }

        kcopyd_clients++;
        INIT_WORK(&_kcopyd_work, do_work, NULL);
        mutex_unlock(&kcopyd_init_lock);
        return 0;
}

static void kcopyd_exit(void)
{
        mutex_lock(&kcopyd_init_lock);
        kcopyd_clients--;
        if (!kcopyd_clients) {
                jobs_exit();
                destroy_workqueue(_kcopyd_wq);
                _kcopyd_wq = NULL;
        }
        mutex_unlock(&kcopyd_init_lock);
}

int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
{
        int r = 0;
        struct kcopyd_client *kc;

        r = kcopyd_init();
        if (r)
                return r;

        kc = kmalloc(sizeof(*kc), GFP_KERNEL);
        if (!kc) {
                kcopyd_exit();
                return -ENOMEM;
        }

        spin_lock_init(&kc->lock);
        kc->pages = NULL;
        kc->nr_pages = kc->nr_free_pages = 0;
        r = client_alloc_pages(kc, nr_pages);
        if (r) {
                kfree(kc);
                kcopyd_exit();
                return r;
        }

        r = dm_io_get(nr_pages);
        if (r) {
                client_free_pages(kc);
                kfree(kc);
                kcopyd_exit();
                return r;
        }

        init_waitqueue_head(&kc->destroyq);
        atomic_set(&kc->nr_jobs, 0);

        client_add(kc);
        *result = kc;
        return 0;
}

void kcopyd_client_destroy(struct kcopyd_client *kc)
{
        /* Wait for completion of all jobs submitted by this client. */
        wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

        dm_io_put(kc->nr_pages);
        client_free_pages(kc);
        client_del(kc);
        kfree(kc);
        kcopyd_exit();
}

EXPORT_SYMBOL(kcopyd_client_create);
EXPORT_SYMBOL(kcopyd_client_destroy);
EXPORT_SYMBOL(kcopyd_copy);