ia64/linux-2.6.18-xen.hg

view drivers/md/dm-emc.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 *
7 * Multipath support for EMC CLARiiON AX/CX-series hardware.
8 */
10 #include "dm.h"
11 #include "dm-hw-handler.h"
12 #include <scsi/scsi.h>
13 #include <scsi/scsi_cmnd.h>
/*
 * Message prefix for this module; presumably picked up by the
 * DMERR/DMWARN/DMINFO logging macros (defined outside this file).
 */
#define DM_MSG_PREFIX "multipath emc"
17 struct emc_handler {
18 spinlock_t lock;
20 /* Whether we should send the short trespass command (FC-series)
21 * or the long version (default for AX/CX CLARiiON arrays). */
22 unsigned short_trespass;
23 /* Whether or not to honor SCSI reservations when initiating a
24 * switch-over. Default: Don't. */
25 unsigned hr;
27 unsigned char sense[SCSI_SENSE_BUFFERSIZE];
28 };
/* Page code written into the trespass payload sent via MODE SELECT. */
#define TRESPASS_PAGE		0x22
/* Timeout assigned to the failover request: 60 * HZ ticks. */
#define EMC_FAILOVER_TIMEOUT	(60 * HZ)
33 /* Code borrowed from dm-lsi-rdac by Mike Christie */
35 static inline void free_bio(struct bio *bio)
36 {
37 __free_page(bio->bi_io_vec[0].bv_page);
38 bio_put(bio);
39 }
41 static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
42 {
43 struct path *path = bio->bi_private;
45 if (bio->bi_size)
46 return 1;
48 /* We also need to look at the sense keys here whether or not to
49 * switch to the next PG etc.
50 *
51 * For now simple logic: either it works or it doesn't.
52 */
53 if (error)
54 dm_pg_init_complete(path, MP_FAIL_PATH);
55 else
56 dm_pg_init_complete(path, 0);
58 /* request is freed in block layer */
59 free_bio(bio);
61 return 0;
62 }
64 static struct bio *get_failover_bio(struct path *path, unsigned data_size)
65 {
66 struct bio *bio;
67 struct page *page;
69 bio = bio_alloc(GFP_ATOMIC, 1);
70 if (!bio) {
71 DMERR("get_failover_bio: bio_alloc() failed.");
72 return NULL;
73 }
75 bio->bi_rw |= (1 << BIO_RW);
76 bio->bi_bdev = path->dev->bdev;
77 bio->bi_sector = 0;
78 bio->bi_private = path;
79 bio->bi_end_io = emc_endio;
81 page = alloc_page(GFP_ATOMIC);
82 if (!page) {
83 DMERR("get_failover_bio: alloc_page() failed.");
84 bio_put(bio);
85 return NULL;
86 }
88 if (bio_add_page(bio, page, data_size, 0) != data_size) {
89 DMERR("get_failover_bio: alloc_page() failed.");
90 __free_page(page);
91 bio_put(bio);
92 return NULL;
93 }
95 return bio;
96 }
98 static struct request *get_failover_req(struct emc_handler *h,
99 struct bio *bio, struct path *path)
100 {
101 struct request *rq;
102 struct block_device *bdev = bio->bi_bdev;
103 struct request_queue *q = bdev_get_queue(bdev);
105 /* FIXME: Figure out why it fails with GFP_ATOMIC. */
106 rq = blk_get_request(q, WRITE, __GFP_WAIT);
107 if (!rq) {
108 DMERR("get_failover_req: blk_get_request failed");
109 return NULL;
110 }
112 rq->bio = rq->biotail = bio;
113 blk_rq_bio_prep(q, rq, bio);
115 rq->rq_disk = bdev->bd_contains->bd_disk;
117 /* bio backed don't set data */
118 rq->buffer = rq->data = NULL;
119 /* rq data_len used for pc cmd's request_bufflen */
120 rq->data_len = bio->bi_size;
122 rq->sense = h->sense;
123 memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
124 rq->sense_len = 0;
126 memset(&rq->cmd, 0, BLK_MAX_CDB);
128 rq->timeout = EMC_FAILOVER_TIMEOUT;
129 rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE);
131 return rq;
132 }
134 static struct request *emc_trespass_get(struct emc_handler *h,
135 struct path *path)
136 {
137 struct bio *bio;
138 struct request *rq;
139 unsigned char *page22;
140 unsigned char long_trespass_pg[] = {
141 0, 0, 0, 0,
142 TRESPASS_PAGE, /* Page code */
143 0x09, /* Page length - 2 */
144 h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
145 0xff, 0xff, /* Trespass target */
146 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */
147 };
148 unsigned char short_trespass_pg[] = {
149 0, 0, 0, 0,
150 TRESPASS_PAGE, /* Page code */
151 0x02, /* Page length - 2 */
152 h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
153 0xff, /* Trespass target */
154 };
155 unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) :
156 sizeof(long_trespass_pg);
158 /* get bio backing */
159 if (data_size > PAGE_SIZE)
160 /* this should never happen */
161 return NULL;
163 bio = get_failover_bio(path, data_size);
164 if (!bio) {
165 DMERR("emc_trespass_get: no bio");
166 return NULL;
167 }
169 page22 = (unsigned char *)bio_data(bio);
170 memset(page22, 0, data_size);
172 memcpy(page22, h->short_trespass ?
173 short_trespass_pg : long_trespass_pg, data_size);
175 /* get request for block layer packet command */
176 rq = get_failover_req(h, bio, path);
177 if (!rq) {
178 DMERR("emc_trespass_get: no rq");
179 free_bio(bio);
180 return NULL;
181 }
183 /* Prepare the command. */
184 rq->cmd[0] = MODE_SELECT;
185 rq->cmd[1] = 0x10;
186 rq->cmd[4] = data_size;
187 rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
189 return rq;
190 }
192 static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
193 struct path *path)
194 {
195 struct request *rq;
196 struct request_queue *q = bdev_get_queue(path->dev->bdev);
198 /*
199 * We can either blindly init the pg (then look at the sense),
200 * or we can send some commands to get the state here (then
201 * possibly send the fo cmnd), or we can also have the
202 * initial state passed into us and then get an update here.
203 */
204 if (!q) {
205 DMINFO("emc_pg_init: no queue");
206 goto fail_path;
207 }
209 /* FIXME: The request should be pre-allocated. */
210 rq = emc_trespass_get(hwh->context, path);
211 if (!rq) {
212 DMERR("emc_pg_init: no rq");
213 goto fail_path;
214 }
216 DMINFO("emc_pg_init: sending switch-over command");
217 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
218 return;
220 fail_path:
221 dm_pg_init_complete(path, MP_FAIL_PATH);
222 }
224 static struct emc_handler *alloc_emc_handler(void)
225 {
226 struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL);
228 if (h) {
229 memset(h, 0, sizeof(*h));
230 spin_lock_init(&h->lock);
231 }
233 return h;
234 }
236 static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
237 {
238 struct emc_handler *h;
239 unsigned hr, short_trespass;
241 if (argc == 0) {
242 /* No arguments: use defaults */
243 hr = 0;
244 short_trespass = 0;
245 } else if (argc != 2) {
246 DMWARN("incorrect number of arguments");
247 return -EINVAL;
248 } else {
249 if ((sscanf(argv[0], "%u", &short_trespass) != 1)
250 || (short_trespass > 1)) {
251 DMWARN("invalid trespass mode selected");
252 return -EINVAL;
253 }
255 if ((sscanf(argv[1], "%u", &hr) != 1)
256 || (hr > 1)) {
257 DMWARN("invalid honor reservation flag selected");
258 return -EINVAL;
259 }
260 }
262 h = alloc_emc_handler();
263 if (!h)
264 return -ENOMEM;
266 hwh->context = h;
268 if ((h->short_trespass = short_trespass))
269 DMWARN("short trespass command will be send");
270 else
271 DMWARN("long trespass command will be send");
273 if ((h->hr = hr))
274 DMWARN("honor reservation bit will be set");
275 else
276 DMWARN("honor reservation bit will not be set (default)");
278 return 0;
279 }
281 static void emc_destroy(struct hw_handler *hwh)
282 {
283 struct emc_handler *h = (struct emc_handler *) hwh->context;
285 kfree(h);
286 hwh->context = NULL;
287 }
/*
 * error hook: decide how to react to a failed bio.
 *
 * The EMC-specific sense decoding below is compiled out, so every
 * error is currently passed to dm_scsi_err_handler().
 */
static unsigned emc_error(struct hw_handler *hwh, struct bio *bio)
{
	/* FIXME: Patch from axboe still missing */
#if 0
	int sense;

	if (bio->bi_error & BIO_SENSE) {
		sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */

		switch (sense) {
		case 0x020403:
			/* LUN Not Ready - Manual Intervention Required
			 * indicates this is a passive path.
			 *
			 * FIXME: However, if this is seen and EVPD C0
			 * indicates that this is due to a NDU in
			 * progress, we should set FAIL_PATH too.
			 * This indicates we might have to do a SCSI
			 * inquiry in the end_io path. Ugh. */
			return MP_BYPASS_PG | MP_RETRY_IO;
		case 0x052501:
			/* An array based copy is in progress. Do not
			 * fail the path, do not bypass to another PG,
			 * do not retry. Fail the IO immediately.
			 * (Actually this is the same conclusion as in
			 * the default handler, but lets make sure.) */
			return 0;
		case 0x062900:
			/* Unit Attention Code. This is the first IO
			 * to the new path, so just retry. */
			return MP_RETRY_IO;
		default:
			break;
		}
	}
#endif

	/* Try default handler */
	return dm_scsi_err_handler(hwh, bio);
}
327 static struct hw_handler_type emc_hwh = {
328 .name = "emc",
329 .module = THIS_MODULE,
330 .create = emc_create,
331 .destroy = emc_destroy,
332 .pg_init = emc_pg_init,
333 .error = emc_error,
334 };
336 static int __init dm_emc_init(void)
337 {
338 int r = dm_register_hw_handler(&emc_hwh);
340 if (r < 0)
341 DMERR("register failed %d", r);
343 DMINFO("version 0.0.3 loaded");
345 return r;
346 }
348 static void __exit dm_emc_exit(void)
349 {
350 int r = dm_unregister_hw_handler(&emc_hwh);
352 if (r < 0)
353 DMERR("unregister failed %d", r);
354 }
356 module_init(dm_emc_init);
357 module_exit(dm_emc_exit);
359 MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath");
360 MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>");
361 MODULE_LICENSE("GPL");