ia64/linux-2.6.18-xen.hg

annotate lib/prio_tree.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive less pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
rev   line source
ian@0 1 /*
ian@0 2 * lib/prio_tree.c - priority search tree
ian@0 3 *
ian@0 4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
ian@0 5 *
ian@0 6 * This file is released under the GPL v2.
ian@0 7 *
ian@0 8 * Based on the radix priority search tree proposed by Edward M. McCreight
ian@0 9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
ian@0 10 *
ian@0 11 * 02Feb2004 Initial version
ian@0 12 */
ian@0 13
ian@0 14 #include <linux/init.h>
ian@0 15 #include <linux/mm.h>
ian@0 16 #include <linux/prio_tree.h>
ian@0 17
ian@0 18 /*
ian@0 19 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
ian@0 20 * which is useful for storing intervals, e.g, we can consider a vma as a closed
ian@0 21 * interval of file pages [offset_begin, offset_end], and store all vmas that
ian@0 22 * map a file in a PST. Then, using the PST, we can answer a stabbing query,
ian@0 23 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
ian@0 24 * given input interval X (a set of consecutive file pages), in "O(log n + m)"
ian@0 25 * time where 'log n' is the height of the PST, and 'm' is the number of stored
ian@0 26 * intervals (vmas) that overlap (map) with the input interval X (the set of
ian@0 27 * consecutive file pages).
ian@0 28 *
ian@0 29 * In our implementation, we store closed intervals of the form [radix_index,
ian@0 30 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
ian@0 31 * is designed for storing intervals with unique radix indices, i.e., each
ian@0 32 * interval have different radix_index. However, this limitation can be easily
ian@0 33 * overcome by using the size, i.e., heap_index - radix_index, as part of the
ian@0 34 * index, so we index the tree using [(radix_index,size), heap_index].
ian@0 35 *
ian@0 36 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
ian@0 37 * machine, the maximum height of a PST can be 64. We can use a balanced version
ian@0 38 * of the priority search tree to optimize the tree height, but the balanced
ian@0 39 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
ian@0 40 */
ian@0 41
ian@0 42 /*
ian@0 43 * The following macros are used for implementing prio_tree for i_mmap
ian@0 44 */
ian@0 45
ian@0 46 #define RADIX_INDEX(vma) ((vma)->vm_pgoff)
ian@0 47 #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
ian@0 48 /* avoid overflow */
ian@0 49 #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
ian@0 50
ian@0 51
ian@0 52 static void get_index(const struct prio_tree_root *root,
ian@0 53 const struct prio_tree_node *node,
ian@0 54 unsigned long *radix, unsigned long *heap)
ian@0 55 {
ian@0 56 if (root->raw) {
ian@0 57 struct vm_area_struct *vma = prio_tree_entry(
ian@0 58 node, struct vm_area_struct, shared.prio_tree_node);
ian@0 59
ian@0 60 *radix = RADIX_INDEX(vma);
ian@0 61 *heap = HEAP_INDEX(vma);
ian@0 62 }
ian@0 63 else {
ian@0 64 *radix = node->start;
ian@0 65 *heap = node->last;
ian@0 66 }
ian@0 67 }
ian@0 68
ian@0 69 static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
ian@0 70
ian@0 71 void __init prio_tree_init(void)
ian@0 72 {
ian@0 73 unsigned int i;
ian@0 74
ian@0 75 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
ian@0 76 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
ian@0 77 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
ian@0 78 }
ian@0 79
ian@0 80 /*
ian@0 81 * Maximum heap_index that can be stored in a PST with index_bits bits
ian@0 82 */
ian@0 83 static inline unsigned long prio_tree_maxindex(unsigned int bits)
ian@0 84 {
ian@0 85 return index_bits_to_maxindex[bits - 1];
ian@0 86 }
ian@0 87
ian@0 88 /*
ian@0 89 * Extend a priority search tree so that it can store a node with heap_index
ian@0 90 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
ian@0 91 * However, this function is used rarely and the common case performance is
ian@0 92 * not bad.
ian@0 93 */
ian@0 94 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
ian@0 95 struct prio_tree_node *node, unsigned long max_heap_index)
ian@0 96 {
ian@0 97 struct prio_tree_node *first = NULL, *prev, *last = NULL;
ian@0 98
ian@0 99 if (max_heap_index > prio_tree_maxindex(root->index_bits))
ian@0 100 root->index_bits++;
ian@0 101
ian@0 102 while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
ian@0 103 root->index_bits++;
ian@0 104
ian@0 105 if (prio_tree_empty(root))
ian@0 106 continue;
ian@0 107
ian@0 108 if (first == NULL) {
ian@0 109 first = root->prio_tree_node;
ian@0 110 prio_tree_remove(root, root->prio_tree_node);
ian@0 111 INIT_PRIO_TREE_NODE(first);
ian@0 112 last = first;
ian@0 113 } else {
ian@0 114 prev = last;
ian@0 115 last = root->prio_tree_node;
ian@0 116 prio_tree_remove(root, root->prio_tree_node);
ian@0 117 INIT_PRIO_TREE_NODE(last);
ian@0 118 prev->left = last;
ian@0 119 last->parent = prev;
ian@0 120 }
ian@0 121 }
ian@0 122
ian@0 123 INIT_PRIO_TREE_NODE(node);
ian@0 124
ian@0 125 if (first) {
ian@0 126 node->left = first;
ian@0 127 first->parent = node;
ian@0 128 } else
ian@0 129 last = node;
ian@0 130
ian@0 131 if (!prio_tree_empty(root)) {
ian@0 132 last->left = root->prio_tree_node;
ian@0 133 last->left->parent = last;
ian@0 134 }
ian@0 135
ian@0 136 root->prio_tree_node = node;
ian@0 137 return node;
ian@0 138 }
ian@0 139
ian@0 140 /*
ian@0 141 * Replace a prio_tree_node with a new node and return the old node
ian@0 142 */
ian@0 143 struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
ian@0 144 struct prio_tree_node *old, struct prio_tree_node *node)
ian@0 145 {
ian@0 146 INIT_PRIO_TREE_NODE(node);
ian@0 147
ian@0 148 if (prio_tree_root(old)) {
ian@0 149 BUG_ON(root->prio_tree_node != old);
ian@0 150 /*
ian@0 151 * We can reduce root->index_bits here. However, it is complex
ian@0 152 * and does not help much to improve performance (IMO).
ian@0 153 */
ian@0 154 node->parent = node;
ian@0 155 root->prio_tree_node = node;
ian@0 156 } else {
ian@0 157 node->parent = old->parent;
ian@0 158 if (old->parent->left == old)
ian@0 159 old->parent->left = node;
ian@0 160 else
ian@0 161 old->parent->right = node;
ian@0 162 }
ian@0 163
ian@0 164 if (!prio_tree_left_empty(old)) {
ian@0 165 node->left = old->left;
ian@0 166 old->left->parent = node;
ian@0 167 }
ian@0 168
ian@0 169 if (!prio_tree_right_empty(old)) {
ian@0 170 node->right = old->right;
ian@0 171 old->right->parent = node;
ian@0 172 }
ian@0 173
ian@0 174 return old;
ian@0 175 }
ian@0 176
ian@0 177 /*
ian@0 178 * Insert a prio_tree_node @node into a radix priority search tree @root. The
ian@0 179 * algorithm typically takes O(log n) time where 'log n' is the number of bits
ian@0 180 * required to represent the maximum heap_index. In the worst case, the algo
ian@0 181 * can take O((log n)^2) - check prio_tree_expand.
ian@0 182 *
ian@0 183 * If a prior node with same radix_index and heap_index is already found in
ian@0 184 * the tree, then returns the address of the prior node. Otherwise, inserts
ian@0 185 * @node into the tree and returns @node.
ian@0 186 */
ian@0 187 struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
ian@0 188 struct prio_tree_node *node)
ian@0 189 {
ian@0 190 struct prio_tree_node *cur, *res = node;
ian@0 191 unsigned long radix_index, heap_index;
ian@0 192 unsigned long r_index, h_index, index, mask;
ian@0 193 int size_flag = 0;
ian@0 194
ian@0 195 get_index(root, node, &radix_index, &heap_index);
ian@0 196
ian@0 197 if (prio_tree_empty(root) ||
ian@0 198 heap_index > prio_tree_maxindex(root->index_bits))
ian@0 199 return prio_tree_expand(root, node, heap_index);
ian@0 200
ian@0 201 cur = root->prio_tree_node;
ian@0 202 mask = 1UL << (root->index_bits - 1);
ian@0 203
ian@0 204 while (mask) {
ian@0 205 get_index(root, cur, &r_index, &h_index);
ian@0 206
ian@0 207 if (r_index == radix_index && h_index == heap_index)
ian@0 208 return cur;
ian@0 209
ian@0 210 if (h_index < heap_index ||
ian@0 211 (h_index == heap_index && r_index > radix_index)) {
ian@0 212 struct prio_tree_node *tmp = node;
ian@0 213 node = prio_tree_replace(root, cur, node);
ian@0 214 cur = tmp;
ian@0 215 /* swap indices */
ian@0 216 index = r_index;
ian@0 217 r_index = radix_index;
ian@0 218 radix_index = index;
ian@0 219 index = h_index;
ian@0 220 h_index = heap_index;
ian@0 221 heap_index = index;
ian@0 222 }
ian@0 223
ian@0 224 if (size_flag)
ian@0 225 index = heap_index - radix_index;
ian@0 226 else
ian@0 227 index = radix_index;
ian@0 228
ian@0 229 if (index & mask) {
ian@0 230 if (prio_tree_right_empty(cur)) {
ian@0 231 INIT_PRIO_TREE_NODE(node);
ian@0 232 cur->right = node;
ian@0 233 node->parent = cur;
ian@0 234 return res;
ian@0 235 } else
ian@0 236 cur = cur->right;
ian@0 237 } else {
ian@0 238 if (prio_tree_left_empty(cur)) {
ian@0 239 INIT_PRIO_TREE_NODE(node);
ian@0 240 cur->left = node;
ian@0 241 node->parent = cur;
ian@0 242 return res;
ian@0 243 } else
ian@0 244 cur = cur->left;
ian@0 245 }
ian@0 246
ian@0 247 mask >>= 1;
ian@0 248
ian@0 249 if (!mask) {
ian@0 250 mask = 1UL << (BITS_PER_LONG - 1);
ian@0 251 size_flag = 1;
ian@0 252 }
ian@0 253 }
ian@0 254 /* Should not reach here */
ian@0 255 BUG();
ian@0 256 return NULL;
ian@0 257 }
ian@0 258
ian@0 259 /*
ian@0 260 * Remove a prio_tree_node @node from a radix priority search tree @root. The
ian@0 261 * algorithm takes O(log n) time where 'log n' is the number of bits required
ian@0 262 * to represent the maximum heap_index.
ian@0 263 */
ian@0 264 void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
ian@0 265 {
ian@0 266 struct prio_tree_node *cur;
ian@0 267 unsigned long r_index, h_index_right, h_index_left;
ian@0 268
ian@0 269 cur = node;
ian@0 270
ian@0 271 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
ian@0 272 if (!prio_tree_left_empty(cur))
ian@0 273 get_index(root, cur->left, &r_index, &h_index_left);
ian@0 274 else {
ian@0 275 cur = cur->right;
ian@0 276 continue;
ian@0 277 }
ian@0 278
ian@0 279 if (!prio_tree_right_empty(cur))
ian@0 280 get_index(root, cur->right, &r_index, &h_index_right);
ian@0 281 else {
ian@0 282 cur = cur->left;
ian@0 283 continue;
ian@0 284 }
ian@0 285
ian@0 286 /* both h_index_left and h_index_right cannot be 0 */
ian@0 287 if (h_index_left >= h_index_right)
ian@0 288 cur = cur->left;
ian@0 289 else
ian@0 290 cur = cur->right;
ian@0 291 }
ian@0 292
ian@0 293 if (prio_tree_root(cur)) {
ian@0 294 BUG_ON(root->prio_tree_node != cur);
ian@0 295 __INIT_PRIO_TREE_ROOT(root, root->raw);
ian@0 296 return;
ian@0 297 }
ian@0 298
ian@0 299 if (cur->parent->right == cur)
ian@0 300 cur->parent->right = cur->parent;
ian@0 301 else
ian@0 302 cur->parent->left = cur->parent;
ian@0 303
ian@0 304 while (cur != node)
ian@0 305 cur = prio_tree_replace(root, cur->parent, cur);
ian@0 306 }
ian@0 307
ian@0 308 /*
ian@0 309 * Following functions help to enumerate all prio_tree_nodes in the tree that
ian@0 310 * overlap with the input interval X [radix_index, heap_index]. The enumeration
ian@0 311 * takes O(log n + m) time where 'log n' is the height of the tree (which is
ian@0 312 * proportional to # of bits required to represent the maximum heap_index) and
ian@0 313 * 'm' is the number of prio_tree_nodes that overlap the interval X.
ian@0 314 */
ian@0 315
ian@0 316 static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
ian@0 317 unsigned long *r_index, unsigned long *h_index)
ian@0 318 {
ian@0 319 if (prio_tree_left_empty(iter->cur))
ian@0 320 return NULL;
ian@0 321
ian@0 322 get_index(iter->root, iter->cur->left, r_index, h_index);
ian@0 323
ian@0 324 if (iter->r_index <= *h_index) {
ian@0 325 iter->cur = iter->cur->left;
ian@0 326 iter->mask >>= 1;
ian@0 327 if (iter->mask) {
ian@0 328 if (iter->size_level)
ian@0 329 iter->size_level++;
ian@0 330 } else {
ian@0 331 if (iter->size_level) {
ian@0 332 BUG_ON(!prio_tree_left_empty(iter->cur));
ian@0 333 BUG_ON(!prio_tree_right_empty(iter->cur));
ian@0 334 iter->size_level++;
ian@0 335 iter->mask = ULONG_MAX;
ian@0 336 } else {
ian@0 337 iter->size_level = 1;
ian@0 338 iter->mask = 1UL << (BITS_PER_LONG - 1);
ian@0 339 }
ian@0 340 }
ian@0 341 return iter->cur;
ian@0 342 }
ian@0 343
ian@0 344 return NULL;
ian@0 345 }
ian@0 346
ian@0 347 static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
ian@0 348 unsigned long *r_index, unsigned long *h_index)
ian@0 349 {
ian@0 350 unsigned long value;
ian@0 351
ian@0 352 if (prio_tree_right_empty(iter->cur))
ian@0 353 return NULL;
ian@0 354
ian@0 355 if (iter->size_level)
ian@0 356 value = iter->value;
ian@0 357 else
ian@0 358 value = iter->value | iter->mask;
ian@0 359
ian@0 360 if (iter->h_index < value)
ian@0 361 return NULL;
ian@0 362
ian@0 363 get_index(iter->root, iter->cur->right, r_index, h_index);
ian@0 364
ian@0 365 if (iter->r_index <= *h_index) {
ian@0 366 iter->cur = iter->cur->right;
ian@0 367 iter->mask >>= 1;
ian@0 368 iter->value = value;
ian@0 369 if (iter->mask) {
ian@0 370 if (iter->size_level)
ian@0 371 iter->size_level++;
ian@0 372 } else {
ian@0 373 if (iter->size_level) {
ian@0 374 BUG_ON(!prio_tree_left_empty(iter->cur));
ian@0 375 BUG_ON(!prio_tree_right_empty(iter->cur));
ian@0 376 iter->size_level++;
ian@0 377 iter->mask = ULONG_MAX;
ian@0 378 } else {
ian@0 379 iter->size_level = 1;
ian@0 380 iter->mask = 1UL << (BITS_PER_LONG - 1);
ian@0 381 }
ian@0 382 }
ian@0 383 return iter->cur;
ian@0 384 }
ian@0 385
ian@0 386 return NULL;
ian@0 387 }
ian@0 388
ian@0 389 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
ian@0 390 {
ian@0 391 iter->cur = iter->cur->parent;
ian@0 392 if (iter->mask == ULONG_MAX)
ian@0 393 iter->mask = 1UL;
ian@0 394 else if (iter->size_level == 1)
ian@0 395 iter->mask = 1UL;
ian@0 396 else
ian@0 397 iter->mask <<= 1;
ian@0 398 if (iter->size_level)
ian@0 399 iter->size_level--;
ian@0 400 if (!iter->size_level && (iter->value & iter->mask))
ian@0 401 iter->value ^= iter->mask;
ian@0 402 return iter->cur;
ian@0 403 }
ian@0 404
ian@0 405 static inline int overlap(struct prio_tree_iter *iter,
ian@0 406 unsigned long r_index, unsigned long h_index)
ian@0 407 {
ian@0 408 return iter->h_index >= r_index && iter->r_index <= h_index;
ian@0 409 }
ian@0 410
ian@0 411 /*
ian@0 412 * prio_tree_first:
ian@0 413 *
ian@0 414 * Get the first prio_tree_node that overlaps with the interval [radix_index,
ian@0 415 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
ian@0 416 * traversal of the tree.
ian@0 417 */
ian@0 418 static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
ian@0 419 {
ian@0 420 struct prio_tree_root *root;
ian@0 421 unsigned long r_index, h_index;
ian@0 422
ian@0 423 INIT_PRIO_TREE_ITER(iter);
ian@0 424
ian@0 425 root = iter->root;
ian@0 426 if (prio_tree_empty(root))
ian@0 427 return NULL;
ian@0 428
ian@0 429 get_index(root, root->prio_tree_node, &r_index, &h_index);
ian@0 430
ian@0 431 if (iter->r_index > h_index)
ian@0 432 return NULL;
ian@0 433
ian@0 434 iter->mask = 1UL << (root->index_bits - 1);
ian@0 435 iter->cur = root->prio_tree_node;
ian@0 436
ian@0 437 while (1) {
ian@0 438 if (overlap(iter, r_index, h_index))
ian@0 439 return iter->cur;
ian@0 440
ian@0 441 if (prio_tree_left(iter, &r_index, &h_index))
ian@0 442 continue;
ian@0 443
ian@0 444 if (prio_tree_right(iter, &r_index, &h_index))
ian@0 445 continue;
ian@0 446
ian@0 447 break;
ian@0 448 }
ian@0 449 return NULL;
ian@0 450 }
ian@0 451
ian@0 452 /*
ian@0 453 * prio_tree_next:
ian@0 454 *
ian@0 455 * Get the next prio_tree_node that overlaps with the input interval in iter
ian@0 456 */
ian@0 457 struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
ian@0 458 {
ian@0 459 unsigned long r_index, h_index;
ian@0 460
ian@0 461 if (iter->cur == NULL)
ian@0 462 return prio_tree_first(iter);
ian@0 463
ian@0 464 repeat:
ian@0 465 while (prio_tree_left(iter, &r_index, &h_index))
ian@0 466 if (overlap(iter, r_index, h_index))
ian@0 467 return iter->cur;
ian@0 468
ian@0 469 while (!prio_tree_right(iter, &r_index, &h_index)) {
ian@0 470 while (!prio_tree_root(iter->cur) &&
ian@0 471 iter->cur->parent->right == iter->cur)
ian@0 472 prio_tree_parent(iter);
ian@0 473
ian@0 474 if (prio_tree_root(iter->cur))
ian@0 475 return NULL;
ian@0 476
ian@0 477 prio_tree_parent(iter);
ian@0 478 }
ian@0 479
ian@0 480 if (overlap(iter, r_index, h_index))
ian@0 481 return iter->cur;
ian@0 482
ian@0 483 goto repeat;
ian@0 484 }