ia64/linux-2.6.18-xen.hg

view drivers/pci/iov.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive less pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 104b6ae6b257
children
line source
1 /*
2 * drivers/pci/iov.c
3 *
4 * Copyright (C) 2009 Intel Corporation, Yu Zhao <yu.zhao@intel.com>
5 *
6 * PCI Express I/O Virtualization (IOV) support.
7 * Single Root IOV 1.0
8 */
10 #include <linux/pci.h>
11 #include <linux/mutex.h>
12 #include <linux/string.h>
13 #include <linux/delay.h>
14 #include "pci.h"
16 #define VIRTFN_ID_LEN 16
18 static inline u8 virtfn_bus(struct pci_dev *dev, int id)
19 {
20 return dev->bus->number + ((dev->devfn + dev->sriov->offset +
21 dev->sriov->stride * id) >> 8);
22 }
24 static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
25 {
26 return (dev->devfn + dev->sriov->offset +
27 dev->sriov->stride * id) & 0xff;
28 }
30 static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
31 {
32 struct pci_bus *child;
34 if (bus->number == busnr)
35 return bus;
37 child = pci_find_bus(pci_domain_nr(bus), busnr);
38 if (child)
39 return child;
41 child = pci_add_new_bus(bus, NULL, busnr);
42 if (!child)
43 return NULL;
45 child->subordinate = busnr;
47 return child;
48 }
50 static void virtfn_remove_bus(struct pci_bus *bus, int busnr)
51 {
52 struct pci_bus *child;
54 if (bus->number == busnr)
55 return;
57 child = pci_find_bus(pci_domain_nr(bus), busnr);
58 BUG_ON(!child);
60 if (list_empty(&child->devices))
61 pci_remove_bus(child);
62 }
/*
 * Create and register the VF with index @id under physical function
 * @dev: allocate a pci_dev, place it on the right (possibly new) bus,
 * carve its BARs out of the PF's IOV resources, add it to the device
 * tree and cross-link it with the PF via sysfs.
 * Returns 0 on success or a negative errno.
 */
static int virtfn_add(struct pci_dev *dev, int id)
{
	int i;
	int rc;
	u64 size;
	char buf[VIRTFN_ID_LEN];
	struct pci_dev *virtfn;
	struct resource *res;
	struct pci_sriov *iov = dev->sriov;

	virtfn = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
	if (!virtfn)
		return -ENOMEM;

	/* Bus topology changes are serialized by the owning PF's lock
	 * (iov->dev is the lock-owning PF, see sriov_init()). */
	mutex_lock(&iov->dev->sriov->lock);
	virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
	if (!virtfn->bus) {
		kfree(virtfn);
		mutex_unlock(&iov->dev->sriov->lock);
		return -ENOMEM;
	}
	virtfn->devfn = virtfn_devfn(dev, id);
	virtfn->vendor = dev->vendor;
	/* The VF device ID comes from the PF's SR-IOV capability. */
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
	pci_setup_device(virtfn);
	virtfn->dev.parent = dev->dev.parent;

	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = dev->resource + PCI_IOV_RESOURCES + i;
		/* Skip IOV BARs that were never claimed. */
		if (!res->parent)
			continue;
		virtfn->resource[i].name = pci_name(virtfn);
		virtfn->resource[i].flags = res->flags;
		/* Each IOV BAR is divided evenly among all possible VFs;
		 * this VF gets the @id-th slice. */
		size = res->end - res->start + 1;
		do_div(size, iov->total);
		virtfn->resource[i].start = res->start + size * id;
		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
		rc = request_resource(res, &virtfn->resource[i]);
		BUG_ON(rc);
	}

	pci_device_add(virtfn, virtfn->bus);
	mutex_unlock(&iov->dev->sriov->lock);

	/* Hold a reference on the PF for the VF's lifetime;
	 * dropped in virtfn_remove() (or on the failure path below). */
	virtfn->physfn = pci_dev_get(dev);
	virtfn->is_virtfn = 1;

	pci_bus_add_device(virtfn);
	sprintf(buf, "virtfn%u", id);
	rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
	if (rc)
		goto failed1;
	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
	if (rc)
		goto failed2;

	kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);

	return 0;

failed2:
	sysfs_remove_link(&dev->dev.kobj, buf);
failed1:
	pci_dev_put(dev);
	mutex_lock(&iov->dev->sriov->lock);
	pci_remove_bus_device(virtfn);
	virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
	mutex_unlock(&iov->dev->sriov->lock);

	return rc;
}
/*
 * Undo virtfn_add() for VF index @id: remove the sysfs links, the VF's
 * pci_dev and (if it became empty) its bus, then drop the reference
 * held on the PF.  Silently returns when the VF cannot be found.
 */
static void virtfn_remove(struct pci_dev *dev, int id)
{
	char buf[VIRTFN_ID_LEN];
	struct pci_bus *bus;
	struct pci_dev *virtfn;
	struct pci_sriov *iov = dev->sriov;

	bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id));
	if (!bus)
		return;

	virtfn = pci_get_slot(bus, virtfn_devfn(dev, id));
	if (!virtfn)
		return;

	/* Drop the pci_get_slot() reference up front; presumably the
	 * device-tree reference keeps virtfn alive until
	 * pci_remove_bus_device() below — NOTE(review): confirm. */
	pci_dev_put(virtfn);

	sprintf(buf, "virtfn%u", id);
	sysfs_remove_link(&dev->dev.kobj, buf);
	sysfs_remove_link(&virtfn->dev.kobj, "physfn");

	/* Same lock that virtfn_add() takes for topology changes. */
	mutex_lock(&iov->dev->sriov->lock);
	pci_remove_bus_device(virtfn);
	virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
	mutex_unlock(&iov->dev->sriov->lock);

	/* Balance the pci_dev_get(dev) taken in virtfn_add(). */
	pci_dev_put(dev);
}
/*
 * Enable SR-IOV on @dev and instantiate @nr_virtfn Virtual Functions.
 * Validates @nr_virtfn against InitialVFs/TotalVFs, checks IOV BAR and
 * bus-number availability, programs NumVFs and VF Enable / Memory Space
 * Enable, then adds the VF devices.  On any VF-add failure everything
 * is rolled back.  Returns 0 on success or a negative errno.
 */
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
	int rc;
	int i, j;
	int nres;
	u16 offset, stride, initial;
	struct resource *res;
	struct pci_dev *pdev;
	struct pci_sriov *iov = dev->sriov;

	if (!nr_virtfn)
		return 0;

	/* Already enabled: caller must disable first. */
	if (iov->nr_virtfn)
		return -EINVAL;

	pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
	/* Without VF Migration (VFM), InitialVFs must equal TotalVFs. */
	if (initial > iov->total ||
	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total)))
		return -EIO;

	if (nr_virtfn < 0 || nr_virtfn > iov->total ||
	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
		return -EINVAL;

	/* VF Offset/Stride are read back after writing NumVFs because
	 * their values may depend on it. */
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
	if (!offset || (nr_virtfn > 1 && !stride))
		return -EIO;

	/* Every IOV BAR recorded at init time must have been claimed
	 * during resource assignment. */
	nres = 0;
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = dev->resource + PCI_IOV_RESOURCES + i;
		if (res->parent)
			nres++;
	}
	if (nres != iov->nres) {
		dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n");
		return -ENOMEM;
	}

	iov->offset = offset;
	iov->stride = stride;

	/* The last VF's bus number must fall within the PF's bus range. */
	if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) {
		dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
		return -ENOMEM;
	}

	/* Dependent PF: expose the function we depend on via sysfs. */
	if (iov->link != dev->devfn) {
		pdev = pci_get_slot(dev->bus, iov->link);
		if (!pdev)
			return -ENODEV;

		/* NOTE(review): pdev is dereferenced after pci_dev_put();
		 * presumably the bus's own reference keeps it alive here
		 * — confirm. */
		pci_dev_put(pdev);

		if (!pdev->is_physfn)
			return -ENODEV;

		rc = sysfs_create_link(&dev->dev.kobj,
					&pdev->dev.kobj, "dep_link");
		if (rc)
			return rc;
	}

	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
	pci_block_user_cfg_access(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	/* Give VFs time to become ready after VF Enable — presumably the
	 * SR-IOV spec's 100 ms settling window; confirm against spec. */
	msleep(100);
	pci_unblock_user_cfg_access(dev);

	iov->initial = initial;
	if (nr_virtfn < initial)
		initial = nr_virtfn;

	for (i = 0; i < initial; i++) {
		rc = virtfn_add(dev, i);
		if (rc)
			goto failed;
	}

	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
	iov->nr_virtfn = nr_virtfn;

	return 0;

failed:
	/* Roll back the VFs added so far, then clear VFE/MSE. */
	for (j = 0; j < i; j++)
		virtfn_remove(dev, j);

	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
	pci_block_user_cfg_access(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	ssleep(1);
	pci_unblock_user_cfg_access(dev);

	if (iov->link != dev->devfn)
		sysfs_remove_link(&dev->dev.kobj, "dep_link");

	return rc;
}
/*
 * Disable SR-IOV on @dev: remove every VF added by sriov_enable(),
 * clear VF Enable / Memory Space Enable and drop the dep_link.  No-op
 * when SR-IOV is not currently enabled.
 */
static void sriov_disable(struct pci_dev *dev)
{
	int i;
	struct pci_sriov *iov = dev->sriov;

	if (!iov->nr_virtfn)
		return;

	for (i = 0; i < iov->nr_virtfn; i++)
		virtfn_remove(dev, i);

	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
	pci_block_user_cfg_access(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	/* Let the hardware settle after clearing VF Enable. */
	ssleep(1);
	pci_unblock_user_cfg_access(dev);

	/* Remove the dependency link created in sriov_enable(). */
	if (iov->link != dev->devfn)
		sysfs_remove_link(&dev->dev.kobj, "dep_link");

	iov->nr_virtfn = 0;
}
/*
 * Probe and record the SR-IOV capability found at config offset @pos.
 * Reads TotalVFs, VF Offset/Stride and the supported page sizes, sizes
 * the IOV BARs to cover all possible VFs, and allocates the pci_sriov
 * state that marks @dev as a physical function.  Returns 0 (also when
 * the device exposes no VFs) or a negative errno.
 */
static int sriov_init(struct pci_dev *dev, int pos)
{
	int i;
	int rc;
	int nres;
	u32 pgsz;
	u16 ctrl, total, offset, stride;
	struct pci_sriov *iov;
	struct resource *res;
	struct pci_dev *pdev;

	/* If firmware left VF Enable set, clear it and let it settle. */
	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
	if (ctrl & PCI_SRIOV_CTRL_VFE) {
		pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
		ssleep(1);
	}

	pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
	if (!total)
		return 0;

	/* If another PF already exists on this bus, leave ARI clear in
	 * the control value written below; otherwise set ARI Capable
	 * Hierarchy when the bus has ARI enabled. */
	ctrl = 0;
	list_for_each_entry(pdev, &dev->bus->devices, bus_list)
		if (pdev->is_physfn)
			goto found;

	pdev = NULL;
	if (pci_ari_enabled(dev->bus))
		ctrl |= PCI_SRIOV_CTRL_ARI;

found:
	pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
	/* Program NumVFs = TotalVFs so Offset/Stride reflect the maximum. */
	pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
	pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
	pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
	if (!offset || (total > 1 && !stride))
		return -EIO;

	/* Mask out supported VF page sizes smaller than the system page
	 * size, then keep only the smallest remaining one. */
	pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
	i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
	pgsz &= ~((1 << i) - 1);
	if (!pgsz)
		return -EIO;

	pgsz &= ~(pgsz - 1);
	pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);

	/* Size each IOV BAR and widen it to cover all TotalVFs VFs. */
	nres = 0;
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = dev->resource + PCI_IOV_RESOURCES + i;
		/* i is advanced by the return value — presumably the
		 * extra BAR slots consumed by 64-bit BARs; confirm
		 * against __pci_read_base(). */
		i += __pci_read_base(dev, pci_bar_unknown, res,
				     pos + PCI_SRIOV_BAR + i * 4);
		if (!res->flags)
			continue;
		/* Per-VF BAR size must be a multiple of the page size. */
		if ((res->end - res->start + 1) & (PAGE_SIZE - 1)) {
			rc = -EIO;
			goto failed;
		}
		res->end = res->start + (res->end - res->start + 1) * total - 1;
		nres++;
	}

	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
	if (!iov) {
		rc = -ENOMEM;
		goto failed;
	}

	iov->pos = pos;
	iov->nres = nres;
	iov->ctrl = ctrl;
	iov->total = total;
	iov->offset = offset;
	iov->stride = stride;
	iov->pgsz = pgsz;
	iov->self = dev;
	pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
	pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
	if (!dev->bus->number) /* Root Complex Integrated Endpoint */
		iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link);

	/* The first PF found on the bus owns the mutex shared by all
	 * PFs; later PFs just take a reference on it. */
	if (pdev)
		iov->dev = pci_dev_get(pdev);
	else {
		iov->dev = dev;
		mutex_init(&iov->lock);
	}

	dev->sriov = iov;
	dev->is_physfn = 1;

	return 0;

failed:
	/* Invalidate all IOV BARs so nothing tries to use them. */
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = dev->resource + PCI_IOV_RESOURCES + i;
		res->flags = 0;
	}

	return rc;
}
393 static void sriov_release(struct pci_dev *dev)
394 {
395 BUG_ON(dev->sriov->nr_virtfn);
397 if (dev == dev->sriov->dev)
398 mutex_destroy(&dev->sriov->lock);
399 else
400 pci_dev_put(dev->sriov->dev);
402 kfree(dev->sriov);
403 dev->sriov = NULL;
404 }
/*
 * Reprogram the SR-IOV capability after a PF reset/resume.  Skips the
 * restore when VF Enable is already set in hardware; otherwise rewrites
 * the IOV BARs, page size, NumVFs and control register.
 */
static void sriov_restore_state(struct pci_dev *dev)
{
	int i;
	u16 ctrl;
	struct pci_sriov *iov = dev->sriov;

	pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
	if (ctrl & PCI_SRIOV_CTRL_VFE)
		return;

	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
		pci_update_resource(dev, i);

	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	/* If we just re-enabled VFs, give them time to become ready. */
	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
		msleep(100);
}
426 /**
427 * pci_iov_init - initialize the IOV capability
428 * @dev: the PCI device
429 *
430 * Returns 0 on success, or negative on failure.
431 */
432 int pci_iov_init(struct pci_dev *dev)
433 {
434 int pos;
436 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
437 if (pos)
438 return sriov_init(dev, pos);
440 return -ENODEV;
441 }
443 /**
444 * pci_iov_release - release resources used by the IOV capability
445 * @dev: the PCI device
446 */
447 void pci_iov_release(struct pci_dev *dev)
448 {
449 if (dev->is_physfn)
450 sriov_release(dev);
451 }
453 /**
454 * pci_iov_resource_bar - get position of the SR-IOV BAR
455 * @dev: the PCI device
456 * @resno: the resource number
457 * @type: the BAR type to be filled in
458 *
459 * Returns position of the BAR encapsulated in the SR-IOV capability.
460 */
461 int pci_iov_resource_bar(struct pci_dev *dev, int resno,
462 enum pci_bar_type *type)
463 {
464 if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END)
465 return 0;
467 BUG_ON(!dev->is_physfn);
469 *type = pci_bar_unknown;
471 return dev->sriov->pos + PCI_SRIOV_BAR +
472 4 * (resno - PCI_IOV_RESOURCES);
473 }
475 /**
476 * pci_restore_iov_state - restore the state of the IOV capability
477 * @dev: the PCI device
478 */
479 void pci_restore_iov_state(struct pci_dev *dev)
480 {
481 if (dev->is_physfn)
482 sriov_restore_state(dev);
483 }
485 /**
486 * pci_iov_bus_range - find bus range used by Virtual Function
487 * @bus: the PCI bus
488 *
489 * Returns max number of buses (exclude current one) used by Virtual
490 * Functions.
491 */
492 int pci_iov_bus_range(struct pci_bus *bus)
493 {
494 int max = 0;
495 u8 busnr;
496 struct pci_dev *dev;
498 list_for_each_entry(dev, &bus->devices, bus_list) {
499 if (!dev->is_physfn)
500 continue;
501 busnr = virtfn_bus(dev, dev->sriov->total - 1);
502 if (busnr > max)
503 max = busnr;
504 }
506 return max ? max - bus->number : 0;
507 }
509 /**
510 * pci_enable_sriov - enable the SR-IOV capability
511 * @dev: the PCI device
512 * @nr_virtfn: number of Virtual Functions to enable
513 *
514 * Returns 0 on success, or negative on failure.
515 */
516 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
517 {
518 might_sleep();
520 if (!dev->is_physfn)
521 return -ENODEV;
523 return sriov_enable(dev, nr_virtfn);
524 }
525 EXPORT_SYMBOL_GPL(pci_enable_sriov);
527 /**
528 * pci_disable_sriov - disable the SR-IOV capability
529 * @dev: the PCI device
530 */
531 void pci_disable_sriov(struct pci_dev *dev)
532 {
533 might_sleep();
535 if (!dev->is_physfn)
536 return;
538 sriov_disable(dev);
539 }
540 EXPORT_SYMBOL_GPL(pci_disable_sriov);