ia64/linux-2.6.18-xen.hg

view drivers/pci/iov.c @ 882:8dec4aa9b8b9

PCI pass through: PCIe IO space multiplexing

This is required for more than 16 HVM domains to boot from
PCIe pass-through devices.

Linux as dom0 exclusively assigns IO space to downstream PCI bridges,
and the assignment unit of PCI bridge IO space is 4K. So only up to
16 PCIe devices can be accessed via IO space within the 64K IO ports.
PCI expansion ROM BIOSes often use IO port access to boot from the
device, so in a virtualized environment this means that only up to
16 guest domains can boot from a pass-through device.

This patch allows PCIe IO space sharing for pass-through devices.
- Reassign the IO space of the PCIe devices specified by
"guestiomuldev=[<segment>:]<bus>:<dev>[,[<segment>:]<bus>:<dev>][,...]"
so that it is shared.
This is implemented as a Linux PCI quirk fixup.

The sharing unit is the PCIe switch, i.e. the IO space of the end point
devices under the same switch will be shared. If there is more than
one switch, two areas of IO space will be used.

- Add the driver which arbitrates accesses to the multiplexed PCIe
IO space. qemu-dm will use this later.

Limitation:
The IO ports of IO-shared devices can't be accessed from a dom0 Linux
device driver. But this shouldn't be a big issue, because the PCIe
specification discourages the use of IO space and recommends that IO
space be used only for bootable devices with ROM code. OS device
drivers should work without IO space access.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Keir Fraser <keir.fraser@citrix.com>
date Thu May 28 09:57:49 2009 +0100 (2009-05-28)
parents 711e402bc141
children 104b6ae6b257
line source
1 /*
2 * drivers/pci/iov.c
3 *
4 * Copyright (C) 2009 Intel Corporation, Yu Zhao <yu.zhao@intel.com>
5 *
6 * PCI Express I/O Virtualization (IOV) support.
7 * Single Root IOV 1.0
8 */
10 #include <linux/pci.h>
11 #include <linux/mutex.h>
12 #include <linux/string.h>
13 #include <linux/delay.h>
14 #include "pci.h"
16 #define VIRTFN_ID_LEN 16
18 static inline u8 virtfn_bus(struct pci_dev *dev, int id)
19 {
20 return dev->bus->number + ((dev->devfn + dev->sriov->offset +
21 dev->sriov->stride * id) >> 8);
22 }
24 static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
25 {
26 return (dev->devfn + dev->sriov->offset +
27 dev->sriov->stride * id) & 0xff;
28 }
30 static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
31 {
32 struct pci_bus *child;
34 if (bus->number == busnr)
35 return bus;
37 child = pci_find_bus(pci_domain_nr(bus), busnr);
38 if (child)
39 return child;
41 child = pci_add_new_bus(bus, NULL, busnr);
42 if (!child)
43 return NULL;
45 child->subordinate = busnr;
47 return child;
48 }
50 static void virtfn_remove_bus(struct pci_bus *bus, int busnr)
51 {
52 struct pci_bus *child;
54 if (bus->number == busnr)
55 return;
57 child = pci_find_bus(pci_domain_nr(bus), busnr);
58 BUG_ON(!child);
60 if (list_empty(&child->devices))
61 pci_remove_bus(child);
62 }
64 static int virtfn_add(struct pci_dev *dev, int id)
65 {
66 int i;
67 int rc;
68 u64 size;
69 char buf[VIRTFN_ID_LEN];
70 struct pci_dev *virtfn;
71 struct resource *res;
72 struct pci_sriov *iov = dev->sriov;
74 virtfn = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
75 if (!virtfn)
76 return -ENOMEM;
78 mutex_lock(&iov->dev->sriov->lock);
79 virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
80 if (!virtfn->bus) {
81 kfree(virtfn);
82 mutex_unlock(&iov->dev->sriov->lock);
83 return -ENOMEM;
84 }
85 virtfn->devfn = virtfn_devfn(dev, id);
86 virtfn->vendor = dev->vendor;
87 pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
88 pci_setup_device(virtfn);
89 virtfn->dev.parent = dev->dev.parent;
91 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
92 res = dev->resource + PCI_IOV_RESOURCES + i;
93 if (!res->parent)
94 continue;
95 virtfn->resource[i].name = pci_name(virtfn);
96 virtfn->resource[i].flags = res->flags;
97 size = res->end - res->start + 1;
98 do_div(size, iov->total);
99 virtfn->resource[i].start = res->start + size * id;
100 virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
101 rc = request_resource(res, &virtfn->resource[i]);
102 BUG_ON(rc);
103 }
105 pci_device_add(virtfn, virtfn->bus);
106 mutex_unlock(&iov->dev->sriov->lock);
108 virtfn->physfn = pci_dev_get(dev);
109 virtfn->is_virtfn = 1;
111 pci_bus_add_device(virtfn);
112 sprintf(buf, "virtfn%u", id);
113 rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
114 if (rc)
115 goto failed1;
116 rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
117 if (rc)
118 goto failed2;
120 kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
122 return 0;
124 failed2:
125 sysfs_remove_link(&dev->dev.kobj, buf);
126 failed1:
127 pci_dev_put(dev);
128 mutex_lock(&iov->dev->sriov->lock);
129 pci_remove_bus_device(virtfn);
130 virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
131 mutex_unlock(&iov->dev->sriov->lock);
133 return rc;
134 }
136 static void virtfn_remove(struct pci_dev *dev, int id)
137 {
138 char buf[VIRTFN_ID_LEN];
139 struct pci_bus *bus;
140 struct pci_dev *virtfn;
141 struct pci_sriov *iov = dev->sriov;
143 bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id));
144 if (!bus)
145 return;
147 virtfn = pci_get_slot(bus, virtfn_devfn(dev, id));
148 if (!virtfn)
149 return;
151 pci_dev_put(virtfn);
153 sprintf(buf, "virtfn%u", id);
154 sysfs_remove_link(&dev->dev.kobj, buf);
155 sysfs_remove_link(&virtfn->dev.kobj, "physfn");
157 mutex_lock(&iov->dev->sriov->lock);
158 pci_remove_bus_device(virtfn);
159 virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
160 mutex_unlock(&iov->dev->sriov->lock);
162 pci_dev_put(dev);
163 }
165 static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
166 {
167 int rc;
168 int i, j;
169 int nres;
170 u16 offset, stride, initial;
171 struct resource *res;
172 struct pci_dev *pdev;
173 struct pci_sriov *iov = dev->sriov;
175 if (!nr_virtfn)
176 return 0;
178 if (iov->nr_virtfn)
179 return -EINVAL;
181 pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
182 if (initial > iov->total ||
183 (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total)))
184 return -EIO;
186 if (nr_virtfn < 0 || nr_virtfn > iov->total ||
187 (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
188 return -EINVAL;
190 pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
191 pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
192 pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
193 if (!offset || (nr_virtfn > 1 && !stride))
194 return -EIO;
196 nres = 0;
197 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
198 res = dev->resource + PCI_IOV_RESOURCES + i;
199 if (res->parent)
200 nres++;
201 }
202 if (nres != iov->nres) {
203 dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n");
204 return -ENOMEM;
205 }
207 iov->offset = offset;
208 iov->stride = stride;
210 if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) {
211 dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
212 return -ENOMEM;
213 }
215 if (iov->link != dev->devfn) {
216 pdev = pci_get_slot(dev->bus, iov->link);
217 if (!pdev)
218 return -ENODEV;
220 pci_dev_put(pdev);
222 if (!pdev->is_physfn)
223 return -ENODEV;
225 rc = sysfs_create_link(&dev->dev.kobj,
226 &pdev->dev.kobj, "dep_link");
227 if (rc)
228 return rc;
229 }
231 iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
232 pci_block_user_cfg_access(dev);
233 pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
234 msleep(100);
235 pci_unblock_user_cfg_access(dev);
237 iov->initial = initial;
238 if (nr_virtfn < initial)
239 initial = nr_virtfn;
241 for (i = 0; i < initial; i++) {
242 rc = virtfn_add(dev, i);
243 if (rc)
244 goto failed;
245 }
247 kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
248 iov->nr_virtfn = nr_virtfn;
250 return 0;
252 failed:
253 for (j = 0; j < i; j++)
254 virtfn_remove(dev, j);
256 iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
257 pci_block_user_cfg_access(dev);
258 pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
259 ssleep(1);
260 pci_unblock_user_cfg_access(dev);
262 if (iov->link != dev->devfn)
263 sysfs_remove_link(&dev->dev.kobj, "dep_link");
265 return rc;
266 }
268 static void sriov_disable(struct pci_dev *dev)
269 {
270 int i;
271 struct pci_sriov *iov = dev->sriov;
273 if (!iov->nr_virtfn)
274 return;
276 for (i = 0; i < iov->nr_virtfn; i++)
277 virtfn_remove(dev, i);
279 iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
280 pci_block_user_cfg_access(dev);
281 pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
282 ssleep(1);
283 pci_unblock_user_cfg_access(dev);
285 if (iov->link != dev->devfn)
286 sysfs_remove_link(&dev->dev.kobj, "dep_link");
288 iov->nr_virtfn = 0;
289 }
291 static int sriov_init(struct pci_dev *dev, int pos)
292 {
293 int i;
294 int rc;
295 int nres;
296 u32 pgsz;
297 u16 ctrl, total, offset, stride;
298 struct pci_sriov *iov;
299 struct resource *res;
300 struct pci_dev *pdev;
302 pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
303 if (ctrl & PCI_SRIOV_CTRL_VFE) {
304 pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
305 ssleep(1);
306 }
308 pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
309 if (!total)
310 return 0;
312 ctrl = 0;
313 list_for_each_entry(pdev, &dev->bus->devices, bus_list)
314 if (pdev->is_physfn)
315 goto found;
317 pdev = NULL;
318 if (pci_ari_enabled(dev->bus))
319 ctrl |= PCI_SRIOV_CTRL_ARI;
321 found:
322 pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
323 pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
324 pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
325 pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
326 if (!offset || (total > 1 && !stride))
327 return -EIO;
329 pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
330 i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
331 pgsz &= ~((1 << i) - 1);
332 if (!pgsz)
333 return -EIO;
335 pgsz &= ~(pgsz - 1);
336 pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
338 nres = 0;
339 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
340 res = dev->resource + PCI_IOV_RESOURCES + i;
341 i += __pci_read_base(dev, pci_bar_unknown, res,
342 pos + PCI_SRIOV_BAR + i * 4);
343 if (!res->flags)
344 continue;
345 if ((res->end - res->start + 1) & (PAGE_SIZE - 1)) {
346 rc = -EIO;
347 goto failed;
348 }
349 res->end = res->start + (res->end - res->start + 1) * total - 1;
350 nres++;
351 }
353 iov = kzalloc(sizeof(*iov), GFP_KERNEL);
354 if (!iov) {
355 rc = -ENOMEM;
356 goto failed;
357 }
359 iov->pos = pos;
360 iov->nres = nres;
361 iov->ctrl = ctrl;
362 iov->total = total;
363 iov->offset = offset;
364 iov->stride = stride;
365 iov->pgsz = pgsz;
366 iov->self = dev;
367 pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
368 pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
370 if (pdev)
371 iov->dev = pci_dev_get(pdev);
372 else {
373 iov->dev = dev;
374 mutex_init(&iov->lock);
375 }
377 dev->sriov = iov;
378 dev->is_physfn = 1;
380 return 0;
382 failed:
383 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
384 res = dev->resource + PCI_IOV_RESOURCES + i;
385 res->flags = 0;
386 }
388 return rc;
389 }
391 static void sriov_release(struct pci_dev *dev)
392 {
393 BUG_ON(dev->sriov->nr_virtfn);
395 if (dev == dev->sriov->dev)
396 mutex_destroy(&dev->sriov->lock);
397 else
398 pci_dev_put(dev->sriov->dev);
400 kfree(dev->sriov);
401 dev->sriov = NULL;
402 }
404 static void sriov_restore_state(struct pci_dev *dev)
405 {
406 int i;
407 u16 ctrl;
408 struct pci_sriov *iov = dev->sriov;
410 pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
411 if (ctrl & PCI_SRIOV_CTRL_VFE)
412 return;
414 for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
415 pci_update_resource(dev, i);
417 pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
418 pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn);
419 pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
420 if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
421 msleep(100);
422 }
424 /**
425 * pci_iov_init - initialize the IOV capability
426 * @dev: the PCI device
427 *
428 * Returns 0 on success, or negative on failure.
429 */
430 int pci_iov_init(struct pci_dev *dev)
431 {
432 int pos;
434 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
435 if (pos)
436 return sriov_init(dev, pos);
438 return -ENODEV;
439 }
441 /**
442 * pci_iov_release - release resources used by the IOV capability
443 * @dev: the PCI device
444 */
445 void pci_iov_release(struct pci_dev *dev)
446 {
447 if (dev->is_physfn)
448 sriov_release(dev);
449 }
451 /**
452 * pci_iov_resource_bar - get position of the SR-IOV BAR
453 * @dev: the PCI device
454 * @resno: the resource number
455 * @type: the BAR type to be filled in
456 *
457 * Returns position of the BAR encapsulated in the SR-IOV capability.
458 */
459 int pci_iov_resource_bar(struct pci_dev *dev, int resno,
460 enum pci_bar_type *type)
461 {
462 if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END)
463 return 0;
465 BUG_ON(!dev->is_physfn);
467 *type = pci_bar_unknown;
469 return dev->sriov->pos + PCI_SRIOV_BAR +
470 4 * (resno - PCI_IOV_RESOURCES);
471 }
473 /**
474 * pci_restore_iov_state - restore the state of the IOV capability
475 * @dev: the PCI device
476 */
477 void pci_restore_iov_state(struct pci_dev *dev)
478 {
479 if (dev->is_physfn)
480 sriov_restore_state(dev);
481 }
483 /**
484 * pci_iov_bus_range - find bus range used by Virtual Function
485 * @bus: the PCI bus
486 *
487 * Returns max number of buses (exclude current one) used by Virtual
488 * Functions.
489 */
490 int pci_iov_bus_range(struct pci_bus *bus)
491 {
492 int max = 0;
493 u8 busnr;
494 struct pci_dev *dev;
496 list_for_each_entry(dev, &bus->devices, bus_list) {
497 if (!dev->is_physfn)
498 continue;
499 busnr = virtfn_bus(dev, dev->sriov->total - 1);
500 if (busnr > max)
501 max = busnr;
502 }
504 return max ? max - bus->number : 0;
505 }
507 /**
508 * pci_enable_sriov - enable the SR-IOV capability
509 * @dev: the PCI device
510 * @nr_virtfn: number of Virtual Functions to enable
511 *
512 * Returns 0 on success, or negative on failure.
513 */
514 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
515 {
516 might_sleep();
518 if (!dev->is_physfn)
519 return -ENODEV;
521 return sriov_enable(dev, nr_virtfn);
522 }
523 EXPORT_SYMBOL_GPL(pci_enable_sriov);
525 /**
526 * pci_disable_sriov - disable the SR-IOV capability
527 * @dev: the PCI device
528 */
529 void pci_disable_sriov(struct pci_dev *dev)
530 {
531 might_sleep();
533 if (!dev->is_physfn)
534 return;
536 sriov_disable(dev);
537 }
538 EXPORT_SYMBOL_GPL(pci_disable_sriov);