arch/sparc64/kernel/pci_sun4v.c @ 897:329ea0ccb344 (ia64/linux-2.6.18-xen.hg)

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Fri Jun 05 14:01:20 2009 +0100
parents  831230e53067
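As a rough illustration of the retry behaviour described in the changeset message, the balloon worker ends up shaped something like the sketch below. This is a minimal sketch only, not the actual drivers/xen/balloon.c code: balloon_worker, current_target(), current_pages(), increase_reservation() and decrease_reservation() are assumed names. The point is that any shortfall, whether from a failed hypercall or a partial increase, simply causes the work to be rescheduled rather than being recorded as a hard limit.

/* Sketch only: helper names below are hypothetical, not the real driver API. */
static void balloon_process(struct work_struct *work)
{
	long credit = current_target() - current_pages();

	if (credit > 0)
		increase_reservation(credit);	/* a partial success keeps the pages it got */
	else if (credit < 0)
		decrease_reservation(-credit);

	/* No "hard limit": if we are still short of the target, retry on a
	 * timer, in the same manner as the decrease path already does.
	 */
	if (current_target() != current_pages())
		schedule_delayed_work(&balloon_worker, HZ);
}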
/* pci_sun4v.c: SUN4V specific PCI controller support.
 *
 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
 */

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>

#include <asm/pbm.h>
#include <asm/iommu.h>
#include <asm/irq.h>
#include <asm/upa.h>
#include <asm/pstate.h>
#include <asm/oplib.h>
#include <asm/hypervisor.h>
#include <asm/prom.h>

#include "pci_impl.h"
#include "iommu_common.h"

#include "pci_sun4v.h"

#define PGLIST_NENTS	(PAGE_SIZE / sizeof(u64))
struct pci_iommu_batch {
	struct pci_dev	*pdev;		/* Device mapping is for.	*/
	unsigned long	prot;		/* IOMMU page protections	*/
	unsigned long	entry;		/* Index into IOTSB.		*/
	u64		*pglist;	/* List of physical pages	*/
	unsigned long	npages;		/* Number of pages in list.	*/
};

static DEFINE_PER_CPU(struct pci_iommu_batch, pci_iommu_batch);

/* Interrupts must be disabled. */
static inline void pci_iommu_batch_start(struct pci_dev *pdev, unsigned long prot, unsigned long entry)
{
	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);

	p->pdev   = pdev;
	p->prot   = prot;
	p->entry  = entry;
	p->npages = 0;
}

/* Interrupts must be disabled. */
static long pci_iommu_batch_flush(struct pci_iommu_batch *p)
{
	struct pcidev_cookie *pcp = p->pdev->sysdata;
	unsigned long devhandle = pcp->pbm->devhandle;
	unsigned long prot = p->prot;
	unsigned long entry = p->entry;
	u64 *pglist = p->pglist;
	unsigned long npages = p->npages;

	while (npages != 0) {
		long num;

		num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry),
					  npages, prot, __pa(pglist));
		if (unlikely(num < 0)) {
			if (printk_ratelimit())
				printk("pci_iommu_batch_flush: IOMMU map of "
				       "[%08lx:%08lx:%lx:%lx:%lx] failed with "
				       "status %ld\n",
				       devhandle, HV_PCI_TSBID(0, entry),
				       npages, prot, __pa(pglist), num);
			return -1;
		}

		entry += num;
		npages -= num;
		pglist += num;
	}

	p->entry = entry;
	p->npages = 0;

	return 0;
}

/* Interrupts must be disabled. */
static inline long pci_iommu_batch_add(u64 phys_page)
{
	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);

	BUG_ON(p->npages >= PGLIST_NENTS);

	p->pglist[p->npages++] = phys_page;
	if (p->npages == PGLIST_NENTS)
		return pci_iommu_batch_flush(p);

	return 0;
}

/* Interrupts must be disabled. */
static inline long pci_iommu_batch_end(void)
{
	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);

	BUG_ON(p->npages >= PGLIST_NENTS);

	return pci_iommu_batch_flush(p);
}
static long pci_arena_alloc(struct pci_iommu_arena *arena, unsigned long npages)
{
	unsigned long n, i, start, end, limit;
	int pass;

	limit = arena->limit;
	start = arena->hint;
	pass = 0;

again:
	n = find_next_zero_bit(arena->map, limit, start);
	end = n + npages;
	if (unlikely(end >= limit)) {
		if (likely(pass < 1)) {
			limit = start;
			start = 0;
			pass++;
			goto again;
		} else {
			/* Scanned the whole thing, give up. */
			return -1;
		}
	}

	for (i = n; i < end; i++) {
		if (test_bit(i, arena->map)) {
			start = i + 1;
			goto again;
		}
	}

	for (i = n; i < end; i++)
		__set_bit(i, arena->map);

	arena->hint = end;

	return n;
}

static void pci_arena_free(struct pci_iommu_arena *arena, unsigned long base, unsigned long npages)
{
	unsigned long i;

	for (i = base; i < (base + npages); i++)
		__clear_bit(i, arena->map);
}
static void *pci_4v_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, order, first_page, npages, n;
	void *ret;
	long entry;

	size = IO_PAGE_ALIGN(size);
	order = get_order(size);
	if (unlikely(order >= MAX_ORDER))
		return NULL;

	npages = size >> IO_PAGE_SHIFT;

	first_page = __get_free_pages(gfp, order);
	if (unlikely(first_page == 0UL))
		return NULL;

	memset((char *)first_page, 0, PAGE_SIZE << order);

	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;

	spin_lock_irqsave(&iommu->lock, flags);
	entry = pci_arena_alloc(&iommu->arena, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	if (unlikely(entry < 0L))
		goto arena_alloc_fail;

	*dma_addrp = (iommu->page_table_map_base +
		      (entry << IO_PAGE_SHIFT));
	ret = (void *) first_page;
	first_page = __pa(first_page);

	local_irq_save(flags);

	pci_iommu_batch_start(pdev,
			      (HV_PCI_MAP_ATTR_READ |
			       HV_PCI_MAP_ATTR_WRITE),
			      entry);

	for (n = 0; n < npages; n++) {
		long err = pci_iommu_batch_add(first_page + (n * PAGE_SIZE));
		if (unlikely(err < 0L))
			goto iommu_map_fail;
	}

	if (unlikely(pci_iommu_batch_end() < 0L))
		goto iommu_map_fail;

	local_irq_restore(flags);

	return ret;

iommu_map_fail:
	/* Interrupts are disabled. */
	spin_lock(&iommu->lock);
	pci_arena_free(&iommu->arena, entry, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

arena_alloc_fail:
	free_pages(first_page, order);
	return NULL;
}

static void pci_4v_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, order, npages, entry;
	u32 devhandle;

	npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT;
	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;
	devhandle = pcp->pbm->devhandle;
	entry = ((dvma - iommu->page_table_map_base) >> IO_PAGE_SHIFT);

	spin_lock_irqsave(&iommu->lock, flags);

	pci_arena_free(&iommu->arena, entry, npages);

	do {
		unsigned long num;

		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
					    npages);
		entry += num;
		npages -= num;
	} while (npages != 0);

	spin_unlock_irqrestore(&iommu->lock, flags);

	order = get_order(size);
	if (order < 10)
		free_pages((unsigned long)cpu, order);
}
static dma_addr_t pci_4v_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, npages, oaddr;
	unsigned long i, base_paddr;
	u32 bus_addr, ret;
	unsigned long prot;
	long entry;

	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;

	if (unlikely(direction == PCI_DMA_NONE))
		goto bad;

	oaddr = (unsigned long)ptr;
	npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
	npages >>= IO_PAGE_SHIFT;

	spin_lock_irqsave(&iommu->lock, flags);
	entry = pci_arena_alloc(&iommu->arena, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	if (unlikely(entry < 0L))
		goto bad;

	bus_addr = (iommu->page_table_map_base +
		    (entry << IO_PAGE_SHIFT));
	ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
	base_paddr = __pa(oaddr & IO_PAGE_MASK);
	prot = HV_PCI_MAP_ATTR_READ;
	if (direction != PCI_DMA_TODEVICE)
		prot |= HV_PCI_MAP_ATTR_WRITE;

	local_irq_save(flags);

	pci_iommu_batch_start(pdev, prot, entry);

	for (i = 0; i < npages; i++, base_paddr += IO_PAGE_SIZE) {
		long err = pci_iommu_batch_add(base_paddr);
		if (unlikely(err < 0L))
			goto iommu_map_fail;
	}
	if (unlikely(pci_iommu_batch_end() < 0L))
		goto iommu_map_fail;

	local_irq_restore(flags);

	return ret;

bad:
	if (printk_ratelimit())
		WARN_ON(1);
	return PCI_DMA_ERROR_CODE;

iommu_map_fail:
	/* Interrupts are disabled. */
	spin_lock(&iommu->lock);
	pci_arena_free(&iommu->arena, entry, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	return PCI_DMA_ERROR_CODE;
}

static void pci_4v_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, npages;
	long entry;
	u32 devhandle;

	if (unlikely(direction == PCI_DMA_NONE)) {
		if (printk_ratelimit())
			WARN_ON(1);
		return;
	}

	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;
	devhandle = pcp->pbm->devhandle;

	npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK);
	npages >>= IO_PAGE_SHIFT;
	bus_addr &= IO_PAGE_MASK;

	spin_lock_irqsave(&iommu->lock, flags);

	entry = (bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT;
	pci_arena_free(&iommu->arena, entry, npages);

	do {
		unsigned long num;

		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
					    npages);
		entry += num;
		npages -= num;
	} while (npages != 0);

	spin_unlock_irqrestore(&iommu->lock, flags);
}
#define SG_ENT_PHYS_ADDRESS(SG)	\
	(__pa(page_address((SG)->page)) + (SG)->offset)

static inline long fill_sg(long entry, struct pci_dev *pdev,
			   struct scatterlist *sg,
			   int nused, int nelems, unsigned long prot)
{
	struct scatterlist *dma_sg = sg;
	struct scatterlist *sg_end = sg + nelems;
	unsigned long flags;
	int i;

	local_irq_save(flags);

	pci_iommu_batch_start(pdev, prot, entry);

	for (i = 0; i < nused; i++) {
		unsigned long pteval = ~0UL;
		u32 dma_npages;

		dma_npages = ((dma_sg->dma_address & (IO_PAGE_SIZE - 1UL)) +
			      dma_sg->dma_length +
			      ((IO_PAGE_SIZE - 1UL))) >> IO_PAGE_SHIFT;
		do {
			unsigned long offset;
			signed int len;

			/* If we are here, we know we have at least one
			 * more page to map.  So walk forward until we
			 * hit a page crossing, and begin creating new
			 * mappings from that spot.
			 */
			for (;;) {
				unsigned long tmp;

				tmp = SG_ENT_PHYS_ADDRESS(sg);
				len = sg->length;
				if (((tmp ^ pteval) >> IO_PAGE_SHIFT) != 0UL) {
					pteval = tmp & IO_PAGE_MASK;
					offset = tmp & (IO_PAGE_SIZE - 1UL);
					break;
				}
				if (((tmp ^ (tmp + len - 1UL)) >> IO_PAGE_SHIFT) != 0UL) {
					pteval = (tmp + IO_PAGE_SIZE) & IO_PAGE_MASK;
					offset = 0UL;
					len -= (IO_PAGE_SIZE - (tmp & (IO_PAGE_SIZE - 1UL)));
					break;
				}
				sg++;
			}

			pteval = (pteval & IOPTE_PAGE);
			while (len > 0) {
				long err;

				err = pci_iommu_batch_add(pteval);
				if (unlikely(err < 0L))
					goto iommu_map_failed;

				pteval += IO_PAGE_SIZE;
				len -= (IO_PAGE_SIZE - offset);
				offset = 0;
				dma_npages--;
			}

			pteval = (pteval & IOPTE_PAGE) + len;
			sg++;

			/* Skip over any tail mappings we've fully mapped,
			 * adjusting pteval along the way.  Stop when we
			 * detect a page crossing event.
			 */
			while (sg < sg_end &&
			       (pteval << (64 - IO_PAGE_SHIFT)) != 0UL &&
			       (pteval == SG_ENT_PHYS_ADDRESS(sg)) &&
			       ((pteval ^
				 (SG_ENT_PHYS_ADDRESS(sg) + sg->length - 1UL)) >> IO_PAGE_SHIFT) == 0UL) {
				pteval += sg->length;
				sg++;
			}
			if ((pteval << (64 - IO_PAGE_SHIFT)) == 0UL)
				pteval = ~0UL;
		} while (dma_npages != 0);
		dma_sg++;
	}

	if (unlikely(pci_iommu_batch_end() < 0L))
		goto iommu_map_failed;

	local_irq_restore(flags);
	return 0;

iommu_map_failed:
	local_irq_restore(flags);
	return -1L;
}
static int pci_4v_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, npages, prot;
	u32 dma_base;
	struct scatterlist *sgtmp;
	long entry, err;
	int used;

	/* Fast path single entry scatterlists. */
	if (nelems == 1) {
		sglist->dma_address =
			pci_4v_map_single(pdev,
					  (page_address(sglist->page) + sglist->offset),
					  sglist->length, direction);
		if (unlikely(sglist->dma_address == PCI_DMA_ERROR_CODE))
			return 0;
		sglist->dma_length = sglist->length;
		return 1;
	}

	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;

	if (unlikely(direction == PCI_DMA_NONE))
		goto bad;

	/* Step 1: Prepare scatter list. */
	npages = prepare_sg(sglist, nelems);

	/* Step 2: Allocate a cluster and context, if necessary. */
	spin_lock_irqsave(&iommu->lock, flags);
	entry = pci_arena_alloc(&iommu->arena, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	if (unlikely(entry < 0L))
		goto bad;

	dma_base = iommu->page_table_map_base +
		(entry << IO_PAGE_SHIFT);

	/* Step 3: Normalize DMA addresses. */
	used = nelems;

	sgtmp = sglist;
	while (used && sgtmp->dma_length) {
		sgtmp->dma_address += dma_base;
		sgtmp++;
		used--;
	}
	used = nelems - used;

	/* Step 4: Create the mappings. */
	prot = HV_PCI_MAP_ATTR_READ;
	if (direction != PCI_DMA_TODEVICE)
		prot |= HV_PCI_MAP_ATTR_WRITE;

	err = fill_sg(entry, pdev, sglist, used, nelems, prot);
	if (unlikely(err < 0L))
		goto iommu_map_failed;

	return used;

bad:
	if (printk_ratelimit())
		WARN_ON(1);
	return 0;

iommu_map_failed:
	spin_lock_irqsave(&iommu->lock, flags);
	pci_arena_free(&iommu->arena, entry, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void pci_4v_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
{
	struct pcidev_cookie *pcp;
	struct pci_iommu *iommu;
	unsigned long flags, i, npages;
	long entry;
	u32 devhandle, bus_addr;

	if (unlikely(direction == PCI_DMA_NONE)) {
		if (printk_ratelimit())
			WARN_ON(1);
	}

	pcp = pdev->sysdata;
	iommu = pcp->pbm->iommu;
	devhandle = pcp->pbm->devhandle;

	bus_addr = sglist->dma_address & IO_PAGE_MASK;

	for (i = 1; i < nelems; i++)
		if (sglist[i].dma_length == 0)
			break;
	i--;
	npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) -
		  bus_addr) >> IO_PAGE_SHIFT;

	entry = ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT);

	spin_lock_irqsave(&iommu->lock, flags);

	pci_arena_free(&iommu->arena, entry, npages);

	do {
		unsigned long num;

		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
					    npages);
		entry += num;
		npages -= num;
	} while (npages != 0);

	spin_unlock_irqrestore(&iommu->lock, flags);
}
static void pci_4v_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
{
	/* Nothing to do... */
}

static void pci_4v_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
{
	/* Nothing to do... */
}

struct pci_iommu_ops pci_sun4v_iommu_ops = {
	.alloc_consistent		= pci_4v_alloc_consistent,
	.free_consistent		= pci_4v_free_consistent,
	.map_single			= pci_4v_map_single,
	.unmap_single			= pci_4v_unmap_single,
	.map_sg				= pci_4v_map_sg,
	.unmap_sg			= pci_4v_unmap_sg,
	.dma_sync_single_for_cpu	= pci_4v_dma_sync_single_for_cpu,
	.dma_sync_sg_for_cpu		= pci_4v_dma_sync_sg_for_cpu,
};

/* SUN4V PCI configuration space accessors. */

struct pdev_entry {
	struct pdev_entry	*next;
	u32			devhandle;
	unsigned int		bus;
	unsigned int		device;
	unsigned int		func;
};

#define PDEV_HTAB_SIZE	16
#define PDEV_HTAB_MASK	(PDEV_HTAB_SIZE - 1)

static struct pdev_entry *pdev_htab[PDEV_HTAB_SIZE];

static inline unsigned int pdev_hashfn(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func)
{
	unsigned int val;

	val = (devhandle ^ (devhandle >> 4));
	val ^= bus;
	val ^= device;
	val ^= func;

	return val & PDEV_HTAB_MASK;
}
static int pdev_htab_add(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func)
{
	struct pdev_entry *p = kmalloc(sizeof(*p), GFP_KERNEL);
	struct pdev_entry **slot;

	if (!p)
		return -ENOMEM;

	slot = &pdev_htab[pdev_hashfn(devhandle, bus, device, func)];
	p->next = *slot;
	*slot = p;

	p->devhandle = devhandle;
	p->bus = bus;
	p->device = device;
	p->func = func;

	return 0;
}

/* Recursively descend into the OBP device tree, rooted at toplevel_node,
 * looking for a PCI device matching bus and devfn.
 */
static int obp_find(struct device_node *toplevel_node, unsigned int bus, unsigned int devfn)
{
	toplevel_node = toplevel_node->child;

	while (toplevel_node != NULL) {
		struct linux_prom_pci_registers *regs;
		struct property *prop;
		int ret;

		ret = obp_find(toplevel_node, bus, devfn);
		if (ret != 0)
			return ret;

		prop = of_find_property(toplevel_node, "reg", NULL);
		if (!prop)
			goto next_sibling;

		regs = prop->value;
		if (((regs->phys_hi >> 16) & 0xff) == bus &&
		    ((regs->phys_hi >> 8) & 0xff) == devfn)
			break;

	next_sibling:
		toplevel_node = toplevel_node->sibling;
	}

	return toplevel_node != NULL;
}

static int pdev_htab_populate(struct pci_pbm_info *pbm)
{
	u32 devhandle = pbm->devhandle;
	unsigned int bus;

	for (bus = pbm->pci_first_busno; bus <= pbm->pci_last_busno; bus++) {
		unsigned int devfn;

		for (devfn = 0; devfn < 256; devfn++) {
			unsigned int device = PCI_SLOT(devfn);
			unsigned int func = PCI_FUNC(devfn);

			if (obp_find(pbm->prom_node, bus, devfn)) {
				int err = pdev_htab_add(devhandle, bus,
							device, func);
				if (err)
					return err;
			}
		}
	}

	return 0;
}

static struct pdev_entry *pdev_find(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func)
{
	struct pdev_entry *p;

	p = pdev_htab[pdev_hashfn(devhandle, bus, device, func)];
	while (p) {
		if (p->devhandle == devhandle &&
		    p->bus == bus &&
		    p->device == device &&
		    p->func == func)
			break;

		p = p->next;
	}

	return p;
}

static inline int pci_sun4v_out_of_range(struct pci_pbm_info *pbm, unsigned int bus, unsigned int device, unsigned int func)
{
	if (bus < pbm->pci_first_busno ||
	    bus > pbm->pci_last_busno)
		return 1;
	return pdev_find(pbm->devhandle, bus, device, func) == NULL;
}
static int pci_sun4v_read_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
				  int where, int size, u32 *value)
{
	struct pci_pbm_info *pbm = bus_dev->sysdata;
	u32 devhandle = pbm->devhandle;
	unsigned int bus = bus_dev->number;
	unsigned int device = PCI_SLOT(devfn);
	unsigned int func = PCI_FUNC(devfn);
	unsigned long ret;

	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
		ret = ~0UL;
	} else {
		ret = pci_sun4v_config_get(devhandle,
				HV_PCI_DEVICE_BUILD(bus, device, func),
				where, size);
#if 0
		printk("rcfg: [%x:%x:%x:%d]=[%lx]\n",
		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
		       where, size, ret);
#endif
	}
	switch (size) {
	case 1:
		*value = ret & 0xff;
		break;
	case 2:
		*value = ret & 0xffff;
		break;
	case 4:
		*value = ret & 0xffffffff;
		break;
	};

	return PCIBIOS_SUCCESSFUL;
}

static int pci_sun4v_write_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
				   int where, int size, u32 value)
{
	struct pci_pbm_info *pbm = bus_dev->sysdata;
	u32 devhandle = pbm->devhandle;
	unsigned int bus = bus_dev->number;
	unsigned int device = PCI_SLOT(devfn);
	unsigned int func = PCI_FUNC(devfn);
	unsigned long ret;

	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
		/* Do nothing. */
	} else {
		ret = pci_sun4v_config_put(devhandle,
				HV_PCI_DEVICE_BUILD(bus, device, func),
				where, size, value);
#if 0
		printk("wcfg: [%x:%x:%x:%d] v[%x] == [%lx]\n",
		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
		       where, size, value, ret);
#endif
	}
	return PCIBIOS_SUCCESSFUL;
}

static struct pci_ops pci_sun4v_ops = {
	.read  = pci_sun4v_read_pci_cfg,
	.write = pci_sun4v_write_pci_cfg,
};
static void pbm_scan_bus(struct pci_controller_info *p,
			 struct pci_pbm_info *pbm)
{
	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);

	if (!cookie) {
		prom_printf("%s: Critical allocation failure.\n", pbm->name);
		prom_halt();
	}

	/* All we care about is the PBM. */
	memset(cookie, 0, sizeof(*cookie));
	cookie->pbm = pbm;

	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno, p->pci_ops, pbm);
#if 0
	pci_fixup_host_bridge_self(pbm->pci_bus);
	pbm->pci_bus->self->sysdata = cookie;
#endif
	pci_fill_in_pbm_cookies(pbm->pci_bus, pbm, pbm->prom_node);
	pci_record_assignments(pbm, pbm->pci_bus);
	pci_assign_unassigned(pbm, pbm->pci_bus);
	pci_fixup_irq(pbm, pbm->pci_bus);
	pci_determine_66mhz_disposition(pbm, pbm->pci_bus);
	pci_setup_busmastering(pbm, pbm->pci_bus);
}

static void pci_sun4v_scan_bus(struct pci_controller_info *p)
{
	struct property *prop;
	struct device_node *dp;

	if ((dp = p->pbm_A.prom_node) != NULL) {
		prop = of_find_property(dp, "66mhz-capable", NULL);
		p->pbm_A.is_66mhz_capable = (prop != NULL);

		pbm_scan_bus(p, &p->pbm_A);
	}
	if ((dp = p->pbm_B.prom_node) != NULL) {
		prop = of_find_property(dp, "66mhz-capable", NULL);
		p->pbm_B.is_66mhz_capable = (prop != NULL);

		pbm_scan_bus(p, &p->pbm_B);
	}

	/* XXX register error interrupt handlers XXX */
}
static void pci_sun4v_base_address_update(struct pci_dev *pdev, int resource)
{
	struct pcidev_cookie *pcp = pdev->sysdata;
	struct pci_pbm_info *pbm = pcp->pbm;
	struct resource *res, *root;
	u32 reg;
	int where, size, is_64bit;

	res = &pdev->resource[resource];
	if (resource < 6) {
		where = PCI_BASE_ADDRESS_0 + (resource * 4);
	} else if (resource == PCI_ROM_RESOURCE) {
		where = pdev->rom_base_reg;
	} else {
		/* Somebody might have asked allocation of a non-standard resource */
		return;
	}

	/* XXX 64-bit MEM handling is not %100 correct... XXX */
	is_64bit = 0;
	if (res->flags & IORESOURCE_IO)
		root = &pbm->io_space;
	else {
		root = &pbm->mem_space;
		if ((res->flags & PCI_BASE_ADDRESS_MEM_TYPE_MASK)
		    == PCI_BASE_ADDRESS_MEM_TYPE_64)
			is_64bit = 1;
	}

	size = res->end - res->start;
	pci_read_config_dword(pdev, where, &reg);
	reg = ((reg & size) |
	       (((u32)(res->start - root->start)) & ~size));
	if (resource == PCI_ROM_RESOURCE) {
		reg |= PCI_ROM_ADDRESS_ENABLE;
		res->flags |= IORESOURCE_ROM_ENABLE;
	}
	pci_write_config_dword(pdev, where, reg);

	/* This knows that the upper 32-bits of the address
	 * must be zero.  Our PCI common layer enforces this.
	 */
	if (is_64bit)
		pci_write_config_dword(pdev, where + 4, 0);
}

static void pci_sun4v_resource_adjust(struct pci_dev *pdev,
				      struct resource *res,
				      struct resource *root)
{
	res->start += root->start;
	res->end += root->start;
}
/* Use ranges property to determine where PCI MEM, I/O, and Config
 * space are for this PCI bus module.
 */
static void pci_sun4v_determine_mem_io_space(struct pci_pbm_info *pbm)
{
	int i, saw_mem, saw_io;

	saw_mem = saw_io = 0;
	for (i = 0; i < pbm->num_pbm_ranges; i++) {
		struct linux_prom_pci_ranges *pr = &pbm->pbm_ranges[i];
		unsigned long a;
		int type;

		type = (pr->child_phys_hi >> 24) & 0x3;
		a = (((unsigned long)pr->parent_phys_hi << 32UL) |
		     ((unsigned long)pr->parent_phys_lo  <<  0UL));

		switch (type) {
		case 1:
			/* 16-bit IO space, 16MB */
			pbm->io_space.start = a;
			pbm->io_space.end = a + ((16UL*1024UL*1024UL) - 1UL);
			pbm->io_space.flags = IORESOURCE_IO;
			saw_io = 1;
			break;

		case 2:
			/* 32-bit MEM space, 2GB */
			pbm->mem_space.start = a;
			pbm->mem_space.end = a + (0x80000000UL - 1UL);
			pbm->mem_space.flags = IORESOURCE_MEM;
			saw_mem = 1;
			break;

		case 3:
			/* XXX 64-bit MEM handling XXX */

		default:
			break;
		};
	}

	if (!saw_io || !saw_mem) {
		prom_printf("%s: Fatal error, missing %s PBM range.\n",
			    pbm->name,
			    (!saw_io ? "IO" : "MEM"));
		prom_halt();
	}

	printk("%s: PCI IO[%lx] MEM[%lx]\n",
	       pbm->name,
	       pbm->io_space.start,
	       pbm->mem_space.start);
}

static void pbm_register_toplevel_resources(struct pci_controller_info *p,
					    struct pci_pbm_info *pbm)
{
	pbm->io_space.name = pbm->mem_space.name = pbm->name;

	request_resource(&ioport_resource, &pbm->io_space);
	request_resource(&iomem_resource, &pbm->mem_space);
	pci_register_legacy_regions(&pbm->io_space,
				    &pbm->mem_space);
}
static unsigned long probe_existing_entries(struct pci_pbm_info *pbm,
					    struct pci_iommu *iommu)
{
	struct pci_iommu_arena *arena = &iommu->arena;
	unsigned long i, cnt = 0;
	u32 devhandle;

	devhandle = pbm->devhandle;
	for (i = 0; i < arena->limit; i++) {
		unsigned long ret, io_attrs, ra;

		ret = pci_sun4v_iommu_getmap(devhandle,
					     HV_PCI_TSBID(0, i),
					     &io_attrs, &ra);
		if (ret == HV_EOK) {
			if (page_in_phys_avail(ra)) {
				pci_sun4v_iommu_demap(devhandle,
						      HV_PCI_TSBID(0, i), 1);
			} else {
				cnt++;
				__set_bit(i, arena->map);
			}
		}
	}

	return cnt;
}
static void pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
{
	struct pci_iommu *iommu = pbm->iommu;
	struct property *prop;
	unsigned long num_tsb_entries, sz;
	u32 vdma[2], dma_mask, dma_offset;
	int tsbsize;

	prop = of_find_property(pbm->prom_node, "virtual-dma", NULL);
	if (prop) {
		u32 *val = prop->value;

		vdma[0] = val[0];
		vdma[1] = val[1];
	} else {
		/* No property, use default values. */
		vdma[0] = 0x80000000;
		vdma[1] = 0x80000000;
	}

	dma_mask = vdma[0];
	switch (vdma[1]) {
	case 0x20000000:
		dma_mask |= 0x1fffffff;
		tsbsize = 64;
		break;

	case 0x40000000:
		dma_mask |= 0x3fffffff;
		tsbsize = 128;
		break;

	case 0x80000000:
		dma_mask |= 0x7fffffff;
		tsbsize = 256;
		break;

	default:
		prom_printf("PCI-SUN4V: strange virtual-dma size.\n");
		prom_halt();
	};

	tsbsize *= (8 * 1024);

	num_tsb_entries = tsbsize / sizeof(iopte_t);

	dma_offset = vdma[0];

	/* Setup initial software IOMMU state. */
	spin_lock_init(&iommu->lock);
	iommu->ctx_lowest_free = 1;
	iommu->page_table_map_base = dma_offset;
	iommu->dma_addr_mask = dma_mask;

	/* Allocate and initialize the free area map. */
	sz = num_tsb_entries / 8;
	sz = (sz + 7UL) & ~7UL;
	iommu->arena.map = kmalloc(sz, GFP_KERNEL);
	if (!iommu->arena.map) {
		prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n");
		prom_halt();
	}
	memset(iommu->arena.map, 0, sz);
	iommu->arena.limit = num_tsb_entries;

	sz = probe_existing_entries(pbm, iommu);
	if (sz)
		printk("%s: Imported %lu TSB entries from OBP\n",
		       pbm->name, sz);
}
static void pci_sun4v_get_bus_range(struct pci_pbm_info *pbm)
{
	struct property *prop;
	unsigned int *busrange;

	prop = of_find_property(pbm->prom_node, "bus-range", NULL);

	busrange = prop->value;

	pbm->pci_first_busno = busrange[0];
	pbm->pci_last_busno = busrange[1];
}
static void pci_sun4v_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 devhandle)
{
	struct pci_pbm_info *pbm;
	struct property *prop;
	int len, i;

	if (devhandle & 0x40)
		pbm = &p->pbm_B;
	else
		pbm = &p->pbm_A;

	pbm->parent = p;
	pbm->prom_node = dp;
	pbm->pci_first_slot = 1;

	pbm->devhandle = devhandle;

	pbm->name = dp->full_name;

	printk("%s: SUN4V PCI Bus Module\n", pbm->name);

	prop = of_find_property(dp, "ranges", &len);
	pbm->pbm_ranges = prop->value;
	pbm->num_pbm_ranges =
		(len / sizeof(struct linux_prom_pci_ranges));

	/* Mask out the top 8 bits of the ranges, leaving the real
	 * physical address.
	 */
	for (i = 0; i < pbm->num_pbm_ranges; i++)
		pbm->pbm_ranges[i].parent_phys_hi &= 0x0fffffff;

	pci_sun4v_determine_mem_io_space(pbm);
	pbm_register_toplevel_resources(p, pbm);

	prop = of_find_property(dp, "interrupt-map", &len);
	pbm->pbm_intmap = prop->value;
	pbm->num_pbm_intmap =
		(len / sizeof(struct linux_prom_pci_intmap));

	prop = of_find_property(dp, "interrupt-map-mask", NULL);
	pbm->pbm_intmask = prop->value;

	pci_sun4v_get_bus_range(pbm);
	pci_sun4v_iommu_init(pbm);

	pdev_htab_populate(pbm);
}
void sun4v_pci_init(struct device_node *dp, char *model_name)
{
	struct pci_controller_info *p;
	struct pci_iommu *iommu;
	struct property *prop;
	struct linux_prom64_registers *regs;
	u32 devhandle;
	int i;

	prop = of_find_property(dp, "reg", NULL);
	regs = prop->value;

	devhandle = (regs->phys_addr >> 32UL) & 0x0fffffff;

	for (p = pci_controller_root; p; p = p->next) {
		struct pci_pbm_info *pbm;

		if (p->pbm_A.prom_node && p->pbm_B.prom_node)
			continue;

		pbm = (p->pbm_A.prom_node ?
		       &p->pbm_A :
		       &p->pbm_B);

		if (pbm->devhandle == (devhandle ^ 0x40)) {
			pci_sun4v_pbm_init(p, dp, devhandle);
			return;
		}
	}

	for_each_possible_cpu(i) {
		unsigned long page = get_zeroed_page(GFP_ATOMIC);

		if (!page)
			goto fatal_memory_error;

		per_cpu(pci_iommu_batch, i).pglist = (u64 *) page;
	}

	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
	if (!p)
		goto fatal_memory_error;

	memset(p, 0, sizeof(*p));

	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
	if (!iommu)
		goto fatal_memory_error;

	memset(iommu, 0, sizeof(*iommu));
	p->pbm_A.iommu = iommu;

	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
	if (!iommu)
		goto fatal_memory_error;

	memset(iommu, 0, sizeof(*iommu));
	p->pbm_B.iommu = iommu;

	p->next = pci_controller_root;
	pci_controller_root = p;

	p->index = pci_num_controllers++;
	p->pbms_same_domain = 0;

	p->scan_bus = pci_sun4v_scan_bus;
	p->base_address_update = pci_sun4v_base_address_update;
	p->resource_adjust = pci_sun4v_resource_adjust;
	p->pci_ops = &pci_sun4v_ops;

	/* Like PSYCHO and SCHIZO we have a 2GB aligned area
	 * for memory space.
	 */
	pci_memspace_mask = 0x7fffffffUL;

	pci_sun4v_pbm_init(p, dp, devhandle);
	return;

fatal_memory_error:
	prom_printf("SUN4V_PCI: Fatal memory allocation error.\n");
	prom_halt();
}