ia64/linux-2.6.18-xen.hg

view arch/sparc64/kernel/pci.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation, and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen (see the sketch below).
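
A minimal sketch of the resulting behaviour (illustrative only; the real
driver's frame-list handling and page bookkeeping are elided, and
claim_pages() is a hypothetical helper, not code from this patch):

    static struct timer_list balloon_timer;

    static int increase_reservation(unsigned long nr_pages)
    {
            struct xen_memory_reservation reservation = {
                    .nr_extents   = nr_pages,
                    .extent_order = 0,
                    .domid        = DOMID_SELF
            };
            long rc;

            rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
            if (rc < 0)
                    return rc;

            /* Keep whatever Xen gave us, even on partial success... */
            claim_pages(rc);                /* hypothetical helper */

            /* ...and retry on a timer rather than latching a hard limit. */
            if (rc < nr_pages)
                    mod_timer(&balloon_timer, jiffies + HZ);

            return 0;
    }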

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Fri Jun 05 14:01:20 2009 +0100
parents  831230e53067
children (none)
/* $Id: pci.c,v 1.39 2002/01/05 01:13:43 davem Exp $
 * pci.c: UltraSparc PCI controller support.
 *
 * Copyright (C) 1997, 1998, 1999 David S. Miller (davem@redhat.com)
 * Copyright (C) 1998, 1999 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/pbm.h>
#include <asm/pgtable.h>
#include <asm/irq.h>
#include <asm/ebus.h>
#include <asm/isa.h>
#include <asm/prom.h>

unsigned long pci_memspace_mask = 0xffffffffUL;

#ifndef CONFIG_PCI
/* A "nop" PCI implementation. */
asmlinkage int sys_pciconfig_read(unsigned long bus, unsigned long dfn,
                                  unsigned long off, unsigned long len,
                                  unsigned char *buf)
{
        return 0;
}
asmlinkage int sys_pciconfig_write(unsigned long bus, unsigned long dfn,
                                   unsigned long off, unsigned long len,
                                   unsigned char *buf)
{
        return 0;
}
#else

/* List of all PCI controllers found in the system. */
struct pci_controller_info *pci_controller_root = NULL;

/* Each PCI controller found gets a unique index. */
int pci_num_controllers = 0;

volatile int pci_poke_in_progress;
volatile int pci_poke_cpu = -1;
volatile int pci_poke_faulted;

static DEFINE_SPINLOCK(pci_poke_lock);
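
/* The pci_poke_* flags above let the trap handlers recognize a
 * config-space probe: a faulting access on the poking CPU sets
 * pci_poke_faulted and is skipped rather than treated as a fatal
 * error.  (Descriptive note, added for this listing.)
 */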

void pci_config_read8(u8 *addr, u8 *ret)
{
        unsigned long flags;
        u8 byte;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "lduba [%1] %2, %0\n\t"
                             "membar #Sync"
                             : "=r" (byte)
                             : "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        if (!pci_poke_faulted)
                *ret = byte;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

void pci_config_read16(u16 *addr, u16 *ret)
{
        unsigned long flags;
        u16 word;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "lduha [%1] %2, %0\n\t"
                             "membar #Sync"
                             : "=r" (word)
                             : "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        if (!pci_poke_faulted)
                *ret = word;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

void pci_config_read32(u32 *addr, u32 *ret)
{
        unsigned long flags;
        u32 dword;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "lduwa [%1] %2, %0\n\t"
                             "membar #Sync"
                             : "=r" (dword)
                             : "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        if (!pci_poke_faulted)
                *ret = dword;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

void pci_config_write8(u8 *addr, u8 val)
{
        unsigned long flags;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "stba %0, [%1] %2\n\t"
                             "membar #Sync"
                             : /* no outputs */
                             : "r" (val), "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

void pci_config_write16(u16 *addr, u16 val)
{
        unsigned long flags;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "stha %0, [%1] %2\n\t"
                             "membar #Sync"
                             : /* no outputs */
                             : "r" (val), "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

void pci_config_write32(u32 *addr, u32 val)
{
        unsigned long flags;

        spin_lock_irqsave(&pci_poke_lock, flags);
        pci_poke_cpu = smp_processor_id();
        pci_poke_in_progress = 1;
        pci_poke_faulted = 0;
        __asm__ __volatile__("membar #Sync\n\t"
                             "stwa %0, [%1] %2\n\t"
                             "membar #Sync"
                             : /* no outputs */
                             : "r" (val), "r" (addr), "i" (ASI_PHYS_BYPASS_EC_E_L)
                             : "memory");
        pci_poke_in_progress = 0;
        pci_poke_cpu = -1;
        spin_unlock_irqrestore(&pci_poke_lock, flags);
}

/* Probe for all PCI controllers in the system. */
extern void sabre_init(struct device_node *, const char *);
extern void psycho_init(struct device_node *, const char *);
extern void schizo_init(struct device_node *, const char *);
extern void schizo_plus_init(struct device_node *, const char *);
extern void tomatillo_init(struct device_node *, const char *);
extern void sun4v_pci_init(struct device_node *, const char *);

static struct {
        char *model_name;
        void (*init)(struct device_node *, const char *);
} pci_controller_table[] __initdata = {
        { "SUNW,sabre", sabre_init },
        { "pci108e,a000", sabre_init },
        { "pci108e,a001", sabre_init },
        { "SUNW,psycho", psycho_init },
        { "pci108e,8000", psycho_init },
        { "SUNW,schizo", schizo_init },
        { "pci108e,8001", schizo_init },
        { "SUNW,schizo+", schizo_plus_init },
        { "pci108e,8002", schizo_plus_init },
        { "SUNW,tomatillo", tomatillo_init },
        { "pci108e,a801", tomatillo_init },
        { "SUNW,sun4v-pci", sun4v_pci_init },
};
#define PCI_NUM_CONTROLLER_TYPES (sizeof(pci_controller_table) / \
                                  sizeof(pci_controller_table[0]))

static int __init pci_controller_init(const char *model_name, int namelen, struct device_node *dp)
{
        int i;

        for (i = 0; i < PCI_NUM_CONTROLLER_TYPES; i++) {
                if (!strncmp(model_name,
                             pci_controller_table[i].model_name,
                             namelen)) {
                        pci_controller_table[i].init(dp, model_name);
                        return 1;
                }
        }

        return 0;
}

static int __init pci_is_controller(const char *model_name, int namelen, struct device_node *dp)
{
        int i;

        for (i = 0; i < PCI_NUM_CONTROLLER_TYPES; i++) {
                if (!strncmp(model_name,
                             pci_controller_table[i].model_name,
                             namelen)) {
                        return 1;
                }
        }
        return 0;
}

static int __init pci_controller_scan(int (*handler)(const char *, int, struct device_node *))
{
        struct device_node *dp;
        int count = 0;

        for_each_node_by_name(dp, "pci") {
                struct property *prop;
                int len;

                prop = of_find_property(dp, "model", &len);
                if (!prop)
                        prop = of_find_property(dp, "compatible", &len);

                if (prop) {
                        const char *model = prop->value;
                        int item_len = 0;

                        /* Our value may be a multi-valued string in the
                         * case of some compatible properties. For sanity,
                         * only try the first one.
                         */
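                        /* (For example, a "compatible" property may hold
                         * several NUL-separated strings such as
                         * "pci108e,8001\0pciclass,060000"; the example
                         * value is illustrative.)
                         */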
                        while (model[item_len] && len) {
                                len--;
                                item_len++;
                        }

                        if (handler(model, item_len, dp))
                                count++;
                }
        }

        return count;
}

/* Is there some PCI controller in the system? */
int __init pcic_present(void)
{
        return pci_controller_scan(pci_is_controller);
}

struct pci_iommu_ops *pci_iommu_ops;
EXPORT_SYMBOL(pci_iommu_ops);

extern struct pci_iommu_ops pci_sun4u_iommu_ops,
                            pci_sun4v_iommu_ops;

/* Find each controller in the system, attach and initialize
 * software state structure for each and link into the
 * pci_controller_root. Setup the controller enough such
 * that bus scanning can be done.
 */
static void __init pci_controller_probe(void)
{
        if (tlb_type == hypervisor)
                pci_iommu_ops = &pci_sun4v_iommu_ops;
        else
                pci_iommu_ops = &pci_sun4u_iommu_ops;

        printk("PCI: Probing for controllers.\n");

        pci_controller_scan(pci_controller_init);
}

static void __init pci_scan_each_controller_bus(void)
{
        struct pci_controller_info *p;

        for (p = pci_controller_root; p; p = p->next)
                p->scan_bus(p);
}

extern void power_init(void);

static int __init pcibios_init(void)
{
        pci_controller_probe();
        if (pci_controller_root == NULL)
                return 0;

        pci_scan_each_controller_bus();

        isa_init();
        ebus_init();
        power_init();

        return 0;
}

subsys_initcall(pcibios_init);

void pcibios_fixup_bus(struct pci_bus *pbus)
{
        struct pci_pbm_info *pbm = pbus->sysdata;

        /* Generic PCI bus probing sets these to point at
         * &io{port,mem}_resource which is wrong for us.
         */
        pbus->resource[0] = &pbm->io_space;
        pbus->resource[1] = &pbm->mem_space;
}

struct resource *pcibios_select_root(struct pci_dev *pdev, struct resource *r)
{
        struct pci_pbm_info *pbm = pdev->bus->sysdata;
        struct resource *root = NULL;

        if (r->flags & IORESOURCE_IO)
                root = &pbm->io_space;
        if (r->flags & IORESOURCE_MEM)
                root = &pbm->mem_space;

        return root;
}

void pcibios_update_irq(struct pci_dev *pdev, int irq)
{
}

void pcibios_align_resource(void *data, struct resource *res,
                            resource_size_t size, resource_size_t align)
{
}

int pcibios_enable_device(struct pci_dev *pdev, int mask)
{
        return 0;
}

void pcibios_resource_to_bus(struct pci_dev *pdev, struct pci_bus_region *region,
                             struct resource *res)
{
        struct pci_pbm_info *pbm = pdev->bus->sysdata;
        struct resource zero_res, *root;

        zero_res.start = 0;
        zero_res.end = 0;
        zero_res.flags = res->flags;

        if (res->flags & IORESOURCE_IO)
                root = &pbm->io_space;
        else
                root = &pbm->mem_space;

        pbm->parent->resource_adjust(pdev, &zero_res, root);

        region->start = res->start - zero_res.start;
        region->end = res->end - zero_res.start;
}
EXPORT_SYMBOL(pcibios_resource_to_bus);

void pcibios_bus_to_resource(struct pci_dev *pdev, struct resource *res,
                             struct pci_bus_region *region)
{
        struct pci_pbm_info *pbm = pdev->bus->sysdata;
        struct resource *root;

        res->start = region->start;
        res->end = region->end;

        if (res->flags & IORESOURCE_IO)
                root = &pbm->io_space;
        else
                root = &pbm->mem_space;

        pbm->parent->resource_adjust(pdev, res, root);
}
EXPORT_SYMBOL(pcibios_bus_to_resource);

char * __init pcibios_setup(char *str)
{
        return str;
}

/* Platform support for /proc/bus/pci/X/Y mmap()s. */

/* If the user uses a host-bridge as the PCI device, he may use
 * this to perform a raw mmap() of the I/O or MEM space behind
 * that controller.
 *
 * This can be useful for execution of x86 PCI bios initialization code
 * on a PCI card, like the xfree86 int10 stuff does.
 */
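/* For instance (illustrative only; the device path and length are
 * hypothetical, and the ioctl reflects typical /proc/bus/pci usage
 * rather than anything defined in this file):
 *
 *      fd = open("/proc/bus/pci/00/00.0", O_RDWR);
 *      ioctl(fd, PCIIOC_MMAP_IS_MEM);
 *      p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */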
static int __pci_mmap_make_offset_bus(struct pci_dev *pdev, struct vm_area_struct *vma,
                                      enum pci_mmap_state mmap_state)
{
        struct pcidev_cookie *pcp = pdev->sysdata;
        struct pci_pbm_info *pbm;
        struct pci_controller_info *p;
        unsigned long space_size, user_offset, user_size;

        if (!pcp)
                return -ENXIO;
        pbm = pcp->pbm;
        if (!pbm)
                return -ENXIO;

        p = pbm->parent;
        if (p->pbms_same_domain) {
                unsigned long lowest, highest;

                lowest = ~0UL; highest = 0UL;
                if (mmap_state == pci_mmap_io) {
                        if (p->pbm_A.io_space.flags) {
                                lowest = p->pbm_A.io_space.start;
                                highest = p->pbm_A.io_space.end + 1;
                        }
                        if (p->pbm_B.io_space.flags) {
                                if (lowest > p->pbm_B.io_space.start)
                                        lowest = p->pbm_B.io_space.start;
                                if (highest < p->pbm_B.io_space.end + 1)
                                        highest = p->pbm_B.io_space.end + 1;
                        }
                        space_size = highest - lowest;
                } else {
                        if (p->pbm_A.mem_space.flags) {
                                lowest = p->pbm_A.mem_space.start;
                                highest = p->pbm_A.mem_space.end + 1;
                        }
                        if (p->pbm_B.mem_space.flags) {
                                if (lowest > p->pbm_B.mem_space.start)
                                        lowest = p->pbm_B.mem_space.start;
                                if (highest < p->pbm_B.mem_space.end + 1)
                                        highest = p->pbm_B.mem_space.end + 1;
                        }
                        space_size = highest - lowest;
                }
        } else {
                if (mmap_state == pci_mmap_io) {
                        space_size = (pbm->io_space.end -
                                      pbm->io_space.start) + 1;
                } else {
                        space_size = (pbm->mem_space.end -
                                      pbm->mem_space.start) + 1;
                }
        }

        /* Make sure the request is in range. */
        user_offset = vma->vm_pgoff << PAGE_SHIFT;
        user_size = vma->vm_end - vma->vm_start;

        if (user_offset >= space_size ||
            (user_offset + user_size) > space_size)
                return -EINVAL;

        if (p->pbms_same_domain) {
                unsigned long lowest = ~0UL;

                if (mmap_state == pci_mmap_io) {
                        if (p->pbm_A.io_space.flags)
                                lowest = p->pbm_A.io_space.start;
                        if (p->pbm_B.io_space.flags &&
                            lowest > p->pbm_B.io_space.start)
                                lowest = p->pbm_B.io_space.start;
                } else {
                        if (p->pbm_A.mem_space.flags)
                                lowest = p->pbm_A.mem_space.start;
                        if (p->pbm_B.mem_space.flags &&
                            lowest > p->pbm_B.mem_space.start)
                                lowest = p->pbm_B.mem_space.start;
                }
                vma->vm_pgoff = (lowest + user_offset) >> PAGE_SHIFT;
        } else {
                if (mmap_state == pci_mmap_io) {
                        vma->vm_pgoff = (pbm->io_space.start +
                                         user_offset) >> PAGE_SHIFT;
                } else {
                        vma->vm_pgoff = (pbm->mem_space.start +
                                         user_offset) >> PAGE_SHIFT;
                }
        }

        return 0;
}

/* Adjust vm_pgoff of VMA such that it is the physical page offset corresponding
 * to the 32-bit pci bus offset for DEV requested by the user.
 *
 * Basically, the user finds the base address for his device which he wishes
 * to mmap. They read the 32-bit value from the config space base register,
 * add whatever PAGE_SIZE multiple offset they wish, and feed this into the
 * offset parameter of mmap on /proc/bus/pci/XXX for that device.
 *
 * Returns negative error code on failure, zero on success.
 */
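/* (Worked example with hypothetical values: if a BAR reads back
 * 0x02000000 and the user mmap()s /proc/bus/pci/XXX at offset
 * 0x02000000, the loop below matches that resource and rewrites
 * vm_pgoff to the full physical address of the BAR plus the
 * requested offset.)
 */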
static int __pci_mmap_make_offset(struct pci_dev *dev, struct vm_area_struct *vma,
                                  enum pci_mmap_state mmap_state)
{
        unsigned long user_offset = vma->vm_pgoff << PAGE_SHIFT;
        unsigned long user32 = user_offset & pci_memspace_mask;
        unsigned long largest_base, this_base, addr32;
        int i;

        if ((dev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
                return __pci_mmap_make_offset_bus(dev, vma, mmap_state);

        /* Figure out which base address this is for. */
        largest_base = 0UL;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                struct resource *rp = &dev->resource[i];

                /* Active? */
                if (!rp->flags)
                        continue;

                /* Same type? */
                if (i == PCI_ROM_RESOURCE) {
                        if (mmap_state != pci_mmap_mem)
                                continue;
                } else {
                        if ((mmap_state == pci_mmap_io &&
                             (rp->flags & IORESOURCE_IO) == 0) ||
                            (mmap_state == pci_mmap_mem &&
                             (rp->flags & IORESOURCE_MEM) == 0))
                                continue;
                }

                this_base = rp->start;

                addr32 = (this_base & PAGE_MASK) & pci_memspace_mask;

                if (mmap_state == pci_mmap_io)
                        addr32 &= 0xffffff;

                if (addr32 <= user32 && this_base > largest_base)
                        largest_base = this_base;
        }

        if (largest_base == 0UL)
                return -EINVAL;

        /* Now construct the final physical address. */
        if (mmap_state == pci_mmap_io)
                vma->vm_pgoff = (((largest_base & ~0xffffffUL) | user32) >> PAGE_SHIFT);
        else
                vma->vm_pgoff = (((largest_base & ~(pci_memspace_mask)) | user32) >> PAGE_SHIFT);

        return 0;
}

/* Set vm_flags of VMA, as appropriate for this architecture, for a pci device
 * mapping.
 */
static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma,
                                 enum pci_mmap_state mmap_state)
{
        vma->vm_flags |= (VM_IO | VM_RESERVED);
}

/* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
 * device mapping.
 */
static void __pci_mmap_set_pgprot(struct pci_dev *dev, struct vm_area_struct *vma,
                                  enum pci_mmap_state mmap_state)
{
        /* Our io_remap_pfn_range takes care of this, do nothing. */
}

/* Perform the actual remap of the pages for a PCI device mapping, as appropriate
 * for this architecture. The region in the process to map is described by vm_start
 * and vm_end members of VMA, the base physical address is found in vm_pgoff.
 * The pci device structure is provided so that architectures may make mapping
 * decisions on a per-device or per-bus basis.
 *
 * Returns a negative error code on failure, zero on success.
 */
int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                        enum pci_mmap_state mmap_state,
                        int write_combine)
{
        int ret;

        ret = __pci_mmap_make_offset(dev, vma, mmap_state);
        if (ret < 0)
                return ret;

        __pci_mmap_set_flags(dev, vma, mmap_state);
        __pci_mmap_set_pgprot(dev, vma, mmap_state);

        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        ret = io_remap_pfn_range(vma, vma->vm_start,
                                 vma->vm_pgoff,
                                 vma->vm_end - vma->vm_start,
                                 vma->vm_page_prot);
        if (ret)
                return ret;

        return 0;
}

/* Return the domain number for this pci bus */
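/* (Worked example: with two PBMs in separate domains on controller
 * index 1, pbm_A maps to domain 2 and pbm_B to domain 3, via
 * (index << 1) + (pbm == &pbm_B) in the function below.)
 */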

int pci_domain_nr(struct pci_bus *pbus)
{
        struct pci_pbm_info *pbm = pbus->sysdata;
        int ret;

        if (pbm == NULL || pbm->parent == NULL) {
                ret = -ENXIO;
        } else {
                struct pci_controller_info *p = pbm->parent;

                ret = p->index;
                if (p->pbms_same_domain == 0)
                        ret = ((ret << 1) +
                               ((pbm == &pbm->parent->pbm_B) ? 1 : 0));
        }

        return ret;
}
EXPORT_SYMBOL(pci_domain_nr);

int pcibios_prep_mwi(struct pci_dev *dev)
{
        /* We set correct PCI_CACHE_LINE_SIZE register values for every
         * device probed on this platform. So there is nothing to check
         * and this always succeeds.
         */
        return 0;
}

#endif /* !(CONFIG_PCI) */