ia64/linux-2.6.18-xen.hg

view arch/alpha/kernel/core_apecs.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * linux/arch/alpha/kernel/core_apecs.c
3 *
4 * Rewritten for Apecs from the lca.c from:
5 *
6 * Written by David Mosberger (davidm@cs.arizona.edu) with some code
7 * taken from Dave Rusling's (david.rusling@reo.mts.dec.com) 32-bit
8 * bios code.
9 *
10 * Code common to all APECS core logic chips.
11 */
13 #define __EXTERN_INLINE inline
14 #include <asm/io.h>
15 #include <asm/core_apecs.h>
16 #undef __EXTERN_INLINE
18 #include <linux/types.h>
19 #include <linux/pci.h>
20 #include <linux/init.h>
22 #include <asm/ptrace.h>
23 #include <asm/smp.h>
25 #include "proto.h"
26 #include "pci_impl.h"
28 /*
29 * NOTE: Herein lie back-to-back mb instructions. They are magic.
30 * One plausible explanation is that the i/o controller does not properly
31 * handle the system transaction. Another involves timing. Ho hum.
32 */
34 /*
35 * BIOS32-style PCI interface:
36 */
/*
 * Debug tracing switch for the configuration-space routines.  With
 * DEBUG_CONFIG non-zero, DBGC(args) expands to "printk args"; with the
 * default of 0 it expands to nothing.  Callers therefore pass the whole
 * printk argument list wrapped in an extra pair of parentheses.
 */
38 #define DEBUG_CONFIG 0
40 #if DEBUG_CONFIG
41 # define DBGC(args) printk args
42 #else
43 # define DBGC(args)
44 #endif
/*
 * Cast applied to register and configuration-space addresses in this
 * file before they are dereferenced.
 */
46 #define vuip volatile unsigned int *
48 /*
49 * Given a bus, device, and function number, compute resulting
50 * configuration space address and setup the APECS_HAXR2 register
51 * accordingly. It is therefore not safe to have concurrent
52 * invocations to configuration space access routines, but there
53 * really shouldn't be any need for this.
54 *
55 * Type 0:
56 *
57 * 3 3|3 3 2 2|2 2 2 2|2 2 2 2|1 1 1 1|1 1 1 1|1 1
58 * 3 2|1 0 9 8|7 6 5 4|3 2 1 0|9 8 7 6|5 4 3 2|1 0 9 8|7 6 5 4|3 2 1 0
59 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
60 * | | | | | | | | | | | | | | | | | | | | | | | |F|F|F|R|R|R|R|R|R|0|0|
61 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 *
63 * 31:11 Device select bit.
64 * 10:8 Function number
65 * 7:2 Register number
66 *
67 * Type 1:
68 *
69 * 3 3|3 3 2 2|2 2 2 2|2 2 2 2|1 1 1 1|1 1 1 1|1 1
70 * 3 2|1 0 9 8|7 6 5 4|3 2 1 0|9 8 7 6|5 4 3 2|1 0 9 8|7 6 5 4|3 2 1 0
71 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
72 * | | | | | | | | | | |B|B|B|B|B|B|B|B|D|D|D|D|D|F|F|F|R|R|R|R|R|R|0|1|
73 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
74 *
75 * 31:24 reserved
76 * 23:16 bus number (8 bits = 256 possible buses)
77 * 15:11 Device number (5 bits)
78 * 10:8 function number
79 * 7:2 register number
80 *
81 * Notes:
82 * The function number selects which function of a multi-function device
83 * (e.g., SCSI and Ethernet).
84 *
85 * The register selects a DWORD (32 bit) register offset. Hence it
86 * doesn't get shifted by 2 bits as we want to "drop" the bottom two
87 * bits.
88 */
90 static int
91 mk_conf_addr(struct pci_bus *pbus, unsigned int device_fn, int where,
92 unsigned long *pci_addr, unsigned char *type1)
93 {
94 unsigned long addr;
95 u8 bus = pbus->number;
97 DBGC(("mk_conf_addr(bus=%d ,device_fn=0x%x, where=0x%x,"
98 " pci_addr=0x%p, type1=0x%p)\n",
99 bus, device_fn, where, pci_addr, type1));
101 if (bus == 0) {
102 int device = device_fn >> 3;
104 /* type 0 configuration cycle: */
106 if (device > 20) {
107 DBGC(("mk_conf_addr: device (%d) > 20, returning -1\n",
108 device));
109 return -1;
110 }
112 *type1 = 0;
113 addr = (device_fn << 8) | (where);
114 } else {
115 /* type 1 configuration cycle: */
116 *type1 = 1;
117 addr = (bus << 16) | (device_fn << 8) | (where);
118 }
119 *pci_addr = addr;
120 DBGC(("mk_conf_addr: returning pci_addr 0x%lx\n", addr));
121 return 0;
122 }
/*
 * Issue a single "ldl" load from configuration space and return the result.
 *
 * addr:  address to load from; it is dereferenced directly through vuip.
 * type1: when non-zero, bit 0 of APECS_IOC_HAXR2 is set before the load
 *        and HAXR2 is rewritten with bit 0 clear afterwards.
 *
 * Returns the loaded value, or 0xffffffff if mcheck_taken(0) was set by
 * the load or if APECS_IOC_DCSR has any bit of mask 0xffe0 set afterwards.
 * The whole sequence runs between local_irq_save() and
 * local_irq_restore().
 */
124 static unsigned int
125 conf_read(unsigned long addr, unsigned char type1)
126 {
127 unsigned long flags;
128 unsigned int stat0, value;
129 unsigned int haxr2 = 0;
131 local_irq_save(flags); /* avoid getting hit by machine check */
133 DBGC(("conf_read(addr=0x%lx, type1=%d)\n", addr, type1));
135 /* Reset status register to avoid losing errors. */
/* The value just read is written straight back (presumably the error
 * bits are write-one-to-clear -- confirm against the chipset manual). */
136 stat0 = *(vuip)APECS_IOC_DCSR;
137 *(vuip)APECS_IOC_DCSR = stat0;
138 mb();
139 DBGC(("conf_read: APECS DCSR was 0x%x\n", stat0));
141 /* If Type1 access, must set HAE #2. */
142 if (type1) {
/* Keep the old HAXR2 contents; they are reused on the way out. */
143 haxr2 = *(vuip)APECS_IOC_HAXR2;
144 mb();
145 *(vuip)APECS_IOC_HAXR2 = haxr2 | 1;
146 DBGC(("conf_read: TYPE1 access\n"));
147 }
/* Mark a machine check as expected and clear the "taken" flag before
 * touching configuration space (index 0 is presumably the CPU number
 * -- confirm). */
149 draina();
150 mcheck_expected(0) = 1;
151 mcheck_taken(0) = 0;
152 mb();
154 /* Access configuration space. */
156 /* Some SRMs step on these registers during a machine check. */
/* Hence $9-$14 are listed as clobbered; the load is followed by the
 * two back-to-back mb instructions mentioned at the top of the file. */
157 asm volatile("ldl %0,%1; mb; mb" : "=r"(value) : "m"(*(vuip)addr)
158 : "$9", "$10", "$11", "$12", "$13", "$14", "memory");
/* A machine check during the load turns the result into all ones. */
160 if (mcheck_taken(0)) {
161 mcheck_taken(0) = 0;
162 value = 0xffffffffU;
163 mb();
164 }
165 mcheck_expected(0) = 0;
166 mb();
168 #if 1
169 /*
170 * david.rusling@reo.mts.dec.com. This code is needed for the
171 * EB64+ as it does not generate a machine check (why I don't
172 * know). When we build kernels for one particular platform
173 * then we can make this conditional on the type.
174 */
175 draina();
177 /* Now look for any errors. */
178 stat0 = *(vuip)APECS_IOC_DCSR;
179 DBGC(("conf_read: APECS DCSR after read 0x%x\n", stat0));
181 /* Is any error bit set? */
182 if (stat0 & 0xffe0U) {
183 /* If not NDEV, print status. */
184 if (!(stat0 & 0x0800)) {
185 printk("apecs.c:conf_read: got stat0=%x\n", stat0);
186 }
188 /* Reset error status. */
189 *(vuip)APECS_IOC_DCSR = stat0;
190 mb();
191 wrmces(0x7); /* reset machine check */
192 value = 0xffffffff;
193 }
194 #endif
196 /* If Type1 access, must reset HAE #2 so normal IO space ops work. */
/* Note: this writes the saved value with bit 0 cleared, whether or not
 * bit 0 was set on entry. */
197 if (type1) {
198 *(vuip)APECS_IOC_HAXR2 = haxr2 & ~1;
199 mb();
200 }
201 local_irq_restore(flags);
203 return value;
204 }
/*
 * Store one value to configuration space.
 *
 * addr:  address to store to; it is dereferenced directly through vuip.
 * value: value to store.
 * type1: when non-zero, bit 0 of APECS_IOC_HAXR2 is set before the store
 *        and HAXR2 is rewritten with bit 0 clear afterwards.
 *
 * Nothing is returned: an error found in APECS_IOC_DCSR afterwards is
 * cleared (and printed unless the 0x0800 bit is set) but not reported to
 * the caller.  Unlike conf_read(), mcheck_taken(0) is neither cleared nor
 * examined here.  The whole sequence runs between local_irq_save() and
 * local_irq_restore().
 */
206 static void
207 conf_write(unsigned long addr, unsigned int value, unsigned char type1)
208 {
209 unsigned long flags;
210 unsigned int stat0;
211 unsigned int haxr2 = 0;
213 local_irq_save(flags); /* avoid getting hit by machine check */
215 /* Reset status register to avoid losing errors. */
/* The value just read is written straight back (presumably the error
 * bits are write-one-to-clear -- confirm against the chipset manual). */
216 stat0 = *(vuip)APECS_IOC_DCSR;
217 *(vuip)APECS_IOC_DCSR = stat0;
218 mb();
220 /* If Type1 access, must set HAE #2. */
221 if (type1) {
/* Keep the old HAXR2 contents; they are reused on the way out. */
222 haxr2 = *(vuip)APECS_IOC_HAXR2;
223 mb();
224 *(vuip)APECS_IOC_HAXR2 = haxr2 | 1;
225 }
/* Mark a machine check as expected for the duration of the store. */
227 draina();
228 mcheck_expected(0) = 1;
229 mb();
231 /* Access configuration space. */
232 *(vuip)addr = value;
233 mb();
234 mb(); /* magic */
235 mcheck_expected(0) = 0;
236 mb();
238 #if 1
239 /*
240 * david.rusling@reo.mts.dec.com. This code is needed for the
241 * EB64+ as it does not generate a machine check (why I don't
242 * know). When we build kernels for one particular platform
243 * then we can make this conditional on the type.
244 */
245 draina();
247 /* Now look for any errors. */
248 stat0 = *(vuip)APECS_IOC_DCSR;
250 /* Is any error bit set? */
251 if (stat0 & 0xffe0U) {
252 /* If not NDEV, print status. */
253 if (!(stat0 & 0x0800)) {
254 printk("apecs.c:conf_write: got stat0=%x\n", stat0);
255 }
257 /* Reset error status. */
258 *(vuip)APECS_IOC_DCSR = stat0;
259 mb();
260 wrmces(0x7); /* reset machine check */
261 }
262 #endif
264 /* If Type1 access, must reset HAE #2 so normal IO space ops work. */
/* Note: this writes the saved value with bit 0 cleared, whether or not
 * bit 0 was set on entry. */
265 if (type1) {
266 *(vuip)APECS_IOC_HAXR2 = haxr2 & ~1;
267 mb();
268 }
269 local_irq_restore(flags);
270 }
272 static int
273 apecs_read_config(struct pci_bus *bus, unsigned int devfn, int where,
274 int size, u32 *value)
275 {
276 unsigned long addr, pci_addr;
277 unsigned char type1;
278 long mask;
279 int shift;
281 if (mk_conf_addr(bus, devfn, where, &pci_addr, &type1))
282 return PCIBIOS_DEVICE_NOT_FOUND;
284 mask = (size - 1) * 8;
285 shift = (where & 3) * 8;
286 addr = (pci_addr << 5) + mask + APECS_CONF;
287 *value = conf_read(addr, type1) >> (shift);
288 return PCIBIOS_SUCCESSFUL;
289 }
291 static int
292 apecs_write_config(struct pci_bus *bus, unsigned int devfn, int where,
293 int size, u32 value)
294 {
295 unsigned long addr, pci_addr;
296 unsigned char type1;
297 long mask;
299 if (mk_conf_addr(bus, devfn, where, &pci_addr, &type1))
300 return PCIBIOS_DEVICE_NOT_FOUND;
302 mask = (size - 1) * 8;
303 addr = (pci_addr << 5) + mask + APECS_CONF;
304 conf_write(addr, value << ((where & 3) * 8), type1);
305 return PCIBIOS_SUCCESSFUL;
306 }
/*
 * Configuration-space operations table for APECS: routes the read and
 * write hooks to apecs_read_config() and apecs_write_config().
 */
308 struct pci_ops apecs_pci_ops =
309 {
310 .read = apecs_read_config,
311 .write = apecs_write_config,
312 };
314 void
315 apecs_pci_tbi(struct pci_controller *hose, dma_addr_t start, dma_addr_t end)
316 {
317 wmb();
318 *(vip)APECS_IOC_TBIA = 0;
319 mb();
320 }
/*
 * One-time APECS setup: allocate the single PCI controller ("hose"),
 * record its address-space bases, program the two PCI-to-memory
 * translation windows, flush the translation buffer via apecs_pci_tbi()
 * and clear APECS_IOC_HAXR2.
 */
322 void __init
323 apecs_init_arch(void)
324 {
325 struct pci_controller *hose;
327 /*
328 * Create our single hose.
329 */
/* NOTE(review): the result of alloc_pci_controller() is used unchecked;
 * presumably the allocator cannot fail at this point -- confirm. */
331 pci_isa_hose = hose = alloc_pci_controller();
332 hose->io_space = &ioport_resource;
333 hose->mem_space = &iomem_resource;
334 hose->index = 0;
/* Bases are stored relative to IDENT_ADDR; there is no dense I/O space. */
336 hose->sparse_mem_base = APECS_SPARSE_MEM - IDENT_ADDR;
337 hose->dense_mem_base = APECS_DENSE_MEM - IDENT_ADDR;
338 hose->sparse_io_base = APECS_IO - IDENT_ADDR;
339 hose->dense_io_base = 0;
341 /*
342 * Set up the PCI to main memory translation windows.
343 *
344 * Window 1 is direct access 1GB at 1GB
345 * Window 2 is scatter-gather 8MB at 8MB (for isa)
346 */
347 hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0);
348 hose->sg_pci = NULL;
349 __direct_map_base = 0x40000000;
350 __direct_map_size = 0x40000000;
/* Window 1: base, mask and translated base.  The 0x00080000 OR-ed into
 * the base is presumably the window-enable bit -- confirm. */
352 *(vuip)APECS_IOC_PB1R = __direct_map_base | 0x00080000;
353 *(vuip)APECS_IOC_PM1R = (__direct_map_size - 1) & 0xfff00000U;
354 *(vuip)APECS_IOC_TB1R = 0;
/* Window 2: taken from the ISA arena created above.  The 0x000c0000 is
 * presumably enable plus scatter-gather -- confirm.  The translated base
 * is the physical address of the arena's PTEs shifted right by one. */
356 *(vuip)APECS_IOC_PB2R = hose->sg_isa->dma_base | 0x000c0000;
357 *(vuip)APECS_IOC_PM2R = (hose->sg_isa->size - 1) & 0xfff00000;
358 *(vuip)APECS_IOC_TB2R = virt_to_phys(hose->sg_isa->ptes) >> 1;
360 apecs_pci_tbi(hose, 0, -1);
362 /*
363 * Finally, clear the HAXR2 register, which gets used
364 * for PCI Config Space accesses. That is the way
365 * we want to use it, and we do not want to depend on
366 * what ARC or SRM might have left behind...
367 */
368 *(vuip)APECS_IOC_HAXR2 = 0;
369 mb();
370 }
/*
 * Clear any pending error state in the chipset.
 *
 * If APECS_IOC_DCSR has any bit of mask 0xffe0 set, APECS_IOC_SEAR is
 * read (value discarded), DCSR is rewritten with the bits of 0xffe1
 * OR-ed in, and DCSR is read back.  APECS_IOC_TBIA is then written and
 * read back unconditionally.
 */
372 void
373 apecs_pci_clr_err(void)
374 {
375 unsigned int jd;
377 jd = *(vuip)APECS_IOC_DCSR;
378 if (jd & 0xffe0L) {
/* Bare dereferences below are volatile reads whose result is dropped. */
379 *(vuip)APECS_IOC_SEAR;
380 *(vuip)APECS_IOC_DCSR = jd | 0xffe1L;
381 mb();
382 *(vuip)APECS_IOC_DCSR;
383 }
/* The value written is the register's own address truncated to
 * unsigned int; presumably the data written is irrelevant -- confirm. */
384 *(vuip)APECS_IOC_TBIA = (unsigned int)APECS_IOC_TBIA;
385 mb();
386 *(vuip)APECS_IOC_TBIA;
387 }
389 void
390 apecs_machine_check(unsigned long vector, unsigned long la_ptr,
391 struct pt_regs * regs)
392 {
393 struct el_common *mchk_header;
394 struct el_apecs_procdata *mchk_procdata;
395 struct el_apecs_sysdata_mcheck *mchk_sysdata;
397 mchk_header = (struct el_common *)la_ptr;
399 mchk_procdata = (struct el_apecs_procdata *)
400 (la_ptr + mchk_header->proc_offset
401 - sizeof(mchk_procdata->paltemp));
403 mchk_sysdata = (struct el_apecs_sysdata_mcheck *)
404 (la_ptr + mchk_header->sys_offset);
407 /* Clear the error before any reporting. */
408 mb();
409 mb(); /* magic */
410 draina();
411 apecs_pci_clr_err();
412 wrmces(0x7); /* reset machine check pending flag */
413 mb();
415 process_mcheck_info(vector, la_ptr, regs, "APECS",
416 (mcheck_expected(0)
417 && (mchk_sysdata->epic_dcsr & 0x0c00UL)));
418 }