ia64/linux-2.6.18-xen.hg

view arch/sparc64/kernel/smp.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the
limit it reached as the "hard limit". The driver will not try again
until the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to
ask a domain to balloon to more than its allocation, nor would you
expect it to deliberately over-commit memory by setting balloon
targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.
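
The change amounts to the simple policy sketched below. This is plain C
for illustration only, not the driver code: try_increase_reservation()
is a hypothetical stand-in for the real populate-physmap hypercall path,
and the loop stands in for the driver's timer-driven worker. Partial
grants are kept, and any shortfall just schedules another attempt
rather than being recorded as a hard limit.

/* Minimal sketch of the retry policy described above -- not the actual
 * balloon driver code.  try_increase_reservation() is a hypothetical
 * stand-in for the real hypercall path.
 */
#include <stdio.h>

#define RETRY_DELAY_MS 1000

static long current_pages = 4096;	/* pages currently owned by the guest */
static long target_pages  = 8192;	/* target set by the toolstack */

/* Pretend hypercall: returns how many pages the host actually granted,
 * which may be fewer than requested under temporary memory pressure. */
static long try_increase_reservation(long nr_requested)
{
	return nr_requested < 1024 ? nr_requested : 1024;
}

static void balloon_worker(void)
{
	while (current_pages < target_pages) {
		long want = target_pages - current_pages;
		long got = try_increase_reservation(want);

		/* Keep whatever we were given, even on partial success. */
		current_pages += got;

		if (current_pages < target_pages) {
			/* No "hard limit": back off and retry later, in the
			 * same manner as when decreasing the reservation.
			 * In the driver this would be a timer; here we
			 * simply loop. */
			printf("got %ld of %ld pages, retrying in %d ms\n",
			       got, want, RETRY_DELAY_MS);
		}
	}
}

int main(void)
{
	balloon_worker();
	printf("reached target of %ld pages\n", target_pages);
	return 0;
}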

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /* smp.c: Sparc64 SMP support.
2 *
3 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
4 */
6 #include <linux/module.h>
7 #include <linux/kernel.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/pagemap.h>
11 #include <linux/threads.h>
12 #include <linux/smp.h>
13 #include <linux/smp_lock.h>
14 #include <linux/interrupt.h>
15 #include <linux/kernel_stat.h>
16 #include <linux/delay.h>
17 #include <linux/init.h>
18 #include <linux/spinlock.h>
19 #include <linux/fs.h>
20 #include <linux/seq_file.h>
21 #include <linux/cache.h>
22 #include <linux/jiffies.h>
23 #include <linux/profile.h>
24 #include <linux/bootmem.h>
26 #include <asm/head.h>
27 #include <asm/ptrace.h>
28 #include <asm/atomic.h>
29 #include <asm/tlbflush.h>
30 #include <asm/mmu_context.h>
31 #include <asm/cpudata.h>
33 #include <asm/irq.h>
34 #include <asm/page.h>
35 #include <asm/pgtable.h>
36 #include <asm/oplib.h>
37 #include <asm/uaccess.h>
38 #include <asm/timer.h>
39 #include <asm/starfire.h>
40 #include <asm/tlb.h>
41 #include <asm/sections.h>
42 #include <asm/prom.h>
44 extern void calibrate_delay(void);
46 /* Please don't make this stuff initdata!!! --DaveM */
47 static unsigned char boot_cpu_id;
49 cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
50 cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE;
51 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
52 { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
53 static cpumask_t smp_commenced_mask;
54 static cpumask_t cpu_callout_map;
56 void smp_info(struct seq_file *m)
57 {
58 int i;
60 seq_printf(m, "State:\n");
61 for_each_online_cpu(i)
62 seq_printf(m, "CPU%d:\t\tonline\n", i);
63 }
65 void smp_bogo(struct seq_file *m)
66 {
67 int i;
69 for_each_online_cpu(i)
70 seq_printf(m,
71 "Cpu%dBogo\t: %lu.%02lu\n"
72 "Cpu%dClkTck\t: %016lx\n",
73 i, cpu_data(i).udelay_val / (500000/HZ),
74 (cpu_data(i).udelay_val / (5000/HZ)) % 100,
75 i, cpu_data(i).clock_tick);
76 }
78 void __init smp_store_cpu_info(int id)
79 {
80 struct device_node *dp;
81 int def;
83 /* multiplier and counter set by
84 smp_setup_percpu_timer() */
85 cpu_data(id).udelay_val = loops_per_jiffy;
87 cpu_find_by_mid(id, &dp);
88 cpu_data(id).clock_tick =
89 of_getintprop_default(dp, "clock-frequency", 0);
91 def = ((tlb_type == hypervisor) ? (8 * 1024) : (16 * 1024));
92 cpu_data(id).dcache_size =
93 of_getintprop_default(dp, "dcache-size", def);
95 def = 32;
96 cpu_data(id).dcache_line_size =
97 of_getintprop_default(dp, "dcache-line-size", def);
99 def = 16 * 1024;
100 cpu_data(id).icache_size =
101 of_getintprop_default(dp, "icache-size", def);
103 def = 32;
104 cpu_data(id).icache_line_size =
105 of_getintprop_default(dp, "icache-line-size", def);
107 def = ((tlb_type == hypervisor) ?
108 (3 * 1024 * 1024) :
109 (4 * 1024 * 1024));
110 cpu_data(id).ecache_size =
111 of_getintprop_default(dp, "ecache-size", def);
113 def = 64;
114 cpu_data(id).ecache_line_size =
115 of_getintprop_default(dp, "ecache-line-size", def);
117 printk("CPU[%d]: Caches "
118 "D[sz(%d):line_sz(%d)] "
119 "I[sz(%d):line_sz(%d)] "
120 "E[sz(%d):line_sz(%d)]\n",
121 id,
122 cpu_data(id).dcache_size, cpu_data(id).dcache_line_size,
123 cpu_data(id).icache_size, cpu_data(id).icache_line_size,
124 cpu_data(id).ecache_size, cpu_data(id).ecache_line_size);
125 }
127 static void smp_setup_percpu_timer(void);
129 static volatile unsigned long callin_flag = 0;
131 void __init smp_callin(void)
132 {
133 int cpuid = hard_smp_processor_id();
135 __local_per_cpu_offset = __per_cpu_offset(cpuid);
137 if (tlb_type == hypervisor)
138 sun4v_ktsb_register();
140 __flush_tlb_all();
142 smp_setup_percpu_timer();
144 if (cheetah_pcache_forced_on)
145 cheetah_enable_pcache();
147 local_irq_enable();
149 calibrate_delay();
150 smp_store_cpu_info(cpuid);
151 callin_flag = 1;
152 __asm__ __volatile__("membar #Sync\n\t"
153 "flush %%g6" : : : "memory");
155 /* Clear this or we will die instantly when we
156 * schedule back to this idler...
157 */
158 current_thread_info()->new_child = 0;
160 /* Attach to the address space of init_task. */
161 atomic_inc(&init_mm.mm_count);
162 current->active_mm = &init_mm;
164 while (!cpu_isset(cpuid, smp_commenced_mask))
165 rmb();
167 cpu_set(cpuid, cpu_online_map);
169 /* idle thread is expected to have preempt disabled */
170 preempt_disable();
171 }
173 void cpu_panic(void)
174 {
175 printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
176 panic("SMP bolixed\n");
177 }
179 static unsigned long current_tick_offset __read_mostly;
181 /* This tick register synchronization scheme is taken entirely from
182 * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit.
183 *
184 * The only change I've made is to rework it so that the master
185 * initiates the synchronization instead of the slave. -DaveM
186 */
188 #define MASTER 0
189 #define SLAVE (SMP_CACHE_BYTES/sizeof(unsigned long))
191 #define NUM_ROUNDS 64 /* magic value */
192 #define NUM_ITERS 5 /* likewise */
194 static DEFINE_SPINLOCK(itc_sync_lock);
195 static unsigned long go[SLAVE + 1];
197 #define DEBUG_TICK_SYNC 0
199 static inline long get_delta (long *rt, long *master)
200 {
201 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
202 unsigned long tcenter, t0, t1, tm;
203 unsigned long i;
205 for (i = 0; i < NUM_ITERS; i++) {
206 t0 = tick_ops->get_tick();
207 go[MASTER] = 1;
208 membar_storeload();
209 while (!(tm = go[SLAVE]))
210 rmb();
211 go[SLAVE] = 0;
212 wmb();
213 t1 = tick_ops->get_tick();
215 if (t1 - t0 < best_t1 - best_t0)
216 best_t0 = t0, best_t1 = t1, best_tm = tm;
217 }
219 *rt = best_t1 - best_t0;
220 *master = best_tm - best_t0;
222 /* average best_t0 and best_t1 without overflow: */
223 tcenter = (best_t0/2 + best_t1/2);
224 if (best_t0 % 2 + best_t1 % 2 == 2)
225 tcenter++;
226 return tcenter - best_tm;
227 }
229 void smp_synchronize_tick_client(void)
230 {
231 long i, delta, adj, adjust_latency = 0, done = 0;
232 unsigned long flags, rt, master_time_stamp, bound;
233 #if DEBUG_TICK_SYNC
234 struct {
235 long rt; /* roundtrip time */
236 long master; /* master's timestamp */
237 long diff; /* difference between midpoint and master's timestamp */
238 long lat; /* estimate of itc adjustment latency */
239 } t[NUM_ROUNDS];
240 #endif
242 go[MASTER] = 1;
244 while (go[MASTER])
245 rmb();
247 local_irq_save(flags);
248 {
249 for (i = 0; i < NUM_ROUNDS; i++) {
250 delta = get_delta(&rt, &master_time_stamp);
251 if (delta == 0) {
252 done = 1; /* let's lock on to this... */
253 bound = rt;
254 }
256 if (!done) {
257 if (i > 0) {
258 adjust_latency += -delta;
259 adj = -delta + adjust_latency/4;
260 } else
261 adj = -delta;
263 tick_ops->add_tick(adj, current_tick_offset);
264 }
265 #if DEBUG_TICK_SYNC
266 t[i].rt = rt;
267 t[i].master = master_time_stamp;
268 t[i].diff = delta;
269 t[i].lat = adjust_latency/4;
270 #endif
271 }
272 }
273 local_irq_restore(flags);
275 #if DEBUG_TICK_SYNC
276 for (i = 0; i < NUM_ROUNDS; i++)
277 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
278 t[i].rt, t[i].master, t[i].diff, t[i].lat);
279 #endif
281 printk(KERN_INFO "CPU %d: synchronized TICK with master CPU (last diff %ld cycles,"
282 "maxerr %lu cycles)\n", smp_processor_id(), delta, rt);
283 }
285 static void smp_start_sync_tick_client(int cpu);
287 static void smp_synchronize_one_tick(int cpu)
288 {
289 unsigned long flags, i;
291 go[MASTER] = 0;
293 smp_start_sync_tick_client(cpu);
295 /* wait for client to be ready */
296 while (!go[MASTER])
297 rmb();
299 /* now let the client proceed into his loop */
300 go[MASTER] = 0;
301 membar_storeload();
303 spin_lock_irqsave(&itc_sync_lock, flags);
304 {
305 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
306 while (!go[MASTER])
307 rmb();
308 go[MASTER] = 0;
309 wmb();
310 go[SLAVE] = tick_ops->get_tick();
311 membar_storeload();
312 }
313 }
314 spin_unlock_irqrestore(&itc_sync_lock, flags);
315 }
317 extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
319 extern unsigned long sparc64_cpu_startup;
321 /* The OBP cpu startup callback truncates the 3rd arg cookie to
322 * 32-bits (I think) so to be safe we have it read the pointer
323 * contained here so we work on >4GB machines. -DaveM
324 */
325 static struct thread_info *cpu_new_thread = NULL;
327 static int __devinit smp_boot_one_cpu(unsigned int cpu)
328 {
329 unsigned long entry =
330 (unsigned long)(&sparc64_cpu_startup);
331 unsigned long cookie =
332 (unsigned long)(&cpu_new_thread);
333 struct task_struct *p;
334 int timeout, ret;
336 p = fork_idle(cpu);
337 callin_flag = 0;
338 cpu_new_thread = task_thread_info(p);
339 cpu_set(cpu, cpu_callout_map);
341 if (tlb_type == hypervisor) {
342 /* Alloc the mondo queues, cpu will load them. */
343 sun4v_init_mondo_queues(0, cpu, 1, 0);
345 prom_startcpu_cpuid(cpu, entry, cookie);
346 } else {
347 struct device_node *dp;
349 cpu_find_by_mid(cpu, &dp);
350 prom_startcpu(dp->node, entry, cookie);
351 }
353 for (timeout = 0; timeout < 5000000; timeout++) {
354 if (callin_flag)
355 break;
356 udelay(100);
357 }
359 if (callin_flag) {
360 ret = 0;
361 } else {
362 printk("Processor %d is stuck.\n", cpu);
363 cpu_clear(cpu, cpu_callout_map);
364 ret = -ENODEV;
365 }
366 cpu_new_thread = NULL;
368 return ret;
369 }
371 static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
372 {
373 u64 result, target;
374 int stuck, tmp;
376 if (this_is_starfire) {
377 /* map to real upaid */
378 cpu = (((cpu & 0x3c) << 1) |
379 ((cpu & 0x40) >> 4) |
380 (cpu & 0x3));
381 }
383 target = (cpu << 14) | 0x70;
384 again:
385 /* Ok, this is the real Spitfire Errata #54.
386 * One must read back from a UDB internal register
387 * after writes to the UDB interrupt dispatch, but
388 * before the membar Sync for that write.
389 * So we use the high UDB control register (ASI 0x7f,
390 * ADDR 0x20) for the dummy read. -DaveM
391 */
392 tmp = 0x40;
393 __asm__ __volatile__(
394 "wrpr %1, %2, %%pstate\n\t"
395 "stxa %4, [%0] %3\n\t"
396 "stxa %5, [%0+%8] %3\n\t"
397 "add %0, %8, %0\n\t"
398 "stxa %6, [%0+%8] %3\n\t"
399 "membar #Sync\n\t"
400 "stxa %%g0, [%7] %3\n\t"
401 "membar #Sync\n\t"
402 "mov 0x20, %%g1\n\t"
403 "ldxa [%%g1] 0x7f, %%g0\n\t"
404 "membar #Sync"
405 : "=r" (tmp)
406 : "r" (pstate), "i" (PSTATE_IE), "i" (ASI_INTR_W),
407 "r" (data0), "r" (data1), "r" (data2), "r" (target),
408 "r" (0x10), "0" (tmp)
409 : "g1");
411 /* NOTE: PSTATE_IE is still clear. */
412 stuck = 100000;
413 do {
414 __asm__ __volatile__("ldxa [%%g0] %1, %0"
415 : "=r" (result)
416 : "i" (ASI_INTR_DISPATCH_STAT));
417 if (result == 0) {
418 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
419 : : "r" (pstate));
420 return;
421 }
422 stuck -= 1;
423 if (stuck == 0)
424 break;
425 } while (result & 0x1);
426 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
427 : : "r" (pstate));
428 if (stuck == 0) {
429 printk("CPU[%d]: mondo stuckage result[%016lx]\n",
430 smp_processor_id(), result);
431 } else {
432 udelay(2);
433 goto again;
434 }
435 }
437 static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
438 {
439 u64 pstate;
440 int i;
442 __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
443 for_each_cpu_mask(i, mask)
444 spitfire_xcall_helper(data0, data1, data2, pstate, i);
445 }
447 /* Cheetah now allows to send the whole 64-bytes of data in the interrupt
448 * packet, but we have no use for that. However we do take advantage of
449 * the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
450 */
451 static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
452 {
453 u64 pstate, ver;
454 int nack_busy_id, is_jbus;
456 if (cpus_empty(mask))
457 return;
459 /* Unfortunately, someone at Sun had the brilliant idea to make the
460 * busy/nack fields hard-coded by ITID number for this Ultra-III
461 * derivative processor.
462 */
463 __asm__ ("rdpr %%ver, %0" : "=r" (ver));
464 is_jbus = ((ver >> 32) == __JALAPENO_ID ||
465 (ver >> 32) == __SERRANO_ID);
467 __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
469 retry:
470 __asm__ __volatile__("wrpr %0, %1, %%pstate\n\t"
471 : : "r" (pstate), "i" (PSTATE_IE));
473 /* Setup the dispatch data registers. */
474 __asm__ __volatile__("stxa %0, [%3] %6\n\t"
475 "stxa %1, [%4] %6\n\t"
476 "stxa %2, [%5] %6\n\t"
477 "membar #Sync\n\t"
478 : /* no outputs */
479 : "r" (data0), "r" (data1), "r" (data2),
480 "r" (0x40), "r" (0x50), "r" (0x60),
481 "i" (ASI_INTR_W));
483 nack_busy_id = 0;
484 {
485 int i;
487 for_each_cpu_mask(i, mask) {
488 u64 target = (i << 14) | 0x70;
490 if (!is_jbus)
491 target |= (nack_busy_id << 24);
492 __asm__ __volatile__(
493 "stxa %%g0, [%0] %1\n\t"
494 "membar #Sync\n\t"
495 : /* no outputs */
496 : "r" (target), "i" (ASI_INTR_W));
497 nack_busy_id++;
498 }
499 }
501 /* Now, poll for completion. */
502 {
503 u64 dispatch_stat;
504 long stuck;
506 stuck = 100000 * nack_busy_id;
507 do {
508 __asm__ __volatile__("ldxa [%%g0] %1, %0"
509 : "=r" (dispatch_stat)
510 : "i" (ASI_INTR_DISPATCH_STAT));
511 if (dispatch_stat == 0UL) {
512 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
513 : : "r" (pstate));
514 return;
515 }
516 if (!--stuck)
517 break;
518 } while (dispatch_stat & 0x5555555555555555UL);
520 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
521 : : "r" (pstate));
523 if ((dispatch_stat & ~(0x5555555555555555UL)) == 0) {
524 /* Busy bits will not clear, continue instead
525 * of freezing up on this cpu.
526 */
527 printk("CPU[%d]: mondo stuckage result[%016lx]\n",
528 smp_processor_id(), dispatch_stat);
529 } else {
530 int i, this_busy_nack = 0;
532 /* Delay some random time with interrupts enabled
533 * to prevent deadlock.
534 */
535 udelay(2 * nack_busy_id);
537 /* Clear out the mask bits for cpus which did not
538 * NACK us.
539 */
540 for_each_cpu_mask(i, mask) {
541 u64 check_mask;
543 if (is_jbus)
544 check_mask = (0x2UL << (2*i));
545 else
546 check_mask = (0x2UL <<
547 this_busy_nack);
548 if ((dispatch_stat & check_mask) == 0)
549 cpu_clear(i, mask);
550 this_busy_nack += 2;
551 }
553 goto retry;
554 }
555 }
556 }
558 /* Multi-cpu list version. */
559 static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
560 {
561 struct trap_per_cpu *tb;
562 u16 *cpu_list;
563 u64 *mondo;
564 cpumask_t error_mask;
565 unsigned long flags, status;
566 int cnt, retries, this_cpu, prev_sent, i;
568 /* We have to do this whole thing with interrupts fully disabled.
569 * Otherwise if we send an xcall from interrupt context it will
570 * corrupt both our mondo block and cpu list state.
571 *
572 * One consequence of this is that we cannot use timeout mechanisms
573 * that depend upon interrupts being delivered locally. So, for
574 * example, we cannot sample jiffies and expect it to advance.
575 *
576 * Fortunately, udelay() uses %stick/%tick so we can use that.
577 */
578 local_irq_save(flags);
580 this_cpu = smp_processor_id();
581 tb = &trap_block[this_cpu];
583 mondo = __va(tb->cpu_mondo_block_pa);
584 mondo[0] = data0;
585 mondo[1] = data1;
586 mondo[2] = data2;
587 wmb();
589 cpu_list = __va(tb->cpu_list_pa);
591 /* Setup the initial cpu list. */
592 cnt = 0;
593 for_each_cpu_mask(i, mask)
594 cpu_list[cnt++] = i;
596 cpus_clear(error_mask);
597 retries = 0;
598 prev_sent = 0;
599 do {
600 int forward_progress, n_sent;
602 status = sun4v_cpu_mondo_send(cnt,
603 tb->cpu_list_pa,
604 tb->cpu_mondo_block_pa);
606 /* HV_EOK means all cpus received the xcall, we're done. */
607 if (likely(status == HV_EOK))
608 break;
610 /* First, see if we made any forward progress.
611 *
612 * The hypervisor indicates successful sends by setting
613 * cpu list entries to the value 0xffff.
614 */
615 n_sent = 0;
616 for (i = 0; i < cnt; i++) {
617 if (likely(cpu_list[i] == 0xffff))
618 n_sent++;
619 }
621 forward_progress = 0;
622 if (n_sent > prev_sent)
623 forward_progress = 1;
625 prev_sent = n_sent;
627 /* If we get a HV_ECPUERROR, then one or more of the cpus
628 * in the list are in error state. Use the cpu_state()
629 * hypervisor call to find out which cpus are in error state.
630 */
631 if (unlikely(status == HV_ECPUERROR)) {
632 for (i = 0; i < cnt; i++) {
633 long err;
634 u16 cpu;
636 cpu = cpu_list[i];
637 if (cpu == 0xffff)
638 continue;
640 err = sun4v_cpu_state(cpu);
641 if (err >= 0 &&
642 err == HV_CPU_STATE_ERROR) {
643 cpu_list[i] = 0xffff;
644 cpu_set(cpu, error_mask);
645 }
646 }
647 } else if (unlikely(status != HV_EWOULDBLOCK))
648 goto fatal_mondo_error;
650 /* Don't bother rewriting the CPU list, just leave the
651 * 0xffff and non-0xffff entries in there and the
652 * hypervisor will do the right thing.
653 *
654 * Only advance timeout state if we didn't make any
655 * forward progress.
656 */
657 if (unlikely(!forward_progress)) {
658 if (unlikely(++retries > 10000))
659 goto fatal_mondo_timeout;
661 /* Delay a little bit to let other cpus catch up
662 * on their cpu mondo queue work.
663 */
664 udelay(2 * cnt);
665 }
666 } while (1);
668 local_irq_restore(flags);
670 if (unlikely(!cpus_empty(error_mask)))
671 goto fatal_mondo_cpu_error;
673 return;
675 fatal_mondo_cpu_error:
676 printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
677 "were in error state\n",
678 this_cpu);
679 printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
680 for_each_cpu_mask(i, error_mask)
681 printk("%d ", i);
682 printk("]\n");
683 return;
685 fatal_mondo_timeout:
686 local_irq_restore(flags);
687 printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
688 " progress after %d retries.\n",
689 this_cpu, retries);
690 goto dump_cpu_list_and_out;
692 fatal_mondo_error:
693 local_irq_restore(flags);
694 printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
695 this_cpu, status);
696 printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
697 "mondo_block_pa(%lx)\n",
698 this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
700 dump_cpu_list_and_out:
701 printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
702 for (i = 0; i < cnt; i++)
703 printk("%u ", cpu_list[i]);
704 printk("]\n");
705 }
707 /* Send cross call to all processors mentioned in MASK
708 * except self.
709 */
710 static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, cpumask_t mask)
711 {
712 u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
713 int this_cpu = get_cpu();
715 cpus_and(mask, mask, cpu_online_map);
716 cpu_clear(this_cpu, mask);
718 if (tlb_type == spitfire)
719 spitfire_xcall_deliver(data0, data1, data2, mask);
720 else if (tlb_type == cheetah || tlb_type == cheetah_plus)
721 cheetah_xcall_deliver(data0, data1, data2, mask);
722 else
723 hypervisor_xcall_deliver(data0, data1, data2, mask);
724 /* NOTE: Caller runs local copy on master. */
726 put_cpu();
727 }
729 extern unsigned long xcall_sync_tick;
731 static void smp_start_sync_tick_client(int cpu)
732 {
733 cpumask_t mask = cpumask_of_cpu(cpu);
735 smp_cross_call_masked(&xcall_sync_tick,
736 0, 0, 0, mask);
737 }
739 /* Send cross call to all processors except self. */
740 #define smp_cross_call(func, ctx, data1, data2) \
741 smp_cross_call_masked(func, ctx, data1, data2, cpu_online_map)
743 struct call_data_struct {
744 void (*func) (void *info);
745 void *info;
746 atomic_t finished;
747 int wait;
748 };
750 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
751 static struct call_data_struct *call_data;
753 extern unsigned long xcall_call_function;
755 /**
756 * smp_call_function(): Run a function on all other CPUs.
757 * @func: The function to run. This must be fast and non-blocking.
758 * @info: An arbitrary pointer to pass to the function.
759 * @nonatomic: currently unused.
760 * @wait: If true, wait (atomically) until function has completed on other CPUs.
761 *
762 * Returns 0 on success, else a negative status code. Does not return until
763 * remote CPUs are nearly ready to execute <<func>>, are executing it, or have executed it.
764 *
765 * You must not call this function with disabled interrupts or from a
766 * hardware interrupt handler or from a bottom half handler.
767 */
768 static int smp_call_function_mask(void (*func)(void *info), void *info,
769 int nonatomic, int wait, cpumask_t mask)
770 {
771 struct call_data_struct data;
772 int cpus;
774 /* Can deadlock when called with interrupts disabled */
775 WARN_ON(irqs_disabled());
777 data.func = func;
778 data.info = info;
779 atomic_set(&data.finished, 0);
780 data.wait = wait;
782 spin_lock(&call_lock);
784 cpu_clear(smp_processor_id(), mask);
785 cpus = cpus_weight(mask);
786 if (!cpus)
787 goto out_unlock;
789 call_data = &data;
790 mb();
792 smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
794 /* Wait for response */
795 while (atomic_read(&data.finished) != cpus)
796 cpu_relax();
798 out_unlock:
799 spin_unlock(&call_lock);
801 return 0;
802 }
804 int smp_call_function(void (*func)(void *info), void *info,
805 int nonatomic, int wait)
806 {
807 return smp_call_function_mask(func, info, nonatomic, wait,
808 cpu_online_map);
809 }
811 void smp_call_function_client(int irq, struct pt_regs *regs)
812 {
813 void (*func) (void *info) = call_data->func;
814 void *info = call_data->info;
816 clear_softint(1 << irq);
817 if (call_data->wait) {
818 /* let initiator proceed only after completion */
819 func(info);
820 atomic_inc(&call_data->finished);
821 } else {
822 /* let initiator proceed after getting data */
823 atomic_inc(&call_data->finished);
824 func(info);
825 }
826 }
828 static void tsb_sync(void *info)
829 {
830 struct trap_per_cpu *tp = &trap_block[raw_smp_processor_id()];
831 struct mm_struct *mm = info;
833 /* It is not valid to test "current->active_mm == mm" here.
834 *
835 * The value of "current" is not changed atomically with
836 * switch_mm(). But that's OK, we just need to check the
837 * current cpu's trap block PGD physical address.
838 */
839 if (tp->pgd_paddr == __pa(mm->pgd))
840 tsb_context_switch(mm);
841 }
843 void smp_tsb_sync(struct mm_struct *mm)
844 {
845 smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask);
846 }
848 extern unsigned long xcall_flush_tlb_mm;
849 extern unsigned long xcall_flush_tlb_pending;
850 extern unsigned long xcall_flush_tlb_kernel_range;
851 extern unsigned long xcall_report_regs;
852 extern unsigned long xcall_receive_signal;
853 extern unsigned long xcall_new_mmu_context_version;
855 #ifdef DCACHE_ALIASING_POSSIBLE
856 extern unsigned long xcall_flush_dcache_page_cheetah;
857 #endif
858 extern unsigned long xcall_flush_dcache_page_spitfire;
860 #ifdef CONFIG_DEBUG_DCFLUSH
861 extern atomic_t dcpage_flushes;
862 extern atomic_t dcpage_flushes_xcall;
863 #endif
865 static __inline__ void __local_flush_dcache_page(struct page *page)
866 {
867 #ifdef DCACHE_ALIASING_POSSIBLE
868 __flush_dcache_page(page_address(page),
869 ((tlb_type == spitfire) &&
870 page_mapping(page) != NULL));
871 #else
872 if (page_mapping(page) != NULL &&
873 tlb_type == spitfire)
874 __flush_icache_page(__pa(page_address(page)));
875 #endif
876 }
878 void smp_flush_dcache_page_impl(struct page *page, int cpu)
879 {
880 cpumask_t mask = cpumask_of_cpu(cpu);
881 int this_cpu;
883 if (tlb_type == hypervisor)
884 return;
886 #ifdef CONFIG_DEBUG_DCFLUSH
887 atomic_inc(&dcpage_flushes);
888 #endif
890 this_cpu = get_cpu();
892 if (cpu == this_cpu) {
893 __local_flush_dcache_page(page);
894 } else if (cpu_online(cpu)) {
895 void *pg_addr = page_address(page);
896 u64 data0;
898 if (tlb_type == spitfire) {
899 data0 =
900 ((u64)&xcall_flush_dcache_page_spitfire);
901 if (page_mapping(page) != NULL)
902 data0 |= ((u64)1 << 32);
903 spitfire_xcall_deliver(data0,
904 __pa(pg_addr),
905 (u64) pg_addr,
906 mask);
907 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
908 #ifdef DCACHE_ALIASING_POSSIBLE
909 data0 =
910 ((u64)&xcall_flush_dcache_page_cheetah);
911 cheetah_xcall_deliver(data0,
912 __pa(pg_addr),
913 0, mask);
914 #endif
915 }
916 #ifdef CONFIG_DEBUG_DCFLUSH
917 atomic_inc(&dcpage_flushes_xcall);
918 #endif
919 }
921 put_cpu();
922 }
924 void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
925 {
926 void *pg_addr = page_address(page);
927 cpumask_t mask = cpu_online_map;
928 u64 data0;
929 int this_cpu;
931 if (tlb_type == hypervisor)
932 return;
934 this_cpu = get_cpu();
936 cpu_clear(this_cpu, mask);
938 #ifdef CONFIG_DEBUG_DCFLUSH
939 atomic_inc(&dcpage_flushes);
940 #endif
941 if (cpus_empty(mask))
942 goto flush_self;
943 if (tlb_type == spitfire) {
944 data0 = ((u64)&xcall_flush_dcache_page_spitfire);
945 if (page_mapping(page) != NULL)
946 data0 |= ((u64)1 << 32);
947 spitfire_xcall_deliver(data0,
948 __pa(pg_addr),
949 (u64) pg_addr,
950 mask);
951 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
952 #ifdef DCACHE_ALIASING_POSSIBLE
953 data0 = ((u64)&xcall_flush_dcache_page_cheetah);
954 cheetah_xcall_deliver(data0,
955 __pa(pg_addr),
956 0, mask);
957 #endif
958 }
959 #ifdef CONFIG_DEBUG_DCFLUSH
960 atomic_inc(&dcpage_flushes_xcall);
961 #endif
962 flush_self:
963 __local_flush_dcache_page(page);
965 put_cpu();
966 }
968 static void __smp_receive_signal_mask(cpumask_t mask)
969 {
970 smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
971 }
973 void smp_receive_signal(int cpu)
974 {
975 cpumask_t mask = cpumask_of_cpu(cpu);
977 if (cpu_online(cpu))
978 __smp_receive_signal_mask(mask);
979 }
981 void smp_receive_signal_client(int irq, struct pt_regs *regs)
982 {
983 clear_softint(1 << irq);
984 }
986 void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
987 {
988 struct mm_struct *mm;
989 unsigned long flags;
991 clear_softint(1 << irq);
993 /* See if we need to allocate a new TLB context because
994 * the version of the one we are using is now out of date.
995 */
996 mm = current->active_mm;
997 if (unlikely(!mm || (mm == &init_mm)))
998 return;
1000 spin_lock_irqsave(&mm->context.lock, flags);
1002 if (unlikely(!CTX_VALID(mm->context)))
1003 get_new_mmu_context(mm);
1005 spin_unlock_irqrestore(&mm->context.lock, flags);
1007 load_secondary_context(mm);
1008 __flush_tlb_mm(CTX_HWBITS(mm->context),
1009 SECONDARY_CONTEXT);
1010 }
1012 void smp_new_mmu_context_version(void)
1013 {
1014 smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0);
1015 }
1017 void smp_report_regs(void)
1018 {
1019 smp_cross_call(&xcall_report_regs, 0, 0, 0);
1020 }
1022 /* We know that the window frames of the user have been flushed
1023 * to the stack before we get here because all callers of us
1024 * are flush_tlb_*() routines, and these run after flush_cache_*()
1025 * which performs the flushw.
1027 * The SMP TLB coherency scheme we use works as follows:
1029 * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
1030 * space has (potentially) executed on, this is the heuristic
1031 * we use to avoid doing cross calls.
1033 * Also, for flushing from kswapd and also for clones, we
1034 * use cpu_vm_mask as the list of cpus to make run the TLB.
1036 * 2) TLB context numbers are shared globally across all processors
1037 * in the system, this allows us to play several games to avoid
1038 * cross calls.
1040 * One invariant is that when a cpu switches to a process, and
1041 * that processes tsk->active_mm->cpu_vm_mask does not have the
1042 * current cpu's bit set, that tlb context is flushed locally.
1044 * If the address space is non-shared (ie. mm->count == 1) we avoid
1045 * cross calls when we want to flush the currently running process's
1046 * tlb state. This is done by clearing all cpu bits except the current
1047 * processor's in current->active_mm->cpu_vm_mask and performing the
1048 * flush locally only. This will force any subsequent cpus which run
1049 * this task to flush the context from the local tlb if the process
1050 * migrates to another cpu (again).
1052 * 3) For shared address spaces (threads) and swapping we bite the
1053 * bullet for most cases and perform the cross call (but only to
1054 * the cpus listed in cpu_vm_mask).
1056 * The performance gain from "optimizing" away the cross call for threads is
1057 * questionable (in theory the big win for threads is the massive sharing of
1058 * address space state across processors).
1059 */
1061 /* This currently is only used by the hugetlb arch pre-fault
1062 * hook on UltraSPARC-III+ and later when changing the pagesize
1063 * bits of the context register for an address space.
1064 */
1065 void smp_flush_tlb_mm(struct mm_struct *mm)
1066 {
1067 u32 ctx = CTX_HWBITS(mm->context);
1068 int cpu = get_cpu();
1070 if (atomic_read(&mm->mm_users) == 1) {
1071 mm->cpu_vm_mask = cpumask_of_cpu(cpu);
1072 goto local_flush_and_out;
1073 }
1075 smp_cross_call_masked(&xcall_flush_tlb_mm,
1076 ctx, 0, 0,
1077 mm->cpu_vm_mask);
1079 local_flush_and_out:
1080 __flush_tlb_mm(ctx, SECONDARY_CONTEXT);
1082 put_cpu();
1083 }
1085 void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
1086 {
1087 u32 ctx = CTX_HWBITS(mm->context);
1088 int cpu = get_cpu();
1090 if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1)
1091 mm->cpu_vm_mask = cpumask_of_cpu(cpu);
1092 else
1093 smp_cross_call_masked(&xcall_flush_tlb_pending,
1094 ctx, nr, (unsigned long) vaddrs,
1095 mm->cpu_vm_mask);
1097 __flush_tlb_pending(ctx, nr, vaddrs);
1099 put_cpu();
1100 }
1102 void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
1103 {
1104 start &= PAGE_MASK;
1105 end = PAGE_ALIGN(end);
1106 if (start != end) {
1107 smp_cross_call(&xcall_flush_tlb_kernel_range,
1108 0, start, end);
1110 __flush_tlb_kernel_range(start, end);
1111 }
1112 }
1114 /* CPU capture. */
1115 /* #define CAPTURE_DEBUG */
1116 extern unsigned long xcall_capture;
1118 static atomic_t smp_capture_depth = ATOMIC_INIT(0);
1119 static atomic_t smp_capture_registry = ATOMIC_INIT(0);
1120 static unsigned long penguins_are_doing_time;
1122 void smp_capture(void)
1123 {
1124 int result = atomic_add_ret(1, &smp_capture_depth);
1126 if (result == 1) {
1127 int ncpus = num_online_cpus();
1129 #ifdef CAPTURE_DEBUG
1130 printk("CPU[%d]: Sending penguins to jail...",
1131 smp_processor_id());
1132 #endif
1133 penguins_are_doing_time = 1;
1134 membar_storestore_loadstore();
1135 atomic_inc(&smp_capture_registry);
1136 smp_cross_call(&xcall_capture, 0, 0, 0);
1137 while (atomic_read(&smp_capture_registry) != ncpus)
1138 rmb();
1139 #ifdef CAPTURE_DEBUG
1140 printk("done\n");
1141 #endif
1142 }
1143 }
1145 void smp_release(void)
1146 {
1147 if (atomic_dec_and_test(&smp_capture_depth)) {
1148 #ifdef CAPTURE_DEBUG
1149 printk("CPU[%d]: Giving pardon to "
1150 "imprisoned penguins\n",
1151 smp_processor_id());
1152 #endif
1153 penguins_are_doing_time = 0;
1154 membar_storeload_storestore();
1155 atomic_dec(&smp_capture_registry);
1156 }
1157 }
1159 /* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
1160 * can service tlb flush xcalls...
1161 */
1162 extern void prom_world(int);
1164 void smp_penguin_jailcell(int irq, struct pt_regs *regs)
1165 {
1166 clear_softint(1 << irq);
1168 preempt_disable();
1170 __asm__ __volatile__("flushw");
1171 prom_world(1);
1172 atomic_inc(&smp_capture_registry);
1173 membar_storeload_storestore();
1174 while (penguins_are_doing_time)
1175 rmb();
1176 atomic_dec(&smp_capture_registry);
1177 prom_world(0);
1179 preempt_enable();
1180 }
1182 #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier
1183 #define prof_counter(__cpu) cpu_data(__cpu).counter
1185 void smp_percpu_timer_interrupt(struct pt_regs *regs)
1186 {
1187 unsigned long compare, tick, pstate;
1188 int cpu = smp_processor_id();
1189 int user = user_mode(regs);
1191 /*
1192 * Check for level 14 softint.
1193 */
1194 {
1195 unsigned long tick_mask = tick_ops->softint_mask;
1197 if (!(get_softint() & tick_mask)) {
1198 extern void handler_irq(int, struct pt_regs *);
1200 handler_irq(14, regs);
1201 return;
1202 }
1203 clear_softint(tick_mask);
1204 }
1206 do {
1207 profile_tick(CPU_PROFILING, regs);
1208 if (!--prof_counter(cpu)) {
1209 irq_enter();
1211 if (cpu == boot_cpu_id) {
1212 kstat_this_cpu.irqs[0]++;
1213 timer_tick_interrupt(regs);
1214 }
1216 update_process_times(user);
1218 irq_exit();
1220 prof_counter(cpu) = prof_multiplier(cpu);
1221 }
1223 /* Guarantee that the following sequences execute
1224 * uninterrupted.
1225 */
1226 __asm__ __volatile__("rdpr %%pstate, %0\n\t"
1227 "wrpr %0, %1, %%pstate"
1228 : "=r" (pstate)
1229 : "i" (PSTATE_IE));
1231 compare = tick_ops->add_compare(current_tick_offset);
1232 tick = tick_ops->get_tick();
1234 /* Restore PSTATE_IE. */
1235 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
1236 : /* no outputs */
1237 : "r" (pstate));
1238 } while (time_after_eq(tick, compare));
1239 }
1241 static void __init smp_setup_percpu_timer(void)
1242 {
1243 int cpu = smp_processor_id();
1244 unsigned long pstate;
1246 prof_counter(cpu) = prof_multiplier(cpu) = 1;
1248 /* Guarantee that the following sequences execute
1249 * uninterrupted.
1250 */
1251 __asm__ __volatile__("rdpr %%pstate, %0\n\t"
1252 "wrpr %0, %1, %%pstate"
1253 : "=r" (pstate)
1254 : "i" (PSTATE_IE));
1256 tick_ops->init_tick(current_tick_offset);
1258 /* Restore PSTATE_IE. */
1259 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
1260 : /* no outputs */
1261 : "r" (pstate));
1262 }
1264 void __init smp_tick_init(void)
1265 {
1266 boot_cpu_id = hard_smp_processor_id();
1267 current_tick_offset = timer_tick_offset;
1269 prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
1270 }
1272 /* /proc/profile writes can call this, don't __init it please. */
1273 static DEFINE_SPINLOCK(prof_setup_lock);
1275 int setup_profiling_timer(unsigned int multiplier)
1276 {
1277 unsigned long flags;
1278 int i;
1280 if ((!multiplier) || (timer_tick_offset / multiplier) < 1000)
1281 return -EINVAL;
1283 spin_lock_irqsave(&prof_setup_lock, flags);
1284 for_each_possible_cpu(i)
1285 prof_multiplier(i) = multiplier;
1286 current_tick_offset = (timer_tick_offset / multiplier);
1287 spin_unlock_irqrestore(&prof_setup_lock, flags);
1289 return 0;
1290 }
1292 static void __init smp_tune_scheduling(void)
1293 {
1294 struct device_node *dp;
1295 int instance;
1296 unsigned int def, smallest = ~0U;
1298 def = ((tlb_type == hypervisor) ?
1299 (3 * 1024 * 1024) :
1300 (4 * 1024 * 1024));
1302 instance = 0;
1303 while (!cpu_find_by_instance(instance, &dp, NULL)) {
1304 unsigned int val;
1306 val = of_getintprop_default(dp, "ecache-size", def);
1307 if (val < smallest)
1308 smallest = val;
1310 instance++;
1311 }
1313 /* Any value less than 256K is nonsense. */
1314 if (smallest < (256U * 1024U))
1315 smallest = 256 * 1024;
1317 max_cache_size = smallest;
1319 if (smallest < 1U * 1024U * 1024U)
1320 printk(KERN_INFO "Using max_cache_size of %uKB\n",
1321 smallest / 1024U);
1322 else
1323 printk(KERN_INFO "Using max_cache_size of %uMB\n",
1324 smallest / 1024U / 1024U);
1325 }
1327 /* Constrain the number of cpus to max_cpus. */
1328 void __init smp_prepare_cpus(unsigned int max_cpus)
1329 {
1330 int i;
1332 if (num_possible_cpus() > max_cpus) {
1333 int instance, mid;
1335 instance = 0;
1336 while (!cpu_find_by_instance(instance, NULL, &mid)) {
1337 if (mid != boot_cpu_id) {
1338 cpu_clear(mid, phys_cpu_present_map);
1339 cpu_clear(mid, cpu_present_map);
1340 if (num_possible_cpus() <= max_cpus)
1341 break;
1342 }
1343 instance++;
1344 }
1345 }
1347 for_each_possible_cpu(i) {
1348 if (tlb_type == hypervisor) {
1349 int j;
1351 /* XXX get this mapping from machine description */
1352 for_each_possible_cpu(j) {
1353 if ((j >> 2) == (i >> 2))
1354 cpu_set(j, cpu_sibling_map[i]);
1355 }
1356 } else {
1357 cpu_set(i, cpu_sibling_map[i]);
1358 }
1359 }
1361 smp_store_cpu_info(boot_cpu_id);
1362 smp_tune_scheduling();
1363 }
1365 /* Set this up early so that things like the scheduler can init
1366 * properly. We use the same cpu mask for both the present and
1367 * possible cpu map.
1368 */
1369 void __init smp_setup_cpu_possible_map(void)
1370 {
1371 int instance, mid;
1373 instance = 0;
1374 while (!cpu_find_by_instance(instance, NULL, &mid)) {
1375 if (mid < NR_CPUS) {
1376 cpu_set(mid, phys_cpu_present_map);
1377 cpu_set(mid, cpu_present_map);
1378 }
1379 instance++;
1380 }
1381 }
1383 void __devinit smp_prepare_boot_cpu(void)
1384 {
1385 }
1387 int __devinit __cpu_up(unsigned int cpu)
1388 {
1389 int ret = smp_boot_one_cpu(cpu);
1391 if (!ret) {
1392 cpu_set(cpu, smp_commenced_mask);
1393 while (!cpu_isset(cpu, cpu_online_map))
1394 mb();
1395 if (!cpu_isset(cpu, cpu_online_map)) {
1396 ret = -ENODEV;
1397 } else {
1398 /* On SUN4V, writes to %tick and %stick are
1399 * not allowed.
1400 */
1401 if (tlb_type != hypervisor)
1402 smp_synchronize_one_tick(cpu);
1403 }
1404 }
1405 return ret;
1406 }
1408 void __init smp_cpus_done(unsigned int max_cpus)
1409 {
1410 unsigned long bogosum = 0;
1411 int i;
1413 for_each_online_cpu(i)
1414 bogosum += cpu_data(i).udelay_val;
1415 printk("Total of %ld processors activated "
1416 "(%lu.%02lu BogoMIPS).\n",
1417 (long) num_online_cpus(),
1418 bogosum/(500000/HZ),
1419 (bogosum/(5000/HZ))%100);
1420 }
1422 void smp_send_reschedule(int cpu)
1423 {
1424 smp_receive_signal(cpu);
1425 }
1427 /* This is a nop because we capture all other cpus
1428 * anyways when making the PROM active.
1429 */
1430 void smp_send_stop(void)
1431 {
1432 }
1434 unsigned long __per_cpu_base __read_mostly;
1435 unsigned long __per_cpu_shift __read_mostly;
1437 EXPORT_SYMBOL(__per_cpu_base);
1438 EXPORT_SYMBOL(__per_cpu_shift);
1440 void __init setup_per_cpu_areas(void)
1441 {
1442 unsigned long goal, size, i;
1443 char *ptr;
1445 /* Copy section for each CPU (we discard the original) */
1446 goal = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
1447 #ifdef CONFIG_MODULES
1448 if (goal < PERCPU_ENOUGH_ROOM)
1449 goal = PERCPU_ENOUGH_ROOM;
1450 #endif
1451 __per_cpu_shift = 0;
1452 for (size = 1UL; size < goal; size <<= 1UL)
1453 __per_cpu_shift++;
1455 ptr = alloc_bootmem(size * NR_CPUS);
1457 __per_cpu_base = ptr - __per_cpu_start;
1459 for (i = 0; i < NR_CPUS; i++, ptr += size)
1460 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
1462 /* Setup %g5 for the boot cpu. */
1463 __local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
1464 }