
linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @ 6294:1a0723cd37f1

Fix many uses of machine addresses in XenLinux. Primarily
this fixes users of virt_to_machine/machine_to_virt to
use virt_to_mfn/mfn_to_virt where that is more appropriate.

This should be a big step to improved PAE stability.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Aug 19 16:06:43 2005 +0000 (2005-08-19)
parents 47d49e8b8042
children f51fe43c5d1c 5f4724c13040 81576d3d1ca8 3a8f27c6d56c
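The distinction the patch turns on: virt_to_mfn() yields a machine frame number (a page-granular index into machine memory), while virt_to_machine() yields a full, byte-granular machine address. A minimal sketch of the relationship follows; the helper name is illustrative only, and it mirrors how the code below builds ctrlreg[3] from virt_to_mfn(init_level4_pgt) << PAGE_SHIFT.

/* Illustrative only: recover a machine address from a kernel virtual
 * address by way of the mfn.  PAGE_SHIFT and PAGE_SIZE are the usual
 * page-size constants. */
static unsigned long example_virt_to_maddr(void *v)
{
	unsigned long mfn = virt_to_mfn(v);                      /* machine frame number */
	unsigned long off = (unsigned long)v & (PAGE_SIZE - 1);  /* offset within the page */
	return (mfn << PAGE_SHIFT) | off;                        /* machine address */
}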
line source
1 /*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 */
40 #include <linux/config.h>
41 #include <linux/init.h>
43 #include <linux/mm.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/smp_lock.h>
46 #include <linux/irq.h>
47 #include <linux/bootmem.h>
48 #include <linux/thread_info.h>
49 #include <linux/module.h>
50 #ifdef CONFIG_XEN
51 #include <linux/interrupt.h>
52 #endif
54 #include <linux/delay.h>
55 #include <linux/mc146818rtc.h>
56 #include <asm/mtrr.h>
57 #include <asm/pgalloc.h>
58 #include <asm/desc.h>
59 #include <asm/kdebug.h>
60 #include <asm/tlbflush.h>
61 #include <asm/proto.h>
62 #include <asm/nmi.h>
63 #ifdef CONFIG_XEN
64 #include <asm/arch_hooks.h>
66 #include <asm-xen/evtchn.h>
67 #endif
69 /* Change for real CPU hotplug. Note other files need to be fixed
70 first too. */
71 #define __cpuinit __init
72 #define __cpuinitdata __initdata
74 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
75 unsigned int maxcpus = NR_CPUS;
76 #endif
78 /* Number of siblings per CPU package */
79 int smp_num_siblings = 1;
80 /* Package ID of each logical CPU */
81 u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
82 u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
83 EXPORT_SYMBOL(phys_proc_id);
84 EXPORT_SYMBOL(cpu_core_id);
86 /* Bitmask of currently online CPUs */
87 cpumask_t cpu_online_map;
89 EXPORT_SYMBOL(cpu_online_map);
91 /*
92 * Private maps to synchronize booting between AP and BP.
93 * Probably not needed anymore, but it makes for easier debugging. -AK
94 */
95 cpumask_t cpu_callin_map;
96 cpumask_t cpu_callout_map;
98 cpumask_t cpu_possible_map;
99 EXPORT_SYMBOL(cpu_possible_map);
101 /* Per CPU bogomips and other parameters */
102 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
104 /* Set when the idlers are all forked */
105 int smp_threads_ready;
107 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
108 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
109 EXPORT_SYMBOL(cpu_core_map);
111 #ifndef CONFIG_XEN
112 /*
113 * Trampoline 80x86 program as an array.
114 */
116 extern unsigned char trampoline_data[];
117 extern unsigned char trampoline_end[];
119 /*
120 * Currently trivial. Write the real->protected mode
121 * bootstrap into the page concerned. The caller
122 * has made sure it's suitably aligned.
123 */
125 static unsigned long __cpuinit setup_trampoline(void)
126 {
127 void *tramp = __va(SMP_TRAMPOLINE_BASE);
128 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
129 return virt_to_phys(tramp);
130 }
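/*
 * Note: the physical address returned above is used twice by the non-Xen
 * path in do_boot_cpu(): it is written into the warm-reset vector as a
 * real-mode far pointer (offset at 0x467, segment at 0x469), and its page
 * number (start_rip >> 12) becomes the vector of the STARTUP IPI.
 */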
131 #endif
133 /*
134 * The bootstrap kernel entry code has set these up. Save them for
135 * a given CPU
136 */
138 static void __cpuinit smp_store_cpu_info(int id)
139 {
140 struct cpuinfo_x86 *c = cpu_data + id;
142 *c = boot_cpu_data;
143 identify_cpu(c);
144 print_cpu_info(c);
145 }
147 #ifndef CONFIG_XEN
148 /*
149 * New Funky TSC sync algorithm borrowed from IA64.
150 * Main advantage is that it doesn't reset the TSCs fully and
151 * in general looks more robust and it works better than my earlier
152 * attempts. I believe it was written by David Mosberger. Some minor
153 * adjustments for x86-64 by me -AK
154 *
155 * Original comment reproduced below.
156 *
157 * Synchronize TSC of the current (slave) CPU with the TSC of the
158 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
159 * eliminate the possibility of unaccounted-for errors (such as
160 * getting a machine check in the middle of a calibration step). The
161 * basic idea is for the slave to ask the master what itc value it has
162 * and to read its own itc before and after the master responds. Each
163 * iteration gives us three timestamps:
164 *
165 * slave master
166 *
167 * t0 ---\
168 * ---\
169 * --->
170 * tm
171 * /---
172 * /---
173 * t1 <---
174 *
175 *
176 * The goal is to adjust the slave's TSC such that tm falls exactly
177 * half-way between t0 and t1. If we achieve this, the clocks are
178 * synchronized provided the interconnect between the slave and the
179 * master is symmetric. Even if the interconnect were asymmetric, we
180 * would still know that the synchronization error is smaller than the
181 * roundtrip latency (t1 - t0).
182 *
183 * When the interconnect is quiet and symmetric, this lets us
184 * synchronize the TSC to within one or two cycles. However, we can
185 * only *guarantee* that the synchronization is accurate to within a
186 * round-trip time, which is typically in the range of several hundred
187 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
188 * are usually almost perfectly synchronized, but we shouldn't assume
189 * that the accuracy is much better than half a microsecond or so.
190 *
191 * [there are other errors like the latency of RDTSC and of the
192 * WRMSR. These can also amount to hundreds of cycles. So it's
193 * probably worse. It claims 153 cycles error on a dual Opteron,
194 * but I suspect the numbers are actually somewhat worse -AK]
195 */
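/*
 * Worked example of the scheme above (illustrative numbers only):
 * suppose the slave reads t0 = 1000 and t1 = 1040 around one exchange
 * and the master replies tm = 1010.  The midpoint is
 * tcenter = (t0 + t1)/2 = 1020, so the slave appears to be
 * tcenter - tm = +10 cycles ahead and adjusts its TSC by -10.
 * The roundtrip t1 - t0 = 40 cycles bounds the residual error.
 */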
197 #define MASTER 0
198 #define SLAVE (SMP_CACHE_BYTES/8)
200 /* Intentionally don't use cpu_relax() during TSC synchronization
201 because we don't want to go into funky power-save modes or cause
202 hypervisors to schedule us away. Going to sleep would likely affect
203 latency, and low latency is the primary objective here. -AK */
204 #define no_cpu_relax() barrier()
206 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
207 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
208 static int notscsync __cpuinitdata;
210 #undef DEBUG_TSC_SYNC
212 #define NUM_ROUNDS 64 /* magic value */
213 #define NUM_ITERS 5 /* likewise */
215 /* Callback on boot CPU */
216 static __cpuinit void sync_master(void *arg)
217 {
218 unsigned long flags, i;
220 if (smp_processor_id() != boot_cpu_id)
221 return;
223 go[MASTER] = 0;
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235 }
237 /*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242 static inline long
243 get_delta(long *rt, long *master)
244 {
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269 }
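/*
 * Of the NUM_ITERS samples taken above, get_delta() keeps the one with
 * the smallest roundtrip t1 - t0, since a short roundtrip gives the
 * tightest bound on where tm really fell between t0 and t1.
 */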
271 static __cpuinit void sync_tsc(void)
272 {
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276 #if DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283 #endif
285 go[MASTER] = 1;
287 smp_call_function(sync_master, NULL, 1, 0);
289 while (go[MASTER]) /* wait for master to be ready */
290 no_cpu_relax();
292 spin_lock_irqsave(&tsc_sync_lock, flags);
293 {
294 for (i = 0; i < NUM_ROUNDS; ++i) {
295 delta = get_delta(&rt, &master_time_stamp);
296 if (delta == 0) {
297 done = 1; /* let's lock on to this... */
298 bound = rt;
299 }
301 if (!done) {
302 unsigned long t;
303 if (i > 0) {
304 adjust_latency += -delta;
305 adj = -delta + adjust_latency/4;
306 } else
307 adj = -delta;
309 rdtscll(t);
310 wrmsrl(MSR_IA32_TSC, t + adj);
311 }
312 #if DEBUG_TSC_SYNC
313 t[i].rt = rt;
314 t[i].master = master_time_stamp;
315 t[i].diff = delta;
316 t[i].lat = adjust_latency/4;
317 #endif
318 }
319 }
320 spin_unlock_irqrestore(&tsc_sync_lock, flags);
322 #if DEBUG_TSC_SYNC
323 for (i = 0; i < NUM_ROUNDS; ++i)
324 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
325 t[i].rt, t[i].master, t[i].diff, t[i].lat);
326 #endif
328 printk(KERN_INFO
329 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
330 "maxerr %lu cycles)\n",
331 smp_processor_id(), boot_cpu_id, delta, rt);
332 }
334 static void __cpuinit tsc_sync_wait(void)
335 {
336 if (notscsync || !cpu_has_tsc)
337 return;
338 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
339 boot_cpu_id);
340 sync_tsc();
341 }
343 static __init int notscsync_setup(char *s)
344 {
345 notscsync = 1;
346 return 0;
347 }
348 __setup("notscsync", notscsync_setup);
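/*
 * Usage note: booting with "notscsync" on the kernel command line sets
 * notscsync above, so tsc_sync_wait() returns immediately and the APs
 * skip the TSC adjustment loop.  Illustrative bootloader entry:
 *     kernel /vmlinuz root=/dev/sda1 notscsync
 */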
349 #endif
351 static atomic_t init_deasserted __cpuinitdata;
353 /*
354 * Report back to the Boot Processor.
355 * Running on AP.
356 */
357 void __cpuinit smp_callin(void)
358 {
359 int cpuid, phys_id;
360 unsigned long timeout;
362 #ifndef CONFIG_XEN
363 /*
364 * If woken up by an INIT in an 82489DX configuration
365 * we may get here before an INIT-deassert IPI reaches
366 * our local APIC. We have to wait for the IPI or we'll
367 * lock up on an APIC access.
368 */
369 while (!atomic_read(&init_deasserted))
370 cpu_relax();
372 #endif
373 /*
374 * (This works even if the APIC is not enabled.)
375 */
376 #ifndef CONFIG_XEN
377 phys_id = GET_APIC_ID(apic_read(APIC_ID));
378 #else
379 phys_id = smp_processor_id();
380 #endif
381 cpuid = smp_processor_id();
382 if (cpu_isset(cpuid, cpu_callin_map)) {
383 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
384 phys_id, cpuid);
385 }
386 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
388 /*
389 * STARTUP IPIs are fragile beasts as they might sometimes
390 * trigger some glue motherboard logic. We keep the APIC bus
391 * completely silent for one second; this overestimates, by a
392 * factor of two, the time the boot CPU spends sending the up
393 * to two STARTUP IPIs. It should be enough.
394 */
396 /*
397 * Waiting 2s total for startup (udelay is not yet working)
398 */
399 timeout = jiffies + 2*HZ;
400 while (time_before(jiffies, timeout)) {
401 /*
402 * Has the boot CPU finished its STARTUP sequence?
403 */
404 if (cpu_isset(cpuid, cpu_callout_map))
405 break;
406 cpu_relax();
407 }
409 if (!time_before(jiffies, timeout)) {
410 panic("smp_callin: CPU%d started up but did not get a callout!\n",
411 cpuid);
412 }
414 #ifndef CONFIG_XEN
415 /*
416 * the boot CPU has finished the init stage and is spinning
417 * on callin_map until we finish. We are free to set up this
418 * CPU, first the APIC. (this is probably redundant on most
419 * boards)
420 */
422 Dprintk("CALLIN, before setup_local_APIC().\n");
423 setup_local_APIC();
424 #endif
426 /*
427 * Get our bogomips.
428 */
429 calibrate_delay();
430 Dprintk("Stack at about %p\n",&cpuid);
432 #ifndef CONFIG_XEN
433 disable_APIC_timer();
434 #endif
436 /*
437 * Save our processor parameters
438 */
439 smp_store_cpu_info(cpuid);
441 /*
442 * Allow the master to continue.
443 */
444 cpu_set(cpuid, cpu_callin_map);
445 }
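/*
 * Sketch of the BP/AP handshake implemented in this file (not a general
 * API): do_boot_cpu() on the BP starts the AP and sets the AP's bit in
 * cpu_callout_map; smp_callin() on the AP waits for that bit, calibrates
 * and stores its cpuinfo, then answers by setting its bit in
 * cpu_callin_map, which the BP polls for up to 5 seconds. The AP finally
 * marks itself online in start_secondary().
 */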
447 #ifdef CONFIG_XEN
448 static irqreturn_t ldebug_interrupt(
449 int irq, void *dev_id, struct pt_regs *regs)
450 {
451 return IRQ_HANDLED;
452 }
454 static DEFINE_PER_CPU(int, ldebug_irq);
455 static char ldebug_name[NR_CPUS][15];
457 void ldebug_setup(void)
458 {
459 int cpu = smp_processor_id();
461 per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG);
462 sprintf(ldebug_name[cpu], "ldebug%d", cpu);
463 BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt,
464 SA_INTERRUPT, ldebug_name[cpu], NULL));
465 }
467 extern void local_setup_timer(void);
468 #endif
470 /*
471 * Setup code on the secondary processor (after coming out of the trampoline)
472 */
473 void __cpuinit start_secondary(void)
474 {
475 /*
476 * Don't put anything before smp_callin(); SMP
477 * booting is fragile enough that we want to limit the
478 * work done here to the bare minimum.
479 */
480 cpu_init();
481 smp_callin();
483 /* otherwise gcc would move smp_processor_id() up before cpu_init() */
484 barrier();
486 #ifndef CONFIG_XEN
487 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
488 setup_secondary_APIC_clock();
490 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
492 if (nmi_watchdog == NMI_IO_APIC) {
493 disable_8259A_irq(0);
494 enable_NMI_through_LVT0(NULL);
495 enable_8259A_irq(0);
496 }
498 enable_APIC_timer();
499 #else
500 local_setup_timer();
501 ldebug_setup();
502 smp_intr_init();
503 local_irq_enable();
504 #endif
506 /*
507 * Allow the master to continue.
508 */
509 cpu_set(smp_processor_id(), cpu_online_map);
510 mb();
512 #ifndef CONFIG_XEN
513 /* Wait for TSC sync so that nothing gets scheduled before it
514 completes. We still process interrupts, which unfortunately
515 could see an inconsistent time in that window. */
516 tsc_sync_wait();
517 #endif
519 cpu_idle();
520 }
522 extern volatile unsigned long init_rsp;
523 extern void (*initial_code)(void);
525 #ifndef CONFIG_XEN
526 #if APIC_DEBUG
527 static void inquire_remote_apic(int apicid)
528 {
529 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
530 char *names[] = { "ID", "VERSION", "SPIV" };
531 int timeout, status;
533 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
535 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
536 printk("... APIC #%d %s: ", apicid, names[i]);
538 /*
539 * Wait for idle.
540 */
541 apic_wait_icr_idle();
543 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
544 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
546 timeout = 0;
547 do {
548 udelay(100);
549 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
550 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
552 switch (status) {
553 case APIC_ICR_RR_VALID:
554 status = apic_read(APIC_RRR);
555 printk("%08x\n", status);
556 break;
557 default:
558 printk("failed\n");
559 }
560 }
561 }
562 #endif
564 /*
565 * Kick the secondary to wake up.
566 */
567 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
568 {
569 unsigned long send_status = 0, accept_status = 0;
570 int maxlvt, timeout, num_starts, j;
572 Dprintk("Asserting INIT.\n");
574 /*
575 * Turn INIT on target chip
576 */
577 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
579 /*
580 * Send IPI
581 */
582 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
583 | APIC_DM_INIT);
585 Dprintk("Waiting for send to finish...\n");
586 timeout = 0;
587 do {
588 Dprintk("+");
589 udelay(100);
590 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
591 } while (send_status && (timeout++ < 1000));
593 mdelay(10);
595 Dprintk("Deasserting INIT.\n");
597 /* Target chip */
598 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
600 /* Send IPI */
601 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
603 Dprintk("Waiting for send to finish...\n");
604 timeout = 0;
605 do {
606 Dprintk("+");
607 udelay(100);
608 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
609 } while (send_status && (timeout++ < 1000));
611 atomic_set(&init_deasserted, 1);
613 /*
614 * Should we send STARTUP IPIs ?
615 *
616 * Determine this based on the APIC version.
617 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
618 */
619 if (APIC_INTEGRATED(apic_version[phys_apicid]))
620 num_starts = 2;
621 else
622 num_starts = 0;
624 /*
625 * Run STARTUP IPI loop.
626 */
627 Dprintk("#startup loops: %d.\n", num_starts);
629 maxlvt = get_maxlvt();
631 for (j = 1; j <= num_starts; j++) {
632 Dprintk("Sending STARTUP #%d.\n",j);
633 apic_read_around(APIC_SPIV);
634 apic_write(APIC_ESR, 0);
635 apic_read(APIC_ESR);
636 Dprintk("After apic_write.\n");
638 /*
639 * STARTUP IPI
640 */
642 /* Target chip */
643 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
645 /* Boot on the stack */
646 /* Kick the second */
647 apic_write_around(APIC_ICR, APIC_DM_STARTUP
648 | (start_rip >> 12));
650 /*
651 * Give the other CPU some time to accept the IPI.
652 */
653 udelay(300);
655 Dprintk("Startup point 1.\n");
657 Dprintk("Waiting for send to finish...\n");
658 timeout = 0;
659 do {
660 Dprintk("+");
661 udelay(100);
662 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
663 } while (send_status && (timeout++ < 1000));
665 /*
666 * Give the other CPU some time to accept the IPI.
667 */
668 udelay(200);
669 /*
670 * Due to the Pentium erratum 3AP.
671 */
672 if (maxlvt > 3) {
673 apic_read_around(APIC_SPIV);
674 apic_write(APIC_ESR, 0);
675 }
676 accept_status = (apic_read(APIC_ESR) & 0xEF);
677 if (send_status || accept_status)
678 break;
679 }
680 Dprintk("After Startup.\n");
682 if (send_status)
683 printk(KERN_ERR "APIC never delivered???\n");
684 if (accept_status)
685 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
687 return (send_status | accept_status);
688 }
689 #endif
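/*
 * Note on the STARTUP vector above: APIC_DM_STARTUP | (start_rip >> 12)
 * encodes the AP entry point as a 4KB page number, so start_rip must be
 * page-aligned and below 1MB. That is why the non-Xen path boots through
 * the low-memory trampoline copied by setup_trampoline() rather than
 * jumping to the kernel entry point directly.
 */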
691 /*
692 * Boot one CPU.
693 */
694 static int __cpuinit do_boot_cpu(int cpu, int apicid)
695 {
696 struct task_struct *idle;
697 unsigned long boot_error;
698 int timeout;
699 unsigned long start_rip;
700 #ifdef CONFIG_XEN
701 vcpu_guest_context_t ctxt;
702 extern void startup_64_smp(void);
703 extern void hypervisor_callback(void);
704 extern void failsafe_callback(void);
705 extern void smp_trap_init(trap_info_t *);
706 int i;
707 #endif
708 /*
709 * We can't use kernel_thread since we must avoid
710 * rescheduling the child.
711 */
712 idle = fork_idle(cpu);
713 if (IS_ERR(idle)) {
714 printk("failed fork for CPU %d\n", cpu);
715 return PTR_ERR(idle);
716 }
718 cpu_pda[cpu].pcurrent = idle;
720 #ifndef CONFIG_XEN
721 start_rip = setup_trampoline();
722 #else
723 start_rip = (unsigned long)startup_64_smp;
724 #endif
726 init_rsp = idle->thread.rsp;
727 per_cpu(init_tss,cpu).rsp0 = init_rsp;
728 initial_code = start_secondary;
729 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
731 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
732 start_rip, init_rsp);
734 /*
735 * This grunge runs the startup process for
736 * the targeted processor.
737 */
739 atomic_set(&init_deasserted, 0);
741 #ifdef CONFIG_XEN
742 if (cpu_gdt_descr[0].size > PAGE_SIZE)
743 BUG();
744 cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
745 memcpy((void *)cpu_gdt_descr[cpu].address,
746 (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
748 memset(&ctxt, 0, sizeof(ctxt));
750 ctxt.flags = VGCF_IN_KERNEL;
751 ctxt.user_regs.ds = __USER_DS;
752 ctxt.user_regs.es = __USER_DS;
753 ctxt.user_regs.fs = 0;
754 ctxt.user_regs.gs = 0;
755 ctxt.user_regs.ss = __KERNEL_DS|0x3;
756 ctxt.user_regs.cs = __KERNEL_CS|0x3;
757 ctxt.user_regs.rip = start_rip;
758 ctxt.user_regs.rsp = idle->thread.rsp;
759 #define X86_EFLAGS_IOPL_RING3 0x3000
760 ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3;
762 /* FPU is set up to default initial state. */
763 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
765 /* Virtual IDT is empty at start-of-day. */
766 for ( i = 0; i < 256; i++ )
767 {
768 ctxt.trap_ctxt[i].vector = i;
769 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
770 }
771 smp_trap_init(ctxt.trap_ctxt);
773 /* No LDT. */
774 ctxt.ldt_ents = 0;
776 {
777 unsigned long va;
778 int f;
780 for (va = cpu_gdt_descr[cpu].address, f = 0;
781 va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
782 va += PAGE_SIZE, f++) {
783 ctxt.gdt_frames[f] = virt_to_mfn(va);
784 make_page_readonly((void *)va);
785 }
786 ctxt.gdt_ents = GDT_ENTRIES;
787 }
789 /* Ring 1 stack is the initial stack. */
790 ctxt.kernel_ss = __KERNEL_DS;
791 ctxt.kernel_sp = idle->thread.rsp;
793 /* Callback handlers. */
794 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
795 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
796 ctxt.syscall_callback_eip = (unsigned long)system_call;
798 ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
800 boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
802 if (!boot_error) {
803 /*
804 * allow APs to start initializing.
805 */
806 Dprintk("Before Callout %d.\n", cpu);
807 cpu_set(cpu, cpu_callout_map);
808 Dprintk("After Callout %d.\n", cpu);
810 /*
811 * Wait 5s total for a response
812 */
813 for (timeout = 0; timeout < 50000; timeout++) {
814 if (cpu_isset(cpu, cpu_callin_map))
815 break; /* It has booted */
816 udelay(100);
817 }
819 if (cpu_isset(cpu, cpu_callin_map)) {
820 /* number CPUs logically, starting from 1 (BSP is 0) */
821 Dprintk("CPU has booted.\n");
822 } else {
823 boot_error= 1;
824 }
825 }
826 x86_cpu_to_apicid[cpu] = apicid;
827 #else
828 Dprintk("Setting warm reset code and vector.\n");
830 CMOS_WRITE(0xa, 0xf);
831 local_flush_tlb();
832 Dprintk("1.\n");
833 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
834 Dprintk("2.\n");
835 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
836 Dprintk("3.\n");
838 /*
839 * Be paranoid about clearing APIC errors.
840 */
841 if (APIC_INTEGRATED(apic_version[apicid])) {
842 apic_read_around(APIC_SPIV);
843 apic_write(APIC_ESR, 0);
844 apic_read(APIC_ESR);
845 }
847 /*
848 * Status is now clean
849 */
850 boot_error = 0;
852 /*
853 * Starting actual IPI sequence...
854 */
855 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
857 if (!boot_error) {
858 /*
859 * allow APs to start initializing.
860 */
861 Dprintk("Before Callout %d.\n", cpu);
862 cpu_set(cpu, cpu_callout_map);
863 Dprintk("After Callout %d.\n", cpu);
865 /*
866 * Wait 5s total for a response
867 */
868 for (timeout = 0; timeout < 50000; timeout++) {
869 if (cpu_isset(cpu, cpu_callin_map))
870 break; /* It has booted */
871 udelay(100);
872 }
874 if (cpu_isset(cpu, cpu_callin_map)) {
875 /* number CPUs logically, starting from 1 (BSP is 0) */
876 Dprintk("CPU has booted.\n");
877 } else {
878 boot_error = 1;
879 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
880 == 0xA5)
881 /* trampoline started but...? */
882 printk("Stuck ??\n");
883 else
884 /* trampoline code not run */
885 printk("Not responding.\n");
886 #if APIC_DEBUG
887 inquire_remote_apic(apicid);
888 #endif
889 }
890 }
891 #endif
892 if (boot_error) {
893 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
894 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
895 cpu_clear(cpu, cpu_present_map);
896 cpu_clear(cpu, cpu_possible_map);
897 x86_cpu_to_apicid[cpu] = BAD_APICID;
898 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
899 return -EIO;
900 }
902 return 0;
903 }
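/*
 * Summary of the CONFIG_XEN branch above (a sketch of this file's flow,
 * not a general interface): instead of INIT/STARTUP IPIs, the new vcpu is
 * described by a vcpu_guest_context_t (segment selectors, rip/rsp, a
 * virtual trap table, the per-cpu GDT frames passed as mfns and made
 * read-only, the event/failsafe/syscall callback entry points, and
 * ctrlreg[3] set to the machine address of init_level4_pgt) and is
 * started with HYPERVISOR_boot_vcpu(cpu, &ctxt). The callout/callin
 * handshake that follows is the same as on native hardware.
 */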
905 cycles_t cacheflush_time;
906 unsigned long cache_decay_ticks;
908 /*
909 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
910 * on SMT systems efficiently.
911 */
912 static __cpuinit void detect_siblings(void)
913 {
914 int cpu;
916 for (cpu = 0; cpu < NR_CPUS; cpu++) {
917 cpus_clear(cpu_sibling_map[cpu]);
918 cpus_clear(cpu_core_map[cpu]);
919 }
921 for_each_online_cpu (cpu) {
922 struct cpuinfo_x86 *c = cpu_data + cpu;
923 int siblings = 0;
924 int i;
925 if (smp_num_siblings > 1) {
926 for_each_online_cpu (i) {
927 if (cpu_core_id[cpu] == cpu_core_id[i]) {
928 siblings++;
929 cpu_set(i, cpu_sibling_map[cpu]);
930 }
931 }
932 } else {
933 siblings++;
934 cpu_set(cpu, cpu_sibling_map[cpu]);
935 }
937 if (siblings != smp_num_siblings) {
938 printk(KERN_WARNING
939 "WARNING: %d siblings found for CPU%d, should be %d\n",
940 siblings, cpu, smp_num_siblings);
941 smp_num_siblings = siblings;
942 }
943 if (c->x86_num_cores > 1) {
944 for_each_online_cpu(i) {
945 if (phys_proc_id[cpu] == phys_proc_id[i])
946 cpu_set(i, cpu_core_map[cpu]);
947 }
948 } else
949 cpu_core_map[cpu] = cpu_sibling_map[cpu];
950 }
951 }
953 #ifndef CONFIG_XEN
954 /*
955 * Cleanup possible dangling ends...
956 */
957 static __cpuinit void smp_cleanup_boot(void)
958 {
959 /*
960 * Paranoid: Set warm reset code and vector here back
961 * to default values.
962 */
963 CMOS_WRITE(0, 0xf);
965 /*
966 * Reset trampoline flag
967 */
968 *((volatile int *) phys_to_virt(0x467)) = 0;
970 #ifndef CONFIG_HOTPLUG_CPU
971 /*
972 * Free pages reserved for SMP bootup.
973 * When you add hotplug CPU support later, remove this.
974 * Note there is more work to be done for later CPU bootup.
975 */
977 free_page((unsigned long) __va(PAGE_SIZE));
978 free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
979 #endif
980 }
981 #endif
983 /*
985 * Fall back to non-SMP mode after errors.
985 *
986 * RED-PEN audit/test this more. I bet there is more state messed up here.
987 */
988 static __cpuinit void disable_smp(void)
989 {
990 cpu_present_map = cpumask_of_cpu(0);
991 cpu_possible_map = cpumask_of_cpu(0);
992 #ifndef CONFIG_XEN
993 if (smp_found_config)
994 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
995 else
996 phys_cpu_present_map = physid_mask_of_physid(0);
997 #endif
998 cpu_set(0, cpu_sibling_map[0]);
999 cpu_set(0, cpu_core_map[0]);
1000 }
1002 /*
1003 * Handle user cpus=... parameter.
1004 */
1005 static __cpuinit void enforce_max_cpus(unsigned max_cpus)
1006 {
1007 int i, k;
1008 k = 0;
1009 for (i = 0; i < NR_CPUS; i++) {
1010 if (!cpu_possible(i))
1011 continue;
1012 if (++k > max_cpus) {
1013 cpu_clear(i, cpu_possible_map);
1014 cpu_clear(i, cpu_present_map);
1015 }
1016 }
1017 }
1019 /*
1020 * Various sanity checks.
1021 */
1022 static int __cpuinit smp_sanity_check(unsigned max_cpus)
1023 {
1024 #ifndef CONFIG_XEN
1025 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1026 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1027 hard_smp_processor_id());
1028 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1029 }
1031 /*
1032 * If we couldn't find an SMP configuration at boot time,
1033 * get out of here now!
1034 */
1035 if (!smp_found_config) {
1036 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1037 disable_smp();
1038 if (APIC_init_uniprocessor())
1039 printk(KERN_NOTICE "Local APIC not detected."
1040 " Using dummy APIC emulation.\n");
1041 return -1;
1042 }
1044 /*
1045 * Should not be necessary because the MP table should list the boot
1046 * CPU too, but we do it for the sake of robustness anyway.
1047 */
1048 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
1049 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
1050 boot_cpu_id);
1051 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1052 }
1054 /*
1055 * If we couldn't find a local APIC, then get out of here now!
1056 */
1057 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
1058 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1059 boot_cpu_id);
1060 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1061 nr_ioapics = 0;
1062 return -1;
1063 }
1064 #endif
1066 /*
1067 * If SMP should be disabled, then really disable it!
1068 */
1069 if (!max_cpus) {
1070 #ifdef CONFIG_XEN
1071 HYPERVISOR_shared_info->n_vcpu = 1;
1072 #endif
1073 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1074 #ifndef CONFIG_XEN
1075 nr_ioapics = 0;
1076 #endif
1077 return -1;
1078 }
1080 return 0;
1081 }
1083 /*
1084 * Prepare for SMP bootup. The MP table or ACPI has been read
1085 * earlier. Just do some sanity checking here and enable APIC mode.
1086 */
1087 void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
1088 {
1089 int i;
1091 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1092 #else
1093 nmi_watchdog_default();
1094 #endif
1095 current_cpu_data = boot_cpu_data;
1096 current_thread_info()->cpu = 0; /* needed? */
1098 enforce_max_cpus(max_cpus);
1100 /*
1101 * Fill in cpu_present_mask
1102 */
1103 for (i = 0; i < NR_CPUS; i++) {
1104 #ifndef CONFIG_XEN
1105 int apicid = cpu_present_to_apicid(i);
1106 if (physid_isset(apicid, phys_cpu_present_map)) {
1107 #else
1108 if (i < HYPERVISOR_shared_info->n_vcpu) {
1109 #endif
1110 cpu_set(i, cpu_present_map);
1111 /* possible map would be different if we supported real
1112 CPU hotplug. */
1113 cpu_set(i, cpu_possible_map);
1114 }
1115 }
1117 if (smp_sanity_check(max_cpus) < 0) {
1118 printk(KERN_INFO "SMP disabled\n");
1119 disable_smp();
1120 return;
1121 }
1123 #ifdef CONFIG_XEN
1124 smp_intr_init();
1125 #else
1127 /*
1128 * Switch from PIC to APIC mode.
1129 */
1130 connect_bsp_APIC();
1131 setup_local_APIC();
1133 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1134 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1135 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1136 /* Or can we switch back to PIC here? */
1137 }
1138 #endif
1140 /*
1141 * Now start the IO-APICs
1142 */
1143 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1144 #else
1145 if (!skip_ioapic_setup && nr_ioapics)
1146 setup_IO_APIC();
1147 else
1148 nr_ioapics = 0;
1149 #endif
1151 /*
1152 * Set up local APIC timer on boot CPU.
1153 */
1155 #ifndef CONFIG_XEN
1156 setup_boot_APIC_clock();
1157 #endif
1158 }
1160 /*
1161 * Early setup to make printk work.
1162 */
1163 void __init smp_prepare_boot_cpu(void)
1164 {
1165 int me = smp_processor_id();
1166 cpu_set(me, cpu_online_map);
1167 cpu_set(me, cpu_callout_map);
1168 }
1170 /*
1171 * Entry point to boot a CPU.
1173 * This is all __cpuinit, not __devinit for now because we don't support
1174 * CPU hotplug (yet).
1175 */
1176 int __cpuinit __cpu_up(unsigned int cpu)
1177 {
1178 int err;
1179 #ifndef CONFIG_XEN
1180 int apicid = cpu_present_to_apicid(cpu);
1181 #else
1182 int apicid = cpu;
1183 #endif
1185 WARN_ON(irqs_disabled());
1187 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1189 #ifndef CONFIG_XEN
1190 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1191 !physid_isset(apicid, phys_cpu_present_map)) {
1192 printk("__cpu_up: bad cpu %d\n", cpu);
1193 return -EINVAL;
1194 }
1195 #endif
1197 /* Boot it! */
1198 err = do_boot_cpu(cpu, apicid);
1199 if (err < 0) {
1200 Dprintk("do_boot_cpu failed %d\n", err);
1201 return err;
1202 }
1204 /* Unleash the CPU! */
1205 Dprintk("waiting for cpu %d\n", cpu);
1207 while (!cpu_isset(cpu, cpu_online_map))
1208 cpu_relax();
1209 return 0;
1210 }
1212 /*
1213 * Finish the SMP boot.
1214 */
1215 void __cpuinit smp_cpus_done(unsigned int max_cpus)
1216 {
1217 #ifndef CONFIG_XEN
1218 zap_low_mappings();
1219 smp_cleanup_boot();
1221 #ifdef CONFIG_X86_IO_APIC
1222 setup_ioapic_dest();
1223 #endif
1224 #endif
1226 detect_siblings();
1227 #ifndef CONFIG_XEN
1228 time_init_gtod();
1230 check_nmi_watchdog();
1231 #endif
1232 }
1234 #ifdef CONFIG_XEN
1235 extern int bind_ipi_to_irq(int ipi);
1236 extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
1237 extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
1239 static DEFINE_PER_CPU(int, resched_irq);
1240 static DEFINE_PER_CPU(int, callfunc_irq);
1241 static char resched_name[NR_CPUS][15];
1242 static char callfunc_name[NR_CPUS][15];
1244 void smp_intr_init(void)
1245 {
1246 int cpu = smp_processor_id();
1248 per_cpu(resched_irq, cpu) =
1249 bind_ipi_to_irq(RESCHEDULE_VECTOR);
1250 sprintf(resched_name[cpu], "resched%d", cpu);
1251 BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
1252 SA_INTERRUPT, resched_name[cpu], NULL));
1254 per_cpu(callfunc_irq, cpu) =
1255 bind_ipi_to_irq(CALL_FUNCTION_VECTOR);
1256 sprintf(callfunc_name[cpu], "callfunc%d", cpu);
1257 BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
1258 smp_call_function_interrupt,
1259 SA_INTERRUPT, callfunc_name[cpu], NULL));
1260 }
1262 static void smp_intr_exit(void)
1263 {
1264 int cpu = smp_processor_id();
1266 free_irq(per_cpu(resched_irq, cpu), NULL);
1267 unbind_ipi_from_irq(RESCHEDULE_VECTOR);
1269 free_irq(per_cpu(callfunc_irq, cpu), NULL);
1270 unbind_ipi_from_irq(CALL_FUNCTION_VECTOR);
1271 }
1273 extern void local_setup_timer_irq(void);
1274 extern void local_teardown_timer_irq(void);
1276 void smp_suspend(void)
1277 {
1278 /* XXX todo: take down time and ipi's on all cpus */
1279 local_teardown_timer_irq();
1280 smp_intr_exit();
1281 }
1283 void smp_resume(void)
1284 {
1285 /* XXX todo: restore time and ipi's on all cpus */
1286 smp_intr_init();
1287 local_setup_timer_irq();
1288 }
1290 void _restore_vcpu(void)
1291 {
1292 /* XXX need to write this */
1293 }
1295 #endif