ia64/xen-unstable

view linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @ 6433:0610add7c3fe

merge?
author cl349@firebug.cl.cam.ac.uk
date Thu Aug 25 16:27:04 2005 +0000 (2005-08-25)
parents b54144915ae6 8d31f9a9c423
children b4b3f6be5226
1 /*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 */
40 #include <linux/config.h>
41 #include <linux/init.h>
43 #include <linux/mm.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/smp_lock.h>
46 #include <linux/irq.h>
47 #include <linux/bootmem.h>
48 #include <linux/thread_info.h>
49 #include <linux/module.h>
50 #ifdef CONFIG_XEN
51 #include <linux/interrupt.h>
52 #endif
54 #include <linux/delay.h>
55 #include <linux/mc146818rtc.h>
56 #include <asm/mtrr.h>
57 #include <asm/pgalloc.h>
58 #include <asm/desc.h>
59 #include <asm/kdebug.h>
60 #include <asm/tlbflush.h>
61 #include <asm/proto.h>
62 #include <asm/nmi.h>
63 #ifdef CONFIG_XEN
64 #include <asm/arch_hooks.h>
66 #include <asm-xen/evtchn.h>
67 #endif
69 /* Change for real CPU hotplug. Note other files need to be fixed
70 first too. */
71 #define __cpuinit __init
72 #define __cpuinitdata __initdata
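(Note: with these defines every __cpuinit function and __cpuinitdata object in this file lands in the init sections and is discarded after boot, which is why the comment above says real CPU hotplug support would first need this, and other files, fixed so the code stays resident.)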
74 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
75 unsigned int maxcpus = NR_CPUS;
76 #endif
78 /* Number of siblings per CPU package */
79 int smp_num_siblings = 1;
80 /* Package ID of each logical CPU */
81 u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
82 u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
83 EXPORT_SYMBOL(phys_proc_id);
84 EXPORT_SYMBOL(cpu_core_id);
86 /* Bitmask of currently online CPUs */
87 cpumask_t cpu_online_map;
89 EXPORT_SYMBOL(cpu_online_map);
91 /*
92 * Private maps to synchronize booting between AP and BP.
93 * Probably not needed anymore, but it makes for easier debugging. -AK
94 */
95 cpumask_t cpu_callin_map;
96 cpumask_t cpu_callout_map;
98 cpumask_t cpu_possible_map;
99 EXPORT_SYMBOL(cpu_possible_map);
101 /* Per CPU bogomips and other parameters */
102 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
104 /* Set when the idlers are all forked */
105 int smp_threads_ready;
107 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
108 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
109 EXPORT_SYMBOL(cpu_core_map);
111 #ifndef CONFIG_XEN
112 /*
113 * Trampoline 80x86 program as an array.
114 */
116 extern unsigned char trampoline_data[];
117 extern unsigned char trampoline_end[];
119 /*
120 * Currently trivial. Write the real->protected mode
121 * bootstrap into the page concerned. The caller
122 * has made sure it's suitably aligned.
123 */
125 static unsigned long __cpuinit setup_trampoline(void)
126 {
127 void *tramp = __va(SMP_TRAMPOLINE_BASE);
128 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
129 return virt_to_phys(tramp);
130 }
131 #endif
133 /*
134 * The bootstrap kernel entry code has set these up. Save them for
135 * a given CPU
136 */
138 static void __cpuinit smp_store_cpu_info(int id)
139 {
140 struct cpuinfo_x86 *c = cpu_data + id;
142 *c = boot_cpu_data;
143 identify_cpu(c);
144 print_cpu_info(c);
145 }
147 #ifndef CONFIG_XEN
148 /*
149 * New Funky TSC sync algorithm borrowed from IA64.
150 * Main advantage is that it doesn't reset the TSCs fully and
151 * in general looks more robust and works better than my earlier
152 * attempts. I believe it was written by David Mosberger. Some minor
153 * adjustments for x86-64 by me -AK
154 *
155 * Original comment reproduced below.
156 *
157 * Synchronize TSC of the current (slave) CPU with the TSC of the
158 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
159 * eliminate the possibility of unaccounted-for errors (such as
160 * getting a machine check in the middle of a calibration step). The
161 * basic idea is for the slave to ask the master what itc value it has
162 * and to read its own itc before and after the master responds. Each
163 * iteration gives us three timestamps:
164 *
165 * slave master
166 *
167 * t0 ---\
168 * ---\
169 * --->
170 * tm
171 * /---
172 * /---
173 * t1 <---
174 *
175 *
176 * The goal is to adjust the slave's TSC such that tm falls exactly
177 * half-way between t0 and t1. If we achieve this, the clocks are
178 * synchronized provided the interconnect between the slave and the
179 * master is symmetric. Even if the interconnect were asymmetric, we
180 * would still know that the synchronization error is smaller than the
181 * roundtrip latency (t1 - t0).
182 *
183 * When the interconnect is quiet and symmetric, this lets us
184 * synchronize the TSC to within one or two cycles. However, we can
185 * only *guarantee* that the synchronization is accurate to within a
186 * round-trip time, which is typically in the range of several hundred
187 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
188 * are usually almost perfectly synchronized, but we shouldn't assume
189 * that the accuracy is much better than half a microsecond or so.
190 *
191 * [there are other errors like the latency of RDTSC and of the
192 * WRMSR. These can also amount to hundreds of cycles. So it's
193 * probably worse. It claims 153 cycles error on a dual Opteron,
194 * but I suspect the numbers are actually somewhat worse -AK]
195 */
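As a rough standalone illustration of the midpoint rule described above (the numbers are invented and this is not kernel code): if the slave reads t0 = 1000 and t1 = 2000 around a master timestamp tm = 1600, the midpoint is 1500, so the slave appears to be 100 cycles behind; get_delta() below would return -100 and sync_tsc() would then bump the slave's TSC forward by 100.

#include <stdio.h>

/* Hypothetical user-space sketch of the midpoint arithmetic; it mirrors
 * the tcenter/delta computation in get_delta() below but is not part of
 * the kernel source. */
int main(void)
{
	unsigned long t0 = 1000, tm = 1600, t1 = 2000;
	unsigned long tcenter = t0/2 + t1/2 + (t0 % 2 + t1 % 2 == 2);
	long delta = (long)(tcenter - tm);	/* > 0: slave ahead, < 0: slave behind */

	/* Prints: midpoint=1500 delta=-100 adjust=100 */
	printf("midpoint=%lu delta=%ld adjust=%ld\n", tcenter, delta, -delta);
	return 0;
}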
197 #define MASTER 0
198 #define SLAVE (SMP_CACHE_BYTES/8)
200 /* Intentionally don't use cpu_relax() during TSC synchronization
201 because we don't want to go into funky power-save modes or cause
202 hypervisors to schedule us away. Going to sleep would likely affect
203 latency, and low latency is the primary objective here. -AK */
204 #define no_cpu_relax() barrier()
206 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
207 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
208 static int notscsync __cpuinitdata;
210 #undef DEBUG_TSC_SYNC
212 #define NUM_ROUNDS 64 /* magic value */
213 #define NUM_ITERS 5 /* likewise */
215 /* Callback on boot CPU */
216 static __cpuinit void sync_master(void *arg)
217 {
218 unsigned long flags, i;
220 if (smp_processor_id() != boot_cpu_id)
221 return;
223 go[MASTER] = 0;
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235 }
237 /*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242 static inline long
243 get_delta(long *rt, long *master)
244 {
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269 }
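The tcenter computation above is the standard trick for averaging two unsigned values without risking overflow in (best_t0 + best_t1): halve each operand first, then add one back when both halves dropped a remainder. A minimal standalone sketch of the same idea (not kernel code):

/* Overflow-safe average of two unsigned longs, using the same rounding
 * as the tcenter calculation in get_delta(). */
static unsigned long avg_no_overflow(unsigned long a, unsigned long b)
{
	unsigned long r = a/2 + b/2;

	if (a % 2 + b % 2 == 2)		/* both odd: the dropped halves sum to 1 */
		++r;
	return r;
}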
271 static __cpuinit void sync_tsc(void)
272 {
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276 #if DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283 #endif
285 go[MASTER] = 1;
287 smp_call_function(sync_master, NULL, 1, 0);
289 while (go[MASTER]) /* wait for master to be ready */
290 no_cpu_relax();
292 spin_lock_irqsave(&tsc_sync_lock, flags);
293 {
294 for (i = 0; i < NUM_ROUNDS; ++i) {
295 delta = get_delta(&rt, &master_time_stamp);
296 if (delta == 0) {
297 done = 1; /* let's lock on to this... */
298 bound = rt;
299 }
301 if (!done) {
302 unsigned long t;
303 if (i > 0) {
304 adjust_latency += -delta;
305 adj = -delta + adjust_latency/4;
306 } else
307 adj = -delta;
309 rdtscll(t);
310 wrmsrl(MSR_IA32_TSC, t + adj);
311 }
312 #if DEBUG_TSC_SYNC
313 t[i].rt = rt;
314 t[i].master = master_time_stamp;
315 t[i].diff = delta;
316 t[i].lat = adjust_latency/4;
317 #endif
318 }
319 }
320 spin_unlock_irqrestore(&tsc_sync_lock, flags);
322 #if DEBUG_TSC_SYNC
323 for (i = 0; i < NUM_ROUNDS; ++i)
324 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
325 t[i].rt, t[i].master, t[i].diff, t[i].lat);
326 #endif
328 printk(KERN_INFO
329 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
330 "maxerr %lu cycles)\n",
331 smp_processor_id(), boot_cpu_id, delta, rt);
332 }
334 static void __cpuinit tsc_sync_wait(void)
335 {
336 if (notscsync || !cpu_has_tsc)
337 return;
338 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
339 boot_cpu_id);
340 sync_tsc();
341 }
343 static __init int notscsync_setup(char *s)
344 {
345 notscsync = 1;
346 return 0;
347 }
348 __setup("notscsync", notscsync_setup);
349 #endif
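In other words, TSC synchronization can be skipped entirely by booting with notscsync on the kernel command line: notscsync_setup() sets the flag, and tsc_sync_wait() then returns before ever calling sync_tsc() on each AP.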
351 static atomic_t init_deasserted __cpuinitdata;
353 /*
354 * Report back to the Boot Processor.
355 * Running on AP.
356 */
357 void __cpuinit smp_callin(void)
358 {
359 int cpuid, phys_id;
360 unsigned long timeout;
362 #ifndef CONFIG_XEN
363 /*
364 * If woken up by an INIT in an 82489DX configuration
365 * we may get here before an INIT-deassert IPI reaches
366 * our local APIC. We have to wait for the IPI or we'll
367 * lock up on an APIC access.
368 */
369 while (!atomic_read(&init_deasserted))
370 cpu_relax();
372 #endif
373 /*
374 * (This works even if the APIC is not enabled.)
375 */
376 #ifndef CONFIG_XEN
377 phys_id = GET_APIC_ID(apic_read(APIC_ID));
378 #else
379 phys_id = smp_processor_id();
380 #endif
381 cpuid = smp_processor_id();
382 if (cpu_isset(cpuid, cpu_callin_map)) {
383 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
384 phys_id, cpuid);
385 }
386 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
388 /*
389 * STARTUP IPIs are fragile beasts as they might sometimes
390 * trigger some glue motherboard logic. Complete APIC bus
391 * silence for 1 second; this overestimates the time the
392 * boot CPU spends sending the up to 2 STARTUP IPIs
393 * by a factor of two. This should be enough.
394 */
396 /*
397 * Waiting 2s total for startup (udelay is not yet working)
398 */
399 timeout = jiffies + 2*HZ;
400 while (time_before(jiffies, timeout)) {
401 /*
402 * Has the boot CPU finished its STARTUP sequence?
403 */
404 if (cpu_isset(cpuid, cpu_callout_map))
405 break;
406 cpu_relax();
407 }
409 if (!time_before(jiffies, timeout)) {
410 panic("smp_callin: CPU%d started up but did not get a callout!\n",
411 cpuid);
412 }
414 #ifndef CONFIG_XEN
415 /*
416 * the boot CPU has finished the init stage and is spinning
417 * on callin_map until we finish. We are free to set up this
418 * CPU, first the APIC. (this is probably redundant on most
419 * boards)
420 */
422 Dprintk("CALLIN, before setup_local_APIC().\n");
423 setup_local_APIC();
424 #endif
426 /*
427 * Get our bogomips.
428 */
429 calibrate_delay();
430 Dprintk("Stack at about %p\n",&cpuid);
432 #ifndef CONFIG_XEN
433 disable_APIC_timer();
434 #endif
436 /*
437 * Save our processor parameters
438 */
439 smp_store_cpu_info(cpuid);
441 /*
442 * Allow the master to continue.
443 */
444 cpu_set(cpuid, cpu_callin_map);
445 }
447 #ifdef CONFIG_XEN
448 static irqreturn_t ldebug_interrupt(
449 int irq, void *dev_id, struct pt_regs *regs)
450 {
451 return IRQ_HANDLED;
452 }
454 static DEFINE_PER_CPU(int, ldebug_irq);
455 static char ldebug_name[NR_CPUS][15];
457 void ldebug_setup(void)
458 {
459 int cpu = smp_processor_id();
461 per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG);
462 sprintf(ldebug_name[cpu], "ldebug%d", cpu);
463 BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt,
464 SA_INTERRUPT, ldebug_name[cpu], NULL));
465 }
467 extern void local_setup_timer(void);
468 #endif
470 /*
471 * Setup code on a secondary processor (after coming out of the trampoline)
472 */
473 void __cpuinit start_secondary(void)
474 {
475 /*
476 * Don't put anything before smp_callin(); SMP
477 * booting is so fragile that we want to limit the
478 * work done here to the bare minimum.
479 */
480 cpu_init();
481 smp_callin();
483 /* otherwise gcc will move smp_processor_id() up before cpu_init() */
484 barrier();
486 #ifndef CONFIG_XEN
487 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
488 setup_secondary_APIC_clock();
490 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
492 if (nmi_watchdog == NMI_IO_APIC) {
493 disable_8259A_irq(0);
494 enable_NMI_through_LVT0(NULL);
495 enable_8259A_irq(0);
496 }
498 enable_APIC_timer();
499 #else
500 local_setup_timer();
501 ldebug_setup();
502 smp_intr_init();
503 local_irq_enable();
504 #endif
506 /*
507 * Allow the master to continue.
508 */
509 cpu_set(smp_processor_id(), cpu_online_map);
510 mb();
512 #ifndef CONFIG_XEN
513 /* Wait for the TSC sync so that nothing is scheduled before it completes.
514 We still process interrupts, which could unfortunately see an
515 inconsistent time in that window. */
516 tsc_sync_wait();
517 #endif
519 cpu_idle();
520 }
522 extern volatile unsigned long init_rsp;
523 extern void (*initial_code)(void);
525 #ifndef CONFIG_XEN
526 #if APIC_DEBUG
527 static void inquire_remote_apic(int apicid)
528 {
529 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
530 char *names[] = { "ID", "VERSION", "SPIV" };
531 int timeout, status;
533 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
535 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
536 printk("... APIC #%d %s: ", apicid, names[i]);
538 /*
539 * Wait for idle.
540 */
541 apic_wait_icr_idle();
543 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
544 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
546 timeout = 0;
547 do {
548 udelay(100);
549 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
550 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
552 switch (status) {
553 case APIC_ICR_RR_VALID:
554 status = apic_read(APIC_RRR);
555 printk("%08x\n", status);
556 break;
557 default:
558 printk("failed\n");
559 }
560 }
561 }
562 #endif
564 /*
565 * Kick the secondary to wake up.
566 */
567 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
568 {
569 unsigned long send_status = 0, accept_status = 0;
570 int maxlvt, timeout, num_starts, j;
572 Dprintk("Asserting INIT.\n");
574 /*
575 * Turn INIT on target chip
576 */
577 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
579 /*
580 * Send IPI
581 */
582 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
583 | APIC_DM_INIT);
585 Dprintk("Waiting for send to finish...\n");
586 timeout = 0;
587 do {
588 Dprintk("+");
589 udelay(100);
590 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
591 } while (send_status && (timeout++ < 1000));
593 mdelay(10);
595 Dprintk("Deasserting INIT.\n");
597 /* Target chip */
598 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
600 /* Send IPI */
601 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
603 Dprintk("Waiting for send to finish...\n");
604 timeout = 0;
605 do {
606 Dprintk("+");
607 udelay(100);
608 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
609 } while (send_status && (timeout++ < 1000));
611 atomic_set(&init_deasserted, 1);
613 /*
614 * Should we send STARTUP IPIs ?
615 *
616 * Determine this based on the APIC version.
617 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
618 */
619 if (APIC_INTEGRATED(apic_version[phys_apicid]))
620 num_starts = 2;
621 else
622 num_starts = 0;
624 /*
625 * Run STARTUP IPI loop.
626 */
627 Dprintk("#startup loops: %d.\n", num_starts);
629 maxlvt = get_maxlvt();
631 for (j = 1; j <= num_starts; j++) {
632 Dprintk("Sending STARTUP #%d.\n",j);
633 apic_read_around(APIC_SPIV);
634 apic_write(APIC_ESR, 0);
635 apic_read(APIC_ESR);
636 Dprintk("After apic_write.\n");
638 /*
639 * STARTUP IPI
640 */
642 /* Target chip */
643 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
645 /* Boot on the stack */
646 /* Kick the second */
647 apic_write_around(APIC_ICR, APIC_DM_STARTUP
648 | (start_rip >> 12));
650 /*
651 * Give the other CPU some time to accept the IPI.
652 */
653 udelay(300);
655 Dprintk("Startup point 1.\n");
657 Dprintk("Waiting for send to finish...\n");
658 timeout = 0;
659 do {
660 Dprintk("+");
661 udelay(100);
662 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
663 } while (send_status && (timeout++ < 1000));
665 /*
666 * Give the other CPU some time to accept the IPI.
667 */
668 udelay(200);
669 /*
670 * Due to the Pentium erratum 3AP.
671 */
672 if (maxlvt > 3) {
673 apic_read_around(APIC_SPIV);
674 apic_write(APIC_ESR, 0);
675 }
676 accept_status = (apic_read(APIC_ESR) & 0xEF);
677 if (send_status || accept_status)
678 break;
679 }
680 Dprintk("After Startup.\n");
682 if (send_status)
683 printk(KERN_ERR "APIC never delivered???\n");
684 if (accept_status)
685 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
687 return (send_status | accept_status);
688 }
689 #endif
691 /*
692 * Boot one CPU.
693 */
694 static int __cpuinit do_boot_cpu(int cpu, int apicid)
695 {
696 struct task_struct *idle;
697 unsigned long boot_error;
698 int timeout;
699 unsigned long start_rip;
700 #ifdef CONFIG_XEN
701 vcpu_guest_context_t ctxt;
702 extern void startup_64_smp(void);
703 extern void hypervisor_callback(void);
704 extern void failsafe_callback(void);
705 extern void smp_trap_init(trap_info_t *);
706 int i;
707 #endif
708 /*
709 * We can't use kernel_thread since we must avoid
710 * rescheduling the child.
711 */
712 idle = fork_idle(cpu);
713 if (IS_ERR(idle)) {
714 printk("failed fork for CPU %d\n", cpu);
715 return PTR_ERR(idle);
716 }
718 cpu_pda[cpu].pcurrent = idle;
720 #ifndef CONFIG_XEN
721 start_rip = setup_trampoline();
722 #else
723 start_rip = (unsigned long)startup_64_smp;
724 #endif
726 init_rsp = idle->thread.rsp;
727 per_cpu(init_tss,cpu).rsp0 = init_rsp;
728 initial_code = start_secondary;
729 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
731 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
732 start_rip, init_rsp);
734 /*
735 * This grunge runs the startup process for
736 * the targeted processor.
737 */
739 atomic_set(&init_deasserted, 0);
741 #ifdef CONFIG_XEN
742 cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL);
743 BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
744 cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
745 memcpy((void *)cpu_gdt_descr[cpu].address,
746 (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
748 memset(&ctxt, 0, sizeof(ctxt));
750 ctxt.flags = VGCF_IN_KERNEL;
751 ctxt.user_regs.ds = __USER_DS;
752 ctxt.user_regs.es = __USER_DS;
753 ctxt.user_regs.fs = 0;
754 ctxt.user_regs.gs = 0;
755 ctxt.user_regs.ss = __KERNEL_DS|0x3;
756 ctxt.user_regs.cs = __KERNEL_CS|0x3;
757 ctxt.user_regs.rip = start_rip;
758 ctxt.user_regs.rsp = idle->thread.rsp;
759 #define X86_EFLAGS_IOPL_RING3 0x3000
760 ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3;
762 /* FPU is set up to default initial state. */
763 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
765 /* Virtual IDT is empty at start-of-day. */
766 for ( i = 0; i < 256; i++ )
767 {
768 ctxt.trap_ctxt[i].vector = i;
769 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
770 }
771 smp_trap_init(ctxt.trap_ctxt);
773 /* No LDT. */
774 ctxt.ldt_ents = 0;
776 {
777 unsigned long va;
778 int f;
780 for (va = cpu_gdt_descr[cpu].address, f = 0;
781 va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
782 va += PAGE_SIZE, f++) {
783 ctxt.gdt_frames[f] = virt_to_mfn(va);
784 make_page_readonly((void *)va);
785 }
786 ctxt.gdt_ents = GDT_ENTRIES;
787 }
789 /* Ring 1 stack is the initial stack. */
790 ctxt.kernel_ss = __KERNEL_DS;
791 ctxt.kernel_sp = idle->thread.rsp;
793 /* Callback handlers. */
794 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
795 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
796 ctxt.syscall_callback_eip = (unsigned long)system_call;
798 ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
800 boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
801 if (boot_error)
802 printk("boot error: %ld\n", boot_error);
804 if (!boot_error) {
805 /*
806 * allow APs to start initializing.
807 */
808 Dprintk("Before Callout %d.\n", cpu);
809 cpu_set(cpu, cpu_callout_map);
810 Dprintk("After Callout %d.\n", cpu);
812 /*
813 * Wait 5s total for a response
814 */
815 for (timeout = 0; timeout < 50000; timeout++) {
816 if (cpu_isset(cpu, cpu_callin_map))
817 break; /* It has booted */
818 udelay(100);
819 }
821 if (cpu_isset(cpu, cpu_callin_map)) {
822 /* number CPUs logically, starting from 1 (BSP is 0) */
823 Dprintk("CPU has booted.\n");
824 } else {
825 boot_error = 1;
826 }
827 }
828 x86_cpu_to_apicid[cpu] = apicid;
829 #else
830 Dprintk("Setting warm reset code and vector.\n");
832 CMOS_WRITE(0xa, 0xf);
833 local_flush_tlb();
834 Dprintk("1.\n");
835 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
836 Dprintk("2.\n");
837 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
838 Dprintk("3.\n");
840 /*
841 * Be paranoid about clearing APIC errors.
842 */
843 if (APIC_INTEGRATED(apic_version[apicid])) {
844 apic_read_around(APIC_SPIV);
845 apic_write(APIC_ESR, 0);
846 apic_read(APIC_ESR);
847 }
849 /*
850 * Status is now clean
851 */
852 boot_error = 0;
854 /*
855 * Starting actual IPI sequence...
856 */
857 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
859 if (!boot_error) {
860 /*
861 * allow APs to start initializing.
862 */
863 Dprintk("Before Callout %d.\n", cpu);
864 cpu_set(cpu, cpu_callout_map);
865 Dprintk("After Callout %d.\n", cpu);
867 /*
868 * Wait 5s total for a response
869 */
870 for (timeout = 0; timeout < 50000; timeout++) {
871 if (cpu_isset(cpu, cpu_callin_map))
872 break; /* It has booted */
873 udelay(100);
874 }
876 if (cpu_isset(cpu, cpu_callin_map)) {
877 /* number CPUs logically, starting from 1 (BSP is 0) */
878 Dprintk("CPU has booted.\n");
879 } else {
880 boot_error = 1;
881 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
882 == 0xA5)
883 /* trampoline started but...? */
884 printk("Stuck ??\n");
885 else
886 /* trampoline code not run */
887 printk("Not responding.\n");
888 #if APIC_DEBUG
889 inquire_remote_apic(apicid);
890 #endif
891 }
892 }
893 #endif
894 if (boot_error) {
895 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
896 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
897 cpu_clear(cpu, cpu_present_map);
898 cpu_clear(cpu, cpu_possible_map);
899 x86_cpu_to_apicid[cpu] = BAD_APICID;
900 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
901 return -EIO;
902 }
904 return 0;
905 }
907 cycles_t cacheflush_time;
908 unsigned long cache_decay_ticks;
910 /*
911 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
912 * on SMT systems efficiently.
913 */
914 static __cpuinit void detect_siblings(void)
915 {
916 int cpu;
918 for (cpu = 0; cpu < NR_CPUS; cpu++) {
919 cpus_clear(cpu_sibling_map[cpu]);
920 cpus_clear(cpu_core_map[cpu]);
921 }
923 for_each_online_cpu (cpu) {
924 struct cpuinfo_x86 *c = cpu_data + cpu;
925 int siblings = 0;
926 int i;
927 if (smp_num_siblings > 1) {
928 for_each_online_cpu (i) {
929 if (cpu_core_id[cpu] == cpu_core_id[i]) {
930 siblings++;
931 cpu_set(i, cpu_sibling_map[cpu]);
932 }
933 }
934 } else {
935 siblings++;
936 cpu_set(cpu, cpu_sibling_map[cpu]);
937 }
939 if (siblings != smp_num_siblings) {
940 printk(KERN_WARNING
941 "WARNING: %d siblings found for CPU%d, should be %d\n",
942 siblings, cpu, smp_num_siblings);
943 smp_num_siblings = siblings;
944 }
945 if (c->x86_num_cores > 1) {
946 for_each_online_cpu(i) {
947 if (phys_proc_id[cpu] == phys_proc_id[i])
948 cpu_set(i, cpu_core_map[cpu]);
949 }
950 } else
951 cpu_core_map[cpu] = cpu_sibling_map[cpu];
952 }
953 }
955 #ifndef CONFIG_XEN
956 /*
957 * Cleanup possible dangling ends...
958 */
959 static __cpuinit void smp_cleanup_boot(void)
960 {
961 /*
962 * Paranoid: Set warm reset code and vector here back
963 * to default values.
964 */
965 CMOS_WRITE(0, 0xf);
967 /*
968 * Reset trampoline flag
969 */
970 *((volatile int *) phys_to_virt(0x467)) = 0;
972 #ifndef CONFIG_HOTPLUG_CPU
973 /*
974 * Free pages reserved for SMP bootup.
975 * When you add hotplug CPU support later, remove this.
976 * Note there is more work to be done for later CPU bootup.
977 */
979 free_page((unsigned long) __va(PAGE_SIZE));
980 free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
981 #endif
982 }
983 #endif
985 /*
986 * Fall back to non SMP mode after errors.
987 *
988 * RED-PEN audit/test this more. I bet there is more state messed up here.
989 */
990 static __cpuinit void disable_smp(void)
991 {
992 cpu_present_map = cpumask_of_cpu(0);
993 cpu_possible_map = cpumask_of_cpu(0);
994 #ifndef CONFIG_XEN
995 if (smp_found_config)
996 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
997 else
998 phys_cpu_present_map = physid_mask_of_physid(0);
999 #endif
1000 cpu_set(0, cpu_sibling_map[0]);
1001 cpu_set(0, cpu_core_map[0]);
1002 }
1004 /*
1005 * Handle user cpus=... parameter.
1006 */
1007 static __cpuinit void enforce_max_cpus(unsigned max_cpus)
1008 {
1009 int i, k;
1010 k = 0;
1011 for (i = 0; i < NR_CPUS; i++) {
1012 if (!cpu_possible(i))
1013 continue;
1014 if (++k > max_cpus) {
1015 cpu_clear(i, cpu_possible_map);
1016 cpu_clear(i, cpu_present_map);
1017 }
1018 }
1019 }
1021 /*
1022 * Various sanity checks.
1023 */
1024 static int __cpuinit smp_sanity_check(unsigned max_cpus)
1025 {
1026 #ifndef CONFIG_XEN
1027 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1028 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1029 hard_smp_processor_id());
1030 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1031 }
1033 /*
1034 * If we couldn't find an SMP configuration at boot time,
1035 * get out of here now!
1036 */
1037 if (!smp_found_config) {
1038 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1039 disable_smp();
1040 if (APIC_init_uniprocessor())
1041 printk(KERN_NOTICE "Local APIC not detected."
1042 " Using dummy APIC emulation.\n");
1043 return -1;
1044 }
1046 /*
1047 * Should not be necessary because the MP table should list the boot
1048 * CPU too, but we do it for the sake of robustness anyway.
1049 */
1050 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
1051 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
1052 boot_cpu_id);
1053 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1054 }
1056 /*
1057 * If we couldn't find a local APIC, then get out of here now!
1058 */
1059 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
1060 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1061 boot_cpu_id);
1062 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1063 nr_ioapics = 0;
1064 return -1;
1065 }
1066 #endif
1068 /*
1069 * If SMP should be disabled, then really disable it!
1070 */
1071 if (!max_cpus) {
1072 #ifdef CONFIG_XEN
1073 HYPERVISOR_shared_info->n_vcpu = 1;
1074 #endif
1075 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1076 #ifndef CONFIG_XEN
1077 nr_ioapics = 0;
1078 #endif
1079 return -1;
1080 }
1082 return 0;
1083 }
1085 /*
1086 * Prepare for SMP bootup. The MP table or ACPI has been read
1087 * earlier. Just do some sanity checking here and enable APIC mode.
1088 */
1089 void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
1090 {
1091 int i;
1093 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1094 #else
1095 nmi_watchdog_default();
1096 #endif
1097 current_cpu_data = boot_cpu_data;
1098 current_thread_info()->cpu = 0; /* needed? */
1100 enforce_max_cpus(max_cpus);
1102 /*
1103 * Fill in cpu_present_mask
1104 */
1105 for (i = 0; i < NR_CPUS; i++) {
1106 #ifndef CONFIG_XEN
1107 int apicid = cpu_present_to_apicid(i);
1108 if (physid_isset(apicid, phys_cpu_present_map)) {
1109 #else
1110 if (i < HYPERVISOR_shared_info->n_vcpu) {
1111 #endif
1112 cpu_set(i, cpu_present_map);
1113 /* possible map would be different if we supported real
1114 CPU hotplug. */
1115 cpu_set(i, cpu_possible_map);
1116 }
1117 }
1119 if (smp_sanity_check(max_cpus) < 0) {
1120 printk(KERN_INFO "SMP disabled\n");
1121 disable_smp();
1122 return;
1123 }
1125 #ifdef CONFIG_XEN
1126 smp_intr_init();
1127 #else
1129 /*
1130 * Switch from PIC to APIC mode.
1131 */
1132 connect_bsp_APIC();
1133 setup_local_APIC();
1135 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1136 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1137 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1138 /* Or can we switch back to PIC here? */
1139 }
1140 #endif
1142 /*
1143 * Now start the IO-APICs
1144 */
1145 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1146 #else
1147 if (!skip_ioapic_setup && nr_ioapics)
1148 setup_IO_APIC();
1149 else
1150 nr_ioapics = 0;
1151 #endif
1153 /*
1154 * Set up local APIC timer on boot CPU.
1155 */
1157 #ifndef CONFIG_XEN
1158 setup_boot_APIC_clock();
1159 #endif
1160 }
1162 /*
1163 * Early setup to make printk work.
1164 */
1165 void __init smp_prepare_boot_cpu(void)
1166 {
1167 int me = smp_processor_id();
1168 cpu_set(me, cpu_online_map);
1169 cpu_set(me, cpu_callout_map);
1170 }
1172 /*
1173 * Entry point to boot a CPU.
1175 * This is all __cpuinit, not __devinit for now because we don't support
1176 * CPU hotplug (yet).
1177 */
1178 int __cpuinit __cpu_up(unsigned int cpu)
1179 {
1180 int err;
1181 #ifndef CONFIG_XEN
1182 int apicid = cpu_present_to_apicid(cpu);
1183 #else
1184 int apicid = cpu;
1185 #endif
1187 WARN_ON(irqs_disabled());
1189 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1191 #ifndef CONFIG_XEN
1192 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1193 !physid_isset(apicid, phys_cpu_present_map)) {
1194 printk("__cpu_up: bad cpu %d\n", cpu);
1195 return -EINVAL;
1196 }
1197 #endif
1199 /* Boot it! */
1200 err = do_boot_cpu(cpu, apicid);
1201 if (err < 0) {
1202 Dprintk("do_boot_cpu failed %d\n", err);
1203 return err;
1204 }
1206 /* Unleash the CPU! */
1207 Dprintk("waiting for cpu %d\n", cpu);
1209 while (!cpu_isset(cpu, cpu_online_map))
1210 cpu_relax();
1211 return 0;
1212 }
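For context, __cpu_up() is driven by the architecture-independent boot code rather than by anything in this file. A simplified, hypothetical sketch of that caller (the real 2.6-era smp_init()/cpu_up() path differs in details and error handling):

/* Hypothetical sketch of the generic SMP bring-up loop; the name
 * example_smp_init() is invented and error handling is omitted. */
static void __init example_smp_init(unsigned int max_cpus)
{
	unsigned int cpu;

	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= max_cpus)
			break;
		if (!cpu_online(cpu))
			cpu_up(cpu);		/* ends up calling __cpu_up(cpu) */
	}
	smp_cpus_done(max_cpus);
}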
1214 /*
1215 * Finish the SMP boot.
1216 */
1217 void __cpuinit smp_cpus_done(unsigned int max_cpus)
1218 {
1219 #ifndef CONFIG_XEN
1220 zap_low_mappings();
1221 smp_cleanup_boot();
1223 #ifdef CONFIG_X86_IO_APIC
1224 setup_ioapic_dest();
1225 #endif
1226 #endif
1228 detect_siblings();
1229 #ifndef CONFIG_XEN
1230 time_init_gtod();
1232 check_nmi_watchdog();
1233 #endif
1234 }
1236 #ifdef CONFIG_XEN
1237 extern int bind_ipi_to_irq(int ipi);
1238 extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
1239 extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
1241 static DEFINE_PER_CPU(int, resched_irq);
1242 static DEFINE_PER_CPU(int, callfunc_irq);
1243 static char resched_name[NR_CPUS][15];
1244 static char callfunc_name[NR_CPUS][15];
1246 void smp_intr_init(void)
1247 {
1248 int cpu = smp_processor_id();
1250 per_cpu(resched_irq, cpu) =
1251 bind_ipi_to_irq(RESCHEDULE_VECTOR);
1252 sprintf(resched_name[cpu], "resched%d", cpu);
1253 BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
1254 SA_INTERRUPT, resched_name[cpu], NULL));
1256 per_cpu(callfunc_irq, cpu) =
1257 bind_ipi_to_irq(CALL_FUNCTION_VECTOR);
1258 sprintf(callfunc_name[cpu], "callfunc%d", cpu);
1259 BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
1260 smp_call_function_interrupt,
1261 SA_INTERRUPT, callfunc_name[cpu], NULL));
1262 }
1264 static void smp_intr_exit(void)
1265 {
1266 int cpu = smp_processor_id();
1268 free_irq(per_cpu(resched_irq, cpu), NULL);
1269 unbind_ipi_from_irq(RESCHEDULE_VECTOR);
1271 free_irq(per_cpu(callfunc_irq, cpu), NULL);
1272 unbind_ipi_from_irq(CALL_FUNCTION_VECTOR);
1273 }
1275 extern void local_setup_timer_irq(void);
1276 extern void local_teardown_timer_irq(void);
1278 void smp_suspend(void)
1279 {
1280 /* XXX todo: take down time and ipi's on all cpus */
1281 local_teardown_timer_irq();
1282 smp_intr_exit();
1283 }
1285 void smp_resume(void)
1286 {
1287 /* XXX todo: restore time and ipi's on all cpus */
1288 smp_intr_init();
1289 local_setup_timer_irq();
1290 }
1292 void _restore_vcpu(void)
1293 {
1294 /* XXX need to write this */
1295 }
1297 #endif