ia64/xen-unstable

view linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @ 7136:1abbb6448b4a

Remove unused debug code.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Sep 29 18:15:49 2005 +0100 (2005-09-29)
parents 06d84bf87159
children 61b3b357d827 bf6119a58655 a746126c3bb1
1 /*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 */
40 #include <linux/config.h>
41 #include <linux/init.h>
43 #include <linux/mm.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/smp_lock.h>
46 #include <linux/irq.h>
47 #include <linux/bootmem.h>
48 #include <linux/thread_info.h>
49 #include <linux/module.h>
50 #ifdef CONFIG_XEN
51 #include <linux/interrupt.h>
52 #endif
54 #include <linux/delay.h>
55 #include <linux/mc146818rtc.h>
56 #include <asm/mtrr.h>
57 #include <asm/pgalloc.h>
58 #include <asm/desc.h>
59 #include <asm/kdebug.h>
60 #include <asm/tlbflush.h>
61 #include <asm/proto.h>
62 #include <asm/nmi.h>
63 #ifdef CONFIG_XEN
64 #include <asm/arch_hooks.h>
66 #include <asm-xen/evtchn.h>
67 #endif
69 /* Change for real CPU hotplug. Note other files need to be fixed
70 first too. */
71 #define __cpuinit __init
72 #define __cpuinitdata __initdata
74 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
75 unsigned int maxcpus = NR_CPUS;
76 #endif
78 /* Number of siblings per CPU package */
79 int smp_num_siblings = 1;
80 /* Package ID of each logical CPU */
81 u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
82 u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
83 EXPORT_SYMBOL(phys_proc_id);
84 EXPORT_SYMBOL(cpu_core_id);
86 /* Bitmask of currently online CPUs */
87 cpumask_t cpu_online_map;
89 EXPORT_SYMBOL(cpu_online_map);
91 /*
92 * Private maps to synchronize booting between AP and BP.
93 * Probably not needed anymore, but it makes for easier debugging. -AK
94 */
95 cpumask_t cpu_callin_map;
96 cpumask_t cpu_callout_map;
98 cpumask_t cpu_possible_map;
99 EXPORT_SYMBOL(cpu_possible_map);
101 /* Per CPU bogomips and other parameters */
102 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
104 /* Set when the idlers are all forked */
105 int smp_threads_ready;
107 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
108 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
109 EXPORT_SYMBOL(cpu_core_map);
111 #ifndef CONFIG_XEN
112 /*
113 * Trampoline 80x86 program as an array.
114 */
116 extern unsigned char trampoline_data[];
117 extern unsigned char trampoline_end[];
119 /*
120 * Currently trivial. Write the real->protected mode
121 * bootstrap into the page concerned. The caller
122 * has made sure it's suitably aligned.
123 */
125 static unsigned long __cpuinit setup_trampoline(void)
126 {
127 void *tramp = __va(SMP_TRAMPOLINE_BASE);
128 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
129 return virt_to_phys(tramp);
130 }
131 #endif
133 /*
134 * The bootstrap kernel entry code has set these up. Save them for
135 * a given CPU
136 */
138 static void __cpuinit smp_store_cpu_info(int id)
139 {
140 struct cpuinfo_x86 *c = cpu_data + id;
142 *c = boot_cpu_data;
143 identify_cpu(c);
144 print_cpu_info(c);
145 }
147 #ifndef CONFIG_XEN
148 /*
149 * New Funky TSC sync algorithm borrowed from IA64.
150 * Main advantage is that it doesn't reset the TSCs fully and
151 * in general looks more robust and it works better than my earlier
152 * attempts. I believe it was written by David Mosberger. Some minor
153 * adjustments for x86-64 by me -AK
154 *
155 * Original comment reproduced below.
156 *
157 * Synchronize TSC of the current (slave) CPU with the TSC of the
158 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
159 * eliminate the possibility of unaccounted-for errors (such as
160 * getting a machine check in the middle of a calibration step). The
161 * basic idea is for the slave to ask the master what itc value it has
162 * and to read its own itc before and after the master responds. Each
163 * iteration gives us three timestamps:
164 *
165 *              slave           master
166 *
167 *    t0 ---\
168 *           ---\
169 *               --->
170 *                    tm
171 *               /---
172 *           /---
173 *    t1 <---
174 *
175 *
176 * The goal is to adjust the slave's TSC such that tm falls exactly
177 * half-way between t0 and t1. If we achieve this, the clocks are
178 * synchronized provided the interconnect between the slave and the
179 * master is symmetric. Even if the interconnect were asymmetric, we
180 * would still know that the synchronization error is smaller than the
181 * roundtrip latency (t1 - t0).
182 *
183 * When the interconnect is quiet and symmetric, this lets us
184 * synchronize the TSC to within one or two cycles. However, we can
185 * only *guarantee* that the synchronization is accurate to within a
186 * round-trip time, which is typically in the range of several hundred
187 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
188 * are usually almost perfectly synchronized, but we shouldn't assume
189 * that the accuracy is much better than half a microsecond or so.
190 *
191 * [there are other errors like the latency of RDTSC and of the
192 * WRMSR. These can also account for hundreds of cycles. So it's
193 * probably worse. It claims 153 cycles error on a dual Opteron,
194 * but I suspect the numbers are actually somewhat worse -AK]
195 */
197 #define MASTER 0
198 #define SLAVE (SMP_CACHE_BYTES/8)
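/*
 * MASTER and SLAVE index into go[] below: go[] holds unsigned longs, so
 * SMP_CACHE_BYTES/8 entries span one cache line on x86-64. Keeping the
 * two flags a full cache line apart avoids bouncing a shared line
 * between the two CPUs that spin on them.
 */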
200 /* Intentionally don't use cpu_relax() during TSC synchronization
201 because we don't want to go into funky power-save modes or cause
202 hypervisors to schedule us away. Going to sleep would likely affect
203 latency, and low latency is the primary objective here. -AK */
204 #define no_cpu_relax() barrier()
206 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
207 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
208 static int notscsync __cpuinitdata;
210 #undef DEBUG_TSC_SYNC
212 #define NUM_ROUNDS 64 /* magic value */
213 #define NUM_ITERS 5 /* likewise */
215 /* Callback on boot CPU */
216 static __cpuinit void sync_master(void *arg)
217 {
218 unsigned long flags, i;
220 if (smp_processor_id() != boot_cpu_id)
221 return;
223 go[MASTER] = 0;
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235 }
237 /*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
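/*
 * Worked example (illustrative numbers only): if the slave reads
 * t0=1000 and t1=1010 around one round trip and the master reports
 * tm=1003, the midpoint is 1005, so get_delta() returns +2, i.e. the
 * slave's TSC appears 2 cycles ahead of the master's, subject to the
 * round-trip uncertainty described above.
 */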
242 static inline long
243 get_delta(long *rt, long *master)
244 {
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269 }
271 static __cpuinit void sync_tsc(void)
272 {
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276 #if DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283 #endif
285 go[MASTER] = 1;
287 smp_call_function(sync_master, NULL, 1, 0);
289 while (go[MASTER]) /* wait for master to be ready */
290 no_cpu_relax();
292 spin_lock_irqsave(&tsc_sync_lock, flags);
293 {
294 for (i = 0; i < NUM_ROUNDS; ++i) {
295 delta = get_delta(&rt, &master_time_stamp);
296 if (delta == 0) {
297 done = 1; /* let's lock on to this... */
298 bound = rt;
299 }
301 if (!done) {
302 unsigned long t;
303 if (i > 0) {
304 adjust_latency += -delta;
305 adj = -delta + adjust_latency/4;
306 } else
307 adj = -delta;
309 rdtscll(t);
310 wrmsrl(MSR_IA32_TSC, t + adj);
311 }
312 #if DEBUG_TSC_SYNC
313 t[i].rt = rt;
314 t[i].master = master_time_stamp;
315 t[i].diff = delta;
316 t[i].lat = adjust_latency/4;
317 #endif
318 }
319 }
320 spin_unlock_irqrestore(&tsc_sync_lock, flags);
322 #if DEBUG_TSC_SYNC
323 for (i = 0; i < NUM_ROUNDS; ++i)
324 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
325 t[i].rt, t[i].master, t[i].diff, t[i].lat);
326 #endif
328 printk(KERN_INFO
329 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
330 "maxerr %lu cycles)\n",
331 smp_processor_id(), boot_cpu_id, delta, rt);
332 }
334 static void __cpuinit tsc_sync_wait(void)
335 {
336 if (notscsync || !cpu_has_tsc)
337 return;
338 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
339 boot_cpu_id);
340 sync_tsc();
341 }
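/*
 * Booting with "notscsync" on the kernel command line sets notscsync
 * via notscsync_setup() below and makes tsc_sync_wait() above return
 * without synchronizing.
 */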
343 static __init int notscsync_setup(char *s)
344 {
345 notscsync = 1;
346 return 0;
347 }
348 __setup("notscsync", notscsync_setup);
349 #endif
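/*
 * init_deasserted is set once the INIT-deassert IPI has gone out (see
 * wakeup_secondary_via_INIT()); smp_callin() on the AP spins on it
 * before touching its local APIC.
 */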
351 static atomic_t init_deasserted __cpuinitdata;
353 /*
354 * Report back to the Boot Processor.
355 * Running on AP.
356 */
357 void __cpuinit smp_callin(void)
358 {
359 int cpuid, phys_id;
360 unsigned long timeout;
362 #ifndef CONFIG_XEN
363 /*
364 * If woken up by an INIT in an 82489DX configuration
365 * we may get here before an INIT-deassert IPI reaches
366 * our local APIC. We have to wait for the IPI or we'll
367 * lock up on an APIC access.
368 */
369 while (!atomic_read(&init_deasserted))
370 cpu_relax();
372 #endif
373 /*
374 * (This works even if the APIC is not enabled.)
375 */
376 #ifndef CONFIG_XEN
377 phys_id = GET_APIC_ID(apic_read(APIC_ID));
378 #else
379 phys_id = smp_processor_id();
380 #endif
381 cpuid = smp_processor_id();
382 if (cpu_isset(cpuid, cpu_callin_map)) {
383 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
384 phys_id, cpuid);
385 }
386 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
388 /*
389 * STARTUP IPIs are fragile beasts as they might sometimes
390 * trigger some glue motherboard logic, so we maintain complete APIC
391 * bus silence for 1 second. This overestimates, by a factor of two,
392 * the time the boot CPU spends sending the up to 2 STARTUP IPIs.
393 * That should be enough.
394 */
396 /*
397 * Waiting 2s total for startup (udelay is not yet working)
398 */
399 timeout = jiffies + 2*HZ;
400 while (time_before(jiffies, timeout)) {
401 /*
402 * Has the boot CPU finished its STARTUP sequence?
403 */
404 if (cpu_isset(cpuid, cpu_callout_map))
405 break;
406 cpu_relax();
407 }
409 if (!time_before(jiffies, timeout)) {
410 panic("smp_callin: CPU%d started up but did not get a callout!\n",
411 cpuid);
412 }
414 #ifndef CONFIG_XEN
415 /*
416 * the boot CPU has finished the init stage and is spinning
417 * on callin_map until we finish. We are free to set up this
418 * CPU, first the APIC. (this is probably redundant on most
419 * boards)
420 */
422 Dprintk("CALLIN, before setup_local_APIC().\n");
423 setup_local_APIC();
424 #endif
426 /*
427 * Get our bogomips.
428 */
429 calibrate_delay();
430 Dprintk("Stack at about %p\n",&cpuid);
432 #ifndef CONFIG_XEN
433 disable_APIC_timer();
434 #endif
436 /*
437 * Save our processor parameters
438 */
439 smp_store_cpu_info(cpuid);
441 /*
442 * Allow the master to continue.
443 */
444 cpu_set(cpuid, cpu_callin_map);
445 }
447 #ifdef CONFIG_XEN
448 extern void local_setup_timer(void);
449 #endif
451 /*
452 * Setup code on secondary processor (after coming out of the trampoline)
453 */
454 void __cpuinit start_secondary(void)
455 {
456 /*
457 * Don't put anything before smp_callin(); SMP
458 * booting is so fragile that we want to limit the
459 * things done here to the bare minimum.
460 */
461 cpu_init();
462 smp_callin();
464 /* otherwise gcc will move up smp_processor_id() before cpu_init() */
465 barrier();
467 #ifndef CONFIG_XEN
468 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
469 setup_secondary_APIC_clock();
471 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
473 if (nmi_watchdog == NMI_IO_APIC) {
474 disable_8259A_irq(0);
475 enable_NMI_through_LVT0(NULL);
476 enable_8259A_irq(0);
477 }
479 enable_APIC_timer();
480 #else
481 local_setup_timer();
482 smp_intr_init();
483 local_irq_enable();
484 #endif
486 /*
487 * Allow the master to continue.
488 */
489 cpu_set(smp_processor_id(), cpu_online_map);
490 mb();
492 #ifndef CONFIG_XEN
493 /* Wait for the TSC sync so that nothing is scheduled before the TSCs
494 are synchronized. We still process interrupts, which could
495 unfortunately see an inconsistent time in that window. */
496 tsc_sync_wait();
497 #endif
499 cpu_idle();
500 }
502 extern volatile unsigned long init_rsp;
503 extern void (*initial_code)(void);
505 #ifndef CONFIG_XEN
506 #if APIC_DEBUG
507 static void inquire_remote_apic(int apicid)
508 {
509 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
510 char *names[] = { "ID", "VERSION", "SPIV" };
511 int timeout, status;
513 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
515 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
516 printk("... APIC #%d %s: ", apicid, names[i]);
518 /*
519 * Wait for idle.
520 */
521 apic_wait_icr_idle();
523 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
524 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
526 timeout = 0;
527 do {
528 udelay(100);
529 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
530 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
532 switch (status) {
533 case APIC_ICR_RR_VALID:
534 status = apic_read(APIC_RRR);
535 printk("%08x\n", status);
536 break;
537 default:
538 printk("failed\n");
539 }
540 }
541 }
542 #endif
544 /*
545 * Kick the secondary to wake up.
546 */
547 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
548 {
549 unsigned long send_status = 0, accept_status = 0;
550 int maxlvt, timeout, num_starts, j;
552 Dprintk("Asserting INIT.\n");
554 /*
555 * Turn INIT on target chip
556 */
557 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
559 /*
560 * Send IPI
561 */
562 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
563 | APIC_DM_INIT);
565 Dprintk("Waiting for send to finish...\n");
566 timeout = 0;
567 do {
568 Dprintk("+");
569 udelay(100);
570 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
571 } while (send_status && (timeout++ < 1000));
573 mdelay(10);
575 Dprintk("Deasserting INIT.\n");
577 /* Target chip */
578 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
580 /* Send IPI */
581 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
583 Dprintk("Waiting for send to finish...\n");
584 timeout = 0;
585 do {
586 Dprintk("+");
587 udelay(100);
588 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
589 } while (send_status && (timeout++ < 1000));
591 atomic_set(&init_deasserted, 1);
593 /*
594 * Should we send STARTUP IPIs ?
595 *
596 * Determine this based on the APIC version.
597 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
598 */
599 if (APIC_INTEGRATED(apic_version[phys_apicid]))
600 num_starts = 2;
601 else
602 num_starts = 0;
604 /*
605 * Run STARTUP IPI loop.
606 */
607 Dprintk("#startup loops: %d.\n", num_starts);
609 maxlvt = get_maxlvt();
611 for (j = 1; j <= num_starts; j++) {
612 Dprintk("Sending STARTUP #%d.\n",j);
613 apic_read_around(APIC_SPIV);
614 apic_write(APIC_ESR, 0);
615 apic_read(APIC_ESR);
616 Dprintk("After apic_write.\n");
618 /*
619 * STARTUP IPI
620 */
622 /* Target chip */
623 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
625 /* Boot on the stack */
626 /* Kick the second */
627 apic_write_around(APIC_ICR, APIC_DM_STARTUP
628 | (start_rip >> 12));
630 /*
631 * Give the other CPU some time to accept the IPI.
632 */
633 udelay(300);
635 Dprintk("Startup point 1.\n");
637 Dprintk("Waiting for send to finish...\n");
638 timeout = 0;
639 do {
640 Dprintk("+");
641 udelay(100);
642 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
643 } while (send_status && (timeout++ < 1000));
645 /*
646 * Give the other CPU some time to accept the IPI.
647 */
648 udelay(200);
649 /*
650 * Due to the Pentium erratum 3AP.
651 */
652 if (maxlvt > 3) {
653 apic_read_around(APIC_SPIV);
654 apic_write(APIC_ESR, 0);
655 }
656 accept_status = (apic_read(APIC_ESR) & 0xEF);
657 if (send_status || accept_status)
658 break;
659 }
660 Dprintk("After Startup.\n");
662 if (send_status)
663 printk(KERN_ERR "APIC never delivered???\n");
664 if (accept_status)
665 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
667 return (send_status | accept_status);
668 }
669 #endif
671 /*
672 * Boot one CPU.
673 */
674 static int __cpuinit do_boot_cpu(int cpu, int apicid)
675 {
676 struct task_struct *idle;
677 unsigned long boot_error;
678 int timeout;
679 unsigned long start_rip;
680 #ifdef CONFIG_XEN
681 vcpu_guest_context_t ctxt;
682 extern void startup_64_smp(void);
683 extern void hypervisor_callback(void);
684 extern void failsafe_callback(void);
685 extern void smp_trap_init(trap_info_t *);
686 int i;
687 #endif
688 /*
689 * We can't use kernel_thread since we must avoid
690 * rescheduling the child.
691 */
692 idle = fork_idle(cpu);
693 if (IS_ERR(idle)) {
694 printk("failed fork for CPU %d\n", cpu);
695 return PTR_ERR(idle);
696 }
698 cpu_pda[cpu].pcurrent = idle;
700 #ifndef CONFIG_XEN
701 start_rip = setup_trampoline();
702 #else
703 start_rip = (unsigned long)startup_64_smp;
704 #endif
706 init_rsp = idle->thread.rsp;
707 per_cpu(init_tss,cpu).rsp0 = init_rsp;
708 initial_code = start_secondary;
709 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
711 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
712 start_rip, init_rsp);
714 /*
715 * This grunge runs the startup process for
716 * the targeted processor.
717 */
719 atomic_set(&init_deasserted, 0);
721 #ifdef CONFIG_XEN
722 cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL|__GFP_ZERO);
723 BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
724 cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
725 memcpy((void *)cpu_gdt_descr[cpu].address,
726 (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
728 memset(&ctxt, 0, sizeof(ctxt));
730 ctxt.flags = VGCF_IN_KERNEL;
731 ctxt.user_regs.ds = __USER_DS;
732 ctxt.user_regs.es = __USER_DS;
733 ctxt.user_regs.fs = 0;
734 ctxt.user_regs.gs = 0;
735 ctxt.user_regs.ss = __KERNEL_DS|0x3;
736 ctxt.user_regs.cs = __KERNEL_CS|0x3;
737 ctxt.user_regs.rip = start_rip;
738 ctxt.user_regs.rsp = idle->thread.rsp;
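/*
 * EFLAGS bits 12-13 form the IOPL field, so 0x3000 selects IOPL 3;
 * X86_EFLAGS_IF (bit 9) is the interrupt-enable flag.
 */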
739 #define X86_EFLAGS_IOPL_RING3 0x3000
740 ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3;
742 /* FPU is set up to default initial state. */
743 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
745 /* Virtual IDT is empty at start-of-day. */
746 for ( i = 0; i < 256; i++ )
747 {
748 ctxt.trap_ctxt[i].vector = i;
749 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
750 }
751 smp_trap_init(ctxt.trap_ctxt);
753 /* No LDT. */
754 ctxt.ldt_ents = 0;
756 {
757 unsigned long va;
758 int f;
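/*
 * Hand the per-CPU GDT to Xen frame by frame. The hypervisor
 * validates and takes a reference on these frames, so the guest is
 * expected to drop write access to them first (make_page_readonly()).
 */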
760 for (va = cpu_gdt_descr[cpu].address, f = 0;
761 va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
762 va += PAGE_SIZE, f++) {
763 ctxt.gdt_frames[f] = virt_to_mfn(va);
764 make_page_readonly((void *)va);
765 }
766 ctxt.gdt_ents = GDT_ENTRIES;
767 }
769 /* Ring 1 stack is the initial stack. */
770 ctxt.kernel_ss = __KERNEL_DS;
771 ctxt.kernel_sp = idle->thread.rsp;
773 /* Callback handlers. */
774 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
775 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
776 ctxt.syscall_callback_eip = (unsigned long)system_call;
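/* ctrlreg[3] is the new vcpu's CR3, given as a machine address:
   the machine frame of the kernel's top-level page table
   (init_level4_pgt) shifted by PAGE_SHIFT. */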
778 ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
780 boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
781 if (boot_error)
782 printk("boot error: %ld\n", boot_error);
784 if (!boot_error) {
785 /*
786 * allow APs to start initializing.
787 */
788 Dprintk("Before Callout %d.\n", cpu);
789 cpu_set(cpu, cpu_callout_map);
790 Dprintk("After Callout %d.\n", cpu);
792 /*
793 * Wait 5s total for a response
794 */
795 for (timeout = 0; timeout < 50000; timeout++) {
796 if (cpu_isset(cpu, cpu_callin_map))
797 break; /* It has booted */
798 udelay(100);
799 }
801 if (cpu_isset(cpu, cpu_callin_map)) {
802 /* number CPUs logically, starting from 1 (BSP is 0) */
803 Dprintk("CPU has booted.\n");
804 } else {
805 boot_error= 1;
806 }
807 }
808 x86_cpu_to_apicid[cpu] = apicid;
809 #else
810 Dprintk("Setting warm reset code and vector.\n");
812 CMOS_WRITE(0xa, 0xf);
813 local_flush_tlb();
814 Dprintk("1.\n");
815 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
816 Dprintk("2.\n");
817 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
818 Dprintk("3.\n");
820 /*
821 * Be paranoid about clearing APIC errors.
822 */
823 if (APIC_INTEGRATED(apic_version[apicid])) {
824 apic_read_around(APIC_SPIV);
825 apic_write(APIC_ESR, 0);
826 apic_read(APIC_ESR);
827 }
829 /*
830 * Status is now clean
831 */
832 boot_error = 0;
834 /*
835 * Starting actual IPI sequence...
836 */
837 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
839 if (!boot_error) {
840 /*
841 * allow APs to start initializing.
842 */
843 Dprintk("Before Callout %d.\n", cpu);
844 cpu_set(cpu, cpu_callout_map);
845 Dprintk("After Callout %d.\n", cpu);
847 /*
848 * Wait 5s total for a response
849 */
850 for (timeout = 0; timeout < 50000; timeout++) {
851 if (cpu_isset(cpu, cpu_callin_map))
852 break; /* It has booted */
853 udelay(100);
854 }
856 if (cpu_isset(cpu, cpu_callin_map)) {
857 /* number CPUs logically, starting from 1 (BSP is 0) */
858 Dprintk("CPU has booted.\n");
859 } else {
860 boot_error = 1;
861 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
862 == 0xA5)
863 /* trampoline started but...? */
864 printk("Stuck ??\n");
865 else
866 /* trampoline code not run */
867 printk("Not responding.\n");
868 #if APIC_DEBUG
869 inquire_remote_apic(apicid);
870 #endif
871 }
872 }
873 #endif
874 if (boot_error) {
875 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
876 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
877 cpu_clear(cpu, cpu_present_map);
878 cpu_clear(cpu, cpu_possible_map);
879 x86_cpu_to_apicid[cpu] = BAD_APICID;
880 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
881 return -EIO;
882 }
884 return 0;
885 }
887 cycles_t cacheflush_time;
888 unsigned long cache_decay_ticks;
890 /*
891 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
892 * on SMT systems efficiently.
893 */
894 static __cpuinit void detect_siblings(void)
895 {
896 int cpu;
898 for (cpu = 0; cpu < NR_CPUS; cpu++) {
899 cpus_clear(cpu_sibling_map[cpu]);
900 cpus_clear(cpu_core_map[cpu]);
901 }
903 for_each_online_cpu (cpu) {
904 struct cpuinfo_x86 *c = cpu_data + cpu;
905 int siblings = 0;
906 int i;
907 if (smp_num_siblings > 1) {
908 for_each_online_cpu (i) {
909 if (cpu_core_id[cpu] == cpu_core_id[i]) {
910 siblings++;
911 cpu_set(i, cpu_sibling_map[cpu]);
912 }
913 }
914 } else {
915 siblings++;
916 cpu_set(cpu, cpu_sibling_map[cpu]);
917 }
919 if (siblings != smp_num_siblings) {
920 printk(KERN_WARNING
921 "WARNING: %d siblings found for CPU%d, should be %d\n",
922 siblings, cpu, smp_num_siblings);
923 smp_num_siblings = siblings;
924 }
925 if (c->x86_num_cores > 1) {
926 for_each_online_cpu(i) {
927 if (phys_proc_id[cpu] == phys_proc_id[i])
928 cpu_set(i, cpu_core_map[cpu]);
929 }
930 } else
931 cpu_core_map[cpu] = cpu_sibling_map[cpu];
932 }
933 }
935 #ifndef CONFIG_XEN
936 /*
937 * Cleanup possible dangling ends...
938 */
939 static __cpuinit void smp_cleanup_boot(void)
940 {
941 /*
942 * Paranoid: Set warm reset code and vector here back
943 * to default values.
944 */
945 CMOS_WRITE(0, 0xf);
947 /*
948 * Reset trampoline flag
949 */
950 *((volatile int *) phys_to_virt(0x467)) = 0;
952 #ifndef CONFIG_HOTPLUG_CPU
953 /*
954 * Free pages reserved for SMP bootup.
955 * When you add hotplug CPU support later, remove this.
956 * Note there is more work to be done for later CPU bootup.
957 */
959 free_page((unsigned long) __va(PAGE_SIZE));
960 free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
961 #endif
962 }
963 #endif
965 /*
966 * Fall back to non-SMP mode after errors.
967 *
968 * RED-PEN audit/test this more. I bet there is more state messed up here.
969 */
970 static __cpuinit void disable_smp(void)
971 {
972 cpu_present_map = cpumask_of_cpu(0);
973 cpu_possible_map = cpumask_of_cpu(0);
974 #ifndef CONFIG_XEN
975 if (smp_found_config)
976 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
977 else
978 phys_cpu_present_map = physid_mask_of_physid(0);
979 #endif
980 cpu_set(0, cpu_sibling_map[0]);
981 cpu_set(0, cpu_core_map[0]);
982 }
984 /*
985 * Handle user cpus=... parameter.
986 */
987 static __cpuinit void enforce_max_cpus(unsigned max_cpus)
988 {
989 int i, k;
990 k = 0;
991 for (i = 0; i < NR_CPUS; i++) {
992 if (!cpu_possible(i))
993 continue;
994 if (++k > max_cpus) {
995 cpu_clear(i, cpu_possible_map);
996 cpu_clear(i, cpu_present_map);
997 }
998 }
999 }
1001 /*
1002 * Various sanity checks.
1003 */
1004 static int __cpuinit smp_sanity_check(unsigned max_cpus)
1005 {
1006 #ifndef CONFIG_XEN
1007 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1008 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1009 hard_smp_processor_id());
1010 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1011 }
1013 /*
1014 * If we couldn't find an SMP configuration at boot time,
1015 * get out of here now!
1016 */
1017 if (!smp_found_config) {
1018 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1019 disable_smp();
1020 if (APIC_init_uniprocessor())
1021 printk(KERN_NOTICE "Local APIC not detected."
1022 " Using dummy APIC emulation.\n");
1023 return -1;
1024 }
1026 /*
1027 * Should not be necessary because the MP table should list the boot
1028 * CPU too, but we do it for the sake of robustness anyway.
1029 */
1030 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
1031 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
1032 boot_cpu_id);
1033 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1034 }
1036 /*
1037 * If we couldn't find a local APIC, then get out of here now!
1038 */
1039 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
1040 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1041 boot_cpu_id);
1042 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1043 nr_ioapics = 0;
1044 return -1;
1045 }
1046 #endif
1048 /*
1049 * If SMP should be disabled, then really disable it!
1050 */
1051 if (!max_cpus) {
1052 #ifdef CONFIG_XEN
1053 HYPERVISOR_shared_info->n_vcpu = 1;
1054 #endif
1055 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1056 #ifndef CONFIG_XEN
1057 nr_ioapics = 0;
1058 #endif
1059 return -1;
1060 }
1062 return 0;
1063 }
1065 /*
1066 * Prepare for SMP bootup. The MP table or ACPI has been read
1067 * earlier. Just do some sanity checking here and enable APIC mode.
1068 */
1069 void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
1070 {
1071 int i;
1073 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1074 #else
1075 nmi_watchdog_default();
1076 #endif
1077 current_cpu_data = boot_cpu_data;
1078 current_thread_info()->cpu = 0; /* needed? */
1080 enforce_max_cpus(max_cpus);
1082 /*
1083 * Fill in cpu_present_mask
1084 */
1085 for (i = 0; i < NR_CPUS; i++) {
1086 #ifndef CONFIG_XEN
1087 int apicid = cpu_present_to_apicid(i);
1088 if (physid_isset(apicid, phys_cpu_present_map)) {
1089 #else
1090 if (i < HYPERVISOR_shared_info->n_vcpu) {
1091 #endif
1092 cpu_set(i, cpu_present_map);
1093 /* possible map would be different if we supported real
1094 CPU hotplug. */
1095 cpu_set(i, cpu_possible_map);
1096 }
1097 }
1099 if (smp_sanity_check(max_cpus) < 0) {
1100 printk(KERN_INFO "SMP disabled\n");
1101 disable_smp();
1102 return;
1103 }
1105 #ifdef CONFIG_XEN
1106 smp_intr_init();
1107 #else
1109 /*
1110 * Switch from PIC to APIC mode.
1111 */
1112 connect_bsp_APIC();
1113 setup_local_APIC();
1115 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1116 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1117 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1118 /* Or can we switch back to PIC here? */
1119 }
1120 #endif
1122 /*
1123 * Now start the IO-APICs
1124 */
1125 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1126 #else
1127 if (!skip_ioapic_setup && nr_ioapics)
1128 setup_IO_APIC();
1129 else
1130 nr_ioapics = 0;
1131 #endif
1133 /*
1134 * Set up local APIC timer on boot CPU.
1135 */
1137 #ifndef CONFIG_XEN
1138 setup_boot_APIC_clock();
1139 #endif
1140 }
1142 /*
1143 * Early setup to make printk work.
1144 */
1145 void __init smp_prepare_boot_cpu(void)
1146 {
1147 int me = smp_processor_id();
1148 cpu_set(me, cpu_online_map);
1149 cpu_set(me, cpu_callout_map);
1150 }
1152 /*
1153 * Entry point to boot a CPU.
1155 * This is all __cpuinit, not __devinit for now because we don't support
1156 * CPU hotplug (yet).
1157 */
1158 int __cpuinit __cpu_up(unsigned int cpu)
1159 {
1160 int err;
1161 #ifndef CONFIG_XEN
1162 int apicid = cpu_present_to_apicid(cpu);
1163 #else
1164 int apicid = cpu;
1165 #endif
1167 WARN_ON(irqs_disabled());
1169 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1171 #ifndef CONFIG_XEN
1172 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1173 !physid_isset(apicid, phys_cpu_present_map)) {
1174 printk("__cpu_up: bad cpu %d\n", cpu);
1175 return -EINVAL;
1176 }
1177 #endif
1179 /* Boot it! */
1180 err = do_boot_cpu(cpu, apicid);
1181 if (err < 0) {
1182 Dprintk("do_boot_cpu failed %d\n", err);
1183 return err;
1184 }
1186 /* Unleash the CPU! */
1187 Dprintk("waiting for cpu %d\n", cpu);
1189 while (!cpu_isset(cpu, cpu_online_map))
1190 cpu_relax();
1191 return 0;
1192 }
1194 /*
1195 * Finish the SMP boot.
1196 */
1197 void __cpuinit smp_cpus_done(unsigned int max_cpus)
1198 {
1199 #ifndef CONFIG_XEN
1200 zap_low_mappings();
1201 smp_cleanup_boot();
1203 #ifdef CONFIG_X86_IO_APIC
1204 setup_ioapic_dest();
1205 #endif
1206 #endif
1208 detect_siblings();
1209 #ifndef CONFIG_XEN
1210 time_init_gtod();
1212 check_nmi_watchdog();
1213 #endif
1214 }
1216 #ifdef CONFIG_XEN
1217 extern int bind_ipi_to_irq(int ipi);
1218 extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
1219 extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
1221 static DEFINE_PER_CPU(int, resched_irq);
1222 static DEFINE_PER_CPU(int, callfunc_irq);
1223 static char resched_name[NR_CPUS][15];
1224 static char callfunc_name[NR_CPUS][15];
1226 void smp_intr_init(void)
1227 {
1228 int cpu = smp_processor_id();
1230 per_cpu(resched_irq, cpu) =
1231 bind_ipi_to_irq(RESCHEDULE_VECTOR);
1232 sprintf(resched_name[cpu], "resched%d", cpu);
1233 BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
1234 SA_INTERRUPT, resched_name[cpu], NULL));
1236 per_cpu(callfunc_irq, cpu) =
1237 bind_ipi_to_irq(CALL_FUNCTION_VECTOR);
1238 sprintf(callfunc_name[cpu], "callfunc%d", cpu);
1239 BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
1240 smp_call_function_interrupt,
1241 SA_INTERRUPT, callfunc_name[cpu], NULL));
1242 }
1244 static void smp_intr_exit(void)
1245 {
1246 int cpu = smp_processor_id();
1248 free_irq(per_cpu(resched_irq, cpu), NULL);
1249 unbind_ipi_from_irq(RESCHEDULE_VECTOR);
1251 free_irq(per_cpu(callfunc_irq, cpu), NULL);
1252 unbind_ipi_from_irq(CALL_FUNCTION_VECTOR);
1253 }
1255 extern void local_setup_timer_irq(void);
1256 extern void local_teardown_timer_irq(void);
1258 void smp_suspend(void)
1259 {
1260 local_teardown_timer_irq();
1261 smp_intr_exit();
1262 }
1264 void smp_resume(void)
1265 {
1266 smp_intr_init();
1267 local_setup_timer_irq();
1268 }
1270 void save_vcpu_context(int vcpu, vcpu_guest_context_t *ctxt)
1271 {
1272 }
1274 int restore_vcpu_context(int vcpu, vcpu_guest_context_t *ctxt)
1275 {
1276 return 0;
1277 }
1279 #endif