ia64/xen-unstable

view linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @ 7385:56752fea020d

Fixed 64 bit Linux build.

Fixes from Ian Pratt: the extern declaration of bind_ipi was out of sync with
the definition of the function.
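
A hedged illustration of that kind of mismatch (the signatures below are hypothetical,
not the actual Xen prototypes): the definition gains a parameter while a stale extern in
another file keeps the old one, so callers compile against the wrong prototype.

    /* evtchn.c -- hypothetical definition, now taking the target CPU */
    int bind_ipi_to_irq(int ipi, int vcpu)
    {
        return 0; /* body omitted in this sketch */
    }

    /* smpboot.c -- hypothetical stale declaration, out of sync with the above */
    extern int bind_ipi_to_irq(int ipi);

Keeping a single declaration in a shared header lets the compiler flag this drift at build time.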

Signed-off-by: James Bulpin <james@xensource.com>
author jrb44@plym.cl.cam.ac.uk
date Thu Oct 13 20:16:02 2005 +0100 (2005-10-13)
parents 5a97ee0633e8
children 70aa62954e91
line source
1 /*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 */
40 #include <linux/config.h>
41 #include <linux/init.h>
43 #include <linux/mm.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/smp_lock.h>
46 #include <linux/irq.h>
47 #include <linux/bootmem.h>
48 #include <linux/thread_info.h>
49 #include <linux/module.h>
50 #ifdef CONFIG_XEN
51 #include <linux/interrupt.h>
52 #endif
54 #include <linux/delay.h>
55 #include <linux/mc146818rtc.h>
56 #include <asm/mtrr.h>
57 #include <asm/pgalloc.h>
58 #include <asm/desc.h>
59 #include <asm/kdebug.h>
60 #include <asm/tlbflush.h>
61 #include <asm/proto.h>
62 #include <asm/nmi.h>
63 #ifdef CONFIG_XEN
64 #include <asm/arch_hooks.h>
65 #include <asm-xen/evtchn.h>
66 #include <asm-xen/xen-public/vcpu.h>
67 #endif
69 /* Change for real CPU hotplug. Note other files need to be fixed
70 first too. */
71 #define __cpuinit __init
72 #define __cpuinitdata __initdata
74 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
75 unsigned int maxcpus = NR_CPUS;
76 #endif
78 /* Number of siblings per CPU package */
79 int smp_num_siblings = 1;
80 /* Package ID of each logical CPU */
81 u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
82 u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
83 EXPORT_SYMBOL(phys_proc_id);
84 EXPORT_SYMBOL(cpu_core_id);
86 /* Bitmask of currently online CPUs */
87 cpumask_t cpu_online_map;
89 EXPORT_SYMBOL(cpu_online_map);
91 /*
92 * Private maps to synchronize booting between AP and BP.
93 * Probably not needed anymore, but it makes for easier debugging. -AK
94 */
95 cpumask_t cpu_callin_map;
96 cpumask_t cpu_callout_map;
98 cpumask_t cpu_possible_map;
99 EXPORT_SYMBOL(cpu_possible_map);
101 /* Per CPU bogomips and other parameters */
102 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
104 /* Set when the idlers are all forked */
105 int smp_threads_ready;
107 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
108 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
109 EXPORT_SYMBOL(cpu_core_map);
111 #ifndef CONFIG_XEN
112 /*
113 * Trampoline 80x86 program as an array.
114 */
116 extern unsigned char trampoline_data[];
117 extern unsigned char trampoline_end[];
119 /*
120 * Currently trivial. Write the real->protected mode
121 * bootstrap into the page concerned. The caller
122 * has made sure it's suitably aligned.
123 */
125 static unsigned long __cpuinit setup_trampoline(void)
126 {
127 void *tramp = __va(SMP_TRAMPOLINE_BASE);
128 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
129 return virt_to_phys(tramp);
130 }
131 #endif
133 /*
134 * The bootstrap kernel entry code has set these up. Save them for
135 * a given CPU
136 */
138 static void __cpuinit smp_store_cpu_info(int id)
139 {
140 struct cpuinfo_x86 *c = cpu_data + id;
142 *c = boot_cpu_data;
143 identify_cpu(c);
144 print_cpu_info(c);
145 }
147 #ifndef CONFIG_XEN
148 /*
149 * New Funky TSC sync algorithm borrowed from IA64.
150 * Main advantage is that it doesn't reset the TSCs fully and
151 * in general looks more robust and it works better than my earlier
152 * attempts. I believe it was written by David Mosberger. Some minor
153 * adjustments for x86-64 by me -AK
154 *
155 * Original comment reproduced below.
156 *
157 * Synchronize TSC of the current (slave) CPU with the TSC of the
158 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
159 * eliminate the possibility of unaccounted-for errors (such as
160 * getting a machine check in the middle of a calibration step). The
161 * basic idea is for the slave to ask the master what itc value it has
162 * and to read its own itc before and after the master responds. Each
163 * iteration gives us three timestamps:
164 *
165 * slave master
166 *
167 * t0 ---\
168 * ---\
169 * --->
170 * tm
171 * /---
172 * /---
173 * t1 <---
174 *
175 *
176 * The goal is to adjust the slave's TSC such that tm falls exactly
177 * half-way between t0 and t1. If we achieve this, the clocks are
178 * synchronized provided the interconnect between the slave and the
179 * master is symmetric. Even if the interconnect were asymmetric, we
180 * would still know that the synchronization error is smaller than the
181 * roundtrip latency (t0 - t1).
182 *
183 * When the interconnect is quiet and symmetric, this lets us
184 * synchronize the TSC to within one or two cycles. However, we can
185 * only *guarantee* that the synchronization is accurate to within a
186 * round-trip time, which is typically in the range of several hundred
187 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
188 * are usually almost perfectly synchronized, but we shouldn't assume
189 * that the accuracy is much better than half a micro second or so.
190 *
191 * [there are other errors like the latency of RDTSC and of the
192 * WRMSR. These can also account to hundreds of cycles. So it's
193 * probably worse. It claims 153 cycles error on a dual Opteron,
194 * but I suspect the numbers are actually somewhat worse -AK]
195 */
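/*
 * Illustrative arithmetic only (made-up cycle counts), to make the
 * midpoint rule above concrete and match get_delta()/sync_tsc() below:
 * if the slave reads t0 = 1000 and t1 = 1480 and the master reports
 * tm = 1250, then tcenter = (1000 + 1480)/2 = 1240 and
 * delta = tcenter - tm = -10, i.e. the slave looks 10 cycles behind,
 * so sync_tsc() applies adj = -delta = +10 to the slave's TSC.
 * The roundtrip rt = t1 - t0 = 480 cycles bounds the residual error.
 */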
197 #define MASTER 0
198 #define SLAVE (SMP_CACHE_BYTES/8)
200 /* Intentionally don't use cpu_relax() while TSC synchronization
201 because we don't want to go into funky power save modi or cause
202 hypervisors to schedule us away. Going to sleep would likely affect
203 latency and low latency is the primary objective here. -AK */
204 #define no_cpu_relax() barrier()
206 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
207 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
208 static int notscsync __cpuinitdata;
210 #undef DEBUG_TSC_SYNC
212 #define NUM_ROUNDS 64 /* magic value */
213 #define NUM_ITERS 5 /* likewise */
215 /* Callback on boot CPU */
216 static __cpuinit void sync_master(void *arg)
217 {
218 unsigned long flags, i;
220 if (smp_processor_id() != boot_cpu_id)
221 return;
223 go[MASTER] = 0;
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235 }
237 /*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242 static inline long
243 get_delta(long *rt, long *master)
244 {
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
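/* e.g. best_t0 = 7, best_t1 = 9: 7/2 + 9/2 = 7 (both halves rounded
   down), and the increment above restores the true midpoint of 8. */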
268 return tcenter - best_tm;
269 }
271 static __cpuinit void sync_tsc(void)
272 {
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276 #if DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283 #endif
285 go[MASTER] = 1;
287 smp_call_function(sync_master, NULL, 1, 0);
289 while (go[MASTER]) /* wait for master to be ready */
290 no_cpu_relax();
292 spin_lock_irqsave(&tsc_sync_lock, flags);
293 {
294 for (i = 0; i < NUM_ROUNDS; ++i) {
295 delta = get_delta(&rt, &master_time_stamp);
296 if (delta == 0) {
297 done = 1; /* let's lock on to this... */
298 bound = rt;
299 }
301 if (!done) {
302 unsigned long t;
303 if (i > 0) {
304 adjust_latency += -delta;
305 adj = -delta + adjust_latency/4;
306 } else
307 adj = -delta;
309 rdtscll(t);
310 wrmsrl(MSR_IA32_TSC, t + adj);
311 }
312 #if DEBUG_TSC_SYNC
313 t[i].rt = rt;
314 t[i].master = master_time_stamp;
315 t[i].diff = delta;
316 t[i].lat = adjust_latency/4;
317 #endif
318 }
319 }
320 spin_unlock_irqrestore(&tsc_sync_lock, flags);
322 #if DEBUG_TSC_SYNC
323 for (i = 0; i < NUM_ROUNDS; ++i)
324 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
325 t[i].rt, t[i].master, t[i].diff, t[i].lat);
326 #endif
328 printk(KERN_INFO
329 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
330 "maxerr %lu cycles)\n",
331 smp_processor_id(), boot_cpu_id, delta, rt);
332 }
334 static void __cpuinit tsc_sync_wait(void)
335 {
336 if (notscsync || !cpu_has_tsc)
337 return;
338 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
339 boot_cpu_id);
340 sync_tsc();
341 }
343 static __init int notscsync_setup(char *s)
344 {
345 notscsync = 1;
346 return 0;
347 }
348 __setup("notscsync", notscsync_setup);
349 #endif
351 static atomic_t init_deasserted __cpuinitdata;
353 /*
354 * Report back to the Boot Processor.
355 * Running on AP.
356 */
357 void __cpuinit smp_callin(void)
358 {
359 int cpuid, phys_id;
360 unsigned long timeout;
362 #ifndef CONFIG_XEN
363 /*
364 * If woken up by an INIT in an 82489DX configuration
365 * we may get here before an INIT-deassert IPI reaches
366 * our local APIC. We have to wait for the IPI or we'll
367 * lock up on an APIC access.
368 */
369 while (!atomic_read(&init_deasserted))
370 cpu_relax();
372 #endif
373 /*
374 * (This works even if the APIC is not enabled.)
375 */
376 #ifndef CONFIG_XEN
377 phys_id = GET_APIC_ID(apic_read(APIC_ID));
378 #else
379 phys_id = smp_processor_id();
380 #endif
381 cpuid = smp_processor_id();
382 if (cpu_isset(cpuid, cpu_callin_map)) {
383 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
384 phys_id, cpuid);
385 }
386 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
388 /*
389 * STARTUP IPIs are fragile beasts as they might sometimes
390 * trigger some glue motherboard logic. Complete APIC bus
391 * silence for 1 second, this overestimates the time the
392 * boot CPU is spending to send the up to 2 STARTUP IPIs
393 * by a factor of two. This should be enough.
394 */
396 /*
397 * Waiting 2s total for startup (udelay is not yet working)
398 */
399 timeout = jiffies + 2*HZ;
400 while (time_before(jiffies, timeout)) {
401 /*
402 * Has the boot CPU finished its STARTUP sequence?
403 */
404 if (cpu_isset(cpuid, cpu_callout_map))
405 break;
406 cpu_relax();
407 }
409 if (!time_before(jiffies, timeout)) {
410 panic("smp_callin: CPU%d started up but did not get a callout!\n",
411 cpuid);
412 }
414 #ifndef CONFIG_XEN
415 /*
416 * the boot CPU has finished the init stage and is spinning
417 * on callin_map until we finish. We are free to set up this
418 * CPU, first the APIC. (this is probably redundant on most
419 * boards)
420 */
422 Dprintk("CALLIN, before setup_local_APIC().\n");
423 setup_local_APIC();
424 #endif
426 /*
427 * Get our bogomips.
428 */
429 calibrate_delay();
430 Dprintk("Stack at about %p\n",&cpuid);
432 #ifndef CONFIG_XEN
433 disable_APIC_timer();
434 #endif
436 /*
437 * Save our processor parameters
438 */
439 smp_store_cpu_info(cpuid);
441 /*
442 * Allow the master to continue.
443 */
444 cpu_set(cpuid, cpu_callin_map);
445 }
447 #ifdef CONFIG_XEN
448 extern void local_setup_timer(void);
449 #endif
451 /*
452 * Setup code on secondary processor (after coming out of the trampoline)
453 */
454 void __cpuinit start_secondary(void)
455 {
456 /*
457 * Don't put anything before smp_callin(); SMP
458 * booting is so fragile that we want to limit the
459 * things done here to the most necessary things.
460 */
461 cpu_init();
462 smp_callin();
464 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
465 barrier();
467 #ifndef CONFIG_XEN
468 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
469 setup_secondary_APIC_clock();
471 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
473 if (nmi_watchdog == NMI_IO_APIC) {
474 disable_8259A_irq(0);
475 enable_NMI_through_LVT0(NULL);
476 enable_8259A_irq(0);
477 }
479 enable_APIC_timer();
480 #else
481 local_setup_timer();
482 smp_intr_init();
483 local_irq_enable();
484 #endif
486 /*
487 * Allow the master to continue.
488 */
489 cpu_set(smp_processor_id(), cpu_online_map);
490 mb();
492 #ifndef CONFIG_XEN
493 /* Wait for TSC sync to not schedule things before.
494 We still process interrupts, which could see an inconsistent
495 time in that window unfortunately. */
496 tsc_sync_wait();
497 #endif
499 cpu_idle();
500 }
502 extern volatile unsigned long init_rsp;
503 extern void (*initial_code)(void);
505 #ifndef CONFIG_XEN
506 #if APIC_DEBUG
507 static void inquire_remote_apic(int apicid)
508 {
509 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
510 char *names[] = { "ID", "VERSION", "SPIV" };
511 int timeout, status;
513 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
515 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
516 printk("... APIC #%d %s: ", apicid, names[i]);
518 /*
519 * Wait for idle.
520 */
521 apic_wait_icr_idle();
523 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
524 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
526 timeout = 0;
527 do {
528 udelay(100);
529 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
530 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
532 switch (status) {
533 case APIC_ICR_RR_VALID:
534 status = apic_read(APIC_RRR);
535 printk("%08x\n", status);
536 break;
537 default:
538 printk("failed\n");
539 }
540 }
541 }
542 #endif
544 /*
545 * Kick the secondary to wake up.
546 */
547 static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
548 {
549 unsigned long send_status = 0, accept_status = 0;
550 int maxlvt, timeout, num_starts, j;
552 Dprintk("Asserting INIT.\n");
554 /*
555 * Turn INIT on target chip
556 */
557 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
559 /*
560 * Send IPI
561 */
562 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
563 | APIC_DM_INIT);
565 Dprintk("Waiting for send to finish...\n");
566 timeout = 0;
567 do {
568 Dprintk("+");
569 udelay(100);
570 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
571 } while (send_status && (timeout++ < 1000));
573 mdelay(10);
575 Dprintk("Deasserting INIT.\n");
577 /* Target chip */
578 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
580 /* Send IPI */
581 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
583 Dprintk("Waiting for send to finish...\n");
584 timeout = 0;
585 do {
586 Dprintk("+");
587 udelay(100);
588 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
589 } while (send_status && (timeout++ < 1000));
591 atomic_set(&init_deasserted, 1);
593 /*
594 * Should we send STARTUP IPIs ?
595 *
596 * Determine this based on the APIC version.
597 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
598 */
599 if (APIC_INTEGRATED(apic_version[phys_apicid]))
600 num_starts = 2;
601 else
602 num_starts = 0;
604 /*
605 * Run STARTUP IPI loop.
606 */
607 Dprintk("#startup loops: %d.\n", num_starts);
609 maxlvt = get_maxlvt();
611 for (j = 1; j <= num_starts; j++) {
612 Dprintk("Sending STARTUP #%d.\n",j);
613 apic_read_around(APIC_SPIV);
614 apic_write(APIC_ESR, 0);
615 apic_read(APIC_ESR);
616 Dprintk("After apic_write.\n");
618 /*
619 * STARTUP IPI
620 */
622 /* Target chip */
623 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
625 /* Boot on the stack */
626 /* Kick the second */
627 apic_write_around(APIC_ICR, APIC_DM_STARTUP
628 | (start_rip >> 12));
630 /*
631 * Give the other CPU some time to accept the IPI.
632 */
633 udelay(300);
635 Dprintk("Startup point 1.\n");
637 Dprintk("Waiting for send to finish...\n");
638 timeout = 0;
639 do {
640 Dprintk("+");
641 udelay(100);
642 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
643 } while (send_status && (timeout++ < 1000));
645 /*
646 * Give the other CPU some time to accept the IPI.
647 */
648 udelay(200);
649 /*
650 * Due to the Pentium erratum 3AP.
651 */
652 if (maxlvt > 3) {
653 apic_read_around(APIC_SPIV);
654 apic_write(APIC_ESR, 0);
655 }
656 accept_status = (apic_read(APIC_ESR) & 0xEF);
657 if (send_status || accept_status)
658 break;
659 }
660 Dprintk("After Startup.\n");
662 if (send_status)
663 printk(KERN_ERR "APIC never delivered???\n");
664 if (accept_status)
665 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
667 return (send_status | accept_status);
668 }
669 #endif
671 /*
672 * Boot one CPU.
673 */
674 static int __cpuinit do_boot_cpu(int cpu, int apicid)
675 {
676 struct task_struct *idle;
677 unsigned long boot_error;
678 int timeout;
679 unsigned long start_rip;
680 #ifdef CONFIG_XEN
681 vcpu_guest_context_t ctxt;
682 extern void startup_64_smp(void);
683 extern void hypervisor_callback(void);
684 extern void failsafe_callback(void);
685 extern void smp_trap_init(trap_info_t *);
686 #endif
687 /*
688 * We can't use kernel_thread since we must avoid
689 * rescheduling the child.
690 */
691 idle = fork_idle(cpu);
692 if (IS_ERR(idle)) {
693 printk("failed fork for CPU %d\n", cpu);
694 return PTR_ERR(idle);
695 }
697 cpu_pda[cpu].pcurrent = idle;
699 #ifndef CONFIG_XEN
700 start_rip = setup_trampoline();
701 #else
702 start_rip = (unsigned long)startup_64_smp;
703 #endif
705 init_rsp = idle->thread.rsp;
706 per_cpu(init_tss,cpu).rsp0 = init_rsp;
707 initial_code = start_secondary;
708 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
710 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
711 start_rip, init_rsp);
713 /*
714 * This grunge runs the startup process for
715 * the targeted processor.
716 */
718 atomic_set(&init_deasserted, 0);
720 #ifdef CONFIG_XEN
721 cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL|__GFP_ZERO);
722 BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
723 cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
724 memcpy((void *)cpu_gdt_descr[cpu].address,
725 (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
727 memset(&ctxt, 0, sizeof(ctxt));
729 ctxt.flags = VGCF_IN_KERNEL;
730 ctxt.user_regs.ds = __USER_DS;
731 ctxt.user_regs.es = __USER_DS;
732 ctxt.user_regs.fs = 0;
733 ctxt.user_regs.gs = 0;
734 ctxt.user_regs.ss = __KERNEL_DS|0x3;
735 ctxt.user_regs.cs = __KERNEL_CS|0x3;
736 ctxt.user_regs.rip = start_rip;
737 ctxt.user_regs.rsp = idle->thread.rsp;
738 #define X86_EFLAGS_IOPL_RING3 0x3000
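/* The IOPL field is EFLAGS bits 12-13; 0x3000 selects IOPL 3. */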
739 ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3;
741 /* FPU is set up to default initial state. */
742 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
744 smp_trap_init(ctxt.trap_ctxt);
746 /* No LDT. */
747 ctxt.ldt_ents = 0;
749 {
750 unsigned long va;
751 int f;
753 for (va = cpu_gdt_descr[cpu].address, f = 0;
754 va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
755 va += PAGE_SIZE, f++) {
756 ctxt.gdt_frames[f] = virt_to_mfn(va);
757 make_page_readonly((void *)va);
758 }
759 ctxt.gdt_ents = GDT_ENTRIES;
760 }
762 /* Ring 1 stack is the initial stack. */
763 ctxt.kernel_ss = __KERNEL_DS;
764 ctxt.kernel_sp = idle->thread.rsp;
766 /* Callback handlers. */
767 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
768 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
769 ctxt.syscall_callback_eip = (unsigned long)system_call;
771 ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
773 boot_error = HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt);
774 if (boot_error)
775 printk("boot error: %ld\n", boot_error);
777 if (!boot_error) {
778 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
780 /*
781 * allow APs to start initializing.
782 */
783 Dprintk("Before Callout %d.\n", cpu);
784 cpu_set(cpu, cpu_callout_map);
785 Dprintk("After Callout %d.\n", cpu);
787 /*
788 * Wait 5s total for a response
789 */
790 for (timeout = 0; timeout < 50000; timeout++) {
791 if (cpu_isset(cpu, cpu_callin_map))
792 break; /* It has booted */
793 udelay(100);
794 }
796 if (cpu_isset(cpu, cpu_callin_map)) {
797 /* number CPUs logically, starting from 1 (BSP is 0) */
798 Dprintk("CPU has booted.\n");
799 } else {
800 boot_error= 1;
801 }
802 }
803 x86_cpu_to_apicid[cpu] = apicid;
804 #else
805 Dprintk("Setting warm reset code and vector.\n");
807 CMOS_WRITE(0xa, 0xf);
808 local_flush_tlb();
809 Dprintk("1.\n");
810 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
811 Dprintk("2.\n");
812 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
813 Dprintk("3.\n");
815 /*
816 * Be paranoid about clearing APIC errors.
817 */
818 if (APIC_INTEGRATED(apic_version[apicid])) {
819 apic_read_around(APIC_SPIV);
820 apic_write(APIC_ESR, 0);
821 apic_read(APIC_ESR);
822 }
824 /*
825 * Status is now clean
826 */
827 boot_error = 0;
829 /*
830 * Starting actual IPI sequence...
831 */
832 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
834 if (!boot_error) {
835 /*
836 * allow APs to start initializing.
837 */
838 Dprintk("Before Callout %d.\n", cpu);
839 cpu_set(cpu, cpu_callout_map);
840 Dprintk("After Callout %d.\n", cpu);
842 /*
843 * Wait 5s total for a response
844 */
845 for (timeout = 0; timeout < 50000; timeout++) {
846 if (cpu_isset(cpu, cpu_callin_map))
847 break; /* It has booted */
848 udelay(100);
849 }
851 if (cpu_isset(cpu, cpu_callin_map)) {
852 /* number CPUs logically, starting from 1 (BSP is 0) */
853 Dprintk("CPU has booted.\n");
854 } else {
855 boot_error = 1;
856 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
857 == 0xA5)
858 /* trampoline started but...? */
859 printk("Stuck ??\n");
860 else
861 /* trampoline code not run */
862 printk("Not responding.\n");
863 #if APIC_DEBUG
864 inquire_remote_apic(apicid);
865 #endif
866 }
867 }
868 #endif
869 if (boot_error) {
870 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
871 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
872 cpu_clear(cpu, cpu_present_map);
873 cpu_clear(cpu, cpu_possible_map);
874 x86_cpu_to_apicid[cpu] = BAD_APICID;
875 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
876 return -EIO;
877 }
879 return 0;
880 }
882 cycles_t cacheflush_time;
883 unsigned long cache_decay_ticks;
885 /*
886 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
887 * on SMT systems efficiently.
888 */
889 static __cpuinit void detect_siblings(void)
890 {
891 int cpu;
893 for (cpu = 0; cpu < NR_CPUS; cpu++) {
894 cpus_clear(cpu_sibling_map[cpu]);
895 cpus_clear(cpu_core_map[cpu]);
896 }
898 for_each_online_cpu (cpu) {
899 struct cpuinfo_x86 *c = cpu_data + cpu;
900 int siblings = 0;
901 int i;
902 if (smp_num_siblings > 1) {
903 for_each_online_cpu (i) {
904 if (cpu_core_id[cpu] == cpu_core_id[i]) {
905 siblings++;
906 cpu_set(i, cpu_sibling_map[cpu]);
907 }
908 }
909 } else {
910 siblings++;
911 cpu_set(cpu, cpu_sibling_map[cpu]);
912 }
914 if (siblings != smp_num_siblings) {
915 printk(KERN_WARNING
916 "WARNING: %d siblings found for CPU%d, should be %d\n",
917 siblings, cpu, smp_num_siblings);
918 smp_num_siblings = siblings;
919 }
920 if (c->x86_num_cores > 1) {
921 for_each_online_cpu(i) {
922 if (phys_proc_id[cpu] == phys_proc_id[i])
923 cpu_set(i, cpu_core_map[cpu]);
924 }
925 } else
926 cpu_core_map[cpu] = cpu_sibling_map[cpu];
927 }
928 }
930 #ifndef CONFIG_XEN
931 /*
932 * Cleanup possible dangling ends...
933 */
934 static __cpuinit void smp_cleanup_boot(void)
935 {
936 /*
937 * Paranoid: Set warm reset code and vector here back
938 * to default values.
939 */
940 CMOS_WRITE(0, 0xf);
942 /*
943 * Reset trampoline flag
944 */
945 *((volatile int *) phys_to_virt(0x467)) = 0;
947 #ifndef CONFIG_HOTPLUG_CPU
948 /*
949 * Free pages reserved for SMP bootup.
950 * When you add hotplug CPU support later remove this
951 * Note there is more work to be done for later CPU bootup.
952 */
954 free_page((unsigned long) __va(PAGE_SIZE));
955 free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
956 #endif
957 }
958 #endif
960 /*
961 * Fall back to non SMP mode after errors.
962 *
963 * RED-PEN audit/test this more. I bet there is more state messed up here.
964 */
965 static __cpuinit void disable_smp(void)
966 {
967 cpu_present_map = cpumask_of_cpu(0);
968 cpu_possible_map = cpumask_of_cpu(0);
969 #ifndef CONFIG_XEN
970 if (smp_found_config)
971 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
972 else
973 phys_cpu_present_map = physid_mask_of_physid(0);
974 #endif
975 cpu_set(0, cpu_sibling_map[0]);
976 cpu_set(0, cpu_core_map[0]);
977 }
979 /*
980 * Handle user cpus=... parameter.
981 */
982 static __cpuinit void enforce_max_cpus(unsigned max_cpus)
983 {
984 int i, k;
985 k = 0;
986 for (i = 0; i < NR_CPUS; i++) {
987 if (!cpu_possible(i))
988 continue;
989 if (++k > max_cpus) {
990 cpu_clear(i, cpu_possible_map);
991 cpu_clear(i, cpu_present_map);
992 }
993 }
994 }
996 /*
997 * Various sanity checks.
998 */
999 static int __cpuinit smp_sanity_check(unsigned max_cpus)
1000 {
1001 #ifndef CONFIG_XEN
1002 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1003 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1004 hard_smp_processor_id());
1005 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1006 }
1008 /*
1009 * If we couldn't find an SMP configuration at boot time,
1010 * get out of here now!
1011 */
1012 if (!smp_found_config) {
1013 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1014 disable_smp();
1015 if (APIC_init_uniprocessor())
1016 printk(KERN_NOTICE "Local APIC not detected."
1017 " Using dummy APIC emulation.\n");
1018 return -1;
1019 }
1021 /*
1022 * Should not be necessary because the MP table should list the boot
1023 * CPU too, but we do it for the sake of robustness anyway.
1024 */
1025 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
1026 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
1027 boot_cpu_id);
1028 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1029 }
1031 /*
1032 * If we couldn't find a local APIC, then get out of here now!
1033 */
1034 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
1035 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1036 boot_cpu_id);
1037 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1038 nr_ioapics = 0;
1039 return -1;
1040 }
1041 #endif
1043 /*
1044 * If SMP should be disabled, then really disable it!
1045 */
1046 if (!max_cpus) {
1047 #ifdef CONFIG_XEN
1048 HYPERVISOR_shared_info->n_vcpu = 1;
1049 #endif
1050 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1051 #ifndef CONFIG_XEN
1052 nr_ioapics = 0;
1053 #endif
1054 return -1;
1055 }
1057 return 0;
1058 }
1060 /*
1061 * Prepare for SMP bootup. The MP table or ACPI has been read
1062 * earlier. Just do some sanity checking here and enable APIC mode.
1063 */
1064 void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
1065 {
1066 int i;
1068 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1069 #else
1070 nmi_watchdog_default();
1071 #endif
1072 current_cpu_data = boot_cpu_data;
1073 current_thread_info()->cpu = 0; /* needed? */
1075 enforce_max_cpus(max_cpus);
1077 /*
1078 * Fill in cpu_present_mask
1079 */
1080 for (i = 0; i < NR_CPUS; i++) {
1081 #ifndef CONFIG_XEN
1082 int apicid = cpu_present_to_apicid(i);
1083 if (physid_isset(apicid, phys_cpu_present_map)) {
1084 #else
1085 if (i < HYPERVISOR_shared_info->n_vcpu) {
1086 #endif
1087 cpu_set(i, cpu_present_map);
1088 /* possible map would be different if we supported real
1089 CPU hotplug. */
1090 cpu_set(i, cpu_possible_map);
1091 }
1092 }
1094 if (smp_sanity_check(max_cpus) < 0) {
1095 printk(KERN_INFO "SMP disabled\n");
1096 disable_smp();
1097 return;
1098 }
1100 #ifdef CONFIG_XEN
1101 smp_intr_init();
1102 #else
1104 /*
1105 * Switch from PIC to APIC mode.
1106 */
1107 connect_bsp_APIC();
1108 setup_local_APIC();
1110 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1111 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1112 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1113 /* Or can we switch back to PIC here? */
1114 }
1115 #endif
1117 /*
1118 * Now start the IO-APICs
1119 */
1120 #if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST)
1121 #else
1122 if (!skip_ioapic_setup && nr_ioapics)
1123 setup_IO_APIC();
1124 else
1125 nr_ioapics = 0;
1126 #endif
1128 /*
1129 * Set up local APIC timer on boot CPU.
1130 */
1132 #ifndef CONFIG_XEN
1133 setup_boot_APIC_clock();
1134 #endif
1135 }
1137 /*
1138 * Early setup to make printk work.
1139 */
1140 void __init smp_prepare_boot_cpu(void)
1141 {
1142 int me = smp_processor_id();
1143 cpu_set(me, cpu_online_map);
1144 cpu_set(me, cpu_callout_map);
1145 }
1147 /*
1148 * Entry point to boot a CPU.
1150 * This is all __cpuinit, not __devinit for now because we don't support
1151 * CPU hotplug (yet).
1152 */
1153 int __cpuinit __cpu_up(unsigned int cpu)
1154 {
1155 int err;
1156 #ifndef CONFIG_XEN
1157 int apicid = cpu_present_to_apicid(cpu);
1158 #else
1159 int apicid = cpu;
1160 #endif
1162 WARN_ON(irqs_disabled());
1164 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1166 #ifndef CONFIG_XEN
1167 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1168 !physid_isset(apicid, phys_cpu_present_map)) {
1169 printk("__cpu_up: bad cpu %d\n", cpu);
1170 return -EINVAL;
1171 }
1172 #endif
1174 /* Boot it! */
1175 err = do_boot_cpu(cpu, apicid);
1176 if (err < 0) {
1177 Dprintk("do_boot_cpu failed %d\n", err);
1178 return err;
1179 }
1181 /* Unleash the CPU! */
1182 Dprintk("waiting for cpu %d\n", cpu);
1184 while (!cpu_isset(cpu, cpu_online_map))
1185 cpu_relax();
1186 return 0;
1187 }
1189 /*
1190 * Finish the SMP boot.
1191 */
1192 void __cpuinit smp_cpus_done(unsigned int max_cpus)
1193 {
1194 #ifndef CONFIG_XEN
1195 zap_low_mappings();
1196 smp_cleanup_boot();
1198 #ifdef CONFIG_X86_IO_APIC
1199 setup_ioapic_dest();
1200 #endif
1201 #endif
1203 detect_siblings();
1204 #ifndef CONFIG_XEN
1205 time_init_gtod();
1207 check_nmi_watchdog();
1208 #endif
1209 }
1211 #ifdef CONFIG_XEN
1212 extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
1213 extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
1215 static DEFINE_PER_CPU(int, resched_irq);
1216 static DEFINE_PER_CPU(int, callfunc_irq);
1217 static char resched_name[NR_CPUS][15];
1218 static char callfunc_name[NR_CPUS][15];
1220 void smp_intr_init(void)
1221 {
1222 int cpu = smp_processor_id();
1224 per_cpu(resched_irq, cpu) =
1225 bind_ipi_to_irq(RESCHEDULE_VECTOR, cpu);
1226 sprintf(resched_name[cpu], "resched%d", cpu);
1227 BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
1228 SA_INTERRUPT, resched_name[cpu], NULL));
1230 per_cpu(callfunc_irq, cpu) =
1231 bind_ipi_to_irq(CALL_FUNCTION_VECTOR, cpu);
1232 sprintf(callfunc_name[cpu], "callfunc%d", cpu);
1233 BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
1234 smp_call_function_interrupt,
1235 SA_INTERRUPT, callfunc_name[cpu], NULL));
1236 }
1238 static void smp_intr_exit(void)
1239 {
1240 int cpu = smp_processor_id();
1242 free_irq(per_cpu(resched_irq, cpu), NULL);
1243 unbind_ipi_from_irq(RESCHEDULE_VECTOR, cpu);
1245 free_irq(per_cpu(callfunc_irq, cpu), NULL);
1246 unbind_ipi_from_irq(CALL_FUNCTION_VECTOR, cpu);
1247 }
1249 extern void local_setup_timer_irq(void);
1250 extern void local_teardown_timer_irq(void);
1252 void smp_suspend(void)
1253 {
1254 local_teardown_timer_irq();
1255 smp_intr_exit();
1256 }
1258 void smp_resume(void)
1259 {
1260 smp_intr_init();
1261 local_setup_timer_irq();
1262 }
1264 void vcpu_prepare(int vcpu)
1265 {
1266 }
1268 #endif