ia64/linux-2.6.18-xen.hg

view arch/i386/kernel/time-xen.c @ 559:446c39a967fc

xen: Do not allow gcc-3.4 to turn a loop over a 64-bit variable into a
64-bit division, which the kernel does not support.

Signed-off-by: Guillaume Rousse <guillaume.rousse@inria.fr>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed May 21 17:04:37 2008 +0100 (2008-05-21)
parents 3cd3352a9985
children 557a4a0a5eac
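
For context, the workaround this changeset relies on is the clobber_induction_variable() macro used by __normalize_time() in the file below. The following stand-alone sketch (illustrative only; the user-space harness and the normalize() name are ours, not part of the changeset) shows why the empty asm matters on a 32-bit build:

#include <stdint.h>

#define NSEC_PER_SEC 1000000000L

/* Without the clobber, GCC's induction-variable optimisations may replace
 * the loop below with a 64-bit division and modulus, which on i386 emits
 * calls to the __udivdi3/__umoddi3 libgcc helpers the kernel does not
 * provide. The empty asm forces the value through a register on every
 * iteration, so the optimiser can no longer prove the loop is a division. */
#define clobber_induction_variable(v) asm ( "" : "+r" (v) )

static void normalize(long *sec, int64_t *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		clobber_induction_variable(*nsec);
		*nsec -= NSEC_PER_SEC;
		(*sec)++;
	}
}

Compiled at -O2 without the asm, a loop like this can be converted into exactly the kind of 64-bit division the description above warns about.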
line source
1 /*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because it was not accounting lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
53 #include <linux/cpufreq.h>
55 #include <asm/io.h>
56 #include <asm/smp.h>
57 #include <asm/irq.h>
58 #include <asm/msr.h>
59 #include <asm/delay.h>
60 #include <asm/mpspec.h>
61 #include <asm/uaccess.h>
62 #include <asm/processor.h>
63 #include <asm/timer.h>
64 #include <asm/sections.h>
66 #include "mach_time.h"
68 #include <linux/timex.h>
70 #include <asm/hpet.h>
72 #include <asm/arch_hooks.h>
74 #include <xen/evtchn.h>
75 #include <xen/interface/vcpu.h>
77 #if defined (__i386__)
78 #include <asm/i8259.h>
79 #endif
81 int pit_latch_buggy; /* extern */
83 #if defined(__x86_64__)
84 unsigned long vxtime_hz = PIT_TICK_RATE;
85 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
86 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
87 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
88 struct timespec __xtime __section_xtime;
89 struct timezone __sys_tz __section_sys_tz;
90 #endif
92 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
93 EXPORT_SYMBOL(cpu_khz);
95 extern unsigned long wall_jiffies;
97 DEFINE_SPINLOCK(rtc_lock);
98 EXPORT_SYMBOL(rtc_lock);
100 extern struct init_timer_opts timer_tsc_init;
101 extern struct timer_opts timer_tsc;
102 #define timer_none timer_tsc
104 /* These are periodically updated in shared_info, and then copied here. */
105 struct shadow_time_info {
106 u64 tsc_timestamp; /* TSC at last update of time vals. */
107 u64 system_timestamp; /* Time, in nanosecs, since boot. */
108 u32 tsc_to_nsec_mul;
109 u32 tsc_to_usec_mul;
110 int tsc_shift;
111 u32 version;
112 };
113 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
114 static struct timespec shadow_tv;
115 static u32 shadow_tv_version;
117 static struct timeval monotonic_tv;
118 static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED;
120 /* Keep track of last time we did processing/updating of jiffies and xtime. */
121 static u64 processed_system_time; /* System time (ns) at last processing. */
122 static DEFINE_PER_CPU(u64, processed_system_time);
124 /* How much CPU time was spent blocked and how much was 'stolen'? */
125 static DEFINE_PER_CPU(u64, processed_stolen_time);
126 static DEFINE_PER_CPU(u64, processed_blocked_time);
128 /* Current runstate of each CPU (updated automatically by the hypervisor). */
129 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
131 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
132 #define NS_PER_TICK (1000000000LL/HZ)
134 static void __clock_was_set(void *unused)
135 {
136 clock_was_set();
137 }
138 static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
140 /*
141 * GCC 4.3 can turn loops over an induction variable into division. We do
142 * not support arbitrary 64-bit division, and so must break the induction.
143 */
144 #define clobber_induction_variable(v) asm ( "" : "+r" (v) )
146 static inline void __normalize_time(time_t *sec, s64 *nsec)
147 {
148 while (*nsec >= NSEC_PER_SEC) {
149 clobber_induction_variable(*nsec);
150 (*nsec) -= NSEC_PER_SEC;
151 (*sec)++;
152 }
153 while (*nsec < 0) {
154 clobber_induction_variable(*nsec);
155 (*nsec) += NSEC_PER_SEC;
156 (*sec)--;
157 }
158 }
160 /* Does this guest OS track Xen time, or set its wall clock independently? */
161 static int independent_wallclock = 0;
162 static int __init __independent_wallclock(char *str)
163 {
164 independent_wallclock = 1;
165 return 1;
166 }
167 __setup("independent_wallclock", __independent_wallclock);
169 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
170 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
171 static int __init __permitted_clock_jitter(char *str)
172 {
173 permitted_clock_jitter = simple_strtoul(str, NULL, 0);
174 return 1;
175 }
176 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
178 #if 0
179 static void delay_tsc(unsigned long loops)
180 {
181 unsigned long bclock, now;
183 rdtscl(bclock);
184 do {
185 rep_nop();
186 rdtscl(now);
187 } while ((now - bclock) < loops);
188 }
190 struct timer_opts timer_tsc = {
191 .name = "tsc",
192 .delay = delay_tsc,
193 };
194 #endif
196 /*
197 * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
198 * yielding a 64-bit result.
199 */
200 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
201 {
202 u64 product;
203 #ifdef __i386__
204 u32 tmp1, tmp2;
205 #endif
207 if (shift < 0)
208 delta >>= -shift;
209 else
210 delta <<= shift;
212 #ifdef __i386__
213 __asm__ (
214 "mul %5 ; "
215 "mov %4,%%eax ; "
216 "mov %%edx,%4 ; "
217 "mul %5 ; "
218 "xor %5,%5 ; "
219 "add %4,%%eax ; "
220 "adc %5,%%edx ; "
221 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
222 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
223 #else
224 __asm__ (
225 "mul %%rdx ; shrd $32,%%rdx,%%rax"
226 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
227 #endif
229 return product;
230 }
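/*
 * Equivalently (for shift >= 0):
 *
 *     product = ((delta << shift) * mul_frac) >> 32
 *
 * i.e. mul_frac is a 32.32 fixed-point multiplier. Xen publishes
 * tsc_to_system_mul and tsc_shift per VCPU so that this maps a TSC
 * delta to elapsed nanoseconds; tsc_to_usec_mul (derived below) maps
 * it to microseconds.
 */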
232 #if 0 /* defined (__i386__) */
233 int read_current_timer(unsigned long *timer_val)
234 {
235 rdtscl(*timer_val);
236 return 0;
237 }
238 #endif
240 void init_cpu_khz(void)
241 {
242 u64 __cpu_khz = 1000000ULL << 32;
243 struct vcpu_time_info *info = &vcpu_info(0)->time;
244 do_div(__cpu_khz, info->tsc_to_system_mul);
245 if (info->tsc_shift < 0)
246 cpu_khz = __cpu_khz << -info->tsc_shift;
247 else
248 cpu_khz = __cpu_khz >> info->tsc_shift;
249 }
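/*
 * Rationale: scale_delta() computes ns = ((tsc << tsc_shift) * mul) >> 32,
 * so the TSC runs at 10^9 * 2^32 / (mul * 2^tsc_shift) Hz and hence
 * cpu_khz = (10^6 << 32) / (mul * 2^tsc_shift), which is exactly the
 * division and shift performed above.
 */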
251 static u64 get_nsec_offset(struct shadow_time_info *shadow)
252 {
253 u64 now, delta;
254 rdtscll(now);
255 delta = now - shadow->tsc_timestamp;
256 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
257 }
259 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
260 {
261 u64 now, delta;
262 rdtscll(now);
263 delta = now - shadow->tsc_timestamp;
264 return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
265 }
267 static void __update_wallclock(time_t sec, long nsec)
268 {
269 long wtm_nsec, xtime_nsec;
270 time_t wtm_sec, xtime_sec;
271 u64 tmp, wc_nsec;
273 /* Adjust wall-clock time base based on wall_jiffies ticks. */
274 wc_nsec = processed_system_time;
275 wc_nsec += sec * (u64)NSEC_PER_SEC;
276 wc_nsec += nsec;
277 wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
279 /* Split wallclock base into seconds and nanoseconds. */
280 tmp = wc_nsec;
281 xtime_nsec = do_div(tmp, 1000000000);
282 xtime_sec = (time_t)tmp;
284 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
285 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
287 set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
288 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
290 ntp_clear();
291 }
293 static void update_wallclock(void)
294 {
295 shared_info_t *s = HYPERVISOR_shared_info;
297 do {
298 shadow_tv_version = s->wc_version;
299 rmb();
300 shadow_tv.tv_sec = s->wc_sec;
301 shadow_tv.tv_nsec = s->wc_nsec;
302 rmb();
303 } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
305 if (!independent_wallclock)
306 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
307 }
309 /*
310 * Reads a consistent set of time-base values from Xen, into a shadow data
311 * area.
312 */
313 static void get_time_values_from_xen(unsigned int cpu)
314 {
315 struct vcpu_time_info *src;
316 struct shadow_time_info *dst;
318 src = &vcpu_info(cpu)->time;
319 dst = &per_cpu(shadow_time, cpu);
321 do {
322 dst->version = src->version;
323 rmb();
324 dst->tsc_timestamp = src->tsc_timestamp;
325 dst->system_timestamp = src->system_time;
326 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
327 dst->tsc_shift = src->tsc_shift;
328 rmb();
329 } while ((src->version & 1) | (dst->version ^ src->version));
331 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
332 }
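/*
 * The retry loop above (and the analogous one in update_wallclock())
 * implements Xen's version protocol: the hypervisor increments the version
 * field before and after rewriting the record, so an odd value means an
 * update is in flight and a changed value means we raced with one; in
 * either case the snapshot is re-read.
 */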
334 static inline int time_values_up_to_date(unsigned int cpu)
335 {
336 struct vcpu_time_info *src;
337 struct shadow_time_info *dst;
339 src = &vcpu_info(cpu)->time;
340 dst = &per_cpu(shadow_time, cpu);
342 rmb();
343 return (dst->version == src->version);
344 }
346 /*
347 * This is a special lock that is owned by the CPU and holds the index
348 * register we are working with. It is required for NMI access to the
349 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
350 */
351 volatile unsigned long cmos_lock = 0;
352 EXPORT_SYMBOL(cmos_lock);
354 /* Routines for accessing the CMOS RAM/RTC. */
355 unsigned char rtc_cmos_read(unsigned char addr)
356 {
357 unsigned char val;
358 lock_cmos_prefix(addr);
359 outb_p(addr, RTC_PORT(0));
360 val = inb_p(RTC_PORT(1));
361 lock_cmos_suffix(addr);
362 return val;
363 }
364 EXPORT_SYMBOL(rtc_cmos_read);
366 void rtc_cmos_write(unsigned char val, unsigned char addr)
367 {
368 lock_cmos_prefix(addr);
369 outb_p(addr, RTC_PORT(0));
370 outb_p(val, RTC_PORT(1));
371 lock_cmos_suffix(addr);
372 }
373 EXPORT_SYMBOL(rtc_cmos_write);
375 /*
376 * This version of gettimeofday has microsecond resolution
377 * and better than microsecond precision on fast x86 machines with TSC.
378 */
379 void do_gettimeofday(struct timeval *tv)
380 {
381 unsigned long seq;
382 unsigned long usec, sec;
383 unsigned long flags;
384 s64 nsec;
385 unsigned int cpu;
386 struct shadow_time_info *shadow;
387 u32 local_time_version;
389 cpu = get_cpu();
390 shadow = &per_cpu(shadow_time, cpu);
392 do {
393 unsigned long lost;
395 local_time_version = shadow->version;
396 seq = read_seqbegin(&xtime_lock);
398 usec = get_usec_offset(shadow);
399 lost = jiffies - wall_jiffies;
401 if (unlikely(lost))
402 usec += lost * (USEC_PER_SEC / HZ);
404 sec = xtime.tv_sec;
405 usec += (xtime.tv_nsec / NSEC_PER_USEC);
407 nsec = shadow->system_timestamp - processed_system_time;
408 __normalize_time(&sec, &nsec);
409 usec += (long)nsec / NSEC_PER_USEC;
411 if (unlikely(!time_values_up_to_date(cpu))) {
412 /*
413 * We may have blocked for a long time,
414 * rendering our calculations invalid
415 * (e.g. the time delta may have
416 * overflowed). Detect that and recalculate
417 * with fresh values.
418 */
419 get_time_values_from_xen(cpu);
420 continue;
421 }
422 } while (read_seqretry(&xtime_lock, seq) ||
423 (local_time_version != shadow->version));
425 put_cpu();
427 while (usec >= USEC_PER_SEC) {
428 usec -= USEC_PER_SEC;
429 sec++;
430 }
432 spin_lock_irqsave(&monotonic_lock, flags);
433 if ((sec > monotonic_tv.tv_sec) ||
434 ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec)))
435 {
436 monotonic_tv.tv_sec = sec;
437 monotonic_tv.tv_usec = usec;
438 } else {
439 sec = monotonic_tv.tv_sec;
440 usec = monotonic_tv.tv_usec;
441 }
442 spin_unlock_irqrestore(&monotonic_lock, flags);
444 tv->tv_sec = sec;
445 tv->tv_usec = usec;
446 }
448 EXPORT_SYMBOL(do_gettimeofday);
450 int do_settimeofday(struct timespec *tv)
451 {
452 time_t sec;
453 s64 nsec;
454 unsigned int cpu;
455 struct shadow_time_info *shadow;
456 struct xen_platform_op op;
458 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
459 return -EINVAL;
461 cpu = get_cpu();
462 shadow = &per_cpu(shadow_time, cpu);
464 write_seqlock_irq(&xtime_lock);
466 /*
467 * Ensure we were not blocked for so long that our time delta
468 * overflowed. If that did happen then our shadow time values would
469 * be stale, so retry with fresh ones.
470 */
471 for (;;) {
472 nsec = tv->tv_nsec - get_nsec_offset(shadow);
473 if (time_values_up_to_date(cpu))
474 break;
475 get_time_values_from_xen(cpu);
476 }
477 sec = tv->tv_sec;
478 __normalize_time(&sec, &nsec);
480 if (is_initial_xendomain() && !independent_wallclock) {
481 op.cmd = XENPF_settime;
482 op.u.settime.secs = sec;
483 op.u.settime.nsecs = nsec;
484 op.u.settime.system_time = shadow->system_timestamp;
485 WARN_ON(HYPERVISOR_platform_op(&op));
486 update_wallclock();
487 } else if (independent_wallclock) {
488 nsec -= shadow->system_timestamp;
489 __normalize_time(&sec, &nsec);
490 __update_wallclock(sec, nsec);
491 }
493 /* Reset monotonic gettimeofday() timeval. */
494 spin_lock(&monotonic_lock);
495 monotonic_tv.tv_sec = 0;
496 monotonic_tv.tv_usec = 0;
497 spin_unlock(&monotonic_lock);
499 write_sequnlock_irq(&xtime_lock);
501 put_cpu();
503 clock_was_set();
504 return 0;
505 }
507 EXPORT_SYMBOL(do_settimeofday);
509 static void sync_xen_wallclock(unsigned long dummy);
510 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
511 static void sync_xen_wallclock(unsigned long dummy)
512 {
513 time_t sec;
514 s64 nsec;
515 struct xen_platform_op op;
517 if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
518 return;
520 write_seqlock_irq(&xtime_lock);
522 sec = xtime.tv_sec;
523 nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
524 __normalize_time(&sec, &nsec);
526 op.cmd = XENPF_settime;
527 op.u.settime.secs = sec;
528 op.u.settime.nsecs = nsec;
529 op.u.settime.system_time = processed_system_time;
530 WARN_ON(HYPERVISOR_platform_op(&op));
532 update_wallclock();
534 write_sequnlock_irq(&xtime_lock);
536 /* Once per minute. */
537 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
538 }
540 static int set_rtc_mmss(unsigned long nowtime)
541 {
542 int retval;
543 unsigned long flags;
545 if (independent_wallclock || !is_initial_xendomain())
546 return 0;
548 /* gets recalled with irq locally disabled */
549 /* XXX - does irqsave resolve this? -johnstul */
550 spin_lock_irqsave(&rtc_lock, flags);
551 if (efi_enabled)
552 retval = efi_set_rtc_mmss(nowtime);
553 else
554 retval = mach_set_rtc_mmss(nowtime);
555 spin_unlock_irqrestore(&rtc_lock, flags);
557 return retval;
558 }
560 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
561 * Note: This function is required to return accurate
562 * time even in the absence of multiple timer ticks.
563 */
564 unsigned long long monotonic_clock(void)
565 {
566 unsigned int cpu = get_cpu();
567 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
568 u64 time;
569 u32 local_time_version;
571 do {
572 local_time_version = shadow->version;
573 barrier();
574 time = shadow->system_timestamp + get_nsec_offset(shadow);
575 if (!time_values_up_to_date(cpu))
576 get_time_values_from_xen(cpu);
577 barrier();
578 } while (local_time_version != shadow->version);
580 put_cpu();
582 return time;
583 }
584 EXPORT_SYMBOL(monotonic_clock);
586 #ifdef __x86_64__
587 unsigned long long sched_clock(void)
588 {
589 return monotonic_clock();
590 }
591 #endif
593 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
594 unsigned long profile_pc(struct pt_regs *regs)
595 {
596 unsigned long pc = instruction_pointer(regs);
598 #ifdef __x86_64__
599 /* Assume the lock function has either no stack frame or only a single word.
600 This checks if the address on the stack looks like a kernel text address.
601 There is a small window for false hits, but in that case the tick
602 is just accounted to the spinlock function.
603 Better would be to write these functions in assembler again
604 and check exactly. */
605 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
606 char *v = *(char **)regs->rsp;
607 if ((v >= _stext && v <= _etext) ||
608 (v >= _sinittext && v <= _einittext) ||
609 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
610 return (unsigned long)v;
611 return ((unsigned long *)regs->rsp)[1];
612 }
613 #else
614 if (!user_mode_vm(regs) && in_lock_functions(pc))
615 return *(unsigned long *)(regs->ebp + 4);
616 #endif
618 return pc;
619 }
620 EXPORT_SYMBOL(profile_pc);
621 #endif
623 /*
624 * This is the same as the above, except we _also_ save the current
625 * Time Stamp Counter value at the time of the timer interrupt, so that
626 * we later on can estimate the time of day more exactly.
627 */
628 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
629 {
630 s64 delta, delta_cpu, stolen, blocked;
631 u64 sched_time;
632 unsigned int i, cpu = smp_processor_id();
633 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
634 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
636 /*
637 * Here we are in the timer irq handler. We just have irqs locally
638 * disabled but we don't know if the timer_bh is running on the other
639 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
640 * the irq version of write_lock because as just said we have irq
641 * locally disabled. -arca
642 */
643 write_seqlock(&xtime_lock);
645 do {
646 get_time_values_from_xen(cpu);
648 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
649 delta = delta_cpu =
650 shadow->system_timestamp + get_nsec_offset(shadow);
651 delta -= processed_system_time;
652 delta_cpu -= per_cpu(processed_system_time, cpu);
654 /*
655 * Obtain a consistent snapshot of stolen/blocked cycles. We
656 * can use state_entry_time to detect if we get preempted here.
657 */
658 do {
659 sched_time = runstate->state_entry_time;
660 barrier();
661 stolen = runstate->time[RUNSTATE_runnable] +
662 runstate->time[RUNSTATE_offline] -
663 per_cpu(processed_stolen_time, cpu);
664 blocked = runstate->time[RUNSTATE_blocked] -
665 per_cpu(processed_blocked_time, cpu);
666 barrier();
667 } while (sched_time != runstate->state_entry_time);
668 } while (!time_values_up_to_date(cpu));
670 if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
671 unlikely(delta_cpu < -(s64)permitted_clock_jitter))
672 && printk_ratelimit()) {
673 printk("Timer ISR/%u: Time went backwards: "
674 "delta=%lld delta_cpu=%lld shadow=%lld "
675 "off=%lld processed=%lld cpu_processed=%lld\n",
676 cpu, delta, delta_cpu, shadow->system_timestamp,
677 (s64)get_nsec_offset(shadow),
678 processed_system_time,
679 per_cpu(processed_system_time, cpu));
680 for (i = 0; i < num_online_cpus(); i++)
681 printk(" %d: %lld\n", i,
682 per_cpu(processed_system_time, i));
683 }
685 /* System-wide jiffy work. */
686 while (delta >= NS_PER_TICK) {
687 delta -= NS_PER_TICK;
688 processed_system_time += NS_PER_TICK;
689 do_timer(regs);
690 }
692 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
693 update_wallclock();
694 if (keventd_up())
695 schedule_work(&clock_was_set_work);
696 }
698 write_sequnlock(&xtime_lock);
700 /*
701 * Account stolen ticks.
702 * HACK: Passing NULL to account_steal_time()
703 * ensures that the ticks are accounted as stolen.
704 */
705 if ((stolen > 0) && (delta_cpu > 0)) {
706 delta_cpu -= stolen;
707 if (unlikely(delta_cpu < 0))
708 stolen += delta_cpu; /* clamp local-time progress */
709 do_div(stolen, NS_PER_TICK);
710 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
711 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
712 account_steal_time(NULL, (cputime_t)stolen);
713 }
715 /*
716 * Account blocked ticks.
717 * HACK: Passing idle_task to account_steal_time()
718 * ensures that the ticks are accounted as idle/wait.
719 */
720 if ((blocked > 0) && (delta_cpu > 0)) {
721 delta_cpu -= blocked;
722 if (unlikely(delta_cpu < 0))
723 blocked += delta_cpu; /* clamp local-time progress */
724 do_div(blocked, NS_PER_TICK);
725 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
726 per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
727 account_steal_time(idle_task(cpu), (cputime_t)blocked);
728 }
730 /* Account user/system ticks. */
731 if (delta_cpu > 0) {
732 do_div(delta_cpu, NS_PER_TICK);
733 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
734 if (user_mode_vm(regs))
735 account_user_time(current, (cputime_t)delta_cpu);
736 else
737 account_system_time(current, HARDIRQ_OFFSET,
738 (cputime_t)delta_cpu);
739 }
741 /* Offlined for more than a few seconds? Avoid lockup warnings. */
742 if (stolen > 5*HZ)
743 touch_softlockup_watchdog();
745 /* Local timer processing (see update_process_times()). */
746 run_local_timers();
747 if (rcu_pending(cpu))
748 rcu_check_callbacks(cpu, user_mode_vm(regs));
749 scheduler_tick();
750 run_posix_cpu_timers(current);
751 profile_tick(CPU_PROFILING, regs);
753 return IRQ_HANDLED;
754 }
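/*
 * Summary of the accounting above: the per-VCPU runstate area (registered in
 * init_missing_ticks_accounting() below) splits elapsed time into stolen
 * (runnable/offline) and blocked components; whatever remains of delta_cpu is
 * charged to the current task as user or system ticks. processed_system_time
 * only ever advances in whole NS_PER_TICK units, so fractional remainders
 * carry over to the next timer interrupt.
 */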
756 static void init_missing_ticks_accounting(unsigned int cpu)
757 {
758 struct vcpu_register_runstate_memory_area area;
759 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
760 int rc;
762 memset(runstate, 0, sizeof(*runstate));
764 area.addr.v = runstate;
765 rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
766 WARN_ON(rc && rc != -ENOSYS);
768 per_cpu(processed_blocked_time, cpu) =
769 runstate->time[RUNSTATE_blocked];
770 per_cpu(processed_stolen_time, cpu) =
771 runstate->time[RUNSTATE_runnable] +
772 runstate->time[RUNSTATE_offline];
773 }
775 /* not static: needed by APM */
776 unsigned long get_cmos_time(void)
777 {
778 unsigned long retval;
779 unsigned long flags;
781 spin_lock_irqsave(&rtc_lock, flags);
783 if (efi_enabled)
784 retval = efi_get_time();
785 else
786 retval = mach_get_cmos_time();
788 spin_unlock_irqrestore(&rtc_lock, flags);
790 return retval;
791 }
792 EXPORT_SYMBOL(get_cmos_time);
794 static void sync_cmos_clock(unsigned long dummy);
796 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
798 static void sync_cmos_clock(unsigned long dummy)
799 {
800 struct timeval now, next;
801 int fail = 1;
803 /*
804 * If we have an externally synchronized Linux clock, then update
805 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
806 * called as close as possible to 500 ms before the new second starts.
807 * This code is run on a timer. If the clock is set, that timer
808 * may not expire at the correct time. Thus, we adjust...
809 */
810 if (!ntp_synced())
811 /*
812 * Not synced, exit, do not restart a timer (if one is
813 * running, let it run out).
814 */
815 return;
817 do_gettimeofday(&now);
818 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
819 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
820 fail = set_rtc_mmss(now.tv_sec);
822 next.tv_usec = USEC_AFTER - now.tv_usec;
823 if (next.tv_usec <= 0)
824 next.tv_usec += USEC_PER_SEC;
826 if (!fail)
827 next.tv_sec = 659;
828 else
829 next.tv_sec = 0;
831 if (next.tv_usec >= USEC_PER_SEC) {
832 next.tv_sec++;
833 next.tv_usec -= USEC_PER_SEC;
834 }
835 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
836 }
838 void notify_arch_cmos_timer(void)
839 {
840 mod_timer(&sync_cmos_timer, jiffies + 1);
841 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
842 }
844 static int timer_resume(struct sys_device *dev)
845 {
846 extern void time_resume(void);
847 time_resume();
848 return 0;
849 }
851 static struct sysdev_class timer_sysclass = {
852 .resume = timer_resume,
853 set_kset_name("timer"),
854 };
857 /* XXX this driverfs stuff should probably go elsewhere later -john */
858 static struct sys_device device_timer = {
859 .id = 0,
860 .cls = &timer_sysclass,
861 };
863 static int time_init_device(void)
864 {
865 int error = sysdev_class_register(&timer_sysclass);
866 if (!error)
867 error = sysdev_register(&device_timer);
868 return error;
869 }
871 device_initcall(time_init_device);
873 #ifdef CONFIG_HPET_TIMER
874 extern void (*late_time_init)(void);
875 /* Duplicate of time_init() below, with hpet_enable part added */
876 static void __init hpet_time_init(void)
877 {
878 xtime.tv_sec = get_cmos_time();
879 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
880 set_normalized_timespec(&wall_to_monotonic,
881 -xtime.tv_sec, -xtime.tv_nsec);
883 if ((hpet_enable() >= 0) && hpet_use_timer) {
884 printk("Using HPET for base-timer\n");
885 }
887 time_init_hook();
888 }
889 #endif
891 /* Dynamically-mapped IRQ. */
892 DEFINE_PER_CPU(int, timer_irq);
894 extern void (*late_time_init)(void);
895 static void setup_cpu0_timer_irq(void)
896 {
897 per_cpu(timer_irq, 0) =
898 bind_virq_to_irqhandler(
899 VIRQ_TIMER,
900 0,
901 timer_interrupt,
902 SA_INTERRUPT,
903 "timer0",
904 NULL);
905 BUG_ON(per_cpu(timer_irq, 0) < 0);
906 }
908 static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
909 .period_ns = NS_PER_TICK
910 };
912 void __init time_init(void)
913 {
914 #ifdef CONFIG_HPET_TIMER
915 if (is_hpet_capable()) {
916 /*
917 * HPET initialization needs to do memory-mapped io. So, let
918 * us do a late initialization after mem_init().
919 */
920 late_time_init = hpet_time_init;
921 return;
922 }
923 #endif
925 switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
926 &xen_set_periodic_tick)) {
927 case 0:
928 #if CONFIG_XEN_COMPAT <= 0x030004
929 case -ENOSYS:
930 #endif
931 break;
932 default:
933 BUG();
934 }
936 get_time_values_from_xen(0);
938 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
939 per_cpu(processed_system_time, 0) = processed_system_time;
940 init_missing_ticks_accounting(0);
942 update_wallclock();
944 init_cpu_khz();
945 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
946 cpu_khz / 1000, cpu_khz % 1000);
948 #if defined(__x86_64__)
949 vxtime.mode = VXTIME_TSC;
950 vxtime.quot = (1000000L << 32) / vxtime_hz;
951 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
952 sync_core();
953 rdtscll(vxtime.last_tsc);
954 #endif
956 /* Cannot request_irq() until kmem is initialised. */
957 late_time_init = setup_cpu0_timer_irq;
958 }
960 /* Convert jiffies to system time. */
961 u64 jiffies_to_st(unsigned long j)
962 {
963 unsigned long seq;
964 long delta;
965 u64 st;
967 do {
968 seq = read_seqbegin(&xtime_lock);
969 delta = j - jiffies;
970 if (delta < 1) {
971 /* Triggers in some wrap-around cases, but that's okay:
972 * we just end up with a shorter timeout. */
973 st = processed_system_time + NS_PER_TICK;
974 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
975 /* Very long timeout means there is no pending timer.
976 * We indicate this to Xen by passing zero timeout. */
977 st = 0;
978 } else {
979 st = processed_system_time + delta * (u64)NS_PER_TICK;
980 }
981 } while (read_seqretry(&xtime_lock, seq));
983 return st;
984 }
985 EXPORT_SYMBOL(jiffies_to_st);
987 /*
988 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
989 * These functions are based on implementations from arch/s390/kernel/time.c
990 */
991 static void stop_hz_timer(void)
992 {
993 struct vcpu_set_singleshot_timer singleshot;
994 unsigned int cpu = smp_processor_id();
995 unsigned long j;
996 int rc;
998 cpu_set(cpu, nohz_cpu_mask);
1000 /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
1001 /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
1002 /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
1003 /* stop the hz timer then the cpumasks created for subsequent values */
1004 /* of cur in rcu_start_batch are guaranteed to pick up the updated */
1005 /* nohz_cpu_mask and so will not depend on this cpu. */
1007 smp_mb();
1009 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
1010 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
1011 (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
1012 cpu_clear(cpu, nohz_cpu_mask);
1013 j = jiffies + 1;
1014 }
1016 singleshot.timeout_abs_ns = jiffies_to_st(j);
1017 singleshot.flags = 0;
1018 rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
1019 #if CONFIG_XEN_COMPAT <= 0x030004
1020 if (rc) {
1021 BUG_ON(rc != -ENOSYS);
1022 rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
1023 }
1024 #endif
1025 BUG_ON(rc);
1026 }
1028 static void start_hz_timer(void)
1029 {
1030 cpu_clear(smp_processor_id(), nohz_cpu_mask);
1031 }
1033 void raw_safe_halt(void)
1034 {
1035 stop_hz_timer();
1036 /* Blocking includes an implicit local_irq_enable(). */
1037 HYPERVISOR_block();
1038 start_hz_timer();
1039 }
1040 EXPORT_SYMBOL(raw_safe_halt);
1042 void halt(void)
1043 {
1044 if (irqs_disabled())
1045 VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
1046 }
1047 EXPORT_SYMBOL(halt);
1049 /* No locking required. Interrupts are disabled on all CPUs. */
1050 void time_resume(void)
1051 {
1052 unsigned int cpu;
1054 init_cpu_khz();
1056 for_each_online_cpu(cpu) {
1057 switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
1058 &xen_set_periodic_tick)) {
1059 case 0:
1060 #if CONFIG_XEN_COMPAT <= 0x030004
1061 case -ENOSYS:
1062 #endif
1063 break;
1064 default:
1065 BUG();
1066 }
1067 get_time_values_from_xen(cpu);
1068 per_cpu(processed_system_time, cpu) =
1069 per_cpu(shadow_time, 0).system_timestamp;
1070 init_missing_ticks_accounting(cpu);
1071 }
1073 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
1075 update_wallclock();
1076 }
1078 #ifdef CONFIG_SMP
1079 static char timer_name[NR_CPUS][15];
1081 int __cpuinit local_setup_timer(unsigned int cpu)
1082 {
1083 int seq, irq;
1085 BUG_ON(cpu == 0);
1087 switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
1088 &xen_set_periodic_tick)) {
1089 case 0:
1090 #if CONFIG_XEN_COMPAT <= 0x030004
1091 case -ENOSYS:
1092 #endif
1093 break;
1094 default:
1095 BUG();
1096 }
1098 do {
1099 seq = read_seqbegin(&xtime_lock);
1100 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1101 per_cpu(processed_system_time, cpu) =
1102 per_cpu(shadow_time, 0).system_timestamp;
1103 init_missing_ticks_accounting(cpu);
1104 } while (read_seqretry(&xtime_lock, seq));
1106 sprintf(timer_name[cpu], "timer%u", cpu);
1107 irq = bind_virq_to_irqhandler(VIRQ_TIMER,
1108 cpu,
1109 timer_interrupt,
1110 SA_INTERRUPT,
1111 timer_name[cpu],
1112 NULL);
1113 if (irq < 0)
1114 return irq;
1115 per_cpu(timer_irq, cpu) = irq;
1117 return 0;
1118 }
1120 void __cpuexit local_teardown_timer(unsigned int cpu)
1121 {
1122 BUG_ON(cpu == 0);
1123 unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1124 }
1125 #endif
1127 #ifdef CONFIG_CPU_FREQ
1128 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
1129 void *data)
1130 {
1131 struct cpufreq_freqs *freq = data;
1132 struct xen_platform_op op;
1134 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
1135 return 0;
1137 if (val == CPUFREQ_PRECHANGE)
1138 return 0;
1140 op.cmd = XENPF_change_freq;
1141 op.u.change_freq.flags = 0;
1142 op.u.change_freq.cpu = freq->cpu;
1143 op.u.change_freq.freq = (u64)freq->new * 1000;
1144 WARN_ON(HYPERVISOR_platform_op(&op));
1146 return 0;
1147 }
1149 static struct notifier_block time_cpufreq_notifier_block = {
1150 .notifier_call = time_cpufreq_notifier
1151 };
1153 static int __init cpufreq_time_setup(void)
1154 {
1155 if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
1156 CPUFREQ_TRANSITION_NOTIFIER)) {
1157 printk(KERN_ERR "failed to set up cpufreq notifier\n");
1158 return -ENODEV;
1159 }
1160 return 0;
1161 }
1163 core_initcall(cpufreq_time_setup);
1164 #endif
1166 /*
1167 * /proc/sys/xen: This really belongs in another file. It can stay here for
1168 * now however.
1169 */
1170 static ctl_table xen_subtable[] = {
1171 {
1172 .ctl_name = 1,
1173 .procname = "independent_wallclock",
1174 .data = &independent_wallclock,
1175 .maxlen = sizeof(independent_wallclock),
1176 .mode = 0644,
1177 .proc_handler = proc_dointvec
1178 },
1179 {
1180 .ctl_name = 2,
1181 .procname = "permitted_clock_jitter",
1182 .data = &permitted_clock_jitter,
1183 .maxlen = sizeof(permitted_clock_jitter),
1184 .mode = 0644,
1185 .proc_handler = proc_doulongvec_minmax
1186 },
1187 { 0 }
1188 };
1189 static ctl_table xen_table[] = {
1190 {
1191 .ctl_name = 123,
1192 .procname = "xen",
1193 .mode = 0555,
1194 .child = xen_subtable},
1195 { 0 }
1196 };
1197 static int __init xen_sysctl_init(void)
1198 {
1199 (void)register_sysctl_table(xen_table, 0);
1200 return 0;
1201 }
1202 __initcall(xen_sysctl_init);