ia64/xen-unstable

linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 10794:79f4c91c5628

Fix Linux so that it does not set a timeout if there are no pending
timers. Fix Xen so that it does not immediately fire a timer event if
it sees a very long timeout -- sometimes this means that there are
no pending timers.
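(A sketch of the Linux-side convention, as implemented in jiffies_to_st() and
stop_hz_timer() further down in this file: a timeout very far in the future is
reported to Xen as a system-time value of 0, which Xen can treat as "no
pending timer" rather than as an already-expired event.)

delta = j - jiffies;
if (((unsigned long)delta >> (BITS_PER_LONG - 3)) != 0)
st = 0; /* huge timeout: no pending timer */
else
st = processed_system_time + delta * (u64)NS_PER_TICK;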

Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Jul 25 17:01:49 2006 +0100 (2006-07-25)
parents 4b45f7f62dc7
children d8338b28bcd6
line source
1 /*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because it was not accounting for lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
65 #include "mach_time.h"
67 #include <linux/timex.h>
68 #include <linux/config.h>
70 #include <asm/hpet.h>
72 #include <asm/arch_hooks.h>
74 #include <xen/evtchn.h>
75 #include <xen/interface/vcpu.h>
77 #if defined (__i386__)
78 #include <asm/i8259.h>
79 #endif
81 int pit_latch_buggy; /* extern */
83 #if defined(__x86_64__)
84 unsigned long vxtime_hz = PIT_TICK_RATE;
85 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
86 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
87 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
88 struct timespec __xtime __section_xtime;
89 struct timezone __sys_tz __section_sys_tz;
90 #endif
92 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
93 EXPORT_SYMBOL(cpu_khz);
95 extern unsigned long wall_jiffies;
97 DEFINE_SPINLOCK(rtc_lock);
98 EXPORT_SYMBOL(rtc_lock);
100 #if defined (__i386__)
101 #include <asm/i8253.h>
102 #endif
104 DEFINE_SPINLOCK(i8253_lock);
105 EXPORT_SYMBOL(i8253_lock);
107 extern struct init_timer_opts timer_tsc_init;
108 extern struct timer_opts timer_tsc;
109 #define timer_none timer_tsc
110 struct timer_opts *cur_timer __read_mostly = &timer_tsc;
112 /* These are periodically updated in shared_info, and then copied here. */
113 struct shadow_time_info {
114 u64 tsc_timestamp; /* TSC at last update of time vals. */
115 u64 system_timestamp; /* Time, in nanosecs, since boot. */
116 u32 tsc_to_nsec_mul;
117 u32 tsc_to_usec_mul;
118 int tsc_shift;
119 u32 version;
120 };
121 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
122 static struct timespec shadow_tv;
123 static u32 shadow_tv_version;
125 /* Keep track of last time we did processing/updating of jiffies and xtime. */
126 static u64 processed_system_time; /* System time (ns) at last processing. */
127 static DEFINE_PER_CPU(u64, processed_system_time);
129 /* How much CPU time was spent blocked and how much was 'stolen'? */
130 static DEFINE_PER_CPU(u64, processed_stolen_time);
131 static DEFINE_PER_CPU(u64, processed_blocked_time);
133 /* Current runstate of each CPU (updated automatically by the hypervisor). */
134 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
136 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
137 #define NS_PER_TICK (1000000000LL/HZ)
139 static inline void __normalize_time(time_t *sec, s64 *nsec)
140 {
141 while (*nsec >= NSEC_PER_SEC) {
142 (*nsec) -= NSEC_PER_SEC;
143 (*sec)++;
144 }
145 while (*nsec < 0) {
146 (*nsec) += NSEC_PER_SEC;
147 (*sec)--;
148 }
149 }
151 /* Does this guest OS track Xen time, or set its wall clock independently? */
152 static int independent_wallclock = 0;
153 static int __init __independent_wallclock(char *str)
154 {
155 independent_wallclock = 1;
156 return 1;
157 }
158 __setup("independent_wallclock", __independent_wallclock);
160 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
161 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
162 static int __init __permitted_clock_jitter(char *str)
163 {
164 permitted_clock_jitter = simple_strtoul(str, NULL, 0);
165 return 1;
166 }
167 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
169 int tsc_disable __devinitdata = 0;
171 static void delay_tsc(unsigned long loops)
172 {
173 unsigned long bclock, now;
175 rdtscl(bclock);
176 do {
177 rep_nop();
178 rdtscl(now);
179 } while ((now - bclock) < loops);
180 }
182 struct timer_opts timer_tsc = {
183 .name = "tsc",
184 .delay = delay_tsc,
185 };
187 /*
188 * Scale a 64-bit delta by a 32-bit fixed-point fraction (applying a
189 * power-of-two pre-shift first), yielding a 64-bit result.
190 */
191 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
192 {
193 u64 product;
194 #ifdef __i386__
195 u32 tmp1, tmp2;
196 #endif
198 if (shift < 0)
199 delta >>= -shift;
200 else
201 delta <<= shift;
203 #ifdef __i386__
204 __asm__ (
205 "mul %5 ; "
206 "mov %4,%%eax ; "
207 "mov %%edx,%4 ; "
208 "mul %5 ; "
209 "xor %5,%5 ; "
210 "add %4,%%eax ; "
211 "adc %5,%%edx ; "
212 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
213 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
214 #else
215 __asm__ (
216 "mul %%rdx ; shrd $32,%%rdx,%%rax"
217 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
218 #endif
220 return product;
221 }
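/*
* In effect, scale_delta(delta, mul_frac, shift) computes
* ((delta << shift) * mul_frac) >> 32
* i.e. delta is pre-scaled by 2^shift and then multiplied by the fixed-point
* fraction mul_frac / 2^32. For example, mul_frac = 0x80000000 (0.5) together
* with shift = 1 leaves delta unchanged. Xen supplies tsc_to_system_mul and
* tsc_shift so that this converts elapsed TSC cycles into nanoseconds.
*/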
223 #if defined (__i386__)
224 int read_current_timer(unsigned long *timer_val)
225 {
226 rdtscl(*timer_val);
227 return 0;
228 }
229 #endif
231 void init_cpu_khz(void)
232 {
233 u64 __cpu_khz = 1000000ULL << 32;
234 struct vcpu_time_info *info;
235 info = &HYPERVISOR_shared_info->vcpu_info[0].time;
236 do_div(__cpu_khz, info->tsc_to_system_mul);
237 if (info->tsc_shift < 0)
238 cpu_khz = __cpu_khz << -info->tsc_shift;
239 else
240 cpu_khz = __cpu_khz >> info->tsc_shift;
241 }
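/*
* init_cpu_khz() above inverts the TSC-to-nanoseconds conversion exported by
* Xen: since ns = cycles * 2^tsc_shift * tsc_to_system_mul / 2^32, the TSC
* frequency in kHz is 10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift), which
* is exactly what the do_div() and the final shift compute.
*/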
243 static u64 get_nsec_offset(struct shadow_time_info *shadow)
244 {
245 u64 now, delta;
246 rdtscll(now);
247 delta = now - shadow->tsc_timestamp;
248 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
249 }
251 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
252 {
253 u64 now, delta;
254 rdtscll(now);
255 delta = now - shadow->tsc_timestamp;
256 return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
257 }
259 static void __update_wallclock(time_t sec, long nsec)
260 {
261 long wtm_nsec, xtime_nsec;
262 time_t wtm_sec, xtime_sec;
263 u64 tmp, wc_nsec;
265 /* Adjust wall-clock time base based on wall_jiffies ticks. */
266 wc_nsec = processed_system_time;
267 wc_nsec += sec * (u64)NSEC_PER_SEC;
268 wc_nsec += nsec;
269 wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
271 /* Split wallclock base into seconds and nanoseconds. */
272 tmp = wc_nsec;
273 xtime_nsec = do_div(tmp, 1000000000);
274 xtime_sec = (time_t)tmp;
276 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
277 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
279 set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
280 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
282 ntp_clear();
283 }
285 static void update_wallclock(void)
286 {
287 shared_info_t *s = HYPERVISOR_shared_info;
289 do {
290 shadow_tv_version = s->wc_version;
291 rmb();
292 shadow_tv.tv_sec = s->wc_sec;
293 shadow_tv.tv_nsec = s->wc_nsec;
294 rmb();
295 } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
297 if (!independent_wallclock)
298 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
299 }
301 /*
302 * Reads a consistent set of time-base values from Xen, into a shadow data
303 * area.
304 */
305 static void get_time_values_from_xen(void)
306 {
307 shared_info_t *s = HYPERVISOR_shared_info;
308 struct vcpu_time_info *src;
309 struct shadow_time_info *dst;
311 src = &s->vcpu_info[smp_processor_id()].time;
312 dst = &per_cpu(shadow_time, smp_processor_id());
314 do {
315 dst->version = src->version;
316 rmb();
317 dst->tsc_timestamp = src->tsc_timestamp;
318 dst->system_timestamp = src->system_time;
319 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
320 dst->tsc_shift = src->tsc_shift;
321 rmb();
322 } while ((src->version & 1) | (dst->version ^ src->version));
324 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
325 }
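/*
* The version field acts as a Xen-side seqlock: the hypervisor sets it to an
* odd value before updating a vcpu_time_info record and to the next even
* value once the update is complete. The copy loop above therefore retries
* whenever an update was in progress (odd version) or the record changed
* while being copied (version mismatch). update_wallclock() uses the same
* scheme with wc_version.
*/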
327 static inline int time_values_up_to_date(int cpu)
328 {
329 struct vcpu_time_info *src;
330 struct shadow_time_info *dst;
332 src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
333 dst = &per_cpu(shadow_time, cpu);
335 rmb();
336 return (dst->version == src->version);
337 }
339 /*
340 * This is a special lock that is owned by the CPU and holds the index
341 * register we are working with. It is required for NMI access to the
342 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
343 */
344 volatile unsigned long cmos_lock = 0;
345 EXPORT_SYMBOL(cmos_lock);
347 /* Routines for accessing the CMOS RAM/RTC. */
348 unsigned char rtc_cmos_read(unsigned char addr)
349 {
350 unsigned char val;
351 lock_cmos_prefix(addr);
352 outb_p(addr, RTC_PORT(0));
353 val = inb_p(RTC_PORT(1));
354 lock_cmos_suffix(addr);
355 return val;
356 }
357 EXPORT_SYMBOL(rtc_cmos_read);
359 void rtc_cmos_write(unsigned char val, unsigned char addr)
360 {
361 lock_cmos_prefix(addr);
362 outb_p(addr, RTC_PORT(0));
363 outb_p(val, RTC_PORT(1));
364 lock_cmos_suffix(addr);
365 }
366 EXPORT_SYMBOL(rtc_cmos_write);
368 /*
369 * This version of gettimeofday has microsecond resolution
370 * and better than microsecond precision on fast x86 machines with TSC.
371 */
372 void do_gettimeofday(struct timeval *tv)
373 {
374 unsigned long seq;
375 unsigned long usec, sec;
376 unsigned long max_ntp_tick;
377 s64 nsec;
378 unsigned int cpu;
379 struct shadow_time_info *shadow;
380 u32 local_time_version;
382 cpu = get_cpu();
383 shadow = &per_cpu(shadow_time, cpu);
385 do {
386 unsigned long lost;
388 local_time_version = shadow->version;
389 seq = read_seqbegin(&xtime_lock);
391 usec = get_usec_offset(shadow);
392 lost = jiffies - wall_jiffies;
394 /*
395 * If time_adjust is negative then NTP is slowing the clock
396 * so make sure not to go into next possible interval.
397 * Better to lose some accuracy than have time go backwards.
398 */
399 if (unlikely(time_adjust < 0)) {
400 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
401 usec = min(usec, max_ntp_tick);
403 if (lost)
404 usec += lost * max_ntp_tick;
405 }
406 else if (unlikely(lost))
407 usec += lost * (USEC_PER_SEC / HZ);
409 sec = xtime.tv_sec;
410 usec += (xtime.tv_nsec / NSEC_PER_USEC);
412 nsec = shadow->system_timestamp - processed_system_time;
413 __normalize_time(&sec, &nsec);
414 usec += (long)nsec / NSEC_PER_USEC;
416 if (unlikely(!time_values_up_to_date(cpu))) {
417 /*
418 * We may have blocked for a long time,
419 * rendering our calculations invalid
420 * (e.g. the time delta may have
421 * overflowed). Detect that and recalculate
422 * with fresh values.
423 */
424 get_time_values_from_xen();
425 continue;
426 }
427 } while (read_seqretry(&xtime_lock, seq) ||
428 (local_time_version != shadow->version));
430 put_cpu();
432 while (usec >= USEC_PER_SEC) {
433 usec -= USEC_PER_SEC;
434 sec++;
435 }
437 tv->tv_sec = sec;
438 tv->tv_usec = usec;
439 }
441 EXPORT_SYMBOL(do_gettimeofday);
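/*
* Summary of the composition above: the returned wall-clock time is xtime
* (plus any jiffies not yet folded into it), plus the slice of Xen system
* time not yet processed into jiffies (shadow->system_timestamp -
* processed_system_time), plus the TSC-derived offset accumulated since the
* last shadow snapshot.
*/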
443 int do_settimeofday(struct timespec *tv)
444 {
445 time_t sec;
446 s64 nsec;
447 unsigned int cpu;
448 struct shadow_time_info *shadow;
449 dom0_op_t op;
451 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
452 return -EINVAL;
454 cpu = get_cpu();
455 shadow = &per_cpu(shadow_time, cpu);
457 write_seqlock_irq(&xtime_lock);
459 /*
460 * Ensure we haven't been blocked for so long that our time delta
461 * overflows. If that has happened then our shadow time values will
462 * be stale, so retry with fresh ones.
463 */
464 for (;;) {
465 nsec = tv->tv_nsec - get_nsec_offset(shadow);
466 if (time_values_up_to_date(cpu))
467 break;
468 get_time_values_from_xen();
469 }
470 sec = tv->tv_sec;
471 __normalize_time(&sec, &nsec);
473 if ((xen_start_info->flags & SIF_INITDOMAIN) &&
474 !independent_wallclock) {
475 op.cmd = DOM0_SETTIME;
476 op.u.settime.secs = sec;
477 op.u.settime.nsecs = nsec;
478 op.u.settime.system_time = shadow->system_timestamp;
479 HYPERVISOR_dom0_op(&op);
480 update_wallclock();
481 } else if (independent_wallclock) {
482 nsec -= shadow->system_timestamp;
483 __normalize_time(&sec, &nsec);
484 __update_wallclock(sec, nsec);
485 }
487 write_sequnlock_irq(&xtime_lock);
489 put_cpu();
491 clock_was_set();
492 return 0;
493 }
495 EXPORT_SYMBOL(do_settimeofday);
497 static void sync_xen_wallclock(unsigned long dummy);
498 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
499 static void sync_xen_wallclock(unsigned long dummy)
500 {
501 time_t sec;
502 s64 nsec;
503 dom0_op_t op;
505 if (!ntp_synced() || independent_wallclock ||
506 !(xen_start_info->flags & SIF_INITDOMAIN))
507 return;
509 write_seqlock_irq(&xtime_lock);
511 sec = xtime.tv_sec;
512 nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
513 __normalize_time(&sec, &nsec);
515 op.cmd = DOM0_SETTIME;
516 op.u.settime.secs = sec;
517 op.u.settime.nsecs = nsec;
518 op.u.settime.system_time = processed_system_time;
519 HYPERVISOR_dom0_op(&op);
521 update_wallclock();
523 write_sequnlock_irq(&xtime_lock);
525 /* Once per minute. */
526 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
527 }
529 static int set_rtc_mmss(unsigned long nowtime)
530 {
531 int retval;
533 WARN_ON(irqs_disabled());
535 if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
536 return 0;
538 /* gets recalled with irq locally disabled */
539 spin_lock_irq(&rtc_lock);
540 if (efi_enabled)
541 retval = efi_set_rtc_mmss(nowtime);
542 else
543 retval = mach_set_rtc_mmss(nowtime);
544 spin_unlock_irq(&rtc_lock);
546 return retval;
547 }
549 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
550 * Note: This function is required to return accurate
551 * time even in the absence of multiple timer ticks.
552 */
553 unsigned long long monotonic_clock(void)
554 {
555 int cpu = get_cpu();
556 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
557 u64 time;
558 u32 local_time_version;
560 do {
561 local_time_version = shadow->version;
562 barrier();
563 time = shadow->system_timestamp + get_nsec_offset(shadow);
564 if (!time_values_up_to_date(cpu))
565 get_time_values_from_xen();
566 barrier();
567 } while (local_time_version != shadow->version);
569 put_cpu();
571 return time;
572 }
573 EXPORT_SYMBOL(monotonic_clock);
575 unsigned long long sched_clock(void)
576 {
577 return monotonic_clock();
578 }
580 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
581 unsigned long profile_pc(struct pt_regs *regs)
582 {
583 unsigned long pc = instruction_pointer(regs);
585 #ifdef __x86_64__
586 /* Assume the lock function has either no stack frame or only a single word.
587 This checks if the address on the stack looks like a kernel text address.
588 There is a small window for false hits, but in that case the tick
589 is just accounted to the spinlock function.
590 Better would be to write these functions in assembler again
591 and check exactly. */
592 if (in_lock_functions(pc)) {
593 char *v = *(char **)regs->rsp;
594 if ((v >= _stext && v <= _etext) ||
595 (v >= _sinittext && v <= _einittext) ||
596 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
597 return (unsigned long)v;
598 return ((unsigned long *)regs->rsp)[1];
599 }
600 #else
601 if (in_lock_functions(pc))
602 return *(unsigned long *)(regs->ebp + 4);
603 #endif
605 return pc;
606 }
607 EXPORT_SYMBOL(profile_pc);
608 #endif
610 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
611 {
612 s64 delta, delta_cpu, stolen, blocked;
613 u64 sched_time;
614 int i, cpu = smp_processor_id();
615 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
616 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
618 write_seqlock(&xtime_lock);
620 do {
621 get_time_values_from_xen();
623 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
624 delta = delta_cpu =
625 shadow->system_timestamp + get_nsec_offset(shadow);
626 delta -= processed_system_time;
627 delta_cpu -= per_cpu(processed_system_time, cpu);
629 /*
630 * Obtain a consistent snapshot of stolen/blocked cycles. We
631 * can use state_entry_time to detect if we get preempted here.
632 */
633 do {
634 sched_time = runstate->state_entry_time;
635 barrier();
636 stolen = runstate->time[RUNSTATE_runnable] +
637 runstate->time[RUNSTATE_offline] -
638 per_cpu(processed_stolen_time, cpu);
639 blocked = runstate->time[RUNSTATE_blocked] -
640 per_cpu(processed_blocked_time, cpu);
641 barrier();
642 } while (sched_time != runstate->state_entry_time);
643 } while (!time_values_up_to_date(cpu));
645 if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
646 unlikely(delta_cpu < -(s64)permitted_clock_jitter))
647 && printk_ratelimit()) {
648 printk("Timer ISR/%d: Time went backwards: "
649 "delta=%lld delta_cpu=%lld shadow=%lld "
650 "off=%lld processed=%lld cpu_processed=%lld\n",
651 cpu, delta, delta_cpu, shadow->system_timestamp,
652 (s64)get_nsec_offset(shadow),
653 processed_system_time,
654 per_cpu(processed_system_time, cpu));
655 for (i = 0; i < num_online_cpus(); i++)
656 printk(" %d: %lld\n", i,
657 per_cpu(processed_system_time, i));
658 }
660 /* System-wide jiffy work. */
661 while (delta >= NS_PER_TICK) {
662 delta -= NS_PER_TICK;
663 processed_system_time += NS_PER_TICK;
664 do_timer(regs);
665 }
667 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
668 update_wallclock();
669 clock_was_set();
670 }
672 write_sequnlock(&xtime_lock);
674 /*
675 * Account stolen ticks.
676 * HACK: Passing NULL to account_steal_time()
677 * ensures that the ticks are accounted as stolen.
678 */
679 if ((stolen > 0) && (delta_cpu > 0)) {
680 delta_cpu -= stolen;
681 if (unlikely(delta_cpu < 0))
682 stolen += delta_cpu; /* clamp local-time progress */
683 do_div(stolen, NS_PER_TICK);
684 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
685 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
686 account_steal_time(NULL, (cputime_t)stolen);
687 }
689 /*
690 * Account blocked ticks.
691 * HACK: Passing idle_task to account_steal_time()
692 * ensures that the ticks are accounted as idle/wait.
693 */
694 if ((blocked > 0) && (delta_cpu > 0)) {
695 delta_cpu -= blocked;
696 if (unlikely(delta_cpu < 0))
697 blocked += delta_cpu; /* clamp local-time progress */
698 do_div(blocked, NS_PER_TICK);
699 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
700 per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
701 account_steal_time(idle_task(cpu), (cputime_t)blocked);
702 }
704 /* Account user/system ticks. */
705 if (delta_cpu > 0) {
706 do_div(delta_cpu, NS_PER_TICK);
707 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
708 if (user_mode(regs))
709 account_user_time(current, (cputime_t)delta_cpu);
710 else
711 account_system_time(current, HARDIRQ_OFFSET,
712 (cputime_t)delta_cpu);
713 }
715 /* Local timer processing (see update_process_times()). */
716 run_local_timers();
717 if (rcu_pending(cpu))
718 rcu_check_callbacks(cpu, user_mode(regs));
719 scheduler_tick();
720 run_posix_cpu_timers(current);
722 return IRQ_HANDLED;
723 }
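/*
* Accounting invariant for the handler above: the per-CPU
* processed_system_time only ever advances by whole ticks of stolen, blocked
* and locally-elapsed time, so sub-tick remainders are not lost; they are
* carried into delta_cpu at the next timer interrupt.
*/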
725 static void init_missing_ticks_accounting(int cpu)
726 {
727 struct vcpu_register_runstate_memory_area area;
728 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
730 memset(runstate, 0, sizeof(*runstate));
732 area.addr.v = runstate;
733 HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
735 per_cpu(processed_blocked_time, cpu) =
736 runstate->time[RUNSTATE_blocked];
737 per_cpu(processed_stolen_time, cpu) =
738 runstate->time[RUNSTATE_runnable] +
739 runstate->time[RUNSTATE_offline];
740 }
742 /* not static: needed by APM */
743 unsigned long get_cmos_time(void)
744 {
745 unsigned long retval;
747 spin_lock(&rtc_lock);
749 if (efi_enabled)
750 retval = efi_get_time();
751 else
752 retval = mach_get_cmos_time();
754 spin_unlock(&rtc_lock);
756 return retval;
757 }
758 EXPORT_SYMBOL(get_cmos_time);
760 static void sync_cmos_clock(unsigned long dummy);
762 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
764 static void sync_cmos_clock(unsigned long dummy)
765 {
766 struct timeval now, next;
767 int fail = 1;
769 /*
770 * If we have an externally synchronized Linux clock, then update
771 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
772 * called as close as possible to 500 ms before the new second starts.
773 * This code is run on a timer. If the clock is set, that timer
774 * may not expire at the correct time. Thus, we adjust...
775 */
776 if (!ntp_synced())
777 /*
778 * Not synced, exit, do not restart a timer (if one is
779 * running, let it run out).
780 */
781 return;
783 do_gettimeofday(&now);
784 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
785 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
786 fail = set_rtc_mmss(now.tv_sec);
788 next.tv_usec = USEC_AFTER - now.tv_usec;
789 if (next.tv_usec <= 0)
790 next.tv_usec += USEC_PER_SEC;
792 if (!fail)
793 next.tv_sec = 659;
794 else
795 next.tv_sec = 0;
797 if (next.tv_usec >= USEC_PER_SEC) {
798 next.tv_sec++;
799 next.tv_usec -= USEC_PER_SEC;
800 }
801 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
802 }
804 void notify_arch_cmos_timer(void)
805 {
806 mod_timer(&sync_cmos_timer, jiffies + 1);
807 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
808 }
810 static long clock_cmos_diff, sleep_start;
812 static struct timer_opts *last_timer;
813 static int timer_suspend(struct sys_device *dev, pm_message_t state)
814 {
815 /*
816 * Estimate time zone so that set_time can update the clock
817 */
818 clock_cmos_diff = -get_cmos_time();
819 clock_cmos_diff += get_seconds();
820 sleep_start = get_cmos_time();
821 last_timer = cur_timer;
822 cur_timer = &timer_none;
823 if (last_timer->suspend)
824 last_timer->suspend(state);
825 return 0;
826 }
828 static int timer_resume(struct sys_device *dev)
829 {
830 unsigned long flags;
831 unsigned long sec;
832 unsigned long sleep_length;
834 #ifdef CONFIG_HPET_TIMER
835 if (is_hpet_enabled())
836 hpet_reenable();
837 #endif
838 sec = get_cmos_time() + clock_cmos_diff;
839 sleep_length = (get_cmos_time() - sleep_start) * HZ;
840 write_seqlock_irqsave(&xtime_lock, flags);
841 xtime.tv_sec = sec;
842 xtime.tv_nsec = 0;
843 jiffies_64 += sleep_length;
844 wall_jiffies += sleep_length;
845 write_sequnlock_irqrestore(&xtime_lock, flags);
846 if (last_timer->resume)
847 last_timer->resume();
848 cur_timer = last_timer;
849 last_timer = NULL;
850 touch_softlockup_watchdog();
851 return 0;
852 }
854 static struct sysdev_class timer_sysclass = {
855 .resume = timer_resume,
856 .suspend = timer_suspend,
857 set_kset_name("timer"),
858 };
861 /* XXX this driverfs stuff should probably go elsewhere later -john */
862 static struct sys_device device_timer = {
863 .id = 0,
864 .cls = &timer_sysclass,
865 };
867 static int time_init_device(void)
868 {
869 int error = sysdev_class_register(&timer_sysclass);
870 if (!error)
871 error = sysdev_register(&device_timer);
872 return error;
873 }
875 device_initcall(time_init_device);
877 #ifdef CONFIG_HPET_TIMER
878 extern void (*late_time_init)(void);
879 /* Duplicate of time_init() below, with hpet_enable part added */
880 static void __init hpet_time_init(void)
881 {
882 xtime.tv_sec = get_cmos_time();
883 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
884 set_normalized_timespec(&wall_to_monotonic,
885 -xtime.tv_sec, -xtime.tv_nsec);
887 if ((hpet_enable() >= 0) && hpet_use_timer) {
888 printk("Using HPET for base-timer\n");
889 }
891 cur_timer = select_timer();
892 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
894 time_init_hook();
895 }
896 #endif
898 /* Dynamically-mapped IRQ. */
899 DEFINE_PER_CPU(int, timer_irq);
901 extern void (*late_time_init)(void);
902 static void setup_cpu0_timer_irq(void)
903 {
904 per_cpu(timer_irq, 0) =
905 bind_virq_to_irqhandler(
906 VIRQ_TIMER,
907 0,
908 timer_interrupt,
909 SA_INTERRUPT,
910 "timer0",
911 NULL);
912 BUG_ON(per_cpu(timer_irq, 0) < 0);
913 }
915 void __init time_init(void)
916 {
917 #ifdef CONFIG_HPET_TIMER
918 if (is_hpet_capable()) {
919 /*
920 * HPET initialization needs to do memory-mapped io. So, let
921 * us do a late initialization after mem_init().
922 */
923 late_time_init = hpet_time_init;
924 return;
925 }
926 #endif
927 get_time_values_from_xen();
929 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
930 per_cpu(processed_system_time, 0) = processed_system_time;
931 init_missing_ticks_accounting(0);
933 update_wallclock();
935 init_cpu_khz();
936 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
937 cpu_khz / 1000, cpu_khz % 1000);
939 #if defined(__x86_64__)
940 vxtime.mode = VXTIME_TSC;
941 vxtime.quot = (1000000L << 32) / vxtime_hz;
942 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
943 sync_core();
944 rdtscll(vxtime.last_tsc);
945 #endif
947 /* Cannot request_irq() until kmem is initialised. */
948 late_time_init = setup_cpu0_timer_irq;
949 }
951 /* Convert jiffies to system time. */
952 u64 jiffies_to_st(unsigned long j)
953 {
954 unsigned long seq;
955 long delta;
956 u64 st;
958 do {
959 seq = read_seqbegin(&xtime_lock);
960 delta = j - jiffies;
961 if (delta < 1) {
962 /* Triggers in some wrap-around cases, but that's okay:
963 * we just end up with a shorter timeout. */
964 st = processed_system_time + NS_PER_TICK;
965 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
966 /* Very long timeout means there is no pending timer.
967 * We indicate this to Xen by passing zero timeout. */
968 st = 0;
969 } else {
970 st = processed_system_time + delta * (u64)NS_PER_TICK;
971 }
972 } while (read_seqretry(&xtime_lock, seq));
974 return st;
975 }
976 EXPORT_SYMBOL(jiffies_to_st);
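/*
* Example: with BITS_PER_LONG == 32 the cut-off above is 2^29 jiffies,
* roughly 62 days at HZ=100. Any timeout at least that far away is passed to
* Xen as st == 0, the "no pending timer" indication described in the
* changeset comment at the top of this file.
*/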
978 /*
979 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
980 * These functions are based on implementations from arch/s390/kernel/time.c
981 */
982 static void stop_hz_timer(void)
983 {
984 unsigned int cpu = smp_processor_id();
985 unsigned long j;
987 cpu_set(cpu, nohz_cpu_mask);
989 /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
990 /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
991 /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
992 /* stop the hz timer then the cpumasks created for subsequent values */
993 /* of cur in rcu_start_batch are guaranteed to pick up the updated */
994 /* nohz_cpu_mask and so will not depend on this cpu. */
996 smp_mb();
998 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
999 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
1000 (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
1001 cpu_clear(cpu, nohz_cpu_mask);
1002 j = jiffies + 1;
1003 }
1005 if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
1006 BUG();
1007 }
1009 static void start_hz_timer(void)
1010 {
1011 cpu_clear(smp_processor_id(), nohz_cpu_mask);
1012 }
1014 void safe_halt(void)
1015 {
1016 stop_hz_timer();
1017 /* Blocking includes an implicit local_irq_enable(). */
1018 HYPERVISOR_block();
1019 start_hz_timer();
1020 }
1021 EXPORT_SYMBOL(safe_halt);
1023 void halt(void)
1024 {
1025 if (irqs_disabled())
1026 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
1027 }
1028 EXPORT_SYMBOL(halt);
1030 /* No locking required. We are the only CPU running, and interrupts are off. */
1031 void time_resume(void)
1032 {
1033 init_cpu_khz();
1035 get_time_values_from_xen();
1037 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
1038 per_cpu(processed_system_time, 0) = processed_system_time;
1039 init_missing_ticks_accounting(0);
1041 update_wallclock();
1042 }
1044 #ifdef CONFIG_SMP
1045 static char timer_name[NR_CPUS][15];
1047 void local_setup_timer(unsigned int cpu)
1048 {
1049 int seq;
1051 BUG_ON(cpu == 0);
1053 do {
1054 seq = read_seqbegin(&xtime_lock);
1055 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1056 per_cpu(processed_system_time, cpu) =
1057 per_cpu(shadow_time, 0).system_timestamp;
1058 init_missing_ticks_accounting(cpu);
1059 } while (read_seqretry(&xtime_lock, seq));
1061 sprintf(timer_name[cpu], "timer%d", cpu);
1062 per_cpu(timer_irq, cpu) =
1063 bind_virq_to_irqhandler(
1064 VIRQ_TIMER,
1065 cpu,
1066 timer_interrupt,
1067 SA_INTERRUPT,
1068 timer_name[cpu],
1069 NULL);
1070 BUG_ON(per_cpu(timer_irq, cpu) < 0);
1071 }
1073 void local_teardown_timer(unsigned int cpu)
1074 {
1075 BUG_ON(cpu == 0);
1076 unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1077 }
1078 #endif
1080 /*
1081 * /proc/sys/xen: This really belongs in another file. It can stay here for
1082 * now however.
1083 */
1084 static ctl_table xen_subtable[] = {
1085 {
1086 .ctl_name = 1,
1087 .procname = "independent_wallclock",
1088 .data = &independent_wallclock,
1089 .maxlen = sizeof(independent_wallclock),
1090 .mode = 0644,
1091 .proc_handler = proc_dointvec
1092 },
1093 {
1094 .ctl_name = 2,
1095 .procname = "permitted_clock_jitter",
1096 .data = &permitted_clock_jitter,
1097 .maxlen = sizeof(permitted_clock_jitter),
1098 .mode = 0644,
1099 .proc_handler = proc_doulongvec_minmax
1100 },
1101 { 0 }
1102 };
1103 static ctl_table xen_table[] = {
1104 {
1105 .ctl_name = 123,
1106 .procname = "xen",
1107 .mode = 0555,
1108 .child = xen_subtable},
1109 { 0 }
1110 };
1111 static int __init xen_sysctl_init(void)
1112 {
1113 (void)register_sysctl_table(xen_table, 0);
1114 return 0;
1115 }
1116 __initcall(xen_sysctl_init);
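/*
* The registration above exposes two runtime tunables:
* /proc/sys/xen/independent_wallclock (mirrors the "independent_wallclock"
* boot parameter registered earlier with __setup())
* /proc/sys/xen/permitted_clock_jitter (nanoseconds; mirrors the
* "permitted_clock_jitter=" boot parameter)
*/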