
view linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 9031:5541ea99106a

Periodically synchronize Xen's wallclock time with NTP-synchronized time in domain0.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sun Feb 26 16:38:59 2006 +0100 (2006-02-26)
parents d0b7281556f2
children e9daf5307296
line source
1 /*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because it was not accounting lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
65 #include "mach_time.h"
67 #include <linux/timex.h>
68 #include <linux/config.h>
70 #include <asm/hpet.h>
72 #include <asm/arch_hooks.h>
74 #include <xen/evtchn.h>
75 #include <xen/interface/vcpu.h>
77 #if defined (__i386__)
78 #include <asm/i8259.h>
79 #endif
81 int pit_latch_buggy; /* extern */
83 #if defined(__x86_64__)
84 unsigned long vxtime_hz = PIT_TICK_RATE;
85 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
86 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
87 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
88 struct timespec __xtime __section_xtime;
89 struct timezone __sys_tz __section_sys_tz;
90 #endif
92 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
93 EXPORT_SYMBOL(cpu_khz);
95 extern unsigned long wall_jiffies;
97 DEFINE_SPINLOCK(rtc_lock);
98 EXPORT_SYMBOL(rtc_lock);
100 #if defined (__i386__)
101 #include <asm/i8253.h>
102 #endif
104 DEFINE_SPINLOCK(i8253_lock);
105 EXPORT_SYMBOL(i8253_lock);
107 extern struct init_timer_opts timer_tsc_init;
108 extern struct timer_opts timer_tsc;
109 #define timer_none timer_tsc
110 struct timer_opts *cur_timer __read_mostly = &timer_tsc;
112 /* These are periodically updated in shared_info, and then copied here. */
113 struct shadow_time_info {
114 u64 tsc_timestamp; /* TSC at last update of time vals. */
115 u64 system_timestamp; /* Time, in nanosecs, since boot. */
116 u32 tsc_to_nsec_mul;
117 u32 tsc_to_usec_mul;
118 int tsc_shift;
119 u32 version;
120 };
121 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
122 static struct timespec shadow_tv;
123 static u32 shadow_tv_version;
125 /* Keep track of last time we did processing/updating of jiffies and xtime. */
126 static u64 processed_system_time; /* System time (ns) at last processing. */
127 static DEFINE_PER_CPU(u64, processed_system_time);
129 /* How much CPU time was spent blocked and how much was 'stolen'? */
130 static DEFINE_PER_CPU(u64, processed_stolen_time);
131 static DEFINE_PER_CPU(u64, processed_blocked_time);
133 /* Current runstate of each CPU (updated automatically by the hypervisor). */
134 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
136 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
137 #define NS_PER_TICK (1000000000LL/HZ)
139 static inline void __normalize_time(time_t *sec, s64 *nsec)
140 {
141 while (*nsec >= NSEC_PER_SEC) {
142 (*nsec) -= NSEC_PER_SEC;
143 (*sec)++;
144 }
145 while (*nsec < 0) {
146 (*nsec) += NSEC_PER_SEC;
147 (*sec)--;
148 }
149 }
151 /* Does this guest OS track Xen time, or set its wall clock independently? */
152 static int independent_wallclock = 0;
153 static int __init __independent_wallclock(char *str)
154 {
155 independent_wallclock = 1;
156 return 1;
157 }
158 __setup("independent_wallclock", __independent_wallclock);
160 int tsc_disable __devinitdata = 0;
162 static void delay_tsc(unsigned long loops)
163 {
164 unsigned long bclock, now;
166 rdtscl(bclock);
167 do
168 {
169 rep_nop();
170 rdtscl(now);
171 } while ((now-bclock) < loops);
172 }
174 struct timer_opts timer_tsc = {
175 .name = "tsc",
176 .delay = delay_tsc,
177 };
179 /*
180 * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
181 * yielding a 64-bit result.
182 */
183 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
184 {
185 u64 product;
186 #ifdef __i386__
187 u32 tmp1, tmp2;
188 #endif
190 if ( shift < 0 )
191 delta >>= -shift;
192 else
193 delta <<= shift;
195 #ifdef __i386__
196 __asm__ (
197 "mul %5 ; "
198 "mov %4,%%eax ; "
199 "mov %%edx,%4 ; "
200 "mul %5 ; "
201 "xor %5,%5 ; "
202 "add %4,%%eax ; "
203 "adc %5,%%edx ; "
204 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
205 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
206 #else
207 __asm__ (
208 "mul %%rdx ; shrd $32,%%rdx,%%rax"
209 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
210 #endif
212 return product;
213 }
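/*
 * What the assembly above computes: the (shifted) delta is multiplied by
 * mul_frac and bits 32..95 of the 96-bit product are kept, i.e. delta is
 * scaled by mul_frac / 2^32. A rough portable sketch of the same
 * calculation, assuming a compiler with a 128-bit integer type (GCC's
 * unsigned __int128; the function name below is illustrative only):
 *
 *	static inline u64 scale_delta_sketch(u64 delta, u32 mul_frac, int shift)
 *	{
 *		if (shift < 0)
 *			delta >>= -shift;
 *		else
 *			delta <<= shift;
 *		return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
 *	}
 */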
215 #if defined (__i386__)
216 int read_current_timer(unsigned long *timer_val)
217 {
218 rdtscl(*timer_val);
219 return 0;
220 }
221 #endif
223 void init_cpu_khz(void)
224 {
225 u64 __cpu_khz = 1000000ULL << 32;
226 struct vcpu_time_info *info;
227 info = &HYPERVISOR_shared_info->vcpu_info[0].time;
228 do_div(__cpu_khz, info->tsc_to_system_mul);
229 if ( info->tsc_shift < 0 )
230 cpu_khz = __cpu_khz << -info->tsc_shift;
231 else
232 cpu_khz = __cpu_khz >> info->tsc_shift;
233 }
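/*
 * A sketch of the arithmetic above: Xen defines the conversion
 *	ns = ((tsc_delta << tsc_shift) * tsc_to_system_mul) >> 32
 * so one TSC tick lasts 2^tsc_shift * tsc_to_system_mul / 2^32 ns, and
 *	cpu_khz = (10^6 * 2^32 / tsc_to_system_mul) >> tsc_shift
 * (shifted left when tsc_shift is negative), which is what the do_div()
 * and shift above compute.
 */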
235 static u64 get_nsec_offset(struct shadow_time_info *shadow)
236 {
237 u64 now, delta;
238 rdtscll(now);
239 delta = now - shadow->tsc_timestamp;
240 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
241 }
243 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
244 {
245 u64 now, delta;
246 rdtscll(now);
247 delta = now - shadow->tsc_timestamp;
248 return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
249 }
251 static void __update_wallclock(time_t sec, long nsec)
252 {
253 long wtm_nsec, xtime_nsec;
254 time_t wtm_sec, xtime_sec;
255 u64 tmp, wc_nsec;
257 /* Adjust wall-clock time base based on wall_jiffies ticks. */
258 wc_nsec = processed_system_time;
259 wc_nsec += sec * (u64)NSEC_PER_SEC;
260 wc_nsec += nsec;
261 wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
263 /* Split wallclock base into seconds and nanoseconds. */
264 tmp = wc_nsec;
265 xtime_nsec = do_div(tmp, 1000000000);
266 xtime_sec = (time_t)tmp;
268 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
269 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
271 set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
272 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
274 ntp_clear();
275 }
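/*
 * The arithmetic above, spelled out: the (sec, nsec) pair passed in is the
 * wall time at Xen system time zero, so adding processed_system_time gives
 * the wall time at the last processed tick (i.e. at 'jiffies'); subtracting
 * (jiffies - wall_jiffies) ticks re-bases that onto wall_jiffies, which is
 * the epoch xtime is defined against.
 */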
277 static void update_wallclock(void)
278 {
279 shared_info_t *s = HYPERVISOR_shared_info;
281 do {
282 shadow_tv_version = s->wc_version;
283 rmb();
284 shadow_tv.tv_sec = s->wc_sec;
285 shadow_tv.tv_nsec = s->wc_nsec;
286 rmb();
287 }
288 while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
290 if (!independent_wallclock)
291 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
292 }
294 /*
295 * Reads a consistent set of time-base values from Xen, into a shadow data
296 * area.
297 */
298 static void get_time_values_from_xen(void)
299 {
300 shared_info_t *s = HYPERVISOR_shared_info;
301 struct vcpu_time_info *src;
302 struct shadow_time_info *dst;
304 src = &s->vcpu_info[smp_processor_id()].time;
305 dst = &per_cpu(shadow_time, smp_processor_id());
307 do {
308 dst->version = src->version;
309 rmb();
310 dst->tsc_timestamp = src->tsc_timestamp;
311 dst->system_timestamp = src->system_time;
312 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
313 dst->tsc_shift = src->tsc_shift;
314 rmb();
315 }
316 while ((src->version & 1) | (dst->version ^ src->version));
318 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
319 }
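/*
 * Note on the retry loop above (and the similar one in update_wallclock()):
 * Xen increments the version field before and after it rewrites the time
 * values, so an odd version means an update is in progress, and a version
 * that changed under us means our copy may be torn. We therefore re-read
 * until we observe the same even version before and after the copy.
 */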
321 static inline int time_values_up_to_date(int cpu)
322 {
323 struct vcpu_time_info *src;
324 struct shadow_time_info *dst;
326 src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
327 dst = &per_cpu(shadow_time, cpu);
329 return (dst->version == src->version);
330 }
332 /*
333 * This is a special lock that is owned by the CPU and holds the index
334 * register we are working with. It is required for NMI access to the
335 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
336 */
337 volatile unsigned long cmos_lock = 0;
338 EXPORT_SYMBOL(cmos_lock);
340 /* Routines for accessing the CMOS RAM/RTC. */
341 unsigned char rtc_cmos_read(unsigned char addr)
342 {
343 unsigned char val;
344 lock_cmos_prefix(addr);
345 outb_p(addr, RTC_PORT(0));
346 val = inb_p(RTC_PORT(1));
347 lock_cmos_suffix(addr);
348 return val;
349 }
350 EXPORT_SYMBOL(rtc_cmos_read);
352 void rtc_cmos_write(unsigned char val, unsigned char addr)
353 {
354 lock_cmos_prefix(addr);
355 outb_p(addr, RTC_PORT(0));
356 outb_p(val, RTC_PORT(1));
357 lock_cmos_suffix(addr);
358 }
359 EXPORT_SYMBOL(rtc_cmos_write);
361 /*
362 * This version of gettimeofday has microsecond resolution
363 * and better than microsecond precision on fast x86 machines with TSC.
364 */
365 void do_gettimeofday(struct timeval *tv)
366 {
367 unsigned long seq;
368 unsigned long usec, sec;
369 unsigned long max_ntp_tick;
370 s64 nsec;
371 unsigned int cpu;
372 struct shadow_time_info *shadow;
373 u32 local_time_version;
375 cpu = get_cpu();
376 shadow = &per_cpu(shadow_time, cpu);
378 do {
379 unsigned long lost;
381 local_time_version = shadow->version;
382 seq = read_seqbegin(&xtime_lock);
384 usec = get_usec_offset(shadow);
385 lost = jiffies - wall_jiffies;
387 /*
388 * If time_adjust is negative then NTP is slowing the clock
389 * so make sure not to go into next possible interval.
390 * Better to lose some accuracy than have time go backwards..
391 */
392 if (unlikely(time_adjust < 0)) {
393 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
394 usec = min(usec, max_ntp_tick);
396 if (lost)
397 usec += lost * max_ntp_tick;
398 }
399 else if (unlikely(lost))
400 usec += lost * (USEC_PER_SEC / HZ);
402 sec = xtime.tv_sec;
403 usec += (xtime.tv_nsec / NSEC_PER_USEC);
405 nsec = shadow->system_timestamp - processed_system_time;
406 __normalize_time(&sec, &nsec);
407 usec += (long)nsec / NSEC_PER_USEC;
409 if (unlikely(!time_values_up_to_date(cpu))) {
410 /*
411 * We may have blocked for a long time,
412 * rendering our calculations invalid
413 * (e.g. the time delta may have
414 * overflowed). Detect that and recalculate
415 * with fresh values.
416 */
417 get_time_values_from_xen();
418 continue;
419 }
420 } while (read_seqretry(&xtime_lock, seq) ||
421 (local_time_version != shadow->version));
423 put_cpu();
425 while (usec >= USEC_PER_SEC) {
426 usec -= USEC_PER_SEC;
427 sec++;
428 }
430 tv->tv_sec = sec;
431 tv->tv_usec = usec;
432 }
434 EXPORT_SYMBOL(do_gettimeofday);
436 int do_settimeofday(struct timespec *tv)
437 {
438 time_t sec;
439 s64 nsec;
440 unsigned int cpu;
441 struct shadow_time_info *shadow;
442 dom0_op_t op;
444 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
445 return -EINVAL;
447 cpu = get_cpu();
448 shadow = &per_cpu(shadow_time, cpu);
450 write_seqlock_irq(&xtime_lock);
452 /*
453 * If we are blocked for a long time, our time delta may overflow and our
454 * shadow time values may go stale. Detect that case and retry with fresh
455 * values from Xen.
456 */
457 for ( ; ; ) {
458 nsec = tv->tv_nsec - get_nsec_offset(shadow);
459 if (time_values_up_to_date(cpu))
460 break;
461 get_time_values_from_xen();
462 }
463 sec = tv->tv_sec;
464 __normalize_time(&sec, &nsec);
466 if ((xen_start_info->flags & SIF_INITDOMAIN) &&
467 !independent_wallclock) {
468 op.cmd = DOM0_SETTIME;
469 op.u.settime.secs = sec;
470 op.u.settime.nsecs = nsec;
471 op.u.settime.system_time = shadow->system_timestamp;
472 HYPERVISOR_dom0_op(&op);
473 update_wallclock();
474 } else if (independent_wallclock) {
475 nsec -= shadow->system_timestamp;
476 __normalize_time(&sec, &nsec);
477 __update_wallclock(sec, nsec);
478 }
480 write_sequnlock_irq(&xtime_lock);
482 put_cpu();
484 clock_was_set();
485 return 0;
486 }
488 EXPORT_SYMBOL(do_settimeofday);
490 static void sync_xen_wallclock(unsigned long dummy);
491 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
492 static void sync_xen_wallclock(unsigned long dummy)
493 {
494 time_t sec;
495 s64 nsec;
496 dom0_op_t op;
498 if (!ntp_synced() || independent_wallclock ||
499 !(xen_start_info->flags & SIF_INITDOMAIN))
500 return;
502 write_seqlock_irq(&xtime_lock);
504 sec = xtime.tv_sec;
505 nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
506 __normalize_time(&sec, &nsec);
508 op.cmd = DOM0_SETTIME;
509 op.u.settime.secs = sec;
510 op.u.settime.nsecs = nsec;
511 op.u.settime.system_time = processed_system_time;
512 HYPERVISOR_dom0_op(&op);
514 update_wallclock();
516 write_sequnlock_irq(&xtime_lock);
518 /* Once per minute. */
519 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
520 }
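/*
 * This is the periodic resync described in the changeset description above:
 * once a minute, and only in the initial domain and only while NTP reports
 * the clock as synchronized, dom0 pushes its wall-clock time back into Xen
 * with DOM0_SETTIME, so Xen's wallclock (and any guest that tracks it)
 * follows dom0's NTP-disciplined time.
 */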
522 static int set_rtc_mmss(unsigned long nowtime)
523 {
524 int retval;
526 WARN_ON(irqs_disabled());
528 if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
529 return 0;
531 /* gets recalled with irq locally disabled */
532 spin_lock_irq(&rtc_lock);
533 if (efi_enabled)
534 retval = efi_set_rtc_mmss(nowtime);
535 else
536 retval = mach_set_rtc_mmss(nowtime);
537 spin_unlock_irq(&rtc_lock);
539 return retval;
540 }
542 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
543 * Note: This function is required to return accurate
544 * time even in the absence of multiple timer ticks.
545 */
546 unsigned long long monotonic_clock(void)
547 {
548 int cpu = get_cpu();
549 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
550 u64 time;
551 u32 local_time_version;
553 do {
554 local_time_version = shadow->version;
555 smp_rmb();
556 time = shadow->system_timestamp + get_nsec_offset(shadow);
557 if (!time_values_up_to_date(cpu))
558 get_time_values_from_xen();
559 smp_rmb();
560 } while (local_time_version != shadow->version);
562 put_cpu();
564 return time;
565 }
566 EXPORT_SYMBOL(monotonic_clock);
568 unsigned long long sched_clock(void)
569 {
570 return monotonic_clock();
571 }
573 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
574 unsigned long profile_pc(struct pt_regs *regs)
575 {
576 unsigned long pc = instruction_pointer(regs);
578 #ifdef __x86_64__
579 /* Assume the lock function has either no stack frame or only a single word.
580 This checks if the address on the stack looks like a kernel text address.
581 There is a small window for false hits, but in that case the tick
582 is just accounted to the spinlock function.
583 Better would be to write these functions in assembler again
584 and check exactly. */
585 if (in_lock_functions(pc)) {
586 char *v = *(char **)regs->rsp;
587 if ((v >= _stext && v <= _etext) ||
588 (v >= _sinittext && v <= _einittext) ||
589 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
590 return (unsigned long)v;
591 return ((unsigned long *)regs->rsp)[1];
592 }
593 #else
594 if (in_lock_functions(pc))
595 return *(unsigned long *)(regs->ebp + 4);
596 #endif
598 return pc;
599 }
600 EXPORT_SYMBOL(profile_pc);
601 #endif
603 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
604 {
605 s64 delta, delta_cpu, stolen, blocked;
606 u64 sched_time;
607 int i, cpu = smp_processor_id();
608 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
609 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
611 write_seqlock(&xtime_lock);
613 do {
614 get_time_values_from_xen();
616 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
617 delta = delta_cpu =
618 shadow->system_timestamp + get_nsec_offset(shadow);
619 delta -= processed_system_time;
620 delta_cpu -= per_cpu(processed_system_time, cpu);
622 /*
623 * Obtain a consistent snapshot of stolen/blocked cycles. We
624 * can use state_entry_time to detect if we get preempted here.
625 */
626 do {
627 sched_time = runstate->state_entry_time;
628 barrier();
629 stolen = runstate->time[RUNSTATE_runnable] +
630 runstate->time[RUNSTATE_offline] -
631 per_cpu(processed_stolen_time, cpu);
632 blocked = runstate->time[RUNSTATE_blocked] -
633 per_cpu(processed_blocked_time, cpu);
634 barrier();
635 } while (sched_time != runstate->state_entry_time);
636 }
637 while (!time_values_up_to_date(cpu));
639 if ((unlikely(delta < -1000000LL) || unlikely(delta_cpu < 0))
640 && printk_ratelimit()) {
641 printk("Timer ISR/%d: Time went backwards: "
642 "delta=%lld cpu_delta=%lld shadow=%lld "
643 "off=%lld processed=%lld cpu_processed=%lld\n",
644 cpu, delta, delta_cpu, shadow->system_timestamp,
645 (s64)get_nsec_offset(shadow),
646 processed_system_time,
647 per_cpu(processed_system_time, cpu));
648 for (i = 0; i < num_online_cpus(); i++)
649 printk(" %d: %lld\n", i,
650 per_cpu(processed_system_time, i));
651 }
653 /* System-wide jiffy work. */
654 while (delta >= NS_PER_TICK) {
655 delta -= NS_PER_TICK;
656 processed_system_time += NS_PER_TICK;
657 do_timer(regs);
658 }
660 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
661 update_wallclock();
662 clock_was_set();
663 }
665 write_sequnlock(&xtime_lock);
667 /*
668 * Account stolen ticks.
669 * HACK: Passing NULL to account_steal_time()
670 * ensures that the ticks are accounted as stolen.
671 */
672 if (stolen > 0) {
673 delta_cpu -= stolen;
674 do_div(stolen, NS_PER_TICK);
675 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
676 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
677 account_steal_time(NULL, (cputime_t)stolen);
678 }
680 /*
681 * Account blocked ticks.
682 * HACK: Passing idle_task to account_steal_time()
683 * ensures that the ticks are accounted as idle/wait.
684 */
685 if (blocked > 0) {
686 delta_cpu -= blocked;
687 do_div(blocked, NS_PER_TICK);
688 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
689 per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
690 account_steal_time(idle_task(cpu), (cputime_t)blocked);
691 }
693 /* Account user/system ticks. */
694 if (delta_cpu > 0) {
695 do_div(delta_cpu, NS_PER_TICK);
696 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
697 if (user_mode(regs))
698 account_user_time(current, (cputime_t)delta_cpu);
699 else
700 account_system_time(current, HARDIRQ_OFFSET,
701 (cputime_t)delta_cpu);
702 }
704 /* Local timer processing (see update_process_times()). */
705 run_local_timers();
706 if (rcu_pending(cpu))
707 rcu_check_callbacks(cpu, user_mode(regs));
708 scheduler_tick();
709 run_posix_cpu_timers(current);
711 return IRQ_HANDLED;
712 }
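/*
 * Worked example of the accounting above (illustrative numbers, assuming
 * HZ=100 so NS_PER_TICK is 10 ms): if delta_cpu is 30 ms, of which the
 * runstate says 10 ms were spent runnable/offline and 5 ms blocked, then
 * one tick is accounted as stolen, zero ticks as idle and one tick as
 * user/system time; the sub-tick remainders are not added to the per-cpu
 * processed_* counters, so they simply carry over into the next interrupt.
 */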
714 static void init_missing_ticks_accounting(int cpu)
715 {
716 struct vcpu_register_runstate_memory_area area;
717 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
719 memset(runstate, 0, sizeof(*runstate));
721 area.addr.v = runstate;
722 HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
724 per_cpu(processed_blocked_time, cpu) =
725 runstate->time[RUNSTATE_blocked];
726 per_cpu(processed_stolen_time, cpu) =
727 runstate->time[RUNSTATE_runnable] +
728 runstate->time[RUNSTATE_offline];
729 }
731 /* not static: needed by APM */
732 unsigned long get_cmos_time(void)
733 {
734 unsigned long retval;
736 spin_lock(&rtc_lock);
738 if (efi_enabled)
739 retval = efi_get_time();
740 else
741 retval = mach_get_cmos_time();
743 spin_unlock(&rtc_lock);
745 return retval;
746 }
747 EXPORT_SYMBOL(get_cmos_time);
749 static void sync_cmos_clock(unsigned long dummy);
751 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
753 static void sync_cmos_clock(unsigned long dummy)
754 {
755 struct timeval now, next;
756 int fail = 1;
758 /*
759 * If we have an externally synchronized Linux clock, then update
760 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
761 * called as close as possible to 500 ms before the new second starts.
762 * This code is run on a timer. If the clock is set, that timer
763 * may not expire at the correct time. Thus, we adjust...
764 */
765 if (!ntp_synced())
766 /*
767 * Not synced, exit, do not restart a timer (if one is
768 * running, let it run out).
769 */
770 return;
772 do_gettimeofday(&now);
773 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
774 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
775 fail = set_rtc_mmss(now.tv_sec);
777 next.tv_usec = USEC_AFTER - now.tv_usec;
778 if (next.tv_usec <= 0)
779 next.tv_usec += USEC_PER_SEC;
781 if (!fail)
782 next.tv_sec = 659;
783 else
784 next.tv_sec = 0;
786 if (next.tv_usec >= USEC_PER_SEC) {
787 next.tv_sec++;
788 next.tv_usec -= USEC_PER_SEC;
789 }
790 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
791 }
793 void notify_arch_cmos_timer(void)
794 {
795 mod_timer(&sync_cmos_timer, jiffies + 1);
796 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
797 }
799 static long clock_cmos_diff, sleep_start;
801 static struct timer_opts *last_timer;
802 static int timer_suspend(struct sys_device *dev, pm_message_t state)
803 {
804 /*
805 * Estimate time zone so that set_time can update the clock
806 */
807 clock_cmos_diff = -get_cmos_time();
808 clock_cmos_diff += get_seconds();
809 sleep_start = get_cmos_time();
810 last_timer = cur_timer;
811 cur_timer = &timer_none;
812 if (last_timer->suspend)
813 last_timer->suspend(state);
814 return 0;
815 }
817 static int timer_resume(struct sys_device *dev)
818 {
819 unsigned long flags;
820 unsigned long sec;
821 unsigned long sleep_length;
823 #ifdef CONFIG_HPET_TIMER
824 if (is_hpet_enabled())
825 hpet_reenable();
826 #endif
827 sec = get_cmos_time() + clock_cmos_diff;
828 sleep_length = (get_cmos_time() - sleep_start) * HZ;
829 write_seqlock_irqsave(&xtime_lock, flags);
830 xtime.tv_sec = sec;
831 xtime.tv_nsec = 0;
832 write_sequnlock_irqrestore(&xtime_lock, flags);
833 jiffies += sleep_length;
834 wall_jiffies += sleep_length;
835 if (last_timer->resume)
836 last_timer->resume();
837 cur_timer = last_timer;
838 last_timer = NULL;
839 touch_softlockup_watchdog();
840 return 0;
841 }
843 static struct sysdev_class timer_sysclass = {
844 .resume = timer_resume,
845 .suspend = timer_suspend,
846 set_kset_name("timer"),
847 };
850 /* XXX this driverfs stuff should probably go elsewhere later -john */
851 static struct sys_device device_timer = {
852 .id = 0,
853 .cls = &timer_sysclass,
854 };
856 static int time_init_device(void)
857 {
858 int error = sysdev_class_register(&timer_sysclass);
859 if (!error)
860 error = sysdev_register(&device_timer);
861 return error;
862 }
864 device_initcall(time_init_device);
866 #ifdef CONFIG_HPET_TIMER
867 extern void (*late_time_init)(void);
868 /* Duplicate of time_init() below, with hpet_enable part added */
869 static void __init hpet_time_init(void)
870 {
871 xtime.tv_sec = get_cmos_time();
872 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
873 set_normalized_timespec(&wall_to_monotonic,
874 -xtime.tv_sec, -xtime.tv_nsec);
876 if ((hpet_enable() >= 0) && hpet_use_timer) {
877 printk("Using HPET for base-timer\n");
878 }
880 cur_timer = select_timer();
881 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
883 time_init_hook();
884 }
885 #endif
887 /* Dynamically-mapped IRQ. */
888 DEFINE_PER_CPU(int, timer_irq);
890 extern void (*late_time_init)(void);
891 static void setup_cpu0_timer_irq(void)
892 {
893 per_cpu(timer_irq, 0) =
894 bind_virq_to_irqhandler(
895 VIRQ_TIMER,
896 0,
897 timer_interrupt,
898 SA_INTERRUPT,
899 "timer0",
900 NULL);
901 BUG_ON(per_cpu(timer_irq, 0) < 0);
902 }
904 void __init time_init(void)
905 {
906 #ifdef CONFIG_HPET_TIMER
907 if (is_hpet_capable()) {
908 /*
909 * HPET initialization needs to do memory-mapped io. So, let
910 * us do a late initialization after mem_init().
911 */
912 late_time_init = hpet_time_init;
913 return;
914 }
915 #endif
916 get_time_values_from_xen();
918 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
919 per_cpu(processed_system_time, 0) = processed_system_time;
920 init_missing_ticks_accounting(0);
922 update_wallclock();
924 init_cpu_khz();
925 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
926 cpu_khz / 1000, cpu_khz % 1000);
928 #if defined(__x86_64__)
929 vxtime.mode = VXTIME_TSC;
930 vxtime.quot = (1000000L << 32) / vxtime_hz;
931 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
932 sync_core();
933 rdtscll(vxtime.last_tsc);
934 #endif
936 /* Cannot request_irq() until kmem is initialised. */
937 late_time_init = setup_cpu0_timer_irq;
938 }
940 /* Convert jiffies to system time. */
941 static inline u64 jiffies_to_st(unsigned long j)
942 {
943 unsigned long seq;
944 long delta;
945 u64 st;
947 do {
948 seq = read_seqbegin(&xtime_lock);
949 delta = j - jiffies;
950 /* NB. The next check can trigger in some wrap-around cases,
951 * but that's ok: we'll just end up with a shorter timeout. */
952 if (delta < 1)
953 delta = 1;
954 st = processed_system_time + (delta * (u64)NS_PER_TICK);
955 } while (read_seqretry(&xtime_lock, seq));
957 return st;
958 }
960 /*
961 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
962 * These functions are based on implementations from arch/s390/kernel/time.c
963 */
964 void stop_hz_timer(void)
965 {
966 unsigned int cpu = smp_processor_id();
967 unsigned long j;
969 /* We must do this /before/ checking rcu_pending(). */
970 cpu_set(cpu, nohz_cpu_mask);
971 smp_mb();
973 /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
974 if (rcu_pending(cpu) || local_softirq_pending()) {
975 cpu_clear(cpu, nohz_cpu_mask);
976 j = jiffies + 1;
977 } else {
978 j = next_timer_interrupt();
979 }
981 BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
982 }
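/*
 * In other words, the vcpu's one-shot timer is programmed, via
 * HYPERVISOR_set_timer_op(), for the Xen system time of the next pending
 * kernel timer, so an idle vcpu takes no periodic ticks until that time.
 */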
984 void start_hz_timer(void)
985 {
986 cpu_clear(smp_processor_id(), nohz_cpu_mask);
987 }
989 /* No locking required. We are the only CPU running, and interrupts are off. */
990 void time_resume(void)
991 {
992 init_cpu_khz();
994 get_time_values_from_xen();
996 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
997 per_cpu(processed_system_time, 0) = processed_system_time;
998 init_missing_ticks_accounting(0);
1000 update_wallclock();
1001 }
1003 #ifdef CONFIG_SMP
1004 static char timer_name[NR_CPUS][15];
1006 void local_setup_timer(unsigned int cpu)
1007 {
1008 int seq;
1010 BUG_ON(cpu == 0);
1012 do {
1013 seq = read_seqbegin(&xtime_lock);
1014 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1015 per_cpu(processed_system_time, cpu) =
1016 per_cpu(shadow_time, 0).system_timestamp;
1017 init_missing_ticks_accounting(cpu);
1018 } while (read_seqretry(&xtime_lock, seq));
1020 sprintf(timer_name[cpu], "timer%d", cpu);
1021 per_cpu(timer_irq, cpu) =
1022 bind_virq_to_irqhandler(
1023 VIRQ_TIMER,
1024 cpu,
1025 timer_interrupt,
1026 SA_INTERRUPT,
1027 timer_name[cpu],
1028 NULL);
1029 BUG_ON(per_cpu(timer_irq, cpu) < 0);
1030 }
1032 void local_teardown_timer(unsigned int cpu)
1033 {
1034 BUG_ON(cpu == 0);
1035 unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1036 }
1037 #endif
1039 /*
1040 * /proc/sys/xen: This really belongs in another file. It can stay here for
1041 * now however.
1042 */
1043 static ctl_table xen_subtable[] = {
1044 {1, "independent_wallclock", &independent_wallclock,
1045 sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
1046 {0}
1047 };
1048 static ctl_table xen_table[] = {
1049 {123, "xen", NULL, 0, 0555, xen_subtable},
1050 {0}
1051 };
1052 static int __init xen_sysctl_init(void)
1053 {
1054 (void)register_sysctl_table(xen_table, 0);
1055 return 0;
1056 }
1057 __initcall(xen_sysctl_init);
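/*
 * With this table registered, the flag can also be flipped at run time,
 * e.g.:
 *	echo 1 > /proc/sys/xen/independent_wallclock
 * complementing the "independent_wallclock" boot parameter handled by the
 * __setup() hook earlier in this file.
 */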
1059 /*
1060 * Local variables:
1061 * c-file-style: "linux"
1062 * indent-tabs-mode: t
1063 * c-indent-level: 8
1064 * c-basic-offset: 8
1065 * tab-width: 8
1066 * End:
1067 */