ia64/xen-unstable: linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 12505:51edd3c6a4d8

[LINUX] Add process profiling hook to timer ISR.
Signed-off-by: Rik van Riel <riel@redhat.com>

author    kfraser@localhost.localdomain
date      Fri Nov 17 10:32:57 2006 +0000
parents   d8338b28bcd6
children  53795f0a41b1
/*
 *  linux/arch/i386/kernel/time.c
 *
 *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
 *
 * This file contains the PC-specific time handling details:
 * reading the RTC at bootup, etc..
 * 1994-07-02    Alan Modra
 *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
 * 1995-03-26    Markus Kuhn
 *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
 *      precision CMOS clock update
 * 1996-05-03    Ingo Molnar
 *      fixed time warps in do_[slow|fast]_gettimeoffset()
 * 1997-09-10    Updated NTP code according to technical memorandum Jan '96
 *               "A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-09-05    (Various)
 *      More robust do_fast_gettimeoffset() algorithm implemented
 *      (works with APM, Cyrix 6x86MX and Centaur C6),
 *      monotonic gettimeofday() with fast_get_timeoffset(),
 *      drift-proof precision TSC calibration on boot
 *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
 *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
 *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
 * 1998-12-16    Andrea Arcangeli
 *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 *      because was not accounting lost_ticks.
 * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
 *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *      serialize accesses to xtime/lost_ticks).
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/kernel_stat.h>
#include <linux/posix-timers.h>

#include <asm/io.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/msr.h>
#include <asm/delay.h>
#include <asm/mpspec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/sections.h>

#include "mach_time.h"

#include <linux/timex.h>
#include <linux/config.h>

#include <asm/hpet.h>

#include <asm/arch_hooks.h>

#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>

#if defined (__i386__)
#include <asm/i8259.h>
#endif

int pit_latch_buggy;            /* extern */

#if defined(__x86_64__)
unsigned long vxtime_hz = PIT_TICK_RATE;
struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;
#endif

unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

extern unsigned long wall_jiffies;

DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

#if defined (__i386__)
#include <asm/i8253.h>
#endif

DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);

extern struct init_timer_opts timer_tsc_init;
extern struct timer_opts timer_tsc;
#define timer_none timer_tsc
struct timer_opts *cur_timer __read_mostly = &timer_tsc;

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        u32 tsc_to_usec_mul;
        int tsc_shift;
        u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;
static u32 shadow_tv_version;

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

static inline void __normalize_time(time_t *sec, s64 *nsec)
{
        while (*nsec >= NSEC_PER_SEC) {
                (*nsec) -= NSEC_PER_SEC;
                (*sec)++;
        }
        while (*nsec < 0) {
                (*nsec) += NSEC_PER_SEC;
                (*sec)--;
        }
}

/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
static int __init __independent_wallclock(char *str)
{
        independent_wallclock = 1;
        return 1;
}
__setup("independent_wallclock", __independent_wallclock);

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
        permitted_clock_jitter = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);

int tsc_disable __devinitdata = 0;

static void delay_tsc(unsigned long loops)
{
        unsigned long bclock, now;

        rdtscl(bclock);
        do {
                rep_nop();
                rdtscl(now);
        } while ((now - bclock) < loops);
}

struct timer_opts timer_tsc = {
        .name = "tsc",
        .delay = delay_tsc,
};

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#else
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#endif

        return product;
}
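
/*
 * Worked example (illustrative numbers only): on a 2 GHz TSC one cycle is
 * 0.5 ns, so Xen would advertise tsc_to_system_mul = 0.5 * 2^32 = 0x80000000
 * with tsc_shift = 0.  A delta of 1000 cycles then scales to
 * (1000 * 0x80000000) >> 32 = 500 ns.  In other words scale_delta() computes
 * ((delta adjusted by shift) * mul_frac) / 2^32 without needing a full
 * 128-bit multiply in C.
 */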

#if defined (__i386__)
int read_current_timer(unsigned long *timer_val)
{
        rdtscl(*timer_val);
        return 0;
}
#endif

void init_cpu_khz(void)
{
        u64 __cpu_khz = 1000000ULL << 32;
        struct vcpu_time_info *info;
        info = &HYPERVISOR_shared_info->vcpu_info[0].time;
        do_div(__cpu_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                cpu_khz = __cpu_khz << -info->tsc_shift;
        else
                cpu_khz = __cpu_khz >> info->tsc_shift;
}
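
/*
 * Derivation (sketch): Xen's scaling gives
 *   ns = ((tsc << tsc_shift) * tsc_to_system_mul) >> 32,
 * so TSC cycles per nanosecond is 2^32 / tsc_to_system_mul (before the
 * shift), and cycles per millisecond -- i.e. cpu_khz -- is
 * 10^6 * 2^32 / tsc_to_system_mul adjusted by tsc_shift, which is exactly
 * what init_cpu_khz() computes above.
 */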

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static unsigned long get_usec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
}

static void __update_wallclock(time_t sec, long nsec)
{
        long wtm_nsec, xtime_nsec;
        time_t wtm_sec, xtime_sec;
        u64 tmp, wc_nsec;

        /* Adjust wall-clock time base based on wall_jiffies ticks. */
        wc_nsec = processed_system_time;
        wc_nsec += sec * (u64)NSEC_PER_SEC;
        wc_nsec += nsec;
        wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;

        /* Split wallclock base into seconds and nanoseconds. */
        tmp = wc_nsec;
        xtime_nsec = do_div(tmp, 1000000000);
        xtime_sec  = (time_t)tmp;

        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

        set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

        ntp_clear();
}

static void update_wallclock(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;

        do {
                shadow_tv_version = s->wc_version;
                rmb();
                shadow_tv.tv_sec  = s->wc_sec;
                shadow_tv.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

        if (!independent_wallclock)
                __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;
        struct vcpu_time_info *src;
        struct shadow_time_info *dst;

        src = &s->vcpu_info[smp_processor_id()].time;
        dst = &per_cpu(shadow_time, smp_processor_id());

        do {
                dst->version = src->version;
                rmb();
                dst->tsc_timestamp    = src->tsc_timestamp;
                dst->system_timestamp = src->system_time;
                dst->tsc_to_nsec_mul  = src->tsc_to_system_mul;
                dst->tsc_shift        = src->tsc_shift;
                rmb();
        } while ((src->version & 1) | (dst->version ^ src->version));

        dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
}
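
/*
 * Note on the retry loop above (and the similar one in update_wallclock()):
 * Xen bumps the version field once before it starts rewriting the time
 * record and once after it finishes, much like a Linux seqlock.  An odd
 * value therefore means an update is in flight, and a changed value means we
 * raced with one; the condition
 * "(src->version & 1) | (dst->version ^ src->version)" retries until a
 * complete, unmodified snapshot has been copied.
 */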

static inline int time_values_up_to_date(int cpu)
{
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;

        src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
        dst = &per_cpu(shadow_time, cpu);

        rmb();
        return (dst->version == src->version);
}

/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);

/* Routines for accessing the CMOS RAM/RTC. */
unsigned char rtc_cmos_read(unsigned char addr)
{
        unsigned char val;
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        val = inb_p(RTC_PORT(1));
        lock_cmos_suffix(addr);
        return val;
}
EXPORT_SYMBOL(rtc_cmos_read);

void rtc_cmos_write(unsigned char val, unsigned char addr)
{
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        outb_p(val, RTC_PORT(1));
        lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);

/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
void do_gettimeofday(struct timeval *tv)
{
        unsigned long seq;
        unsigned long usec, sec;
        unsigned long max_ntp_tick;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        u32 local_time_version;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        do {
                unsigned long lost;

                local_time_version = shadow->version;
                seq = read_seqbegin(&xtime_lock);

                usec = get_usec_offset(shadow);
                lost = jiffies - wall_jiffies;

                /*
                 * If time_adjust is negative then NTP is slowing the clock
                 * so make sure not to go into next possible interval.
                 * Better to lose some accuracy than have time go backwards..
                 */
                if (unlikely(time_adjust < 0)) {
                        max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
                        usec = min(usec, max_ntp_tick);

                        if (lost)
                                usec += lost * max_ntp_tick;
                }
                else if (unlikely(lost))
                        usec += lost * (USEC_PER_SEC / HZ);

                sec = xtime.tv_sec;
                usec += (xtime.tv_nsec / NSEC_PER_USEC);

                nsec = shadow->system_timestamp - processed_system_time;
                __normalize_time(&sec, &nsec);
                usec += (long)nsec / NSEC_PER_USEC;

                if (unlikely(!time_values_up_to_date(cpu))) {
                        /*
                         * We may have blocked for a long time,
                         * rendering our calculations invalid
                         * (e.g. the time delta may have
                         * overflowed). Detect that and recalculate
                         * with fresh values.
                         */
                        get_time_values_from_xen();
                        continue;
                }
        } while (read_seqretry(&xtime_lock, seq) ||
                 (local_time_version != shadow->version));

        put_cpu();

        while (usec >= USEC_PER_SEC) {
                usec -= USEC_PER_SEC;
                sec++;
        }

        tv->tv_sec = sec;
        tv->tv_usec = usec;
}

EXPORT_SYMBOL(do_gettimeofday);

int do_settimeofday(struct timespec *tv)
{
        time_t sec;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        dom0_op_t op;

        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        write_seqlock_irq(&xtime_lock);

        /*
         * Ensure we don't get blocked for a long time so that our time delta
         * overflows. If that were to happen then our shadow time values would
         * be stale, so we can retry with fresh ones.
         */
        for (;;) {
                nsec = tv->tv_nsec - get_nsec_offset(shadow);
                if (time_values_up_to_date(cpu))
                        break;
                get_time_values_from_xen();
        }
        sec = tv->tv_sec;
        __normalize_time(&sec, &nsec);

        if (is_initial_xendomain() && !independent_wallclock) {
                op.cmd = DOM0_SETTIME;
                op.u.settime.secs        = sec;
                op.u.settime.nsecs       = nsec;
                op.u.settime.system_time = shadow->system_timestamp;
                HYPERVISOR_dom0_op(&op);
                update_wallclock();
        } else if (independent_wallclock) {
                nsec -= shadow->system_timestamp;
                __normalize_time(&sec, &nsec);
                __update_wallclock(sec, nsec);
        }

        write_sequnlock_irq(&xtime_lock);

        put_cpu();

        clock_was_set();
        return 0;
}

EXPORT_SYMBOL(do_settimeofday);

static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
static void sync_xen_wallclock(unsigned long dummy)
{
        time_t sec;
        s64 nsec;
        dom0_op_t op;

        if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
                return;

        write_seqlock_irq(&xtime_lock);

        sec  = xtime.tv_sec;
        nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
        __normalize_time(&sec, &nsec);

        op.cmd = DOM0_SETTIME;
        op.u.settime.secs        = sec;
        op.u.settime.nsecs       = nsec;
        op.u.settime.system_time = processed_system_time;
        HYPERVISOR_dom0_op(&op);

        update_wallclock();

        write_sequnlock_irq(&xtime_lock);

        /* Once per minute. */
        mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}

static int set_rtc_mmss(unsigned long nowtime)
{
        int retval;

        WARN_ON(irqs_disabled());

        if (independent_wallclock || !is_initial_xendomain())
                return 0;

        /* gets recalled with irq locally disabled */
        spin_lock_irq(&rtc_lock);
        if (efi_enabled)
                retval = efi_set_rtc_mmss(nowtime);
        else
                retval = mach_set_rtc_mmss(nowtime);
        spin_unlock_irq(&rtc_lock);

        return retval;
}

/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 * Note: This function is required to return accurate
 * time even in the absence of multiple timer ticks.
 */
unsigned long long monotonic_clock(void)
{
        int cpu = get_cpu();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        u64 time;
        u32 local_time_version;

        do {
                local_time_version = shadow->version;
                barrier();
                time = shadow->system_timestamp + get_nsec_offset(shadow);
                if (!time_values_up_to_date(cpu))
                        get_time_values_from_xen();
                barrier();
        } while (local_time_version != shadow->version);

        put_cpu();

        return time;
}
EXPORT_SYMBOL(monotonic_clock);

unsigned long long sched_clock(void)
{
        return monotonic_clock();
}

#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
        unsigned long pc = instruction_pointer(regs);

#ifdef __x86_64__
        /* Assume the lock function has either no stack frame or only a
           single word.  This checks if the address on the stack looks like
           a kernel text address.  There is a small window for false hits,
           but in that case the tick is just accounted to the spinlock
           function.  Better would be to write these functions in assembler
           again and check exactly. */
        if (in_lock_functions(pc)) {
                char *v = *(char **)regs->rsp;
                if ((v >= _stext && v <= _etext) ||
                    (v >= _sinittext && v <= _einittext) ||
                    (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
                        return (unsigned long)v;
                return ((unsigned long *)regs->rsp)[1];
        }
#else
        if (in_lock_functions(pc))
                return *(unsigned long *)(regs->ebp + 4);
#endif

        return pc;
}
EXPORT_SYMBOL(profile_pc);
#endif

irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        s64 delta, delta_cpu, stolen, blocked;
        u64 sched_time;
        int i, cpu = smp_processor_id();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        write_seqlock(&xtime_lock);

        do {
                get_time_values_from_xen();

                /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu =
                        shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);

                /*
                 * Obtain a consistent snapshot of stolen/blocked cycles. We
                 * can use state_entry_time to detect if we get preempted here.
                 */
                do {
                        sched_time = runstate->state_entry_time;
                        barrier();
                        stolen = runstate->time[RUNSTATE_runnable] +
                                runstate->time[RUNSTATE_offline] -
                                per_cpu(processed_stolen_time, cpu);
                        blocked = runstate->time[RUNSTATE_blocked] -
                                per_cpu(processed_blocked_time, cpu);
                        barrier();
                } while (sched_time != runstate->state_entry_time);
        } while (!time_values_up_to_date(cpu));

        if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
             unlikely(delta_cpu < -(s64)permitted_clock_jitter))
            && printk_ratelimit()) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld delta_cpu=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
                       cpu, delta, delta_cpu, shadow->system_timestamp,
                       (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       per_cpu(processed_system_time, cpu));
                for (i = 0; i < num_online_cpus(); i++)
                        printk(" %d: %lld\n", i,
                               per_cpu(processed_system_time, i));
        }

        /* System-wide jiffy work. */
        while (delta >= NS_PER_TICK) {
                delta -= NS_PER_TICK;
                processed_system_time += NS_PER_TICK;
                do_timer(regs);
        }

        if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
                update_wallclock();
                clock_was_set();
        }

        write_sequnlock(&xtime_lock);

        /*
         * Account stolen ticks.
         * HACK: Passing NULL to account_steal_time()
         * ensures that the ticks are accounted as stolen.
         */
        if ((stolen > 0) && (delta_cpu > 0)) {
                delta_cpu -= stolen;
                if (unlikely(delta_cpu < 0))
                        stolen += delta_cpu; /* clamp local-time progress */
                do_div(stolen, NS_PER_TICK);
                per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
                per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
                account_steal_time(NULL, (cputime_t)stolen);
        }
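
        /*
         * Illustration (made-up numbers): with HZ=100, NS_PER_TICK is 10 ms.
         * If delta_cpu is 25 ms of which 12 ms was stolen, delta_cpu drops to
         * 13 ms, do_div() rounds stolen down to 1 tick (10 ms), both
         * processed_stolen_time and this CPU's processed_system_time advance
         * by 10 ms, and the 2 ms remainder of stolen time is carried over to
         * a later interrupt.
         */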

        /*
         * Account blocked ticks.
         * HACK: Passing idle_task to account_steal_time()
         * ensures that the ticks are accounted as idle/wait.
         */
        if ((blocked > 0) && (delta_cpu > 0)) {
                delta_cpu -= blocked;
                if (unlikely(delta_cpu < 0))
                        blocked += delta_cpu; /* clamp local-time progress */
                do_div(blocked, NS_PER_TICK);
                per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
                per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
                account_steal_time(idle_task(cpu), (cputime_t)blocked);
        }

        /* Account user/system ticks. */
        if (delta_cpu > 0) {
                do_div(delta_cpu, NS_PER_TICK);
                per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
                if (user_mode(regs))
                        account_user_time(current, (cputime_t)delta_cpu);
                else
                        account_system_time(current, HARDIRQ_OFFSET,
                                            (cputime_t)delta_cpu);
        }

        /* Local timer processing (see update_process_times()). */
        run_local_timers();
        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_mode(regs));
        scheduler_tick();
        run_posix_cpu_timers(current);
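        /*
         * profile_tick() below is the process profiling hook referred to in
         * this changeset's description ("Add process profiling hook to timer
         * ISR"), mirroring what the native i386 timer interrupt path does.
         */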
        profile_tick(CPU_PROFILING, regs);

        return IRQ_HANDLED;
}

static void init_missing_ticks_accounting(int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        memset(runstate, 0, sizeof(*runstate));

        area.addr.v = runstate;
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

        per_cpu(processed_blocked_time, cpu) =
                runstate->time[RUNSTATE_blocked];
        per_cpu(processed_stolen_time, cpu) =
                runstate->time[RUNSTATE_runnable] +
                runstate->time[RUNSTATE_offline];
}
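
/*
 * Registering the runstate area asks Xen to keep this VCPU's runstate
 * counters (nanoseconds spent running, runnable, blocked and offline)
 * updated in place in guest memory, which is what lets timer_interrupt()
 * above read stolen/blocked time without a hypercall on the hot path.
 */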

/* not static: needed by APM */
unsigned long get_cmos_time(void)
{
        unsigned long retval;

        spin_lock(&rtc_lock);

        if (efi_enabled)
                retval = efi_get_time();
        else
                retval = mach_get_cmos_time();

        spin_unlock(&rtc_lock);

        return retval;
}
EXPORT_SYMBOL(get_cmos_time);

static void sync_cmos_clock(unsigned long dummy);

static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

static void sync_cmos_clock(unsigned long dummy)
{
        struct timeval now, next;
        int fail = 1;

        /*
         * If we have an externally synchronized Linux clock, then update
         * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
         * called as close as possible to 500 ms before the new second starts.
         * This code is run on a timer.  If the clock is set, that timer
         * may not expire at the correct time.  Thus, we adjust...
         */
        if (!ntp_synced())
                /*
                 * Not synced, exit, do not restart a timer (if one is
                 * running, let it run out).
                 */
                return;

        do_gettimeofday(&now);
        if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
            now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
                fail = set_rtc_mmss(now.tv_sec);

        next.tv_usec = USEC_AFTER - now.tv_usec;
        if (next.tv_usec <= 0)
                next.tv_usec += USEC_PER_SEC;

        if (!fail)
                next.tv_sec = 659;
        else
                next.tv_sec = 0;

        if (next.tv_usec >= USEC_PER_SEC) {
                next.tv_sec++;
                next.tv_usec -= USEC_PER_SEC;
        }
        mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}

void notify_arch_cmos_timer(void)
{
        mod_timer(&sync_cmos_timer, jiffies + 1);
        mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}

static long clock_cmos_diff, sleep_start;

static struct timer_opts *last_timer;
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
        /*
         * Estimate time zone so that set_time can update the clock
         */
        clock_cmos_diff = -get_cmos_time();
        clock_cmos_diff += get_seconds();
        sleep_start = get_cmos_time();
        last_timer = cur_timer;
        cur_timer = &timer_none;
        if (last_timer->suspend)
                last_timer->suspend(state);
        return 0;
}

static int timer_resume(struct sys_device *dev)
{
        unsigned long flags;
        unsigned long sec;
        unsigned long sleep_length;

#ifdef CONFIG_HPET_TIMER
        if (is_hpet_enabled())
                hpet_reenable();
#endif
        sec = get_cmos_time() + clock_cmos_diff;
        sleep_length = (get_cmos_time() - sleep_start) * HZ;
        write_seqlock_irqsave(&xtime_lock, flags);
        xtime.tv_sec = sec;
        xtime.tv_nsec = 0;
        jiffies_64 += sleep_length;
        wall_jiffies += sleep_length;
        write_sequnlock_irqrestore(&xtime_lock, flags);
        if (last_timer->resume)
                last_timer->resume();
        cur_timer = last_timer;
        last_timer = NULL;
        touch_softlockup_watchdog();
        return 0;
}

static struct sysdev_class timer_sysclass = {
        .resume  = timer_resume,
        .suspend = timer_suspend,
        set_kset_name("timer"),
};

/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
        .id  = 0,
        .cls = &timer_sysclass,
};

static int time_init_device(void)
{
        int error = sysdev_class_register(&timer_sysclass);
        if (!error)
                error = sysdev_register(&device_timer);
        return error;
}

device_initcall(time_init_device);

#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
        xtime.tv_sec = get_cmos_time();
        xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
        set_normalized_timespec(&wall_to_monotonic,
                -xtime.tv_sec, -xtime.tv_nsec);

        if ((hpet_enable() >= 0) && hpet_use_timer) {
                printk("Using HPET for base-timer\n");
        }

        cur_timer = select_timer();
        printk(KERN_INFO "Using %s for high-res timesource\n", cur_timer->name);

        time_init_hook();
}
#endif

/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);
static void setup_cpu0_timer_irq(void)
{
        per_cpu(timer_irq, 0) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        0,
                        timer_interrupt,
                        SA_INTERRUPT,
                        "timer0",
                        NULL);
        BUG_ON(per_cpu(timer_irq, 0) < 0);
}

void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
        if (is_hpet_capable()) {
                /*
                 * HPET initialization needs to do memory-mapped io. So, let
                 * us do a late initialization after mem_init().
                 */
                late_time_init = hpet_time_init;
                return;
        }
#endif
        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();

        init_cpu_khz();
        printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
               cpu_khz / 1000, cpu_khz % 1000);

#if defined(__x86_64__)
        vxtime.mode = VXTIME_TSC;
        vxtime.quot = (1000000L << 32) / vxtime_hz;
        vxtime.tsc_quot = (1000L << 32) / cpu_khz;
        sync_core();
        rdtscll(vxtime.last_tsc);
#endif

        /* Cannot request_irq() until kmem is initialised. */
        late_time_init = setup_cpu0_timer_irq;
}

/* Convert jiffies to system time. */
u64 jiffies_to_st(unsigned long j)
{
        unsigned long seq;
        long delta;
        u64 st;

        do {
                seq = read_seqbegin(&xtime_lock);
                delta = j - jiffies;
                if (delta < 1) {
                        /* Triggers in some wrap-around cases, but that's okay:
                         * we just end up with a shorter timeout. */
                        st = processed_system_time + NS_PER_TICK;
                } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
                        /* Very long timeout means there is no pending timer.
                         * We indicate this to Xen by passing zero timeout. */
                        st = 0;
                } else {
                        st = processed_system_time + delta * (u64)NS_PER_TICK;
                }
        } while (read_seqretry(&xtime_lock, seq));

        return st;
}
EXPORT_SYMBOL(jiffies_to_st);
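
/*
 * Example (hypothetical values): with HZ=100 and processed_system_time at
 * 5,000,000,000 ns, a target of jiffies+3 maps to
 * 5,000,000,000 + 3 * 10^7 = 5,030,000,000 ns of Xen system time;
 * stop_hz_timer() below feeds such a value to HYPERVISOR_set_timer_op() to
 * program the next wakeup.
 */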

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
static void stop_hz_timer(void)
{
        unsigned int cpu = smp_processor_id();
        unsigned long j;

        cpu_set(cpu, nohz_cpu_mask);

        /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
        /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
        /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
        /* stop the hz timer then the cpumasks created for subsequent values */
        /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
        /* nohz_cpu_mask and so will not depend on this cpu.                 */

        smp_mb();

        /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
        if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
            (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
                cpu_clear(cpu, nohz_cpu_mask);
                j = jiffies + 1;
        }

        if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
                BUG();
}
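
/*
 * Idle-path usage (see safe_halt() below): stop_hz_timer() programs a single
 * Xen timer event for the next pending kernel timer (or for jiffies+1 if
 * RCU, softirqs or an already-due timer still need this CPU),
 * HYPERVISOR_block() then sleeps the VCPU until an event arrives, and
 * start_hz_timer() returns the CPU to normal tick mode.
 */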

static void start_hz_timer(void)
{
        cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

void safe_halt(void)
{
        stop_hz_timer();
        /* Blocking includes an implicit local_irq_enable(). */
        HYPERVISOR_block();
        start_hz_timer();
}
EXPORT_SYMBOL(safe_halt);

void halt(void)
{
        if (irqs_disabled())
                HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
}
EXPORT_SYMBOL(halt);

/* No locking required. We are only CPU running, and interrupts are off. */
void time_resume(void)
{
        init_cpu_khz();

        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();
}

#ifdef CONFIG_SMP
static char timer_name[NR_CPUS][15];

void local_setup_timer(unsigned int cpu)
{
        int seq;

        BUG_ON(cpu == 0);

        do {
                seq = read_seqbegin(&xtime_lock);
                /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
                per_cpu(processed_system_time, cpu) =
                        per_cpu(shadow_time, 0).system_timestamp;
                init_missing_ticks_accounting(cpu);
        } while (read_seqretry(&xtime_lock, seq));

        sprintf(timer_name[cpu], "timer%d", cpu);
        per_cpu(timer_irq, cpu) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        cpu,
                        timer_interrupt,
                        SA_INTERRUPT,
                        timer_name[cpu],
                        NULL);
        BUG_ON(per_cpu(timer_irq, cpu) < 0);
}

void local_teardown_timer(unsigned int cpu)
{
        BUG_ON(cpu == 0);
        unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
}
#endif

/*
 * /proc/sys/xen: This really belongs in another file. It can stay here for
 * now however.
 */
static ctl_table xen_subtable[] = {
        {
                .ctl_name       = 1,
                .procname       = "independent_wallclock",
                .data           = &independent_wallclock,
                .maxlen         = sizeof(independent_wallclock),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .ctl_name       = 2,
                .procname       = "permitted_clock_jitter",
                .data           = &permitted_clock_jitter,
                .maxlen         = sizeof(permitted_clock_jitter),
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax
        },
        { 0 }
};
static ctl_table xen_table[] = {
        {
                .ctl_name       = 123,
                .procname       = "xen",
                .mode           = 0555,
                .child          = xen_subtable},
        { 0 }
};
static int __init xen_sysctl_init(void)
{
        (void)register_sysctl_table(xen_table, 0);
        return 0;
}
__initcall(xen_sysctl_init);