direct-io.hg

view linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 14357:468ec3d142ad

[LINUX] Fix typo so 3.0.4 compatibility works.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian Campbell <ian.campbell@xensource.com>
date Tue Mar 13 09:07:55 2007 +0000 (2007-03-13)
parents 7f624c770dbd
children 2b24d842bbd3
line source
1 /*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because it was not accounting lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
65 #include "mach_time.h"
67 #include <linux/timex.h>
69 #include <asm/hpet.h>
71 #include <asm/arch_hooks.h>
73 #include <xen/evtchn.h>
74 #include <xen/interface/vcpu.h>
76 #if defined (__i386__)
77 #include <asm/i8259.h>
78 #endif
80 int pit_latch_buggy; /* extern */
82 #if defined(__x86_64__)
83 unsigned long vxtime_hz = PIT_TICK_RATE;
84 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
85 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
86 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
87 struct timespec __xtime __section_xtime;
88 struct timezone __sys_tz __section_sys_tz;
89 #endif
91 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
92 EXPORT_SYMBOL(cpu_khz);
94 extern unsigned long wall_jiffies;
96 DEFINE_SPINLOCK(rtc_lock);
97 EXPORT_SYMBOL(rtc_lock);
99 extern struct init_timer_opts timer_tsc_init;
100 extern struct timer_opts timer_tsc;
101 #define timer_none timer_tsc
103 /* These are periodically updated in shared_info, and then copied here. */
104 struct shadow_time_info {
105 u64 tsc_timestamp; /* TSC at last update of time vals. */
106 u64 system_timestamp; /* Time, in nanosecs, since boot. */
107 u32 tsc_to_nsec_mul;
108 u32 tsc_to_usec_mul;
109 int tsc_shift;
110 u32 version;
111 };
112 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
113 static struct timespec shadow_tv;
114 static u32 shadow_tv_version;
116 /* Keep track of last time we did processing/updating of jiffies and xtime. */
117 static u64 processed_system_time; /* System time (ns) at last processing. */
118 static DEFINE_PER_CPU(u64, processed_system_time);
120 /* How much CPU time was spent blocked and how much was 'stolen'? */
121 static DEFINE_PER_CPU(u64, processed_stolen_time);
122 static DEFINE_PER_CPU(u64, processed_blocked_time);
124 /* Current runstate of each CPU (updated automatically by the hypervisor). */
125 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
127 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
128 #define NS_PER_TICK (1000000000LL/HZ)
130 static inline void __normalize_time(time_t *sec, s64 *nsec)
131 {
132 while (*nsec >= NSEC_PER_SEC) {
133 (*nsec) -= NSEC_PER_SEC;
134 (*sec)++;
135 }
136 while (*nsec < 0) {
137 (*nsec) += NSEC_PER_SEC;
138 (*sec)--;
139 }
140 }
142 /* Does this guest OS track Xen time, or set its wall clock independently? */
143 static int independent_wallclock = 0;
144 static int __init __independent_wallclock(char *str)
145 {
146 independent_wallclock = 1;
147 return 1;
148 }
149 __setup("independent_wallclock", __independent_wallclock);
151 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
152 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
153 static int __init __permitted_clock_jitter(char *str)
154 {
155 permitted_clock_jitter = simple_strtoul(str, NULL, 0);
156 return 1;
157 }
158 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
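Both of the knobs above are ordinary kernel command-line parameters. As an illustrative (not prescriptive) example, a guest that should manage its own wall clock and only warn after 20 ms of jitter could boot with "independent_wallclock permitted_clock_jitter=20000000" on its command line; by default the guest tracks Xen's wall clock and the jitter threshold is 10 ms.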
160 #if 0
161 static void delay_tsc(unsigned long loops)
162 {
163 unsigned long bclock, now;
165 rdtscl(bclock);
166 do {
167 rep_nop();
168 rdtscl(now);
169 } while ((now - bclock) < loops);
170 }
172 struct timer_opts timer_tsc = {
173 .name = "tsc",
174 .delay = delay_tsc,
175 };
176 #endif
178 /*
179 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
180 * yielding a 64-bit result.
181 */
182 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
183 {
184 u64 product;
185 #ifdef __i386__
186 u32 tmp1, tmp2;
187 #endif
189 if (shift < 0)
190 delta >>= -shift;
191 else
192 delta <<= shift;
194 #ifdef __i386__
195 __asm__ (
196 "mul %5 ; "
197 "mov %4,%%eax ; "
198 "mov %%edx,%4 ; "
199 "mul %5 ; "
200 "xor %5,%5 ; "
201 "add %4,%%eax ; "
202 "adc %5,%%edx ; "
203 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
204 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
205 #else
206 __asm__ (
207 "mul %%rdx ; shrd $32,%%rdx,%%rax"
208 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
209 #endif
211 return product;
212 }
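Both inline-assembly variants above compute the upper 64 bits of the product of the (shifted) 64-bit delta and the 32-bit fraction, i.e. (delta_shifted * mul_frac) >> 32. A minimal C sketch of the same operation, purely for illustration and assuming a compiler that offers a 128-bit integer type (which the original code deliberately avoids relying on), looks like this:

	/* Illustrative reference only -- not part of the original file. */
	static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
	{
		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;
		/* 96-bit product; keep bits 32..95. */
		return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
	}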
214 #if 0 /* defined (__i386__) */
215 int read_current_timer(unsigned long *timer_val)
216 {
217 rdtscl(*timer_val);
218 return 0;
219 }
220 #endif
222 void init_cpu_khz(void)
223 {
224 u64 __cpu_khz = 1000000ULL << 32;
225 struct vcpu_time_info *info = &vcpu_info(0)->time;
226 do_div(__cpu_khz, info->tsc_to_system_mul);
227 if (info->tsc_shift < 0)
228 cpu_khz = __cpu_khz << -info->tsc_shift;
229 else
230 cpu_khz = __cpu_khz >> info->tsc_shift;
231 }
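Xen's tsc_to_system_mul/tsc_shift pair encodes nanoseconds-per-TSC-cycle as a shifted 32-bit fixed-point fraction, so the CPU frequency falls out as cpu_khz = (10^6 * 2^32 / tsc_to_system_mul) >> tsc_shift (a left shift when tsc_shift is negative). As a rough illustrative example, a 2 GHz part (0.5 ns per cycle) would have tsc_to_system_mul = 2^31 and tsc_shift = 0, giving cpu_khz = 2,000,000.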
233 static u64 get_nsec_offset(struct shadow_time_info *shadow)
234 {
235 u64 now, delta;
236 rdtscll(now);
237 delta = now - shadow->tsc_timestamp;
238 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
239 }
241 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
242 {
243 u64 now, delta;
244 rdtscll(now);
245 delta = now - shadow->tsc_timestamp;
246 return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
247 }
249 static void __update_wallclock(time_t sec, long nsec)
250 {
251 long wtm_nsec, xtime_nsec;
252 time_t wtm_sec, xtime_sec;
253 u64 tmp, wc_nsec;
255 /* Adjust wall-clock time base based on wall_jiffies ticks. */
256 wc_nsec = processed_system_time;
257 wc_nsec += sec * (u64)NSEC_PER_SEC;
258 wc_nsec += nsec;
259 wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
261 /* Split wallclock base into seconds and nanoseconds. */
262 tmp = wc_nsec;
263 xtime_nsec = do_div(tmp, 1000000000);
264 xtime_sec = (time_t)tmp;
266 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
267 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
269 set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
270 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
272 ntp_clear();
273 }
275 static void update_wallclock(void)
276 {
277 shared_info_t *s = HYPERVISOR_shared_info;
279 do {
280 shadow_tv_version = s->wc_version;
281 rmb();
282 shadow_tv.tv_sec = s->wc_sec;
283 shadow_tv.tv_nsec = s->wc_nsec;
284 rmb();
285 } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
287 if (!independent_wallclock)
288 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
289 }
291 /*
292 * Reads a consistent set of time-base values from Xen, into a shadow data
293 * area.
294 */
295 static void get_time_values_from_xen(int cpu)
296 {
297 struct vcpu_time_info *src;
298 struct shadow_time_info *dst;
300 src = &vcpu_info(cpu)->time;
301 dst = &per_cpu(shadow_time, cpu);
303 do {
304 dst->version = src->version;
305 rmb();
306 dst->tsc_timestamp = src->tsc_timestamp;
307 dst->system_timestamp = src->system_time;
308 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
309 dst->tsc_shift = src->tsc_shift;
310 rmb();
311 } while ((src->version & 1) | (dst->version ^ src->version));
313 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
314 }
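The retry condition works like the read side of a seqlock: Xen bumps version to an odd value before rewriting the vcpu_time_info fields and to the next even value when it is done. If, for example, the hypervisor starts an update while the fields are being copied, either (src->version & 1) is still set (update in progress) or the final version no longer matches the one captured in dst->version, and the copy is redone. The wall-clock loop in update_wallclock() above applies the same pattern to wc_version.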
316 static inline int time_values_up_to_date(int cpu)
317 {
318 struct vcpu_time_info *src;
319 struct shadow_time_info *dst;
321 src = &vcpu_info(cpu)->time;
322 dst = &per_cpu(shadow_time, cpu);
324 rmb();
325 return (dst->version == src->version);
326 }
328 /*
329 * This is a special lock that is owned by the CPU and holds the index
330 * register we are working with. It is required for NMI access to the
331 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
332 */
333 volatile unsigned long cmos_lock = 0;
334 EXPORT_SYMBOL(cmos_lock);
336 /* Routines for accessing the CMOS RAM/RTC. */
337 unsigned char rtc_cmos_read(unsigned char addr)
338 {
339 unsigned char val;
340 lock_cmos_prefix(addr);
341 outb_p(addr, RTC_PORT(0));
342 val = inb_p(RTC_PORT(1));
343 lock_cmos_suffix(addr);
344 return val;
345 }
346 EXPORT_SYMBOL(rtc_cmos_read);
348 void rtc_cmos_write(unsigned char val, unsigned char addr)
349 {
350 lock_cmos_prefix(addr);
351 outb_p(addr, RTC_PORT(0));
352 outb_p(val, RTC_PORT(1));
353 lock_cmos_suffix(addr);
354 }
355 EXPORT_SYMBOL(rtc_cmos_write);
357 /*
358 * This version of gettimeofday has microsecond resolution
359 * and better than microsecond precision on fast x86 machines with TSC.
360 */
361 void do_gettimeofday(struct timeval *tv)
362 {
363 unsigned long seq;
364 unsigned long usec, sec;
365 unsigned long max_ntp_tick;
366 s64 nsec;
367 unsigned int cpu;
368 struct shadow_time_info *shadow;
369 u32 local_time_version;
371 cpu = get_cpu();
372 shadow = &per_cpu(shadow_time, cpu);
374 do {
375 unsigned long lost;
377 local_time_version = shadow->version;
378 seq = read_seqbegin(&xtime_lock);
380 usec = get_usec_offset(shadow);
381 lost = jiffies - wall_jiffies;
383 /*
384 * If time_adjust is negative then NTP is slowing the clock
385 * so make sure not to go into next possible interval.
386 * Better to lose some accuracy than have time go backwards..
387 */
388 if (unlikely(time_adjust < 0)) {
389 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
390 usec = min(usec, max_ntp_tick);
392 if (lost)
393 usec += lost * max_ntp_tick;
394 }
395 else if (unlikely(lost))
396 usec += lost * (USEC_PER_SEC / HZ);
398 sec = xtime.tv_sec;
399 usec += (xtime.tv_nsec / NSEC_PER_USEC);
401 nsec = shadow->system_timestamp - processed_system_time;
402 __normalize_time(&sec, &nsec);
403 usec += (long)nsec / NSEC_PER_USEC;
405 if (unlikely(!time_values_up_to_date(cpu))) {
406 /*
407 * We may have blocked for a long time,
408 * rendering our calculations invalid
409 * (e.g. the time delta may have
410 * overflowed). Detect that and recalculate
411 * with fresh values.
412 */
413 get_time_values_from_xen(cpu);
414 continue;
415 }
416 } while (read_seqretry(&xtime_lock, seq) ||
417 (local_time_version != shadow->version));
419 put_cpu();
421 while (usec >= USEC_PER_SEC) {
422 usec -= USEC_PER_SEC;
423 sec++;
424 }
426 tv->tv_sec = sec;
427 tv->tv_usec = usec;
428 }
430 EXPORT_SYMBOL(do_gettimeofday);
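Putting the pieces together, the value returned is effectively xtime + lost-tick correction + (shadow->system_timestamp - processed_system_time) + the TSC-derived offset since the last shadow snapshot: the wall-clock base maintained at tick granularity by the timer interrupt, extended by however far Xen system time and the local TSC have advanced past the last processed tick.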
432 int do_settimeofday(struct timespec *tv)
433 {
434 time_t sec;
435 s64 nsec;
436 unsigned int cpu;
437 struct shadow_time_info *shadow;
438 struct xen_platform_op op;
440 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
441 return -EINVAL;
443 cpu = get_cpu();
444 shadow = &per_cpu(shadow_time, cpu);
446 write_seqlock_irq(&xtime_lock);
448 /*
449 * Ensure we don't get blocked for a long time so that our time delta
450 * overflows. If that were to happen then our shadow time values would
451 * be stale, so we can retry with fresh ones.
452 */
453 for (;;) {
454 nsec = tv->tv_nsec - get_nsec_offset(shadow);
455 if (time_values_up_to_date(cpu))
456 break;
457 get_time_values_from_xen(cpu);
458 }
459 sec = tv->tv_sec;
460 __normalize_time(&sec, &nsec);
462 if (is_initial_xendomain() && !independent_wallclock) {
463 op.cmd = XENPF_settime;
464 op.u.settime.secs = sec;
465 op.u.settime.nsecs = nsec;
466 op.u.settime.system_time = shadow->system_timestamp;
467 HYPERVISOR_platform_op(&op);
468 update_wallclock();
469 } else if (independent_wallclock) {
470 nsec -= shadow->system_timestamp;
471 __normalize_time(&sec, &nsec);
472 __update_wallclock(sec, nsec);
473 }
475 write_sequnlock_irq(&xtime_lock);
477 put_cpu();
479 clock_was_set();
480 return 0;
481 }
483 EXPORT_SYMBOL(do_settimeofday);
485 static void sync_xen_wallclock(unsigned long dummy);
486 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
487 static void sync_xen_wallclock(unsigned long dummy)
488 {
489 time_t sec;
490 s64 nsec;
491 struct xen_platform_op op;
493 if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
494 return;
496 write_seqlock_irq(&xtime_lock);
498 sec = xtime.tv_sec;
499 nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
500 __normalize_time(&sec, &nsec);
502 op.cmd = XENPF_settime;
503 op.u.settime.secs = sec;
504 op.u.settime.nsecs = nsec;
505 op.u.settime.system_time = processed_system_time;
506 HYPERVISOR_platform_op(&op);
508 update_wallclock();
510 write_sequnlock_irq(&xtime_lock);
512 /* Once per minute. */
513 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
514 }
516 static int set_rtc_mmss(unsigned long nowtime)
517 {
518 int retval;
519 unsigned long flags;
521 if (independent_wallclock || !is_initial_xendomain())
522 return 0;
524 /* gets recalled with irq locally disabled */
525 /* XXX - does irqsave resolve this? -johnstul */
526 spin_lock_irqsave(&rtc_lock, flags);
527 if (efi_enabled)
528 retval = efi_set_rtc_mmss(nowtime);
529 else
530 retval = mach_set_rtc_mmss(nowtime);
531 spin_unlock_irqrestore(&rtc_lock, flags);
533 return retval;
534 }
536 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
537 * Note: This function is required to return accurate
538 * time even in the absence of multiple timer ticks.
539 */
540 unsigned long long monotonic_clock(void)
541 {
542 int cpu = get_cpu();
543 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
544 u64 time;
545 u32 local_time_version;
547 do {
548 local_time_version = shadow->version;
549 barrier();
550 time = shadow->system_timestamp + get_nsec_offset(shadow);
551 if (!time_values_up_to_date(cpu))
552 get_time_values_from_xen(cpu);
553 barrier();
554 } while (local_time_version != shadow->version);
556 put_cpu();
558 return time;
559 }
560 EXPORT_SYMBOL(monotonic_clock);
562 #ifdef __x86_64__
563 unsigned long long sched_clock(void)
564 {
565 return monotonic_clock();
566 }
567 #endif
569 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
570 unsigned long profile_pc(struct pt_regs *regs)
571 {
572 unsigned long pc = instruction_pointer(regs);
574 #ifdef __x86_64__
575 /* Assume the lock function has either no stack frame or only a single word.
576 This checks if the address on the stack looks like a kernel text address.
577 There is a small window for false hits, but in that case the tick
578 is just accounted to the spinlock function.
579 Better would be to write these functions in assembler again
580 and check exactly. */
581 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
582 char *v = *(char **)regs->rsp;
583 if ((v >= _stext && v <= _etext) ||
584 (v >= _sinittext && v <= _einittext) ||
585 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
586 return (unsigned long)v;
587 return ((unsigned long *)regs->rsp)[1];
588 }
589 #else
590 if (!user_mode_vm(regs) && in_lock_functions(pc))
591 return *(unsigned long *)(regs->ebp + 4);
592 #endif
594 return pc;
595 }
596 EXPORT_SYMBOL(profile_pc);
597 #endif
599 /*
600 * This is the same as the above, except we _also_ save the current
601 * Time Stamp Counter value at the time of the timer interrupt, so that
602 * we later on can estimate the time of day more exactly.
603 */
604 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
605 {
606 s64 delta, delta_cpu, stolen, blocked;
607 u64 sched_time;
608 int i, cpu = smp_processor_id();
609 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
610 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
612 /*
613 * Here we are in the timer irq handler. We just have irqs locally
614 * disabled but we don't know if the timer_bh is running on the other
615 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
616 * the irq version of write_lock because as just said we have irq
617 * locally disabled. -arca
618 */
619 write_seqlock(&xtime_lock);
621 do {
622 get_time_values_from_xen(cpu);
624 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
625 delta = delta_cpu =
626 shadow->system_timestamp + get_nsec_offset(shadow);
627 delta -= processed_system_time;
628 delta_cpu -= per_cpu(processed_system_time, cpu);
630 /*
631 * Obtain a consistent snapshot of stolen/blocked cycles. We
632 * can use state_entry_time to detect if we get preempted here.
633 */
634 do {
635 sched_time = runstate->state_entry_time;
636 barrier();
637 stolen = runstate->time[RUNSTATE_runnable] +
638 runstate->time[RUNSTATE_offline] -
639 per_cpu(processed_stolen_time, cpu);
640 blocked = runstate->time[RUNSTATE_blocked] -
641 per_cpu(processed_blocked_time, cpu);
642 barrier();
643 } while (sched_time != runstate->state_entry_time);
644 } while (!time_values_up_to_date(cpu));
646 if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
647 unlikely(delta_cpu < -(s64)permitted_clock_jitter))
648 && printk_ratelimit()) {
649 printk("Timer ISR/%d: Time went backwards: "
650 "delta=%lld delta_cpu=%lld shadow=%lld "
651 "off=%lld processed=%lld cpu_processed=%lld\n",
652 cpu, delta, delta_cpu, shadow->system_timestamp,
653 (s64)get_nsec_offset(shadow),
654 processed_system_time,
655 per_cpu(processed_system_time, cpu));
656 for (i = 0; i < num_online_cpus(); i++)
657 printk(" %d: %lld\n", i,
658 per_cpu(processed_system_time, i));
659 }
661 /* System-wide jiffy work. */
662 while (delta >= NS_PER_TICK) {
663 delta -= NS_PER_TICK;
664 processed_system_time += NS_PER_TICK;
665 do_timer(regs);
666 }
668 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
669 update_wallclock();
670 clock_was_set();
671 }
673 write_sequnlock(&xtime_lock);
675 /*
676 * Account stolen ticks.
677 * HACK: Passing NULL to account_steal_time()
678 * ensures that the ticks are accounted as stolen.
679 */
680 if ((stolen > 0) && (delta_cpu > 0)) {
681 delta_cpu -= stolen;
682 if (unlikely(delta_cpu < 0))
683 stolen += delta_cpu; /* clamp local-time progress */
684 do_div(stolen, NS_PER_TICK);
685 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
686 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
687 account_steal_time(NULL, (cputime_t)stolen);
688 }
690 /*
691 * Account blocked ticks.
692 * HACK: Passing idle_task to account_steal_time()
693 * ensures that the ticks are accounted as idle/wait.
694 */
695 if ((blocked > 0) && (delta_cpu > 0)) {
696 delta_cpu -= blocked;
697 if (unlikely(delta_cpu < 0))
698 blocked += delta_cpu; /* clamp local-time progress */
699 do_div(blocked, NS_PER_TICK);
700 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
701 per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
702 account_steal_time(idle_task(cpu), (cputime_t)blocked);
703 }
705 /* Account user/system ticks. */
706 if (delta_cpu > 0) {
707 do_div(delta_cpu, NS_PER_TICK);
708 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
709 if (user_mode_vm(regs))
710 account_user_time(current, (cputime_t)delta_cpu);
711 else
712 account_system_time(current, HARDIRQ_OFFSET,
713 (cputime_t)delta_cpu);
714 }
716 /* Offlined for more than a few seconds? Avoid lockup warnings. */
717 if (stolen > 5*HZ)
718 touch_softlockup_watchdog();
720 /* Local timer processing (see update_process_times()). */
721 run_local_timers();
722 if (rcu_pending(cpu))
723 rcu_check_callbacks(cpu, user_mode_vm(regs));
724 scheduler_tick();
725 run_posix_cpu_timers(current);
726 profile_tick(CPU_PROFILING, regs);
728 return IRQ_HANDLED;
729 }
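As a worked example of the accounting above (assuming HZ=100, so NS_PER_TICK is 10,000,000): if 35 ms of local time has elapsed since this CPU's last processed tick, of which the runstate says 12 ms were stolen and 8 ms were spent blocked, then one whole tick is charged as stolen (12 ms / 10 ms, remainder left pending), zero ticks as idle (8 ms rounds down), and one tick as user/system time out of the remaining 15 ms; the per-CPU processed_system_time advances by 20 ms in total and the leftover nanoseconds are picked up by a later interrupt.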
731 static void init_missing_ticks_accounting(int cpu)
732 {
733 struct vcpu_register_runstate_memory_area area;
734 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
736 memset(runstate, 0, sizeof(*runstate));
738 area.addr.v = runstate;
739 HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
741 per_cpu(processed_blocked_time, cpu) =
742 runstate->time[RUNSTATE_blocked];
743 per_cpu(processed_stolen_time, cpu) =
744 runstate->time[RUNSTATE_runnable] +
745 runstate->time[RUNSTATE_offline];
746 }
748 /* not static: needed by APM */
749 unsigned long get_cmos_time(void)
750 {
751 unsigned long retval;
752 unsigned long flags;
754 spin_lock_irqsave(&rtc_lock, flags);
756 if (efi_enabled)
757 retval = efi_get_time();
758 else
759 retval = mach_get_cmos_time();
761 spin_unlock_irqrestore(&rtc_lock, flags);
763 return retval;
764 }
765 EXPORT_SYMBOL(get_cmos_time);
767 static void sync_cmos_clock(unsigned long dummy);
769 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
771 static void sync_cmos_clock(unsigned long dummy)
772 {
773 struct timeval now, next;
774 int fail = 1;
776 /*
777 * If we have an externally synchronized Linux clock, then update
778 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
779 * called as close as possible to 500 ms before the new second starts.
780 * This code is run on a timer. If the clock is set, that timer
781 * may not expire at the correct time. Thus, we adjust...
782 */
783 if (!ntp_synced())
784 /*
785 * Not synced, exit, do not restart a timer (if one is
786 * running, let it run out).
787 */
788 return;
790 do_gettimeofday(&now);
791 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
792 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
793 fail = set_rtc_mmss(now.tv_sec);
795 next.tv_usec = USEC_AFTER - now.tv_usec;
796 if (next.tv_usec <= 0)
797 next.tv_usec += USEC_PER_SEC;
799 if (!fail)
800 next.tv_sec = 659;
801 else
802 next.tv_sec = 0;
804 if (next.tv_usec >= USEC_PER_SEC) {
805 next.tv_sec++;
806 next.tv_usec -= USEC_PER_SEC;
807 }
808 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
809 }
811 void notify_arch_cmos_timer(void)
812 {
813 mod_timer(&sync_cmos_timer, jiffies + 1);
814 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
815 }
817 static long clock_cmos_diff, sleep_start;
819 static int timer_suspend(struct sys_device *dev, pm_message_t state)
820 {
821 /*
822 * Estimate time zone so that set_time can update the clock
823 */
824 clock_cmos_diff = -get_cmos_time();
825 clock_cmos_diff += get_seconds();
826 sleep_start = get_cmos_time();
827 return 0;
828 }
830 static int timer_resume(struct sys_device *dev)
831 {
832 unsigned long flags;
833 unsigned long sec;
834 unsigned long sleep_length;
836 #ifdef CONFIG_HPET_TIMER
837 if (is_hpet_enabled())
838 hpet_reenable();
839 #endif
840 sec = get_cmos_time() + clock_cmos_diff;
841 sleep_length = (get_cmos_time() - sleep_start) * HZ;
842 write_seqlock_irqsave(&xtime_lock, flags);
843 xtime.tv_sec = sec;
844 xtime.tv_nsec = 0;
845 jiffies_64 += sleep_length;
846 wall_jiffies += sleep_length;
847 write_sequnlock_irqrestore(&xtime_lock, flags);
848 touch_softlockup_watchdog();
849 return 0;
850 }
852 static struct sysdev_class timer_sysclass = {
853 .resume = timer_resume,
854 .suspend = timer_suspend,
855 set_kset_name("timer"),
856 };
859 /* XXX this driverfs stuff should probably go elsewhere later -john */
860 static struct sys_device device_timer = {
861 .id = 0,
862 .cls = &timer_sysclass,
863 };
865 static int time_init_device(void)
866 {
867 int error = sysdev_class_register(&timer_sysclass);
868 if (!error)
869 error = sysdev_register(&device_timer);
870 return error;
871 }
873 device_initcall(time_init_device);
875 #ifdef CONFIG_HPET_TIMER
876 extern void (*late_time_init)(void);
877 /* Duplicate of time_init() below, with hpet_enable part added */
878 static void __init hpet_time_init(void)
879 {
880 xtime.tv_sec = get_cmos_time();
881 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
882 set_normalized_timespec(&wall_to_monotonic,
883 -xtime.tv_sec, -xtime.tv_nsec);
885 if ((hpet_enable() >= 0) && hpet_use_timer) {
886 printk("Using HPET for base-timer\n");
887 }
889 time_init_hook();
890 }
891 #endif
893 /* Dynamically-mapped IRQ. */
894 DEFINE_PER_CPU(int, timer_irq);
896 extern void (*late_time_init)(void);
897 static void setup_cpu0_timer_irq(void)
898 {
899 per_cpu(timer_irq, 0) =
900 bind_virq_to_irqhandler(
901 VIRQ_TIMER,
902 0,
903 timer_interrupt,
904 SA_INTERRUPT,
905 "timer0",
906 NULL);
907 BUG_ON(per_cpu(timer_irq, 0) < 0);
908 }
910 static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
911 .period_ns = NS_PER_TICK
912 };
914 void __init time_init(void)
915 {
916 #ifdef CONFIG_HPET_TIMER
917 if (is_hpet_capable()) {
918 /*
919 * HPET initialization needs to do memory-mapped io. So, let
920 * us do a late initialization after mem_init().
921 */
922 late_time_init = hpet_time_init;
923 return;
924 }
925 #endif
927 HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
928 &xen_set_periodic_tick);
930 get_time_values_from_xen(0);
932 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
933 per_cpu(processed_system_time, 0) = processed_system_time;
934 init_missing_ticks_accounting(0);
936 update_wallclock();
938 init_cpu_khz();
939 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
940 cpu_khz / 1000, cpu_khz % 1000);
942 #if defined(__x86_64__)
943 vxtime.mode = VXTIME_TSC;
944 vxtime.quot = (1000000L << 32) / vxtime_hz;
945 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
946 sync_core();
947 rdtscll(vxtime.last_tsc);
948 #endif
950 /* Cannot request_irq() until kmem is initialised. */
951 late_time_init = setup_cpu0_timer_irq;
952 }
954 /* Convert jiffies to system time. */
955 u64 jiffies_to_st(unsigned long j)
956 {
957 unsigned long seq;
958 long delta;
959 u64 st;
961 do {
962 seq = read_seqbegin(&xtime_lock);
963 delta = j - jiffies;
964 if (delta < 1) {
965 /* Triggers in some wrap-around cases, but that's okay:
966 * we just end up with a shorter timeout. */
967 st = processed_system_time + NS_PER_TICK;
968 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
969 /* Very long timeout means there is no pending timer.
970 * We indicate this to Xen by passing zero timeout. */
971 st = 0;
972 } else {
973 st = processed_system_time + delta * (u64)NS_PER_TICK;
974 }
975 } while (read_seqretry(&xtime_lock, seq));
977 return st;
978 }
979 EXPORT_SYMBOL(jiffies_to_st);
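For example, with HZ=100 a wakeup five jiffies in the future maps to processed_system_time + 50,000,000 ns; a target already in the past (delta < 1, as can happen on wrap-around) degrades to a one-tick timeout; and a delta so large that it reaches the top three bits of a long is taken to mean no timer is pending and is reported to Xen as 0.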
981 /*
982 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
983 * These functions are based on implementations from arch/s390/kernel/time.c
984 */
985 static void stop_hz_timer(void)
986 {
987 struct vcpu_set_singleshot_timer singleshot;
988 unsigned int cpu = smp_processor_id();
989 unsigned long j;
990 int rc;
992 cpu_set(cpu, nohz_cpu_mask);
994 /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
995 /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
996 /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
997 /* stop the hz timer then the cpumasks created for subsequent values */
998 /* of cur in rcu_start_batch are guaranteed to pick up the updated */
999 /* nohz_cpu_mask and so will not depend on this cpu. */
1001 smp_mb();
1003 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
1004 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
1005 (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
1006 cpu_clear(cpu, nohz_cpu_mask);
1007 j = jiffies + 1;
1008 }
1010 singleshot.timeout_abs_ns = jiffies_to_st(j);
1011 singleshot.flags = 0;
1012 rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
1013 #ifdef CONFIG_XEN_COMPAT_030004
1014 if (rc) {
1015 BUG_ON(rc != -ENOSYS);
1016 rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
1017 }
1018 #endif
1019 BUG_ON(rc);
1020 }
1022 static void start_hz_timer(void)
1023 {
1024 cpu_clear(smp_processor_id(), nohz_cpu_mask);
1025 }
1027 void raw_safe_halt(void)
1028 {
1029 stop_hz_timer();
1030 /* Blocking includes an implicit local_irq_enable(). */
1031 HYPERVISOR_block();
1032 start_hz_timer();
1033 }
1034 EXPORT_SYMBOL(raw_safe_halt);
1036 void halt(void)
1037 {
1038 if (irqs_disabled())
1039 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
1040 }
1041 EXPORT_SYMBOL(halt);
1043 /* No locking required. Interrupts are disabled on all CPUs. */
1044 void time_resume(void)
1045 {
1046 unsigned int cpu;
1048 init_cpu_khz();
1050 for_each_online_cpu(cpu) {
1051 HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
1052 &xen_set_periodic_tick);
1053 get_time_values_from_xen(cpu);
1054 per_cpu(processed_system_time, cpu) =
1055 per_cpu(shadow_time, 0).system_timestamp;
1056 init_missing_ticks_accounting(cpu);
1057 }
1059 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
1061 update_wallclock();
1062 }
1064 #ifdef CONFIG_SMP
1065 static char timer_name[NR_CPUS][15];
1067 int local_setup_timer(unsigned int cpu)
1068 {
1069 int seq, irq;
1071 BUG_ON(cpu == 0);
1073 HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
1074 &xen_set_periodic_tick);
1076 do {
1077 seq = read_seqbegin(&xtime_lock);
1078 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1079 per_cpu(processed_system_time, cpu) =
1080 per_cpu(shadow_time, 0).system_timestamp;
1081 init_missing_ticks_accounting(cpu);
1082 } while (read_seqretry(&xtime_lock, seq));
1084 sprintf(timer_name[cpu], "timer%d", cpu);
1085 irq = bind_virq_to_irqhandler(VIRQ_TIMER,
1086 cpu,
1087 timer_interrupt,
1088 SA_INTERRUPT,
1089 timer_name[cpu],
1090 NULL);
1091 if (irq < 0)
1092 return irq;
1093 per_cpu(timer_irq, cpu) = irq;
1095 return 0;
1096 }
1098 void local_teardown_timer(unsigned int cpu)
1099 {
1100 BUG_ON(cpu == 0);
1101 unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1102 }
1103 #endif
1105 /*
1106 * /proc/sys/xen: This really belongs in another file. It can stay here for
1107 * now however.
1108 */
1109 static ctl_table xen_subtable[] = {
1110 {
1111 .ctl_name = 1,
1112 .procname = "independent_wallclock",
1113 .data = &independent_wallclock,
1114 .maxlen = sizeof(independent_wallclock),
1115 .mode = 0644,
1116 .proc_handler = proc_dointvec
1117 },
1118 {
1119 .ctl_name = 2,
1120 .procname = "permitted_clock_jitter",
1121 .data = &permitted_clock_jitter,
1122 .maxlen = sizeof(permitted_clock_jitter),
1123 .mode = 0644,
1124 .proc_handler = proc_doulongvec_minmax
1125 },
1126 { 0 }
1127 };
1128 static ctl_table xen_table[] = {
1129 {
1130 .ctl_name = 123,
1131 .procname = "xen",
1132 .mode = 0555,
1133 .child = xen_subtable},
1134 { 0 }
1135 };
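Once registered, these entries appear as /proc/sys/xen/independent_wallclock and /proc/sys/xen/permitted_clock_jitter, so the settings controlled by the boot parameters earlier in this file can also be changed at run time, e.g. by writing "1" to the former to let the guest keep its own wall clock.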
1136 static int __init xen_sysctl_init(void)
1137 {
1138 (void)register_sysctl_table(xen_table, 0);
1139 return 0;
1140 }
1141 __initcall(xen_sysctl_init);