ia64/xen-unstable

view linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 13976:e253a63651bf

linux: use user_mode_vm() in place of user_mode() where necessary.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Thu Feb 15 13:51:25 2007 +0000 (2007-02-15)
parents 3adf00179a63
children 168030c8e0a9
line source
1 /*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because it was not accounting lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
65 #include "mach_time.h"
67 #include <linux/timex.h>
69 #include <asm/hpet.h>
71 #include <asm/arch_hooks.h>
73 #include <xen/evtchn.h>
74 #include <xen/interface/vcpu.h>
76 #if defined (__i386__)
77 #include <asm/i8259.h>
78 #endif
80 int pit_latch_buggy; /* extern */
82 #if defined(__x86_64__)
83 unsigned long vxtime_hz = PIT_TICK_RATE;
84 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
85 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
86 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
87 struct timespec __xtime __section_xtime;
88 struct timezone __sys_tz __section_sys_tz;
89 #endif
91 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
92 EXPORT_SYMBOL(cpu_khz);
94 extern unsigned long wall_jiffies;
96 DEFINE_SPINLOCK(rtc_lock);
97 EXPORT_SYMBOL(rtc_lock);
99 extern struct init_timer_opts timer_tsc_init;
100 extern struct timer_opts timer_tsc;
101 #define timer_none timer_tsc
103 /* These are periodically updated in shared_info, and then copied here. */
104 struct shadow_time_info {
105 u64 tsc_timestamp; /* TSC at last update of time vals. */
106 u64 system_timestamp; /* Time, in nanosecs, since boot. */
107 u32 tsc_to_nsec_mul;
108 u32 tsc_to_usec_mul;
109 int tsc_shift;
110 u32 version;
111 };
112 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
113 static struct timespec shadow_tv;
114 static u32 shadow_tv_version;
116 /* Keep track of last time we did processing/updating of jiffies and xtime. */
117 static u64 processed_system_time; /* System time (ns) at last processing. */
118 static DEFINE_PER_CPU(u64, processed_system_time);
120 /* How much CPU time was spent blocked and how much was 'stolen'? */
121 static DEFINE_PER_CPU(u64, processed_stolen_time);
122 static DEFINE_PER_CPU(u64, processed_blocked_time);
124 /* Current runstate of each CPU (updated automatically by the hypervisor). */
125 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
127 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
128 #define NS_PER_TICK (1000000000LL/HZ)
130 static inline void __normalize_time(time_t *sec, s64 *nsec)
131 {
132 while (*nsec >= NSEC_PER_SEC) {
133 (*nsec) -= NSEC_PER_SEC;
134 (*sec)++;
135 }
136 while (*nsec < 0) {
137 (*nsec) += NSEC_PER_SEC;
138 (*sec)--;
139 }
140 }
142 /* Does this guest OS track Xen time, or set its wall clock independently? */
143 static int independent_wallclock = 0;
144 static int __init __independent_wallclock(char *str)
145 {
146 independent_wallclock = 1;
147 return 1;
148 }
149 __setup("independent_wallclock", __independent_wallclock);
151 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
152 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
153 static int __init __permitted_clock_jitter(char *str)
154 {
155 permitted_clock_jitter = simple_strtoul(str, NULL, 0);
156 return 1;
157 }
158 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
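Both knobs can also be set at boot time: appending, for example, "independent_wallclock permitted_clock_jitter=5000000" to the guest kernel command line enables the independent wallclock and lowers the jitter warning threshold to 5 ms (the numeric value here is only an illustration; the default above is 10 ms).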
160 #if 0
161 static void delay_tsc(unsigned long loops)
162 {
163 unsigned long bclock, now;
165 rdtscl(bclock);
166 do {
167 rep_nop();
168 rdtscl(now);
169 } while ((now - bclock) < loops);
170 }
172 struct timer_opts timer_tsc = {
173 .name = "tsc",
174 .delay = delay_tsc,
175 };
176 #endif
178 /*
179 * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
180 * yielding a 64-bit result.
181 */
182 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
183 {
184 u64 product;
185 #ifdef __i386__
186 u32 tmp1, tmp2;
187 #endif
189 if (shift < 0)
190 delta >>= -shift;
191 else
192 delta <<= shift;
194 #ifdef __i386__
195 __asm__ (
196 "mul %5 ; "
197 "mov %4,%%eax ; "
198 "mov %%edx,%4 ; "
199 "mul %5 ; "
200 "xor %5,%5 ; "
201 "add %4,%%eax ; "
202 "adc %5,%%edx ; "
203 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
204 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
205 #else
206 __asm__ (
207 "mul %%rdx ; shrd $32,%%rdx,%%rax"
208 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
209 #endif
211 return product;
212 }
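For reference, the assembly above computes ((delta << shift) * mul_frac) >> 32 without needing a 128-bit type. A minimal portable C sketch of the same arithmetic (the function name here is illustrative, not part of this file) splits the 64x32 multiply into two 32x32 halves:

#include <stdint.h>

/* Same fixed-point scaling as scale_delta() above, in portable C. */
static uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac, int shift)
{
	uint64_t lo, hi;

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	lo = (delta & 0xffffffffULL) * mul_frac;  /* low 32 bits of delta  */
	hi = (delta >> 32) * mul_frac;            /* high 32 bits of delta */

	/* Keep bits 32..95 of the 96-bit product, i.e. (delta * mul) >> 32. */
	return hi + (lo >> 32);
}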
214 #if 0 /* defined (__i386__) */
215 int read_current_timer(unsigned long *timer_val)
216 {
217 rdtscl(*timer_val);
218 return 0;
219 }
220 #endif
222 void init_cpu_khz(void)
223 {
224 u64 __cpu_khz = 1000000ULL << 32;
225 struct vcpu_time_info *info;
226 info = &HYPERVISOR_shared_info->vcpu_info[0].time;
227 do_div(__cpu_khz, info->tsc_to_system_mul);
228 if (info->tsc_shift < 0)
229 cpu_khz = __cpu_khz << -info->tsc_shift;
230 else
231 cpu_khz = __cpu_khz >> info->tsc_shift;
232 }
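Since Xen defines system time so that nanoseconds = tsc_delta * tsc_to_system_mul / 2^32 (adjusted by tsc_shift), the TSC frequency in kHz is 10^6 * 2^32 / tsc_to_system_mul, shifted the opposite way, which is exactly what the do_div() above computes. A small stand-alone check with a made-up multiplier (assumed for illustration, not taken from a real hypervisor):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example value Xen might export for a 2.4 GHz TSC with shift 0:
	 * mul = (10^9 / 2.4e9) * 2^32.  Hypothetical, for illustration only. */
	uint32_t tsc_to_system_mul = 1789569707u;
	int tsc_shift = 0;

	uint64_t khz = (1000000ULL << 32) / tsc_to_system_mul;
	if (tsc_shift < 0)
		khz <<= -tsc_shift;
	else
		khz >>= tsc_shift;

	printf("cpu_khz ~= %llu\n", (unsigned long long)khz);  /* ~2400000 */
	return 0;
}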
234 static u64 get_nsec_offset(struct shadow_time_info *shadow)
235 {
236 u64 now, delta;
237 rdtscll(now);
238 delta = now - shadow->tsc_timestamp;
239 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
240 }
242 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
243 {
244 u64 now, delta;
245 rdtscll(now);
246 delta = now - shadow->tsc_timestamp;
247 return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
248 }
250 static void __update_wallclock(time_t sec, long nsec)
251 {
252 long wtm_nsec, xtime_nsec;
253 time_t wtm_sec, xtime_sec;
254 u64 tmp, wc_nsec;
256 /* Adjust wall-clock time base based on wall_jiffies ticks. */
257 wc_nsec = processed_system_time;
258 wc_nsec += sec * (u64)NSEC_PER_SEC;
259 wc_nsec += nsec;
260 wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
262 /* Split wallclock base into seconds and nanoseconds. */
263 tmp = wc_nsec;
264 xtime_nsec = do_div(tmp, 1000000000);
265 xtime_sec = (time_t)tmp;
267 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
268 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
270 set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
271 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
273 ntp_clear();
274 }
276 static void update_wallclock(void)
277 {
278 shared_info_t *s = HYPERVISOR_shared_info;
280 do {
281 shadow_tv_version = s->wc_version;
282 rmb();
283 shadow_tv.tv_sec = s->wc_sec;
284 shadow_tv.tv_nsec = s->wc_nsec;
285 rmb();
286 } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
288 if (!independent_wallclock)
289 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
290 }
292 /*
293 * Reads a consistent set of time-base values from Xen, into a shadow data
294 * area.
295 */
296 static void get_time_values_from_xen(void)
297 {
298 shared_info_t *s = HYPERVISOR_shared_info;
299 struct vcpu_time_info *src;
300 struct shadow_time_info *dst;
302 src = &s->vcpu_info[smp_processor_id()].time;
303 dst = &per_cpu(shadow_time, smp_processor_id());
305 do {
306 dst->version = src->version;
307 rmb();
308 dst->tsc_timestamp = src->tsc_timestamp;
309 dst->system_timestamp = src->system_time;
310 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
311 dst->tsc_shift = src->tsc_shift;
312 rmb();
313 } while ((src->version & 1) | (dst->version ^ src->version));
315 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
316 }
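The retry loop above is a producer/consumer version protocol rather than a lock: Xen makes the version odd while it updates the record and even once it is stable, so the reader retries whenever it sees an odd version or the version changed while it was copying. A generic sketch of the same idea, assuming made-up type and helper names:

#include <stdint.h>

struct sample {
	uint32_t version;   /* odd while the writer is mid-update */
	uint64_t a, b;
};

static void read_consistent(volatile struct sample *src, struct sample *dst)
{
	do {
		dst->version = src->version;
		__sync_synchronize();            /* acts like rmb() here */
		dst->a = src->a;
		dst->b = src->b;
		__sync_synchronize();
	} while ((src->version & 1) | (dst->version ^ src->version));
}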
318 static inline int time_values_up_to_date(int cpu)
319 {
320 struct vcpu_time_info *src;
321 struct shadow_time_info *dst;
323 src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
324 dst = &per_cpu(shadow_time, cpu);
326 rmb();
327 return (dst->version == src->version);
328 }
330 /*
331 * This is a special lock that is owned by the CPU and holds the index
332 * register we are working with. It is required for NMI access to the
333 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
334 */
335 volatile unsigned long cmos_lock = 0;
336 EXPORT_SYMBOL(cmos_lock);
338 /* Routines for accessing the CMOS RAM/RTC. */
339 unsigned char rtc_cmos_read(unsigned char addr)
340 {
341 unsigned char val;
342 lock_cmos_prefix(addr);
343 outb_p(addr, RTC_PORT(0));
344 val = inb_p(RTC_PORT(1));
345 lock_cmos_suffix(addr);
346 return val;
347 }
348 EXPORT_SYMBOL(rtc_cmos_read);
350 void rtc_cmos_write(unsigned char val, unsigned char addr)
351 {
352 lock_cmos_prefix(addr);
353 outb_p(addr, RTC_PORT(0));
354 outb_p(val, RTC_PORT(1));
355 lock_cmos_suffix(addr);
356 }
357 EXPORT_SYMBOL(rtc_cmos_write);
359 /*
360 * This version of gettimeofday has microsecond resolution
361 * and better than microsecond precision on fast x86 machines with TSC.
362 */
363 void do_gettimeofday(struct timeval *tv)
364 {
365 unsigned long seq;
366 unsigned long usec, sec;
367 unsigned long max_ntp_tick;
368 s64 nsec;
369 unsigned int cpu;
370 struct shadow_time_info *shadow;
371 u32 local_time_version;
373 cpu = get_cpu();
374 shadow = &per_cpu(shadow_time, cpu);
376 do {
377 unsigned long lost;
379 local_time_version = shadow->version;
380 seq = read_seqbegin(&xtime_lock);
382 usec = get_usec_offset(shadow);
383 lost = jiffies - wall_jiffies;
385 /*
386 * If time_adjust is negative then NTP is slowing the clock
387 * so make sure not to go into next possible interval.
388 * Better to lose some accuracy than have time go backwards..
389 */
390 if (unlikely(time_adjust < 0)) {
391 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
392 usec = min(usec, max_ntp_tick);
394 if (lost)
395 usec += lost * max_ntp_tick;
396 }
397 else if (unlikely(lost))
398 usec += lost * (USEC_PER_SEC / HZ);
400 sec = xtime.tv_sec;
401 usec += (xtime.tv_nsec / NSEC_PER_USEC);
403 nsec = shadow->system_timestamp - processed_system_time;
404 __normalize_time(&sec, &nsec);
405 usec += (long)nsec / NSEC_PER_USEC;
407 if (unlikely(!time_values_up_to_date(cpu))) {
408 /*
409 * We may have blocked for a long time,
410 * rendering our calculations invalid
411 * (e.g. the time delta may have
412 * overflowed). Detect that and recalculate
413 * with fresh values.
414 */
415 get_time_values_from_xen();
416 continue;
417 }
418 } while (read_seqretry(&xtime_lock, seq) ||
419 (local_time_version != shadow->version));
421 put_cpu();
423 while (usec >= USEC_PER_SEC) {
424 usec -= USEC_PER_SEC;
425 sec++;
426 }
428 tv->tv_sec = sec;
429 tv->tv_usec = usec;
430 }
432 EXPORT_SYMBOL(do_gettimeofday);
434 int do_settimeofday(struct timespec *tv)
435 {
436 time_t sec;
437 s64 nsec;
438 unsigned int cpu;
439 struct shadow_time_info *shadow;
440 dom0_op_t op;
442 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
443 return -EINVAL;
445 cpu = get_cpu();
446 shadow = &per_cpu(shadow_time, cpu);
448 write_seqlock_irq(&xtime_lock);
450 /*
451 * Ensure we haven't been blocked for so long that our time delta
452 * overflows. If that did happen, our shadow time values would be
453 * stale, so retry with fresh ones.
454 */
455 for (;;) {
456 nsec = tv->tv_nsec - get_nsec_offset(shadow);
457 if (time_values_up_to_date(cpu))
458 break;
459 get_time_values_from_xen();
460 }
461 sec = tv->tv_sec;
462 __normalize_time(&sec, &nsec);
464 if (is_initial_xendomain() && !independent_wallclock) {
465 op.cmd = DOM0_SETTIME;
466 op.u.settime.secs = sec;
467 op.u.settime.nsecs = nsec;
468 op.u.settime.system_time = shadow->system_timestamp;
469 HYPERVISOR_dom0_op(&op);
470 update_wallclock();
471 } else if (independent_wallclock) {
472 nsec -= shadow->system_timestamp;
473 __normalize_time(&sec, &nsec);
474 __update_wallclock(sec, nsec);
475 }
477 write_sequnlock_irq(&xtime_lock);
479 put_cpu();
481 clock_was_set();
482 return 0;
483 }
485 EXPORT_SYMBOL(do_settimeofday);
487 static void sync_xen_wallclock(unsigned long dummy);
488 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
489 static void sync_xen_wallclock(unsigned long dummy)
490 {
491 time_t sec;
492 s64 nsec;
493 dom0_op_t op;
495 if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
496 return;
498 write_seqlock_irq(&xtime_lock);
500 sec = xtime.tv_sec;
501 nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
502 __normalize_time(&sec, &nsec);
504 op.cmd = DOM0_SETTIME;
505 op.u.settime.secs = sec;
506 op.u.settime.nsecs = nsec;
507 op.u.settime.system_time = processed_system_time;
508 HYPERVISOR_dom0_op(&op);
510 update_wallclock();
512 write_sequnlock_irq(&xtime_lock);
514 /* Once per minute. */
515 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
516 }
518 static int set_rtc_mmss(unsigned long nowtime)
519 {
520 int retval;
521 unsigned long flags;
523 if (independent_wallclock || !is_initial_xendomain())
524 return 0;
526 /* gets recalled with irq locally disabled */
527 /* XXX - does irqsave resolve this? -johnstul */
528 spin_lock_irqsave(&rtc_lock, flags);
529 if (efi_enabled)
530 retval = efi_set_rtc_mmss(nowtime);
531 else
532 retval = mach_set_rtc_mmss(nowtime);
533 spin_unlock_irqrestore(&rtc_lock, flags);
535 return retval;
536 }
538 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
539 * Note: This function is required to return accurate
540 * time even in the absence of multiple timer ticks.
541 */
542 unsigned long long monotonic_clock(void)
543 {
544 int cpu = get_cpu();
545 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
546 u64 time;
547 u32 local_time_version;
549 do {
550 local_time_version = shadow->version;
551 barrier();
552 time = shadow->system_timestamp + get_nsec_offset(shadow);
553 if (!time_values_up_to_date(cpu))
554 get_time_values_from_xen();
555 barrier();
556 } while (local_time_version != shadow->version);
558 put_cpu();
560 return time;
561 }
562 EXPORT_SYMBOL(monotonic_clock);
564 #ifdef __x86_64__
565 unsigned long long sched_clock(void)
566 {
567 return monotonic_clock();
568 }
569 #endif
571 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
572 unsigned long profile_pc(struct pt_regs *regs)
573 {
574 unsigned long pc = instruction_pointer(regs);
576 #ifdef __x86_64__
577 /* Assume the lock function has either no stack frame or only a single word.
578 This checks if the address on the stack looks like a kernel text address.
579 There is a small window for false hits, but in that case the tick
580 is just accounted to the spinlock function.
581 Better would be to write these functions in assembler again
582 and check exactly. */
583 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
584 char *v = *(char **)regs->rsp;
585 if ((v >= _stext && v <= _etext) ||
586 (v >= _sinittext && v <= _einittext) ||
587 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
588 return (unsigned long)v;
589 return ((unsigned long *)regs->rsp)[1];
590 }
591 #else
592 if (!user_mode_vm(regs) && in_lock_functions(pc))
593 return *(unsigned long *)(regs->ebp + 4);
594 #endif
596 return pc;
597 }
598 EXPORT_SYMBOL(profile_pc);
599 #endif
601 /*
602 * This is the same as the above, except we _also_ save the current
603 * Time Stamp Counter value at the time of the timer interrupt, so that
604 * we later on can estimate the time of day more exactly.
605 */
606 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
607 {
608 s64 delta, delta_cpu, stolen, blocked;
609 u64 sched_time;
610 int i, cpu = smp_processor_id();
611 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
612 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
614 /*
615 * Here we are in the timer irq handler. We just have irqs locally
616 * disabled but we don't know if the timer_bh is running on the other
617 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
618 * the irq version of write_lock because as just said we have irq
619 * locally disabled. -arca
620 */
621 write_seqlock(&xtime_lock);
623 do {
624 get_time_values_from_xen();
626 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
627 delta = delta_cpu =
628 shadow->system_timestamp + get_nsec_offset(shadow);
629 delta -= processed_system_time;
630 delta_cpu -= per_cpu(processed_system_time, cpu);
632 /*
633 * Obtain a consistent snapshot of stolen/blocked cycles. We
634 * can use state_entry_time to detect if we get preempted here.
635 */
636 do {
637 sched_time = runstate->state_entry_time;
638 barrier();
639 stolen = runstate->time[RUNSTATE_runnable] +
640 runstate->time[RUNSTATE_offline] -
641 per_cpu(processed_stolen_time, cpu);
642 blocked = runstate->time[RUNSTATE_blocked] -
643 per_cpu(processed_blocked_time, cpu);
644 barrier();
645 } while (sched_time != runstate->state_entry_time);
646 } while (!time_values_up_to_date(cpu));
648 if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
649 unlikely(delta_cpu < -(s64)permitted_clock_jitter))
650 && printk_ratelimit()) {
651 printk("Timer ISR/%d: Time went backwards: "
652 "delta=%lld delta_cpu=%lld shadow=%lld "
653 "off=%lld processed=%lld cpu_processed=%lld\n",
654 cpu, delta, delta_cpu, shadow->system_timestamp,
655 (s64)get_nsec_offset(shadow),
656 processed_system_time,
657 per_cpu(processed_system_time, cpu));
658 for (i = 0; i < num_online_cpus(); i++)
659 printk(" %d: %lld\n", i,
660 per_cpu(processed_system_time, i));
661 }
663 /* System-wide jiffy work. */
664 while (delta >= NS_PER_TICK) {
665 delta -= NS_PER_TICK;
666 processed_system_time += NS_PER_TICK;
667 do_timer(regs);
668 }
670 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
671 update_wallclock();
672 clock_was_set();
673 }
675 write_sequnlock(&xtime_lock);
677 /*
678 * Account stolen ticks.
679 * HACK: Passing NULL to account_steal_time()
680 * ensures that the ticks are accounted as stolen.
681 */
682 if ((stolen > 0) && (delta_cpu > 0)) {
683 delta_cpu -= stolen;
684 if (unlikely(delta_cpu < 0))
685 stolen += delta_cpu; /* clamp local-time progress */
686 do_div(stolen, NS_PER_TICK);
687 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
688 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
689 account_steal_time(NULL, (cputime_t)stolen);
690 }
692 /*
693 * Account blocked ticks.
694 * HACK: Passing idle_task to account_steal_time()
695 * ensures that the ticks are accounted as idle/wait.
696 */
697 if ((blocked > 0) && (delta_cpu > 0)) {
698 delta_cpu -= blocked;
699 if (unlikely(delta_cpu < 0))
700 blocked += delta_cpu; /* clamp local-time progress */
701 do_div(blocked, NS_PER_TICK);
702 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
703 per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
704 account_steal_time(idle_task(cpu), (cputime_t)blocked);
705 }
707 /* Account user/system ticks. */
708 if (delta_cpu > 0) {
709 do_div(delta_cpu, NS_PER_TICK);
710 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
711 if (user_mode_vm(regs))
712 account_user_time(current, (cputime_t)delta_cpu);
713 else
714 account_system_time(current, HARDIRQ_OFFSET,
715 (cputime_t)delta_cpu);
716 }
718 /* Offlined for more than a few seconds? Avoid lockup warnings. */
719 if (stolen > 5*HZ)
720 touch_softlockup_watchdog();
722 /* Local timer processing (see update_process_times()). */
723 run_local_timers();
724 if (rcu_pending(cpu))
725 rcu_check_callbacks(cpu, user_mode_vm(regs));
726 scheduler_tick();
727 run_posix_cpu_timers(current);
728 profile_tick(CPU_PROFILING, regs);
730 return IRQ_HANDLED;
731 }
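In the handler above, the elapsed local time delta_cpu is consumed in three slices: stolen time first, then blocked time, then whatever remains is charged as user or system ticks, with each slice clamped so the total never exceeds what actually elapsed locally. A minimal sketch of that split, assuming HZ == 100 (the helper and macro names are hypothetical):

#include <stdint.h>

#define NS_PER_TICK_EXAMPLE (1000000000LL / 100)    /* HZ == 100 assumed */

static void split_ticks(int64_t delta_cpu, int64_t stolen, int64_t blocked,
			int64_t *stolen_ticks, int64_t *blocked_ticks,
			int64_t *other_ticks)
{
	*stolen_ticks = *blocked_ticks = *other_ticks = 0;

	if (stolen > 0 && delta_cpu > 0) {
		delta_cpu -= stolen;
		if (delta_cpu < 0)
			stolen += delta_cpu;     /* clamp to local-time progress */
		*stolen_ticks = stolen / NS_PER_TICK_EXAMPLE;
	}

	if (blocked > 0 && delta_cpu > 0) {
		delta_cpu -= blocked;
		if (delta_cpu < 0)
			blocked += delta_cpu;    /* clamp to local-time progress */
		*blocked_ticks = blocked / NS_PER_TICK_EXAMPLE;
	}

	if (delta_cpu > 0)
		*other_ticks = delta_cpu / NS_PER_TICK_EXAMPLE;  /* user/system */
}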
733 static void init_missing_ticks_accounting(int cpu)
734 {
735 struct vcpu_register_runstate_memory_area area;
736 struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
738 memset(runstate, 0, sizeof(*runstate));
740 area.addr.v = runstate;
741 HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
743 per_cpu(processed_blocked_time, cpu) =
744 runstate->time[RUNSTATE_blocked];
745 per_cpu(processed_stolen_time, cpu) =
746 runstate->time[RUNSTATE_runnable] +
747 runstate->time[RUNSTATE_offline];
748 }
750 /* not static: needed by APM */
751 unsigned long get_cmos_time(void)
752 {
753 unsigned long retval;
754 unsigned long flags;
756 spin_lock_irqsave(&rtc_lock, flags);
758 if (efi_enabled)
759 retval = efi_get_time();
760 else
761 retval = mach_get_cmos_time();
763 spin_unlock_irqrestore(&rtc_lock, flags);
765 return retval;
766 }
767 EXPORT_SYMBOL(get_cmos_time);
769 static void sync_cmos_clock(unsigned long dummy);
771 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
773 static void sync_cmos_clock(unsigned long dummy)
774 {
775 struct timeval now, next;
776 int fail = 1;
778 /*
779 * If we have an externally synchronized Linux clock, then update
780 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
781 * called as close as possible to 500 ms before the new second starts.
782 * This code is run on a timer. If the clock is set, that timer
783 * may not expire at the correct time. Thus, we adjust...
784 */
785 if (!ntp_synced())
786 /*
787 * Not synced, exit, do not restart a timer (if one is
788 * running, let it run out).
789 */
790 return;
792 do_gettimeofday(&now);
793 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
794 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
795 fail = set_rtc_mmss(now.tv_sec);
797 next.tv_usec = USEC_AFTER - now.tv_usec;
798 if (next.tv_usec <= 0)
799 next.tv_usec += USEC_PER_SEC;
801 if (!fail)
802 next.tv_sec = 659;
803 else
804 next.tv_sec = 0;
806 if (next.tv_usec >= USEC_PER_SEC) {
807 next.tv_sec++;
808 next.tv_usec -= USEC_PER_SEC;
809 }
810 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
811 }
813 void notify_arch_cmos_timer(void)
814 {
815 mod_timer(&sync_cmos_timer, jiffies + 1);
816 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
817 }
819 static long clock_cmos_diff, sleep_start;
821 static int timer_suspend(struct sys_device *dev, pm_message_t state)
822 {
823 /*
824 * Estimate time zone so that set_time can update the clock
825 */
826 clock_cmos_diff = -get_cmos_time();
827 clock_cmos_diff += get_seconds();
828 sleep_start = get_cmos_time();
829 return 0;
830 }
832 static int timer_resume(struct sys_device *dev)
833 {
834 unsigned long flags;
835 unsigned long sec;
836 unsigned long sleep_length;
838 #ifdef CONFIG_HPET_TIMER
839 if (is_hpet_enabled())
840 hpet_reenable();
841 #endif
842 sec = get_cmos_time() + clock_cmos_diff;
843 sleep_length = (get_cmos_time() - sleep_start) * HZ;
844 write_seqlock_irqsave(&xtime_lock, flags);
845 xtime.tv_sec = sec;
846 xtime.tv_nsec = 0;
847 jiffies_64 += sleep_length;
848 wall_jiffies += sleep_length;
849 write_sequnlock_irqrestore(&xtime_lock, flags);
850 touch_softlockup_watchdog();
851 return 0;
852 }
854 static struct sysdev_class timer_sysclass = {
855 .resume = timer_resume,
856 .suspend = timer_suspend,
857 set_kset_name("timer"),
858 };
861 /* XXX this driverfs stuff should probably go elsewhere later -john */
862 static struct sys_device device_timer = {
863 .id = 0,
864 .cls = &timer_sysclass,
865 };
867 static int time_init_device(void)
868 {
869 int error = sysdev_class_register(&timer_sysclass);
870 if (!error)
871 error = sysdev_register(&device_timer);
872 return error;
873 }
875 device_initcall(time_init_device);
877 #ifdef CONFIG_HPET_TIMER
878 extern void (*late_time_init)(void);
879 /* Duplicate of time_init() below, with hpet_enable part added */
880 static void __init hpet_time_init(void)
881 {
882 xtime.tv_sec = get_cmos_time();
883 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
884 set_normalized_timespec(&wall_to_monotonic,
885 -xtime.tv_sec, -xtime.tv_nsec);
887 if ((hpet_enable() >= 0) && hpet_use_timer) {
888 printk("Using HPET for base-timer\n");
889 }
891 time_init_hook();
892 }
893 #endif
895 /* Dynamically-mapped IRQ. */
896 DEFINE_PER_CPU(int, timer_irq);
898 extern void (*late_time_init)(void);
899 static void setup_cpu0_timer_irq(void)
900 {
901 per_cpu(timer_irq, 0) =
902 bind_virq_to_irqhandler(
903 VIRQ_TIMER,
904 0,
905 timer_interrupt,
906 SA_INTERRUPT,
907 "timer0",
908 NULL);
909 BUG_ON(per_cpu(timer_irq, 0) < 0);
910 }
912 void __init time_init(void)
913 {
914 #ifdef CONFIG_HPET_TIMER
915 if (is_hpet_capable()) {
916 /*
917 * HPET initialization needs to do memory-mapped io. So, let
918 * us do a late initialization after mem_init().
919 */
920 late_time_init = hpet_time_init;
921 return;
922 }
923 #endif
924 get_time_values_from_xen();
926 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
927 per_cpu(processed_system_time, 0) = processed_system_time;
928 init_missing_ticks_accounting(0);
930 update_wallclock();
932 init_cpu_khz();
933 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
934 cpu_khz / 1000, cpu_khz % 1000);
936 #if defined(__x86_64__)
937 vxtime.mode = VXTIME_TSC;
938 vxtime.quot = (1000000L << 32) / vxtime_hz;
939 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
940 sync_core();
941 rdtscll(vxtime.last_tsc);
942 #endif
944 /* Cannot request_irq() until kmem is initialised. */
945 late_time_init = setup_cpu0_timer_irq;
946 }
948 /* Convert jiffies to system time. */
949 u64 jiffies_to_st(unsigned long j)
950 {
951 unsigned long seq;
952 long delta;
953 u64 st;
955 do {
956 seq = read_seqbegin(&xtime_lock);
957 delta = j - jiffies;
958 if (delta < 1) {
959 /* Triggers in some wrap-around cases, but that's okay:
960 * we just end up with a shorter timeout. */
961 st = processed_system_time + NS_PER_TICK;
962 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
963 /* Very long timeout means there is no pending timer.
964 * We indicate this to Xen by passing zero timeout. */
965 st = 0;
966 } else {
967 st = processed_system_time + delta * (u64)NS_PER_TICK;
968 }
969 } while (read_seqretry(&xtime_lock, seq));
971 return st;
972 }
973 EXPORT_SYMBOL(jiffies_to_st);
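Concretely, with HZ of 100 (so one tick is 10 ms) the three branches above behave as follows (values are illustrative only):

j == jiffies + 50      ->  processed_system_time + 50 * 10,000,000 ns
j <= jiffies (past)    ->  processed_system_time + one tick (fire soon)
j very far in future   ->  0, which tells Xen not to program a timeout at all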
975 /*
976 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
977 * These functions are based on implementations from arch/s390/kernel/time.c
978 */
979 static void stop_hz_timer(void)
980 {
981 unsigned int cpu = smp_processor_id();
982 unsigned long j;
984 cpu_set(cpu, nohz_cpu_mask);
986 /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
987 /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
988 /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
989 /* stop the hz timer then the cpumasks created for subsequent values */
990 /* of cur in rcu_start_batch are guaranteed to pick up the updated */
991 /* nohz_cpu_mask and so will not depend on this cpu. */
993 smp_mb();
995 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
996 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
997 (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
998 cpu_clear(cpu, nohz_cpu_mask);
999 j = jiffies + 1;
1000 }
1002 if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
1003 BUG();
1004 }
1006 static void start_hz_timer(void)
1007 {
1008 cpu_clear(smp_processor_id(), nohz_cpu_mask);
1009 }
1011 void raw_safe_halt(void)
1012 {
1013 stop_hz_timer();
1014 /* Blocking includes an implicit local_irq_enable(). */
1015 HYPERVISOR_block();
1016 start_hz_timer();
1017 }
1018 EXPORT_SYMBOL(raw_safe_halt);
1020 void halt(void)
1021 {
1022 if (irqs_disabled())
1023 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
1024 }
1025 EXPORT_SYMBOL(halt);
1027 /* No locking required. We are only CPU running, and interrupts are off. */
1028 void time_resume(void)
1029 {
1030 init_cpu_khz();
1032 get_time_values_from_xen();
1034 processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
1035 per_cpu(processed_system_time, 0) = processed_system_time;
1036 init_missing_ticks_accounting(0);
1038 update_wallclock();
1039 }
1041 #ifdef CONFIG_SMP
1042 static char timer_name[NR_CPUS][15];
1044 int local_setup_timer(unsigned int cpu)
1045 {
1046 int seq, irq;
1048 BUG_ON(cpu == 0);
1050 do {
1051 seq = read_seqbegin(&xtime_lock);
1052 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1053 per_cpu(processed_system_time, cpu) =
1054 per_cpu(shadow_time, 0).system_timestamp;
1055 init_missing_ticks_accounting(cpu);
1056 } while (read_seqretry(&xtime_lock, seq));
1058 sprintf(timer_name[cpu], "timer%d", cpu);
1059 irq = bind_virq_to_irqhandler(VIRQ_TIMER,
1060 cpu,
1061 timer_interrupt,
1062 SA_INTERRUPT,
1063 timer_name[cpu],
1064 NULL);
1065 if (irq < 0)
1066 return irq;
1067 per_cpu(timer_irq, cpu) = irq;
1069 return 0;
1070 }
1072 void local_teardown_timer(unsigned int cpu)
1073 {
1074 BUG_ON(cpu == 0);
1075 unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1076 }
1077 #endif
1079 /*
1080 * /proc/sys/xen: This really belongs in another file. It can stay here for
1081 * now however.
1082 */
1083 static ctl_table xen_subtable[] = {
1084 {
1085 .ctl_name = 1,
1086 .procname = "independent_wallclock",
1087 .data = &independent_wallclock,
1088 .maxlen = sizeof(independent_wallclock),
1089 .mode = 0644,
1090 .proc_handler = proc_dointvec
1091 },
1092 {
1093 .ctl_name = 2,
1094 .procname = "permitted_clock_jitter",
1095 .data = &permitted_clock_jitter,
1096 .maxlen = sizeof(permitted_clock_jitter),
1097 .mode = 0644,
1098 .proc_handler = proc_doulongvec_minmax
1099 },
1100 { 0 }
1101 };
1102 static ctl_table xen_table[] = {
1103 {
1104 .ctl_name = 123,
1105 .procname = "xen",
1106 .mode = 0555,
1107 .child = xen_subtable},
1108 { 0 }
1109 };
1110 static int __init xen_sysctl_init(void)
1111 {
1112 (void)register_sysctl_table(xen_table, 0);
1113 return 0;
1114 }
1115 __initcall(xen_sysctl_init);
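For completeness, a tiny user-space sketch of toggling the knob registered above through /proc/sys/xen (the path is inferred from the table's procname entries; illustrative only, and it assumes a Xen guest with this sysctl present):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/xen/independent_wallclock", "w");

	if (!f)
		return 1;                 /* not a Xen guest, or no permission */
	fputs("1\n", f);                  /* let this domain set its own clock */
	fclose(f);
	return 0;
}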