ia64/xen-unstable

view linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @ 9024:d0b7281556f2

New VCPUOP_register_runstate_memory_area hypercall. Avoids
need for a hypercall in the guest timer interrupt handler.

Cleaned up stolen/blocked tick handling in Linux.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Feb 25 21:28:27 2006 +0100 (2006-02-25)
parents c375c2109452
children 5541ea99106a
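
The sketch below is an editorial illustration of the registration pattern this
changeset introduces (it mirrors init_missing_ticks_accounting() further down;
the structure, field and hypercall names are taken from <xen/interface/vcpu.h>
as used in this file, while register_runstate_area() is a hypothetical helper
name). The guest registers a per-VCPU memory area once; Xen then keeps the
runstate information in that area up to date, so the timer interrupt handler
can read stolen/blocked time directly instead of issuing a hypercall.

    static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

    /* Register our runstate area with Xen once; afterwards the hypervisor
     * updates per_cpu(runstate, cpu) automatically on every reschedule. */
    static void register_runstate_area(int cpu)
    {
            struct vcpu_register_runstate_memory_area area;

            area.addr.v = &per_cpu(runstate, cpu);
            HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                               cpu, &area);
    }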
line source
/*
 *  linux/arch/i386/kernel/time.c
 *
 *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
 *
 * This file contains the PC-specific time handling details:
 * reading the RTC at bootup, etc..
 * 1994-07-02    Alan Modra
 *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
 * 1995-03-26    Markus Kuhn
 *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
 *      precision CMOS clock update
 * 1996-05-03    Ingo Molnar
 *      fixed time warps in do_[slow|fast]_gettimeoffset()
 * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-09-05    (Various)
 *      More robust do_fast_gettimeoffset() algorithm implemented
 *      (works with APM, Cyrix 6x86MX and Centaur C6),
 *      monotonic gettimeofday() with fast_get_timeoffset(),
 *      drift-proof precision TSC calibration on boot
 *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
 *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
 *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
 * 1998-12-16    Andrea Arcangeli
 *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 *      because was not accounting lost_ticks.
 * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
 *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *      serialize accesses to xtime/lost_ticks).
 */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/kernel_stat.h>
#include <linux/posix-timers.h>

#include <asm/io.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/msr.h>
#include <asm/delay.h>
#include <asm/mpspec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/sections.h>

#include "mach_time.h"

#include <linux/timex.h>
#include <linux/config.h>

#include <asm/hpet.h>

#include <asm/arch_hooks.h>

#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>

#if defined (__i386__)
#include <asm/i8259.h>
#endif
int pit_latch_buggy;            /* extern */

#if defined(__x86_64__)
unsigned long vxtime_hz = PIT_TICK_RATE;
struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;
#endif

unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

extern unsigned long wall_jiffies;

DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

#if defined (__i386__)
#include <asm/i8253.h>
#endif

DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);

extern struct init_timer_opts timer_tsc_init;
extern struct timer_opts timer_tsc;
#define timer_none timer_tsc
struct timer_opts *cur_timer __read_mostly = &timer_tsc;
/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        u32 tsc_to_usec_mul;
        int tsc_shift;
        u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;
static u32 shadow_tv_version;

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)
static inline void __normalize_time(time_t *sec, s64 *nsec)
{
        while (*nsec >= NSEC_PER_SEC) {
                (*nsec) -= NSEC_PER_SEC;
                (*sec)++;
        }
        while (*nsec < 0) {
                (*nsec) += NSEC_PER_SEC;
                (*sec)--;
        }
}
/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
static int __init __independent_wallclock(char *str)
{
        independent_wallclock = 1;
        return 1;
}
__setup("independent_wallclock", __independent_wallclock);

int tsc_disable __devinitdata = 0;
static void delay_tsc(unsigned long loops)
{
        unsigned long bclock, now;

        rdtscl(bclock);
        do {
                rep_nop();
                rdtscl(now);
        } while ((now - bclock) < loops);
}

struct timer_opts timer_tsc = {
        .name  = "tsc",
        .delay = delay_tsc,
};
/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if ( shift < 0 )
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#else
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#endif

        return product;
}
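
/*
 * Worked example (editorial, illustrative numbers only): Xen publishes a
 * 32.32 fixed-point multiplier, so after the shift the result is
 *   ns = (delta * mul_frac) >> 32.
 * For a 2 GHz TSC each cycle is 0.5 ns, i.e. mul_frac would be roughly
 * 0x80000000 with shift = 0, and
 *   scale_delta(2000000000, 0x80000000, 0) == 1000000000   (one second).
 */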
#if defined (__i386__)
int read_current_timer(unsigned long *timer_val)
{
        rdtscl(*timer_val);
        return 0;
}
#endif

void init_cpu_khz(void)
{
        u64 __cpu_khz = 1000000ULL << 32;
        struct vcpu_time_info *info;
        info = &HYPERVISOR_shared_info->vcpu_info[0].time;
        do_div(__cpu_khz, info->tsc_to_system_mul);
        if ( info->tsc_shift < 0 )
                cpu_khz = __cpu_khz << -info->tsc_shift;
        else
                cpu_khz = __cpu_khz >> info->tsc_shift;
}
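
/*
 * Sanity check (editorial, same illustrative numbers as the scale_delta
 * example above): with tsc_to_system_mul = 0x80000000 and tsc_shift = 0,
 * this computes cpu_khz = (1000000 << 32) / 0x80000000 = 2000000 kHz,
 * i.e. 2 GHz, matching the 0.5 ns-per-cycle multiplier.
 */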
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static unsigned long get_usec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
}
static void __update_wallclock(time_t sec, long nsec)
{
        long wtm_nsec, xtime_nsec;
        time_t wtm_sec, xtime_sec;
        u64 tmp, wc_nsec;

        /* Adjust wall-clock time base based on wall_jiffies ticks. */
        wc_nsec = processed_system_time;
        wc_nsec += sec * (u64)NSEC_PER_SEC;
        wc_nsec += nsec;
        wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;

        /* Split wallclock base into seconds and nanoseconds. */
        tmp = wc_nsec;
        xtime_nsec = do_div(tmp, 1000000000);
        xtime_sec  = (time_t)tmp;

        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

        set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

        ntp_clear();
}

static void update_wallclock(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;

        do {
                shadow_tv_version = s->wc_version;
                rmb();
                shadow_tv.tv_sec  = s->wc_sec;
                shadow_tv.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

        if (!independent_wallclock)
                __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;
        struct vcpu_time_info *src;
        struct shadow_time_info *dst;

        src = &s->vcpu_info[smp_processor_id()].time;
        dst = &per_cpu(shadow_time, smp_processor_id());

        do {
                dst->version = src->version;
                rmb();
                dst->tsc_timestamp    = src->tsc_timestamp;
                dst->system_timestamp = src->system_time;
                dst->tsc_to_nsec_mul  = src->tsc_to_system_mul;
                dst->tsc_shift        = src->tsc_shift;
                rmb();
        } while ((src->version & 1) | (dst->version ^ src->version));

        dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
}

static inline int time_values_up_to_date(int cpu)
{
        struct vcpu_time_info *src;
        struct shadow_time_info *dst;

        src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
        dst = &per_cpu(shadow_time, cpu);

        return (dst->version == src->version);
}
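
/*
 * Note on the retry loop above (editorial, seqlock-style reasoning): Xen
 * bumps src->version to an odd value before rewriting the time record and
 * to an even value afterwards.  The guest therefore re-reads while either
 * (a) the version is odd (update in progress) or (b) the version changed
 * underneath it, which is exactly the
 * (src->version & 1) | (dst->version ^ src->version) test.
 * time_values_up_to_date() later just checks whether the shadow copy still
 * matches the live version.
 */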
/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);

/* Routines for accessing the CMOS RAM/RTC. */
unsigned char rtc_cmos_read(unsigned char addr)
{
        unsigned char val;
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        val = inb_p(RTC_PORT(1));
        lock_cmos_suffix(addr);
        return val;
}
EXPORT_SYMBOL(rtc_cmos_read);

void rtc_cmos_write(unsigned char val, unsigned char addr)
{
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        outb_p(val, RTC_PORT(1));
        lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
void do_gettimeofday(struct timeval *tv)
{
        unsigned long seq;
        unsigned long usec, sec;
        unsigned long max_ntp_tick;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        u32 local_time_version;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        do {
                unsigned long lost;

                local_time_version = shadow->version;
                seq = read_seqbegin(&xtime_lock);

                usec = get_usec_offset(shadow);
                lost = jiffies - wall_jiffies;

                /*
                 * If time_adjust is negative then NTP is slowing the clock,
                 * so make sure not to go into the next possible interval.
                 * Better to lose some accuracy than have time go backwards.
                 */
                if (unlikely(time_adjust < 0)) {
                        max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
                        usec = min(usec, max_ntp_tick);

                        if (lost)
                                usec += lost * max_ntp_tick;
                }
                else if (unlikely(lost))
                        usec += lost * (USEC_PER_SEC / HZ);

                sec = xtime.tv_sec;
                usec += (xtime.tv_nsec / NSEC_PER_USEC);

                nsec = shadow->system_timestamp - processed_system_time;
                __normalize_time(&sec, &nsec);
                usec += (long)nsec / NSEC_PER_USEC;

                if (unlikely(!time_values_up_to_date(cpu))) {
                        /*
                         * We may have blocked for a long time,
                         * rendering our calculations invalid
                         * (e.g. the time delta may have
                         * overflowed). Detect that and recalculate
                         * with fresh values.
                         */
                        get_time_values_from_xen();
                        continue;
                }
        } while (read_seqretry(&xtime_lock, seq) ||
                 (local_time_version != shadow->version));

        put_cpu();

        while (usec >= USEC_PER_SEC) {
                usec -= USEC_PER_SEC;
                sec++;
        }

        tv->tv_sec = sec;
        tv->tv_usec = usec;
}

EXPORT_SYMBOL(do_gettimeofday);
int do_settimeofday(struct timespec *tv)
{
        time_t sec;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        dom0_op_t op;

        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        write_seqlock_irq(&xtime_lock);

        /*
         * If we were blocked for a long time the time delta may have
         * overflowed and the shadow time values gone stale; in that case
         * retry with freshly read values.
         */
        for ( ; ; ) {
                nsec = tv->tv_nsec - get_nsec_offset(shadow);
                if (time_values_up_to_date(cpu))
                        break;
                get_time_values_from_xen();
        }
        sec = tv->tv_sec;
        __normalize_time(&sec, &nsec);

        if ((xen_start_info->flags & SIF_INITDOMAIN) &&
            !independent_wallclock) {
                op.cmd = DOM0_SETTIME;
                op.u.settime.secs        = sec;
                op.u.settime.nsecs       = nsec;
                op.u.settime.system_time = shadow->system_timestamp;
                HYPERVISOR_dom0_op(&op);
                update_wallclock();
        } else if (independent_wallclock) {
                nsec -= shadow->system_timestamp;
                __normalize_time(&sec, &nsec);
                __update_wallclock(sec, nsec);
        }

        write_sequnlock_irq(&xtime_lock);

        put_cpu();

        clock_was_set();
        return 0;
}

EXPORT_SYMBOL(do_settimeofday);
#ifdef CONFIG_XEN_PRIVILEGED_GUEST
static int set_rtc_mmss(unsigned long nowtime)
{
        int retval;

        WARN_ON(irqs_disabled());

        if (!(xen_start_info->flags & SIF_INITDOMAIN))
                return 0;

        /* gets recalled with irq locally disabled */
        spin_lock_irq(&rtc_lock);
        if (efi_enabled)
                retval = efi_set_rtc_mmss(nowtime);
        else
                retval = mach_set_rtc_mmss(nowtime);
        spin_unlock_irq(&rtc_lock);

        return retval;
}
#else
static int set_rtc_mmss(unsigned long nowtime)
{
        return 0;
}
#endif
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 * Note: This function is required to return accurate
 * time even in the absence of multiple timer ticks.
 */
unsigned long long monotonic_clock(void)
{
        int cpu = get_cpu();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        u64 time;
        u32 local_time_version;

        do {
                local_time_version = shadow->version;
                smp_rmb();
                time = shadow->system_timestamp + get_nsec_offset(shadow);
                if (!time_values_up_to_date(cpu))
                        get_time_values_from_xen();
                smp_rmb();
        } while (local_time_version != shadow->version);

        put_cpu();

        return time;
}
EXPORT_SYMBOL(monotonic_clock);

unsigned long long sched_clock(void)
{
        return monotonic_clock();
}
#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
        unsigned long pc = instruction_pointer(regs);

#ifdef __x86_64__
        /* Assume the lock function has either no stack frame or only a single
           word.  This checks if the address on the stack looks like a kernel
           text address.  There is a small window for false hits, but in that
           case the tick is just accounted to the spinlock function.  Better
           would be to write these functions in assembler again and check
           exactly. */
        if (in_lock_functions(pc)) {
                char *v = *(char **)regs->rsp;
                if ((v >= _stext && v <= _etext) ||
                    (v >= _sinittext && v <= _einittext) ||
                    (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
                        return (unsigned long)v;
                return ((unsigned long *)regs->rsp)[1];
        }
#else
        if (in_lock_functions(pc))
                return *(unsigned long *)(regs->ebp + 4);
#endif

        return pc;
}
EXPORT_SYMBOL(profile_pc);
#endif
irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        s64 delta, delta_cpu, stolen, blocked;
        u64 sched_time;
        int i, cpu = smp_processor_id();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        write_seqlock(&xtime_lock);

        do {
                get_time_values_from_xen();

                /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu =
                        shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);

                /*
                 * Obtain a consistent snapshot of stolen/blocked cycles. We
                 * can use state_entry_time to detect if we get preempted here.
                 */
                do {
                        sched_time = runstate->state_entry_time;
                        barrier();
                        stolen = runstate->time[RUNSTATE_runnable] +
                                runstate->time[RUNSTATE_offline] -
                                per_cpu(processed_stolen_time, cpu);
                        blocked = runstate->time[RUNSTATE_blocked] -
                                per_cpu(processed_blocked_time, cpu);
                        barrier();
                } while (sched_time != runstate->state_entry_time);
        } while (!time_values_up_to_date(cpu));

        if ((unlikely(delta < -1000000LL) || unlikely(delta_cpu < 0))
            && printk_ratelimit()) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld cpu_delta=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
                       cpu, delta, delta_cpu, shadow->system_timestamp,
                       (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       per_cpu(processed_system_time, cpu));
                for (i = 0; i < num_online_cpus(); i++)
                        printk(" %d: %lld\n", i,
                               per_cpu(processed_system_time, i));
        }

        /* System-wide jiffy work. */
        while (delta >= NS_PER_TICK) {
                delta -= NS_PER_TICK;
                processed_system_time += NS_PER_TICK;
                do_timer(regs);
        }

        if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
                update_wallclock();
                clock_was_set();
        }

        write_sequnlock(&xtime_lock);

        /*
         * Account stolen ticks.
         * HACK: Passing NULL to account_steal_time()
         * ensures that the ticks are accounted as stolen.
         */
        if (stolen > 0) {
                delta_cpu -= stolen;
                do_div(stolen, NS_PER_TICK);
                per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
                per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
                account_steal_time(NULL, (cputime_t)stolen);
        }

        /*
         * Account blocked ticks.
         * HACK: Passing idle_task to account_steal_time()
         * ensures that the ticks are accounted as idle/wait.
         */
        if (blocked > 0) {
                delta_cpu -= blocked;
                do_div(blocked, NS_PER_TICK);
                per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
                per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
                account_steal_time(idle_task(cpu), (cputime_t)blocked);
        }

        /* Account user/system ticks. */
        if (delta_cpu > 0) {
                do_div(delta_cpu, NS_PER_TICK);
                per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
                if (user_mode(regs))
                        account_user_time(current, (cputime_t)delta_cpu);
                else
                        account_system_time(current, HARDIRQ_OFFSET,
                                            (cputime_t)delta_cpu);
        }

        /* Local timer processing (see update_process_times()). */
        run_local_timers();
        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_mode(regs));
        scheduler_tick();
        run_posix_cpu_timers(current);

        return IRQ_HANDLED;
}
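
/*
 * Editorial worked example of the tick accounting above (illustrative
 * numbers): suppose 3.5 ticks' worth of nanoseconds have been stolen since
 * the last interrupt.  do_div() rounds that down to 3 whole ticks, which are
 * reported via account_steal_time(); processed_stolen_time advances by only
 * 3 * NS_PER_TICK, so the remaining half tick is not lost - it stays pending
 * in the runstate delta and is accounted on a later interrupt once it has
 * accumulated to a full tick.  Blocked and user/system time are rounded the
 * same way.
 */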
static void init_missing_ticks_accounting(int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        memset(runstate, 0, sizeof(*runstate));

        area.addr.v = runstate;
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

        per_cpu(processed_blocked_time, cpu) =
                runstate->time[RUNSTATE_blocked];
        per_cpu(processed_stolen_time, cpu) =
                runstate->time[RUNSTATE_runnable] +
                runstate->time[RUNSTATE_offline];
}
/* not static: needed by APM */
unsigned long get_cmos_time(void)
{
        unsigned long retval;

        spin_lock(&rtc_lock);

        if (efi_enabled)
                retval = efi_get_time();
        else
                retval = mach_get_cmos_time();

        spin_unlock(&rtc_lock);

        return retval;
}
EXPORT_SYMBOL(get_cmos_time);
static void sync_cmos_clock(unsigned long dummy);

static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

static void sync_cmos_clock(unsigned long dummy)
{
        struct timeval now, next;
        int fail = 1;

        /*
         * If we have an externally synchronized Linux clock, then update
         * the CMOS clock accordingly every ~11 minutes. set_rtc_mmss() has
         * to be called as close as possible to 500 ms before the new second
         * starts. This code is run on a timer. If the clock is set, that
         * timer may not expire at the correct time. Thus, we adjust...
         */
        if (!ntp_synced())
                /*
                 * Not synced, exit, do not restart a timer (if one is
                 * running, let it run out).
                 */
                return;

        do_gettimeofday(&now);
        if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
            now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
                fail = set_rtc_mmss(now.tv_sec);

        next.tv_usec = USEC_AFTER - now.tv_usec;
        if (next.tv_usec <= 0)
                next.tv_usec += USEC_PER_SEC;

        if (!fail)
                next.tv_sec = 659;
        else
                next.tv_sec = 0;

        if (next.tv_usec >= USEC_PER_SEC) {
                next.tv_sec++;
                next.tv_usec -= USEC_PER_SEC;
        }
        mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}
void notify_arch_cmos_timer(void)
{
        mod_timer(&sync_cmos_timer, jiffies + 1);
}
static long clock_cmos_diff, sleep_start;

static struct timer_opts *last_timer;
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
        /*
         * Estimate time zone so that set_time can update the clock
         */
        clock_cmos_diff = -get_cmos_time();
        clock_cmos_diff += get_seconds();
        sleep_start = get_cmos_time();
        last_timer = cur_timer;
        cur_timer = &timer_none;
        if (last_timer->suspend)
                last_timer->suspend(state);
        return 0;
}

static int timer_resume(struct sys_device *dev)
{
        unsigned long flags;
        unsigned long sec;
        unsigned long sleep_length;

#ifdef CONFIG_HPET_TIMER
        if (is_hpet_enabled())
                hpet_reenable();
#endif
        sec = get_cmos_time() + clock_cmos_diff;
        sleep_length = (get_cmos_time() - sleep_start) * HZ;
        write_seqlock_irqsave(&xtime_lock, flags);
        xtime.tv_sec = sec;
        xtime.tv_nsec = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
        jiffies += sleep_length;
        wall_jiffies += sleep_length;
        if (last_timer->resume)
                last_timer->resume();
        cur_timer = last_timer;
        last_timer = NULL;
        touch_softlockup_watchdog();
        return 0;
}
static struct sysdev_class timer_sysclass = {
        .resume  = timer_resume,
        .suspend = timer_suspend,
        set_kset_name("timer"),
};

/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
        .id  = 0,
        .cls = &timer_sysclass,
};

static int time_init_device(void)
{
        int error = sysdev_class_register(&timer_sysclass);
        if (!error)
                error = sysdev_register(&device_timer);
        return error;
}

device_initcall(time_init_device);
#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
        xtime.tv_sec = get_cmos_time();
        xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
        set_normalized_timespec(&wall_to_monotonic,
                -xtime.tv_sec, -xtime.tv_nsec);

        if ((hpet_enable() >= 0) && hpet_use_timer) {
                printk("Using HPET for base-timer\n");
        }

        cur_timer = select_timer();
        printk(KERN_INFO "Using %s for high-res timesource\n", cur_timer->name);

        time_init_hook();
}
#endif
/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);
static void setup_cpu0_timer_irq(void)
{
        per_cpu(timer_irq, 0) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        0,
                        timer_interrupt,
                        SA_INTERRUPT,
                        "timer0",
                        NULL);
        BUG_ON(per_cpu(timer_irq, 0) < 0);
}
void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
        if (is_hpet_capable()) {
                /*
                 * HPET initialization needs to do memory-mapped io. So, let
                 * us do a late initialization after mem_init().
                 */
                late_time_init = hpet_time_init;
                return;
        }
#endif
        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();

        init_cpu_khz();
        printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
               cpu_khz / 1000, cpu_khz % 1000);

#if defined(__x86_64__)
        vxtime.mode = VXTIME_TSC;
        vxtime.quot = (1000000L << 32) / vxtime_hz;
        vxtime.tsc_quot = (1000L << 32) / cpu_khz;
        sync_core();
        rdtscll(vxtime.last_tsc);
#endif

        /* Cannot request_irq() until kmem is initialised. */
        late_time_init = setup_cpu0_timer_irq;
}
/* Convert jiffies to system time. */
static inline u64 jiffies_to_st(unsigned long j)
{
        unsigned long seq;
        long delta;
        u64 st;

        do {
                seq = read_seqbegin(&xtime_lock);
                delta = j - jiffies;
                /* NB. The next check can trigger in some wrap-around cases,
                 * but that's ok: we'll just end up with a shorter timeout. */
                if (delta < 1)
                        delta = 1;
                st = processed_system_time + (delta * (u64)NS_PER_TICK);
        } while (read_seqretry(&xtime_lock, seq));

        return st;
}
/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
void stop_hz_timer(void)
{
        unsigned int cpu = smp_processor_id();
        unsigned long j;

        /* We must do this /before/ checking rcu_pending(). */
        cpu_set(cpu, nohz_cpu_mask);
        smp_mb();

        /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
        if (rcu_pending(cpu) || local_softirq_pending()) {
                cpu_clear(cpu, nohz_cpu_mask);
                j = jiffies + 1;
        } else {
                j = next_timer_interrupt();
        }

        BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
}

void start_hz_timer(void)
{
        cpu_clear(smp_processor_id(), nohz_cpu_mask);
}
/* No locking required. We are the only CPU running, and interrupts are off. */
void time_resume(void)
{
        init_cpu_khz();

        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();
}
#ifdef CONFIG_SMP
static char timer_name[NR_CPUS][15];

void local_setup_timer(unsigned int cpu)
{
        int seq;

        BUG_ON(cpu == 0);

        do {
                seq = read_seqbegin(&xtime_lock);
                /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
                per_cpu(processed_system_time, cpu) =
                        per_cpu(shadow_time, 0).system_timestamp;
                init_missing_ticks_accounting(cpu);
        } while (read_seqretry(&xtime_lock, seq));

        sprintf(timer_name[cpu], "timer%d", cpu);
        per_cpu(timer_irq, cpu) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        cpu,
                        timer_interrupt,
                        SA_INTERRUPT,
                        timer_name[cpu],
                        NULL);
        BUG_ON(per_cpu(timer_irq, cpu) < 0);
}

void local_teardown_timer(unsigned int cpu)
{
        BUG_ON(cpu == 0);
        unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
}
#endif
/*
 * /proc/sys/xen: This really belongs in another file. It can stay here for
 * now however.
 */
static ctl_table xen_subtable[] = {
        {1, "independent_wallclock", &independent_wallclock,
         sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
        {0}
};
static ctl_table xen_table[] = {
        {123, "xen", NULL, 0, 0555, xen_subtable},
        {0}
};
static int __init xen_sysctl_init(void)
{
        (void)register_sysctl_table(xen_table, 0);
        return 0;
}
__initcall(xen_sysctl_init);
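
/*
 * Editorial usage note (paths inferred from the sysctl tables above): with
 * this table registered, the wallclock can be decoupled from Xen either at
 * boot with the "independent_wallclock" kernel parameter (see the __setup()
 * handler earlier in this file) or at runtime with, e.g.:
 *
 *     echo 1 > /proc/sys/xen/independent_wallclock
 */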
/*
 * Local variables:
 *  c-file-style: "linux"
 *  indent-tabs-mode: t
 *  c-indent-level: 8
 *  c-basic-offset: 8
 *  tab-width: 8
 * End:
 */