ia64/xen-unstable

view xen/arch/x86/time.c @ 18389:2eefc8294358

x86: Signal softirq-context calibration with an actual first-class
softirq handle rather than kludging it with set_timer().

Should be faster and is definitely clearer. Also avoids us using
set_timer() in IRQ context (which is currently broken but soon won't
be).

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 27 10:12:49 2008 +0100 (2008-08-27)
parents 95f1dc27e182
children a37902923a67
line source
1 /******************************************************************************
2 * arch/x86/time.c
3 *
4 * Per-CPU time calibration and management.
5 *
6 * Copyright (c) 2002-2005, K A Fraser
7 *
8 * Portions from Linux are:
9 * Copyright (c) 1991, 1992, 1995 Linus Torvalds
10 */
12 #include <xen/config.h>
13 #include <xen/errno.h>
14 #include <xen/event.h>
15 #include <xen/sched.h>
16 #include <xen/lib.h>
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/time.h>
20 #include <xen/timer.h>
21 #include <xen/smp.h>
22 #include <xen/irq.h>
23 #include <xen/softirq.h>
24 #include <asm/io.h>
25 #include <asm/msr.h>
26 #include <asm/mpspec.h>
27 #include <asm/processor.h>
28 #include <asm/fixmap.h>
29 #include <asm/mc146818rtc.h>
30 #include <asm/div64.h>
31 #include <asm/hpet.h>
32 #include <io_ports.h>
34 /* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
35 static char opt_clocksource[10];
36 string_param("clocksource", opt_clocksource);
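/* For example, booting Xen with "clocksource=hpet" forces the HPET to be
 * chosen as the platform timer (see init_platform_timer() below). */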
38 unsigned long cpu_khz; /* CPU clock frequency in kHz. */
39 DEFINE_SPINLOCK(rtc_lock);
40 unsigned long pit0_ticks;
41 static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
42 static DEFINE_SPINLOCK(wc_lock);
44 struct time_scale {
45 int shift;
46 u32 mul_frac;
47 };
49 struct cpu_time {
50 u64 local_tsc_stamp;
51 u64 cstate_tsc_stamp;
52 s_time_t stime_local_stamp;
53 s_time_t stime_master_stamp;
54 struct time_scale tsc_scale;
55 u64 cstate_plt_count_stamp;
56 };
58 struct platform_timesource {
59 char *name;
60 u64 frequency;
61 u64 (*read_counter)(void);
62 int counter_bits;
63 };
65 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
67 /* Calibrate all CPUs to platform timer every EPOCH. */
68 #define EPOCH MILLISECS(1000)
69 static struct timer calibration_timer;
71 /* TSC is invariant on C state entry? */
72 static bool_t tsc_invariant;
74 /*
75 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
76 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
77 * softirq handling will happen in time.
78 *
79 * The pit_lock protects the 16- and 32-bit stamp fields as well as the hardware reads of PIT channel 2 that update them.
80 */
81 static DEFINE_SPINLOCK(pit_lock);
82 static u16 pit_stamp16;
83 static u32 pit_stamp32;
84 static int using_pit;
86 /*
87 * 32-bit division of integer dividend and integer divisor yielding
88 * 32-bit fractional quotient.
89 */
90 static inline u32 div_frac(u32 dividend, u32 divisor)
91 {
92 u32 quotient, remainder;
93 ASSERT(dividend < divisor);
94 asm (
95 "divl %4"
96 : "=a" (quotient), "=d" (remainder)
97 : "0" (0), "1" (dividend), "r" (divisor) );
98 return quotient;
99 }
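/* Worked example: div_frac(1, 2) computes (1 << 32) / 2 = 0x80000000, the
 * 0.32 fixed-point encoding of 0.5. */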
101 /*
102 * 32-bit multiplication of multiplicand and fractional multiplier
103 * yielding 32-bit product (radix point at same position as in multiplicand).
104 */
105 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
106 {
107 u32 product_int, product_frac;
108 asm (
109 "mul %3"
110 : "=a" (product_frac), "=d" (product_int)
111 : "0" (multiplicand), "r" (multiplier) );
112 return product_int;
113 }
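/* Worked example: mul_frac(1000, 0x80000000) = (1000 * 2^31) >> 32 = 500,
 * i.e. multiplication by the fixed-point fraction 0.5. */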
115 /*
116 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
117 * yielding a 64-bit result.
118 */
119 static inline u64 scale_delta(u64 delta, struct time_scale *scale)
120 {
121 u64 product;
122 #ifdef CONFIG_X86_32
123 u32 tmp1, tmp2;
124 #endif
126 if ( scale->shift < 0 )
127 delta >>= -scale->shift;
128 else
129 delta <<= scale->shift;
131 #ifdef CONFIG_X86_32
132 asm (
133 "mul %5 ; "
134 "mov %4,%%eax ; "
135 "mov %%edx,%4 ; "
136 "mul %5 ; "
137 "xor %5,%5 ; "
138 "add %4,%%eax ; "
139 "adc %5,%%edx ; "
140 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
141 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (scale->mul_frac) );
142 #else
143 asm (
144 "mul %%rdx ; shrd $32,%%rdx,%%rax"
145 : "=a" (product) : "0" (delta), "d" ((u64)scale->mul_frac) );
146 #endif
148 return product;
149 }
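/* Worked example: with shift = 0 and mul_frac = 0x80000000 (i.e. 0.5), a
 * delta of 2000 ticks scales to 1000 ns. */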
151 /*
152 * cpu_mask that denotes the CPUs that need timer interrupts delivered as
153 * IPIs in place of local APIC timers
154 */
155 extern int xen_cpuidle;
156 static cpumask_t pit_broadcast_mask;
158 static void smp_send_timer_broadcast_ipi(void)
159 {
160 int cpu = smp_processor_id();
161 cpumask_t mask;
163 cpus_and(mask, cpu_online_map, pit_broadcast_mask);
165 if ( cpu_isset(cpu, mask) )
166 {
167 cpu_clear(cpu, mask);
168 raise_softirq(TIMER_SOFTIRQ);
169 }
171 if ( !cpus_empty(mask) )
172 {
173 cpumask_raise_softirq(mask, TIMER_SOFTIRQ);
174 }
175 }
177 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
178 {
179 ASSERT(local_irq_is_enabled());
181 if ( hpet_legacy_irq_tick() )
182 return;
184 /* Only for start-of-day interrupt tests in io_apic.c. */
185 (*(volatile unsigned long *)&pit0_ticks)++;
187 /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
188 if ( !cpu_has_apic )
189 raise_softirq(TIMER_SOFTIRQ);
191 if ( xen_cpuidle )
192 smp_send_timer_broadcast_ipi();
194 /* Emulate a 32-bit PIT counter. */
195 if ( using_pit )
196 {
197 u16 count;
199 spin_lock_irq(&pit_lock);
201 outb(0x80, PIT_MODE);
202 count = inb(PIT_CH2);
203 count |= inb(PIT_CH2) << 8;
205 pit_stamp32 += (u16)(pit_stamp16 - count);
206 pit_stamp16 = count;
208 spin_unlock_irq(&pit_lock);
209 }
210 }
212 static struct irqaction irq0 = { timer_interrupt, "timer", NULL };
214 /* ------ Calibrate the TSC -------
215 * Return processor ticks per second, measured over a 1/CALIBRATE_FRAC second window.
216 */
218 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
219 #define CALIBRATE_FRAC 20 /* calibrate over 50ms */
220 #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC)
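/* CALIBRATE_LATCH = (1193182 + 10) / 20 = 59659 PIT ticks, the count that
 * expires after 1/20th of a second, i.e. the 50ms calibration window. */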
222 static u64 init_pit_and_calibrate_tsc(void)
223 {
224 u64 start, end;
225 unsigned long count;
227 /* Set PIT channel 0 to HZ Hz. */
228 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
229 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
230 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
231 outb(LATCH >> 8, PIT_CH0); /* MSB */
233 /* Set the Gate high, disable speaker */
234 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
236 /*
237 * Now let's take care of CTC channel 2
238 *
239 * Set the Gate high, program CTC channel 2 for mode 0, (interrupt on
240 * terminal count mode), binary count, load CALIBRATE_LATCH count (LSB and MSB)
241 * to begin countdown.
242 */
243 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
244 outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
245 outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
247 rdtscll(start);
248 for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
249 continue;
250 rdtscll(end);
252 /* Error if the CTC doesn't behave itself. */
253 if ( count == 0 )
254 return 0;
256 return ((end - start) * (u64)CALIBRATE_FRAC);
257 }
259 static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
260 {
261 u64 tps64 = ticks_per_sec;
262 u32 tps32;
263 int shift = 0;
265 ASSERT(tps64 != 0);
267 while ( tps64 > (MILLISECS(1000)*2) )
268 {
269 tps64 >>= 1;
270 shift--;
271 }
273 tps32 = (u32)tps64;
274 while ( tps32 <= (u32)MILLISECS(1000) )
275 {
276 tps32 <<= 1;
277 shift++;
278 }
280 ts->mul_frac = div_frac(MILLISECS(1000), tps32);
281 ts->shift = shift;
282 }
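/* Worked example: ticks_per_sec = 2,000,000,000 (a 2GHz TSC) needs no shift,
 * giving shift = 0 and mul_frac = div_frac(1e9, 2e9) = 0x80000000, so
 * scale_delta() converts TSC ticks to nanoseconds by halving them. */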
284 static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0);
285 static unsigned int tsc_calibrate_status = 0;
287 void calibrate_tsc_bp(void)
288 {
289 while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) )
290 mb();
292 outb(CALIBRATE_LATCH & 0xff, PIT_CH2);
293 outb(CALIBRATE_LATCH >> 8, PIT_CH2);
295 tsc_calibrate_status = 1;
296 wmb();
298 while ( (inb(0x61) & 0x20) == 0 )
299 continue;
301 tsc_calibrate_status = 2;
302 wmb();
304 while ( atomic_read(&tsc_calibrate_gang) != 0 )
305 mb();
306 }
308 void calibrate_tsc_ap(void)
309 {
310 u64 t1, t2, ticks_per_sec;
312 atomic_inc(&tsc_calibrate_gang);
314 while ( tsc_calibrate_status < 1 )
315 mb();
317 rdtscll(t1);
319 while ( tsc_calibrate_status < 2 )
320 mb();
322 rdtscll(t2);
324 ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC;
325 set_time_scale(&this_cpu(cpu_time).tsc_scale, ticks_per_sec);
327 atomic_dec(&tsc_calibrate_gang);
328 }
330 static char *freq_string(u64 freq)
331 {
332 static char s[20];
333 unsigned int x, y;
334 y = (unsigned int)do_div(freq, 1000000) / 1000;
335 x = (unsigned int)freq;
336 snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
337 return s;
338 }
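/* Example: freq_string(1193182) returns "1.193MHz". */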
340 /************************************************************
341 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
342 */
344 static u64 read_pit_count(void)
345 {
346 u16 count16;
347 u32 count32;
348 unsigned long flags;
350 spin_lock_irqsave(&pit_lock, flags);
352 outb(0x80, PIT_MODE);
353 count16 = inb(PIT_CH2);
354 count16 |= inb(PIT_CH2) << 8;
356 count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
358 spin_unlock_irqrestore(&pit_lock, flags);
360 return count32;
361 }
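/* The (u16) subtraction handles wrap of the 16-bit down-counter: it yields
 * the PIT ticks elapsed since pit_stamp16 was taken, modulo 2^16, which is
 * exact provided the stamps are refreshed more often than once per 65536
 * ticks (roughly every 55ms at 1193182Hz). */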
363 static void init_pit(struct platform_timesource *pts)
364 {
365 pts->name = "PIT";
366 pts->frequency = CLOCK_TICK_RATE;
367 pts->read_counter = read_pit_count;
368 pts->counter_bits = 32;
369 using_pit = 1;
370 }
372 /************************************************************
373 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
374 */
376 static u64 read_hpet_count(void)
377 {
378 return hpet_read32(HPET_COUNTER);
379 }
381 static int init_hpet(struct platform_timesource *pts)
382 {
383 u64 hpet_rate = hpet_setup();
385 if ( hpet_rate == 0 )
386 return 0;
388 pts->name = "HPET";
389 pts->frequency = hpet_rate;
390 pts->read_counter = read_hpet_count;
391 pts->counter_bits = 32;
393 return 1;
394 }
396 /************************************************************
397 * PLATFORM TIMER 3: IBM 'CYCLONE' TIMER
398 */
400 int use_cyclone;
402 /*
403 * Although the counter is read via a 64-bit register, I believe it is actually
404 * a 40-bit counter. Since this will wrap, I read only the low 32 bits and
405 * periodically fold into a 64-bit software counter, just as for PIT and HPET.
406 */
407 #define CYCLONE_CBAR_ADDR 0xFEB00CD0
408 #define CYCLONE_PMCC_OFFSET 0x51A0
409 #define CYCLONE_MPMC_OFFSET 0x51D0
410 #define CYCLONE_MPCS_OFFSET 0x51A8
411 #define CYCLONE_TIMER_FREQ 100000000
413 /* Cyclone MPMC0 register. */
414 static volatile u32 *cyclone_timer;
416 static u64 read_cyclone_count(void)
417 {
418 return *cyclone_timer;
419 }
421 static volatile u32 *map_cyclone_reg(unsigned long regaddr)
422 {
423 unsigned long pageaddr = regaddr & PAGE_MASK;
424 unsigned long offset = regaddr & ~PAGE_MASK;
425 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
426 return (volatile u32 *)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
427 }
429 static int init_cyclone(struct platform_timesource *pts)
430 {
431 u32 base;
433 if ( !use_cyclone )
434 return 0;
436 /* Find base address. */
437 base = *(map_cyclone_reg(CYCLONE_CBAR_ADDR));
438 if ( base == 0 )
439 {
440 printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n");
441 return 0;
442 }
444 /* Enable timer and map the counter register. */
445 *(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1;
446 *(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1;
447 cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET);
449 pts->name = "IBM Cyclone";
450 pts->frequency = CYCLONE_TIMER_FREQ;
451 pts->read_counter = read_cyclone_count;
452 pts->counter_bits = 32;
454 return 1;
455 }
457 /************************************************************
458 * PLATFORM TIMER 4: ACPI PM TIMER
459 */
461 u32 pmtmr_ioport;
463 /* ACPI PM timer ticks at 3.579545 MHz. */
464 #define ACPI_PM_FREQUENCY 3579545
466 static u64 read_pmtimer_count(void)
467 {
468 return inl(pmtmr_ioport);
469 }
471 static int init_pmtimer(struct platform_timesource *pts)
472 {
473 if ( pmtmr_ioport == 0 )
474 return 0;
476 pts->name = "ACPI PM Timer";
477 pts->frequency = ACPI_PM_FREQUENCY;
478 pts->read_counter = read_pmtimer_count;
479 pts->counter_bits = 24;
481 return 1;
482 }
484 /************************************************************
485 * GENERIC PLATFORM TIMER INFRASTRUCTURE
486 */
488 static struct platform_timesource plt_src; /* details of chosen timesource */
489 static u64 plt_mask; /* hardware-width mask */
490 static u64 plt_overflow_period; /* ns between calls to plt_overflow() */
491 static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */
493 /* Protected by platform_timer_lock. */
494 static DEFINE_SPINLOCK(platform_timer_lock);
495 static s_time_t stime_platform_stamp; /* System time at below platform time */
496 static u64 platform_timer_stamp; /* Platform time at above system time */
497 static u64 plt_stamp64; /* 64-bit platform counter stamp */
498 static u64 plt_stamp; /* hardware-width platform counter stamp */
499 static struct timer plt_overflow_timer;
501 static void plt_overflow(void *unused)
502 {
503 u64 count;
505 spin_lock_irq(&platform_timer_lock);
506 count = plt_src.read_counter();
507 plt_stamp64 += (count - plt_stamp) & plt_mask;
508 plt_stamp = count;
509 spin_unlock_irq(&platform_timer_lock);
511 set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
512 }
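/* (count - plt_stamp) & plt_mask is the number of counter ticks elapsed,
 * modulo the hardware counter width, so folding into the 64-bit plt_stamp64
 * stays exact as long as plt_overflow() runs at least once per half wrap
 * period (plt_overflow_period, computed in init_platform_timer()). */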
514 static s_time_t __read_platform_stime(u64 platform_time)
515 {
516 u64 diff = platform_time - platform_timer_stamp;
517 ASSERT(spin_is_locked(&platform_timer_lock));
518 return (stime_platform_stamp + scale_delta(diff, &plt_scale));
519 }
521 static s_time_t read_platform_stime(void)
522 {
523 u64 count;
524 s_time_t stime;
526 ASSERT(!local_irq_is_enabled());
528 spin_lock(&platform_timer_lock);
529 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
530 stime = __read_platform_stime(count);
531 spin_unlock(&platform_timer_lock);
533 return stime;
534 }
536 static void platform_time_calibration(void)
537 {
538 u64 count;
539 s_time_t stamp;
541 spin_lock_irq(&platform_timer_lock);
542 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
543 stamp = __read_platform_stime(count);
544 stime_platform_stamp = stamp;
545 platform_timer_stamp = count;
546 spin_unlock_irq(&platform_timer_lock);
547 }
549 static void resume_platform_timer(void)
550 {
551 /* No change in platform_stime across suspend/resume. */
552 platform_timer_stamp = plt_stamp64;
553 plt_stamp = plt_src.read_counter();
554 }
556 static void init_platform_timer(void)
557 {
558 struct platform_timesource *pts = &plt_src;
559 int rc = -1;
561 if ( opt_clocksource[0] != '\0' )
562 {
563 if ( !strcmp(opt_clocksource, "pit") )
564 rc = (init_pit(pts), 1);
565 else if ( !strcmp(opt_clocksource, "hpet") )
566 rc = init_hpet(pts);
567 else if ( !strcmp(opt_clocksource, "cyclone") )
568 rc = init_cyclone(pts);
569 else if ( !strcmp(opt_clocksource, "acpi") )
570 rc = init_pmtimer(pts);
572 if ( rc <= 0 )
573 printk("WARNING: %s clocksource '%s'.\n",
574 (rc == 0) ? "Could not initialise" : "Unrecognised",
575 opt_clocksource);
576 }
578 if ( (rc <= 0) &&
579 !init_cyclone(pts) &&
580 !init_hpet(pts) &&
581 !init_pmtimer(pts) )
582 init_pit(pts);
584 plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
586 set_time_scale(&plt_scale, pts->frequency);
588 plt_overflow_period = scale_delta(
589 1ull << (pts->counter_bits-1), &plt_scale);
590 init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
591 plt_overflow(NULL);
593 platform_timer_stamp = plt_stamp64;
595 printk("Platform timer is %s %s\n",
596 freq_string(pts->frequency), pts->name);
597 }
599 void cstate_save_tsc(void)
600 {
601 struct cpu_time *t = &this_cpu(cpu_time);
603 if ( tsc_invariant )
604 return;
606 t->cstate_plt_count_stamp = plt_src.read_counter();
607 rdtscll(t->cstate_tsc_stamp);
608 }
610 void cstate_restore_tsc(void)
611 {
612 struct cpu_time *t = &this_cpu(cpu_time);
613 u64 plt_count_delta, tsc_delta;
615 if ( tsc_invariant )
616 return;
618 plt_count_delta = (plt_src.read_counter() -
619 t->cstate_plt_count_stamp) & plt_mask;
620 tsc_delta = scale_delta(plt_count_delta, &plt_scale) * cpu_khz/1000000UL;
621 wrmsrl(MSR_IA32_TSC, t->cstate_tsc_stamp + tsc_delta);
622 }
624 /***************************************************************************
625 * CMOS Timer functions
626 ***************************************************************************/
628 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
629 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
630 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
631 *
632 * [For the Julian calendar (which was used in Russia before 1917,
633 * Britain & colonies before 1752, anywhere else before 1582,
634 * and is still in use by some communities) leave out the
635 * -year/100+year/400 terms, and add 10.]
636 *
637 * This algorithm was first published by Gauss (I think).
638 *
639 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
640 * machines where long is 32-bit! (However, as time_t is signed, we
641 * will already get problems at other places on 2038-01-19 03:14:08)
642 */
643 unsigned long
644 mktime (unsigned int year, unsigned int mon,
645 unsigned int day, unsigned int hour,
646 unsigned int min, unsigned int sec)
647 {
648 /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
649 if ( 0 >= (int) (mon -= 2) )
650 {
651 mon += 12;
652 year -= 1;
653 }
655 return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
656 year*365 - 719499
657 )*24 + hour /* now have hours */
658 )*60 + min /* now have minutes */
659 )*60 + sec; /* finally seconds */
660 }
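/* Worked example: mktime(1970, 1, 1, 0, 0, 0): mon becomes 11, year 1969,
 * and (492 - 19 + 4 + 336 + 1 + 718685 - 719499) * 86400 = 0, the Unix epoch. */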
662 static unsigned long __get_cmos_time(void)
663 {
664 unsigned int year, mon, day, hour, min, sec;
666 sec = CMOS_READ(RTC_SECONDS);
667 min = CMOS_READ(RTC_MINUTES);
668 hour = CMOS_READ(RTC_HOURS);
669 day = CMOS_READ(RTC_DAY_OF_MONTH);
670 mon = CMOS_READ(RTC_MONTH);
671 year = CMOS_READ(RTC_YEAR);
673 if ( !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD )
674 {
675 BCD_TO_BIN(sec);
676 BCD_TO_BIN(min);
677 BCD_TO_BIN(hour);
678 BCD_TO_BIN(day);
679 BCD_TO_BIN(mon);
680 BCD_TO_BIN(year);
681 }
683 if ( (year += 1900) < 1970 )
684 year += 100;
686 return mktime(year, mon, day, hour, min, sec);
687 }
689 static unsigned long get_cmos_time(void)
690 {
691 unsigned long res, flags;
692 int i;
694 spin_lock_irqsave(&rtc_lock, flags);
696 /* read RTC exactly on falling edge of update flag */
697 for ( i = 0 ; i < 1000000 ; i++ ) /* may take up to 1 second... */
698 if ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
699 break;
700 for ( i = 0 ; i < 1000000 ; i++ ) /* must try at least 2.228 ms */
701 if ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
702 break;
704 res = __get_cmos_time();
706 spin_unlock_irqrestore(&rtc_lock, flags);
707 return res;
708 }
710 /***************************************************************************
711 * System Time
712 ***************************************************************************/
714 s_time_t get_s_time(void)
715 {
716 struct cpu_time *t = &this_cpu(cpu_time);
717 u64 tsc, delta;
718 s_time_t now;
720 rdtscll(tsc);
721 delta = tsc - t->local_tsc_stamp;
722 now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
724 return now;
725 }
727 static inline void version_update_begin(u32 *version)
728 {
729 /* Explicitly OR with 1 just in case version number gets out of sync. */
730 *version = (*version + 1) | 1;
731 wmb();
732 }
734 static inline void version_update_end(u32 *version)
735 {
736 wmb();
737 (*version)++;
738 }
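/* The version field behaves like a seqlock: it is odd while an update is in
 * progress and even when the record is stable. A consumer of vcpu_time_info
 * can read the version, copy the fields, re-read the version, and retry if
 * the two values differ or are odd. */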
740 void update_vcpu_system_time(struct vcpu *v)
741 {
742 struct cpu_time *t;
743 struct vcpu_time_info *u;
745 if ( v->vcpu_info == NULL )
746 return;
748 t = &this_cpu(cpu_time);
749 u = &vcpu_info(v, time);
751 if ( u->tsc_timestamp == t->local_tsc_stamp )
752 return;
754 version_update_begin(&u->version);
756 u->tsc_timestamp = t->local_tsc_stamp;
757 u->system_time = t->stime_local_stamp;
758 u->tsc_to_system_mul = t->tsc_scale.mul_frac;
759 u->tsc_shift = (s8)t->tsc_scale.shift;
761 version_update_end(&u->version);
762 }
764 void update_domain_wallclock_time(struct domain *d)
765 {
766 spin_lock(&wc_lock);
767 version_update_begin(&shared_info(d, wc_version));
768 shared_info(d, wc_sec) = wc_sec + d->time_offset_seconds;
769 shared_info(d, wc_nsec) = wc_nsec;
770 version_update_end(&shared_info(d, wc_version));
771 spin_unlock(&wc_lock);
772 }
774 void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds)
775 {
776 d->time_offset_seconds = time_offset_seconds;
777 if ( is_hvm_domain(d) )
778 rtc_update_clock(d);
779 }
781 int cpu_frequency_change(u64 freq)
782 {
783 struct cpu_time *t = &this_cpu(cpu_time);
784 u64 curr_tsc;
786 /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
787 if ( freq < 1000000u )
788 {
789 gdprintk(XENLOG_WARNING, "Rejecting CPU frequency change "
790 "to %"PRIu64" Hz.\n", freq);
791 return -EINVAL;
792 }
794 local_irq_disable();
795 /* Platform time /first/, as we may be delayed by platform_timer_lock. */
796 t->stime_master_stamp = read_platform_stime();
797 /* TSC-extrapolated time may be bogus after frequency change. */
798 /*t->stime_local_stamp = get_s_time();*/
799 t->stime_local_stamp = t->stime_master_stamp;
800 rdtscll(curr_tsc);
801 t->local_tsc_stamp = curr_tsc;
802 set_time_scale(&t->tsc_scale, freq);
803 local_irq_enable();
805 update_vcpu_system_time(current);
807 /* A full epoch should pass before we check for deviation. */
808 if ( smp_processor_id() == 0 )
809 {
810 set_timer(&calibration_timer, NOW() + EPOCH);
811 platform_time_calibration();
812 }
814 return 0;
815 }
817 /* Set clock to <secs,nsecs> after 00:00:00 UTC, 1 January, 1970. */
818 void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
819 {
820 u64 x;
821 u32 y, _wc_sec, _wc_nsec;
822 struct domain *d;
824 x = (secs * 1000000000ULL) + (u64)nsecs - system_time_base;
825 y = do_div(x, 1000000000);
827 spin_lock(&wc_lock);
828 wc_sec = _wc_sec = (u32)x;
829 wc_nsec = _wc_nsec = (u32)y;
830 spin_unlock(&wc_lock);
832 rcu_read_lock(&domlist_read_lock);
833 for_each_domain ( d )
834 update_domain_wallclock_time(d);
835 rcu_read_unlock(&domlist_read_lock);
836 }
838 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
839 struct cpu_calibration {
840 u64 local_tsc_stamp;
841 s_time_t stime_local_stamp;
842 s_time_t stime_master_stamp;
843 };
844 static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration);
846 /* Softirq handler for per-CPU time calibration. */
847 static void local_time_calibration(void)
848 {
849 struct cpu_time *t = &this_cpu(cpu_time);
850 struct cpu_calibration *c = &this_cpu(cpu_calibration);
852 /*
853 * System timestamps, extrapolated from local and master oscillators,
854 * taken during this calibration and the previous calibration.
855 */
856 s_time_t prev_local_stime, curr_local_stime;
857 s_time_t prev_master_stime, curr_master_stime;
859 /* TSC timestamps taken during this calibration and prev calibration. */
860 u64 prev_tsc, curr_tsc;
862 /*
863 * System time and TSC ticks elapsed during the previous calibration
864 * 'epoch'. These values are down-shifted to fit in 32 bits.
865 */
866 u64 stime_elapsed64, tsc_elapsed64;
867 u32 stime_elapsed32, tsc_elapsed32;
869 /* The accumulated error in the local estimate. */
870 u64 local_stime_err;
872 /* Error correction to slow down a fast local clock. */
873 u32 error_factor = 0;
875 /* Calculated TSC shift to ensure 32-bit scale multiplier. */
876 int tsc_shift = 0;
878 /* The overall calibration scale multiplier. */
879 u32 calibration_mul_frac;
881 prev_tsc = t->local_tsc_stamp;
882 prev_local_stime = t->stime_local_stamp;
883 prev_master_stime = t->stime_master_stamp;
885 /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
886 local_irq_disable();
887 curr_tsc = c->local_tsc_stamp;
888 curr_local_stime = c->stime_local_stamp;
889 curr_master_stime = c->stime_master_stamp;
890 local_irq_enable();
892 #if 0
893 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
894 smp_processor_id(), prev_tsc, prev_local_stime, prev_master_stime);
895 printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
896 " -> %"PRId64"\n",
897 smp_processor_id(), curr_tsc, curr_local_stime, curr_master_stime,
898 curr_master_stime - curr_local_stime);
899 #endif
901 /* Local time warps forward if it lags behind master time. */
902 if ( curr_local_stime < curr_master_stime )
903 curr_local_stime = curr_master_stime;
905 stime_elapsed64 = curr_master_stime - prev_master_stime;
906 tsc_elapsed64 = curr_tsc - prev_tsc;
908 /*
909 * Weirdness can happen if we lose sync with the platform timer.
910 * We could be smarter here: resync platform timer with local timer?
911 */
912 if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
913 goto out;
915 /*
916 * Calculate error-correction factor. This only slows down a fast local
917 * clock (slow clocks are warped forwards). The scale factor is clamped
918 * to >= 0.5.
919 */
920 if ( curr_local_stime != curr_master_stime )
921 {
922 local_stime_err = curr_local_stime - curr_master_stime;
923 if ( local_stime_err > EPOCH )
924 local_stime_err = EPOCH;
925 error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
926 }
928 /*
929 * We require 0 < stime_elapsed < 2^31.
930 * This allows us to binary shift a 32-bit tsc_elapsed such that:
931 * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
932 */
933 while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
934 ((s32)stime_elapsed64 < 0) )
935 {
936 stime_elapsed64 >>= 1;
937 tsc_elapsed64 >>= 1;
938 }
940 /* stime_elapsed now fits in a 32-bit word. */
941 stime_elapsed32 = (u32)stime_elapsed64;
943 /* tsc_elapsed <= 2*stime_elapsed */
944 while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
945 {
946 tsc_elapsed64 >>= 1;
947 tsc_shift--;
948 }
950 /* Local difference must now fit in 32 bits. */
951 ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
952 tsc_elapsed32 = (u32)tsc_elapsed64;
954 /* tsc_elapsed > stime_elapsed */
955 ASSERT(tsc_elapsed32 != 0);
956 while ( tsc_elapsed32 <= stime_elapsed32 )
957 {
958 tsc_elapsed32 <<= 1;
959 tsc_shift++;
960 }
962 calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
963 if ( error_factor != 0 )
964 calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
966 #if 0
967 printk("---%d: %08x %08x %d\n", smp_processor_id(),
968 error_factor, calibration_mul_frac, tsc_shift);
969 #endif
971 /* Record new timestamp information, atomically w.r.t. interrupts. */
972 local_irq_disable();
973 t->tsc_scale.mul_frac = calibration_mul_frac;
974 t->tsc_scale.shift = tsc_shift;
975 t->local_tsc_stamp = curr_tsc;
976 t->stime_local_stamp = curr_local_stime;
977 t->stime_master_stamp = curr_master_stime;
978 local_irq_enable();
980 update_vcpu_system_time(current);
982 out:
983 if ( smp_processor_id() == 0 )
984 {
985 set_timer(&calibration_timer, NOW() + EPOCH);
986 platform_time_calibration();
987 }
988 }
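/* Worked example of the error correction above: if the local clock has
 * drifted 1ms ahead of master time over a 1s epoch, error_factor =
 * div_frac(1000000000, 1001000000), roughly 0.999 as a 0.32 fraction, so the
 * new tsc_scale multiplier is cut by about 0.1% and master time catches up
 * over the next epoch. */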
990 /*
991 * Rendezvous for all CPUs in IRQ context.
992 * Master CPU snapshots the platform timer.
993 * All CPUs snapshot their local TSC and their extrapolated system time.
994 */
995 struct calibration_rendezvous {
996 atomic_t nr_cpus;
997 s_time_t master_stime;
998 };
1000 static void time_calibration_rendezvous(void *_r)
1001 {
1002 unsigned int total_cpus = num_online_cpus();
1003 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1004 struct calibration_rendezvous *r = _r;
1006 if ( smp_processor_id() == 0 )
1007 {
1008 while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
1009 cpu_relax();
1010 r->master_stime = read_platform_stime();
1011 atomic_inc(&r->nr_cpus);
1012 }
1013 else
1014 {
1015 atomic_inc(&r->nr_cpus);
1016 while ( atomic_read(&r->nr_cpus) != total_cpus )
1017 cpu_relax();
1018 }
1020 rdtscll(c->local_tsc_stamp);
1021 c->stime_local_stamp = get_s_time();
1022 c->stime_master_stamp = r->master_stime;
1024 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1025 }
1027 static void time_calibration(void *unused)
1028 {
1029 struct calibration_rendezvous r = {
1030 .nr_cpus = ATOMIC_INIT(0)
1031 };
1033 on_each_cpu(time_calibration_rendezvous, &r, 0, 0);
1034 }
1036 void init_percpu_time(void)
1037 {
1038 struct cpu_time *t = &this_cpu(cpu_time);
1039 unsigned long flags;
1040 s_time_t now;
1042 local_irq_save(flags);
1043 rdtscll(t->local_tsc_stamp);
1044 now = !plt_src.read_counter ? 0 : read_platform_stime();
1045 local_irq_restore(flags);
1047 t->stime_master_stamp = now;
1048 t->stime_local_stamp = now;
1050 if ( smp_processor_id() == 0 )
1051 {
1052 init_timer(&calibration_timer, time_calibration, NULL, 0);
1053 set_timer(&calibration_timer, NOW() + EPOCH);
1054 }
1055 }
1057 /* Late init function (after all CPUs are booted). */
1058 int __init init_xen_time(void)
1059 {
1060 local_irq_disable();
1062 /* Check whether the TSC is invariant across deep C states; this is a
1063 new feature introduced with Nehalem. */
1064 if ( cpuid_edx(0x80000007) & (1u<<8) )
1065 tsc_invariant = 1;
1067 open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
1069 init_percpu_time();
1071 stime_platform_stamp = 0;
1072 init_platform_timer();
1074 do_settime(get_cmos_time(), 0, NOW());
1076 local_irq_enable();
1078 return 0;
1079 }
1082 /* Early init function. */
1083 void __init early_time_init(void)
1084 {
1085 u64 tmp = init_pit_and_calibrate_tsc();
1087 set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);
1089 do_div(tmp, 1000);
1090 cpu_khz = (unsigned long)tmp;
1091 printk("Detected %lu.%03lu MHz processor.\n",
1092 cpu_khz / 1000, cpu_khz % 1000);
1094 setup_irq(0, &irq0);
1095 }
1097 /* force_hpet_broadcast: if true, force use of HPET broadcast to work around
1098 the local APIC timer stopping in deep C states while the PIT is disabled. */
1099 static int force_hpet_broadcast;
1100 boolean_param("hpetbroadcast", force_hpet_broadcast);
1102 /* Keep the PIT enabled so that PIT broadcast keeps working while cpuidle is enabled. */
1103 static int disable_pit_irq(void)
1104 {
1105 if ( using_pit || !cpu_has_apic || (xen_cpuidle && !force_hpet_broadcast) )
1106 return 0;
1108 /*
1109 * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
1110 * emulation when entering deep C states.
1111 * XXX dom0 may rely on RTC interrupt delivery, so only enable
1112 * hpet_broadcast if force_hpet_broadcast.
1113 */
1114 if ( xen_cpuidle && force_hpet_broadcast )
1115 {
1116 hpet_broadcast_init();
1117 if ( !hpet_broadcast_is_available() )
1118 {
1119 printk("HPET broadcast init failed, turn to PIT broadcast.\n");
1120 return 0;
1121 }
1122 }
1124 /* Disable PIT CH0 timer interrupt. */
1125 outb_p(0x30, PIT_MODE);
1126 outb_p(0, PIT_CH0);
1127 outb_p(0, PIT_CH0);
1129 return 0;
1130 }
1131 __initcall(disable_pit_irq);
1133 void pit_broadcast_enter(void)
1134 {
1135 cpu_set(smp_processor_id(), pit_broadcast_mask);
1136 }
1138 void pit_broadcast_exit(void)
1139 {
1140 int cpu = smp_processor_id();
1142 if ( cpu_test_and_clear(cpu, pit_broadcast_mask) )
1143 reprogram_timer(per_cpu(timer_deadline, cpu));
1144 }
1146 int pit_broadcast_is_available(void)
1147 {
1148 return xen_cpuidle;
1149 }
1151 void send_timer_event(struct vcpu *v)
1152 {
1153 send_guest_vcpu_virq(v, VIRQ_TIMER);
1154 }
1156 /* Return secs after 00:00:00 localtime, 1 January, 1970. */
1157 unsigned long get_localtime(struct domain *d)
1158 {
1159 return wc_sec + (wc_nsec + NOW()) / 1000000000ULL
1160 + d->time_offset_seconds;
1161 }
1163 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
1164 static long cmos_utc_offset; /* in seconds */
1166 int time_suspend(void)
1167 {
1168 if ( smp_processor_id() == 0 )
1169 {
1170 cmos_utc_offset = -get_cmos_time();
1171 cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL);
1172 kill_timer(&calibration_timer);
1173 }
1175 /* Better to cancel calibration timer for accuracy. */
1176 clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
1178 return 0;
1179 }
1181 int time_resume(void)
1182 {
1183 /*u64 tmp = */init_pit_and_calibrate_tsc();
1185 disable_pit_irq();
1187 /* Disable this while calibrate_tsc_ap() also is skipped. */
1188 /*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/
1190 resume_platform_timer();
1192 init_percpu_time();
1194 do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
1196 if ( !is_idle_vcpu(current) )
1197 update_vcpu_system_time(current);
1199 return 0;
1200 }
1202 int dom0_pit_access(struct ioreq *ioreq)
1203 {
1204 /* Is Xen using Channel 2? Then disallow direct dom0 access. */
1205 if ( using_pit )
1206 return 0;
1208 switch ( ioreq->addr )
1209 {
1210 case PIT_CH2:
1211 if ( ioreq->dir == IOREQ_READ )
1212 ioreq->data = inb(PIT_CH2);
1213 else
1214 outb(ioreq->data, PIT_CH2);
1215 return 1;
1217 case PIT_MODE:
1218 if ( ioreq->dir == IOREQ_READ )
1219 return 0; /* urk! */
1220 switch ( ioreq->data & 0xc0 )
1221 {
1222 case 0xc0: /* Read Back */
1223 if ( ioreq->data & 0x08 ) /* Select Channel 2? */
1224 outb(ioreq->data & 0xf8, PIT_MODE);
1225 if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
1226 return 1; /* no - we're done */
1227 /* Filter Channel 2 and reserved bit 0. */
1228 ioreq->data &= ~0x09;
1229 return 0; /* emulate ch0/1 readback */
1230 case 0x80: /* Select Counter 2 */
1231 outb(ioreq->data, PIT_MODE);
1232 return 1;
1233 }
1235 case 0x61:
1236 if ( ioreq->dir == IOREQ_READ )
1237 ioreq->data = inb(0x61);
1238 else
1239 outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
1240 return 1;
1241 }
1243 return 0;
1244 }
1246 struct tm wallclock_time(void)
1247 {
1248 uint64_t seconds;
1250 if ( !wc_sec )
1251 return (struct tm) { 0 };
1253 seconds = NOW() + (wc_sec * 1000000000ull) + wc_nsec;
1254 do_div(seconds, 1000000000);
1255 return gmtime(seconds);
1256 }
1258 /*
1259 * Local variables:
1260 * mode: C
1261 * c-set-style: "BSD"
1262 * c-basic-offset: 4
1263 * tab-width: 4
1264 * indent-tabs-mode: nil
1265 * End:
1266 */