ia64/xen-unstable

changeset 2122:eb5e0b2dcb3e

bitkeeper revision 1.1159.1.12 (41168c230W3pxxUssBkT_C2wDyotvA)

Bring the 2.6 time code up to the 2.4 spec. In particular, this brings in a
bunch of DOM0 code, and it may help with some of the time problems that we've
been seeing.
author kaf24@scramble.cl.cam.ac.uk
date Sun Aug 08 20:25:07 2004 +0000 (2004-08-08)
parents 0064c2d431f4
children c43fdad0eca0 66687fe8d1c7
files linux-2.6.7-xen-sparse/arch/xen/i386/kernel/time.c linux-2.6.7-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c
line diff
     1.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/time.c	Sun Aug 08 19:11:16 2004 +0000
     1.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/time.c	Sun Aug 08 20:25:07 2004 +0000
     1.3 @@ -45,6 +45,7 @@
     1.4  #include <linux/sysdev.h>
     1.5  #include <linux/bcd.h>
     1.6  #include <linux/efi.h>
     1.7 +#include <linux/sysctl.h>
     1.8  
     1.9  #include <asm/io.h>
    1.10  #include <asm/smp.h>
    1.11 @@ -94,64 +95,76 @@ static u32 shadow_time_version;
    1.12  static struct timeval shadow_tv;
    1.13  extern u64 processed_system_time;
    1.14  
    1.15 -#define NS_PER_TICK (1000000000ULL/HZ)
    1.16 -
    1.17 -/*
    1.18 - * Reads a consistent set of time-base values from Xen, into a shadow data
    1.19 - * area. Must be called with the xtime_lock held for writing.
    1.20 - */
    1.21 -int __get_time_values_from_xen(void)
    1.22 -{
    1.23 -	s64 delta;
    1.24 -	unsigned int ticks = 0;
    1.25 -
    1.26 -	do {
    1.27 -		shadow_time_version = HYPERVISOR_shared_info->time_version2;
    1.28 -		rmb();
    1.29 -		shadow_tv.tv_sec    = HYPERVISOR_shared_info->wc_sec;
    1.30 -		shadow_tv.tv_usec   = HYPERVISOR_shared_info->wc_usec;
    1.31 -		shadow_tsc_stamp    = HYPERVISOR_shared_info->tsc_timestamp.tsc_bits;
    1.32 -		shadow_system_time  = HYPERVISOR_shared_info->system_time;
    1.33 -		rmb();
    1.34 -	}
    1.35 -	while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
    1.36 -
    1.37 -	delta = (s64)(shadow_system_time +
    1.38 -		      cur_timer->get_offset() * NSEC_PER_USEC -
    1.39 -		      processed_system_time);
    1.40 -	if (delta < 0) {
    1.41 -		printk("Timer ISR: Time went backwards: %lld\n", delta);
    1.42 -		return 1;
    1.43 -	}
    1.44 -
    1.45 -	if (delta < NS_PER_TICK)
    1.46 -		return 1;
    1.47 -
    1.48 -	/* Process elapsed jiffies since last call. */
    1.49 -	while (delta >= NS_PER_TICK) {
    1.50 -		ticks++;
    1.51 -		delta -= NS_PER_TICK;
    1.52 -		processed_system_time += NS_PER_TICK;
    1.53 -	}
    1.54 -	jiffies_64 += ticks - 1;
    1.55 -	/* We leave one tick for the caller to add to jiffies since
    1.56 -	 * the timer interrupt will call do_timer(). */
    1.57 -
    1.58 -	return 0;
    1.59 -}
    1.60 -
    1.61 -#define TIME_VALUES_UP_TO_DATE \
    1.62 -	(shadow_time_version == HYPERVISOR_shared_info->time_version2)
    1.63 -
    1.64  /*
    1.65   * We use this to ensure that gettimeofday() is monotonically increasing. We
    1.66   * only break this guarantee if the wall clock jumps backwards "a long way".
    1.67   */
    1.68  static struct timeval last_seen_tv = {0,0};
    1.69  
    1.70 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
    1.71 +/* Periodically propagate synchronised time base to the RTC and to Xen. */
    1.72 +static long last_rtc_update, last_update_to_xen;
    1.73 +#endif
    1.74 +
    1.75 +/* Periodically take synchronised time base from Xen, if we need it. */
    1.76 +static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
    1.77 +
    1.78  /* Keep track of last time we did processing/updating of jiffies and xtime. */
    1.79  u64 processed_system_time;   /* System time (ns) at last processing. */
    1.80  
    1.81 +#define NS_PER_TICK (1000000000ULL/HZ)
    1.82 +
    1.83 +#define HANDLE_USEC_UNDERFLOW(_tv)         \
    1.84 +    do {                                   \
    1.85 +        while ( (_tv).tv_usec < 0 )        \
    1.86 +        {                                  \
    1.87 +            (_tv).tv_usec += 1000000;      \
    1.88 +            (_tv).tv_sec--;                \
    1.89 +        }                                  \
    1.90 +    } while ( 0 )
    1.91 +#define HANDLE_USEC_OVERFLOW(_tv)          \
    1.92 +    do {                                   \
    1.93 +        while ( (_tv).tv_usec >= 1000000 ) \
    1.94 +        {                                  \
    1.95 +            (_tv).tv_usec -= 1000000;      \
    1.96 +            (_tv).tv_sec++;                \
    1.97 +        }                                  \
    1.98 +    } while ( 0 )
    1.99 +
   1.100 +/* Does this guest OS track Xen time, or set its wall clock independently? */
   1.101 +static int independent_wallclock = 0;
   1.102 +static int __init __independent_wallclock(char *str)
   1.103 +{
   1.104 +    independent_wallclock = 1;
   1.105 +    return 1;
   1.106 +}
   1.107 +__setup("independent_wallclock", __independent_wallclock);
   1.108 +
   1.109 +/*
   1.110 + * Reads a consistent set of time-base values from Xen, into a shadow data
   1.111 + * area. Must be called with the xtime_lock held for writing.
   1.112 + */
   1.113 +static void __get_time_values_from_xen(void)
   1.114 +{
   1.115 +	shared_info_t *s = HYPERVISOR_shared_info;
   1.116 +
   1.117 +	do {
   1.118 +		shadow_time_version = s->time_version2;
   1.119 +		rmb();
   1.120 +		shadow_tv.tv_sec    = s->wc_sec;
   1.121 +		shadow_tv.tv_usec   = s->wc_usec;
   1.122 +		shadow_tsc_stamp    = s->tsc_timestamp.tsc_bits;
   1.123 +		shadow_system_time  = s->system_time;
   1.124 +		rmb();
   1.125 +	}
   1.126 +	while (shadow_time_version != s->time_version1);
   1.127 +
   1.128 +	cur_timer->mark_offset();
   1.129 +}
   1.130 +
   1.131 +#define TIME_VALUES_UP_TO_DATE \
   1.132 +	(shadow_time_version == HYPERVISOR_shared_info->time_version2)
   1.133 +
   1.134  /*
   1.135   * This version of gettimeofday has microsecond resolution
   1.136   * and better than microsecond precision on fast x86 machines with TSC.
   1.137 @@ -196,10 +209,9 @@ void do_gettimeofday(struct timeval *tv)
   1.138  			 * overflowed). Detect that and recalculate
   1.139  			 * with fresh values.
   1.140  			 */
   1.141 -			write_seqlock(&xtime_lock);
   1.142 -			if (__get_time_values_from_xen() == 0)
   1.143 -				jiffies_64++;
   1.144 -			write_sequnlock(&xtime_lock);
   1.145 +			write_seqlock_irq(&xtime_lock);
   1.146 +			__get_time_values_from_xen();
   1.147 +			write_sequnlock_irq(&xtime_lock);
   1.148  			continue;
   1.149  		}
   1.150  	} while (read_seqretry(&xtime_lock, seq));
   1.151 @@ -229,18 +241,36 @@ int do_settimeofday(struct timespec *tv)
   1.152  {
   1.153  	time_t wtm_sec, sec = tv->tv_sec;
   1.154  	long wtm_nsec, nsec = tv->tv_nsec;
   1.155 +	struct timespec xentime;
   1.156  
   1.157  	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
   1.158  		return -EINVAL;
   1.159  
   1.160 +	if (!independent_wallclock && !(start_info.flags & SIF_INITDOMAIN))
   1.161 +		return 0; /* Silent failure? */
   1.162 +
   1.163  	write_seqlock_irq(&xtime_lock);
   1.164 +
   1.165 +	/*
   1.166 +	 * Ensure we don't get blocked for a long time so that our time delta
   1.167 +	 * overflows. If that were to happen then our shadow time values would
   1.168 +	 * be stale, so we can retry with fresh ones.
   1.169 +	 */
   1.170 + again:
   1.171 +	nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
   1.172 +	if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
   1.173 +		__get_time_values_from_xen();
   1.174 +		goto again;
   1.175 +	}
   1.176 +
   1.177 +	set_normalized_timespec(&xentime, sec, nsec);
   1.178 +
   1.179  	/*
   1.180  	 * This is revolting. We need to set "xtime" correctly. However, the
   1.181  	 * value in this location is the value at the most recent update of
   1.182  	 * wall time.  Discover what correction gettimeofday() would have
   1.183  	 * made, and then undo it!
   1.184  	 */
   1.185 -	nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
   1.186  	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
   1.187  
   1.188  	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
   1.189 @@ -254,16 +284,31 @@ int do_settimeofday(struct timespec *tv)
   1.190  	time_maxerror = NTP_PHASE_LIMIT;
   1.191  	time_esterror = NTP_PHASE_LIMIT;
   1.192  
   1.193 +	/* Reset all our running time counts. They make no sense now. */
   1.194  	last_seen_tv.tv_sec = 0;
   1.195 +	last_update_from_xen = 0;
   1.196  
   1.197 -	write_sequnlock_irq(&xtime_lock);
   1.198 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
   1.199 +	if (start_info.flags & SIF_INITDOMAIN) {
   1.200 +		dom0_op_t op;
   1.201 +		last_rtc_update = last_update_to_xen = 0;
   1.202 +		op.cmd = DOM0_SETTIME;
   1.203 +		op.u.settime.secs        = xentime.tv_sec;
   1.204 +		op.u.settime.usecs       = xentime.tv_nsec / 1000;
   1.205 +		op.u.settime.system_time = shadow_system_time;
   1.206 +		write_sequnlock_irq(&xtime_lock);
   1.207 +		HYPERVISOR_dom0_op(&op);
   1.208 +	} else
   1.209 +#endif
   1.210 +		write_sequnlock_irq(&xtime_lock);
   1.211 +
   1.212  	clock_was_set();
   1.213  	return 0;
   1.214  }
   1.215  
   1.216  EXPORT_SYMBOL(do_settimeofday);
   1.217  
   1.218 -#if 0
   1.219 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
   1.220  static int set_rtc_mmss(unsigned long nowtime)
   1.221  {
   1.222  	int retval;
   1.223 @@ -278,13 +323,8 @@ static int set_rtc_mmss(unsigned long no
   1.224  
   1.225  	return retval;
   1.226  }
   1.227 -
   1.228 -/* last time the cmos clock got updated */
   1.229 -static long last_rtc_update;
   1.230  #endif
   1.231  
   1.232 -int timer_ack;
   1.233 -
   1.234  /* monotonic_clock(): returns # of nanoseconds passed since time_init()
   1.235   *		Note: This function is required to return accurate
   1.236   *		time even in the absence of multiple timer ticks.
   1.237 @@ -303,26 +343,91 @@ EXPORT_SYMBOL(monotonic_clock);
   1.238  static inline void do_timer_interrupt(int irq, void *dev_id,
   1.239  					struct pt_regs *regs)
   1.240  {
   1.241 +	s64 delta;
   1.242 +	unsigned int ticks = 0;
   1.243 +	long sec_diff;
   1.244  
   1.245 -#ifdef CONFIG_X86_IO_APIC
   1.246 -	if (timer_ack) {
   1.247 +	__get_time_values_from_xen();
   1.248 +
   1.249 +	delta = (s64)(shadow_system_time +
   1.250 +		      (cur_timer->get_offset() * NSEC_PER_USEC) -
   1.251 +		      processed_system_time);
   1.252 +	if (delta < 0) {
   1.253 +		printk("Timer ISR: Time went backwards: %lld\n", delta);
   1.254 +		return;
   1.255 +	}
   1.256 +
   1.257 +	/* Process elapsed jiffies since last call. */
   1.258 +	while (delta >= NS_PER_TICK) {
   1.259 +		ticks++;
   1.260 +		delta -= NS_PER_TICK;
   1.261 +		processed_system_time += NS_PER_TICK;
   1.262 +	}
   1.263 +
   1.264 +	if (ticks != 0) {
   1.265 +		jiffies_64 += ticks - 1;
   1.266 +		do_timer_interrupt_hook(regs); /* implicit 'jiffies_64++' */
   1.267 +	}
   1.268 +
   1.269 +	/*
   1.270 +	 * Take synchronised time from Xen once a minute if we're not
   1.271 +	 * synchronised ourselves, and we haven't chosen to keep an independent
   1.272 +	 * time base.
   1.273 +	 */
   1.274 +	if (!independent_wallclock && 
   1.275 +	    ((time_status & STA_UNSYNC) != 0) &&
   1.276 +	    (xtime.tv_sec > (last_update_from_xen + 60))) {
   1.277 +		/* Adjust shadow for jiffies that haven't updated xtime yet. */
   1.278 +		shadow_tv.tv_usec -= 
   1.279 +			(jiffies - wall_jiffies) * (1000000/HZ);
   1.280 +		HANDLE_USEC_UNDERFLOW(shadow_tv);
   1.281 +
   1.282  		/*
   1.283 -		 * Subtle, when I/O APICs are used we have to ack timer IRQ
   1.284 -		 * manually to reset the IRR bit for do_slow_gettimeoffset().
   1.285 -		 * This will also deassert NMI lines for the watchdog if run
   1.286 -		 * on an 82489DX-based system.
   1.287 +		 * Reset our running time counts if they are invalidated by
   1.288 +		 * a warp backwards of more than 500ms.
   1.289  		 */
   1.290 -		spin_lock(&i8259A_lock);
   1.291 -		outb(0x0c, PIC_MASTER_OCW3);
   1.292 -		/* Ack the IRQ; AEOI will end it automatically. */
   1.293 -		inb(PIC_MASTER_POLL);
   1.294 -		spin_unlock(&i8259A_lock);
   1.295 +		sec_diff = xtime.tv_sec - shadow_tv.tv_sec;
   1.296 +		if (unlikely(abs(sec_diff) > 1) ||
   1.297 +		    unlikely(((sec_diff * 1000000) + 
   1.298 +		              (xtime.tv_nsec/1000) - shadow_tv.tv_usec) >
   1.299 +		             500000)) {
   1.300 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
   1.301 +			last_rtc_update = last_update_to_xen = 0;
   1.302 +#endif
   1.303 +			last_seen_tv.tv_sec = 0;
   1.304 +		}
   1.305 +
   1.306 +		/* Update our unsynchronised xtime appropriately. */
   1.307 +		xtime.tv_sec  = shadow_tv.tv_sec;
   1.308 +		xtime.tv_nsec = shadow_tv.tv_usec * 1000;
   1.309 +
   1.310 +		last_update_from_xen = xtime.tv_sec;
   1.311  	}
   1.312 -#endif
   1.313 +
   1.314 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
   1.315 +	if (!(start_info.flags & SIF_INITDOMAIN))
   1.316 +		return;
   1.317 +
   1.318 +	/* Send synchronised time to Xen approximately every minute. */
   1.319 +	if (((time_status & STA_UNSYNC) == 0) &&
   1.320 +	    (xtime.tv_sec > (last_update_to_xen + 60))) {
   1.321 +		dom0_op_t op;
   1.322 +		struct timeval tv;
   1.323  
   1.324 -	do_timer_interrupt_hook(regs);
   1.325 +		tv.tv_sec   = xtime.tv_sec;
   1.326 +		tv.tv_usec  = xtime.tv_nsec / 1000;
   1.327 +		tv.tv_usec += (jiffies - wall_jiffies) * (1000000/HZ);
   1.328 +		HANDLE_USEC_OVERFLOW(tv);
   1.329  
   1.330 -#if 0				/* XEN PRIV */
   1.331 +		op.cmd = DOM0_SETTIME;
   1.332 +		op.u.settime.secs        = tv.tv_sec;
   1.333 +		op.u.settime.usecs       = tv.tv_usec;
   1.334 +		op.u.settime.system_time = shadow_system_time;
   1.335 +		HYPERVISOR_dom0_op(&op);
   1.336 +
   1.337 +		last_update_to_xen = xtime.tv_sec;
   1.338 +	}
   1.339 +
   1.340  	/*
   1.341  	 * If we have an externally synchronized Linux clock, then update
   1.342  	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
   1.343 @@ -346,22 +451,6 @@ static inline void do_timer_interrupt(in
   1.344  			last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
   1.345  	}
   1.346  #endif
   1.347 -
   1.348 -#ifdef CONFIG_MCA
   1.349 -	if( MCA_bus ) {
   1.350 -		/* The PS/2 uses level-triggered interrupts.  You can't
   1.351 -		turn them off, nor would you want to (any attempt to
   1.352 -		enable edge-triggered interrupts usually gets intercepted by a
   1.353 -		special hardware circuit).  Hence we have to acknowledge
   1.354 -		the timer interrupt.  Through some incredibly stupid
   1.355 -		design idea, the reset for IRQ 0 is done by setting the
   1.356 -		high bit of the PPI port B (0x61).  Note that some PS/2s,
   1.357 -		notably the 55SX, work fine if this is removed.  */
   1.358 -
   1.359 -		irq = inb_p( 0x61 );	/* read the current state */
   1.360 -		outb_p( irq|0x80, 0x61 );	/* reset the IRQ */
   1.361 -	}
   1.362 -#endif
   1.363  }
   1.364  
   1.365  /*
   1.366 @@ -379,15 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
   1.367  	 * locally disabled. -arca
   1.368  	 */
   1.369  	write_seqlock(&xtime_lock);
   1.370 -
   1.371 -	if (__get_time_values_from_xen())
   1.372 -		goto out;
   1.373 -
   1.374 -	cur_timer->mark_offset();
   1.375 - 
   1.376  	do_timer_interrupt(irq, NULL, regs);
   1.377 -
   1.378 - out:
   1.379  	write_sequnlock(&xtime_lock);
   1.380  	return IRQ_HANDLED;
   1.381  }
   1.382 @@ -506,7 +587,7 @@ void __init time_init(void)
   1.383  	__get_time_values_from_xen();
   1.384  	processed_system_time = shadow_system_time;
   1.385  
   1.386 -	time_irq  = bind_virq_to_irq(VIRQ_TIMER);
   1.387 +	time_irq = bind_virq_to_irq(VIRQ_TIMER);
   1.388  
   1.389  	(void)setup_irq(time_irq, &irq_timer);
   1.390  }
   1.391 @@ -547,3 +628,23 @@ int set_timeout_timer(void)
   1.392  
   1.393  	return ret;
   1.394  }
   1.395 +
   1.396 +/*
   1.397 + * /proc/sys/xen: This really belongs in another file. It can stay here for
   1.398 + * now however.
   1.399 + */
   1.400 +static ctl_table xen_subtable[] = {
   1.401 +    {1, "independent_wallclock", &independent_wallclock,
   1.402 +     sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
   1.403 +    {0}
   1.404 +};
   1.405 +static ctl_table xen_table[] = {
   1.406 +    {123, "xen", NULL, 0, 0555, xen_subtable},
   1.407 +    {0}
   1.408 +};
   1.409 +static int __init xen_sysctl_init(void)
   1.410 +{
   1.411 +    (void)register_sysctl_table(xen_table, 0);
   1.412 +    return 0;
   1.413 +}
   1.414 +__initcall(xen_sysctl_init);
     2.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c	Sun Aug 08 19:11:16 2004 +0000
     2.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c	Sun Aug 08 20:25:07 2004 +0000
     2.3 @@ -358,22 +358,11 @@ static int __init init_tsc(char* overrid
     2.4  	return 0;
     2.5  }
     2.6  
     2.7 -#ifndef CONFIG_X86_TSC
     2.8 -/* disable flag for tsc.  Takes effect by clearing the TSC cpu flag
     2.9 - * in cpu/common.c */
    2.10  static int __init tsc_setup(char *str)
    2.11  {
    2.12 -	tsc_disable = 1;
    2.13 +	printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n");
    2.14  	return 1;
    2.15  }
    2.16 -#else
    2.17 -static int __init tsc_setup(char *str)
    2.18 -{
    2.19 -	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
    2.20 -				"cannot disable TSC.\n");
    2.21 -	return 1;
    2.22 -}
    2.23 -#endif
    2.24  __setup("notsc", tsc_setup);
    2.25  
    2.26