ia64/linux-2.6.18-xen.hg

view arch/alpha/kernel/time.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * linux/arch/alpha/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995, 1999, 2000 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14 * "A Kernel Model for Precision Timekeeping" by Dave Mills
15 * 1997-01-09 Adrian Sun
16 * use interval timer if CONFIG_RTC=y
17 * 1997-10-29 John Bowman (bowman@math.ualberta.ca)
18 * fixed tick loss calculation in timer_interrupt
19 * (round system clock to nearest tick instead of truncating)
20 * fixed algorithm in time_init for getting time from CMOS clock
21 * 1999-04-16 Thorsten Kranzkowski (dl8bcu@gmx.net)
22 * fixed algorithm in do_gettimeofday() for calculating the precise time
23 * from processor cycle counter (now taking lost_ticks into account)
24 * 2000-08-13 Jan-Benedict Glaw <jbglaw@lug-owl.de>
25 * Fixed time_init to be aware of epoches != 1900. This prevents
26 * booting up in 2048 for me;) Code is stolen from rtc.c.
27 * 2003-06-03 R. Scott Bailey <scott.bailey@eds.com>
28 * Tighten sanity in time_init from 1% (10,000 PPM) to 250 PPM
29 */
30 #include <linux/errno.h>
31 #include <linux/module.h>
32 #include <linux/sched.h>
33 #include <linux/kernel.h>
34 #include <linux/param.h>
35 #include <linux/string.h>
36 #include <linux/mm.h>
37 #include <linux/delay.h>
38 #include <linux/ioport.h>
39 #include <linux/irq.h>
40 #include <linux/interrupt.h>
41 #include <linux/init.h>
42 #include <linux/bcd.h>
43 #include <linux/profile.h>
45 #include <asm/uaccess.h>
46 #include <asm/io.h>
47 #include <asm/hwrpb.h>
48 #include <asm/8253pit.h>
50 #include <linux/mc146818rtc.h>
51 #include <linux/time.h>
52 #include <linux/timex.h>
54 #include "proto.h"
55 #include "irq_impl.h"
57 extern unsigned long wall_jiffies; /* kernel/timer.c */
59 static int set_rtc_mmss(unsigned long);
61 DEFINE_SPINLOCK(rtc_lock);
63 #define TICK_SIZE (tick_nsec / 1000)
65 /*
66 * Shift amount by which scaled_ticks_per_cycle is scaled. Shifting
67 * by 48 gives us 16 bits for HZ while keeping the accuracy good even
68 * for large CPU clock rates.
69 */
70 #define FIX_SHIFT 48
72 /* lump static variables together for more efficient access: */
73 static struct {
74 /* cycle counter last time it got invoked */
75 __u32 last_time;
76 /* ticks/cycle * 2^48 */
77 unsigned long scaled_ticks_per_cycle;
78 /* last time the CMOS clock got updated */
79 time_t last_rtc_update;
80 /* partial unused tick */
81 unsigned long partial_tick;
82 } state;
84 unsigned long est_cycle_freq;
87 static inline __u32 rpcc(void)
88 {
89 __u32 result;
90 asm volatile ("rpcc %0" : "=r"(result));
91 return result;
92 }
94 /*
95 * Scheduler clock - returns current time in nanosec units.
96 *
97 * Copied from ARM code for expediency... ;-}
98 */
99 unsigned long long sched_clock(void)
100 {
101 return (unsigned long long)jiffies * (1000000000 / HZ);
102 }
105 /*
106 * timer_interrupt() needs to keep up the real-time clock,
107 * as well as call the "do_timer()" routine every clocktick
108 */
109 irqreturn_t timer_interrupt(int irq, void *dev, struct pt_regs * regs)
110 {
111 unsigned long delta;
112 __u32 now;
113 long nticks;
115 #ifndef CONFIG_SMP
116 /* Not SMP, do kernel PC profiling here. */
117 profile_tick(CPU_PROFILING, regs);
118 #endif
120 write_seqlock(&xtime_lock);
122 /*
123 * Calculate how many ticks have passed since the last update,
124 * including any previous partial leftover. Save any resulting
125 * fraction for the next pass.
126 */
127 now = rpcc();
128 delta = now - state.last_time;
129 state.last_time = now;
130 delta = delta * state.scaled_ticks_per_cycle + state.partial_tick;
131 state.partial_tick = delta & ((1UL << FIX_SHIFT) - 1);
132 nticks = delta >> FIX_SHIFT;
134 while (nticks > 0) {
135 do_timer(regs);
136 #ifndef CONFIG_SMP
137 update_process_times(user_mode(regs));
138 #endif
139 nticks--;
140 }
142 /*
143 * If we have an externally synchronized Linux clock, then update
144 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
145 * called as close as possible to 500 ms before the new second starts.
146 */
147 if (ntp_synced()
148 && xtime.tv_sec > state.last_rtc_update + 660
149 && xtime.tv_nsec >= 500000 - ((unsigned) TICK_SIZE) / 2
150 && xtime.tv_nsec <= 500000 + ((unsigned) TICK_SIZE) / 2) {
151 int tmp = set_rtc_mmss(xtime.tv_sec);
152 state.last_rtc_update = xtime.tv_sec - (tmp ? 600 : 0);
153 }
155 write_sequnlock(&xtime_lock);
156 return IRQ_HANDLED;
157 }
159 void
160 common_init_rtc(void)
161 {
162 unsigned char x;
164 /* Reset periodic interrupt frequency. */
165 x = CMOS_READ(RTC_FREQ_SELECT) & 0x3f;
166 /* Test includes known working values on various platforms
167 where 0x26 is wrong; we refuse to change those. */
168 if (x != 0x26 && x != 0x25 && x != 0x19 && x != 0x06) {
169 printk("Setting RTC_FREQ to 1024 Hz (%x)\n", x);
170 CMOS_WRITE(0x26, RTC_FREQ_SELECT);
171 }
173 /* Turn on periodic interrupts. */
174 x = CMOS_READ(RTC_CONTROL);
175 if (!(x & RTC_PIE)) {
176 printk("Turning on RTC interrupts.\n");
177 x |= RTC_PIE;
178 x &= ~(RTC_AIE | RTC_UIE);
179 CMOS_WRITE(x, RTC_CONTROL);
180 }
181 (void) CMOS_READ(RTC_INTR_FLAGS);
183 outb(0x36, 0x43); /* pit counter 0: system timer */
184 outb(0x00, 0x40);
185 outb(0x00, 0x40);
187 outb(0xb6, 0x43); /* pit counter 2: speaker */
188 outb(0x31, 0x42);
189 outb(0x13, 0x42);
191 init_rtc_irq();
192 }
195 /* Validate a computed cycle counter result against the known bounds for
196 the given processor core. There's too much brokenness in the way of
197 timing hardware for any one method to work everywhere. :-(
199 Return 0 if the result cannot be trusted, otherwise return the argument. */
201 static unsigned long __init
202 validate_cc_value(unsigned long cc)
203 {
204 static struct bounds {
205 unsigned int min, max;
206 } cpu_hz[] __initdata = {
207 [EV3_CPU] = { 50000000, 200000000 }, /* guess */
208 [EV4_CPU] = { 100000000, 300000000 },
209 [LCA4_CPU] = { 100000000, 300000000 }, /* guess */
210 [EV45_CPU] = { 200000000, 300000000 },
211 [EV5_CPU] = { 250000000, 433000000 },
212 [EV56_CPU] = { 333000000, 667000000 },
213 [PCA56_CPU] = { 400000000, 600000000 }, /* guess */
214 [PCA57_CPU] = { 500000000, 600000000 }, /* guess */
215 [EV6_CPU] = { 466000000, 600000000 },
216 [EV67_CPU] = { 600000000, 750000000 },
217 [EV68AL_CPU] = { 750000000, 940000000 },
218 [EV68CB_CPU] = { 1000000000, 1333333333 },
219 /* None of the following are shipping as of 2001-11-01. */
220 [EV68CX_CPU] = { 1000000000, 1700000000 }, /* guess */
221 [EV69_CPU] = { 1000000000, 1700000000 }, /* guess */
222 [EV7_CPU] = { 800000000, 1400000000 }, /* guess */
223 [EV79_CPU] = { 1000000000, 2000000000 }, /* guess */
224 };
226 /* Allow for some drift in the crystal. 10MHz is more than enough. */
227 const unsigned int deviation = 10000000;
229 struct percpu_struct *cpu;
230 unsigned int index;
232 cpu = (struct percpu_struct *)((char*)hwrpb + hwrpb->processor_offset);
233 index = cpu->type & 0xffffffff;
235 /* If index out of bounds, no way to validate. */
236 if (index >= ARRAY_SIZE(cpu_hz))
237 return cc;
239 /* If index contains no data, no way to validate. */
240 if (cpu_hz[index].max == 0)
241 return cc;
243 if (cc < cpu_hz[index].min - deviation
244 || cc > cpu_hz[index].max + deviation)
245 return 0;
247 return cc;
248 }
251 /*
252 * Calibrate CPU clock using legacy 8254 timer/counter. Stolen from
253 * arch/i386/time.c.
254 */
256 #define CALIBRATE_LATCH 0xffff
257 #define TIMEOUT_COUNT 0x100000
259 static unsigned long __init
260 calibrate_cc_with_pit(void)
261 {
262 int cc, count = 0;
264 /* Set the Gate high, disable speaker */
265 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
267 /*
268 * Now let's take care of CTC channel 2
269 *
270 * Set the Gate high, program CTC channel 2 for mode 0,
271 * (interrupt on terminal count mode), binary count,
272 * load 5 * LATCH count, (LSB and MSB) to begin countdown.
273 */
274 outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
275 outb(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
276 outb(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
278 cc = rpcc();
279 do {
280 count++;
281 } while ((inb(0x61) & 0x20) == 0 && count < TIMEOUT_COUNT);
282 cc = rpcc() - cc;
284 /* Error: ECTCNEVERSET or ECPUTOOFAST. */
285 if (count <= 1 || count == TIMEOUT_COUNT)
286 return 0;
288 return ((long)cc * PIT_TICK_RATE) / (CALIBRATE_LATCH + 1);
289 }
291 /* The Linux interpretation of the CMOS clock register contents:
292 When the Update-In-Progress (UIP) flag goes from 1 to 0, the
293 RTC registers show the second which has precisely just started.
294 Let's hope other operating systems interpret the RTC the same way. */
296 static unsigned long __init
297 rpcc_after_update_in_progress(void)
298 {
299 do { } while (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP));
300 do { } while (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP);
302 return rpcc();
303 }
305 void __init
306 time_init(void)
307 {
308 unsigned int year, mon, day, hour, min, sec, cc1, cc2, epoch;
309 unsigned long cycle_freq, tolerance;
310 long diff;
312 /* Calibrate CPU clock -- attempt #1. */
313 if (!est_cycle_freq)
314 est_cycle_freq = validate_cc_value(calibrate_cc_with_pit());
316 cc1 = rpcc();
318 /* Calibrate CPU clock -- attempt #2. */
319 if (!est_cycle_freq) {
320 cc1 = rpcc_after_update_in_progress();
321 cc2 = rpcc_after_update_in_progress();
322 est_cycle_freq = validate_cc_value(cc2 - cc1);
323 cc1 = cc2;
324 }
326 cycle_freq = hwrpb->cycle_freq;
327 if (est_cycle_freq) {
328 /* If the given value is within 250 PPM of what we calculated,
329 accept it. Otherwise, use what we found. */
330 tolerance = cycle_freq / 4000;
331 diff = cycle_freq - est_cycle_freq;
332 if (diff < 0)
333 diff = -diff;
334 if ((unsigned long)diff > tolerance) {
335 cycle_freq = est_cycle_freq;
336 printk("HWRPB cycle frequency bogus. "
337 "Estimated %lu Hz\n", cycle_freq);
338 } else {
339 est_cycle_freq = 0;
340 }
341 } else if (! validate_cc_value (cycle_freq)) {
342 printk("HWRPB cycle frequency bogus, "
343 "and unable to estimate a proper value!\n");
344 }
346 /* From John Bowman <bowman@math.ualberta.ca>: allow the values
347 to settle, as the Update-In-Progress bit going low isn't good
348 enough on some hardware. 2ms is our guess; we haven't found
349 bogomips yet, but this is close on a 500Mhz box. */
350 __delay(1000000);
352 sec = CMOS_READ(RTC_SECONDS);
353 min = CMOS_READ(RTC_MINUTES);
354 hour = CMOS_READ(RTC_HOURS);
355 day = CMOS_READ(RTC_DAY_OF_MONTH);
356 mon = CMOS_READ(RTC_MONTH);
357 year = CMOS_READ(RTC_YEAR);
359 if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
360 BCD_TO_BIN(sec);
361 BCD_TO_BIN(min);
362 BCD_TO_BIN(hour);
363 BCD_TO_BIN(day);
364 BCD_TO_BIN(mon);
365 BCD_TO_BIN(year);
366 }
368 /* PC-like is standard; used for year >= 70 */
369 epoch = 1900;
370 if (year < 20)
371 epoch = 2000;
372 else if (year >= 20 && year < 48)
373 /* NT epoch */
374 epoch = 1980;
375 else if (year >= 48 && year < 70)
376 /* Digital UNIX epoch */
377 epoch = 1952;
379 printk(KERN_INFO "Using epoch = %d\n", epoch);
381 if ((year += epoch) < 1970)
382 year += 100;
384 xtime.tv_sec = mktime(year, mon, day, hour, min, sec);
385 xtime.tv_nsec = 0;
387 wall_to_monotonic.tv_sec -= xtime.tv_sec;
388 wall_to_monotonic.tv_nsec = 0;
390 if (HZ > (1<<16)) {
391 extern void __you_loose (void);
392 __you_loose();
393 }
395 state.last_time = cc1;
396 state.scaled_ticks_per_cycle
397 = ((unsigned long) HZ << FIX_SHIFT) / cycle_freq;
398 state.last_rtc_update = 0;
399 state.partial_tick = 0L;
401 /* Startup the timer source. */
402 alpha_mv.init_rtc();
403 }
405 /*
406 * Use the cycle counter to estimate an displacement from the last time
407 * tick. Unfortunately the Alpha designers made only the low 32-bits of
408 * the cycle counter active, so we overflow on 8.2 seconds on a 500MHz
409 * part. So we can't do the "find absolute time in terms of cycles" thing
410 * that the other ports do.
411 */
412 void
413 do_gettimeofday(struct timeval *tv)
414 {
415 unsigned long flags;
416 unsigned long sec, usec, lost, seq;
417 unsigned long delta_cycles, delta_usec, partial_tick;
419 do {
420 seq = read_seqbegin_irqsave(&xtime_lock, flags);
422 delta_cycles = rpcc() - state.last_time;
423 sec = xtime.tv_sec;
424 usec = (xtime.tv_nsec / 1000);
425 partial_tick = state.partial_tick;
426 lost = jiffies - wall_jiffies;
428 } while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
430 #ifdef CONFIG_SMP
431 /* Until and unless we figure out how to get cpu cycle counters
432 in sync and keep them there, we can't use the rpcc tricks. */
433 delta_usec = lost * (1000000 / HZ);
434 #else
435 /*
436 * usec = cycles * ticks_per_cycle * 2**48 * 1e6 / (2**48 * ticks)
437 * = cycles * (s_t_p_c) * 1e6 / (2**48 * ticks)
438 * = cycles * (s_t_p_c) * 15625 / (2**42 * ticks)
439 *
440 * which, given a 600MHz cycle and a 1024Hz tick, has a
441 * dynamic range of about 1.7e17, which is less than the
442 * 1.8e19 in an unsigned long, so we are safe from overflow.
443 *
444 * Round, but with .5 up always, since .5 to even is harder
445 * with no clear gain.
446 */
448 delta_usec = (delta_cycles * state.scaled_ticks_per_cycle
449 + partial_tick
450 + (lost << FIX_SHIFT)) * 15625;
451 delta_usec = ((delta_usec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2;
452 #endif
454 usec += delta_usec;
455 if (usec >= 1000000) {
456 sec += 1;
457 usec -= 1000000;
458 }
460 tv->tv_sec = sec;
461 tv->tv_usec = usec;
462 }
464 EXPORT_SYMBOL(do_gettimeofday);
466 int
467 do_settimeofday(struct timespec *tv)
468 {
469 time_t wtm_sec, sec = tv->tv_sec;
470 long wtm_nsec, nsec = tv->tv_nsec;
471 unsigned long delta_nsec;
473 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
474 return -EINVAL;
476 write_seqlock_irq(&xtime_lock);
478 /* The offset that is added into time in do_gettimeofday above
479 must be subtracted out here to keep a coherent view of the
480 time. Without this, a full-tick error is possible. */
482 #ifdef CONFIG_SMP
483 delta_nsec = (jiffies - wall_jiffies) * (NSEC_PER_SEC / HZ);
484 #else
485 delta_nsec = rpcc() - state.last_time;
486 delta_nsec = (delta_nsec * state.scaled_ticks_per_cycle
487 + state.partial_tick
488 + ((jiffies - wall_jiffies) << FIX_SHIFT)) * 15625;
489 delta_nsec = ((delta_nsec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2;
490 delta_nsec *= 1000;
491 #endif
493 nsec -= delta_nsec;
495 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
496 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
498 set_normalized_timespec(&xtime, sec, nsec);
499 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
501 ntp_clear();
503 write_sequnlock_irq(&xtime_lock);
504 clock_was_set();
505 return 0;
506 }
508 EXPORT_SYMBOL(do_settimeofday);
511 /*
512 * In order to set the CMOS clock precisely, set_rtc_mmss has to be
513 * called 500 ms after the second nowtime has started, because when
514 * nowtime is written into the registers of the CMOS clock, it will
515 * jump to the next second precisely 500 ms later. Check the Motorola
516 * MC146818A or Dallas DS12887 data sheet for details.
517 *
518 * BUG: This routine does not handle hour overflow properly; it just
519 * sets the minutes. Usually you won't notice until after reboot!
520 */
523 static int
524 set_rtc_mmss(unsigned long nowtime)
525 {
526 int retval = 0;
527 int real_seconds, real_minutes, cmos_minutes;
528 unsigned char save_control, save_freq_select;
530 /* irq are locally disabled here */
531 spin_lock(&rtc_lock);
532 /* Tell the clock it's being set */
533 save_control = CMOS_READ(RTC_CONTROL);
534 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
536 /* Stop and reset prescaler */
537 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
538 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
540 cmos_minutes = CMOS_READ(RTC_MINUTES);
541 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
542 BCD_TO_BIN(cmos_minutes);
544 /*
545 * since we're only adjusting minutes and seconds,
546 * don't interfere with hour overflow. This avoids
547 * messing with unknown time zones but requires your
548 * RTC not to be off by more than 15 minutes
549 */
550 real_seconds = nowtime % 60;
551 real_minutes = nowtime / 60;
552 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) {
553 /* correct for half hour time zone */
554 real_minutes += 30;
555 }
556 real_minutes %= 60;
558 if (abs(real_minutes - cmos_minutes) < 30) {
559 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
560 BIN_TO_BCD(real_seconds);
561 BIN_TO_BCD(real_minutes);
562 }
563 CMOS_WRITE(real_seconds,RTC_SECONDS);
564 CMOS_WRITE(real_minutes,RTC_MINUTES);
565 } else {
566 printk(KERN_WARNING
567 "set_rtc_mmss: can't update from %d to %d\n",
568 cmos_minutes, real_minutes);
569 retval = -1;
570 }
572 /* The following flags have to be released exactly in this order,
573 * otherwise the DS12887 (popular MC146818A clone with integrated
574 * battery and quartz) will not reset the oscillator and will not
575 * update precisely 500 ms later. You won't find this mentioned in
576 * the Dallas Semiconductor data sheets, but who believes data
577 * sheets anyway ... -- Markus Kuhn
578 */
579 CMOS_WRITE(save_control, RTC_CONTROL);
580 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
581 spin_unlock(&rtc_lock);
583 return retval;
584 }