ia64/linux-2.6.18-xen.hg

view drivers/acpi/processor_idle.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, so there is temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.
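
For illustration only, a minimal sketch of the retry behaviour described
above. The names increase_reservation(), decrease_reservation(),
current_target(), current_pages and balloon_timer are hypothetical
stand-ins, not the driver's actual interfaces:

/* Hypothetical sketch: keep whatever pages Xen grants and retry on a
 * timer instead of recording a "hard limit" and giving up. */
static void balloon_process(struct work_struct *work)
{
	long credit = current_target() - current_pages;

	if (credit > 0)
		/* Partial success is fine: keep the pages we did get. */
		current_pages += increase_reservation(credit);
	else if (credit < 0)
		current_pages -= decrease_reservation(-credit);

	/* Target not reached (e.g. transient host memory pressure):
	 * retry later rather than giving up until a new target is set. */
	if (current_pages != current_target())
		mod_timer(&balloon_timer, jiffies + HZ);
}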

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents ff9683032b76
children
line source
1 /*
2 * processor_idle - idle state submodule to the ACPI processor driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8 * - Added processor hotplug support
9 * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
10 * - Added support for C3 on SMP
11 *
12 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or (at
17 * your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful, but
20 * WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 *
28 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29 */
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/init.h>
34 #include <linux/cpufreq.h>
35 #include <linux/proc_fs.h>
36 #include <linux/seq_file.h>
37 #include <linux/acpi.h>
38 #include <linux/dmi.h>
39 #include <linux/moduleparam.h>
40 #include <linux/sched.h> /* need_resched() */
42 #include <asm/io.h>
43 #include <asm/uaccess.h>
45 #include <acpi/acpi_bus.h>
46 #include <acpi/processor.h>
48 #define ACPI_PROCESSOR_COMPONENT 0x01000000
49 #define ACPI_PROCESSOR_CLASS "processor"
50 #define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver"
51 #define _COMPONENT ACPI_PROCESSOR_COMPONENT
52 ACPI_MODULE_NAME("acpi_processor")
53 #define ACPI_PROCESSOR_FILE_POWER "power"
54 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
55 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
56 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
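/*
 * For reference: the ACPI PM timer runs at PM_TIMER_FREQUENCY (3579545 Hz,
 * i.e. roughly 3.579 ticks per microsecond), so US_TO_PM_TIMER_TICKS(t)
 * converts a latency of t microseconds into about t * 3.579 timer ticks;
 * e.g. a 100us C2 latency is ~357 ticks, and the 4-tick C2/C3 overhead
 * values above correspond to roughly 1us of entry/exit cost.
 */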
57 static void (*pm_idle_save) (void) __read_mostly;
58 module_param(max_cstate, uint, 0644);
60 static unsigned int nocst __read_mostly;
61 module_param(nocst, uint, 0000);
63 /*
64 * bm_history -- bit-mask with a bit per jiffy of bus-master activity
65 * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
66 * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
67 * 100 HZ: 0x0000000F: 4 jiffies = 40ms
68 * reduce history for more aggressive entry into C3
69 */
70 static unsigned int bm_history __read_mostly =
71 (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
72 module_param(bm_history, uint, 0644);
73 /* --------------------------------------------------------------------------
74 Power Management
75 -------------------------------------------------------------------------- */
77 /*
78 * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
79 * For now disable this. Probably a bug somewhere else.
80 *
81 * To skip this limit, boot/load with a large max_cstate limit.
82 */
83 static int set_max_cstate(struct dmi_system_id *id)
84 {
85 if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
86 return 0;
88 printk(KERN_NOTICE PREFIX "%s detected - limiting to C%ld max_cstate."
89 " Override with \"processor.max_cstate=%d\"\n", id->ident,
90 (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1);
92 max_cstate = (long)id->driver_data;
94 return 0;
95 }
97 /* Actually this shouldn't be __cpuinitdata, would be better to fix the
98 callers to only run once -AK */
99 static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
100 { set_max_cstate, "IBM ThinkPad R40e", {
101 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
102 DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
103 { set_max_cstate, "IBM ThinkPad R40e", {
104 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
105 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
106 { set_max_cstate, "IBM ThinkPad R40e", {
107 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
108 DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
109 { set_max_cstate, "IBM ThinkPad R40e", {
110 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
111 DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
112 { set_max_cstate, "IBM ThinkPad R40e", {
113 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
114 DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
115 { set_max_cstate, "IBM ThinkPad R40e", {
116 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
117 DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
118 { set_max_cstate, "IBM ThinkPad R40e", {
119 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
120 DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
121 { set_max_cstate, "IBM ThinkPad R40e", {
122 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
123 DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
124 { set_max_cstate, "IBM ThinkPad R40e", {
125 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
126 DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
127 { set_max_cstate, "IBM ThinkPad R40e", {
128 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
129 DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
130 { set_max_cstate, "IBM ThinkPad R40e", {
131 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
132 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
133 { set_max_cstate, "IBM ThinkPad R40e", {
134 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
135 DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
136 { set_max_cstate, "IBM ThinkPad R40e", {
137 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
138 DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
139 { set_max_cstate, "IBM ThinkPad R40e", {
140 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
141 DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
142 { set_max_cstate, "IBM ThinkPad R40e", {
143 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
144 DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
145 { set_max_cstate, "IBM ThinkPad R40e", {
146 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
147 DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
148 { set_max_cstate, "Medion 41700", {
149 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
150 DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
151 { set_max_cstate, "Clevo 5600D", {
152 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
153 DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
154 (void *)2},
155 {},
156 };
158 static inline u32 ticks_elapsed(u32 t1, u32 t2)
159 {
160 if (t2 >= t1)
161 return (t2 - t1);
162 else if (!acpi_fadt.tmr_val_ext)
163 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
164 else
165 return ((0xFFFFFFFF - t1) + t2);
166 }
168 static void
169 acpi_processor_power_activate(struct acpi_processor *pr,
170 struct acpi_processor_cx *new)
171 {
172 struct acpi_processor_cx *old;
174 if (!pr || !new)
175 return;
177 old = pr->power.state;
179 if (old)
180 old->promotion.count = 0;
181 new->demotion.count = 0;
183 /* Cleanup from old state. */
184 if (old) {
185 switch (old->type) {
186 case ACPI_STATE_C3:
187 /* Disable bus master reload */
188 if (new->type != ACPI_STATE_C3 && pr->flags.bm_check)
189 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0,
190 ACPI_MTX_DO_NOT_LOCK);
191 break;
192 }
193 }
195 /* Prepare to use new state. */
196 switch (new->type) {
197 case ACPI_STATE_C3:
198 /* Enable bus master reload */
199 if (old->type != ACPI_STATE_C3 && pr->flags.bm_check)
200 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1,
201 ACPI_MTX_DO_NOT_LOCK);
202 break;
203 }
205 pr->power.state = new;
207 return;
208 }
210 static void acpi_safe_halt(void)
211 {
212 current_thread_info()->status &= ~TS_POLLING;
213 smp_mb__after_clear_bit();
214 if (!need_resched())
215 safe_halt();
216 current_thread_info()->status |= TS_POLLING;
217 }
219 static atomic_t c3_cpu_count;
221 static void acpi_processor_idle(void)
222 {
223 struct acpi_processor *pr = NULL;
224 struct acpi_processor_cx *cx = NULL;
225 struct acpi_processor_cx *next_state = NULL;
226 int sleep_ticks = 0;
227 u32 t1, t2 = 0;
229 pr = processors[smp_processor_id()];
230 if (!pr)
231 return;
233 /*
234 * Interrupts must be disabled during bus mastering calculations and
235 * for C2/C3 transitions.
236 */
237 local_irq_disable();
239 /*
240 * Check whether we truly need to go idle, or should
241 * reschedule:
242 */
243 if (unlikely(need_resched())) {
244 local_irq_enable();
245 return;
246 }
248 cx = pr->power.state;
249 if (!cx) {
250 if (pm_idle_save)
251 pm_idle_save();
252 else
253 acpi_safe_halt();
254 return;
255 }
257 /*
258 * Check BM Activity
259 * -----------------
260 * Check for bus mastering activity (if required), record, and check
261 * for demotion.
262 */
263 if (pr->flags.bm_check) {
264 u32 bm_status = 0;
265 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
267 if (diff > 31)
268 diff = 31;
270 pr->power.bm_activity <<= diff;
272 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS,
273 &bm_status, ACPI_MTX_DO_NOT_LOCK);
274 if (bm_status) {
275 pr->power.bm_activity |= 0x1;
276 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS,
277 1, ACPI_MTX_DO_NOT_LOCK);
278 }
279 /*
280 * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
281 * the true state of bus mastering activity; forcing us to
282 * manually check the BMIDEA bit of each IDE channel.
283 */
284 else if (errata.piix4.bmisx) {
285 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
286 || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
287 pr->power.bm_activity |= 0x1;
288 }
290 pr->power.bm_check_timestamp = jiffies;
292 /*
293 * If bus mastering is or was active this jiffy, demote
294 * to avoid a faulty transition. Note that the processor
295 * won't enter a low-power state during this call (to this
296 * function) but should upon the next.
297 *
298 * TBD: A better policy might be to fall back to the demotion
299 * state (use it for this quantum only) instead of
300 * demoting -- and rely on duration as our sole demotion
301 * qualification. This may, however, introduce DMA
302 * issues (e.g. floppy DMA transfer overrun/underrun).
303 */
304 if ((pr->power.bm_activity & 0x1) &&
305 cx->demotion.threshold.bm) {
306 local_irq_enable();
307 next_state = cx->demotion.state;
308 goto end;
309 }
310 }
312 #ifdef CONFIG_HOTPLUG_CPU
313 /*
314 * Check for P_LVL2_UP flag before entering C2 and above on
315 * an SMP system. We do it here instead of doing it at _CST/P_LVL
316 * detection phase, to work cleanly with logical CPU hotplug.
317 */
318 if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
319 !pr->flags.has_cst && !acpi_fadt.plvl2_up)
320 cx = &pr->power.states[ACPI_STATE_C1];
321 #endif
323 /*
324 * Sleep:
325 * ------
326 * Invoke the current Cx state to put the processor to sleep.
327 */
328 if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
329 current_thread_info()->status &= ~TS_POLLING;
330 smp_mb__after_clear_bit();
331 if (need_resched()) {
332 current_thread_info()->status |= TS_POLLING;
333 local_irq_enable();
334 return;
335 }
336 }
338 switch (cx->type) {
340 case ACPI_STATE_C1:
341 /*
342 * Invoke C1.
343 * Use the appropriate idle routine, the one that would
344 * be used without acpi C-states.
345 */
346 if (pm_idle_save)
347 pm_idle_save();
348 else
349 acpi_safe_halt();
351 /*
352 * TBD: Can't get time duration while in C1, as resumes
353 * go to an ISR rather than here. Need to instrument
354 * base interrupt handler.
355 */
356 sleep_ticks = 0xFFFFFFFF;
357 break;
359 case ACPI_STATE_C2:
360 /* Get start time (ticks) */
361 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
362 /* Invoke C2 */
363 inb(cx->address);
364 /* Dummy wait op - must do something useless after P_LVL2 read
365 because chipsets cannot guarantee that STPCLK# signal
366 gets asserted in time to freeze execution properly. */
367 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
368 /* Get end time (ticks) */
369 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
371 #ifdef CONFIG_GENERIC_TIME
372 /* TSC halts in C2, so notify users */
373 mark_tsc_unstable();
374 #endif
375 /* Re-enable interrupts */
376 local_irq_enable();
377 current_thread_info()->status |= TS_POLLING;
378 /* Compute time (ticks) that we were actually asleep */
379 sleep_ticks =
380 ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
381 break;
383 case ACPI_STATE_C3:
385 if (pr->flags.bm_check) {
386 if (atomic_inc_return(&c3_cpu_count) ==
387 num_online_cpus()) {
388 /*
389 * All CPUs are trying to go to C3
390 * Disable bus master arbitration
391 */
392 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1,
393 ACPI_MTX_DO_NOT_LOCK);
394 }
395 } else {
396 /* SMP with no shared cache... Invalidate cache */
397 ACPI_FLUSH_CPU_CACHE();
398 }
400 /* Get start time (ticks) */
401 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
402 /* Invoke C3 */
403 inb(cx->address);
404 /* Dummy wait op (see above) */
405 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
406 /* Get end time (ticks) */
407 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
408 if (pr->flags.bm_check) {
409 /* Enable bus master arbitration */
410 atomic_dec(&c3_cpu_count);
411 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0,
412 ACPI_MTX_DO_NOT_LOCK);
413 }
415 #ifdef CONFIG_GENERIC_TIME
416 /* TSC halts in C3, so notify users */
417 mark_tsc_unstable();
418 #endif
419 /* Re-enable interrupts */
420 local_irq_enable();
421 current_thread_info()->status |= TS_POLLING;
422 /* Compute time (ticks) that we were actually asleep */
423 sleep_ticks =
424 ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
425 break;
427 default:
428 local_irq_enable();
429 return;
430 }
431 cx->usage++;
432 if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0))
433 cx->time += sleep_ticks;
435 next_state = pr->power.state;
437 #ifdef CONFIG_HOTPLUG_CPU
438 /* Don't do promotion/demotion */
439 if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) &&
440 !pr->flags.has_cst && !acpi_fadt.plvl2_up) {
441 next_state = cx;
442 goto end;
443 }
444 #endif
446 /*
447 * Promotion?
448 * ----------
449 * Track the number of longs (time asleep is greater than threshold)
450 * and promote when the count threshold is reached. Note that bus
451 * mastering activity may prevent promotions.
452 * Do not promote above max_cstate.
453 */
454 if (cx->promotion.state &&
455 ((cx->promotion.state - pr->power.states) <= max_cstate)) {
456 if (sleep_ticks > cx->promotion.threshold.ticks) {
457 cx->promotion.count++;
458 cx->demotion.count = 0;
459 if (cx->promotion.count >=
460 cx->promotion.threshold.count) {
461 if (pr->flags.bm_check) {
462 if (!
463 (pr->power.bm_activity & cx->
464 promotion.threshold.bm)) {
465 next_state =
466 cx->promotion.state;
467 goto end;
468 }
469 } else {
470 next_state = cx->promotion.state;
471 goto end;
472 }
473 }
474 }
475 }
477 /*
478 * Demotion?
479 * ---------
480 * Track the number of shorts (time asleep is less than time threshold)
481 * and demote when the usage threshold is reached.
482 */
483 if (cx->demotion.state) {
484 if (sleep_ticks < cx->demotion.threshold.ticks) {
485 cx->demotion.count++;
486 cx->promotion.count = 0;
487 if (cx->demotion.count >= cx->demotion.threshold.count) {
488 next_state = cx->demotion.state;
489 goto end;
490 }
491 }
492 }
494 end:
495 /*
496 * Demote if current state exceeds max_cstate
497 */
498 if ((pr->power.state - pr->power.states) > max_cstate) {
499 if (cx->demotion.state)
500 next_state = cx->demotion.state;
501 }
503 /*
504 * New Cx State?
505 * -------------
506 * If we're going to start using a new Cx state we must clean up
507 * from the previous and prepare to use the new.
508 */
509 if (next_state != pr->power.state)
510 acpi_processor_power_activate(pr, next_state);
511 }
513 static int acpi_processor_set_power_policy(struct acpi_processor *pr)
514 {
515 unsigned int i;
516 unsigned int state_is_set = 0;
517 struct acpi_processor_cx *lower = NULL;
518 struct acpi_processor_cx *higher = NULL;
519 struct acpi_processor_cx *cx;
522 if (!pr)
523 return -EINVAL;
525 /*
526 * This function sets the default Cx state policy (OS idle handler).
527 * Our scheme is to promote quickly to C2 but more conservatively
528 * to C3. We're favoring C2 for its characteristics of low latency
529 * (quick response), good power savings, and ability to allow bus
530 * mastering activity. Note that the Cx state policy is completely
531 * customizable and can be altered dynamically.
532 */
534 /* startup state */
535 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
536 cx = &pr->power.states[i];
537 if (!cx->valid)
538 continue;
540 if (!state_is_set)
541 pr->power.state = cx;
542 state_is_set++;
543 break;
544 }
546 if (!state_is_set)
547 return -ENODEV;
549 /* demotion */
550 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
551 cx = &pr->power.states[i];
552 if (!cx->valid)
553 continue;
555 if (lower) {
556 cx->demotion.state = lower;
557 cx->demotion.threshold.ticks = cx->latency_ticks;
558 cx->demotion.threshold.count = 1;
559 if (cx->type == ACPI_STATE_C3)
560 cx->demotion.threshold.bm = bm_history;
561 }
563 lower = cx;
564 }
566 /* promotion */
567 for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
568 cx = &pr->power.states[i];
569 if (!cx->valid)
570 continue;
572 if (higher) {
573 cx->promotion.state = higher;
574 cx->promotion.threshold.ticks = cx->latency_ticks;
575 if (cx->type >= ACPI_STATE_C2)
576 cx->promotion.threshold.count = 4;
577 else
578 cx->promotion.threshold.count = 10;
579 if (higher->type == ACPI_STATE_C3)
580 cx->promotion.threshold.bm = bm_history;
581 }
583 higher = cx;
584 }
586 return 0;
587 }
589 static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr)
590 {
592 if (!pr)
593 return -EINVAL;
595 if (!pr->pblk)
596 return -ENODEV;
598 /* if info is obtained from pblk/fadt, type equals state */
599 pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
600 pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
602 #ifndef CONFIG_HOTPLUG_CPU
603 /*
604 * Check for P_LVL2_UP flag before entering C2 and above on
605 * an SMP system.
606 */
607 if ((num_online_cpus() > 1) && !acpi_fadt.plvl2_up)
608 return -ENODEV;
609 #endif
611 /* determine C2 and C3 address from pblk */
612 pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
613 pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
615 /* determine latencies from FADT */
616 pr->power.states[ACPI_STATE_C2].latency = acpi_fadt.plvl2_lat;
617 pr->power.states[ACPI_STATE_C3].latency = acpi_fadt.plvl3_lat;
619 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
620 "lvl2[0x%08x] lvl3[0x%08x]\n",
621 pr->power.states[ACPI_STATE_C2].address,
622 pr->power.states[ACPI_STATE_C3].address));
624 return 0;
625 }
627 static int acpi_processor_get_power_info_default_c1(struct acpi_processor *pr)
628 {
630 /* Zero initialize all the C-states info. */
631 memset(pr->power.states, 0, sizeof(pr->power.states));
633 /* set the first C-State to C1 */
634 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
636 /* the C0 state only exists as a filler in our array,
637 * and all processors need to support C1 */
638 pr->power.states[ACPI_STATE_C0].valid = 1;
639 pr->power.states[ACPI_STATE_C1].valid = 1;
641 return 0;
642 }
644 static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
645 {
646 acpi_status status = 0;
647 acpi_integer count;
648 int current_count;
649 int i;
650 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
651 union acpi_object *cst;
654 if (nocst)
655 return -ENODEV;
657 current_count = 0;
659 /* Zero initialize C2 onwards and prepare for fresh CST lookup */
660 for (i = 2; i < ACPI_PROCESSOR_MAX_POWER; i++)
661 memset(&(pr->power.states[i]), 0,
662 sizeof(struct acpi_processor_cx));
664 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
665 if (ACPI_FAILURE(status)) {
666 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
667 return -ENODEV;
668 }
670 cst = (union acpi_object *)buffer.pointer;
672 /* There must be at least 2 elements */
673 if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
674 printk(KERN_ERR PREFIX "not enough elements in _CST\n");
675 status = -EFAULT;
676 goto end;
677 }
679 count = cst->package.elements[0].integer.value;
681 /* Validate number of power states. */
682 if (count < 1 || count != cst->package.count - 1) {
683 printk(KERN_ERR PREFIX "count given by _CST is not valid\n");
684 status = -EFAULT;
685 goto end;
686 }
688 /* Tell driver that at least _CST is supported. */
689 pr->flags.has_cst = 1;
691 for (i = 1; i <= count; i++) {
692 union acpi_object *element;
693 union acpi_object *obj;
694 struct acpi_power_register *reg;
695 struct acpi_processor_cx cx;
697 memset(&cx, 0, sizeof(cx));
699 element = (union acpi_object *)&(cst->package.elements[i]);
700 if (element->type != ACPI_TYPE_PACKAGE)
701 continue;
703 if (element->package.count != 4)
704 continue;
706 obj = (union acpi_object *)&(element->package.elements[0]);
708 if (obj->type != ACPI_TYPE_BUFFER)
709 continue;
711 reg = (struct acpi_power_register *)obj->buffer.pointer;
713 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
714 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
715 continue;
717 if (!processor_pm_external())
718 cx.address = (reg->space_id ==
719 ACPI_ADR_SPACE_FIXED_HARDWARE) ?
720 0 : reg->address;
721 else
722 cx.address = reg->address;
724 /* There should be an easy way to extract an integer... */
725 obj = (union acpi_object *)&(element->package.elements[1]);
726 if (obj->type != ACPI_TYPE_INTEGER)
727 continue;
729 cx.type = obj->integer.value;
731 /*
732 * Some buggy BIOSes won't list C1 in _CST -
733 * Let acpi_processor_get_power_info_default() handle them later
734 */
735 if (i == 1 && cx.type != ACPI_STATE_C1)
736 current_count++;
738 /* Following check doesn't apply to external control case */
739 if (!processor_pm_external() &&
740 (cx.type != ACPI_STATE_C1) &&
741 (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO))
742 continue;
744 obj = (union acpi_object *)&(element->package.elements[2]);
745 if (obj->type != ACPI_TYPE_INTEGER)
746 continue;
748 cx.latency = obj->integer.value;
750 obj = (union acpi_object *)&(element->package.elements[3]);
751 if (obj->type != ACPI_TYPE_INTEGER)
752 continue;
754 cx.power = obj->integer.value;
756 #ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
757 /* cache control methods to notify external logic */
758 if (processor_pm_external())
759 memcpy(&cx.reg, reg, sizeof(*reg));
760 #endif
762 current_count++;
763 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
765 /*
766 * We support total ACPI_PROCESSOR_MAX_POWER - 1
767 * (From 1 through ACPI_PROCESSOR_MAX_POWER - 1)
768 */
769 if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) {
770 printk(KERN_WARNING
771 "Limiting number of power states to max (%d)\n",
772 ACPI_PROCESSOR_MAX_POWER);
773 printk(KERN_WARNING
774 "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
775 break;
776 }
777 }
779 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n",
780 current_count));
782 /* Validate number of power states discovered */
783 if (current_count < 2)
784 status = -EFAULT;
786 end:
787 kfree(buffer.pointer);
789 return status;
790 }
792 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
793 {
795 if (!cx->address)
796 return;
798 /*
799 * C2 latency must be less than or equal to 100
800 * microseconds.
801 */
802 else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
803 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
804 "latency too large [%d]\n", cx->latency));
805 return;
806 }
808 /*
809 * Otherwise we've met all of our C2 requirements.
810 * Normalize the C2 latency to expedite policy
811 */
812 cx->valid = 1;
813 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
815 return;
816 }
818 static void acpi_processor_power_verify_c3(struct acpi_processor *pr,
819 struct acpi_processor_cx *cx)
820 {
821 static int bm_check_flag;
824 if (!cx->address)
825 return;
827 /*
828 * C3 latency must be less than or equal to 1000
829 * microseconds.
830 */
831 else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
832 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
833 "latency too large [%d]\n", cx->latency));
834 return;
835 }
837 /*
838 * PIIX4 Erratum #18: We don't support C3 when Type-F (fast)
839 * DMA transfers are used by any ISA device to avoid livelock.
840 * Note that we could disable Type-F DMA (as recommended by
841 * the erratum), but this is known to disrupt certain ISA
842 * devices thus we take the conservative approach.
843 */
844 else if (errata.piix4.fdma) {
845 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
846 "C3 not supported on PIIX4 with Type-F DMA\n"));
847 return;
848 }
850 /* All the logic here assumes flags.bm_check is same across all CPUs */
851 if (!bm_check_flag) {
852 /* Determine whether bm_check is needed based on CPU */
853 acpi_processor_power_init_bm_check(&(pr->flags), pr->id);
854 bm_check_flag = pr->flags.bm_check;
855 } else {
856 pr->flags.bm_check = bm_check_flag;
857 }
859 if (pr->flags.bm_check) {
860 if (!pr->flags.bm_control) {
861 if (pr->flags.has_cst != 1) {
862 /* bus mastering control is necessary */
863 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
864 "C3 support requires BM control\n"));
865 return;
866 } else {
867 /* Here we enter C3 without bus mastering */
868 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
869 "C3 support without BM control\n"));
870 }
871 }
872 } else {
873 /*
874 * WBINVD should be set in the FADT for the C3 state to be
875 * supported when bm_check is not required.
876 */
877 if (acpi_fadt.wb_invd != 1) {
878 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
879 "Cache invalidation should work properly"
880 " for C3 to be enabled on SMP systems\n"));
881 return;
882 }
883 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD,
884 0, ACPI_MTX_DO_NOT_LOCK);
885 }
887 /*
888 * Otherwise we've met all of our C3 requirements.
889 * Normalize the C3 latency to expedite policy. Enable
890 * checking of bus mastering status (bm_check) so we can
891 * use this in our C3 policy
892 */
893 cx->valid = 1;
894 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
896 return;
897 }
899 static int acpi_processor_power_verify(struct acpi_processor *pr)
900 {
901 unsigned int i;
902 unsigned int working = 0;
904 #ifdef ARCH_APICTIMER_STOPS_ON_C3
905 int timer_broadcast = 0;
906 cpumask_t mask = cpumask_of_cpu(pr->id);
907 on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1);
908 #endif
910 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
911 struct acpi_processor_cx *cx = &pr->power.states[i];
913 switch (cx->type) {
914 case ACPI_STATE_C1:
915 cx->valid = 1;
916 break;
918 case ACPI_STATE_C2:
919 acpi_processor_power_verify_c2(cx);
920 #ifdef ARCH_APICTIMER_STOPS_ON_C3
921 /* Some AMD systems fake C3 as C2, but still
922 have timer troubles */
923 if (cx->valid &&
924 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
925 timer_broadcast++;
926 #endif
927 break;
929 case ACPI_STATE_C3:
930 acpi_processor_power_verify_c3(pr, cx);
931 #ifdef ARCH_APICTIMER_STOPS_ON_C3
932 if (cx->valid)
933 timer_broadcast++;
934 #endif
935 break;
936 }
938 if (cx->valid)
939 working++;
940 }
942 #ifdef ARCH_APICTIMER_STOPS_ON_C3
943 if (timer_broadcast)
944 on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1);
945 #endif
947 return (working);
948 }
950 static int acpi_processor_get_power_info(struct acpi_processor *pr)
951 {
952 unsigned int i;
953 int result;
956 /* NOTE: the idle thread may not be running while calling
957 * this function */
959 /* Adding C1 state */
960 acpi_processor_get_power_info_default_c1(pr);
961 result = acpi_processor_get_power_info_cst(pr);
962 if (result == -ENODEV)
963 acpi_processor_get_power_info_fadt(pr);
965 pr->power.count = acpi_processor_power_verify(pr);
967 /*
968 * Set Default Policy
969 * ------------------
970 * Now that we know which states are supported, set the default
971 * policy. Note that this policy can be changed dynamically
972 * (e.g. encourage deeper sleeps to conserve battery life when
973 * not on AC).
974 */
975 result = acpi_processor_set_power_policy(pr);
976 if (result)
977 return result;
979 /*
980 * if one state of type C2 or C3 is available, mark this
981 * CPU as being "idle manageable"
982 */
983 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
984 if (pr->power.states[i].valid) {
985 pr->power.count = i;
986 if (pr->power.states[i].type >= ACPI_STATE_C2)
987 pr->flags.power = 1;
988 }
989 }
991 return 0;
992 }
994 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
995 {
996 int result = 0;
999 if (!pr)
1000 return -EINVAL;
1002 if (nocst) {
1003 return -ENODEV;
1004 }
1006 if (!pr->flags.power_setup_done)
1007 return -ENODEV;
1009 /* Fall back to the default idle loop */
1010 if (!processor_pm_external())
1011 pm_idle = pm_idle_save;
1012 synchronize_sched(); /* Relies on interrupts forcing exit from idle. */
1014 pr->flags.power = 0;
1015 result = acpi_processor_get_power_info(pr);
1016 if (processor_pm_external())
1017 processor_notify_external(pr,
1018 PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
1019 else if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
1020 pm_idle = acpi_processor_idle;
1022 return result;
1023 }
1025 /* proc interface */
1027 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
1028 {
1029 struct acpi_processor *pr = (struct acpi_processor *)seq->private;
1030 unsigned int i;
1033 if (!pr)
1034 goto end;
1036 seq_printf(seq, "active state: C%zd\n"
1037 "max_cstate: C%d\n"
1038 "bus master activity: %08x\n",
1039 pr->power.state ? pr->power.state - pr->power.states : 0,
1040 max_cstate, (unsigned)pr->power.bm_activity);
1042 seq_puts(seq, "states:\n");
1044 for (i = 1; i <= pr->power.count; i++) {
1045 seq_printf(seq, " %cC%d: ",
1046 (&pr->power.states[i] ==
1047 pr->power.state ? '*' : ' '), i);
1049 if (!pr->power.states[i].valid) {
1050 seq_puts(seq, "<not supported>\n");
1051 continue;
1052 }
1054 switch (pr->power.states[i].type) {
1055 case ACPI_STATE_C1:
1056 seq_printf(seq, "type[C1] ");
1057 break;
1058 case ACPI_STATE_C2:
1059 seq_printf(seq, "type[C2] ");
1060 break;
1061 case ACPI_STATE_C3:
1062 seq_printf(seq, "type[C3] ");
1063 break;
1064 default:
1065 seq_printf(seq, "type[--] ");
1066 break;
1067 }
1069 if (pr->power.states[i].promotion.state)
1070 seq_printf(seq, "promotion[C%zd] ",
1071 (pr->power.states[i].promotion.state -
1072 pr->power.states));
1073 else
1074 seq_puts(seq, "promotion[--] ");
1076 if (pr->power.states[i].demotion.state)
1077 seq_printf(seq, "demotion[C%zd] ",
1078 (pr->power.states[i].demotion.state -
1079 pr->power.states));
1080 else
1081 seq_puts(seq, "demotion[--] ");
1083 seq_printf(seq, "latency[%03d] usage[%08d] duration[%020llu]\n",
1084 pr->power.states[i].latency,
1085 pr->power.states[i].usage,
1086 pr->power.states[i].time);
1087 }
1089 end:
1090 return 0;
1091 }
1093 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
1094 {
1095 return single_open(file, acpi_processor_power_seq_show,
1096 PDE(inode)->data);
1097 }
1099 static const struct file_operations acpi_processor_power_fops = {
1100 .open = acpi_processor_power_open_fs,
1101 .read = seq_read,
1102 .llseek = seq_lseek,
1103 .release = single_release,
1104 };
1106 int acpi_processor_power_init(struct acpi_processor *pr,
1107 struct acpi_device *device)
1108 {
1109 acpi_status status = 0;
1110 static int first_run;
1111 struct proc_dir_entry *entry = NULL;
1112 unsigned int i;
1115 if (!first_run) {
1116 dmi_check_system(processor_power_dmi_table);
1117 if (max_cstate < ACPI_C_STATES_MAX)
1118 printk(KERN_NOTICE
1119 "ACPI: processor limited to max C-state %d\n",
1120 max_cstate);
1121 first_run++;
1122 }
1124 if (!pr)
1125 return -EINVAL;
1127 if (acpi_fadt.cst_cnt && !nocst) {
1128 status =
1129 acpi_os_write_port(acpi_fadt.smi_cmd, acpi_fadt.cst_cnt, 8);
1130 if (ACPI_FAILURE(status)) {
1131 ACPI_EXCEPTION((AE_INFO, status,
1132 "Notifying BIOS of _CST ability failed"));
1136 acpi_processor_get_power_info(pr);
1138 /*
1139 * Install the idle handler if processor power management is supported.
1140 * Note that the previously set idle handler will be used on
1141 * platforms that only support C1.
1142 */
1143 if ((pr->flags.power) && (!boot_option_idle_override)) {
1144 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
1145 for (i = 1; i <= pr->power.count; i++)
1146 if (pr->power.states[i].valid)
1147 printk(" C%d[C%d]", i,
1148 pr->power.states[i].type);
1149 printk(")\n");
1151 if (!processor_pm_external() && (pr->id == 0)) {
1152 pm_idle_save = pm_idle;
1153 pm_idle = acpi_processor_idle;
1154 }
1155 }
1157 /* 'power' [R] */
1158 entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1159 S_IRUGO, acpi_device_dir(device));
1160 if (!entry)
1161 return -EIO;
1162 else {
1163 entry->proc_fops = &acpi_processor_power_fops;
1164 entry->data = acpi_driver_data(device);
1165 entry->owner = THIS_MODULE;
1166 }
1168 pr->flags.power_setup_done = 1;
1170 if (processor_pm_external())
1171 processor_notify_external(pr,
1172 PROCESSOR_PM_INIT, PM_TYPE_IDLE);
1173 return 0;
1174 }
1176 int acpi_processor_power_exit(struct acpi_processor *pr,
1177 struct acpi_device *device)
1178 {
1180 pr->flags.power_setup_done = 0;
1182 if (acpi_device_dir(device))
1183 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1184 acpi_device_dir(device));
1186 /* Unregister the idle handler when processor #0 is removed. */
1187 if (pr->id == 0) {
1188 pm_idle = pm_idle_save;
1190 /*
1191 * We are about to unload the current idle thread pm callback
1192 * (pm_idle), Wait for all processors to update cached/local
1193 * copies of pm_idle before proceeding.
1194 */
1195 cpu_idle_wait();
1196 }
1198 return 0;