ia64/xen-unstable

xen/common/sched_credit.c @ 12279:cb8eeadd8eae

[XEN] When balancing idlers per socket/core, do it one at a time.
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>

author:   Emmanuel Ackaouy <ack@xensource.com>
date:     Tue Nov 07 10:37:30 2006 +0000 (2006-11-07)
parents:  bb6cd7ba259b
children: 05e1863cc2a3
/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_TICK     100
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
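
/*
 * For reference (derived arithmetic only, not additional tunables): with
 * the defaults above a time slice is 3 ticks * 10ms = 30ms, and both a
 * time slice and an accounting period are worth 3 * 100 = 300 credits.
 * A running VCPU is debited 100 credits per 10ms tick (see csched_tick),
 * and csched_acct() redistributes 300 credits per online CPU every 30ms.
 */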

/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE        -64      /* idle */
#define CSCHED_PRI_TS_PARKED   -65      /* time-share w/ capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

/*
 * Try and keep often cranked stats on top so they'll fit on one
 * cache line.
 */
#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(schedule)                        \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(tickle_idlers_none)              \
    _MACRO(tickle_idlers_some)              \
    _MACRO(vcpu_migrate)                    \
    _MACRO(load_balance_idle)               \
    _MACRO(load_balance_over)               \
    _MACRO(load_balance_other)              \
    _MACRO(steal_trylock_failed)            \
    _MACRO(steal_peer_down)                 \
    _MACRO(steal_peer_idle)                 \
    _MACRO(steal_peer_running)              \
    _MACRO(steal_peer_pinned)               \
    _MACRO(steal_peer_migrating)            \
    _MACRO(steal_peer_best_idler)           \
    _MACRO(steal_loner_candidate)           \
    _MACRO(steal_loner_signal)              \
    _MACRO(cpu_pick)                        \
    _MACRO(dom_init)                        \
    _MACRO(dom_destroy)                     \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_destroy)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)              \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                   \
    struct                                      \
    {                                           \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    } stats

#define CSCHED_STATS_PRINTK()                   \
    do                                          \
    {                                           \
        printk("stats:\n");                     \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()    do {} while ( 0 )
#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )

#endif /* CSCHED_STATS */
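
/*
 * Illustrative expansion of the X-macro machinery above (not literal
 * preprocessor output): CSCHED_STATS_DEFINE() becomes roughly
 *
 *     struct { uint32_t schedule; uint32_t acct_run; ... } stats;
 *
 * inside struct csched_private, and CSCHED_STAT_CRANK(vcpu_migrate)
 * becomes (csched_priv.stats.vcpu_migrate++). Adding a new counter
 * therefore only requires adding one _MACRO(name) line to the list above.
 */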

/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    int16_t pri;
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate;
    } stats;
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE();
};


/*
 * Global variables
 */
static struct csched_private csched_priv;


static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}
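
/*
 * Example (illustrative): if a runq currently holds priorities
 * [ BOOST(0), UNDER(-1), UNDER(-1), OVER(-2) ], inserting another UNDER
 * VCPU stops the scan at the OVER element and list_add_tail() places the
 * newcomer just before it, i.e. behind the existing UNDERs, so VCPUs of
 * equal priority are kept in FIFO order.
 */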

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(per_cpu(schedule_data, cpu).curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

/*
 * Indicates which of two given idlers is most efficient to run
 * an additional VCPU.
 *
 * Returns:
 *  0:        They are the same.
 *  negative: One is less efficient than Two.
 *  positive: One is more efficient than Two.
 */
static int
csched_idler_compare(int one, int two)
{
    cpumask_t idlers;
    cpumask_t one_idlers;
    cpumask_t two_idlers;

    idlers = csched_priv.idlers;
    cpu_clear(one, idlers);
    cpu_clear(two, idlers);

    if ( cpu_isset(one, cpu_core_map[two]) )
    {
        cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
        cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
    }
    else
    {
        cpus_and(one_idlers, idlers, cpu_core_map[one]);
        cpus_and(two_idlers, idlers, cpu_core_map[two]);
    }

    return cpus_weight(one_idlers) - cpus_weight(two_idlers);
}
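
/*
 * Worked example (hypothetical topology, not taken from this file): on a
 * 2-socket, dual-core, non-HT box with sockets {0,1} and {2,3}, where
 * CPUs 1, 2 and 3 are idle and CPU 0 is busy, csched_idler_compare(2, 1)
 * compares the remaining idlers in each candidate's socket: {3} for CPU 2
 * versus the empty set for CPU 1. It returns a positive value, so CPU 2
 * is the preferred idler, spreading new work onto the fully idle socket
 * instead of packing it next to busy CPU 0.
 */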

static inline int
__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
    {
        CSCHED_STAT_CRANK(steal_peer_running);
        return 0;
    }

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    return 1;
}

static inline int
__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
    {
        CSCHED_STAT_CRANK(steal_peer_migrating);
        return 0;
    }

    if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
    {
        CSCHED_STAT_CRANK(steal_peer_best_idler);
        return 0;
    }

    return 1;
}

static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    /* Update credits */
    atomic_sub(credit_dec, &svc->credit);

    /* Put this VCPU and domain back on the active list if it was idling */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( list_empty(&svc->active_vcpu_elem) )
        {
            CSCHED_STAT_CRANK(acct_vcpu_active);
            svc->stats.state_active++;

            sdom->active_vcpu_count++;
            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
            if ( list_empty(&sdom->active_sdom_elem) )
            {
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
                csched_priv.weight += sdom->weight;
            }
        }

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;
}

static inline void
__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_STAT_CRANK(acct_vcpu_idle);
    svc->stats.state_idle++;

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    memset(&svc->stats, 0, sizeof(svc->stats));
    vc->sched_priv = svc;

    CSCHED_VCPU_CHECK(vc);

    /* Attach fair-share VCPUs to the accounting list */
    if ( likely(sdom != NULL) )
        csched_vcpu_acct(svc, 0);

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);

    return 0;
}

static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_idle_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awaking VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER )
        svc->pri = CSCHED_PRI_TS_BOOST;

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    struct csched_dom * const sdom = CSCHED_DOM(dom);

    CSCHED_STAT_CRANK(dom_destroy);

    xfree(sdom);
}

static int
csched_cpu_pick(struct vcpu *vc)
{
    cpumask_t cpus;
    int cpu, nxt;

    CSCHED_STAT_CRANK(cpu_pick);

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    ASSERT( !cpus_empty(cpus) );
    cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);

    /*
     * Try to find an idle processor within the above constraints.
     */
    cpus_and(cpus, cpus, csched_priv.idlers);
    if ( !cpus_empty(cpus) )
    {
        cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
        cpu_clear(cpu, cpus);

        /*
         * In multi-core and multi-threaded CPUs, not all idle execution
         * vehicles are equal!
         *
         * We give preference to the idle execution vehicle with the most
         * idling neighbours in its grouping. This distributes work across
         * distinct cores first and guarantees we don't do something stupid
         * like run two VCPUs on co-hyperthreads while there are idle cores
         * or sockets.
         */
        while ( !cpus_empty(cpus) )
        {
            nxt = first_cpu(cpus);

            if ( csched_idler_compare(cpu, nxt) < 0 )
            {
                cpu = nxt;
                cpu_clear(nxt, cpus);
            }
            else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
            {
                cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
            }
            else
            {
                cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
            }

            ASSERT( !cpu_isset(nxt, cpus) );
        }
    }

    return cpu;
}
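
/*
 * Illustrative walk-through of the elimination loop above (hypothetical
 * 2-socket, dual-core, non-HT box, sockets {0,1} and {2,3}): suppose the
 * VCPU last ran on CPU 0, CPU 1 is busy and CPUs 0, 2 and 3 are idle.
 * Starting with cpu = 0 and cpus = {2,3}, nxt = 2 wins the comparison
 * because its socket-mate 3 is idle while 0's socket-mate 1 is not, so
 * cpu becomes 2; nxt = 3 then ties with 2 and its sibling group is pruned
 * from the candidate set, leaving CPU 2 as the pick and the work spread
 * across sockets.
 */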

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
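
/*
 * Illustrative: a runq of [ U1, O1, O2, U2, O3, U3 ] before the pass
 * above (U = UNDER or better, O = OVER) comes out as
 * [ U1, U2, U3, O1, O2, O3 ]: each UNDER found behind an OVER is spliced
 * to just after last_under in a single O(n) walk, and the relative order
 * within each priority class is preserved.
 */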

static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;


    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;


        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                if ( sdom->cap != 0U && credit < -credit_cap )
                    svc->pri = CSCHED_PRI_TS_PARKED;
                else
                    svc->pri = CSCHED_PRI_TS_OVER;

                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_idle_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            svc->stats.credit_last = credit;
            svc->stats.credit_incr = credit_fair;
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}
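
/*
 * Worked example (hypothetical numbers): one CPU (credit_total = 300) and
 * two active single-VCPU domains, A with weight 512 and B with weight 256
 * (weight_total = 768, credit_balance previously 0). A's credit_fair is
 * (300*512 + 767)/768 = 200 and B's is (300*256 + 767)/768 = 100, both
 * below their credit_peak of 300, so per 30ms accounting period A may
 * earn 200 credits and B 100, i.e. a 2:1 CPU split matching the weights.
 * A cap of 50 on either domain would clamp its peak to
 * (50*300 + 99)/100 = 150 credits.
 */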

static void
csched_tick(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
    struct csched_dom * const sdom = svc->sdom;

    /*
     * Accounting for running VCPU
     *
     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
     */
    if ( likely(sdom != NULL) )
    {
        csched_vcpu_acct(svc, CSCHED_CREDITS_PER_TICK);
    }

    /*
     * Accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (per_cpu(schedule_data, cpu).tick % CSCHED_TICKS_PER_ACCT) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);
}

static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
    struct list_head *iter;
    struct csched_vcpu *speer;
    struct vcpu *vc;

    list_for_each( iter, &spc->runq )
    {
        speer = __runq_elem(iter);

        /*
         * If next available VCPU here is not of higher priority than ours,
         * this PCPU is useless to us.
         */
        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
        {
            CSCHED_STAT_CRANK(steal_peer_idle);
            break;
        }

        /* Is this VCPU runnable on our PCPU? */
        vc = speer->vcpu;
        BUG_ON( is_idle_vcpu(vc) );

        if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
        {
            /* We got a candidate. Grab it! */
            __runq_remove(speer);
            vc->processor = cpu;

            return speer;
        }
    }

    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    struct csched_pcpu *spc;
    struct vcpu *peer_vcpu;
    cpumask_t workers;
    cpumask_t loners;
    int peer_cpu;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system
     */
    cpus_clear(loners);
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);

    peer_cpu = cpu;
    BUG_ON( peer_cpu != snext->vcpu->processor );

    while ( !cpus_empty(workers) )
    {
        /* For each CPU of interest, starting with our neighbour... */
        peer_cpu = next_cpu(peer_cpu, workers);
        if ( peer_cpu == NR_CPUS )
            peer_cpu = first_cpu(workers);

        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
        spc = CSCHED_PCPU(peer_cpu);

        if ( unlikely(spc == NULL) )
        {
            CSCHED_STAT_CRANK(steal_peer_down);
        }
        else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
        {
            /*
             * Don't steal from an idle CPU's runq because it's about to
             * pick up that work itself.
             */
            CSCHED_STAT_CRANK(steal_peer_idle);
        }
        else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
        {
            if ( snext->pri == CSCHED_PRI_IDLE &&
                 __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
            {
                CSCHED_STAT_CRANK(steal_loner_candidate);
                cpu_set(peer_cpu, loners);
            }
        }
        else
        {
            /* Try to steal work from a remote CPU's runq. */
            speer = csched_runq_steal(spc, cpu, snext->pri);
            if ( speer != NULL )
            {
                spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
                CSCHED_STAT_CRANK(vcpu_migrate);
                speer->stats.migrate++;
                return speer;
            }
        }

        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
    }

    /*
     * If we failed to find any remotely queued VCPUs to move here,
     * see if it would be more efficient to move any of the running
     * remote VCPUs over here.
     */
    while ( !cpus_empty(loners) )
    {
        /* For each CPU of interest, starting with our neighbour... */
        peer_cpu = next_cpu(peer_cpu, loners);
        if ( peer_cpu == NR_CPUS )
            peer_cpu = first_cpu(loners);

        cpu_clear(peer_cpu, loners);

        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
        spc = CSCHED_PCPU(peer_cpu);

        /* Signal the first candidate only. */
        if ( !is_idle_vcpu(peer_vcpu) &&
             is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
             __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
        {
            set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);

            CSCHED_STAT_CRANK(steal_loner_signal);
            cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
            break;
        }

        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
    }

    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}
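
/*
 * Illustrative scenario (hypothetical): CPU 3 goes idle while CPU 0 has an
 * extra VCPU queued on its runq and CPU 2 runs a lone VCPU. The first pass
 * above steals the queued VCPU from CPU 0's runq directly. Only if no
 * queued work is found anywhere does the second pass signal a single
 * running "loner" VCPU (via _VCPUF_migrating and a SCHEDULE_SOFTIRQ) to be
 * pulled over, and it stops after the first candidate, matching the "one
 * at a time" behaviour described in this changeset's commit message.
 */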

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (i.e. top of the local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);

    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->pri,
            svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i (%d+%u) {a/i=%u/%u m=%u w=%u}",
            atomic_read(&svc->credit),
            svc->stats.credit_last,
            svc->stats.credit_incr,
            svc->stats.state_active,
            svc->stats.state_idle,
            svc->stats.migrate,
            sdom->weight);
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    printk(" tick=%lu, sort=%d, sibling=0x%lx, core=0x%lx\n",
            per_cpu(schedule_data, cpu).tick,
            spc->runq_sort_last,
            cpu_sibling_map[cpu].bits[0],
            cpu_core_map[cpu].bits[0]);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;

    printk("info:\n"
           "\tncpus = %u\n"
           "\tmaster = %u\n"
           "\tcredit = %u\n"
           "\tcredit balance = %d\n"
           "\tweight = %u\n"
           "\trunq_sort = %u\n"
           "\tdefault-weight = %d\n"
           "\tmsecs per tick = %dms\n"
           "\tcredits per tick = %d\n"
           "\tticks per tslice = %d\n"
           "\tticks per acct = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_TICK,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT);

    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}


struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .tick           = csched_tick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};