xen/common/sched_credit.c @ changeset 13011:360eb996fa38 (ia64/xen-unstable)

[XEN] Improve scheduler cap mechanism
Somewhat unbastardize the scheduler cap mechanism. We now cleanly
pause and unpause running VCPUs of capped out domains instead of
using sub-idle priorities. This also improves the precision of
caps a bit.
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>
author Emmanuel Ackaouy <ack@xensource.com>
date Wed Dec 13 16:13:26 2006 +0000 (2006-12-13)
parents 05e1863cc2a3
children 215b799fa181

/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 * File: common/sched_credit.c
 * Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_TICK     100
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
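
/*
 * A quick sketch of the arithmetic implied by the defaults above: a tick
 * fires every 10ms and a timeslice is 3 ticks, i.e. 30ms. The running VCPU
 * is charged 100 credits per tick, so a full timeslice costs 300 credits,
 * and each accounting period (also 3 ticks) redistributes credits with a
 * per-VCPU peak of CSCHED_CREDITS_PER_ACCT = 300.
 */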


/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE        -64      /* idle */


/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                              \
    do                                                      \
    {                                                       \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));     \
    } while ( 0 );

/*
 * Try and keep often cranked stats on top so they'll fit on one
 * cache line.
 */
#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(schedule)                        \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(vcpu_park)                       \
    _MACRO(vcpu_unpark)                     \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(tickle_idlers_none)              \
    _MACRO(tickle_idlers_some)              \
    _MACRO(load_balance_idle)               \
    _MACRO(load_balance_over)               \
    _MACRO(load_balance_other)              \
    _MACRO(steal_trylock_failed)            \
    _MACRO(steal_peer_idle)                 \
    _MACRO(migrate_queued)                  \
    _MACRO(migrate_running)                 \
    _MACRO(dom_init)                        \
    _MACRO(dom_destroy)                     \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_destroy)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)         \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)      \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                   \
    struct                                      \
    {                                           \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    } stats;
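
/*
 * For illustration (an expansion sketch only, not an extra macro): the
 * CSCHED_STATS_EXPAND* helpers implement the usual X-macro pattern, so
 * CSCHED_STATS_DEFINE() expands to roughly
 *
 *     struct {
 *         uint32_t vcpu_check;      (debug builds only)
 *         uint32_t schedule;
 *         uint32_t acct_run;
 *         ...
 *         uint32_t vcpu_destroy;
 *     } stats;
 *
 * while the same list expanded with CSCHED_STAT_PRINTK prints one
 * "name = value" line per counter.
 */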

#define CSCHED_STATS_PRINTK()                   \
    do                                          \
    {                                           \
        printk("stats:\n");                     \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()                do {} while ( 0 )
#define CSCHED_STATS_DEFINE()
#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */


/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE()
};

/*
 * Global variables
 */
static struct csched_private csched_priv;
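

/*
 * __cycle_cpu() below returns the next CPU set in 'mask' after 'cpu',
 * wrapping back to the first set CPU when the end of the mask is reached.
 */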
static inline int
__cycle_cpu(int cpu, const cpumask_t *mask)
{
    int nxt = next_cpu(cpu, *mask);
    if (nxt == NR_CPUS)
        nxt = first_cpu(*mask);
    return nxt;
}

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}
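
/*
 * __runq_insert() keeps the runq sorted by descending priority. The scan
 * only breaks on a strictly lower-priority entry, so a newly inserted VCPU
 * queues behind existing VCPUs of equal priority (round-robin among peers).
 */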
static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(per_cpu(schedule_data, cpu).curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    return !test_bit(_VCPUF_running, &vc->vcpu_flags) &&
           cpu_isset(dest_cpu, vc->cpu_affinity);
}
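
/*
 * Pick a physical CPU for 'vc': start from the online CPUs in its affinity
 * mask (preferring its current processor), then, as explained in the
 * comments below, favour the idle execution vehicle whose siblings are
 * also idle.
 */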
static int
csched_cpu_pick(struct vcpu *vc)
{
    cpumask_t cpus;
    cpumask_t idlers;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : __cycle_cpu(vc->processor, &cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    idlers = csched_priv.idlers;
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt;

        nxt = __cycle_cpu(cpu, &cpus);

        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
        {
            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
        }
        else
        {
            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
        }

        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
        {
            cpu = nxt;
            cpu_clear(cpu, cpus);
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}

static inline void
__csched_vcpu_acct_start(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            csched_priv.weight += sdom->weight;
        }
    }

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

static inline void
__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}
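
/*
 * Per-tick accounting for the currently running VCPU: charge it one tick's
 * worth of credits, drop any wake-up boost it still holds, and make sure it
 * (and its domain) are on the active lists so csched_acct() will consider
 * them. A VCPU that has been active a while is also re-evaluated for
 * migration here.
 */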
static void
csched_vcpu_acct(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(svc);
    }
    else if ( csched_cpu_pick(current) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VCPUF_migrating, &current->vcpu_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    vc->sched_priv = svc;

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);
    return 0;
}

static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awakening VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }
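
        /*
         * Note (added for clarity): the cap is expressed in percent of a
         * single physical CPU (0 means no cap, 100 means one full CPU); a
         * putinfo value of (uint16_t)~0U leaves the cap unchanged.
         */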
        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    xfree(CSCHED_DOM(dom));
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move-up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
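
/*
 * System-wide credit accounting, run by the master CPU once per accounting
 * period: hand out each active domain's weighted fair share of credits,
 * clamp it by the domain's peak and cap, park or unpark capped VCPUs as
 * they overdraw or recover credits, and recompute every active VCPU's
 * UNDER/OVER priority. Finally, bump runq_sort so every CPU re-sorts its
 * runq.
 */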
static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }
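
        /*
         * Example with the default constants (illustration only): a domain
         * capped at 50% gets credit_cap = (50 * 300 + 99) / 100 = 150
         * credits per accounting period, i.e. half of one CPU. With two
         * active VCPUs, credit_peak is clamped to 150 and each VCPU may
         * overdraw by at most credit_cap = (150 + 1) / 2 = 75 credits
         * before being parked below.
         */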

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }
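
                /*
                 * A parked VCPU stays paused until its credits recover:
                 * the unpark branch below lifts the pause once this
                 * accounting path sees its credit go positive again.
                 */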

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}

static void
csched_tick(unsigned int cpu)
{
    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(cpu);

    /*
     * Host-wide accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (per_cpu(schedule_data, cpu).tick % CSCHED_TICKS_PER_ACCT) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);
}

static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick up work from it itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if (__csched_vcpu_is_migrateable(vc, cpu))
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = __cycle_cpu(peer_cpu, &workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
        if ( speer != NULL )
            return speer;
    }

    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i flags=%x cpu=%i",
           svc->vcpu->domain->domain_id,
           svc->vcpu->vcpu_id,
           svc->pri,
           svc->flags,
           svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
#ifdef CSCHED_STATS
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
               svc->stats.credit_last,
               svc->stats.credit_incr,
               svc->stats.state_active,
               svc->stats.state_idle,
               svc->stats.migrate_q,
               svc->stats.migrate_r);
#endif
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    printk(" tick=%lu, sort=%d, sibling=0x%lx, core=0x%lx\n",
           per_cpu(schedule_data, cpu).tick,
           spc->runq_sort_last,
           cpu_sibling_map[cpu].bits[0],
           cpu_core_map[cpu].bits[0]);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;

    printk("info:\n"
           "\tncpus = %u\n"
           "\tmaster = %u\n"
           "\tcredit = %u\n"
           "\tcredit balance = %d\n"
           "\tweight = %u\n"
           "\trunq_sort = %u\n"
           "\tdefault-weight = %d\n"
           "\tmsecs per tick = %dms\n"
           "\tcredits per tick = %d\n"
           "\tticks per tslice = %d\n"
           "\tticks per acct = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_TICK,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT);

    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}
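

/* Entry points exported to the generic scheduler framework. */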
struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .tick           = csched_tick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};