xen/common/sched_credit.c @ 11128:f2f584093379
/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 * File: common/csched_credit.c
 * Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS

/*
 * Basic constants
 */
#define CSCHED_TICK             10      /* milliseconds */
#define CSCHED_TSLICE           30      /* milliseconds */
#define CSCHED_ACCT_NTICKS      3
#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
#define CSCHED_DEFAULT_WEIGHT   256
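/*
 * Note: with the values above, the accounting period works out to
 * CSCHED_ACCT_NTICKS * CSCHED_TICK = 3 * 10ms = 30ms, i.e. exactly one
 * CSCHED_TSLICE elapses per accounting period.
 */
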
/*
 * Priorities
 */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */
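/*
 * A higher numeric value means a more urgent VCPU: __runq_insert() below
 * keeps each runq sorted by descending pri, so UNDER VCPUs run before OVER
 * ones, which in turn run before the idle VCPU and any PARKED (capped,
 * out-of-credit) VCPUs.
 */
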
/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))

/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(dom_destroy)                     \
    _MACRO(schedule)                        \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(acct_vcpu_credit_min)

#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    _MACRO(vcpu_migrate)                                \
    _MACRO(load_balance_idle)                           \
    _MACRO(load_balance_over)                           \
    _MACRO(load_balance_other)                          \
    _MACRO(steal_trylock_failed)                        \
    _MACRO(steal_peer_down)                             \
    _MACRO(steal_peer_idle)                             \
    _MACRO(steal_peer_running)                          \
    _MACRO(steal_peer_pinned)                           \
    _MACRO(tickle_idlers_none)                          \
    _MACRO(tickle_idlers_some)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                   \
    struct                                      \
    {                                           \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    } stats

#define CSCHED_STATS_PRINTK()                   \
    do                                          \
    {                                           \
        printk("stats:\n");                     \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()    do {} while ( 0 )
#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )

#endif /* CSCHED_STATS */

/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    int credit_last;
    uint32_t credit_incr;
    uint32_t state_active;
    uint32_t state_idle;
    int16_t pri;
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE();
};

/*
 * Global variables
 */
static struct csched_private csched_priv;

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_ACCT_PERIOD;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(per_cpu(schedule_data, cpu).curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
    {
        CSCHED_STAT_CRANK(steal_peer_running);
        return 0;
    }

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    return 1;
}

static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    /* Update credits */
    atomic_sub(credit_dec, &svc->credit);

    /* Put this VCPU and domain back on the active list if it was idling */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( list_empty(&svc->active_vcpu_elem) )
        {
            CSCHED_STAT_CRANK(acct_vcpu_active);
            svc->state_active++;

            sdom->active_vcpu_count++;
            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
            if ( list_empty(&sdom->active_sdom_elem) )
            {
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
                csched_priv.weight += sdom->weight;
            }
        }

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }
}

static inline void
__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_STAT_CRANK(acct_vcpu_idle);
    svc->state_idle++;

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }

    atomic_set(&svc->credit, 0);
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom;
    struct csched_vcpu *svc;
    int16_t pri;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate, if appropriate, per-domain info */
    if ( is_idle_vcpu(vc) )
    {
        sdom = NULL;
        pri = CSCHED_PRI_IDLE;
    }
    else if ( CSCHED_DOM(dom) )
    {
        sdom = CSCHED_DOM(dom);
        pri = CSCHED_PRI_TS_UNDER;
    }
    else
    {
        sdom = xmalloc(struct csched_dom);
        if ( !sdom )
            return -1;

        /* Initialize credit and weight */
        INIT_LIST_HEAD(&sdom->active_vcpu);
        sdom->active_vcpu_count = 0;
        INIT_LIST_HEAD(&sdom->active_sdom_elem);
        sdom->dom = dom;
        sdom->weight = CSCHED_DEFAULT_WEIGHT;
        sdom->cap = 0U;
        dom->sched_priv = sdom;
        pri = CSCHED_PRI_TS_UNDER;
    }

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( !svc )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->credit_last = 0;
    svc->credit_incr = 0U;
    svc->state_active = 0U;
    svc->state_idle = 0U;
    svc->pri = pri;
    vc->sched_priv = svc;

    CSCHED_VCPU_CHECK(vc);

    /* Attach fair-share VCPUs to the accounting list */
    if ( likely(sdom != NULL) )
        csched_vcpu_acct(svc, 0);

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);

    return 0;
}

static void
csched_vcpu_free(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_idle_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static int
csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
{
    unsigned long flags;
    int lcpu;

    if ( vc == current )
    {
        /* No locking needed but also can't move on the spot... */
        if ( !cpu_isset(vc->processor, *affinity) )
            return -EBUSY;

        vc->cpu_affinity = *affinity;
    }
    else
    {
        /* Pause, modify, and unpause. */
        vcpu_pause(vc);

        vc->cpu_affinity = *affinity;
        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
        {
            /*
             * We must grab the scheduler lock for the CPU currently owning
             * this VCPU before changing its ownership.
             */
            vcpu_schedule_lock_irqsave(vc, flags);
            lcpu = vc->processor;

            vc->processor = first_cpu(vc->cpu_affinity);

            spin_unlock_irqrestore(&per_cpu(schedule_data, lcpu).schedule_lock,
                                   flags);
        }

        vcpu_unpause(vc);
    }

    return 0;
}

static int
csched_dom_cntl(
    struct domain *d,
    struct sched_adjdom_cmd *cmd)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( cmd->direction == SCHED_INFO_GET )
    {
        cmd->u.credit.weight = sdom->weight;
        cmd->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT( cmd->direction == SCHED_INFO_PUT );

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( cmd->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += cmd->u.credit.weight;
            }
            sdom->weight = cmd->u.credit.weight;
        }

        if ( cmd->u.credit.cap != (uint16_t)~0U )
            sdom->cap = cmd->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    struct csched_dom * const sdom = CSCHED_DOM(dom);
    int i;

    CSCHED_STAT_CRANK(dom_destroy);

    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
    {
        if ( dom->vcpu[i] )
            csched_vcpu_free(dom->vcpu[i]);
    }

    xfree(sdom);
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move-up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri == CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}

static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
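        /*
         * For illustration: with credit_total = 90 and two active domains
         * of weights 256 and 512, the weighted shares computed below round
         * up to (90*256 + 767)/768 = 30 and (90*512 + 767)/768 = 60 credits,
         * i.e. a 1:2 split, subject to the credit_peak and cap limits.
         */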
        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }
        if ( sdom->cap != 0U )
        {
            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;
        }
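        /*
         * Note: the cap above is interpreted as a percentage of a single
         * physical CPU. For instance, cap = 50 yields a credit_cap of
         * (50 * 30 + 99) / 100 = 15 credits per 30ms accounting period,
         * i.e. at most half of one CPU's time.
         */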
        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                if ( sdom->cap == 0U )
                    svc->pri = CSCHED_PRI_TS_OVER;
                else
                    svc->pri = CSCHED_PRI_TS_PARKED;

                if ( credit < -CSCHED_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                if ( credit > CSCHED_TSLICE )
                    __csched_vcpu_acct_idle_locked(svc);
            }

            svc->credit_last = credit;
            svc->credit_incr = credit_fair;
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}

static void
csched_tick(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
    struct csched_dom * const sdom = svc->sdom;

    /*
     * Accounting for running VCPU
     *
     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
     */
    if ( likely(sdom != NULL) )
    {
        csched_vcpu_acct(svc, CSCHED_TICK);
    }

    /*
     * Accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (per_cpu(schedule_data, cpu).tick % CSCHED_ACCT_NTICKS) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);
}

static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
    struct list_head *iter;
    struct csched_vcpu *speer;
    struct vcpu *vc;

    list_for_each( iter, &spc->runq )
    {
        speer = __runq_elem(iter);

        /*
         * If next available VCPU here is not of higher priority than ours,
         * this PCPU is useless to us.
         */
        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
        {
            CSCHED_STAT_CRANK(steal_peer_idle);
            break;
        }

        /* Is this VCPU runnable on our PCPU? */
        vc = speer->vcpu;
        BUG_ON( is_idle_vcpu(vc) );

        if ( __csched_vcpu_is_stealable(cpu, vc) )
        {
            /* We got a candidate. Grab it! */
            __runq_remove(speer);
            vc->processor = cpu;

            return speer;
        }
    }

    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_pcpu *spc;
    struct csched_vcpu *speer;
    int peer_cpu;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    peer_cpu = cpu;
    BUG_ON( peer_cpu != snext->vcpu->processor );

    while ( 1 )
    {
        /* For each PCPU in the system starting with our neighbour... */
        peer_cpu = (peer_cpu + 1) % csched_priv.ncpus;
        if ( peer_cpu == cpu )
            break;

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            spc = CSCHED_PCPU(peer_cpu);
            if ( unlikely(spc == NULL) )
            {
                CSCHED_STAT_CRANK(steal_peer_down);
                speer = NULL;
            }
            else
            {
                speer = csched_runq_steal(spc, cpu, snext->pri);
            }

            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);

            /* Got one! */
            if ( speer )
            {
                CSCHED_STAT_CRANK(vcpu_migrate);
                return speer;
            }
        }
        else
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
        }
    }

    /* Failed to find more important work */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (i.e. top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);

    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->pri,
            svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
            atomic_read(&svc->credit),
            svc->credit_last,
            svc->credit_incr,
            svc->state_active,
            svc->state_idle,
            sdom->weight);
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    printk(" tick=%lu, sort=%d\n",
            per_cpu(schedule_data, cpu).tick,
            spc->runq_sort_last);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;

    printk("info:\n"
           "\tncpus              = %u\n"
           "\tmaster             = %u\n"
           "\tcredit             = %u\n"
           "\tcredit balance     = %d\n"
           "\tweight             = %u\n"
           "\trunq_sort          = %u\n"
           "\ttick               = %dms\n"
           "\ttslice             = %dms\n"
           "\taccounting period  = %dms\n"
           "\tdefault-weight     = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_TICK,
           CSCHED_TSLICE,
           CSCHED_ACCT_PERIOD,
           CSCHED_DEFAULT_WEIGHT);

    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}

struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = SCHED_CREDIT,

    .init_vcpu      = csched_vcpu_init,
    .destroy_domain = csched_dom_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .set_affinity   = csched_vcpu_set_affinity,

    .adjdom         = csched_dom_cntl,

    .tick           = csched_tick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};