ia64/xen-unstable

view xen/common/sched_credit.c @ 10490:7172935d10d4

Remove some spurious BUG_ON()'s from the credit scheduler.

Signed-off-by: Steven Hand <steven@xensource.com>
author shand@kneesaa.uk.xensource.com
date Tue Jun 20 11:02:49 2006 +0100 (2006-06-20)
parents 6993a0f91efc
children 64f9f308e109

/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS

/*
 * Basic constants
 */
#define CSCHED_TICK             10      /* milliseconds */
#define CSCHED_TSLICE           30      /* milliseconds */
#define CSCHED_ACCT_NTICKS      3
#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
#define CSCHED_DEFAULT_WEIGHT   256
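
/*
 * With these defaults the accounting period works out to
 * 3 ticks * 10ms = 30ms, i.e. one CSCHED_TSLICE: every third scheduler
 * tick the master CPU runs csched_acct() and hands out fresh credits,
 * while each tick debits CSCHED_TICK credits from whichever VCPU it
 * catches running (see csched_tick() below).
 */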

/*
 * Priorities
 */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */
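
/*
 * A larger value means a more urgent VCPU: the runq is kept sorted so
 * that UNDER (-1) entries precede OVER (-2) ones, and both precede
 * IDLE/PARKED. __runq_insert() and __runq_tickle() below compare these
 * values directly ('>' means strictly higher priority).
 */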

/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     ((struct csched_pcpu *)schedule_data[_c].sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))

/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(dom_destroy)                     \
    _MACRO(schedule)                        \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(acct_vcpu_credit_min)

#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    _MACRO(vcpu_migrate)                                \
    _MACRO(load_balance_idle)                           \
    _MACRO(load_balance_over)                           \
    _MACRO(load_balance_other)                          \
    _MACRO(steal_trylock_failed)                        \
    _MACRO(steal_peer_down)                             \
    _MACRO(steal_peer_idle)                             \
    _MACRO(steal_peer_running)                          \
    _MACRO(steal_peer_pinned)                           \
    _MACRO(tickle_idlers_none)                          \
    _MACRO(tickle_idlers_some)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                   \
    struct                                      \
    {                                           \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    } stats

#define CSCHED_STATS_PRINTK()                   \
    do                                          \
    {                                           \
        printk("stats:\n");                     \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()    do {} while ( 0 )
#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )

#endif /* CSCHED_STATS */
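
/*
 * Rough sketch of the X-macro expansion above: with CSCHED_STATS defined,
 *
 *     CSCHED_STATS_DEFINE()
 *
 * expands to something like
 *
 *     struct {
 *         uint32_t vcpu_init;
 *         uint32_t vcpu_sleep;
 *         ...
 *         uint32_t tickle_idlers_some;
 *     } stats
 *
 * and CSCHED_STATS_PRINTK() emits one "name = value" line per counter.
 */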

/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    int credit_last;
    uint32_t credit_incr;
    uint32_t state_active;
    uint32_t state_idle;
    int16_t pri;
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};
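
/*
 * weight is a relative share (CSCHED_DEFAULT_WEIGHT = 256 unless changed
 * via csched_dom_cntl()); cap limits the domain to roughly that percentage
 * of one physical CPU per accounting period (0 means uncapped). Both are
 * consumed by csched_acct() below.
 */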

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE();
};

/*
 * Global variables
 */
static struct csched_private csched_priv;

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}
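
/*
 * Note: the walk stops at the first entry with strictly lower priority and
 * inserts in front of it (list_add_tail on that element), so the runq stays
 * sorted by priority with FIFO ordering among equal-priority VCPUs.
 */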

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur = CSCHED_VCPU(schedule_data[cpu].curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static void
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_ACCT_PERIOD;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    BUG_ON( spc == NULL );
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    schedule_data[cpu].sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(schedule_data[cpu].curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
    {
        CSCHED_STAT_CRANK(steal_peer_running);
        return 0;
    }

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    return 1;
}

static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    /* Update credits */
    atomic_sub(credit_dec, &svc->credit);

    /* Put this VCPU and domain back on the active list if it was idling */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( list_empty(&svc->active_vcpu_elem) )
        {
            CSCHED_STAT_CRANK(acct_vcpu_active);
            svc->state_active++;

            sdom->active_vcpu_count++;
            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
            if ( list_empty(&sdom->active_sdom_elem) )
            {
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
                csched_priv.weight += sdom->weight;
            }
        }

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }
}
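
/*
 * The double list_empty() test above is a cheap double-checked activation:
 * the unlocked check avoids taking csched_priv.lock on every tick for VCPUs
 * already on the active list, and the check is repeated under the lock
 * before actually linking the VCPU and its domain in.
 */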

static inline void
__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_STAT_CRANK(acct_vcpu_idle);
    svc->state_idle++;

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }

    atomic_set(&svc->credit, 0);
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom;
    struct csched_vcpu *svc;
    int16_t pri;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate, if appropriate, per-domain info */
    if ( is_idle_vcpu(vc) )
    {
        sdom = NULL;
        pri = CSCHED_PRI_IDLE;
    }
    else if ( CSCHED_DOM(dom) )
    {
        sdom = CSCHED_DOM(dom);
        pri = CSCHED_PRI_TS_UNDER;
    }
    else
    {
        sdom = xmalloc(struct csched_dom);
        if ( !sdom )
            return -1;

        /* Initialize credit and weight */
        INIT_LIST_HEAD(&sdom->active_vcpu);
        sdom->active_vcpu_count = 0;
        INIT_LIST_HEAD(&sdom->active_sdom_elem);
        sdom->dom = dom;
        sdom->weight = CSCHED_DEFAULT_WEIGHT;
        sdom->cap = 0U;
        dom->sched_priv = sdom;
        pri = CSCHED_PRI_TS_UNDER;
    }

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( !svc )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->credit_last = 0;
    svc->credit_incr = 0U;
    svc->state_active = 0U;
    svc->state_idle = 0U;
    svc->pri = pri;
    vc->sched_priv = svc;

    CSCHED_VCPU_CHECK(vc);

    /* Attach fair-share VCPUs to the accounting list */
    if ( likely(sdom != NULL) )
        csched_vcpu_acct(svc, 0);

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
        csched_pcpu_init(vc->processor);

    CSCHED_VCPU_CHECK(vc);

    return 0;
}

static void
csched_vcpu_free(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_idle_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( schedule_data[vc->processor].curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(schedule_data[cpu].curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static int
csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
{
    unsigned long flags;
    int lcpu;

    if ( vc == current )
    {
        /* No locking needed but also can't move on the spot... */
        if ( !cpu_isset(vc->processor, *affinity) )
            return -EBUSY;

        vc->cpu_affinity = *affinity;
    }
    else
    {
        /* Pause, modify, and unpause. */
        vcpu_pause(vc);

        vc->cpu_affinity = *affinity;
        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
        {
            /*
             * We must grab the scheduler lock for the CPU currently owning
             * this VCPU before changing its ownership.
             */
            vcpu_schedule_lock_irqsave(vc, flags);
            lcpu = vc->processor;

            vc->processor = first_cpu(vc->cpu_affinity);

            spin_unlock_irqrestore(&schedule_data[lcpu].schedule_lock, flags);
        }

        vcpu_unpause(vc);
    }

    return 0;
}

static int
csched_dom_cntl(
    struct domain *d,
    struct sched_adjdom_cmd *cmd)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( cmd->direction == SCHED_INFO_GET )
    {
        cmd->u.credit.weight = sdom->weight;
        cmd->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT( cmd->direction == SCHED_INFO_PUT );

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( cmd->u.credit.weight != 0 )
        {
            csched_priv.weight -= sdom->weight;
            sdom->weight = cmd->u.credit.weight;
            csched_priv.weight += sdom->weight;
        }

        if ( cmd->u.credit.cap != (uint16_t)~0U )
            sdom->cap = cmd->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    struct csched_dom * const sdom = CSCHED_DOM(dom);
    int i;

    CSCHED_STAT_CRANK(dom_destroy);

    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
    {
        if ( dom->vcpu[i] )
            csched_vcpu_free(dom->vcpu[i]);
    }

    xfree(sdom);
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move-up operation O(1).
 */
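
/*
 * For example, a runq of [U1 O1 U2 O2 U3] is rewritten in one pass to
 * [U1 U2 U3 O1 O2]: each UNDER found behind an OVER is unlinked and
 * re-linked immediately after the last UNDER seen so far.
 */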

static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&schedule_data[cpu].schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri == CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&schedule_data[cpu].schedule_lock, flags);
}

static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }
        if ( sdom->cap != 0U )
        {
            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                if ( sdom->cap == 0U )
                    svc->pri = CSCHED_PRI_TS_OVER;
                else
                    svc->pri = CSCHED_PRI_TS_PARKED;

                if ( credit < -CSCHED_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                if ( credit > CSCHED_TSLICE )
                    __csched_vcpu_acct_idle_locked(svc);
            }

            svc->credit_last = credit;
            svc->credit_incr = credit_fair;
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}
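
/*
 * Worked example of the arithmetic above (values derived from the code):
 * on a 2-CPU host csched_priv.credit is 2 * CSCHED_ACCT_PERIOD = 60, so
 * two active domains with weights 256 and 512 (weight_total = 768) get
 * credit_fair of ceil(60*256/768) = 20 and ceil(60*512/768) = 40,
 * each then divided among that domain's active VCPUs. A 1-VCPU domain
 * is peak-limited to 30 (one accounting period), and a cap of 50 would
 * clamp its peak to ceil(50*30/100) = 15.
 */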

static void
csched_tick(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
    struct csched_dom * const sdom = svc->sdom;

    /*
     * Accounting for running VCPU
     *
     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
     */
    if ( likely(sdom != NULL) )
    {
        csched_vcpu_acct(svc, CSCHED_TICK);
    }

    /*
     * Accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (schedule_data[cpu].tick % CSCHED_ACCT_NTICKS) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);
}

static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
    struct list_head *iter;
    struct csched_vcpu *speer;
    struct vcpu *vc;

    list_for_each( iter, &spc->runq )
    {
        speer = __runq_elem(iter);

        /*
         * If next available VCPU here is not of higher priority than ours,
         * this PCPU is useless to us.
         */
        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
        {
            CSCHED_STAT_CRANK(steal_peer_idle);
            break;
        }

        /* Is this VCPU runnable on our PCPU? */
        vc = speer->vcpu;
        BUG_ON( is_idle_vcpu(vc) );

        if ( __csched_vcpu_is_stealable(cpu, vc) )
        {
            /* We got a candidate. Grab it! */
            __runq_remove(speer);
            vc->processor = cpu;

            return speer;
        }
    }

    return NULL;
}
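
/*
 * The early break above is safe because the peer's runq is kept sorted by
 * priority: once we reach an entry whose priority is not strictly higher
 * than ours (or an idle VCPU), no later entry can be worth stealing.
 */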

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_pcpu *spc;
    struct csched_vcpu *speer;
    int peer_cpu;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    peer_cpu = cpu;
    BUG_ON( peer_cpu != snext->vcpu->processor );

    while ( 1 )
    {
        /* For each PCPU in the system starting with our neighbour... */
        peer_cpu = (peer_cpu + 1) % csched_priv.ncpus;
        if ( peer_cpu == cpu )
            break;

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( spin_trylock(&schedule_data[peer_cpu].schedule_lock) )
        {
            spc = CSCHED_PCPU(peer_cpu);
            if ( unlikely(spc == NULL) )
            {
                CSCHED_STAT_CRANK(steal_peer_down);
                speer = NULL;
            }
            else
            {
                speer = csched_runq_steal(spc, cpu, snext->pri);
            }

            spin_unlock(&schedule_data[peer_cpu].schedule_lock);

            /* Got one! */
            if ( speer )
            {
                CSCHED_STAT_CRANK(vcpu_migrate);
                return speer;
            }
        }
        else
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
        }
    }

    /* Failed to find more important work */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);

    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i cpu=%i",
           svc->vcpu->domain->domain_id,
           svc->vcpu->vcpu_id,
           svc->pri,
           svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
               atomic_read(&svc->credit),
               svc->credit_last,
               svc->credit_incr,
               svc->state_active,
               svc->state_idle,
               sdom->weight);
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    printk(" tick=%lu, sort=%d\n",
           schedule_data[cpu].tick,
           spc->runq_sort_last);

    /* current VCPU */
    svc = CSCHED_VCPU(schedule_data[cpu].curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;

    printk("info:\n"
           "\tncpus              = %u\n"
           "\tmaster             = %u\n"
           "\tcredit             = %u\n"
           "\tcredit balance     = %d\n"
           "\tweight             = %u\n"
           "\trunq_sort          = %u\n"
           "\ttick               = %dms\n"
           "\ttslice             = %dms\n"
           "\taccounting period  = %dms\n"
           "\tdefault-weight     = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_TICK,
           CSCHED_TSLICE,
           CSCHED_ACCT_PERIOD,
           CSCHED_DEFAULT_WEIGHT);

    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}

struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = SCHED_CREDIT,

    .init_vcpu      = csched_vcpu_init,
    .destroy_domain = csched_dom_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .set_affinity   = csched_vcpu_set_affinity,

    .adjdom         = csched_dom_cntl,

    .tick           = csched_tick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};
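
/*
 * The common scheduler code selects a scheduler by matching .opt_name
 * against the hypervisor's "sched=" boot parameter, so booting Xen with
 * sched=credit picks this scheduler.
 */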