ia64/xen-unstable

view xen/common/sched_credit.c @ 19313:cfacba42091c

Improve vcpu_migration_delay handling.

Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Wed Mar 11 10:12:14 2009 +0000 (2009-03-11)
parents   b249f3e979a5
children  e5bed83d5180
/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/sched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic per-vCPU counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#ifdef PERF_COUNTERS
#define CSCHED_STATS
#endif


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_TICK     100
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
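
/*
 * With the defaults above: a scheduler tick fires every 10ms, a timeslice
 * is 3 ticks (30ms), and accounting also runs every 3 ticks (30ms).  At
 * 100 credits per tick, both CSCHED_CREDITS_PER_TSLICE and
 * CSCHED_CREDITS_PER_ACCT therefore evaluate to 300 credits, i.e. one
 * full timeslice of CPU time per accounting period.
 */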


/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE        -64      /* idle */


/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#define CSCHED_STAT_CRANK(_X)               (perfc_incr(_X))

#ifdef CSCHED_STATS

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */


/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
    struct timer ticker;
    unsigned int tick;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
};


/*
 * Global variables
 */
static struct csched_private csched_priv;

static void csched_tick(void *_cpu);

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}
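
/*
 * Example: when a BOOST VCPU wakes on a CPU that is currently running an
 * UNDER VCPU, that CPU itself is tickled.  Because the CPU is not idle,
 * it now has at least two runnable VCPUs, so any idlers in the waking
 * VCPU's affinity mask are tickled as well and one of them can pull the
 * extra work over.
 */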

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

/*
 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
 * implicit overheads such as cache-warming. 1ms (1000) has been measured
 * as a good value.
 */
static unsigned int vcpu_migration_delay;
integer_param("vcpu_migration_delay", vcpu_migration_delay);
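
/*
 * This is a hypervisor boot parameter: for example, booting Xen with
 * "vcpu_migration_delay=1000" treats a VCPU as cache-hot for 1ms after it
 * last ran, while the default of 0 disables the check entirely.
 */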

static inline int
__csched_vcpu_is_cache_hot(struct vcpu *v)
{
    int hot = ((NOW() - v->last_run_time) <
               ((uint64_t)vcpu_migration_delay * 1000u));

    if ( hot )
        CSCHED_STAT_CRANK(vcpu_hot);

    return hot;
}

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail or hot on
     * the peer PCPU. Only pick up work that's allowed to run on our CPU.
     */
    return !vc->is_running &&
           !__csched_vcpu_is_cache_hot(vc) &&
           cpu_isset(dest_cpu, vc->cpu_affinity);
}

static int
csched_cpu_pick(struct vcpu *vc)
{
    cpumask_t cpus;
    cpumask_t idlers;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : cycle_cpu(vc->processor, cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    idlers = csched_priv.idlers;
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt;

        nxt = cycle_cpu(cpu, cpus);

        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
        {
            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
        }
        else
        {
            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
        }

        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
        {
            cpu = nxt;
            cpu_clear(cpu, cpus);
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}
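
/*
 * Example of the preference above: on a dual-core, hyperthreaded box with
 * one thread busy, an idle thread on the other (fully idle) core is chosen
 * over the idle thread that shares a core with the busy one, because it
 * has more idling neighbours in its grouping.
 */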

static inline void
__csched_vcpu_acct_start(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            csched_priv.weight += sdom->weight;
        }
    }

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

static inline void
__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}

static void
csched_vcpu_acct(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(svc);
    }
    else if ( csched_cpu_pick(current) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VPF_migrating, &current->pause_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}
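
/*
 * Note on the charging above: accounting is sample based.  Whichever VCPU
 * happens to be running when the 10ms tick fires is charged a full
 * CSCHED_CREDITS_PER_TICK (100) credits, so a VCPU that is always found
 * running burns 300 credits per 30ms accounting period, i.e.
 * CSCHED_CREDITS_PER_ACCT.
 */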

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    vc->sched_priv = svc;

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);
    return 0;
}

static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awakening VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}
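
/*
 * Example of the boost at work: a VCPU doing network I/O blocks and wakes
 * many times per timeslice but uses little CPU, so it keeps its BOOST
 * priority and preempts UNDER/OVER VCPUs as soon as it wakes.  If it
 * instead starts spinning, csched_vcpu_acct() catches it on the next tick
 * and demotes it back to UNDER.
 */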

static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    xfree(CSCHED_DOM(dom));
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move-up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
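
/*
 * Example: a runq ordered [U1, O1, U2, O2] is resorted to [U1, U2, O1, O2];
 * UNDERs are pulled up in front of OVERs while the relative order within
 * each priority class is preserved.
 */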

static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
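        /*
         * Worked example, assuming credit_balance is zero: on a 2-PCPU host
         * credit_total is 2 * CSCHED_CREDITS_PER_ACCT = 600.  With two
         * active domains of weight 256 and 512, each with a single active
         * VCPU, their fair shares are 600*256/768 = 200 and 600*512/768 =
         * 400 credits.  The second domain's share exceeds its peak of 300
         * (one VCPU for one full period), so the excess is handed back, via
         * the credit_total adjustment below, to whatever weight remains
         * unprocessed in the loop.
         */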
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED )
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}

static void
csched_tick(void *_cpu)
{
    unsigned int cpu = (unsigned long)_cpu;
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);

    spc->tick++;

    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(cpu);

    /*
     * Host-wide accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);

    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
}
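
/*
 * Timing recap: csched_tick() rearms itself every CSCHED_MSECS_PER_TICK
 * (10ms).  Every third tick the master CPU runs csched_acct(), so global
 * credit accounting and the subsequent runq re-sort happen at most once
 * every 30ms.
 */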

static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick that work up itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if (__csched_vcpu_is_migrateable(vc, cpu))
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );

    /* If this CPU is going offline we shouldn't steal work. */
    if ( unlikely(!cpu_online(cpu)) )
        goto out;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = cycle_cpu(peer_cpu, workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get a hold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
        if ( speer != NULL )
            return speer;
    }

 out:
    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i flags=%x cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->pri,
            svc->flags,
            svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
#ifdef CSCHED_STATS
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
                svc->stats.credit_last,
                svc->stats.credit_incr,
                svc->stats.state_active,
                svc->stats.state_idle,
                svc->stats.migrate_q,
                svc->stats.migrate_r);
#endif
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;
    char cpustr[100];

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
    printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
    printk("core=%s\n", cpustr);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;
    char idlers_buf[100];

    printk("info:\n"
           "\tncpus              = %u\n"
           "\tmaster             = %u\n"
           "\tcredit             = %u\n"
           "\tcredit balance     = %d\n"
           "\tweight             = %u\n"
           "\trunq_sort          = %u\n"
           "\tdefault-weight     = %d\n"
           "\tmsecs per tick     = %dms\n"
           "\tcredits per tick   = %d\n"
           "\tticks per tslice   = %d\n"
           "\tticks per acct     = %d\n"
           "\tmigration delay    = %uus\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_TICK,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT,
           vcpu_migration_delay);

    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
    printk("idlers: %s\n", idlers_buf);

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
}

/* Tickers cannot be kicked until SMP subsystem is alive. */
static __init int csched_start_tickers(void)
{
    struct csched_pcpu *spc;
    unsigned int cpu;

    /* Is the credit scheduler initialised? */
    if ( csched_priv.ncpus == 0 )
        return 0;

    for_each_online_cpu ( cpu )
    {
        spc = CSCHED_PCPU(cpu);
        set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
    }

    return 0;
}
__initcall(csched_start_tickers);

struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};
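
/*
 * The .opt_name string is what selects this scheduler at boot: passing
 * "sched=credit" on the Xen command line makes the generic scheduler code
 * in common/schedule.c pick sched_credit_def (and credit is the default
 * scheduler in this generation of Xen).
 */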