ia64/xen-unstable

view xen/common/sched_credit.c @ 19279:a44751edcb76

Fix cpu selection at the time of vCPU allocation

After cpu_[online/offline], the set bits in cpu_online_map may not be
contiguous. Use cycle_cpu() to pick the next one.

Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Mar 06 18:54:09 2009 +0000 (2009-03-06)
parents c0db74e41662
children 95e3cd67add2
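
For illustration only, not part of the changeset: a standalone C sketch of the idea behind the fix, with a plain bitmap standing in for cpu_online_map and a hypothetical helper next_online_after() playing the role of cycle_cpu(). Once a CPU has been offlined, the set bits can have holes, so a naive "current CPU + 1" pick may land on an offline CPU; a wrap-around search over the map does not.

    #include <assert.h>
    #include <stdio.h>

    #define NR_CPUS 8

    /* Toy stand-in for cpu_online_map: bit i set => CPU i is online.
     * next_online_after() is a wrap-around search for the next set bit
     * after 'cpu', analogous in spirit to Xen's cycle_cpu(). Returns -1
     * if no bit is set at all. */
    static int next_online_after(int cpu, unsigned int online_map)
    {
        for ( int i = 1; i <= NR_CPUS; i++ )
        {
            int candidate = (cpu + i) % NR_CPUS;
            if ( online_map & (1u << candidate) )
                return candidate;
        }
        return -1;
    }

    int main(void)
    {
        /* CPUs 0, 1, 3 and 6 online: CPU 2 was hot-unplugged, leaving a hole. */
        unsigned int online_map = 0x4b;

        /* A naive "cpu + 1" step from CPU 1 would pick the offline CPU 2. */
        assert( !(online_map & (1u << 2)) );

        /* The wrap-around search skips the hole and wraps past the last CPU. */
        assert( next_online_after(1, online_map) == 3 );
        assert( next_online_after(6, online_map) == 0 );

        printf("next after 1 -> %d, next after 6 -> %d\n",
               next_online_after(1, online_map),
               next_online_after(6, online_map));
        return 0;
    }
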
line source
1 /****************************************************************************
2 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
3 ****************************************************************************
4 *
5 * File: common/csched_credit.c
6 * Author: Emmanuel Ackaouy
7 *
8 * Description: Credit-based SMP CPU scheduler
9 */
11 #include <xen/config.h>
12 #include <xen/init.h>
13 #include <xen/lib.h>
14 #include <xen/sched.h>
15 #include <xen/domain.h>
16 #include <xen/delay.h>
17 #include <xen/event.h>
18 #include <xen/time.h>
19 #include <xen/perfc.h>
20 #include <xen/sched-if.h>
21 #include <xen/softirq.h>
22 #include <asm/atomic.h>
23 #include <xen/errno.h>
25 /*
26 * CSCHED_STATS
27 *
28 * Manage very basic counters and stats.
29 *
30 * Useful for debugging live systems. The stats are displayed
31 * with runq dumps ('r' on the Xen console).
32 */
33 #define CSCHED_STATS
36 /*
37 * Basic constants
38 */
39 #define CSCHED_DEFAULT_WEIGHT 256
40 #define CSCHED_TICKS_PER_TSLICE 3
41 #define CSCHED_TICKS_PER_ACCT 3
42 #define CSCHED_MSECS_PER_TICK 10
43 #define CSCHED_MSECS_PER_TSLICE \
44 (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
45 #define CSCHED_CREDITS_PER_TICK 100
46 #define CSCHED_CREDITS_PER_TSLICE \
47 (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
48 #define CSCHED_CREDITS_PER_ACCT \
49 (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
52 /*
53 * Priorities
54 */
55 #define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */
56 #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */
57 #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */
58 #define CSCHED_PRI_IDLE -64 /* idle */
61 /*
62 * Flags
63 */
64 #define CSCHED_FLAG_VCPU_PARKED 0x0001 /* VCPU over capped credits */
67 /*
68 * Useful macros
69 */
70 #define CSCHED_PCPU(_c) \
71 ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
72 #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
73 #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
74 #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
77 /*
78 * Stats
79 */
80 #ifdef CSCHED_STATS
82 #define CSCHED_STAT(_X) (csched_priv.stats._X)
83 #define CSCHED_STAT_DEFINE(_X) uint32_t _X;
84 #define CSCHED_STAT_PRINTK(_X) \
85 do \
86 { \
87 printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X)); \
88 } while ( 0 );
90 /*
91 * Try and keep often cranked stats on top so they'll fit on one
92 * cache line.
93 */
94 #define CSCHED_STATS_EXPAND_SCHED(_MACRO) \
95 _MACRO(schedule) \
96 _MACRO(acct_run) \
97 _MACRO(acct_no_work) \
98 _MACRO(acct_balance) \
99 _MACRO(acct_reorder) \
100 _MACRO(acct_min_credit) \
101 _MACRO(acct_vcpu_active) \
102 _MACRO(acct_vcpu_idle) \
103 _MACRO(vcpu_sleep) \
104 _MACRO(vcpu_wake_running) \
105 _MACRO(vcpu_wake_onrunq) \
106 _MACRO(vcpu_wake_runnable) \
107 _MACRO(vcpu_wake_not_runnable) \
108 _MACRO(vcpu_park) \
109 _MACRO(vcpu_unpark) \
110 _MACRO(tickle_local_idler) \
111 _MACRO(tickle_local_over) \
112 _MACRO(tickle_local_under) \
113 _MACRO(tickle_local_other) \
114 _MACRO(tickle_idlers_none) \
115 _MACRO(tickle_idlers_some) \
116 _MACRO(load_balance_idle) \
117 _MACRO(load_balance_over) \
118 _MACRO(load_balance_other) \
119 _MACRO(steal_trylock_failed) \
120 _MACRO(steal_peer_idle) \
121 _MACRO(migrate_queued) \
122 _MACRO(migrate_running) \
123 _MACRO(dom_init) \
124 _MACRO(dom_destroy) \
125 _MACRO(vcpu_init) \
126 _MACRO(vcpu_destroy)
128 #ifndef NDEBUG
129 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO) \
130 _MACRO(vcpu_check)
131 #else
132 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
133 #endif
135 #define CSCHED_STATS_EXPAND(_MACRO) \
136 CSCHED_STATS_EXPAND_CHECKS(_MACRO) \
137 CSCHED_STATS_EXPAND_SCHED(_MACRO)
139 #define CSCHED_STATS_RESET() \
140 do \
141 { \
142 memset(&csched_priv.stats, 0, sizeof(csched_priv.stats)); \
143 } while ( 0 )
145 #define CSCHED_STATS_DEFINE() \
146 struct \
147 { \
148 CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
149 } stats;
151 #define CSCHED_STATS_PRINTK() \
152 do \
153 { \
154 printk("stats:\n"); \
155 CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
156 } while ( 0 )
158 #define CSCHED_STAT_CRANK(_X) (CSCHED_STAT(_X)++)
160 #define CSCHED_VCPU_STATS_RESET(_V) \
161 do \
162 { \
163 memset(&(_V)->stats, 0, sizeof((_V)->stats)); \
164 } while ( 0 )
166 #define CSCHED_VCPU_STAT_CRANK(_V, _X) (((_V)->stats._X)++)
168 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y))
170 #else /* CSCHED_STATS */
172 #define CSCHED_STATS_RESET() do {} while ( 0 )
173 #define CSCHED_STATS_DEFINE()
174 #define CSCHED_STATS_PRINTK() do {} while ( 0 )
175 #define CSCHED_STAT_CRANK(_X) do {} while ( 0 )
176 #define CSCHED_VCPU_STATS_RESET(_V) do {} while ( 0 )
177 #define CSCHED_VCPU_STAT_CRANK(_V, _X) do {} while ( 0 )
178 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) do {} while ( 0 )
180 #endif /* CSCHED_STATS */
183 /*
184 * Physical CPU
185 */
186 struct csched_pcpu {
187 struct list_head runq;
188 uint32_t runq_sort_last;
189 struct timer ticker;
190 unsigned int tick;
191 };
193 /*
194 * Virtual CPU
195 */
196 struct csched_vcpu {
197 struct list_head runq_elem;
198 struct list_head active_vcpu_elem;
199 struct csched_dom *sdom;
200 struct vcpu *vcpu;
201 atomic_t credit;
202 uint16_t flags;
203 int16_t pri;
204 #ifdef CSCHED_STATS
205 struct {
206 int credit_last;
207 uint32_t credit_incr;
208 uint32_t state_active;
209 uint32_t state_idle;
210 uint32_t migrate_q;
211 uint32_t migrate_r;
212 } stats;
213 #endif
214 };
216 /*
217 * Domain
218 */
219 struct csched_dom {
220 struct list_head active_vcpu;
221 struct list_head active_sdom_elem;
222 struct domain *dom;
223 uint16_t active_vcpu_count;
224 uint16_t weight;
225 uint16_t cap;
226 };
228 /*
229 * System-wide private data
230 */
231 struct csched_private {
232 spinlock_t lock;
233 struct list_head active_sdom;
234 uint32_t ncpus;
235 unsigned int master;
236 cpumask_t idlers;
237 uint32_t weight;
238 uint32_t credit;
239 int credit_balance;
240 uint32_t runq_sort;
241 CSCHED_STATS_DEFINE()
242 };
245 /*
246 * Global variables
247 */
248 static struct csched_private csched_priv;
250 static void csched_tick(void *_cpu);
252 static inline int
253 __vcpu_on_runq(struct csched_vcpu *svc)
254 {
255 return !list_empty(&svc->runq_elem);
256 }
258 static inline struct csched_vcpu *
259 __runq_elem(struct list_head *elem)
260 {
261 return list_entry(elem, struct csched_vcpu, runq_elem);
262 }
264 static inline void
265 __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
266 {
267 const struct list_head * const runq = RUNQ(cpu);
268 struct list_head *iter;
270 BUG_ON( __vcpu_on_runq(svc) );
271 BUG_ON( cpu != svc->vcpu->processor );
273 list_for_each( iter, runq )
274 {
275 const struct csched_vcpu * const iter_svc = __runq_elem(iter);
276 if ( svc->pri > iter_svc->pri )
277 break;
278 }
280 list_add_tail(&svc->runq_elem, iter);
281 }
283 static inline void
284 __runq_remove(struct csched_vcpu *svc)
285 {
286 BUG_ON( !__vcpu_on_runq(svc) );
287 list_del_init(&svc->runq_elem);
288 }
290 static inline void
291 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
292 {
293 struct csched_vcpu * const cur =
294 CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
295 cpumask_t mask;
297 ASSERT(cur);
298 cpus_clear(mask);
300 /* If strictly higher priority than current VCPU, signal the CPU */
301 if ( new->pri > cur->pri )
302 {
303 if ( cur->pri == CSCHED_PRI_IDLE )
304 CSCHED_STAT_CRANK(tickle_local_idler);
305 else if ( cur->pri == CSCHED_PRI_TS_OVER )
306 CSCHED_STAT_CRANK(tickle_local_over);
307 else if ( cur->pri == CSCHED_PRI_TS_UNDER )
308 CSCHED_STAT_CRANK(tickle_local_under);
309 else
310 CSCHED_STAT_CRANK(tickle_local_other);
312 cpu_set(cpu, mask);
313 }
315 /*
316 * If this CPU has at least two runnable VCPUs, we tickle any idlers to
317 * let them know there is runnable work in the system...
318 */
319 if ( cur->pri > CSCHED_PRI_IDLE )
320 {
321 if ( cpus_empty(csched_priv.idlers) )
322 {
323 CSCHED_STAT_CRANK(tickle_idlers_none);
324 }
325 else
326 {
327 CSCHED_STAT_CRANK(tickle_idlers_some);
328 cpus_or(mask, mask, csched_priv.idlers);
329 cpus_and(mask, mask, new->vcpu->cpu_affinity);
330 }
331 }
333 /* Send scheduler interrupts to designated CPUs */
334 if ( !cpus_empty(mask) )
335 cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
336 }
338 static int
339 csched_pcpu_init(int cpu)
340 {
341 struct csched_pcpu *spc;
342 unsigned long flags;
344 /* Allocate per-PCPU info */
345 spc = xmalloc(struct csched_pcpu);
346 if ( spc == NULL )
347 return -1;
349 spin_lock_irqsave(&csched_priv.lock, flags);
351 /* Initialize/update system-wide config */
352 csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
353 if ( csched_priv.ncpus <= cpu )
354 csched_priv.ncpus = cpu + 1;
355 if ( csched_priv.master >= csched_priv.ncpus )
356 csched_priv.master = cpu;
358 init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
359 INIT_LIST_HEAD(&spc->runq);
360 spc->runq_sort_last = csched_priv.runq_sort;
361 per_cpu(schedule_data, cpu).sched_priv = spc;
363 /* Start off idling... */
364 BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
365 cpu_set(cpu, csched_priv.idlers);
367 spin_unlock_irqrestore(&csched_priv.lock, flags);
369 return 0;
370 }
372 #ifndef NDEBUG
373 static inline void
374 __csched_vcpu_check(struct vcpu *vc)
375 {
376 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
377 struct csched_dom * const sdom = svc->sdom;
379 BUG_ON( svc->vcpu != vc );
380 BUG_ON( sdom != CSCHED_DOM(vc->domain) );
381 if ( sdom )
382 {
383 BUG_ON( is_idle_vcpu(vc) );
384 BUG_ON( sdom->dom != vc->domain );
385 }
386 else
387 {
388 BUG_ON( !is_idle_vcpu(vc) );
389 }
391 CSCHED_STAT_CRANK(vcpu_check);
392 }
393 #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
394 #else
395 #define CSCHED_VCPU_CHECK(_vc)
396 #endif
398 static inline int
399 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
400 {
401 /*
402 * Don't pick up work that's in the peer's scheduling tail. Also only pick
403 * up work that's allowed to run on our CPU.
404 */
405 return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
406 }
408 static int
409 csched_cpu_pick(struct vcpu *vc)
410 {
411 cpumask_t cpus;
412 cpumask_t idlers;
413 int cpu;
415 /*
416 * Pick from online CPUs in VCPU's affinity mask, giving a
417 * preference to its current processor if it's in there.
418 */
419 cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
420 cpu = cpu_isset(vc->processor, cpus)
421 ? vc->processor
422 : cycle_cpu(vc->processor, cpus);
423 ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
425 /*
426 * Try to find an idle processor within the above constraints.
427 *
428 * In multi-core and multi-threaded CPUs, not all idle execution
429 * vehicles are equal!
430 *
431 * We give preference to the idle execution vehicle with the most
432 * idling neighbours in its grouping. This distributes work across
433 * distinct cores first and guarantees we don't do something stupid
434 * like run two VCPUs on co-hyperthreads while there are idle cores
435 * or sockets.
436 */
437 idlers = csched_priv.idlers;
438 cpu_set(cpu, idlers);
439 cpus_and(cpus, cpus, idlers);
440 cpu_clear(cpu, cpus);
442 while ( !cpus_empty(cpus) )
443 {
444 cpumask_t cpu_idlers;
445 cpumask_t nxt_idlers;
446 int nxt;
448 nxt = cycle_cpu(cpu, cpus);
450 if ( cpu_isset(cpu, cpu_core_map[nxt]) )
451 {
452 ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
453 cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
454 cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
455 }
456 else
457 {
458 ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
459 cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
460 cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
461 }
463 if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
464 {
465 cpu = nxt;
466 cpu_clear(cpu, cpus);
467 }
468 else
469 {
470 cpus_andnot(cpus, cpus, nxt_idlers);
471 }
472 }
474 return cpu;
475 }
477 static inline void
478 __csched_vcpu_acct_start(struct csched_vcpu *svc)
479 {
480 struct csched_dom * const sdom = svc->sdom;
481 unsigned long flags;
483 spin_lock_irqsave(&csched_priv.lock, flags);
485 if ( list_empty(&svc->active_vcpu_elem) )
486 {
487 CSCHED_VCPU_STAT_CRANK(svc, state_active);
488 CSCHED_STAT_CRANK(acct_vcpu_active);
490 sdom->active_vcpu_count++;
491 list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
492 if ( list_empty(&sdom->active_sdom_elem) )
493 {
494 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
495 csched_priv.weight += sdom->weight;
496 }
497 }
499 spin_unlock_irqrestore(&csched_priv.lock, flags);
500 }
502 static inline void
503 __csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
504 {
505 struct csched_dom * const sdom = svc->sdom;
507 BUG_ON( list_empty(&svc->active_vcpu_elem) );
509 CSCHED_VCPU_STAT_CRANK(svc, state_idle);
510 CSCHED_STAT_CRANK(acct_vcpu_idle);
512 sdom->active_vcpu_count--;
513 list_del_init(&svc->active_vcpu_elem);
514 if ( list_empty(&sdom->active_vcpu) )
515 {
516 BUG_ON( csched_priv.weight < sdom->weight );
517 list_del_init(&sdom->active_sdom_elem);
518 csched_priv.weight -= sdom->weight;
519 }
520 }
522 static void
523 csched_vcpu_acct(unsigned int cpu)
524 {
525 struct csched_vcpu * const svc = CSCHED_VCPU(current);
527 ASSERT( current->processor == cpu );
528 ASSERT( svc->sdom != NULL );
530 /*
531 * If this VCPU's priority was boosted when it last awoke, reset it.
532 * If the VCPU is found here, then it's consuming a non-negligible
533 * amount of CPU resources and should no longer be boosted.
534 */
535 if ( svc->pri == CSCHED_PRI_TS_BOOST )
536 svc->pri = CSCHED_PRI_TS_UNDER;
538 /*
539 * Update credits
540 */
541 atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
543 /*
544 * Put this VCPU and domain back on the active list if it was
545 * idling.
546 *
547 * If it's been active a while, check if we'd be better off
548 * migrating it to run elsewhere (see multi-core and multi-thread
549 * support in csched_cpu_pick()).
550 */
551 if ( list_empty(&svc->active_vcpu_elem) )
552 {
553 __csched_vcpu_acct_start(svc);
554 }
555 else if ( csched_cpu_pick(current) != cpu )
556 {
557 CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
558 CSCHED_STAT_CRANK(migrate_running);
559 set_bit(_VPF_migrating, &current->pause_flags);
560 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
561 }
562 }
564 static int
565 csched_vcpu_init(struct vcpu *vc)
566 {
567 struct domain * const dom = vc->domain;
568 struct csched_dom *sdom = CSCHED_DOM(dom);
569 struct csched_vcpu *svc;
571 CSCHED_STAT_CRANK(vcpu_init);
573 /* Allocate per-VCPU info */
574 svc = xmalloc(struct csched_vcpu);
575 if ( svc == NULL )
576 return -1;
578 INIT_LIST_HEAD(&svc->runq_elem);
579 INIT_LIST_HEAD(&svc->active_vcpu_elem);
580 svc->sdom = sdom;
581 svc->vcpu = vc;
582 atomic_set(&svc->credit, 0);
583 svc->flags = 0U;
584 svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
585 CSCHED_VCPU_STATS_RESET(svc);
586 vc->sched_priv = svc;
588 /* Allocate per-PCPU info */
589 if ( unlikely(!CSCHED_PCPU(vc->processor)) )
590 {
591 if ( csched_pcpu_init(vc->processor) != 0 )
592 return -1;
593 }
595 CSCHED_VCPU_CHECK(vc);
596 return 0;
597 }
599 static void
600 csched_vcpu_destroy(struct vcpu *vc)
601 {
602 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
603 struct csched_dom * const sdom = svc->sdom;
604 unsigned long flags;
606 CSCHED_STAT_CRANK(vcpu_destroy);
608 BUG_ON( sdom == NULL );
609 BUG_ON( !list_empty(&svc->runq_elem) );
611 spin_lock_irqsave(&csched_priv.lock, flags);
613 if ( !list_empty(&svc->active_vcpu_elem) )
614 __csched_vcpu_acct_stop_locked(svc);
616 spin_unlock_irqrestore(&csched_priv.lock, flags);
618 xfree(svc);
619 }
621 static void
622 csched_vcpu_sleep(struct vcpu *vc)
623 {
624 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
626 CSCHED_STAT_CRANK(vcpu_sleep);
628 BUG_ON( is_idle_vcpu(vc) );
630 if ( per_cpu(schedule_data, vc->processor).curr == vc )
631 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
632 else if ( __vcpu_on_runq(svc) )
633 __runq_remove(svc);
634 }
636 static void
637 csched_vcpu_wake(struct vcpu *vc)
638 {
639 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
640 const unsigned int cpu = vc->processor;
642 BUG_ON( is_idle_vcpu(vc) );
644 if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
645 {
646 CSCHED_STAT_CRANK(vcpu_wake_running);
647 return;
648 }
649 if ( unlikely(__vcpu_on_runq(svc)) )
650 {
651 CSCHED_STAT_CRANK(vcpu_wake_onrunq);
652 return;
653 }
655 if ( likely(vcpu_runnable(vc)) )
656 CSCHED_STAT_CRANK(vcpu_wake_runnable);
657 else
658 CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
660 /*
661 * We temporarily boost the priority of awakening VCPUs!
662 *
663 * If this VCPU consumes a non-negligible amount of CPU, it
664 * will eventually find itself in the credit accounting code
665 * path where its priority will be reset to normal.
666 *
667 * If on the other hand the VCPU consumes little CPU and is
668 * blocking and awoken a lot (doing I/O for example), its
669 * priority will remain boosted, optimizing its wake-to-run
670 * latencies.
671 *
672 * This allows wake-to-run latency sensitive VCPUs to preempt
673 * more CPU resource intensive VCPUs without impacting overall
674 * system fairness.
675 *
676 * The one exception is for VCPUs of capped domains unpausing
677 * after earning credits they had overspent. We don't boost
678 * those.
679 */
680 if ( svc->pri == CSCHED_PRI_TS_UNDER &&
681 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
682 {
683 svc->pri = CSCHED_PRI_TS_BOOST;
684 }
686 /* Put the VCPU on the runq and tickle CPUs */
687 __runq_insert(cpu, svc);
688 __runq_tickle(cpu, svc);
689 }
691 static int
692 csched_dom_cntl(
693 struct domain *d,
694 struct xen_domctl_scheduler_op *op)
695 {
696 struct csched_dom * const sdom = CSCHED_DOM(d);
697 unsigned long flags;
699 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
700 {
701 op->u.credit.weight = sdom->weight;
702 op->u.credit.cap = sdom->cap;
703 }
704 else
705 {
706 ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
708 spin_lock_irqsave(&csched_priv.lock, flags);
710 if ( op->u.credit.weight != 0 )
711 {
712 if ( !list_empty(&sdom->active_sdom_elem) )
713 {
714 csched_priv.weight -= sdom->weight;
715 csched_priv.weight += op->u.credit.weight;
716 }
717 sdom->weight = op->u.credit.weight;
718 }
720 if ( op->u.credit.cap != (uint16_t)~0U )
721 sdom->cap = op->u.credit.cap;
723 spin_unlock_irqrestore(&csched_priv.lock, flags);
724 }
726 return 0;
727 }
729 static int
730 csched_dom_init(struct domain *dom)
731 {
732 struct csched_dom *sdom;
734 CSCHED_STAT_CRANK(dom_init);
736 if ( is_idle_domain(dom) )
737 return 0;
739 sdom = xmalloc(struct csched_dom);
740 if ( sdom == NULL )
741 return -ENOMEM;
743 /* Initialize credit and weight */
744 INIT_LIST_HEAD(&sdom->active_vcpu);
745 sdom->active_vcpu_count = 0;
746 INIT_LIST_HEAD(&sdom->active_sdom_elem);
747 sdom->dom = dom;
748 sdom->weight = CSCHED_DEFAULT_WEIGHT;
749 sdom->cap = 0U;
750 dom->sched_priv = sdom;
752 return 0;
753 }
755 static void
756 csched_dom_destroy(struct domain *dom)
757 {
758 CSCHED_STAT_CRANK(dom_destroy);
759 xfree(CSCHED_DOM(dom));
760 }
762 /*
763 * This is an O(n) optimized sort of the runq.
764 *
765 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
766 * through the runq and move up any UNDERs that are preceded by OVERs. We
767 * remember the last UNDER to make the move up operation O(1).
768 */
769 static void
770 csched_runq_sort(unsigned int cpu)
771 {
772 struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
773 struct list_head *runq, *elem, *next, *last_under;
774 struct csched_vcpu *svc_elem;
775 unsigned long flags;
776 int sort_epoch;
778 sort_epoch = csched_priv.runq_sort;
779 if ( sort_epoch == spc->runq_sort_last )
780 return;
782 spc->runq_sort_last = sort_epoch;
784 spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
786 runq = &spc->runq;
787 elem = runq->next;
788 last_under = runq;
790 while ( elem != runq )
791 {
792 next = elem->next;
793 svc_elem = __runq_elem(elem);
795 if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
796 {
797 /* does elem need to move up the runq? */
798 if ( elem->prev != last_under )
799 {
800 list_del(elem);
801 list_add(elem, last_under);
802 }
803 last_under = elem;
804 }
806 elem = next;
807 }
809 spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
810 }
812 static void
813 csched_acct(void)
814 {
815 unsigned long flags;
816 struct list_head *iter_vcpu, *next_vcpu;
817 struct list_head *iter_sdom, *next_sdom;
818 struct csched_vcpu *svc;
819 struct csched_dom *sdom;
820 uint32_t credit_total;
821 uint32_t weight_total;
822 uint32_t weight_left;
823 uint32_t credit_fair;
824 uint32_t credit_peak;
825 uint32_t credit_cap;
826 int credit_balance;
827 int credit_xtra;
828 int credit;
831 spin_lock_irqsave(&csched_priv.lock, flags);
833 weight_total = csched_priv.weight;
834 credit_total = csched_priv.credit;
836 /* Converge balance towards 0 when it drops negative */
837 if ( csched_priv.credit_balance < 0 )
838 {
839 credit_total -= csched_priv.credit_balance;
840 CSCHED_STAT_CRANK(acct_balance);
841 }
843 if ( unlikely(weight_total == 0) )
844 {
845 csched_priv.credit_balance = 0;
846 spin_unlock_irqrestore(&csched_priv.lock, flags);
847 CSCHED_STAT_CRANK(acct_no_work);
848 return;
849 }
851 CSCHED_STAT_CRANK(acct_run);
853 weight_left = weight_total;
854 credit_balance = 0;
855 credit_xtra = 0;
856 credit_cap = 0U;
858 list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
859 {
860 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
862 BUG_ON( is_idle_domain(sdom->dom) );
863 BUG_ON( sdom->active_vcpu_count == 0 );
864 BUG_ON( sdom->weight == 0 );
865 BUG_ON( sdom->weight > weight_left );
867 weight_left -= sdom->weight;
869 /*
870 * A domain's fair share is computed using its weight in competition
871 * with that of all other active domains.
872 *
873 * At most, a domain can use credits to run all its active VCPUs
874 * for one full accounting period. We allow a domain to earn more
875 * only when the system-wide credit balance is negative.
876 */
877 credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
878 if ( csched_priv.credit_balance < 0 )
879 {
880 credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
881 (weight_total - 1)
882 ) / weight_total;
883 }
885 if ( sdom->cap != 0U )
886 {
887 credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
888 if ( credit_cap < credit_peak )
889 credit_peak = credit_cap;
891 credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
892 ) / sdom->active_vcpu_count;
893 }
895 credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
896 ) / weight_total;
898 if ( credit_fair < credit_peak )
899 {
900 credit_xtra = 1;
901 }
902 else
903 {
904 if ( weight_left != 0U )
905 {
906 /* Give other domains a chance at unused credits */
907 credit_total += ( ( ( credit_fair - credit_peak
908 ) * weight_total
909 ) + ( weight_left - 1 )
910 ) / weight_left;
911 }
913 if ( credit_xtra )
914 {
915 /*
916 * Lazily keep domains with extra credits at the head of
917 * the queue to give others a chance at them in future
918 * accounting periods.
919 */
920 CSCHED_STAT_CRANK(acct_reorder);
921 list_del(&sdom->active_sdom_elem);
922 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
923 }
925 credit_fair = credit_peak;
926 }
928 /* Compute fair share per VCPU */
929 credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
930 ) / sdom->active_vcpu_count;
933 list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
934 {
935 svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
936 BUG_ON( sdom != svc->sdom );
938 /* Increment credit */
939 atomic_add(credit_fair, &svc->credit);
940 credit = atomic_read(&svc->credit);
942 /*
943 * Recompute priority or, if VCPU is idling, remove it from
944 * the active list.
945 */
946 if ( credit < 0 )
947 {
948 svc->pri = CSCHED_PRI_TS_OVER;
950 /* Park running VCPUs of capped-out domains */
951 if ( sdom->cap != 0U &&
952 credit < -credit_cap &&
953 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
954 {
955 CSCHED_STAT_CRANK(vcpu_park);
956 vcpu_pause_nosync(svc->vcpu);
957 svc->flags |= CSCHED_FLAG_VCPU_PARKED;
958 }
960 /* Lower bound on credits */
961 if ( credit < -CSCHED_CREDITS_PER_TSLICE )
962 {
963 CSCHED_STAT_CRANK(acct_min_credit);
964 credit = -CSCHED_CREDITS_PER_TSLICE;
965 atomic_set(&svc->credit, credit);
966 }
967 }
968 else
969 {
970 svc->pri = CSCHED_PRI_TS_UNDER;
972 /* Unpark any capped domains whose credits go positive */
973 if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
974 {
975 /*
976 * It's important to unset the flag AFTER the unpause()
977 * call to make sure the VCPU's priority is not boosted
978 * if it is woken up here.
979 */
980 CSCHED_STAT_CRANK(vcpu_unpark);
981 vcpu_unpause(svc->vcpu);
982 svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
983 }
985 /* Upper bound on credits means VCPU stops earning */
986 if ( credit > CSCHED_CREDITS_PER_TSLICE )
987 {
988 __csched_vcpu_acct_stop_locked(svc);
989 credit = 0;
990 atomic_set(&svc->credit, credit);
991 }
992 }
994 CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
995 CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
996 credit_balance += credit;
997 }
998 }
1000 csched_priv.credit_balance = credit_balance;
1002 spin_unlock_irqrestore(&csched_priv.lock, flags);
1004 /* Inform each CPU that its runq needs to be sorted */
1005 csched_priv.runq_sort++;
1006 }
1008 static void
1009 csched_tick(void *_cpu)
1010 {
1011 unsigned int cpu = (unsigned long)_cpu;
1012 struct csched_pcpu *spc = CSCHED_PCPU(cpu);
1014 spc->tick++;
1016 /*
1017 * Accounting for running VCPU
1018 */
1019 if ( !is_idle_vcpu(current) )
1020 csched_vcpu_acct(cpu);
1022 /*
1023 * Host-wide accounting duty
1024 *
1025 * Note: Currently, this is always done by the master boot CPU. Eventually,
1026 * we could distribute or at the very least cycle the duty.
1027 */
1028 if ( (csched_priv.master == cpu) &&
1029 (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
1030 {
1031 csched_acct();
1032 }
1034 /*
1035 * Check if runq needs to be sorted
1036 *
1037 * Every physical CPU resorts the runq after the accounting master has
1038 * modified priorities. This is a special O(n) sort and runs at most
1039 * once per accounting period (currently 30 milliseconds).
1040 */
1041 csched_runq_sort(cpu);
1043 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1044 }
1046 static struct csched_vcpu *
1047 csched_runq_steal(int peer_cpu, int cpu, int pri)
1048 {
1049 const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
1050 const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
1051 struct csched_vcpu *speer;
1052 struct list_head *iter;
1053 struct vcpu *vc;
1055 /*
1056 * Don't steal from an idle CPU's runq because it's about to
1057 * pick up work from it itself.
1058 */
1059 if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
1060 {
1061 list_for_each( iter, &peer_pcpu->runq )
1062 {
1063 speer = __runq_elem(iter);
1065 /*
1066 * If next available VCPU here is not of strictly higher
1067 * priority than ours, this PCPU is useless to us.
1068 */
1069 if ( speer->pri <= pri )
1070 break;
1072 /* Is this VCPU runnable on our PCPU? */
1073 vc = speer->vcpu;
1074 BUG_ON( is_idle_vcpu(vc) );
1076 if (__csched_vcpu_is_migrateable(vc, cpu))
1077 {
1078 /* We got a candidate. Grab it! */
1079 CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
1080 CSCHED_STAT_CRANK(migrate_queued);
1081 __runq_remove(speer);
1082 vc->processor = cpu;
1083 return speer;
1084 }
1085 }
1086 }
1088 CSCHED_STAT_CRANK(steal_peer_idle);
1089 return NULL;
1090 }
1092 static struct csched_vcpu *
1093 csched_load_balance(int cpu, struct csched_vcpu *snext)
1094 {
1095 struct csched_vcpu *speer;
1096 cpumask_t workers;
1097 int peer_cpu;
1099 BUG_ON( cpu != snext->vcpu->processor );
1101 /* If this CPU is going offline we shouldn't steal work. */
1102 if ( unlikely(!cpu_online(cpu)) )
1103 goto out;
1105 if ( snext->pri == CSCHED_PRI_IDLE )
1106 CSCHED_STAT_CRANK(load_balance_idle);
1107 else if ( snext->pri == CSCHED_PRI_TS_OVER )
1108 CSCHED_STAT_CRANK(load_balance_over);
1109 else
1110 CSCHED_STAT_CRANK(load_balance_other);
1112 /*
1113 * Peek at non-idling CPUs in the system, starting with our
1114 * immediate neighbour.
1115 */
1116 cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
1117 cpu_clear(cpu, workers);
1118 peer_cpu = cpu;
1120 while ( !cpus_empty(workers) )
1121 {
1122 peer_cpu = cycle_cpu(peer_cpu, workers);
1123 cpu_clear(peer_cpu, workers);
1125 /*
1126 * Get ahold of the scheduler lock for this peer CPU.
1127 *
1128 * Note: We don't spin on this lock but simply try it. Spinning could
1129 * cause a deadlock if the peer CPU is also load balancing and trying
1130 * to lock this CPU.
1131 */
1132 if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
1133 {
1134 CSCHED_STAT_CRANK(steal_trylock_failed);
1135 continue;
1136 }
1138 /*
1139 * Any work over there to steal?
1140 */
1141 speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
1142 spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
1143 if ( speer != NULL )
1144 return speer;
1145 }
1147 out:
1148 /* Failed to find more important work elsewhere... */
1149 __runq_remove(snext);
1150 return snext;
1151 }
1153 /*
1154 * This function is in the critical path. It is designed to be simple and
1155 * fast for the common case.
1156 */
1157 static struct task_slice
1158 csched_schedule(s_time_t now)
1159 {
1160 const int cpu = smp_processor_id();
1161 struct list_head * const runq = RUNQ(cpu);
1162 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1163 struct csched_vcpu *snext;
1164 struct task_slice ret;
1166 CSCHED_STAT_CRANK(schedule);
1167 CSCHED_VCPU_CHECK(current);
1169 /*
1170 * Select next runnable local VCPU (ie top of local runq)
1171 */
1172 if ( vcpu_runnable(current) )
1173 __runq_insert(cpu, scurr);
1174 else
1175 BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
1177 snext = __runq_elem(runq->next);
1179 /*
1180 * SMP Load balance:
1181 *
1182 * If the next highest priority local runnable VCPU has already eaten
1183 * through its credits, look on other PCPUs to see if we have more
1184 * urgent work... If not, csched_load_balance() will return snext, but
1185 * already removed from the runq.
1186 */
1187 if ( snext->pri > CSCHED_PRI_TS_OVER )
1188 __runq_remove(snext);
1189 else
1190 snext = csched_load_balance(cpu, snext);
1192 /*
1193 * Update idlers mask if necessary. When we're idling, other CPUs
1194 * will tickle us when they get extra work.
1195 */
1196 if ( snext->pri == CSCHED_PRI_IDLE )
1197 {
1198 if ( !cpu_isset(cpu, csched_priv.idlers) )
1199 cpu_set(cpu, csched_priv.idlers);
1200 }
1201 else if ( cpu_isset(cpu, csched_priv.idlers) )
1202 {
1203 cpu_clear(cpu, csched_priv.idlers);
1204 }
1206 /*
1207 * Return task to run next...
1208 */
1209 ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
1210 ret.task = snext->vcpu;
1212 CSCHED_VCPU_CHECK(ret.task);
1213 return ret;
1214 }
1216 static void
1217 csched_dump_vcpu(struct csched_vcpu *svc)
1218 {
1219 struct csched_dom * const sdom = svc->sdom;
1221 printk("[%i.%i] pri=%i flags=%x cpu=%i",
1222 svc->vcpu->domain->domain_id,
1223 svc->vcpu->vcpu_id,
1224 svc->pri,
1225 svc->flags,
1226 svc->vcpu->processor);
1228 if ( sdom )
1229 {
1230 printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
1231 #ifdef CSCHED_STATS
1232 printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
1233 svc->stats.credit_last,
1234 svc->stats.credit_incr,
1235 svc->stats.state_active,
1236 svc->stats.state_idle,
1237 svc->stats.migrate_q,
1238 svc->stats.migrate_r);
1239 #endif
1240 }
1242 printk("\n");
1243 }
1245 static void
1246 csched_dump_pcpu(int cpu)
1247 {
1248 struct list_head *runq, *iter;
1249 struct csched_pcpu *spc;
1250 struct csched_vcpu *svc;
1251 int loop;
1252 char cpustr[100];
1254 spc = CSCHED_PCPU(cpu);
1255 runq = &spc->runq;
1257 cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
1258 printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
1259 cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
1260 printk("core=%s\n", cpustr);
1262 /* current VCPU */
1263 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
1264 if ( svc )
1265 {
1266 printk("\trun: ");
1267 csched_dump_vcpu(svc);
1268 }
1270 loop = 0;
1271 list_for_each( iter, runq )
1272 {
1273 svc = __runq_elem(iter);
1274 if ( svc )
1275 {
1276 printk("\t%3d: ", ++loop);
1277 csched_dump_vcpu(svc);
1278 }
1279 }
1280 }
1282 static void
1283 csched_dump(void)
1284 {
1285 struct list_head *iter_sdom, *iter_svc;
1286 int loop;
1287 char idlers_buf[100];
1289 printk("info:\n"
1290 "\tncpus = %u\n"
1291 "\tmaster = %u\n"
1292 "\tcredit = %u\n"
1293 "\tcredit balance = %d\n"
1294 "\tweight = %u\n"
1295 "\trunq_sort = %u\n"
1296 "\tdefault-weight = %d\n"
1297 "\tmsecs per tick = %dms\n"
1298 "\tcredits per tick = %d\n"
1299 "\tticks per tslice = %d\n"
1300 "\tticks per acct = %d\n",
1301 csched_priv.ncpus,
1302 csched_priv.master,
1303 csched_priv.credit,
1304 csched_priv.credit_balance,
1305 csched_priv.weight,
1306 csched_priv.runq_sort,
1307 CSCHED_DEFAULT_WEIGHT,
1308 CSCHED_MSECS_PER_TICK,
1309 CSCHED_CREDITS_PER_TICK,
1310 CSCHED_TICKS_PER_TSLICE,
1311 CSCHED_TICKS_PER_ACCT);
1313 cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
1314 printk("idlers: %s\n", idlers_buf);
1316 CSCHED_STATS_PRINTK();
1318 printk("active vcpus:\n");
1319 loop = 0;
1320 list_for_each( iter_sdom, &csched_priv.active_sdom )
1321 {
1322 struct csched_dom *sdom;
1323 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
1325 list_for_each( iter_svc, &sdom->active_vcpu )
1326 {
1327 struct csched_vcpu *svc;
1328 svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
1330 printk("\t%3d: ", ++loop);
1331 csched_dump_vcpu(svc);
1332 }
1333 }
1334 }
1336 static void
1337 csched_init(void)
1338 {
1339 spin_lock_init(&csched_priv.lock);
1340 INIT_LIST_HEAD(&csched_priv.active_sdom);
1341 csched_priv.ncpus = 0;
1342 csched_priv.master = UINT_MAX;
1343 cpus_clear(csched_priv.idlers);
1344 csched_priv.weight = 0U;
1345 csched_priv.credit = 0U;
1346 csched_priv.credit_balance = 0;
1347 csched_priv.runq_sort = 0U;
1348 CSCHED_STATS_RESET();
1349 }
1351 /* Tickers cannot be kicked until SMP subsystem is alive. */
1352 static __init int csched_start_tickers(void)
1353 {
1354 struct csched_pcpu *spc;
1355 unsigned int cpu;
1357 /* Is the credit scheduler initialised? */
1358 if ( csched_priv.ncpus == 0 )
1359 return 0;
1361 for_each_online_cpu ( cpu )
1362 {
1363 spc = CSCHED_PCPU(cpu);
1364 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1365 }
1367 return 0;
1368 }
1369 __initcall(csched_start_tickers);
1372 struct scheduler sched_credit_def = {
1373 .name = "SMP Credit Scheduler",
1374 .opt_name = "credit",
1375 .sched_id = XEN_SCHEDULER_CREDIT,
1377 .init_domain = csched_dom_init,
1378 .destroy_domain = csched_dom_destroy,
1380 .init_vcpu = csched_vcpu_init,
1381 .destroy_vcpu = csched_vcpu_destroy,
1383 .sleep = csched_vcpu_sleep,
1384 .wake = csched_vcpu_wake,
1386 .adjust = csched_dom_cntl,
1388 .pick_cpu = csched_cpu_pick,
1389 .do_schedule = csched_schedule,
1391 .dump_cpu_state = csched_dump_pcpu,
1392 .dump_settings = csched_dump,
1393 .init = csched_init,
1394 };