xen/common/sched_credit.c (ia64/xen-unstable, changeset 18594:5e4e234d58be)
/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_TICK     100
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
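
/*
 * Worked example with the defaults above: one tick is 10ms, a timeslice
 * is 3 ticks = 30ms, and an accounting period is also 3 ticks = 30ms.
 * A running VCPU is debited 100 credits per tick (300 over a full
 * timeslice), and csched_acct() redistributes a pool of
 * CSCHED_CREDITS_PER_ACCT = 300 credits per physical CPU every
 * accounting period.
 */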

/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
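
/*
 * Note: priorities order as BOOST(0) > UNDER(-1) > OVER(-2) > IDLE(-64),
 * and __runq_insert() below keeps each runq sorted by descending
 * priority, so a freshly boosted VCPU queues ahead of UNDER VCPUs,
 * which in turn run before VCPUs that have exhausted their credits
 * (OVER).
 */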

/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

/*
 * Try and keep often cranked stats on top so they'll fit on one
 * cache line.
 */
#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(schedule)                        \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(vcpu_park)                       \
    _MACRO(vcpu_unpark)                     \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(tickle_idlers_none)              \
    _MACRO(tickle_idlers_some)              \
    _MACRO(load_balance_idle)               \
    _MACRO(load_balance_over)               \
    _MACRO(load_balance_other)              \
    _MACRO(steal_trylock_failed)            \
    _MACRO(steal_peer_idle)                 \
    _MACRO(migrate_queued)                  \
    _MACRO(migrate_running)                 \
    _MACRO(dom_init)                        \
    _MACRO(dom_destroy)                     \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_destroy)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)              \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                   \
    struct                                      \
    {                                           \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    } stats;

#define CSCHED_STATS_PRINTK()                   \
    do                                          \
    {                                           \
        printk("stats:\n");                     \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()                do {} while ( 0 )
#define CSCHED_STATS_DEFINE()
#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */


/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
    struct timer ticker;
    unsigned int tick;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};
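
/*
 * The two tunables above drive csched_acct(): 'weight' is a relative
 * share (default 256) used to split credits among active domains,
 * while 'cap' is an absolute ceiling expressed as a percentage of one
 * physical CPU (0 means no cap); e.g. cap=50 limits the domain to half
 * a PCPU and cap=200 to two PCPUs, regardless of weight.
 */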

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE()
};


/*
 * Global variables
 */
static struct csched_private csched_priv;

static void csched_tick(void *_cpu);

static inline int
__cycle_cpu(int cpu, const cpumask_t *mask)
{
    int nxt = next_cpu(cpu, *mask);
    if (nxt == NR_CPUS)
        nxt = first_cpu(*mask);
    return nxt;
}

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}
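
/*
 * Note: SCHEDULE_SOFTIRQ may therefore be raised on this CPU (when the
 * waking VCPU outranks the one currently running here), on every idler
 * in the waking VCPU's affinity mask (when this CPU is already running
 * non-idle work), or on both sets at once.
 */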

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
}

static int
csched_cpu_pick(struct vcpu *vc)
{
    cpumask_t cpus;
    cpumask_t idlers;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : __cycle_cpu(vc->processor, &cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    idlers = csched_priv.idlers;
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt;

        nxt = __cycle_cpu(cpu, &cpus);

        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
        {
            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
        }
        else
        {
            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
        }

        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
        {
            cpu = nxt;
            cpu_clear(cpu, cpus);
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}
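
/*
 * Worked example for the loop above (hypothetical single-socket,
 * dual-core, hyperthreaded topology with threads {0,1} on one core and
 * {2,3} on the other): a VCPU on CPU 0 whose sibling thread 1 is busy,
 * while CPUs 2 and 3 are both idle, is steered to CPU 2: the candidate
 * whose core has two idle threads beats staying on a core whose other
 * thread is running work.
 */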

static inline void
__csched_vcpu_acct_start(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            csched_priv.weight += sdom->weight;
        }
    }

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

static inline void
__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}

static void
csched_vcpu_acct(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(svc);
    }
    else if ( csched_cpu_pick(current) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VPF_migrating, &current->pause_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    vc->sched_priv = svc;

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);
    return 0;
}

static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awaking VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}
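
/*
 * Note: BOOST is only demoted back to UNDER by csched_vcpu_acct(),
 * which runs from the 10ms tick.  An I/O-bound VCPU that wakes, runs
 * briefly and blocks again before the next tick therefore keeps BOOST
 * and preempts UNDER/OVER VCPUs on each wakeup, which is what gives it
 * the low wake-to-run latency described above.
 */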

static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    xfree(CSCHED_DOM(dom));
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERS. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
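
/*
 * Illustration: a runq of [OVER, UNDER, OVER, BOOST] becomes
 * [UNDER, BOOST, OVER, OVER].  Entries at or above UNDER are pulled
 * ahead of the OVERs while otherwise keeping their relative order;
 * this is a partition by "still has credits", not a full sort by
 * priority.
 */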

static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;


    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;


        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}
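
/*
 * Worked example of the split above (hypothetical numbers): on a 2-CPU
 * host the pool is 600 credits per 30ms accounting period.  With
 * domain A (weight 256, 1 active VCPU) and domain B (weight 512,
 * 2 active VCPUs) both active and a zero credit balance, A's fair
 * share is 600*256/768 = 200 credits and B's is 400 (200 per VCPU);
 * both stay below their peaks of 300 and 600, so no unused credits are
 * handed back.  A cap of 50 on either domain would clamp its peak to
 * 150 credits, i.e. half a physical CPU.
 */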

static void
csched_tick(void *_cpu)
{
    unsigned int cpu = (unsigned long)_cpu;
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);

    spc->tick++;

    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(cpu);

    /*
     * Host-wide accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);

    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
}

static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick up work from it itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if (__csched_vcpu_is_migrateable(vc, cpu))
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );

    /* If this CPU is going offline we shouldn't steal work. */
    if ( unlikely(!cpu_online(cpu)) )
        goto out;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = __cycle_cpu(peer_cpu, &workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
        if ( speer != NULL )
            return speer;
    }

 out:
    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}
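
/*
 * Note: whichever VCPU is returned receives a full 30ms slice;
 * preemption before it expires relies on SCHEDULE_SOFTIRQ being raised
 * explicitly, e.g. by __runq_tickle() on wakeups or by
 * csched_vcpu_acct() when it decides the running VCPU should migrate.
 */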

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i flags=%x cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->pri,
            svc->flags,
            svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
#ifdef CSCHED_STATS
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
                svc->stats.credit_last,
                svc->stats.credit_incr,
                svc->stats.state_active,
                svc->stats.state_idle,
                svc->stats.migrate_q,
                svc->stats.migrate_r);
#endif
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;
    char cpustr[100];

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
    printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
    printk("core=%s\n", cpustr);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;
    char idlers_buf[100];

    printk("info:\n"
           "\tncpus              = %u\n"
           "\tmaster             = %u\n"
           "\tcredit             = %u\n"
           "\tcredit balance     = %d\n"
           "\tweight             = %u\n"
           "\trunq_sort          = %u\n"
           "\tdefault-weight     = %d\n"
           "\tmsecs per tick     = %dms\n"
           "\tcredits per tick   = %d\n"
           "\tticks per tslice   = %d\n"
           "\tticks per acct     = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_TICK,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT);

    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
    printk("idlers: %s\n", idlers_buf);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}

/* Tickers cannot be kicked until SMP subsystem is alive. */
static __init int csched_start_tickers(void)
{
    struct csched_pcpu *spc;
    unsigned int cpu;

    /* Is the credit scheduler initialised? */
    if ( csched_priv.ncpus == 0 )
        return 0;

    for_each_online_cpu ( cpu )
    {
        spc = CSCHED_PCPU(cpu);
        set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
    }

    return 0;
}
__initcall(csched_start_tickers);


struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};