ia64/xen-unstable

view xen/common/sched_credit.c @ 17062:0769835cf50f

x86 shadow: Reduce scope of shadow lock.

emulate_map_dest doesn't require holding the shadow lock, since the
only shadow-related operation possibly involved is removing a shadow,
which is less frequent and can acquire the lock internally. The rest
are either guest table walks or per-vcpu monitor table manipulation.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Feb 14 10:33:12 2008 +0000 (2008-02-14)
parents c5bf8919938b
children 4ffc70556000
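The change described above narrows a lock's scope: the frequent, per-vcpu work runs outside the lock, and the lock is acquired only inside the rare path that actually touches shared state. The sketch below illustrates that pattern only; it is not the real emulate_map_dest() or any actual Xen shadow code, and the names big_lock, frequent_walk(), rarely_needed() and rare_teardown() are invented for illustration.

/* Illustrative sketch of the lock-scope reduction described above.
 * Not the real shadow code; all helper names here are invented. */
#include <xen/sched.h>
#include <xen/spinlock.h>

static DEFINE_SPINLOCK(big_lock);

static void frequent_walk(struct vcpu *v);   /* per-vcpu work, no shared state */
static int  rarely_needed(struct vcpu *v);
static void rare_teardown(struct vcpu *v);   /* mutates state guarded by big_lock */

/* Before: the whole helper runs under big_lock, although most of its
 * work is per-vcpu and never touches the state the lock protects. */
static void helper_before(struct vcpu *v)
{
    spin_lock(&big_lock);
    frequent_walk(v);
    if ( rarely_needed(v) )
        rare_teardown(v);
    spin_unlock(&big_lock);
}

/* After: big_lock is taken only inside the infrequent path, so the
 * common path no longer serializes on it. */
static void helper_after(struct vcpu *v)
{
    frequent_walk(v);
    if ( rarely_needed(v) )
    {
        spin_lock(&big_lock);
        rare_teardown(v);
        spin_unlock(&big_lock);
    }
}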
line source
1 /****************************************************************************
2 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
3 ****************************************************************************
4 *
5 * File: common/sched_credit.c
6 * Author: Emmanuel Ackaouy
7 *
8 * Description: Credit-based SMP CPU scheduler
9 */
11 #include <xen/config.h>
12 #include <xen/init.h>
13 #include <xen/lib.h>
14 #include <xen/sched.h>
15 #include <xen/domain.h>
16 #include <xen/delay.h>
17 #include <xen/event.h>
18 #include <xen/time.h>
19 #include <xen/perfc.h>
20 #include <xen/sched-if.h>
21 #include <xen/softirq.h>
22 #include <asm/atomic.h>
23 #include <xen/errno.h>
25 /*
26 * CSCHED_STATS
27 *
28 * Manage very basic counters and stats.
29 *
30 * Useful for debugging live systems. The stats are displayed
31 * with runq dumps ('r' on the Xen console).
32 */
33 #define CSCHED_STATS
36 /*
37 * Basic constants
38 */
39 #define CSCHED_DEFAULT_WEIGHT 256
40 #define CSCHED_TICKS_PER_TSLICE 3
41 #define CSCHED_TICKS_PER_ACCT 3
42 #define CSCHED_MSECS_PER_TICK 10
43 #define CSCHED_MSECS_PER_TSLICE \
44 (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
45 #define CSCHED_CREDITS_PER_TICK 100
46 #define CSCHED_CREDITS_PER_TSLICE \
47 (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
48 #define CSCHED_CREDITS_PER_ACCT \
49 (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
52 /*
53 * Priorities
54 */
55 #define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */
56 #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */
57 #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */
58 #define CSCHED_PRI_IDLE -64 /* idle */
61 /*
62 * Flags
63 */
64 #define CSCHED_FLAG_VCPU_PARKED 0x0001 /* VCPU over capped credits */
67 /*
68 * Useful macros
69 */
70 #define CSCHED_PCPU(_c) \
71 ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
72 #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
73 #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
74 #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
77 /*
78 * Stats
79 */
80 #ifdef CSCHED_STATS
82 #define CSCHED_STAT(_X) (csched_priv.stats._X)
83 #define CSCHED_STAT_DEFINE(_X) uint32_t _X;
84 #define CSCHED_STAT_PRINTK(_X) \
85 do \
86 { \
87 printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X)); \
88 } while ( 0 );
90 /*
91 * Try and keep often cranked stats on top so they'll fit on one
92 * cache line.
93 */
94 #define CSCHED_STATS_EXPAND_SCHED(_MACRO) \
95 _MACRO(schedule) \
96 _MACRO(acct_run) \
97 _MACRO(acct_no_work) \
98 _MACRO(acct_balance) \
99 _MACRO(acct_reorder) \
100 _MACRO(acct_min_credit) \
101 _MACRO(acct_vcpu_active) \
102 _MACRO(acct_vcpu_idle) \
103 _MACRO(vcpu_sleep) \
104 _MACRO(vcpu_wake_running) \
105 _MACRO(vcpu_wake_onrunq) \
106 _MACRO(vcpu_wake_runnable) \
107 _MACRO(vcpu_wake_not_runnable) \
108 _MACRO(vcpu_park) \
109 _MACRO(vcpu_unpark) \
110 _MACRO(tickle_local_idler) \
111 _MACRO(tickle_local_over) \
112 _MACRO(tickle_local_under) \
113 _MACRO(tickle_local_other) \
114 _MACRO(tickle_idlers_none) \
115 _MACRO(tickle_idlers_some) \
116 _MACRO(load_balance_idle) \
117 _MACRO(load_balance_over) \
118 _MACRO(load_balance_other) \
119 _MACRO(steal_trylock_failed) \
120 _MACRO(steal_peer_idle) \
121 _MACRO(migrate_queued) \
122 _MACRO(migrate_running) \
123 _MACRO(dom_init) \
124 _MACRO(dom_destroy) \
125 _MACRO(vcpu_init) \
126 _MACRO(vcpu_destroy)
128 #ifndef NDEBUG
129 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO) \
130 _MACRO(vcpu_check)
131 #else
132 #define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
133 #endif
135 #define CSCHED_STATS_EXPAND(_MACRO) \
136 CSCHED_STATS_EXPAND_CHECKS(_MACRO) \
137 CSCHED_STATS_EXPAND_SCHED(_MACRO)
139 #define CSCHED_STATS_RESET() \
140 do \
141 { \
142 memset(&csched_priv.stats, 0, sizeof(csched_priv.stats)); \
143 } while ( 0 )
145 #define CSCHED_STATS_DEFINE() \
146 struct \
147 { \
148 CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
149 } stats;
151 #define CSCHED_STATS_PRINTK() \
152 do \
153 { \
154 printk("stats:\n"); \
155 CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
156 } while ( 0 )
158 #define CSCHED_STAT_CRANK(_X) (CSCHED_STAT(_X)++)
160 #define CSCHED_VCPU_STATS_RESET(_V) \
161 do \
162 { \
163 memset(&(_V)->stats, 0, sizeof((_V)->stats)); \
164 } while ( 0 )
166 #define CSCHED_VCPU_STAT_CRANK(_V, _X) (((_V)->stats._X)++)
168 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y))
170 #else /* CSCHED_STATS */
172 #define CSCHED_STATS_RESET() do {} while ( 0 )
173 #define CSCHED_STATS_DEFINE()
174 #define CSCHED_STATS_PRINTK() do {} while ( 0 )
175 #define CSCHED_STAT_CRANK(_X) do {} while ( 0 )
176 #define CSCHED_VCPU_STATS_RESET(_V) do {} while ( 0 )
177 #define CSCHED_VCPU_STAT_CRANK(_V, _X) do {} while ( 0 )
178 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) do {} while ( 0 )
180 #endif /* CSCHED_STATS */
183 /*
184 * Physical CPU
185 */
186 struct csched_pcpu {
187 struct list_head runq;
188 uint32_t runq_sort_last;
189 struct timer ticker;
190 unsigned int tick;
191 };
193 /*
194 * Virtual CPU
195 */
196 struct csched_vcpu {
197 struct list_head runq_elem;
198 struct list_head active_vcpu_elem;
199 struct csched_dom *sdom;
200 struct vcpu *vcpu;
201 atomic_t credit;
202 uint16_t flags;
203 int16_t pri;
204 #ifdef CSCHED_STATS
205 struct {
206 int credit_last;
207 uint32_t credit_incr;
208 uint32_t state_active;
209 uint32_t state_idle;
210 uint32_t migrate_q;
211 uint32_t migrate_r;
212 } stats;
213 #endif
214 };
216 /*
217 * Domain
218 */
219 struct csched_dom {
220 struct list_head active_vcpu;
221 struct list_head active_sdom_elem;
222 struct domain *dom;
223 uint16_t active_vcpu_count;
224 uint16_t weight;
225 uint16_t cap;
226 };
228 /*
229 * System-wide private data
230 */
231 struct csched_private {
232 spinlock_t lock;
233 struct list_head active_sdom;
234 uint32_t ncpus;
235 unsigned int master;
236 cpumask_t idlers;
237 uint32_t weight;
238 uint32_t credit;
239 int credit_balance;
240 uint32_t runq_sort;
241 CSCHED_STATS_DEFINE()
242 };
245 /*
246 * Global variables
247 */
248 static struct csched_private csched_priv;
250 static void csched_tick(void *_cpu);
252 static inline int
253 __cycle_cpu(int cpu, const cpumask_t *mask)
254 {
255 int nxt = next_cpu(cpu, *mask);
256 if (nxt == NR_CPUS)
257 nxt = first_cpu(*mask);
258 return nxt;
259 }
261 static inline int
262 __vcpu_on_runq(struct csched_vcpu *svc)
263 {
264 return !list_empty(&svc->runq_elem);
265 }
267 static inline struct csched_vcpu *
268 __runq_elem(struct list_head *elem)
269 {
270 return list_entry(elem, struct csched_vcpu, runq_elem);
271 }
273 static inline void
274 __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
275 {
276 const struct list_head * const runq = RUNQ(cpu);
277 struct list_head *iter;
279 BUG_ON( __vcpu_on_runq(svc) );
280 BUG_ON( cpu != svc->vcpu->processor );
282 list_for_each( iter, runq )
283 {
284 const struct csched_vcpu * const iter_svc = __runq_elem(iter);
285 if ( svc->pri > iter_svc->pri )
286 break;
287 }
289 list_add_tail(&svc->runq_elem, iter);
290 }
292 static inline void
293 __runq_remove(struct csched_vcpu *svc)
294 {
295 BUG_ON( !__vcpu_on_runq(svc) );
296 list_del_init(&svc->runq_elem);
297 }
299 static inline void
300 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
301 {
302 struct csched_vcpu * const cur =
303 CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
304 cpumask_t mask;
306 ASSERT(cur);
307 cpus_clear(mask);
309 /* If strictly higher priority than current VCPU, signal the CPU */
310 if ( new->pri > cur->pri )
311 {
312 if ( cur->pri == CSCHED_PRI_IDLE )
313 CSCHED_STAT_CRANK(tickle_local_idler);
314 else if ( cur->pri == CSCHED_PRI_TS_OVER )
315 CSCHED_STAT_CRANK(tickle_local_over);
316 else if ( cur->pri == CSCHED_PRI_TS_UNDER )
317 CSCHED_STAT_CRANK(tickle_local_under);
318 else
319 CSCHED_STAT_CRANK(tickle_local_other);
321 cpu_set(cpu, mask);
322 }
324 /*
325 * If this CPU has at least two runnable VCPUs, we tickle any idlers to
326 * let them know there is runnable work in the system...
327 */
328 if ( cur->pri > CSCHED_PRI_IDLE )
329 {
330 if ( cpus_empty(csched_priv.idlers) )
331 {
332 CSCHED_STAT_CRANK(tickle_idlers_none);
333 }
334 else
335 {
336 CSCHED_STAT_CRANK(tickle_idlers_some);
337 cpus_or(mask, mask, csched_priv.idlers);
338 cpus_and(mask, mask, new->vcpu->cpu_affinity);
339 }
340 }
342 /* Send scheduler interrupts to designated CPUs */
343 if ( !cpus_empty(mask) )
344 cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
345 }
347 static int
348 csched_pcpu_init(int cpu)
349 {
350 struct csched_pcpu *spc;
351 unsigned long flags;
353 /* Allocate per-PCPU info */
354 spc = xmalloc(struct csched_pcpu);
355 if ( spc == NULL )
356 return -1;
358 spin_lock_irqsave(&csched_priv.lock, flags);
360 /* Initialize/update system-wide config */
361 csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
362 if ( csched_priv.ncpus <= cpu )
363 csched_priv.ncpus = cpu + 1;
364 if ( csched_priv.master >= csched_priv.ncpus )
365 csched_priv.master = cpu;
367 init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
368 INIT_LIST_HEAD(&spc->runq);
369 spc->runq_sort_last = csched_priv.runq_sort;
370 per_cpu(schedule_data, cpu).sched_priv = spc;
372 /* Start off idling... */
373 BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
374 cpu_set(cpu, csched_priv.idlers);
376 spin_unlock_irqrestore(&csched_priv.lock, flags);
378 return 0;
379 }
381 #ifndef NDEBUG
382 static inline void
383 __csched_vcpu_check(struct vcpu *vc)
384 {
385 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
386 struct csched_dom * const sdom = svc->sdom;
388 BUG_ON( svc->vcpu != vc );
389 BUG_ON( sdom != CSCHED_DOM(vc->domain) );
390 if ( sdom )
391 {
392 BUG_ON( is_idle_vcpu(vc) );
393 BUG_ON( sdom->dom != vc->domain );
394 }
395 else
396 {
397 BUG_ON( !is_idle_vcpu(vc) );
398 }
400 CSCHED_STAT_CRANK(vcpu_check);
401 }
402 #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
403 #else
404 #define CSCHED_VCPU_CHECK(_vc)
405 #endif
407 static inline int
408 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
409 {
410 /*
411 * Don't pick up work that's in the peer's scheduling tail. Also only pick
412 * up work that's allowed to run on our CPU.
413 */
414 return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
415 }
417 static int
418 csched_cpu_pick(struct vcpu *vc)
419 {
420 cpumask_t cpus;
421 cpumask_t idlers;
422 int cpu;
424 /*
425 * Pick from online CPUs in VCPU's affinity mask, giving a
426 * preference to its current processor if it's in there.
427 */
428 cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
429 cpu = cpu_isset(vc->processor, cpus)
430 ? vc->processor
431 : __cycle_cpu(vc->processor, &cpus);
432 ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
434 /*
435 * Try to find an idle processor within the above constraints.
436 *
437 * In multi-core and multi-threaded CPUs, not all idle execution
438 * vehicles are equal!
439 *
440 * We give preference to the idle execution vehicle with the most
441 * idling neighbours in its grouping. This distributes work across
442 * distinct cores first and guarantees we don't do something stupid
443 * like run two VCPUs on co-hyperthreads while there are idle cores
444 * or sockets.
445 */
446 idlers = csched_priv.idlers;
447 cpu_set(cpu, idlers);
448 cpus_and(cpus, cpus, idlers);
449 cpu_clear(cpu, cpus);
451 while ( !cpus_empty(cpus) )
452 {
453 cpumask_t cpu_idlers;
454 cpumask_t nxt_idlers;
455 int nxt;
457 nxt = __cycle_cpu(cpu, &cpus);
459 if ( cpu_isset(cpu, cpu_core_map[nxt]) )
460 {
461 ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
462 cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
463 cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
464 }
465 else
466 {
467 ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
468 cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
469 cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
470 }
472 if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
473 {
474 cpu = nxt;
475 cpu_clear(cpu, cpus);
476 }
477 else
478 {
479 cpus_andnot(cpus, cpus, nxt_idlers);
480 }
481 }
483 return cpu;
484 }
486 static inline void
487 __csched_vcpu_acct_start(struct csched_vcpu *svc)
488 {
489 struct csched_dom * const sdom = svc->sdom;
490 unsigned long flags;
492 spin_lock_irqsave(&csched_priv.lock, flags);
494 if ( list_empty(&svc->active_vcpu_elem) )
495 {
496 CSCHED_VCPU_STAT_CRANK(svc, state_active);
497 CSCHED_STAT_CRANK(acct_vcpu_active);
499 sdom->active_vcpu_count++;
500 list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
501 if ( list_empty(&sdom->active_sdom_elem) )
502 {
503 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
504 csched_priv.weight += sdom->weight;
505 }
506 }
508 spin_unlock_irqrestore(&csched_priv.lock, flags);
509 }
511 static inline void
512 __csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
513 {
514 struct csched_dom * const sdom = svc->sdom;
516 BUG_ON( list_empty(&svc->active_vcpu_elem) );
518 CSCHED_VCPU_STAT_CRANK(svc, state_idle);
519 CSCHED_STAT_CRANK(acct_vcpu_idle);
521 sdom->active_vcpu_count--;
522 list_del_init(&svc->active_vcpu_elem);
523 if ( list_empty(&sdom->active_vcpu) )
524 {
525 BUG_ON( csched_priv.weight < sdom->weight );
526 list_del_init(&sdom->active_sdom_elem);
527 csched_priv.weight -= sdom->weight;
528 }
529 }
531 static void
532 csched_vcpu_acct(unsigned int cpu)
533 {
534 struct csched_vcpu * const svc = CSCHED_VCPU(current);
536 ASSERT( current->processor == cpu );
537 ASSERT( svc->sdom != NULL );
539 /*
540 * If this VCPU's priority was boosted when it last awoke, reset it.
541 * If the VCPU is found here, then it's consuming a non-negligible
542 * amount of CPU resources and should no longer be boosted.
543 */
544 if ( svc->pri == CSCHED_PRI_TS_BOOST )
545 svc->pri = CSCHED_PRI_TS_UNDER;
547 /*
548 * Update credits
549 */
550 atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
552 /*
553 * Put this VCPU and domain back on the active list if it was
554 * idling.
555 *
556 * If it's been active a while, check if we'd be better off
557 * migrating it to run elsewhere (see multi-core and multi-thread
558 * support in csched_cpu_pick()).
559 */
560 if ( list_empty(&svc->active_vcpu_elem) )
561 {
562 __csched_vcpu_acct_start(svc);
563 }
564 else if ( csched_cpu_pick(current) != cpu )
565 {
566 CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
567 CSCHED_STAT_CRANK(migrate_running);
568 set_bit(_VPF_migrating, &current->pause_flags);
569 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
570 }
571 }
573 static int
574 csched_vcpu_init(struct vcpu *vc)
575 {
576 struct domain * const dom = vc->domain;
577 struct csched_dom *sdom = CSCHED_DOM(dom);
578 struct csched_vcpu *svc;
580 CSCHED_STAT_CRANK(vcpu_init);
582 /* Allocate per-VCPU info */
583 svc = xmalloc(struct csched_vcpu);
584 if ( svc == NULL )
585 return -1;
587 INIT_LIST_HEAD(&svc->runq_elem);
588 INIT_LIST_HEAD(&svc->active_vcpu_elem);
589 svc->sdom = sdom;
590 svc->vcpu = vc;
591 atomic_set(&svc->credit, 0);
592 svc->flags = 0U;
593 svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
594 CSCHED_VCPU_STATS_RESET(svc);
595 vc->sched_priv = svc;
597 /* Allocate per-PCPU info */
598 if ( unlikely(!CSCHED_PCPU(vc->processor)) )
599 {
600 if ( csched_pcpu_init(vc->processor) != 0 )
601 return -1;
602 }
604 CSCHED_VCPU_CHECK(vc);
605 return 0;
606 }
608 static void
609 csched_vcpu_destroy(struct vcpu *vc)
610 {
611 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
612 struct csched_dom * const sdom = svc->sdom;
613 unsigned long flags;
615 CSCHED_STAT_CRANK(vcpu_destroy);
617 BUG_ON( sdom == NULL );
618 BUG_ON( !list_empty(&svc->runq_elem) );
620 spin_lock_irqsave(&csched_priv.lock, flags);
622 if ( !list_empty(&svc->active_vcpu_elem) )
623 __csched_vcpu_acct_stop_locked(svc);
625 spin_unlock_irqrestore(&csched_priv.lock, flags);
627 xfree(svc);
628 }
630 static void
631 csched_vcpu_sleep(struct vcpu *vc)
632 {
633 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
635 CSCHED_STAT_CRANK(vcpu_sleep);
637 BUG_ON( is_idle_vcpu(vc) );
639 if ( per_cpu(schedule_data, vc->processor).curr == vc )
640 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
641 else if ( __vcpu_on_runq(svc) )
642 __runq_remove(svc);
643 }
645 static void
646 csched_vcpu_wake(struct vcpu *vc)
647 {
648 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
649 const unsigned int cpu = vc->processor;
651 BUG_ON( is_idle_vcpu(vc) );
653 if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
654 {
655 CSCHED_STAT_CRANK(vcpu_wake_running);
656 return;
657 }
658 if ( unlikely(__vcpu_on_runq(svc)) )
659 {
660 CSCHED_STAT_CRANK(vcpu_wake_onrunq);
661 return;
662 }
664 if ( likely(vcpu_runnable(vc)) )
665 CSCHED_STAT_CRANK(vcpu_wake_runnable);
666 else
667 CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
669 /*
670 * We temporarily boost the priority of waking VCPUs!
671 *
672 * If this VCPU consumes a non-negligible amount of CPU, it
673 * will eventually find itself in the credit accounting code
674 * path where its priority will be reset to normal.
675 *
676 * If on the other hand the VCPU consumes little CPU and is
677 * blocking and awoken a lot (doing I/O for example), its
678 * priority will remain boosted, optimizing its wake-to-run
679 * latencies.
680 *
681 * This allows wake-to-run latency sensitive VCPUs to preempt
682 * more CPU resource intensive VCPUs without impacting overall
683 * system fairness.
684 *
685 * The one exception is for VCPUs of capped domains unpausing
686 * after earning credits they had overspent. We don't boost
687 * those.
688 */
689 if ( svc->pri == CSCHED_PRI_TS_UNDER &&
690 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
691 {
692 svc->pri = CSCHED_PRI_TS_BOOST;
693 }
695 /* Put the VCPU on the runq and tickle CPUs */
696 __runq_insert(cpu, svc);
697 __runq_tickle(cpu, svc);
698 }
700 static int
701 csched_dom_cntl(
702 struct domain *d,
703 struct xen_domctl_scheduler_op *op)
704 {
705 struct csched_dom * const sdom = CSCHED_DOM(d);
706 unsigned long flags;
708 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
709 {
710 op->u.credit.weight = sdom->weight;
711 op->u.credit.cap = sdom->cap;
712 }
713 else
714 {
715 ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
717 spin_lock_irqsave(&csched_priv.lock, flags);
719 if ( op->u.credit.weight != 0 )
720 {
721 if ( !list_empty(&sdom->active_sdom_elem) )
722 {
723 csched_priv.weight -= sdom->weight;
724 csched_priv.weight += op->u.credit.weight;
725 }
726 sdom->weight = op->u.credit.weight;
727 }
729 if ( op->u.credit.cap != (uint16_t)~0U )
730 sdom->cap = op->u.credit.cap;
732 spin_unlock_irqrestore(&csched_priv.lock, flags);
733 }
735 return 0;
736 }
738 static int
739 csched_dom_init(struct domain *dom)
740 {
741 struct csched_dom *sdom;
743 CSCHED_STAT_CRANK(dom_init);
745 if ( is_idle_domain(dom) )
746 return 0;
748 sdom = xmalloc(struct csched_dom);
749 if ( sdom == NULL )
750 return -ENOMEM;
752 /* Initialize credit and weight */
753 INIT_LIST_HEAD(&sdom->active_vcpu);
754 sdom->active_vcpu_count = 0;
755 INIT_LIST_HEAD(&sdom->active_sdom_elem);
756 sdom->dom = dom;
757 sdom->weight = CSCHED_DEFAULT_WEIGHT;
758 sdom->cap = 0U;
759 dom->sched_priv = sdom;
761 return 0;
762 }
764 static void
765 csched_dom_destroy(struct domain *dom)
766 {
767 CSCHED_STAT_CRANK(dom_destroy);
768 xfree(CSCHED_DOM(dom));
769 }
771 /*
772 * This is an O(n) optimized sort of the runq.
773 *
774 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
775 * through the runq and move up any UNDERs that are preceded by OVERs. We
776 * remember the last UNDER to make the move up operation O(1).
777 */
778 static void
779 csched_runq_sort(unsigned int cpu)
780 {
781 struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
782 struct list_head *runq, *elem, *next, *last_under;
783 struct csched_vcpu *svc_elem;
784 unsigned long flags;
785 int sort_epoch;
787 sort_epoch = csched_priv.runq_sort;
788 if ( sort_epoch == spc->runq_sort_last )
789 return;
791 spc->runq_sort_last = sort_epoch;
793 spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
795 runq = &spc->runq;
796 elem = runq->next;
797 last_under = runq;
799 while ( elem != runq )
800 {
801 next = elem->next;
802 svc_elem = __runq_elem(elem);
804 if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
805 {
806 /* does elem need to move up the runq? */
807 if ( elem->prev != last_under )
808 {
809 list_del(elem);
810 list_add(elem, last_under);
811 }
812 last_under = elem;
813 }
815 elem = next;
816 }
818 spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
819 }
821 static void
822 csched_acct(void)
823 {
823 {
824 unsigned long flags;
825 struct list_head *iter_vcpu, *next_vcpu;
826 struct list_head *iter_sdom, *next_sdom;
827 struct csched_vcpu *svc;
828 struct csched_dom *sdom;
829 uint32_t credit_total;
830 uint32_t weight_total;
831 uint32_t weight_left;
832 uint32_t credit_fair;
833 uint32_t credit_peak;
834 uint32_t credit_cap;
835 int credit_balance;
836 int credit_xtra;
837 int credit;
840 spin_lock_irqsave(&csched_priv.lock, flags);
842 weight_total = csched_priv.weight;
843 credit_total = csched_priv.credit;
845 /* Converge balance towards 0 when it drops negative */
846 if ( csched_priv.credit_balance < 0 )
847 {
848 credit_total -= csched_priv.credit_balance;
849 CSCHED_STAT_CRANK(acct_balance);
850 }
852 if ( unlikely(weight_total == 0) )
853 {
854 csched_priv.credit_balance = 0;
855 spin_unlock_irqrestore(&csched_priv.lock, flags);
856 CSCHED_STAT_CRANK(acct_no_work);
857 return;
858 }
860 CSCHED_STAT_CRANK(acct_run);
862 weight_left = weight_total;
863 credit_balance = 0;
864 credit_xtra = 0;
865 credit_cap = 0U;
867 list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
868 {
869 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
871 BUG_ON( is_idle_domain(sdom->dom) );
872 BUG_ON( sdom->active_vcpu_count == 0 );
873 BUG_ON( sdom->weight == 0 );
874 BUG_ON( sdom->weight > weight_left );
876 weight_left -= sdom->weight;
878 /*
879 * A domain's fair share is computed using its weight in competition
880 * with that of all other active domains.
881 *
882 * At most, a domain can use credits to run all its active VCPUs
883 * for one full accounting period. We allow a domain to earn more
884 * only when the system-wide credit balance is negative.
885 */
886 credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
887 if ( csched_priv.credit_balance < 0 )
888 {
889 credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
890 (weight_total - 1)
891 ) / weight_total;
892 }
894 if ( sdom->cap != 0U )
895 {
896 credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
897 if ( credit_cap < credit_peak )
898 credit_peak = credit_cap;
900 credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
901 ) / sdom->active_vcpu_count;
902 }
904 credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
905 ) / weight_total;
907 if ( credit_fair < credit_peak )
908 {
909 credit_xtra = 1;
910 }
911 else
912 {
913 if ( weight_left != 0U )
914 {
915 /* Give other domains a chance at unused credits */
916 credit_total += ( ( ( credit_fair - credit_peak
917 ) * weight_total
918 ) + ( weight_left - 1 )
919 ) / weight_left;
920 }
922 if ( credit_xtra )
923 {
924 /*
925 * Lazily keep domains with extra credits at the head of
926 * the queue to give others a chance at them in future
927 * accounting periods.
928 */
929 CSCHED_STAT_CRANK(acct_reorder);
930 list_del(&sdom->active_sdom_elem);
931 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
932 }
934 credit_fair = credit_peak;
935 }
937 /* Compute fair share per VCPU */
938 credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
939 ) / sdom->active_vcpu_count;
942 list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
943 {
944 svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
945 BUG_ON( sdom != svc->sdom );
947 /* Increment credit */
948 atomic_add(credit_fair, &svc->credit);
949 credit = atomic_read(&svc->credit);
951 /*
952 * Recompute priority or, if VCPU is idling, remove it from
953 * the active list.
954 */
955 if ( credit < 0 )
956 {
957 svc->pri = CSCHED_PRI_TS_OVER;
959 /* Park running VCPUs of capped-out domains */
960 if ( sdom->cap != 0U &&
961 credit < -credit_cap &&
962 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
963 {
964 CSCHED_STAT_CRANK(vcpu_park);
965 vcpu_pause_nosync(svc->vcpu);
966 svc->flags |= CSCHED_FLAG_VCPU_PARKED;
967 }
969 /* Lower bound on credits */
970 if ( credit < -CSCHED_CREDITS_PER_TSLICE )
971 {
972 CSCHED_STAT_CRANK(acct_min_credit);
973 credit = -CSCHED_CREDITS_PER_TSLICE;
974 atomic_set(&svc->credit, credit);
975 }
976 }
977 else
978 {
979 svc->pri = CSCHED_PRI_TS_UNDER;
981 /* Unpark any capped domains whose credits go positive */
982 if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
983 {
984 /*
985 * It's important to unset the flag AFTER the unpause()
986 * call to make sure the VCPU's priority is not boosted
987 * if it is woken up here.
988 */
989 CSCHED_STAT_CRANK(vcpu_unpark);
990 vcpu_unpause(svc->vcpu);
991 svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
992 }
994 /* Upper bound on credits means VCPU stops earning */
995 if ( credit > CSCHED_CREDITS_PER_TSLICE )
996 {
997 __csched_vcpu_acct_stop_locked(svc);
998 credit = 0;
999 atomic_set(&svc->credit, credit);
1000 }
1001 }
1003 CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
1004 CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
1005 credit_balance += credit;
1006 }
1007 }
1009 csched_priv.credit_balance = credit_balance;
1011 spin_unlock_irqrestore(&csched_priv.lock, flags);
1013 /* Inform each CPU that its runq needs to be sorted */
1014 csched_priv.runq_sort++;
1015 }
1017 static void
1018 csched_tick(void *_cpu)
1019 {
1020 unsigned int cpu = (unsigned long)_cpu;
1021 struct csched_pcpu *spc = CSCHED_PCPU(cpu);
1023 spc->tick++;
1025 /*
1026 * Accounting for running VCPU
1027 */
1028 if ( !is_idle_vcpu(current) )
1029 csched_vcpu_acct(cpu);
1031 /*
1032 * Host-wide accounting duty
1034 * Note: Currently, this is always done by the master boot CPU. Eventually,
1035 * we could distribute or at the very least cycle the duty.
1036 */
1037 if ( (csched_priv.master == cpu) &&
1038 (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
1039 {
1040 csched_acct();
1041 }
1043 /*
1044 * Check if runq needs to be sorted
1046 * Every physical CPU resorts the runq after the accounting master has
1047 * modified priorities. This is a special O(n) sort and runs at most
1048 * once per accounting period (currently 30 milliseconds).
1049 */
1050 csched_runq_sort(cpu);
1052 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1053 }
1055 static struct csched_vcpu *
1056 csched_runq_steal(int peer_cpu, int cpu, int pri)
1057 {
1058 const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
1059 const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
1060 struct csched_vcpu *speer;
1061 struct list_head *iter;
1062 struct vcpu *vc;
1064 /*
1065 * Don't steal from an idle CPU's runq because it's about to
1066 * pick up that work itself.
1067 */
1068 if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
1069 {
1070 list_for_each( iter, &peer_pcpu->runq )
1071 {
1072 speer = __runq_elem(iter);
1074 /*
1075 * If next available VCPU here is not of strictly higher
1076 * priority than ours, this PCPU is useless to us.
1077 */
1078 if ( speer->pri <= pri )
1079 break;
1081 /* Is this VCPU runnable on our PCPU? */
1082 vc = speer->vcpu;
1083 BUG_ON( is_idle_vcpu(vc) );
1085 if (__csched_vcpu_is_migrateable(vc, cpu))
1086 {
1087 /* We got a candidate. Grab it! */
1088 CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
1089 CSCHED_STAT_CRANK(migrate_queued);
1090 __runq_remove(speer);
1091 vc->processor = cpu;
1092 return speer;
1093 }
1094 }
1095 }
1097 CSCHED_STAT_CRANK(steal_peer_idle);
1098 return NULL;
1099 }
1101 static struct csched_vcpu *
1102 csched_load_balance(int cpu, struct csched_vcpu *snext)
1103 {
1104 struct csched_vcpu *speer;
1105 cpumask_t workers;
1106 int peer_cpu;
1108 BUG_ON( cpu != snext->vcpu->processor );
1110 if ( snext->pri == CSCHED_PRI_IDLE )
1111 CSCHED_STAT_CRANK(load_balance_idle);
1112 else if ( snext->pri == CSCHED_PRI_TS_OVER )
1113 CSCHED_STAT_CRANK(load_balance_over);
1114 else
1115 CSCHED_STAT_CRANK(load_balance_other);
1117 /*
1118 * Peek at non-idling CPUs in the system, starting with our
1119 * immediate neighbour.
1120 */
1121 cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
1122 cpu_clear(cpu, workers);
1123 peer_cpu = cpu;
1125 while ( !cpus_empty(workers) )
1126 {
1127 peer_cpu = __cycle_cpu(peer_cpu, &workers);
1128 cpu_clear(peer_cpu, workers);
1130 /*
1131 * Get ahold of the scheduler lock for this peer CPU.
1133 * Note: We don't spin on this lock but simply try it. Spinning could
1134 * cause a deadlock if the peer CPU is also load balancing and trying
1135 * to lock this CPU.
1136 */
1137 if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
1138 {
1139 CSCHED_STAT_CRANK(steal_trylock_failed);
1140 continue;
1141 }
1143 /*
1144 * Any work over there to steal?
1145 */
1146 speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
1147 spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
1148 if ( speer != NULL )
1149 return speer;
1150 }
1152 /* Failed to find more important work elsewhere... */
1153 __runq_remove(snext);
1154 return snext;
1155 }
1157 /*
1158 * This function is in the critical path. It is designed to be simple and
1159 * fast for the common case.
1160 */
1161 static struct task_slice
1162 csched_schedule(s_time_t now)
1163 {
1164 const int cpu = smp_processor_id();
1165 struct list_head * const runq = RUNQ(cpu);
1166 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1167 struct csched_vcpu *snext;
1168 struct task_slice ret;
1170 CSCHED_STAT_CRANK(schedule);
1171 CSCHED_VCPU_CHECK(current);
1173 /*
1174 * Select next runnable local VCPU (ie top of local runq)
1175 */
1176 if ( vcpu_runnable(current) )
1177 __runq_insert(cpu, scurr);
1178 else
1179 BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
1181 snext = __runq_elem(runq->next);
1183 /*
1184 * SMP Load balance:
1186 * If the next highest priority local runnable VCPU has already eaten
1187 * through its credits, look on other PCPUs to see if we have more
1188 * urgent work... If not, csched_load_balance() will return snext, but
1189 * already removed from the runq.
1190 */
1191 if ( snext->pri > CSCHED_PRI_TS_OVER )
1192 __runq_remove(snext);
1193 else
1194 snext = csched_load_balance(cpu, snext);
1196 /*
1197 * Update idlers mask if necessary. When we're idling, other CPUs
1198 * will tickle us when they get extra work.
1199 */
1200 if ( snext->pri == CSCHED_PRI_IDLE )
1201 {
1202 if ( !cpu_isset(cpu, csched_priv.idlers) )
1203 cpu_set(cpu, csched_priv.idlers);
1204 }
1205 else if ( cpu_isset(cpu, csched_priv.idlers) )
1206 {
1207 cpu_clear(cpu, csched_priv.idlers);
1208 }
1210 /*
1211 * Return task to run next...
1212 */
1213 ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
1214 ret.task = snext->vcpu;
1216 CSCHED_VCPU_CHECK(ret.task);
1217 return ret;
1218 }
1220 static void
1221 csched_dump_vcpu(struct csched_vcpu *svc)
1222 {
1223 struct csched_dom * const sdom = svc->sdom;
1225 printk("[%i.%i] pri=%i flags=%x cpu=%i",
1226 svc->vcpu->domain->domain_id,
1227 svc->vcpu->vcpu_id,
1228 svc->pri,
1229 svc->flags,
1230 svc->vcpu->processor);
1232 if ( sdom )
1233 {
1234 printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
1235 #ifdef CSCHED_STATS
1236 printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
1237 svc->stats.credit_last,
1238 svc->stats.credit_incr,
1239 svc->stats.state_active,
1240 svc->stats.state_idle,
1241 svc->stats.migrate_q,
1242 svc->stats.migrate_r);
1243 #endif
1244 }
1246 printk("\n");
1247 }
1249 static void
1250 csched_dump_pcpu(int cpu)
1251 {
1252 struct list_head *runq, *iter;
1253 struct csched_pcpu *spc;
1254 struct csched_vcpu *svc;
1255 int loop;
1257 spc = CSCHED_PCPU(cpu);
1258 runq = &spc->runq;
1260 printk(" sort=%d, sibling=0x%lx, core=0x%lx\n",
1261 spc->runq_sort_last,
1262 cpu_sibling_map[cpu].bits[0],
1263 cpu_core_map[cpu].bits[0]);
1265 /* current VCPU */
1266 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
1267 if ( svc )
1268 {
1269 printk("\trun: ");
1270 csched_dump_vcpu(svc);
1271 }
1273 loop = 0;
1274 list_for_each( iter, runq )
1275 {
1276 svc = __runq_elem(iter);
1277 if ( svc )
1278 {
1279 printk("\t%3d: ", ++loop);
1280 csched_dump_vcpu(svc);
1281 }
1282 }
1283 }
1285 static void
1286 csched_dump(void)
1287 {
1288 struct list_head *iter_sdom, *iter_svc;
1289 int loop;
1291 printk("info:\n"
1292 "\tncpus = %u\n"
1293 "\tmaster = %u\n"
1294 "\tcredit = %u\n"
1295 "\tcredit balance = %d\n"
1296 "\tweight = %u\n"
1297 "\trunq_sort = %u\n"
1298 "\tdefault-weight = %d\n"
1299 "\tmsecs per tick = %dms\n"
1300 "\tcredits per tick = %d\n"
1301 "\tticks per tslice = %d\n"
1302 "\tticks per acct = %d\n",
1303 csched_priv.ncpus,
1304 csched_priv.master,
1305 csched_priv.credit,
1306 csched_priv.credit_balance,
1307 csched_priv.weight,
1308 csched_priv.runq_sort,
1309 CSCHED_DEFAULT_WEIGHT,
1310 CSCHED_MSECS_PER_TICK,
1311 CSCHED_CREDITS_PER_TICK,
1312 CSCHED_TICKS_PER_TSLICE,
1313 CSCHED_TICKS_PER_ACCT);
1315 printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
1317 CSCHED_STATS_PRINTK();
1319 printk("active vcpus:\n");
1320 loop = 0;
1321 list_for_each( iter_sdom, &csched_priv.active_sdom )
1322 {
1323 struct csched_dom *sdom;
1324 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
1326 list_for_each( iter_svc, &sdom->active_vcpu )
1327 {
1328 struct csched_vcpu *svc;
1329 svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
1331 printk("\t%3d: ", ++loop);
1332 csched_dump_vcpu(svc);
1333 }
1334 }
1335 }
1337 static void
1338 csched_init(void)
1339 {
1340 spin_lock_init(&csched_priv.lock);
1341 INIT_LIST_HEAD(&csched_priv.active_sdom);
1342 csched_priv.ncpus = 0;
1343 csched_priv.master = UINT_MAX;
1344 cpus_clear(csched_priv.idlers);
1345 csched_priv.weight = 0U;
1346 csched_priv.credit = 0U;
1347 csched_priv.credit_balance = 0;
1348 csched_priv.runq_sort = 0U;
1349 CSCHED_STATS_RESET();
1350 }
1352 /* Tickers cannot be kicked until SMP subsystem is alive. */
1353 static __init int csched_start_tickers(void)
1354 {
1355 struct csched_pcpu *spc;
1356 unsigned int cpu;
1358 /* Is the credit scheduler initialised? */
1359 if ( csched_priv.ncpus == 0 )
1360 return 0;
1362 for_each_online_cpu ( cpu )
1363 {
1364 spc = CSCHED_PCPU(cpu);
1365 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1366 }
1368 return 0;
1369 }
1370 __initcall(csched_start_tickers);
1373 struct scheduler sched_credit_def = {
1374 .name = "SMP Credit Scheduler",
1375 .opt_name = "credit",
1376 .sched_id = XEN_SCHEDULER_CREDIT,
1378 .init_domain = csched_dom_init,
1379 .destroy_domain = csched_dom_destroy,
1381 .init_vcpu = csched_vcpu_init,
1382 .destroy_vcpu = csched_vcpu_destroy,
1384 .sleep = csched_vcpu_sleep,
1385 .wake = csched_vcpu_wake,
1387 .adjust = csched_dom_cntl,
1389 .pick_cpu = csched_cpu_pick,
1390 .do_schedule = csched_schedule,
1392 .dump_cpu_state = csched_dump_pcpu,
1393 .dump_settings = csched_dump,
1394 .init = csched_init,
1395 };