ia64/xen-unstable

view xen/common/sched_credit.c @ 19835:edfdeb150f27

Fix buildsystem to detect udev > version 124

udev removed the udevinfo symlink from versions higher than 123, and
Xen's build system could not detect whether udev is present and has
the required version.

Signed-off-by: Marc-A. Dahlhaus <mad@wol.de>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 25 13:02:37 2009 +0100 (2009-06-25)
parents 5966b71195b4

/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 * File: common/csched_credit.c
 * Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic per-vCPU counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#ifdef PERF_COUNTERS
#define CSCHED_STATS
#endif


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_TICK     100
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
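
/*
 * With the values above, each tick is 10ms, and a timeslice and an
 * accounting period are both 3 ticks (30ms). A running vCPU is charged
 * 100 credits per tick, i.e. 300 credits per full timeslice or
 * accounting period.
 */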

/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */


/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#define CSCHED_STAT_CRANK(_X)               (perfc_incr(_X))

#ifdef CSCHED_STATS

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */

/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
    struct timer ticker;
    unsigned int tick;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    struct timer master_ticker;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
};

/*
 * Global variables
 */
static struct csched_private csched_priv;

static void csched_tick(void *_cpu);

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

/*
 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
 * implicit overheads such as cache-warming. 1ms (1000) has been measured
 * as a good value.
 */
static unsigned int vcpu_migration_delay;
integer_param("vcpu_migration_delay", vcpu_migration_delay);

void set_vcpu_migration_delay(unsigned int delay)
{
    vcpu_migration_delay = delay;
}

unsigned int get_vcpu_migration_delay(void)
{
    return vcpu_migration_delay;
}
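
/*
 * NOW() is in nanoseconds while vcpu_migration_delay is given in
 * microseconds, hence the multiplication by 1000 in the comparison below.
 */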
static inline int
__csched_vcpu_is_cache_hot(struct vcpu *v)
{
    int hot = ((NOW() - v->last_run_time) <
               ((uint64_t)vcpu_migration_delay * 1000u));

    if ( hot )
        CSCHED_STAT_CRANK(vcpu_hot);

    return hot;
}

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail or hot on
     * peer PCPU. Only pick up work that's allowed to run on our CPU.
     */
    return !vc->is_running &&
           !__csched_vcpu_is_cache_hot(vc) &&
           cpu_isset(dest_cpu, vc->cpu_affinity);
}

static int
csched_cpu_pick(struct vcpu *vc)
{
    cpumask_t cpus;
    cpumask_t idlers;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : cycle_cpu(vc->processor, cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    idlers = csched_priv.idlers;
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt, weight_cpu, weight_nxt;

        nxt = cycle_cpu(cpu, cpus);

        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
        {
            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
        }
        else
        {
            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
        }

        weight_cpu = cpus_weight(cpu_idlers);
        weight_nxt = cpus_weight(nxt_idlers);
        if ( ( (weight_cpu < weight_nxt) ^ sched_smt_power_savings )
             && (weight_cpu != weight_nxt) )
        {
            cpu = nxt;
            cpu_clear(cpu, cpus);
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}

static inline void
__csched_vcpu_acct_start(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            csched_priv.weight += sdom->weight;
        }
    }

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

static inline void
__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}

static void
csched_vcpu_acct(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(svc);
    }
    else if ( csched_cpu_pick(current) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VPF_migrating, &current->pause_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    vc->sched_priv = svc;

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);
    return 0;
}

static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awaking VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}
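
/*
 * Weight and cap adjustments arrive here via the XEN_DOMCTL scheduler
 * operation; they are normally driven from the toolstack in dom0, for
 * example with "xm sched-credit -d <domain> -w <weight> -c <cap>".
 */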
static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    xfree(CSCHED_DOM(dom));
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
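
/*
 * Worked example of the fair-share computation below: on a 2-PCPU host
 * (credit_total = 2 * 300 = 600) with two active domains of weights 256
 * and 512, the domains receive 600*256/768 = 200 and 600*512/768 = 400
 * credits per 30ms accounting period, subject to the per-domain peak
 * (active_vcpu_count * 300) and any configured cap.
 */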
static void
csched_acct(void *dummy)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        goto out;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED )
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;

out:
    set_timer( &csched_priv.master_ticker, NOW() +
               MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
}
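
/*
 * Per-PCPU tick handler: fires every CSCHED_MSECS_PER_TICK (10ms), burns
 * credits of the currently running VCPU and re-sorts the local runq once
 * the accounting master has advanced the sort epoch.
 */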
static void
csched_tick(void *_cpu)
{
    unsigned int cpu = (unsigned long)_cpu;
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);

    spc->tick++;

    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(cpu);

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);

    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
}

static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick up work from it itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if ( __csched_vcpu_is_migrateable(vc, cpu) )
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );

    /* If this CPU is going offline we shouldn't steal work. */
    if ( unlikely(!cpu_online(cpu)) )
        goto out;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = cycle_cpu(peer_cpu, workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
        if ( speer != NULL )
            return speer;
    }

 out:
    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = (is_idle_vcpu(snext->vcpu) ?
                -1 : MILLISECS(CSCHED_MSECS_PER_TSLICE));
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i flags=%x cpu=%i",
           svc->vcpu->domain->domain_id,
           svc->vcpu->vcpu_id,
           svc->pri,
           svc->flags,
           svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
#ifdef CSCHED_STATS
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
               svc->stats.credit_last,
               svc->stats.credit_incr,
               svc->stats.state_active,
               svc->stats.state_idle,
               svc->stats.migrate_q,
               svc->stats.migrate_r);
#endif
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;
    char cpustr[100];

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_sibling_map[cpu]);
    printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), cpu_core_map[cpu]);
    printk("core=%s\n", cpustr);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;
    char idlers_buf[100];

    printk("info:\n"
           "\tncpus = %u\n"
           "\tmaster = %u\n"
           "\tcredit = %u\n"
           "\tcredit balance = %d\n"
           "\tweight = %u\n"
           "\trunq_sort = %u\n"
           "\tdefault-weight = %d\n"
           "\tmsecs per tick = %dms\n"
           "\tcredits per tick = %d\n"
           "\tticks per tslice = %d\n"
           "\tticks per acct = %d\n"
           "\tmigration delay = %uus\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_TICK,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT,
           vcpu_migration_delay);

    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
    printk("idlers: %s\n", idlers_buf);

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
}

/* Tickers cannot be kicked until SMP subsystem is alive. */
static __init int csched_start_tickers(void)
{
    struct csched_pcpu *spc;
    unsigned int cpu;

    /* Is the credit scheduler initialised? */
    if ( csched_priv.ncpus == 0 )
        return 0;

    for_each_online_cpu ( cpu )
    {
        spc = CSCHED_PCPU(cpu);
        set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
    }

    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
                csched_priv.master);

    set_timer( &csched_priv.master_ticker, NOW() +
               MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );

    return 0;
}
__initcall(csched_start_tickers);

static void csched_tick_suspend(void)
{
    struct csched_pcpu *spc;

    spc = CSCHED_PCPU(smp_processor_id());

    stop_timer(&spc->ticker);
}

static void csched_tick_resume(void)
{
    struct csched_pcpu *spc;
    uint64_t now = NOW();

    spc = CSCHED_PCPU(smp_processor_id());

    set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
            - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
}
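
/*
 * Scheduler ops table. The generic scheduler code selects this scheduler
 * when Xen is booted with "sched=credit" (matching .opt_name); credit is
 * the default scheduler in this tree.
 */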
struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,

    .tick_suspend   = csched_tick_suspend,
    .tick_resume    = csched_tick_resume,
};