view xen/common/sched_credit.c @ 10734:9b7e1ea4c4d2

[HVM] Sync p2m table across all vcpus on x86_32p xen.
We found that VGA acceleration cannot work for SMP VMX guests on x86_32p
Xen. This is caused by the way the p2m table is constructed today: only the
1st L2 page table slot that maps p2m table pages is copied to the monitor
page tables of the non-vcpu0 vcpus when the VMX guest is created. But VGA
acceleration creates some p2m table entries beyond the 1st L2 page table
slot after the HVM guest is created, so only vcpu0 sees these p2m entries
and the other vcpus cannot do VGA acceleration.

Signed-off-by: Xin Li <xin.b.li@intel.com>
author kfraser@localhost.localdomain
date Wed Jul 26 11:34:12 2006 +0100 (2006-07-26)
parents 64f9f308e109
children 556022fb8eb6

/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>


/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS

/*
 * Basic constants
 */
#define CSCHED_TICK             10      /* milliseconds */
#define CSCHED_TSLICE           30      /* milliseconds */
#define CSCHED_ACCT_NTICKS      3
#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
#define CSCHED_DEFAULT_WEIGHT   256

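/*
 * Example of the derived value above: with a 10ms tick and
 * CSCHED_ACCT_NTICKS == 3, CSCHED_ACCT_PERIOD works out to 3 * 10 = 30ms,
 * i.e. credits are redistributed once per 30ms, which is also the length
 * of one CSCHED_TSLICE.
 */
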
/*
 * Priorities
 */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     ((struct csched_pcpu *)schedule_data[_c].sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(dom_destroy)                     \
    _MACRO(schedule)                        \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(acct_vcpu_credit_min)

#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    _MACRO(vcpu_migrate)                                \
    _MACRO(load_balance_idle)                           \
    _MACRO(load_balance_over)                           \
    _MACRO(load_balance_other)                          \
    _MACRO(steal_trylock_failed)                        \
    _MACRO(steal_peer_down)                             \
    _MACRO(steal_peer_idle)                             \
    _MACRO(steal_peer_running)                          \
    _MACRO(steal_peer_pinned)                           \
    _MACRO(tickle_idlers_none)                          \
    _MACRO(tickle_idlers_some)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                       \
    struct                                          \
    {                                               \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE)     \
    } stats

#define CSCHED_STATS_PRINTK()                       \
    do                                              \
    {                                               \
        printk("stats:\n");                         \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK)     \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()    do {} while ( 0 )
#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )

#endif /* CSCHED_STATS */

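/*
 * The macros above use the "X-macro" pattern: CSCHED_STATS_EXPAND() applies
 * its argument macro once per counter name, so a single list of names yields
 * both the counter fields and the printout code.  For example,
 * CSCHED_STATS_DEFINE() expands (approximately) to:
 *
 *     struct
 *     {
 *         uint32_t vcpu_init;
 *         uint32_t vcpu_sleep;
 *         ...
 *         uint32_t tickle_idlers_some;
 *     } stats;
 *
 * and CSCHED_STATS_PRINTK() emits one formatted line per field.
 */
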
/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    int credit_last;
    uint32_t credit_incr;
    uint32_t state_active;
    uint32_t state_idle;
    int16_t pri;
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE();
};


/*
 * Global variables
 */
static struct csched_private csched_priv;


static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

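/*
 * Note on __runq_insert(): the walk stops at the first entry whose priority
 * is strictly lower than the new VCPU's, and list_add_tail() then links the
 * new VCPU in just before that entry.  The run queue therefore stays sorted
 * by descending priority, with a newly inserted VCPU queued behind existing
 * VCPUs of equal priority (round-robin among equals).
 */
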
static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur = CSCHED_VCPU(schedule_data[cpu].curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static void
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_ACCT_PERIOD;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    BUG_ON( spc == NULL );
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    schedule_data[cpu].sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(schedule_data[cpu].curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
    {
        CSCHED_STAT_CRANK(steal_peer_running);
        return 0;
    }

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    return 1;
}

static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    /* Update credits */
    atomic_sub(credit_dec, &svc->credit);

    /* Put this VCPU and domain back on the active list if it was idling */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( list_empty(&svc->active_vcpu_elem) )
        {
            CSCHED_STAT_CRANK(acct_vcpu_active);
            svc->state_active++;

            sdom->active_vcpu_count++;
            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
            if ( list_empty(&sdom->active_sdom_elem) )
            {
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
                csched_priv.weight += sdom->weight;
            }
        }

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }
}

static inline void
__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_STAT_CRANK(acct_vcpu_idle);
    svc->state_idle++;

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }

    atomic_set(&svc->credit, 0);
}

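/*
 * Active-list bookkeeping: csched_vcpu_acct() lazily re-activates a VCPU
 * (and, if needed, its domain) the first time it is charged for CPU time,
 * using a double-checked list_empty() test so the global lock is only taken
 * on the idle-to-active transition.  __csched_vcpu_acct_idle_locked() is the
 * inverse: it drops the VCPU (and possibly its domain) from the active lists
 * and zeroes its credit, and must be called with csched_priv.lock held.
 */
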
static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom;
    struct csched_vcpu *svc;
    int16_t pri;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate, if appropriate, per-domain info */
    if ( is_idle_vcpu(vc) )
    {
        sdom = NULL;
        pri = CSCHED_PRI_IDLE;
    }
    else if ( CSCHED_DOM(dom) )
    {
        sdom = CSCHED_DOM(dom);
        pri = CSCHED_PRI_TS_UNDER;
    }
    else
    {
        sdom = xmalloc(struct csched_dom);
        if ( !sdom )
            return -1;

        /* Initialize credit and weight */
        INIT_LIST_HEAD(&sdom->active_vcpu);
        sdom->active_vcpu_count = 0;
        INIT_LIST_HEAD(&sdom->active_sdom_elem);
        sdom->dom = dom;
        sdom->weight = CSCHED_DEFAULT_WEIGHT;
        sdom->cap = 0U;
        dom->sched_priv = sdom;
        pri = CSCHED_PRI_TS_UNDER;
    }

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( !svc )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->credit_last = 0;
    svc->credit_incr = 0U;
    svc->state_active = 0U;
    svc->state_idle = 0U;
    svc->pri = pri;
    vc->sched_priv = svc;

    CSCHED_VCPU_CHECK(vc);

    /* Attach fair-share VCPUs to the accounting list */
    if ( likely(sdom != NULL) )
        csched_vcpu_acct(svc, 0);

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
        csched_pcpu_init(vc->processor);

    CSCHED_VCPU_CHECK(vc);

    return 0;
}

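/*
 * Note that csched_vcpu_init() doubles as the allocation point for the
 * per-domain and per-PCPU structures: the csched_dom is created when a
 * domain's first VCPU is initialized, and csched_pcpu_init() runs the
 * first time a VCPU is placed on a physical CPU this scheduler has not
 * seen before.  The ops table at the bottom of this file has no separate
 * per-domain or per-PCPU init hook.
 */
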
static void
csched_vcpu_free(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_idle_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( schedule_data[vc->processor].curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(schedule_data[cpu].curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static int
csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
{
    unsigned long flags;
    int lcpu;

    if ( vc == current )
    {
        /* No locking needed but also can't move on the spot... */
        if ( !cpu_isset(vc->processor, *affinity) )
            return -EBUSY;

        vc->cpu_affinity = *affinity;
    }
    else
    {
        /* Pause, modify, and unpause. */
        vcpu_pause(vc);

        vc->cpu_affinity = *affinity;
        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
        {
            /*
             * We must grab the scheduler lock for the CPU currently owning
             * this VCPU before changing its ownership.
             */
            vcpu_schedule_lock_irqsave(vc, flags);
            lcpu = vc->processor;

            vc->processor = first_cpu(vc->cpu_affinity);

            spin_unlock_irqrestore(&schedule_data[lcpu].schedule_lock, flags);
        }

        vcpu_unpause(vc);
    }

    return 0;
}

static int
csched_dom_cntl(
    struct domain *d,
    struct sched_adjdom_cmd *cmd)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( cmd->direction == SCHED_INFO_GET )
    {
        cmd->u.credit.weight = sdom->weight;
        cmd->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT( cmd->direction == SCHED_INFO_PUT );

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( cmd->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += cmd->u.credit.weight;
            }
            sdom->weight = cmd->u.credit.weight;
        }

        if ( cmd->u.credit.cap != (uint16_t)~0U )
            sdom->cap = cmd->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}

static void
csched_dom_destroy(struct domain *dom)
{
    struct csched_dom * const sdom = CSCHED_DOM(dom);
    int i;

    CSCHED_STAT_CRANK(dom_destroy);

    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
    {
        if ( dom->vcpu[i] )
            csched_vcpu_free(dom->vcpu[i]);
    }

    xfree(sdom);
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&schedule_data[cpu].schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri == CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&schedule_data[cpu].schedule_lock, flags);
}

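/*
 * Example of the sort above: a runq of [O1, U1, O2, U2] (O = OVER,
 * U = UNDER) is rewritten in a single pass to [U1, U2, O1, O2].  Each
 * UNDER found after an OVER is unlinked and re-linked right after the
 * last UNDER seen so far, so the relative order within each priority
 * class is preserved.
 */
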
static void
csched_acct(void)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    int credit_balance;
    int credit_xtra;
    int credit;


    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }
        if ( sdom->cap != 0U )
        {
            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

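        /*
         * Worked example for the formulas above (illustrative numbers):
         * two active domains with weights 256 and 512 on a single CPU give
         * weight_total = 768 and credit_total = 30 (one CSCHED_ACCT_PERIOD).
         * The weight-256 domain's fair share is (30 * 256 + 767) / 768 = 10
         * credits and the weight-512 domain's is (30 * 512 + 767) / 768 = 20
         * credits, i.e. a 1:2 split; the "+ (weight_total - 1)" term simply
         * makes the division round up.  With one active VCPU per domain this
         * is also the per-VCPU increment applied below.
         */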
        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                if ( sdom->cap == 0U )
                    svc->pri = CSCHED_PRI_TS_OVER;
                else
                    svc->pri = CSCHED_PRI_TS_PARKED;

                if ( credit < -CSCHED_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                if ( credit > CSCHED_TSLICE )
                    __csched_vcpu_acct_idle_locked(svc);
            }

            svc->credit_last = credit;
            svc->credit_incr = credit_fair;
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}

static void
csched_tick(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
    struct csched_dom * const sdom = svc->sdom;

    /*
     * Accounting for running VCPU
     *
     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
     */
    if ( likely(sdom != NULL) )
    {
        csched_vcpu_acct(svc, CSCHED_TICK);
    }

    /*
     * Accounting duty
     *
     * Note: Currently, this is always done by the master boot CPU. Eventually,
     * we could distribute or at the very least cycle the duty.
     */
    if ( (csched_priv.master == cpu) &&
         (schedule_data[cpu].tick % CSCHED_ACCT_NTICKS) == 0 )
    {
        csched_acct();
    }

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);
}

static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
    struct list_head *iter;
    struct csched_vcpu *speer;
    struct vcpu *vc;

    list_for_each( iter, &spc->runq )
    {
        speer = __runq_elem(iter);

        /*
         * If next available VCPU here is not of higher priority than ours,
         * this PCPU is useless to us.
         */
        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
        {
            CSCHED_STAT_CRANK(steal_peer_idle);
            break;
        }

        /* Is this VCPU runnable on our PCPU? */
        vc = speer->vcpu;
        BUG_ON( is_idle_vcpu(vc) );

        if ( __csched_vcpu_is_stealable(cpu, vc) )
        {
            /* We got a candidate. Grab it! */
            __runq_remove(speer);
            vc->processor = cpu;

            return speer;
        }
    }

    return NULL;
}

static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_pcpu *spc;
    struct csched_vcpu *speer;
    int peer_cpu;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    peer_cpu = cpu;
    BUG_ON( peer_cpu != snext->vcpu->processor );

    while ( 1 )
    {
        /* For each PCPU in the system starting with our neighbour... */
        peer_cpu = (peer_cpu + 1) % csched_priv.ncpus;
        if ( peer_cpu == cpu )
            break;

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( spin_trylock(&schedule_data[peer_cpu].schedule_lock) )
        {
            spc = CSCHED_PCPU(peer_cpu);
            if ( unlikely(spc == NULL) )
            {
                CSCHED_STAT_CRANK(steal_peer_down);
                speer = NULL;
            }
            else
            {
                speer = csched_runq_steal(spc, cpu, snext->pri);
            }

            spin_unlock(&schedule_data[peer_cpu].schedule_lock);

            /* Got one! */
            if ( speer )
            {
                CSCHED_STAT_CRANK(vcpu_migrate);
                return speer;
            }
        }
        else
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
        }
    }

    /* Failed to find more important work */
    __runq_remove(snext);
    return snext;
}

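/*
 * The balancing pass above scans peer CPUs in a fixed ring order, starting
 * with the numerically next CPU, and gives up on any peer whose scheduler
 * lock it cannot take immediately.  In the worst case it visits every other
 * CPU once per invocation before settling for snext.
 */
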
/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    /*
     * Return task to run next...
     */
    ret.time = MILLISECS(CSCHED_TSLICE);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);

    return ret;
}

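/*
 * To summarize the decision above: a still-runnable current VCPU goes back
 * on the runq, the head of the runq becomes the candidate, and only a
 * candidate that is not UNDER (i.e. out of credits, parked, or the idle
 * VCPU) triggers the work-stealing pass over other CPUs.  Whatever is
 * chosen is given a full CSCHED_TSLICE (30ms), although it may block or be
 * preempted by a tickle before that expires.
 */
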
static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i cpu=%i",
           svc->vcpu->domain->domain_id,
           svc->vcpu->vcpu_id,
           svc->pri,
           svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
               atomic_read(&svc->credit),
               svc->credit_last,
               svc->credit_incr,
               svc->state_active,
               svc->state_idle,
               sdom->weight);
    }

    printk("\n");
}

static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop;

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    printk(" tick=%lu, sort=%d\n",
           schedule_data[cpu].tick,
           spc->runq_sort_last);

    /* current VCPU */
    svc = CSCHED_VCPU(schedule_data[cpu].curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop;

    printk("info:\n"
           "\tncpus = %u\n"
           "\tmaster = %u\n"
           "\tcredit = %u\n"
           "\tcredit balance = %d\n"
           "\tweight = %u\n"
           "\trunq_sort = %u\n"
           "\ttick = %dms\n"
           "\ttslice = %dms\n"
           "\taccounting period = %dms\n"
           "\tdefault-weight = %d\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_TICK,
           CSCHED_TSLICE,
           CSCHED_ACCT_PERIOD,
           CSCHED_DEFAULT_WEIGHT);

    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);

    CSCHED_STATS_PRINTK();

    printk("active vcpus:\n");
    loop = 0;
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
    CSCHED_STATS_RESET();
}

struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = SCHED_CREDIT,

    .init_vcpu      = csched_vcpu_init,
    .destroy_domain = csched_dom_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .set_affinity   = csched_vcpu_set_affinity,

    .adjdom         = csched_dom_cntl,

    .tick           = csched_tick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
};
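
/*
 * sched_credit_def is this scheduler's entry point into Xen's generic
 * scheduling framework: the common scheduler code invokes these ops rather
 * than calling the functions above directly.  The opt_name string
 * ("credit") is the token an administrator is expected to pass to the
 * hypervisor's scheduler selection boot option to choose this scheduler.
 */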