ia64/xen-unstable

view xen/common/schedule.c @ 6538:84ee014ebd41

Merge xen-vtx-unstable.hg
author adsharma@los-vmm.sc.intel.com
date Wed Aug 17 12:34:38 2005 -0800 (2005-08-17)
parents 23979fb12c49 3b0ce44f7b7a
children 99914b54f7bf
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <public/sched_ctl.h>

/* opt_sched: scheduler - default to SEDF */
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);

#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

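/*
 * Illustration (editor's note, not part of the original file): SCHED_OP()
 * lets callers invoke a scheduler hook without first checking whether the
 * active scheduler implements it.  A call such as
 *
 *     SCHED_OP(sleep, v);
 *
 * expands, roughly, to
 *
 *     ( (ops.sleep != NULL) ? ops.sleep(v)
 *                           : (typeof(ops.sleep(v)))0 );
 *
 * so a hook left NULL by a scheduler simply yields a zero of the hook's
 * return type.
 */
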
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

void free_domain_struct(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        if ( d->vcpu[i] )
            arch_free_vcpu_struct(d->vcpu[i]);

    xfree(d);
}

struct vcpu *alloc_vcpu_struct(
    struct domain *d, unsigned long vcpu)
{
    struct vcpu *v, *vc;

    ASSERT( d->vcpu[vcpu] == NULL );

    if ( (v = arch_alloc_vcpu_struct()) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    d->vcpu[vcpu] = v;
    v->domain = d;
    v->vcpu_id = vcpu;

    if ( SCHED_OP(alloc_task, v) < 0 )
        goto out;

    if ( vcpu != 0 )
    {
        v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];

        for_each_vcpu( d, vc )
        {
            if ( (vc->next_in_list == NULL) ||
                 (vc->next_in_list->vcpu_id > vcpu) )
                break;
        }
        v->next_in_list  = vc->next_in_list;
        vc->next_in_list = v;

        if (test_bit(_VCPUF_cpu_pinned, &vc->vcpu_flags)) {
            v->processor = (vc->processor + 1) % num_online_cpus();
            set_bit(_VCPUF_cpu_pinned, &v->vcpu_flags);
        } else {
            v->processor = (vc->processor + 1) % num_online_cpus();
        }
    }

    return v;

 out:
    d->vcpu[vcpu] = NULL;
    arch_free_vcpu_struct(v);

    return NULL;
}

struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_vcpu_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    xfree(d);
    return NULL;
}

/*
 * Add and remove a domain
 */
void sched_add_domain(struct vcpu *v)
{
    struct domain *d = v->domain;

    /* Initialise the per-domain timer. */
    init_ac_timer(&v->timer, dom_timer_fn, v, v->processor);

    if ( is_idle_task(d) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }
    else
    {
        /* Must be unpaused by control software to start execution. */
        set_bit(_VCPUF_ctrl_pause, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, d->domain_id, v->vcpu_id);
}

void sched_rem_domain(struct vcpu *v)
{
    rem_ac_timer(&v->timer);
    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}

void domain_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(!domain_runnable(v)) )
        SCHED_OP(sleep, v);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void domain_sleep_sync(struct vcpu *v)
{
    domain_sleep_nosync(v);

    while ( test_bit(_VCPUF_running, &v->vcpu_flags) && !domain_runnable(v) )
        cpu_relax();

    if ( cpu_isset(v->processor, v->domain->cpumask) )
        sync_lazy_execstate_cpu(v->processor);
}

void domain_wake(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(domain_runnable(v)) )
    {
        SCHED_OP(wake, v);
#ifdef WAKE_HISTO
        v->wokenup = NOW();
#endif
    }
    clear_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

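/*
 * Illustration (editor's note, not part of the original file): a guest's
 * idle loop typically looks, in outline, like
 *
 *     while ( no work pending )
 *         issue the SCHEDOP_block hypercall;    (which reaches do_block())
 *
 * The ordering in do_block() -- unmask event delivery and set _VCPUF_blocked
 * before testing event_pending() -- means an event that arrives in that
 * window is still noticed and the block is abandoned rather than slept
 * through, which is the wakeup race the comment above refers to.
 */
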
/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

/* Mark target vcpu as non-runnable so it is not scheduled. */
static long do_vcpu_down(int vcpu)
{
    struct vcpu *target;

    if ( vcpu >= MAX_VIRT_CPUS )   /* d->vcpu[] has MAX_VIRT_CPUS entries */
        return -EINVAL;

    target = current->domain->vcpu[vcpu];
    if ( target == NULL )
        return -ESRCH;
    set_bit(_VCPUF_down, &target->vcpu_flags);

    return 0;
}

/* Mark target vcpu as runnable and wake it. */
static long do_vcpu_up(int vcpu)
{
    struct vcpu *target;

    if ( vcpu >= MAX_VIRT_CPUS )   /* same bound check as do_vcpu_down() */
        return -EINVAL;

    target = current->domain->vcpu[vcpu];
    if ( target == NULL )
        return -ESRCH;
    clear_bit(_VCPUF_down, &target->vcpu_flags);
    /* wake vcpu */
    domain_wake(target);

    return 0;
}

/*
 * Demultiplex scheduler-related hypercalls.
 */
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 (op >> SCHEDOP_reasonshift));
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    case SCHEDOP_vcpu_down:
    {
        ret = do_vcpu_down((int)(op >> SCHEDOP_vcpushift));
        break;
    }

    case SCHEDOP_vcpu_up:
    {
        ret = do_vcpu_up((int)(op >> SCHEDOP_vcpushift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

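/*
 * Illustration (editor's note, not part of the original file): the single
 * 'op' word carries both the command and its argument.  The command sits in
 * the bits selected by SCHEDOP_cmdmask; a shutdown reason or a VCPU number
 * is packed into the upper bits.  For example, a guest bringing its VCPU 1
 * online would pass something like
 *
 *     op = SCHEDOP_vcpu_up | (1UL << SCHEDOP_vcpushift);
 *
 * and a shutdown request carries its reason code as
 *
 *     op = SCHEDOP_shutdown | (reason << SCHEDOP_reasonshift);
 *
 * (constants come from the public interface headers; the snippets are shown
 * only to illustrate the decoding performed above).
 */
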
/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        rem_ac_timer(&v->timer);
    else
        set_ac_timer(&v->timer, timeout);

    return 0;
}

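/*
 * Illustration (editor's note, not part of the original file): the timeout
 * is an absolute system time, not a delta.  Conceptually, a guest wanting a
 * timer interrupt about 10ms from now asks for
 *
 *     do_set_timer_op(NOW() + MILLISECS(10));
 *
 * and dom_timer_fn() below delivers VIRQ_TIMER when that deadline passes;
 * a timeout of 0 simply cancels any pending one-shot timer.
 */
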
/** sched_id - fetch ID of current scheduler */
int sched_id()
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;
    int cpu;
#if NR_CPUS <= 32
    unsigned long have_lock;
#else
    unsigned long long have_lock;
#endif
    int succ;

#define __set_cpu_bit(cpu, data)  data |= ((typeof(data))1) << cpu
#define __get_cpu_bit(cpu, data)  (data & ((typeof(data))1) << cpu)
#define __clear_cpu_bits(data)    data = ((typeof(data))0)

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /* Acquire the schedule_lock of every CPU on which a VCPU of this
     * domain currently runs. */
    do {
        succ = 0;
        __clear_cpu_bits(have_lock);
        for_each_vcpu(d, v) {
            cpu = v->processor;
            if (!__get_cpu_bit(cpu, have_lock)) {
                /* We don't yet hold the lock for this CPU: try to take it. */
                if (spin_trylock(&schedule_data[cpu].schedule_lock)) {
                    /* We got this lock. */
                    __set_cpu_bit(cpu, have_lock);
                    succ = 1;
                } else {
                    /* We didn't get this lock -> release all other locks too. */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (__get_cpu_bit(cpu, have_lock))
                            spin_unlock(&schedule_data[cpu].schedule_lock);
                    /* ... and start again from the beginning. */
                    succ = 0;
                    /* Leave the for_each_vcpu() loop. */
                    break;
                }
            }
        }
    } while (!succ);
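    /*
     * Editor's note (not part of the original file): the loop above is an
     * all-or-nothing trylock.  Taking several schedule_locks with blocking
     * spin_lock() calls in arbitrary per-VCPU order could deadlock against
     * another CPU acquiring the same locks in a different order, so on any
     * failed spin_trylock() every lock already held is dropped and the whole
     * acquisition is retried from scratch.
     */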
    //spin_lock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    //spin_unlock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock);
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (__get_cpu_bit(cpu, have_lock))
            spin_unlock(&schedule_data[cpu].schedule_lock);
    __clear_cpu_bits(have_lock);

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
    put_domain(d);
    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
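/*
 * Editor's note (not part of the original file): __enter_scheduler() is the
 * SCHEDULE_SOFTIRQ handler (registered in scheduler_init() below), so it
 * runs whenever that softirq is raised -- for example by s_timer_fn() when
 * the current slice expires -- and it is also called directly from
 * do_block() and do_yield() above.
 */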
static void __enter_scheduler(void)
{
    struct vcpu *prev = current, *next = NULL;
    int cpu = prev->processor;
    s_time_t now;
    struct task_slice next_slice;
    s32 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    clear_bit(_VCPUF_running, &prev->vcpu_flags);
    set_bit(_VCPUF_running, &next->vcpu_flags);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next->domain) && next->wokenup ) {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
        else schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next->domain) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
        else schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    context_switch_finalise(next);
}

/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct vcpu *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}

/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu *v = current;
    unsigned int cpu = v->processor;

    schedule_data[cpu].tick++;

    if ( !is_idle_task(v->domain) )
    {
        update_dom_time(v);
        send_guest_virq(v, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    set_ac_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_dom_time(v);
    send_guest_virq(v, VIRQ_TIMER);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_ac_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_ac_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    schedule_data[0].curr = idle_task[0];
    schedule_data[0].idle = idle_task[0];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0);
    sched_add_domain(idle_task[0]);
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

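/*
 * Editor's note (not part of the original file): the direct t_timer_fn(0)
 * call arms the periodic tick on the boot CPU, while smp_call_function()
 * runs t_timer_fn() once on every other online CPU; each invocation ends by
 * re-arming that CPU's t_timer[] entry 10ms into the future, which keeps the
 * tick running from then on.
 */
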
void dump_runq(unsigned char key)
{
    s_time_t now = NOW();
    int i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}

#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for_each_online_cpu ( k )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < NR_CPUS; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */