xen/common/schedule.c @ 7357:d6e99066959a (direct-io.hg)

Refactor domain/vcpu allocation to be more separated.
Signed-off-by: Keir Fraser <keir@xensource.com>

author    kaf24@firebug.cl.cam.ac.uk
date      Wed Oct 12 17:01:38 2005 +0100 (2005-10-12)
parents   52b9aca1916a
children  6d4caa5a2cdb

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <public/sched.h>
#include <public/sched_ctl.h>

extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);
/* opt_sched: scheduler - default to SEDF */
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);

#define TIME_SLOP (s32)MICROSECS(50)   /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
        (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )       \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

void free_domain(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);

    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
        if ( d->vcpu[i] != NULL )
            free_vcpu_struct(d->vcpu[i]);

    xfree(d);
}

struct vcpu *alloc_vcpu(
    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
{
    struct vcpu *v;

    BUG_ON(d->vcpu[vcpu_id] != NULL);

    if ( (v = alloc_vcpu_struct(d, vcpu_id)) == NULL )
        return NULL;

    v->domain = d;
    v->vcpu_id = vcpu_id;
    v->processor = cpu_id;
    atomic_set(&v->pausecnt, 0);
    v->cpumap = CPUMAP_RUNANYWHERE;

    d->vcpu[vcpu_id] = v;

    if ( SCHED_OP(alloc_task, v) < 0 )
    {
        d->vcpu[vcpu_id] = NULL;
        free_vcpu_struct(v);
        return NULL;
    }

    sched_add_domain(v);

    if ( vcpu_id != 0 )
    {
        v->vcpu_info = &d->shared_info->vcpu_data[vcpu_id];
        d->vcpu[v->vcpu_id-1]->next_in_list = v;
        set_bit(_VCPUF_down, &v->vcpu_flags);
    }

    return v;
}

struct domain *alloc_domain(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) != NULL )
        memset(d, 0, sizeof(*d));

    return d;
}

/*
 * Add and remove a domain
 */
void sched_add_domain(struct vcpu *v)
{
    struct domain *d = v->domain;

    /* Initialise the per-domain timer. */
    init_ac_timer(&v->timer, dom_timer_fn, v, v->processor);

    if ( is_idle_task(d) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, d->domain_id, v->vcpu_id);
}

void sched_rem_domain(struct vcpu *v)
{
    rem_ac_timer(&v->timer);
    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(!domain_runnable(v)) )
        SCHED_OP(sleep, v);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    /*
     * We can be sure that the VCPU is finally descheduled after the running
     * flag is cleared and the scheduler lock is released. We also check that
     * the domain continues to be unrunnable, in case someone else wakes it.
     */
    while ( !domain_runnable(v) &&
            (test_bit(_VCPUF_running, &v->vcpu_flags) ||
             spin_is_locked(&schedule_data[v->processor].schedule_lock)) )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(domain_runnable(v)) )
    {
        SCHED_OP(wake, v);
#ifdef WAKE_HISTO
        v->wokenup = NOW();
#endif
    }
    clear_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

long do_sched_op(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown((u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        rem_ac_timer(&v->timer);
    else
        set_ac_timer(&v->timer, timeout);

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;
    int cpu;
#if NR_CPUS <= 32
    unsigned long have_lock;
#else
    unsigned long long have_lock;
#endif
    int succ;

#define __set_cpu_bit(cpu, data) data |= ((typeof(data))1)<<cpu
#define __get_cpu_bit(cpu, data) (data & ((typeof(data))1)<<cpu)
#define __clear_cpu_bits(data) data = ((typeof(data))0)

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /* Acquire locks on all CPUs on which vcpus of this domain run. */
    do {
        succ = 0;
        __clear_cpu_bits(have_lock);
        for_each_vcpu(d, v) {
            cpu = v->processor;
            if (!__get_cpu_bit(cpu, have_lock)) {
                /* If we don't hold a lock on this CPU yet: acquire it. */
                if (spin_trylock(&schedule_data[cpu].schedule_lock)) {
                    /* We have this lock! */
                    __set_cpu_bit(cpu, have_lock);
                    succ = 1;
                } else {
                    /* We didn't get this lock -> free all other locks too! */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (__get_cpu_bit(cpu, have_lock))
                            spin_unlock(&schedule_data[cpu].schedule_lock);
                    /* ... and start again from the beginning. */
                    succ = 0;
                    /* Leave the for_each_vcpu loop. */
                    break;
                }
            }
        }
    } while ( !succ );

    SCHED_OP(adjdom, d, cmd);

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (__get_cpu_bit(cpu, have_lock))
            spin_unlock(&schedule_data[cpu].schedule_lock);
    __clear_cpu_bits(have_lock);

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
    put_domain(d);
    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu *prev = current, *next = NULL;
    int cpu = prev->processor;
    s_time_t now;
    struct task_slice next_slice;
    s32 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    clear_bit(_VCPUF_running, &prev->vcpu_flags);
    set_bit(_VCPUF_running, &next->vcpu_flags);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next->domain) && next->wokenup )
    {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next->domain) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    context_switch_finalise(next);
}

/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct vcpu *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}

/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu *v = current;
    unsigned int cpu = v->processor;

    schedule_data[cpu].tick++;

    if ( !is_idle_task(v->domain) )
    {
        update_dom_time(v);
        send_guest_virq(v, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    set_ac_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_dom_time(v);
    send_guest_virq(v, VIRQ_TIMER);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_ac_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_ac_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    schedule_data[0].curr = idle_task[0];
    schedule_data[0].idle = idle_task[0];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0);
    sched_add_domain(idle_task[0]);
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

void dump_runq(unsigned char key)
{
    s_time_t now = NOW();
    int i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state,i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}

#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for_each_online_cpu ( k )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < NR_CPUS; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */