ia64/xen-unstable

view xen/common/schedule.c @ 4696:e686528abbfc

bitkeeper revision 1.1389.3.1 (42714dabVSywx2XWGjgw2J54ZylwYg)

Ensure block/yield hypercalls always return a sane return code.

Ensure callers of __enter_scheduler take appropriate arch-specific
action if no context switch occurs (callers from arch/x86 do not
expect to return from a call into the scheduler).

This fixes wildly unintuitive behaviour of do_block() for the
VMX team.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Apr 28 20:55:07 2005 +0000 (2005-04-28)
parents a1f760a94785
children 123bd8c4b408
line source
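The changeset description above guarantees that the block and yield hypercalls now return a sane code rather than an undefined value. Before the listing, a minimal caller-side sketch of what that means in practice. This is not Xen or guest-kernel source: the hypercall stub is replaced by a stand-in so the example is self-contained, and the SCHEDOP_* values are assumptions taken from the public interface headers of this era.

#include <stdio.h>

#define SCHEDOP_yield 0   /* assumed value, per the public headers */
#define SCHEDOP_block 1   /* assumed value, per the public headers */

/* Stand-in for the arch-specific hypercall stub.  With this changeset,
 * do_yield()/do_block() always return 0, so checking the result is
 * meaningful rather than undefined. */
static long HYPERVISOR_sched_op(unsigned long op)
{
    (void)op;
    return 0;
}

int main(void)
{
    if ( HYPERVISOR_sched_op(SCHEDOP_yield) != 0 )
        fprintf(stderr, "yield hypercall failed\n");

    if ( HYPERVISOR_sched_op(SCHEDOP_block) != 0 )
        fprintf(stderr, "block hypercall failed\n");

    return 0;
}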
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <public/sched_ctl.h>

/* opt_sched: scheduler - default to Borrowed Virtual Time */
static char opt_sched[10] = "bvt";
string_param("sched", opt_sched);

#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(unsigned long unused);
static void t_timer_fn(unsigned long unused);
static void dom_timer_fn(unsigned long data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
        (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )       \
         : (typeof(ops.fn(__VA_ARGS__)))0 )
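/*
 * Illustrative aside (not part of schedule.c): the SCHED_OP() macro above
 * dispatches to a scheduler hook only when the hook is non-NULL, otherwise
 * it evaluates to a zero of the hook's return type.  The standalone sketch
 * below shows the same idiom in a form that compiles on its own with gcc
 * (it relies on the typeof extension, as the original does).  The names
 * sched_ops_demo, SCHED_OP_DEMO and demo_alloc_task are invented for this
 * example only.
 */
#include <stdio.h>

struct sched_ops_demo {
    int  (*alloc_task)(int id);   /* optional hook with a return value */
    void (*rem_task)(int id);     /* optional hook returning void */
};

static struct sched_ops_demo demo_ops;

#define SCHED_OP_DEMO(fn, ...)                                  \
        (( demo_ops.fn != NULL ) ? demo_ops.fn( __VA_ARGS__ )   \
         : (typeof(demo_ops.fn(__VA_ARGS__)))0 )

static int demo_alloc_task(int id)
{
    printf("alloc_task(%d)\n", id);
    return 0;
}

int main(void)
{
    demo_ops.alloc_task = demo_alloc_task;   /* rem_task is left NULL */

    if ( SCHED_OP_DEMO(alloc_task, 7) < 0 )  /* hook present: real call */
        return 1;

    SCHED_OP_DEMO(rem_task, 7);              /* hook absent: typed no-op */

    return 0;
}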
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

void free_domain_struct(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);
    for (i = 0; i < MAX_VIRT_CPUS; i++)
        if ( d->exec_domain[i] )
            arch_free_exec_domain_struct(d->exec_domain[i]);

    xfree(d);
}

struct exec_domain *alloc_exec_domain_struct(struct domain *d,
                                             unsigned long vcpu)
{
    struct exec_domain *ed, *edc;

    ASSERT( d->exec_domain[vcpu] == NULL );

    if ( (ed = arch_alloc_exec_domain_struct()) == NULL )
        return NULL;

    memset(ed, 0, sizeof(*ed));

    d->exec_domain[vcpu] = ed;
    ed->domain = d;
    ed->eid = vcpu;

    if ( SCHED_OP(alloc_task, ed) < 0 )
        goto out;

    if (vcpu != 0) {
        ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];

        for_each_exec_domain(d, edc) {
            if (edc->ed_next_list == NULL || edc->ed_next_list->eid > vcpu)
                break;
        }
        ed->ed_next_list = edc->ed_next_list;
        edc->ed_next_list = ed;

        if (test_bit(EDF_CPUPINNED, &edc->ed_flags)) {
            ed->processor = (edc->processor + 1) % smp_num_cpus;
            set_bit(EDF_CPUPINNED, &ed->ed_flags);
        } else {
            ed->processor = (edc->processor + 1) % smp_num_cpus; /* XXX */
        }
    }

    return ed;

 out:
    d->exec_domain[vcpu] = NULL;
    arch_free_exec_domain_struct(ed);

    return NULL;
}

struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_exec_domain_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    xfree(d);
    return NULL;
}
/*
 * Add and remove a domain
 */
void sched_add_domain(struct exec_domain *ed)
{
    struct domain *d = ed->domain;

    /* Must be unpaused by control software to start execution. */
    set_bit(EDF_CTRLPAUSE, &ed->ed_flags);

    if ( d->id != IDLE_DOMAIN_ID )
    {
        /* Initialise the per-domain timer. */
        init_ac_timer(&ed->timer);
        ed->timer.cpu      = ed->processor;
        ed->timer.data     = (unsigned long)ed;
        ed->timer.function = &dom_timer_fn;
    }
    else
    {
        schedule_data[ed->processor].idle = ed;
    }

    SCHED_OP(add_task, ed);
    TRACE_2D(TRC_SCHED_DOM_ADD, d->id, ed->eid);
}

void sched_rem_domain(struct exec_domain *ed)
{
    rem_ac_timer(&ed->timer);
    SCHED_OP(rem_task, ed);
    TRACE_2D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid);
}

void init_idle_task(void)
{
    if ( SCHED_OP(init_idle_task, current) < 0 )
        BUG();
}

void domain_sleep(struct exec_domain *ed)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);
    if ( likely(!domain_runnable(ed)) )
        SCHED_OP(sleep, ed);
    spin_unlock_irqrestore(&schedule_data[ed->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_SLEEP, ed->domain->id, ed->eid);

    /* Synchronous. */
    while ( test_bit(EDF_RUNNING, &ed->ed_flags) && !domain_runnable(ed) )
        cpu_relax();
}

void domain_wake(struct exec_domain *ed)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);
    if ( likely(domain_runnable(ed)) )
    {
        SCHED_OP(wake, ed);
#ifdef WAKE_HISTO
        ed->wokenup = NOW();
#endif
    }
    clear_bit(EDF_MIGRATED, &ed->ed_flags);
    spin_unlock_irqrestore(&schedule_data[ed->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_WAKE, ed->domain->id, ed->eid);
}
/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    struct exec_domain *ed = current;

    ed->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(EDF_BLOCKED, &ed->ed_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(ed) )
    {
        clear_bit(EDF_BLOCKED, &ed->ed_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, ed->domain->id, ed->eid);
        __enter_scheduler();
    }

    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->id, current->eid);
    __enter_scheduler();
    return 0;
}

/*
 * Demultiplex scheduler-related hypercalls.
 */
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN, current->domain->id, current->eid,
                 (op >> SCHEDOP_reasonshift));
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
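/*
 * Illustrative aside (not part of schedule.c): do_sched_op() above folds
 * the command and, for SCHEDOP_shutdown, a reason byte into the single
 * hypercall argument.  A standalone sketch of that packing and unpacking
 * follows; the mask/shift/reason values are assumptions based on the
 * public interface headers of this era, not definitions made here.
 */
#include <stdio.h>

#define SCHEDOP_shutdown     2      /* assumed */
#define SCHEDOP_cmdmask      0xff   /* assumed */
#define SCHEDOP_reasonshift  8      /* assumed */
#define SHUTDOWN_reboot      1      /* assumed */

int main(void)
{
    /* Guest side: pack command and shutdown reason into one word. */
    unsigned long op = SCHEDOP_shutdown |
                       ((unsigned long)SHUTDOWN_reboot << SCHEDOP_reasonshift);

    /* Hypervisor side: what do_sched_op() recovers from the same word. */
    unsigned long cmd    = op & SCHEDOP_cmdmask;
    unsigned char reason = (unsigned char)(op >> SCHEDOP_reasonshift);

    printf("cmd=%lu reason=%u\n", cmd, reason);   /* prints: cmd=2 reason=1 */
    return 0;
}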
/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct exec_domain *ed = current;

    rem_ac_timer(&ed->timer);

    if ( (ed->timer.expires = timeout) != 0 )
        add_ac_timer(&ed->timer);

    return 0;
}

/** sched_id - fetch ID of current scheduler */
int sched_id()
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    spin_lock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    spin_unlock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);

    TRACE_1D(TRC_SCHED_ADJDOM, d->id);
    put_domain(d);
    return 0;
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct exec_domain *prev = current, *next = NULL;
    int                 cpu = prev->processor;
    s_time_t            now;
    struct task_slice   next_slice;
    s32                 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[cpu].s_timer);

    /* Must be protected by the schedule_lock! */
    set_bit(EDF_RUNNING, &next->ed_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    if ( unlikely(prev == next) )
        return continue_running(prev);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next->domain) && next->wokenup ) {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
        else schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next->domain) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++;
        else schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->id, prev->eid,
             next->domain->id, next->eid);

    context_switch(prev, next);
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct exec_domain *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}


/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(unsigned long unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(unsigned long unused)
{
    struct exec_domain *ed  = current;
    unsigned int        cpu = ed->processor;

    schedule_data[cpu].tick++;

    if ( !is_idle_task(ed->domain) )
    {
        update_dom_time(ed);
        send_guest_virq(ed, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    t_timer[cpu].expires = NOW() + MILLISECS(10);
    add_ac_timer(&t_timer[cpu]);
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(unsigned long data)
{
    struct exec_domain *ed = (struct exec_domain *)data;

    update_dom_time(ed);
    send_guest_virq(ed, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        schedule_data[i].curr = &idle0_exec_domain;

        init_ac_timer(&schedule_data[i].s_timer);
        schedule_data[i].s_timer.cpu      = i;
        schedule_data[i].s_timer.data     = 2;
        schedule_data[i].s_timer.function = &s_timer_fn;

        init_ac_timer(&t_timer[i]);
        t_timer[i].cpu      = i;
        t_timer[i].data     = 3;
        t_timer[i].function = &t_timer_fn;
    }

    schedule_data[0].idle = &idle0_exec_domain;

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( SCHED_OP(init_scheduler) < 0 )
        panic("Initialising scheduler failed!");
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    s_timer_fn(0);
    smp_call_function((void *)s_timer_fn, NULL, 1, 1);

    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for ( i = 0; i < smp_num_cpus; i++ )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;

    for ( k = 0; k < smp_num_cpus; k++ )
    {
        j = 0;
        printk("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < smp_num_cpus; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */