ia64/xen-unstable

view xen/common/schedule.c @ 3735:4d39c79968fa

bitkeeper revision 1.1159.254.1 (4208fb40ebFKAx9nnWfHPfWIPdQ9Vw)

Bug fix for free_domain_struct().
Don't dereference a pointer after we've free'd it.
author mafetter@fleming.research
date Tue Feb 08 17:47:44 2005 +0000 (2005-02-08)
parents 4ba67049f771
children 7406a28a87bc
/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <public/sched_ctl.h>
/* opt_sched: scheduler - default to Borrowed Virtual Time */
static char opt_sched[10] = "bvt";
string_param("sched", opt_sched);
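/*
 * Selected on the hypervisor command line (e.g. "sched=bvt");
 * scheduler_init() below matches this string against each registered
 * scheduler's opt_name.
 */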
/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif
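/*
 * With WAKE_HISTO the histogram buckets wake-to-run latency in 1ms steps;
 * with BLOCKTIME_HISTO it buckets deschedule-to-reschedule time in 10ms
 * steps (see the accounting in __enter_scheduler() below).
 */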
#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/*
 * TODO MAW pull trace-related #defines out of here and into an auto-generated
 * header file later on!
 */
#define TRC_SCHED_DOM_ADD       0x00010000
#define TRC_SCHED_DOM_REM       0x00010001
#define TRC_SCHED_WAKE          0x00010002
#define TRC_SCHED_BLOCK         0x00010003
#define TRC_SCHED_YIELD         0x00010004
#define TRC_SCHED_SET_TIMER     0x00010005
#define TRC_SCHED_CTL           0x00010006
#define TRC_SCHED_ADJDOM        0x00010007
#define TRC_SCHED_RESCHED       0x00010008
#define TRC_SCHED_SWITCH        0x00010009
#define TRC_SCHED_S_TIMER_FN    0x0001000A
#define TRC_SCHED_T_TIMER_FN    0x0001000B
#define TRC_SCHED_DOM_TIMER_FN  0x0001000C
/* Various timer handlers. */
static void s_timer_fn(unsigned long unused);
static void t_timer_fn(unsigned long unused);
static void dom_timer_fn(unsigned long data);

/* This is global for now so that private implementations can reach it */
schedule_data_t schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
// extern struct scheduler sched_rrobin_def;
// extern struct scheduler sched_atropos_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    // &sched_rrobin_def,
    // &sched_atropos_def,
    NULL
};
/* Operations for the current scheduler. */
static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )
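/*
 * SCHED_OP() invokes the named hook of the current scheduler if it is
 * implemented, otherwise it evaluates to a zero of the hook's return type.
 * For example, SCHED_OP(wake, ed) is equivalent to
 *     ( ops.wake != NULL ) ? ops.wake(ed) : (typeof(ops.wake(ed)))0
 */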
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];
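/*
 * Release everything hanging off a domain: the scheduler-private state and
 * each exec_domain first, the domain structure itself last.  Per the
 * changeset description above, 'd' must not be dereferenced once
 * arch_free_domain_struct(d) has been called.
 */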
void free_domain_struct(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);
    for (i = 0; i < MAX_VIRT_CPUS; i++)
        if ( d->exec_domain[i] )
            arch_free_exec_domain_struct(d->exec_domain[i]);

    arch_free_domain_struct(d);
}
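/*
 * Allocate and minimally initialise exec_domain (VCPU) 'vcpu' of domain 'd'.
 * For VCPUs other than 0 this also links the new VCPU into the domain's
 * eid-ordered ed_next_list and assigns it a processor.
 */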
struct exec_domain *alloc_exec_domain_struct(struct domain *d,
                                             unsigned long vcpu)
{
    struct exec_domain *ed, *edc;

    ASSERT( d->exec_domain[vcpu] == NULL );

    if ( (ed = arch_alloc_exec_domain_struct()) == NULL )
        return NULL;

    memset(ed, 0, sizeof(*ed));

    d->exec_domain[vcpu] = ed;
    ed->domain = d;
    ed->eid = vcpu;

    if ( SCHED_OP(alloc_task, ed) < 0 )
        goto out;

    if (vcpu != 0) {
        ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];

        for_each_exec_domain(d, edc) {
            if (edc->ed_next_list == NULL || edc->ed_next_list->eid > vcpu)
                break;
        }
        ed->ed_next_list = edc->ed_next_list;
        edc->ed_next_list = ed;

        if (test_bit(EDF_CPUPINNED, &edc->ed_flags)) {
            ed->processor = (edc->processor + 1) % smp_num_cpus;
            set_bit(EDF_CPUPINNED, &ed->ed_flags);
        } else {
            ed->processor = (edc->processor + 1) % smp_num_cpus;  /* XXX */
        }
    }

    return ed;

 out:
    d->exec_domain[vcpu] = NULL;
    arch_free_exec_domain_struct(ed);
    return NULL;
}
struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = arch_alloc_domain_struct()) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_exec_domain_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    arch_free_domain_struct(d);
    return NULL;
}
/*
 * Add and remove a domain
 */
void sched_add_domain(struct exec_domain *ed)
{
    struct domain *d = ed->domain;

    /* Must be unpaused by control software to start execution. */
    set_bit(EDF_CTRLPAUSE, &ed->ed_flags);

    if ( d->id != IDLE_DOMAIN_ID )
    {
        /* Initialise the per-domain timer. */
        init_ac_timer(&ed->timer);
        ed->timer.cpu      = ed->processor;
        ed->timer.data     = (unsigned long)ed;
        ed->timer.function = &dom_timer_fn;
    }
    else
    {
        schedule_data[ed->processor].idle = ed;
    }

    SCHED_OP(add_task, ed);

    TRACE_2D(TRC_SCHED_DOM_ADD, d->id, ed);
}
void sched_rem_domain(struct exec_domain *ed)
{
    rem_ac_timer(&ed->timer);
    SCHED_OP(rem_task, ed);
    TRACE_3D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid, ed);
}

void init_idle_task(void)
{
    if ( SCHED_OP(init_idle_task, current) < 0 )
        BUG();
}
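/*
 * domain_sleep() is synchronous: after asking the scheduler to deschedule
 * the VCPU it spins until the VCPU is no longer running anywhere (or has
 * become runnable again in the meantime).
 */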
void domain_sleep(struct exec_domain *d)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[d->processor].schedule_lock, flags);

    if ( likely(!domain_runnable(d)) )
        SCHED_OP(sleep, d);

    spin_unlock_irqrestore(&schedule_data[d->processor].schedule_lock, flags);

    /* Synchronous. */
    while ( test_bit(EDF_RUNNING, &d->ed_flags) && !domain_runnable(d) )
    {
        smp_mb();
        cpu_relax();
    }
}
void domain_wake(struct exec_domain *ed)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);

    if ( likely(domain_runnable(ed)) )
    {
        TRACE_2D(TRC_SCHED_WAKE, ed->domain->id, ed);
        SCHED_OP(wake, ed);
#ifdef WAKE_HISTO
        ed->wokenup = NOW();
#endif
    }

    clear_bit(EDF_MIGRATED, &ed->ed_flags);

    spin_unlock_irqrestore(&schedule_data[ed->processor].schedule_lock, flags);
}
/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    ASSERT(current->domain->id != IDLE_DOMAIN_ID);
    current->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(EDF_BLOCKED, &current->ed_flags);
    TRACE_2D(TRC_SCHED_BLOCK, current->domain->id, current);
    __enter_scheduler();
    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->id, current);
    __enter_scheduler();
    return 0;
}
/*
 * Demultiplex scheduler-related hypercalls.
 */
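/*
 * The sub-command is carried in the bits selected by SCHEDOP_cmdmask; for
 * SCHEDOP_shutdown the reason code sits above SCHEDOP_reasonshift.  A guest
 * would therefore issue something like (illustrative only):
 *     op = SCHEDOP_shutdown | (reason << SCHEDOP_reasonshift);
 */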
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
/* Per-domain one-shot-timer hypercall. */
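/*
 * The 64-bit deadline is passed as two machine words and reassembled below;
 * a timeout of 0/0 simply cancels any pending timer.  Assuming s_time_t is
 * the usual 64-bit nanosecond system time:
 *     expires = ((s_time_t)timeout_hi << 32) | timeout_lo;
 */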
long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
{
    struct exec_domain *p = current;

    rem_ac_timer(&p->timer);

    if ( (timeout_hi != 0) || (timeout_lo != 0) )
    {
        p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo);
        add_ac_timer(&p->timer);
    }

    TRACE_5D(TRC_SCHED_SET_TIMER, p->domain->id, p->eid, p, timeout_hi,
             timeout_lo);

    return 0;
}
/** sched_id - fetch ID of current scheduler */
int sched_id()
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    TRACE_0D(TRC_SCHED_CTL);

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    return SCHED_OP(control, cmd);
}
/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    TRACE_1D(TRC_SCHED_ADJDOM, d->id);

    spin_lock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    spin_unlock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);

    put_domain(d);
    return 0;
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
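/*
 * __enter_scheduler() runs from SCHEDULE_SOFTIRQ context (registered in
 * scheduler_init() below) as well as from the synchronous do_block()/
 * do_yield() paths above.  It charges the elapsed time to the outgoing
 * domain, asks the policy for the next task and its time slice, re-arms
 * s_timer accordingly, and finally context-switches via switch_to() and
 * schedule_tail().
 */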
void __enter_scheduler(void)
{
    struct exec_domain *prev = current, *next = NULL;
    int                 cpu = prev->processor;
    s_time_t            now;
    task_slice_t        next_slice;
    s32                 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    if ( test_bit(EDF_BLOCKED, &prev->ed_flags) )
    {
        /* This check is needed to avoid a race condition. */
        if ( event_pending(prev) )
            clear_bit(EDF_BLOCKED, &prev->ed_flags);
        else
            SCHED_OP(do_block, prev);
    }

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[cpu].s_timer);

    /* Must be protected by the schedule_lock! */
    set_bit(EDF_RUNNING, &next->ed_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
        update_dom_time(next->domain);

    if ( unlikely(prev == next) )
        return;

    perfc_incrc(sched_ctx);

    if ( !is_idle_task(prev->domain) )
    {
        LOCK_BIGLOCK(prev->domain);
        cleanup_writable_pagetable(prev->domain);
        UNLOCK_BIGLOCK(prev->domain);
    }

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next) && next->wokenup ) {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    TRACE_2D(TRC_SCHED_SWITCH, next->domain->id, next);

    switch_to(prev, next);

    /*
     * We do this late on because it doesn't need to be protected by the
     * schedule_lock, and because we want this to be the very last use of
     * 'prev' (after this point, a dying domain's info structure may be freed
     * without warning).
     */
    clear_bit(EDF_RUNNING, &prev->ed_flags);

    /* Mark a timer event for the newly-scheduled domain. */
    if ( !is_idle_task(next->domain) )
        send_guest_virq(next, VIRQ_TIMER);

    schedule_tail(next);

    BUG();
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct exec_domain *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}
/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/
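/*
 * s_timer is re-armed by __enter_scheduler() with the slice returned by the
 * policy's do_schedule(); t_timer re-arms itself every 10ms in t_timer_fn().
 */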
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(unsigned long unused)
{
    TRACE_0D(TRC_SCHED_S_TIMER_FN);
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}
/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(unsigned long unused)
{
    struct exec_domain *ed = current;

    TRACE_0D(TRC_SCHED_T_TIMER_FN);

    if ( !is_idle_task(ed->domain) )
    {
        update_dom_time(ed->domain);
        send_guest_virq(ed, VIRQ_TIMER);
    }

    t_timer[ed->processor].expires = NOW() + MILLISECS(10);
    add_ac_timer(&t_timer[ed->processor]);
}
/* Domain timer function: sends a virtual timer interrupt to the domain */
static void dom_timer_fn(unsigned long data)
{
    struct exec_domain *ed = (struct exec_domain *)data;

    TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
    update_dom_time(ed->domain);
    send_guest_virq(ed, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        schedule_data[i].curr = &idle0_exec_domain;

        init_ac_timer(&schedule_data[i].s_timer);
        schedule_data[i].s_timer.cpu      = i;
        schedule_data[i].s_timer.data     = 2;
        schedule_data[i].s_timer.function = &s_timer_fn;

        init_ac_timer(&t_timer[i]);
        t_timer[i].cpu      = i;
        t_timer[i].data     = 3;
        t_timer[i].function = &t_timer_fn;
    }

    schedule_data[0].idle = &idle0_exec_domain;

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( SCHED_OP(init_scheduler) < 0 )
        panic("Initialising scheduler failed!");
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
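/*
 * The calling CPU arms its own timers directly; smp_call_function() runs the
 * timer functions on every other CPU so that theirs are armed as well.
 */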
void schedulers_start(void)
{
    s_timer_fn(0);
    smp_call_function((void *)s_timer_fn, NULL, 1, 1);

    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for ( i = 0; i < smp_num_cpus; i++ )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;

    for ( k = 0; k < smp_num_cpus; k++ )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;

    for ( j = 0; j < smp_num_cpus; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif