ia64/xen-unstable

view xen/common/schedule.c @ 332:a72c5a70fe2b

bitkeeper revision 1.149 (3e797d12w2a9Jkg9CeHbUBtMWxrA5Q)

Fixed syntax error in schedule.c.
Quality software ;-)
author iap10@labyrinth.cl.cam.ac.uk
date Thu Mar 20 08:34:26 2003 +0000 (2003-03-20)
parents 03dc7864109b
children 942eb9bcae13
/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002 - Rolf Neugebauer - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: schedule.c
 *      Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk)
 *     Changes:
 *
 *        Date: Nov 2002
 *
 * Environment: Xen Hypervisor
 * Description: CPU scheduling
 *              implements A Borrowed Virtual Time scheduler.
 *              (see Duda & Cheriton SOSP'99)
 *
 ****************************************************************************
 * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
 ****************************************************************************
 */
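
/*
 * Summary of the BVT quantities used in this file:
 *
 *   MCU          - minimum charging unit: CPU time is accounted in blocks
 *                  of 100us, rounded up.
 *   mcu_advance  - per-domain weight: for every MCU consumed, the domain's
 *                  actual virtual time advances by mcu_advance, so a
 *                  smaller value means a larger CPU share.
 *   avt          - actual virtual time of a domain.
 *   evt          - effective virtual time; currently always equal to avt,
 *                  since warping is not implemented yet (see the "XXX BVT"
 *                  notes below).
 *   svt          - system virtual time: the minimum avt over the runnable
 *                  domains on a CPU, used to re-base woken-up domains.
 *   ctx_allow    - context switch allowance: the minimum time a domain
 *                  runs before another runnable domain may preempt it.
 */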
#include <xeno/config.h>
#include <xeno/init.h>
#include <xeno/lib.h>
#include <xeno/sched.h>
#include <xeno/delay.h>
#include <xeno/event.h>
#include <xeno/time.h>
#include <xeno/ac_timer.h>
#include <xeno/interrupt.h>

#include <xeno/perfc.h>

#undef SCHEDULER_TRACE
#ifdef SCHEDULER_TRACE
#define TRC(_x) _x
#else
#define TRC(_x)
#endif

#define SCHED_HISTO
#ifdef SCHED_HISTO
#define BUCKETS 31
#endif

#define MCU        (s32)MICROSECS(100)     /* Minimum unit */
#define TIME_SLOP  (s32)MICROSECS(50)      /* allow time to slip a bit */
static s32 ctx_allow = (s32)MILLISECS(5);  /* context switch allowance */
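
/*
 * Example (illustrative numbers only): with MCU = 100us and
 * ctx_allow = 5ms, a domain is charged at least one MCU per run and is
 * guaranteed at least 5ms (50 MCUs) of CPU before another domain can
 * preempt it.  A domain with mcu_advance = 10 that runs for the full 5ms
 * therefore sees its avt advance by 50 * 10 = 500.
 */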
/*****************************************************************************
 * per CPU data for the scheduler.
 *****************************************************************************/
typedef struct schedule_data_st
{
    spinlock_t          lock;           /* lock for protecting this */
    struct list_head    runqueue;       /* runqueue */
    struct task_struct *prev, *curr;    /* previous and current task */
    struct task_struct *idle;           /* idle task for this cpu */
    u32                 svt;            /* system virtual time. per CPU??? */
    struct ac_timer     s_timer;        /* scheduling timer */
#ifdef SCHED_HISTO
    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
#endif
} __cacheline_aligned schedule_data_t;
schedule_data_t schedule_data[NR_CPUS];

struct ac_timer v_timer;        /* scheduling timer */
static void virt_timer(unsigned long foo);
static void dump_rqueue(struct list_head *queue, char *name);

/*****************************************************************************
 * Some convenience functions
 *****************************************************************************/
/* add a task to the head of the runqueue */
static inline void __add_to_runqueue_head(struct task_struct * p)
{
    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
}
/* add a task to the tail of the runqueue */
static inline void __add_to_runqueue_tail(struct task_struct * p)
{
    list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
}

/* remove a task from runqueue */
static inline void __del_from_runqueue(struct task_struct * p)
{
    list_del(&p->run_list);
    p->run_list.next = NULL;
}
/* is task on run queue? */
static inline int __task_on_runqueue(struct task_struct *p)
{
    return (p->run_list.next != NULL);
}

#define next_domain(p) \
    list_entry((p)->run_list.next, struct task_struct, run_list)
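
/*
 * Note: __task_on_runqueue() relies on __del_from_runqueue() setting
 * run_list.next back to NULL, so run_list must never be linked onto any
 * list other than the per-CPU runqueue.
 */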
/******************************************************************************
 * Add and remove a domain
 ******************************************************************************/
void sched_add_domain(struct task_struct *p)
{
    p->state       = TASK_UNINTERRUPTIBLE;
    p->mcu_advance = 10;

    if (p->domain == IDLE_DOMAIN_ID) {
        p->avt = 0xffffffff;
        p->evt = 0xffffffff;
        schedule_data[p->processor].idle = p;
    } else {
        /* set avt and evt to system virtual time */
        p->avt = schedule_data[p->processor].svt;
        p->evt = schedule_data[p->processor].svt;
        /* RN: XXX BVT fill in other bits */
    }
}

void sched_rem_domain(struct task_struct *p)
{
    p->state = TASK_DYING;
}

/****************************************************************************
 * wake up a domain which had been sleeping
 ****************************************************************************/
int wake_up(struct task_struct *p)
{
    unsigned long flags;
    int ret = 0;

    spin_lock_irqsave(&schedule_data[p->processor].lock, flags);

    if ( __task_on_runqueue(p) ) goto out;

    p->state = TASK_RUNNING;
    __add_to_runqueue_head(p);

    /* set the BVT parameters */
    if (p->avt < schedule_data[p->processor].svt)
        p->avt = schedule_data[p->processor].svt;

    p->evt = p->avt;    /* RN: XXX BVT deal with warping here */

#ifdef SCHED_HISTO
    p->wokenup = NOW();
#endif

    ret = 1;
 out:
    spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags);
    return ret;
}
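
/*
 * Note: raising a waking domain's avt up to the system virtual time (the
 * clamp in wake_up() above) is the usual BVT rule.  Without it, a domain
 * that slept for a long time would wake with a very small avt and could
 * then monopolise the CPU until the other domains caught up.
 */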
/****************************************************************************
 * Voluntarily yield the processor to another domain, until an event occurs.
 ****************************************************************************/
long do_yield(void)
{
    current->state = TASK_INTERRUPTIBLE;
    schedule();
    return 0;
}

/****************************************************************************
 * Control the scheduler
 ****************************************************************************/
long sched_bvtctl(unsigned long c_allow)
{
    printk("sched: bvtctl %lu\n", c_allow);
    ctx_allow = c_allow;
    return 0;
}

/****************************************************************************
 * Adjust scheduling parameter for a given domain
 ****************************************************************************/
long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
                  unsigned long warpl, unsigned long warpu)
{
    struct task_struct *p;

    printk("sched: adjdom %02d %lu %lu %lu %lu\n",
           dom, mcu_adv, warp, warpl, warpu);

    p = find_domain_by_id(dom);
    if ( p == NULL ) return -ESRCH;

    spin_lock_irq(&schedule_data[p->processor].lock);

    p->mcu_advance = mcu_adv;

    spin_unlock_irq(&schedule_data[p->processor].lock);

    return 0;
}
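
/*
 * Example (hypothetical values): sched_adjdom(1, 2, 0, 0, 0) sets domain
 * 1's mcu_advance to 2.  Competing against a domain left at the default
 * mcu_advance of 10, domain 1's avt grows five times more slowly, so it
 * receives roughly five times the CPU share.  The warp parameters are
 * accepted but ignored here, since warping is not implemented yet.
 */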
/****************************************************************************
 * cause a run through the scheduler when appropriate
 * Appropriate is:
 * - the current task is the idle task
 * - the current task has already run for its context switch allowance
 * Otherwise we do a run through the scheduler after the current task's
 * context switch allowance is over.
 ****************************************************************************/
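/*
 * Note: in the deferred case the per-CPU s_timer is pulled forward to
 * lastschd + ctx_allow (unless it would already fire within TIME_SLOP of
 * that point), so the run through the scheduler happens as soon as the
 * allowance expires.
 */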
void reschedule(struct task_struct *p)
{
    int cpu = p->processor;
    struct task_struct *curr;
    unsigned long flags;
    s_time_t now, min_time;

    if (p->has_cpu)
        return;

    spin_lock_irqsave(&schedule_data[cpu].lock, flags);

    now = NOW();
    curr = schedule_data[cpu].curr;
    /* domain should run at least for ctx_allow */
    min_time = curr->lastschd + ctx_allow;

    if ( is_idle_task(curr) || (min_time <= now) ) {
        /* reschedule */
        set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);

        spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);

        if (cpu != smp_processor_id())
            smp_send_event_check_cpu(cpu);
        return;
    }

    /* current hasn't been running for long enough -> reprogram timer.
     * but don't bother if timer would go off soon anyway */
    if (schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP) {
        mod_ac_timer(&schedule_data[cpu].s_timer, min_time);
    }

    spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
    return;
}
/****************************************************************************
 * The main function
 * - deschedule the current domain.
 * - pick a new domain, i.e. the domain with the lowest EVT.
 *   The runqueue should be ordered by EVT so that this is easy.
 ****************************************************************************/
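/*
 * Note: the runqueue is not currently kept sorted; the loop below instead
 * does a single O(n) scan to find both the lowest and the second lowest
 * EVT, which is adequate for the small number of domains expected per CPU.
 */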
asmlinkage void schedule(void)
{
    struct task_struct *prev, *next, *next_prime, *p;
    struct list_head *tmp;
    int this_cpu;
    s_time_t now;
    s32 r_time;     /* time for new dom to run */
    s32 ranfor;     /* assume we never run longer than 2.1s! */
    s32 mcus;
    u32 next_evt, next_prime_evt, min_avt;

    perfc_incrc(sched_run1);
 need_resched_back:
    perfc_incrc(sched_run2);

    prev = current;
    next = NULL;

    this_cpu = prev->processor;

    spin_lock_irq(&schedule_data[this_cpu].lock);

    now = NOW();

    /* remove timer, if still on list */
    //if (active_ac_timer(&schedule_data[this_cpu].s_timer))
    rem_ac_timer(&schedule_data[this_cpu].s_timer);

    /* deschedule the current domain */

    ASSERT(!in_interrupt());
    ASSERT(__task_on_runqueue(prev));

    if (is_idle_task(prev))
        goto deschedule_done;
    /* do some accounting */
    ranfor = (s32)(now - prev->lastschd);
    ASSERT((ranfor>0));
    prev->cpu_time += ranfor;

    /* calculate mcu and update avt */
    mcus = ranfor/MCU;
    if (ranfor % MCU) mcus ++;  /* always round up */
    prev->avt += mcus * prev->mcu_advance;
    prev->evt = prev->avt;      /* RN: XXX BVT deal with warping here */

    /* dequeue */
    __del_from_runqueue(prev);
    switch (prev->state) {
    case TASK_INTERRUPTIBLE:
        if (signal_pending(prev)) {
            prev->state = TASK_RUNNING; /* but has events pending */
            break;
        }
    case TASK_UNINTERRUPTIBLE:
    case TASK_WAIT:
    case TASK_DYING:
    default:
        /* done if not running. Else, continue */
        goto deschedule_done;
    case TASK_RUNNING:;
    }

    /* requeue */
    __add_to_runqueue_tail(prev);

 deschedule_done:
    clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);

    /*
     * Pick a new domain
     */

    /* we should at least have the idle task */
    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task with the second lowest evt.
     * this code is O(n) but we expect n to be small.
     */
    next = schedule_data[this_cpu].idle;
    next_prime = NULL;

    next_evt       = 0xffffffff;
    next_prime_evt = 0xffffffff;
    min_avt        = 0xffffffff;    /* to calculate svt */

    list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
        p = list_entry(tmp, struct task_struct, run_list);
        if (p->evt < next_evt) {
            next_prime     = next;
            next_prime_evt = next_evt;
            next     = p;
            next_evt = p->evt;
        } else if (next_prime_evt == 0xffffffff) {
            next_prime_evt = p->evt;
            next_prime     = p;
        } else if (p->evt < next_prime_evt) {
            next_prime_evt = p->evt;
            next_prime     = p;
        }
        /* determine system virtual time */
        if (p->avt < min_avt)
            min_avt = p->avt;
    }
    ASSERT(next != NULL);   /* we should have at least the idle task */

    /* update system virtual time */
    if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;

    if (is_idle_task(next)) {
        r_time = ctx_allow;
        goto sched_done;
    }

    if (next_prime == NULL || is_idle_task(next_prime)) {
        /* we have only one runnable task besides the idle task */
        r_time = 10 * ctx_allow;    /* RN: random constant */
        goto sched_done;
    }

    /*
     * if we are here we have two runnable tasks.
     * work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt. Taking context switch allowance into account.
     */
    ASSERT(next_prime->evt >= next->evt);
    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
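    /*
     * The slice computed above gives 'next' long enough for its evt to
     * catch up with next_prime's evt (the evt gap divided by next's
     * mcu_advance), padded by the context switch allowance -- hence the
     * ASSERT below that r_time >= ctx_allow.
     */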
 sched_done:
    ASSERT(r_time >= ctx_allow);

#ifndef NDEBUG
    if (r_time < ctx_allow) {
        printk("[%02d]: %d\n", this_cpu, r_time);
        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
    }
#endif

    prev->has_cpu = 0;
    next->has_cpu = 1;

    schedule_data[this_cpu].prev = prev;
    schedule_data[this_cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[this_cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[this_cpu].s_timer);

    spin_unlock_irq(&schedule_data[this_cpu].lock);

    if ( unlikely(prev == next) )
    {
        /* We won't go through the normal tail, so do this by hand */
        prev->policy &= ~SCHED_YIELD;
        goto same_process;
    }

    perfc_incrc(sched_ctx);
#ifdef SCHED_HISTO
    {
        ulong diff;     /* should fit in 32 bits */
        if (!is_idle_task(next) && next->wokenup) {
            diff = (ulong)(now - next->wokenup);
            diff /= (ulong)MILLISECS(1);
            if (diff <= BUCKETS-2)  schedule_data[this_cpu].hist[diff]++;
            else                    schedule_data[this_cpu].hist[BUCKETS-1]++;
        }
        next->wokenup = (s_time_t)0;
    }
#endif

    prepare_to_switch();
    switch_to(prev, next);
    prev = schedule_data[this_cpu].prev;

    prev->policy &= ~SCHED_YIELD;
    if ( prev->state == TASK_DYING ) release_task(prev);

 same_process:
    /* update the domain's notion of time */
    update_dom_time(current->shared_info);

    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
        goto need_resched_back;
    }
    return;
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct task_struct *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}

/*
 * The scheduler timer.
 */
static void sched_timer(unsigned long foo)
{
    int cpu = smp_processor_id();
    struct task_struct *curr = schedule_data[cpu].curr;
    /* cause a reschedule */
    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
    perfc_incrc(sched_irq);
}
/*
 * The Domain virtual time timer
 */
static void virt_timer(unsigned long foo)
{
    unsigned long cpu_mask = 0;
    struct task_struct *p;
    s_time_t now;

    /* send virtual timer interrupt */
    read_lock(&tasklist_lock);
    p = &idle0_task;
    do {
        if ( is_idle_task(p) ) continue;
        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
    }
    while ( (p = p->next_task) != &idle0_task );
    read_unlock(&tasklist_lock);

    guest_event_notify(cpu_mask);

    now = NOW();
    v_timer.expires = now + MILLISECS(20);
    add_ac_timer(&v_timer);
}
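
/*
 * Note: virt_timer is a single system-wide timer that re-arms itself every
 * 20ms and marks _EVENT_TIMER for every non-idle domain.  The 'continue'
 * inside the do/while still evaluates the loop condition, so the idle task
 * is merely skipped rather than terminating the walk.
 */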
/*
 * Initialise the data structures
 */
void __init scheduler_init(void)
{
    int i;

    printk("Initialising schedulers\n");

    for ( i = 0; i < NR_CPUS; i++ )
    {
        INIT_LIST_HEAD(&schedule_data[i].runqueue);
        spin_lock_init(&schedule_data[i].lock);
        schedule_data[i].prev = &idle0_task;
        schedule_data[i].curr = &idle0_task;

        /* a timer for each CPU */
        init_ac_timer(&schedule_data[i].s_timer, i);
        schedule_data[i].s_timer.data = 2;
        schedule_data[i].s_timer.function = &sched_timer;
    }
    schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */
    init_ac_timer(&v_timer, 0);
    v_timer.data = 3;
    v_timer.function = &virt_timer;
}
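
/*
 * Note: the .data cookies (2 and 3) are never examined -- sched_timer()
 * and virt_timer() ignore their argument -- so they appear to serve only
 * as debugging markers to tell the timers apart.
 */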
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    printk("Start schedulers\n");
    __cli();
    sched_timer(0);
    virt_timer(0);
    smp_call_function((void *)sched_timer, NULL, 1, 1);
    __sti();
}
/****************************************************************************
 * Functions for legacy support.
 * Schedule timeout is used in a number of places and is a bit meaningless
 * in the context of Xen, as domains are not able to call these and all
 * their entry points into Xen should be asynchronous. If a domain wishes
 * to block for a while it should use Xen's sched_op/yield entry point.
 ****************************************************************************/
static void process_timeout(unsigned long __data)
{
    struct task_struct * p = (struct task_struct *) __data;
    wake_up(p);
}

long schedule_timeout(long timeout)
{
    struct timer_list timer;
    unsigned long expire;

    switch (timeout)
    {
    case MAX_SCHEDULE_TIMEOUT:
        /*
         * These two special cases are useful to be comfortable in the
         * caller.  Nothing more.  We could take MAX_SCHEDULE_TIMEOUT from
         * one of the negative values, but I'd like to return a valid
         * offset (>=0) to allow the caller to do everything it wants with
         * the retval.
         */
        schedule();
        goto out;
    default:
        /*
         * Another bit of PARANOID.  Note that the retval will be 0 since
         * no piece of kernel is supposed to check for a negative retval of
         * schedule_timeout() (since it should never happen anyway).  You
         * just have the printk() that will tell you if something has gone
         * wrong and where.
         */
        if (timeout < 0)
        {
            printk(KERN_ERR "schedule_timeout: wrong timeout "
                   "value %lx from %p\n", timeout,
                   __builtin_return_address(0));
            current->state = TASK_RUNNING;
            goto out;
        }
    }

    expire = timeout + jiffies;

    init_timer(&timer);
    timer.expires = expire;
    timer.data = (unsigned long) current;
    timer.function = process_timeout;

    add_timer(&timer);
    schedule();
    del_timer_sync(&timer);

    timeout = expire - jiffies;

 out:
    return timeout < 0 ? 0 : timeout;
}
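
/*
 * Minimal usage sketch (hypothetical caller -- none exists in this file):
 *
 *     current->state = TASK_INTERRUPTIBLE;
 *     remaining = schedule_timeout(timeout);   // timeout in jiffies
 *     // 'remaining' is the unexpired part of the timeout, or 0
 *
 * The caller must set current->state itself first; schedule_timeout()
 * only arms the wake-up timer and calls schedule().
 */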
/****************************************************************************
 * debug function
 ****************************************************************************/

static void dump_rqueue(struct list_head *queue, char *name)
{
    struct list_head *list;
    int loop = 0;
    struct task_struct *p;

    printk ("QUEUE %s %lx n: %lx, p: %lx\n", name, (unsigned long)queue,
            (unsigned long) queue->next, (unsigned long) queue->prev);
    list_for_each (list, queue) {
        p = list_entry(list, struct task_struct, run_list);
        printk("%3d: %3d has=%c mcua=0x%04lX ev=0x%08X av=0x%08X c=0x%X%08X\n",
               loop++, p->domain,
               p->has_cpu ? 'T':'F',
               p->mcu_advance, p->evt, p->avt,
               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
        printk(" l: %lx n: %lx p: %lx\n",
               (unsigned long)list, (unsigned long)list->next,
               (unsigned long)list->prev);
    }
    return;
}
void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
{
    u_long flags;
    s_time_t now = NOW();
    int i;

    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
    for (i = 0; i < smp_num_cpus; i++) {
        spin_lock_irqsave(&schedule_data[i].lock, flags);
        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
        dump_rqueue(&schedule_data[i].runqueue, "rq");
        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
    }
    return;
}
#ifdef SCHED_HISTO
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i, j;
    for (loop = 0; loop < smp_num_cpus; loop++) {
        j = 0;
        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", loop);
        for (i=0; i<BUCKETS; i++) {
            if (schedule_data[loop].hist[i]) {
                if (i < BUCKETS-1)
                    printk("%2d:[%7u] ", i, schedule_data[loop].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[loop].hist[i]);
                j++;
                if (!(j % 5)) printk("\n");
            }
        }
        printk("\n");
    }

}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i;
    for (loop = 0; loop < smp_num_cpus; loop++)
        for (i=0; i<BUCKETS; i++)
            schedule_data[loop].hist[i]=0;
}
#else
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
#endif