ia64/xen-unstable

xen/common/schedule.c @ 745:52f228c7096b

bitkeeper revision 1.445 (3f6b3cbfPoEFlaJ9_8AHKqhfHOuhyQ)

e100.h, schedule.c:
    Yet another e100/schedule_timeout fix.

author   kaf24@scramble.cl.cam.ac.uk
date     Fri Sep 19 17:28:31 2003 +0000 (2003-09-19)
parents  33cdf06502cb
children 7e78d76f7ba6

/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *
 * Description: CPU scheduling
 *              implements a Borrowed Virtual Time scheduler.
 *              (see Duda & Cheriton SOSP'99)
 */
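
/*
 * BVT bookkeeping kept per domain (see the fields used throughout this file):
 *   avt         - actual virtual time: advanced by mcu_advance for every
 *                 MCU of real CPU time the domain consumes.
 *   evt         - effective virtual time: equal to avt, or to avt - warp
 *                 while the domain is warped (see __calc_evt()).
 *   warp        - virtual-time boost given to a freshly woken domain.
 *   warpl/warpu - how long a domain may stay warped, and how long it must
 *                 have been unwarped before it may warp again.
 *   svt         - per-CPU system virtual time: the minimum avt found on the
 *                 runqueue; newly woken domains are brought up to it.
 * The scheduler always runs the runnable domain with the lowest evt.
 */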

#include <xeno/config.h>
#include <xeno/init.h>
#include <xeno/lib.h>
#include <xeno/sched.h>
#include <xeno/delay.h>
#include <xeno/event.h>
#include <xeno/time.h>
#include <xeno/ac_timer.h>
#include <xeno/interrupt.h>
#include <xeno/timer.h>
#include <xeno/perfc.h>

#undef SCHEDULER_TRACE
#ifdef SCHEDULER_TRACE
#define TRC(_x) _x
#else
#define TRC(_x)
#endif

/*#define SCHED_HISTO*/
#ifdef SCHED_HISTO
#define BUCKETS 31
#endif

#define MCU            (s32)MICROSECS(100)    /* Minimum unit */
#define MCU_ADVANCE    10                     /* default weight */
#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
static s32 ctx_allow = (s32)MILLISECS(5);     /* context switch allowance */
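
/*
 * Example: with MCU = 100us and the default MCU_ADVANCE of 10, a domain that
 * runs for 1ms of real time consumes 10 MCUs and so advances its avt by
 * 10 * 10 = 100.  A larger mcu_advance therefore means a faster-advancing avt
 * and a smaller share of the CPU.  ctx_allow is the minimum real time a
 * domain is allowed to run before reschedule() will preempt it.
 */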

typedef struct schedule_data_st
{
    spinlock_t          lock;           /* lock for protecting this */
    struct list_head    runqueue;       /* runqueue */
    struct task_struct *prev, *curr;    /* previous and current task */
    struct task_struct *idle;           /* idle task for this cpu */
    u32                 svt;            /* system virtual time. per CPU??? */
    struct ac_timer     s_timer;        /* scheduling timer */
#ifdef SCHED_HISTO
    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
#endif
} __cacheline_aligned schedule_data_t;
schedule_data_t schedule_data[NR_CPUS];

struct ac_timer v_timer;                /* scheduling timer */
static void virt_timer(unsigned long foo);
static void dump_rqueue(struct list_head *queue, char *name);
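
/*
 * Per-CPU runqueue handling.  A task is on the runqueue iff its
 * run_list.next pointer is non-NULL; __del_from_runqueue() clears it so that
 * __task_on_runqueue() can use it as the test.
 */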

static inline void __add_to_runqueue_head(struct task_struct * p)
{
    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
}

static inline void __add_to_runqueue_tail(struct task_struct * p)
{
    list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
}

static inline void __del_from_runqueue(struct task_struct * p)
{
    list_del(&p->run_list);
    p->run_list.next = NULL;
}

static inline int __task_on_runqueue(struct task_struct *p)
{
    return p->run_list.next != NULL;
}

#define next_domain(p) \
    list_entry((p)->run_list.next, struct task_struct, run_list)
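
/*
 * Calculate the effective virtual time of a domain.  While a domain is
 * warped (warpback set, warped for less than warpl, and previously unwarped
 * for more than warpu) its evt is its avt minus the warp value, which
 * temporarily favours it over its competitors; otherwise evt == avt.
 */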

static void __calc_evt(struct task_struct *p)
{
    s_time_t now = NOW();
    if ( p->warpback )
    {
        if ( ((now - p->warped) < p->warpl) &&
             ((now - p->uwarped) > p->warpu) )
        {
            /* allowed to warp */
            p->evt = p->avt - p->warp;
        }
        else
        {
            /* warped for too long -> unwarp */
            p->evt      = p->avt;
            p->uwarped  = now;
            p->warpback = 0;
        }
    }
    else
    {
        p->evt = p->avt;
    }
}

/******************************************************************************
 * Add and remove a domain
 ******************************************************************************/
void sched_add_domain(struct task_struct *p)
{
    p->state       = TASK_SUSPENDED;
    p->mcu_advance = MCU_ADVANCE;

    if ( p->domain == IDLE_DOMAIN_ID )
    {
        p->avt = 0xffffffff;
        p->evt = 0xffffffff;
        schedule_data[p->processor].idle = p;
    }
    else
    {
        /* set avt and evt to system virtual time */
        p->avt = schedule_data[p->processor].svt;
        p->evt = schedule_data[p->processor].svt;
        /* set some default values here */
        p->warpback = 0;
        p->warp     = 0;
        p->warpl    = 0;
        p->warpu    = 0;
    }
}

void sched_rem_domain(struct task_struct *p)
{
    p->state = TASK_DYING;
}

void init_idle_task(void)
{
    unsigned long flags;
    struct task_struct *p = current;
    spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
    p->has_cpu = 1;
    p->state   = TASK_RUNNING;
    if ( !__task_on_runqueue(p) )
        __add_to_runqueue_head(p);
    spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags);
}

/****************************************************************************
 * wake up a domain which had been sleeping
 ****************************************************************************/
int wake_up(struct task_struct *p)
{
    unsigned long flags;
    int ret = 0;

    spin_lock_irqsave(&schedule_data[p->processor].lock, flags);

    /* XXX RN: should we warp here? Might be a good idea to also boost a
     * domain which currently is unwarped and on the run queue and then
     * receives an event. */
    if ( __task_on_runqueue(p) ) goto out;

    p->state = TASK_RUNNING;
    __add_to_runqueue_head(p);

    /* set the BVT parameters */
    if (p->avt < schedule_data[p->processor].svt)
        p->avt = schedule_data[p->processor].svt;

    /* deal with warping here */
    p->warpback = 1;
    p->warped   = NOW();
    __calc_evt(p);

#ifdef SCHED_HISTO
    p->wokenup = NOW();
#endif

    ret = 1;
 out:
    spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags);
    return ret;
}

/****************************************************************************
 * Voluntarily yield the processor to another domain, until an event occurs.
 ****************************************************************************/
long do_yield(void)
{
    current->state = TASK_INTERRUPTIBLE;
    current->warpback = 0; /* XXX should only do this when blocking */
    schedule();
    return 0;
}

/****************************************************************************
 * Control the scheduler
 ****************************************************************************/
long sched_bvtctl(unsigned long c_allow)
{
    ctx_allow = c_allow;
    return 0;
}

/****************************************************************************
 * Adjust scheduling parameter for a given domain
 ****************************************************************************/
long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
                  unsigned long warpl, unsigned long warpu)
{
    struct task_struct *p;

    /* Sanity -- this can avoid divide-by-zero. */
    if ( mcu_adv == 0 )
        return -EINVAL;

    p = find_domain_by_id(dom);
    if ( p == NULL )
        return -ESRCH;

    spin_lock_irq(&schedule_data[p->processor].lock);
    p->mcu_advance = mcu_adv;
    spin_unlock_irq(&schedule_data[p->processor].lock);

    put_task_struct(p);

    return 0;
}

/****************************************************************************
 * cause a run through the scheduler when appropriate
 * Appropriate is:
 * - current task is idle task
 * - the current task has already run for its context switch allowance
 * Otherwise we do a run through the scheduler after the current task's
 * context switch allowance is over.
 ****************************************************************************/
void reschedule(struct task_struct *p)
{
    int cpu = p->processor;
    struct task_struct *curr;
    unsigned long flags;
    s_time_t now, min_time;

    if (p->has_cpu)
        return;

    spin_lock_irqsave(&schedule_data[cpu].lock, flags);

    now = NOW();
    curr = schedule_data[cpu].curr;
    /* domain should run at least for ctx_allow */
    min_time = curr->lastschd + ctx_allow;

    if ( is_idle_task(curr) || (min_time <= now) ) {
        /* reschedule */
        set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);

        spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);

        if (cpu != smp_processor_id())
            smp_send_event_check_cpu(cpu);
        return;
    }

    /* current hasn't been running for long enough -> reprogram timer.
     * but don't bother if timer would go off soon anyway */
    if (schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP) {
        mod_ac_timer(&schedule_data[cpu].s_timer, min_time);
    }

    spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
    return;
}

/****************************************************************************
 * The main function
 * - deschedule the current domain.
 * - pick a new domain.
 *   i.e., the domain with the lowest EVT.
 *   The runqueue should be ordered by EVT so that this is easy.
 ****************************************************************************/
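
/*
 * In outline, each pass through schedule() does the following:
 *   1. account the time the previous domain ran for and advance its avt/evt;
 *   2. dequeue it, requeueing it at the tail if it is still runnable;
 *   3. scan the runqueue for the lowest and second-lowest evt;
 *   4. work out how long the chosen domain may run (r_time) and arm s_timer;
 *   5. context switch if the chosen domain differs from the previous one.
 */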
asmlinkage void schedule(void)
{
    struct task_struct *prev, *next, *next_prime, *p;
    struct list_head *tmp;
    int this_cpu;
    s_time_t now;
    s32 r_time;     /* time for new dom to run */
    s32 ranfor;     /* assume we never run longer than 2.1s! */
    s32 mcus;
    u32 next_evt, next_prime_evt, min_avt;

    perfc_incrc(sched_run1);
 need_resched_back:
    perfc_incrc(sched_run2);

    prev = current;
    next = NULL;

    this_cpu = prev->processor;

    spin_lock_irq(&schedule_data[this_cpu].lock);

    now = NOW();

    /* remove timer, if still on list */
    rem_ac_timer(&schedule_data[this_cpu].s_timer);

    /* deschedule the current domain */

    ASSERT(!in_interrupt());
    ASSERT(__task_on_runqueue(prev));

    if (is_idle_task(prev))
        goto deschedule_done;

    /* do some accounting */
    ranfor = (s32)(now - prev->lastschd);
    ASSERT((ranfor>0));
    prev->cpu_time += ranfor;

    /* calculate mcu and update avt */
    mcus = ranfor/MCU;
    if (ranfor % MCU) mcus ++;  /* always round up */
    prev->avt += mcus * prev->mcu_advance;
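    /* e.g. ranfor = 250us with MCU = 100us gives mcus = 3 (always rounded
     * up), so avt advances by 3 * mcu_advance. */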

    /* recalculate evt */
    __calc_evt(prev);

    /* dequeue */
    __del_from_runqueue(prev);

    switch (prev->state) {
    case TASK_INTERRUPTIBLE:
        if (signal_pending(prev)) {
            prev->state = TASK_RUNNING; /* but has events pending */
            break;
        }
    case TASK_UNINTERRUPTIBLE:
    case TASK_WAIT:
    case TASK_DYING:
    case TASK_SUSPENDED:
    default:
        /* done if not running. Else, continue */
        goto deschedule_done;
    case TASK_RUNNING:;
    }

    /* requeue */
    __add_to_runqueue_tail(prev);

 deschedule_done:
    clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);

    /*
     * Pick a new domain
     */

    /* we should at least have the idle task */
    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));

    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task with the second lowest evt.
     * this code is O(n) but we expect n to be small.
     */
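    /*
     * 'next' is the candidate to run next; 'next_prime' (the runner-up) is
     * only needed to bound how long 'next' may run before we reschedule.
     */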
    next       = schedule_data[this_cpu].idle;
    next_prime = NULL;

    next_evt       = 0xffffffff;
    next_prime_evt = 0xffffffff;
    min_avt        = 0xffffffff;    /* to calculate svt */

    list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
        p = list_entry(tmp, struct task_struct, run_list);
        if (p->evt < next_evt) {
            next_prime     = next;
            next_prime_evt = next_evt;
            next           = p;
            next_evt       = p->evt;
        } else if (next_prime_evt == 0xffffffff) {
            next_prime_evt = p->evt;
            next_prime     = p;
        } else if (p->evt < next_prime_evt) {
            next_prime_evt = p->evt;
            next_prime     = p;
        }
        /* determine system virtual time */
        if (p->avt < min_avt)
            min_avt = p->avt;
    }
    ASSERT(next != NULL);   /* we should have at least the idle task */

    /* update system virtual time */
    if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
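
    /* avt/evt/svt are 32-bit quantities and only their relative values
     * matter, so periodically rebase them to keep them from wrapping. */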
    /* check for virtual time overrun on this cpu */
    if (schedule_data[this_cpu].svt >= 0xf0000000) {
        u_long t_flags;
        write_lock_irqsave(&tasklist_lock, t_flags);
        p = &idle0_task;
        do {
            if (p->processor == this_cpu && !is_idle_task(p)) {
                p->evt -= 0xe0000000;
                p->avt -= 0xe0000000;
            }
        } while ( (p = p->next_task) != &idle0_task );
        write_unlock_irqrestore(&tasklist_lock, t_flags);
        schedule_data[this_cpu].svt -= 0xe0000000;
    }

    /* work out time for next run through scheduler */
    if (is_idle_task(next)) {
        r_time = ctx_allow;
        goto sched_done;
    }

    if (next_prime == NULL || is_idle_task(next_prime)) {
        /* we have only one runnable task besides the idle task */
        r_time = 10 * ctx_allow;    /* RN: random constant */
        goto sched_done;
    }

    /*
     * if we are here we have two runnable tasks.
     * work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt, taking the context switch allowance into account.
     */
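    /* r_time is the headroom before next's evt would pass next_prime's evt,
     * padded with ctx_allow so the timer never fires sooner than the
     * context switch allowance. */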
    ASSERT(next_prime->evt >= next->evt);
    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;

 sched_done:
    ASSERT(r_time >= ctx_allow);

#ifndef NDEBUG
    if (r_time < ctx_allow) {
        printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time);
        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
    }
#endif

    prev->has_cpu = 0;
    next->has_cpu = 1;

    schedule_data[this_cpu].prev = prev;
    schedule_data[this_cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[this_cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[this_cpu].s_timer);

    spin_unlock_irq(&schedule_data[this_cpu].lock);

    /* done, switch tasks */
    if ( unlikely(prev == next) )
    {
        /* We won't go through the normal tail, so do this by hand */
        prev->policy &= ~SCHED_YIELD;
        goto same_process;
    }

    perfc_incrc(sched_ctx);
#ifdef SCHED_HISTO
    {
        ulong diff; /* should fit in 32 bits */
        if (!is_idle_task(next) && next->wokenup) {
            diff = (ulong)(now - next->wokenup);
            diff /= (ulong)MILLISECS(1);
            if (diff <= BUCKETS-2)  schedule_data[this_cpu].hist[diff]++;
            else                    schedule_data[this_cpu].hist[BUCKETS-1]++;
        }
        next->wokenup = (s_time_t)0;
    }
#endif

    prepare_to_switch();
    switch_to(prev, next);
    prev = schedule_data[this_cpu].prev;

    prev->policy &= ~SCHED_YIELD;
    if ( prev->state == TASK_DYING )
        put_task_struct(prev);

 same_process:
    /* update the domain's notion of time */
    update_dom_time(current->shared_info);

    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
        goto need_resched_back;
    }
    return;
}

/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct task_struct *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}

/*
 * The scheduler timer.
 */
static void sched_timer(unsigned long foo)
{
    int cpu = smp_processor_id();
    struct task_struct *curr = schedule_data[cpu].curr;
    /* cause a reschedule */
    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
    perfc_incrc(sched_irq);
}

/*
 * The Domain virtual time timer
 */
static void virt_timer(unsigned long foo)
{
    unsigned long cpu_mask = 0;
    struct task_struct *p;
    s_time_t now;

    /* send virtual timer interrupt */
    read_lock(&tasklist_lock);
    p = &idle0_task;
    do {
        if ( is_idle_task(p) ) continue;
        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
    }
    while ( (p = p->next_task) != &idle0_task );
    read_unlock(&tasklist_lock);
    guest_event_notify(cpu_mask);

    now = NOW();
    v_timer.expires = now + MILLISECS(20);
    add_ac_timer(&v_timer);
}

/*
 * Initialise the data structures
 */
void __init scheduler_init(void)
{
    int i;

    printk("Initialising schedulers\n");

    for ( i = 0; i < NR_CPUS; i++ )
    {
        INIT_LIST_HEAD(&schedule_data[i].runqueue);
        spin_lock_init(&schedule_data[i].lock);
        schedule_data[i].prev = &idle0_task;
        schedule_data[i].curr = &idle0_task;

        /* a timer for each CPU */
        init_ac_timer(&schedule_data[i].s_timer, i);
        schedule_data[i].s_timer.data = 2;
        schedule_data[i].s_timer.function = &sched_timer;
    }
    schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */
    init_ac_timer(&v_timer, 0);
    v_timer.data = 3;
    v_timer.function = &virt_timer;
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    printk("Start schedulers\n");
    sched_timer(0);
    virt_timer(0);
    smp_call_function((void *)sched_timer, NULL, 1, 1);
}

/****************************************************************************
 * Functions for legacy support.
 * Schedule timeout is used in a number of places and is a bit meaningless
 * in the context of Xen, as domains are not able to call these and all
 * their entry points into Xen should be asynchronous. If a domain wishes
 * to block for a while it should use Xen's sched_op/yield entry point.
 ****************************************************************************/

static void process_timeout(unsigned long __data)
{
    struct task_struct *p = (struct task_struct *) __data;
    wake_up(p);
}
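
/*
 * Sketch of how a legacy caller (e.g. a ported Linux driver) is expected to
 * use schedule_timeout(); the timeout is in jiffies and the caller is
 * assumed to have set its state first:
 *
 *     current->state = TASK_INTERRUPTIBLE;
 *     remaining = schedule_timeout(10);    (sleep for up to 10 jiffies)
 *
 * process_timeout() above is the timer handler that wakes the sleeper when
 * the timeout expires; an earlier wake_up() ends the sleep sooner, and the
 * unexpired balance is returned.
 */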

long schedule_timeout(long timeout)
{
    struct timer_list timer;
    unsigned long expire;

    switch (timeout)
    {
    case MAX_SCHEDULE_TIMEOUT:
        /* Sanity! This just wouldn't make sense. */
        if ( is_idle_task(current) )
            panic("Arbitrary sleep in idle task!");
        /*
         * These two special cases are useful to be comfortable in the caller.
         * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
         * negative values but I'd like to return a valid offset (>=0) to allow
         * the caller to do everything it wants with the retval.
         */
        schedule();
        goto out;

    default:
        /*
         * Another bit of paranoia. Note that the retval will be 0 since no
         * piece of kernel is supposed to do a check for a negative retval of
         * schedule_timeout() (since it should never happen anyway). You just
         * have the printk() that will tell you if something has gone wrong
         * and where.
         */
        if (timeout < 0)
        {
            printk(KERN_ERR "schedule_timeout: wrong timeout "
                   "value %lx from %p\n", timeout,
                   __builtin_return_address(0));
            current->state = TASK_RUNNING;
            goto out;
        }
    }

    expire = timeout + jiffies;

    init_timer(&timer);
    timer.expires  = expire;
    timer.data     = (unsigned long) current;
    timer.function = process_timeout;

    add_timer(&timer);
    schedule();
    del_timer_sync(&timer);

    timeout = expire - jiffies;

 out:
    return timeout < 0 ? 0 : timeout;
}

/****************************************************************************
 * debug function
 ****************************************************************************/

static void dump_rqueue(struct list_head *queue, char *name)
{
    struct list_head *list;
    int loop = 0;
    struct task_struct *p;

    printk("QUEUE %s %lx   n: %lx, p: %lx\n", name, (unsigned long)queue,
           (unsigned long) queue->next, (unsigned long) queue->prev);
    list_for_each (list, queue) {
        p = list_entry(list, struct task_struct, run_list);
        printk("%3d: %3d has=%c mcua=0x%04lX ev=0x%08X av=0x%08X c=0x%X%08X\n",
               loop++, p->domain,
               p->has_cpu ? 'T':'F',
               p->mcu_advance, p->evt, p->avt,
               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
        printk("         l: %lx n: %lx  p: %lx\n",
               (unsigned long)list, (unsigned long)list->next,
               (unsigned long)list->prev);
    }
    return;
}

void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
{
    u_long   flags;
    s_time_t now = NOW();
    int i;

    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
    for (i = 0; i < smp_num_cpus; i++) {
        spin_lock_irqsave(&schedule_data[i].lock, flags);
        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
        dump_rqueue(&schedule_data[i].runqueue, "rq");
        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
    }
    return;
}

#ifdef SCHED_HISTO
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i, j;
    for (loop = 0; loop < smp_num_cpus; loop++) {
        j = 0;
        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", loop);
        for (i=0; i<BUCKETS; i++) {
            if (schedule_data[loop].hist[i]) {
                if (i < BUCKETS-1)
                    printk("%2d:[%7u]    ", i, schedule_data[loop].hist[i]);
                else
                    printk(" >:[%7u]    ", schedule_data[loop].hist[i]);
                j++;
                if (!(j % 5)) printk("\n");
            }
        }
        printk("\n");
    }

}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i;
    for (loop = 0; loop < smp_num_cpus; loop++)
        for (i=0; i<BUCKETS; i++)
            schedule_data[loop].hist[i]=0;
}
#else
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
#endif