ia64/xen-unstable

xen/common/schedule.c @ 877:d1833d5b387b

bitkeeper revision 1.546.1.1 (3fa3e1b4UwJQtnD-lZcvMsbqR-XhSA)

sched hypercall unification -- tidying things up in anticipation of suspend/resume
author akw27@labyrinth.cl.cam.ac.uk
date Sat Nov 01 16:39:16 2003 +0000 (2003-11-01)
parents 8305b95a7772
children 61c3759bc7be
/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *
 * Description: CPU scheduling
 *              implements a Borrowed Virtual Time scheduler.
 *              (see Duda & Cheriton, SOSP '99)
 */
#include <xeno/config.h>
#include <xeno/init.h>
#include <xeno/lib.h>
#include <xeno/sched.h>
#include <xeno/delay.h>
#include <xeno/event.h>
#include <xeno/time.h>
#include <xeno/ac_timer.h>
#include <xeno/interrupt.h>
#include <xeno/timer.h>
#include <xeno/perfc.h>
#undef SCHEDULER_TRACE
#ifdef SCHEDULER_TRACE
#define TRC(_x) _x
#else
#define TRC(_x)
#endif

/*#define SCHED_HISTO*/
#ifdef SCHED_HISTO
#define BUCKETS 31
#endif
#define MCU          (s32)MICROSECS(100)   /* minimum charging unit */
#define MCU_ADVANCE  10                    /* default weight */
#define TIME_SLOP    (s32)MICROSECS(50)    /* allow time to slip a bit */
static s32 ctx_allow = (s32)MILLISECS(5);  /* context switch allowance */
typedef struct schedule_data_st
{
    spinlock_t          lock;           /* lock for protecting this */
    struct list_head    runqueue;       /* runqueue */
    struct task_struct *curr;           /* current task */
    struct task_struct *idle;           /* idle task for this cpu */
    u32                 svt;            /* system virtual time. per CPU??? */
    struct ac_timer     s_timer;        /* scheduling timer */
#ifdef SCHED_HISTO
    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
#endif
} __cacheline_aligned schedule_data_t;
schedule_data_t schedule_data[NR_CPUS];
struct ac_timer v_timer;                /* periodic domain virtual-time timer */
static void virt_timer(unsigned long foo);
static void dump_rqueue(struct list_head *queue, char *name);
static inline void __add_to_runqueue_head(struct task_struct *p)
{
    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
}

static inline void __add_to_runqueue_tail(struct task_struct *p)
{
    list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
}

static inline void __del_from_runqueue(struct task_struct *p)
{
    list_del(&p->run_list);
    p->run_list.next = NULL;
}

static inline int __task_on_runqueue(struct task_struct *p)
{
    return p->run_list.next != NULL;
}

#define next_domain(p) \
    list_entry((p)->run_list.next, struct task_struct, run_list)
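
/*
 * BVT bookkeeping (see Duda & Cheriton, SOSP '99): each domain has an
 * actual virtual time (avt), advanced by mcu_advance for every MCU of CPU
 * it consumes, and an effective virtual time (evt), which is what the
 * dispatcher actually compares. A domain that is allowed to "warp" runs
 * with evt = avt - warp, giving it temporary priority; warpl bounds how
 * long it may stay warped, and warpu is the minimum time it must remain
 * unwarped before it may warp again.
 */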
static void __calc_evt(struct task_struct *p)
{
    s_time_t now = NOW();

    if ( p->warpback )
    {
        if ( ((now - p->warped) < p->warpl) &&
             ((now - p->uwarped) > p->warpu) )
        {
            /* allowed to warp */
            p->evt = p->avt - p->warp;
        }
        else
        {
            /* warped for too long -> unwarp */
            p->evt      = p->avt;
            p->uwarped  = now;
            p->warpback = 0;
        }
    }
    else
    {
        p->evt = p->avt;
    }
}
/*
 * Add and remove a domain
 */
void sched_add_domain(struct task_struct *p)
{
    p->state       = TASK_SUSPENDED;
    p->mcu_advance = MCU_ADVANCE;

    if ( p->domain == IDLE_DOMAIN_ID )
    {
        p->avt = 0xffffffff;
        p->evt = 0xffffffff;
        schedule_data[p->processor].idle = p;
    }
    else
    {
        /* set avt and evt to system virtual time */
        p->avt = schedule_data[p->processor].svt;
        p->evt = schedule_data[p->processor].svt;
        /* set some default values here */
        p->warpback = 0;
        p->warp     = 0;
        p->warpl    = 0;
        p->warpu    = 0;
    }
}
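
/*
 * Note that sched_add_domain() starts a new domain at the current system
 * virtual time (svt), i.e. the minimum avt of any runnable domain on that
 * CPU, so a newcomer neither enters with a large backlog of "unused"
 * virtual time nor is starved behind domains with much lower avt.
 */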
void sched_rem_domain(struct task_struct *p)
{
    p->state = TASK_DYING;
}
void init_idle_task(void)
{
    unsigned long flags;
    struct task_struct *p = current;

    spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
    p->has_cpu = 1;
    p->state   = TASK_RUNNING;
    if ( !__task_on_runqueue(p) )
        __add_to_runqueue_head(p);
    spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags);
}
/*
 * Wake up a domain which had been sleeping.
 */
int wake_up(struct task_struct *p)
{
    unsigned long flags;
    int ret = 0;

    spin_lock_irqsave(&schedule_data[p->processor].lock, flags);

    /* XXX RN: should we warp here? Might be a good idea to also boost a
     * domain which currently is unwarped, is on the run queue, and then
     * receives an event. */
    if ( __task_on_runqueue(p) )
        goto out;

    p->state = TASK_RUNNING;
    __add_to_runqueue_head(p);

    /* set the BVT parameters */
    if ( p->avt < schedule_data[p->processor].svt )
        p->avt = schedule_data[p->processor].svt;

    /* deal with warping here */
    p->warpback = 1;
    p->warped   = NOW();
    __calc_evt(p);

#ifdef SCHED_HISTO
    p->wokenup = NOW();
#endif

    ret = 1;
 out:
    spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags);
    return ret;
}
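
/*
 * wake_up() first clamps the sleeper's avt up to svt, so a domain cannot
 * accumulate credit while blocked, and then grants it the warp boost
 * (warpback = 1, warped = NOW()). Because evt = avt - warp while warped,
 * a freshly woken, latency-sensitive domain tends to be dispatched ahead
 * of its plain fair-share position.
 */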
/*
 * Voluntarily yield the processor to another domain, until an event occurs.
 */
long do_yield(void)
{
    current->state    = TASK_INTERRUPTIBLE;
    current->warpback = 0; /* XXX should only do this when blocking */
    __enter_scheduler();
    return 0;
}
/*
 * Demultiplex scheduler-related hypercalls.
 */
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_exit:
    {
        kill_domain();
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
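
/*
 * Note: per the commit message ("sched hypercall unification"), this is
 * intended as the single demultiplexing point for scheduler hypercalls; a
 * guest wishing to yield the CPU or terminate itself is expected to issue
 * the sched_op hypercall with SCHEDOP_yield or SCHEDOP_exit respectively,
 * rather than using separate hypercalls for each operation.
 */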
/*
 * Control the scheduler
 */
long sched_bvtctl(unsigned long c_allow)
{
    ctx_allow = c_allow;
    return 0;
}
/*
 * Adjust scheduling parameter for a given domain
 */
long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
                  unsigned long warpl, unsigned long warpu)
{
    struct task_struct *p;

    /* Sanity -- this can avoid divide-by-zero. */
    if ( mcu_adv == 0 )
        return -EINVAL;

    p = find_domain_by_id(dom);
    if ( p == NULL )
        return -ESRCH;

    spin_lock_irq(&schedule_data[p->processor].lock);
    p->mcu_advance = mcu_adv;
    spin_unlock_irq(&schedule_data[p->processor].lock);

    put_task_struct(p);

    return 0;
}
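
/*
 * mcu_advance acts as an inverse weight: every MCU of real CPU time a
 * domain consumes advances its avt by mcu_advance, and the runnable domain
 * with the lowest evt runs next. For example, a domain with mcu_advance 5
 * accumulates virtual time at half the rate of one with mcu_advance 10, so
 * under contention it receives roughly twice the CPU share. (Only
 * mcu_advance is applied here; the warp parameters passed in are currently
 * ignored.)
 */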
/*
 * Cause a run through the scheduler when appropriate.
 * Appropriate is:
 * - current task is the idle task
 * - the current task has already run for its context-switch allowance
 * Otherwise we do a run through the scheduler after the current task's
 * context-switch allowance is over.
 */
void reschedule(struct task_struct *p)
{
    int cpu = p->processor;
    struct task_struct *curr;
    unsigned long flags;
    s_time_t now, min_time;

    if ( p->has_cpu )
        return;

    spin_lock_irqsave(&schedule_data[cpu].lock, flags);

    now  = NOW();
    curr = schedule_data[cpu].curr;
    /* domain should run at least for ctx_allow */
    min_time = curr->lastschd + ctx_allow;

    if ( is_idle_task(curr) || (min_time <= now) )
    {
        /* reschedule */
        set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);

        spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);

        if ( cpu != smp_processor_id() )
            smp_send_event_check_cpu(cpu);

        return;
    }

    /* current hasn't been running for long enough -> reprogram timer.
     * but don't bother if timer would go off soon anyway */
    if ( schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP )
        mod_ac_timer(&schedule_data[cpu].s_timer, min_time);

    spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
}
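
/*
 * The effect of the ctx_allow test in reschedule() above is that a freshly
 * dispatched domain is guaranteed at least ctx_allow of CPU time before a
 * wakeup of another domain can force it off the CPU; earlier wakeups merely
 * pull the scheduling timer forward to the end of that allowance.
 */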
/*
 * The main function
 * - deschedule the current domain.
 * - pick a new domain.
 *   i.e., the domain with the lowest EVT.
 *   The runqueue should be ordered by EVT so that this is easy.
 */
asmlinkage void __enter_scheduler(void)
{
    struct task_struct *prev, *next, *next_prime, *p;
    struct list_head   *tmp;
    int                 this_cpu;
    s_time_t            now;
    s32                 r_time;   /* time for new dom to run */
    s32                 ranfor;   /* assume we never run longer than 2.1s! */
    s32                 mcus;
    u32                 next_evt, next_prime_evt, min_avt;

    perfc_incrc(sched_run);

    prev = current;
    next = NULL;

    this_cpu = prev->processor;

    spin_lock_irq(&schedule_data[this_cpu].lock);

    now = NOW();

    /* remove timer, if still on list */
    rem_ac_timer(&schedule_data[this_cpu].s_timer);

    /* deschedule the current domain */

    ASSERT(!in_interrupt());
    ASSERT(__task_on_runqueue(prev));

    if ( is_idle_task(prev) )
        goto deschedule_done;

    /* do some accounting */
    ranfor = (s32)(now - prev->lastschd);
    ASSERT((ranfor > 0));
    prev->cpu_time += ranfor;

    /* calculate mcu and update avt */
    mcus = ranfor / MCU;
    if ( ranfor % MCU )
        mcus++;    /* always round up */
    prev->avt += mcus * prev->mcu_advance;

    /* recalculate evt */
    __calc_evt(prev);

    /* dequeue */
    __del_from_runqueue(prev);

    switch ( prev->state )
    {
    case TASK_INTERRUPTIBLE:
        if ( signal_pending(prev) )
        {
            prev->state = TASK_RUNNING; /* but has events pending */
            break;
        }
    case TASK_UNINTERRUPTIBLE:
    case TASK_WAIT:
    case TASK_DYING:
    case TASK_SUSPENDED:
    default:
        /* done if not running. Else, continue */
        goto deschedule_done;
    case TASK_RUNNING:;
    }

    /* requeue */
    __add_to_runqueue_tail(prev);

 deschedule_done:
    clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);

    /*
     * Pick a new domain
     */

    /* we should at least have the idle task */
    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));

    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task with the second-lowest evt.
     * this code is O(n) but we expect n to be small.
     */
    next       = schedule_data[this_cpu].idle;
    next_prime = NULL;

    next_evt       = 0xffffffff;
    next_prime_evt = 0xffffffff;
    min_avt        = 0xffffffff;    /* to calculate svt */

    list_for_each ( tmp, &schedule_data[this_cpu].runqueue )
    {
        p = list_entry(tmp, struct task_struct, run_list);
        if ( p->evt < next_evt )
        {
            next_prime     = next;
            next_prime_evt = next_evt;
            next           = p;
            next_evt       = p->evt;
        }
        else if ( next_prime_evt == 0xffffffff )
        {
            next_prime_evt = p->evt;
            next_prime     = p;
        }
        else if ( p->evt < next_prime_evt )
        {
            next_prime_evt = p->evt;
            next_prime     = p;
        }

        /* determine system virtual time */
        if ( p->avt < min_avt )
            min_avt = p->avt;
    }
    ASSERT(next != NULL);    /* we should have at least the idle task */

    /* update system virtual time */
    if ( min_avt != 0xffffffff )
        schedule_data[this_cpu].svt = min_avt;

    /* check for virtual time overrun on this cpu */
    if ( schedule_data[this_cpu].svt >= 0xf0000000 )
    {
        u_long t_flags;
        write_lock_irqsave(&tasklist_lock, t_flags);
        p = &idle0_task;
        do {
            if ( (p->processor == this_cpu) && !is_idle_task(p) )
            {
                p->evt -= 0xe0000000;
                p->avt -= 0xe0000000;
            }
        } while ( (p = p->next_task) != &idle0_task );
        write_unlock_irqrestore(&tasklist_lock, t_flags);
        schedule_data[this_cpu].svt -= 0xe0000000;
    }
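
    /*
     * Summary of the timeslice decision below: if only the idle task is
     * runnable, re-enter the scheduler after ctx_allow; if exactly one real
     * domain is runnable, give it an arbitrary 10 * ctx_allow; if two or
     * more are runnable, let 'next' run until its evt would catch up with
     * 'next_prime's evt, plus the context-switch allowance.
     */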
    /* work out time for next run through scheduler */
    if ( is_idle_task(next) )
    {
        r_time = ctx_allow;
        goto sched_done;
    }

    if ( (next_prime == NULL) || is_idle_task(next_prime) )
    {
        /* we have only one runnable task besides the idle task */
        r_time = 10 * ctx_allow;    /* RN: random constant */
        goto sched_done;
    }

    /*
     * if we are here we have two runnable tasks.
     * work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt, taking the context-switch allowance into account.
     */
    ASSERT(next_prime->evt >= next->evt);
    r_time = ((next_prime->evt - next->evt) / next->mcu_advance) + ctx_allow;

 sched_done:
    ASSERT(r_time >= ctx_allow);

#ifndef NDEBUG
    if ( r_time < ctx_allow )
    {
        printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time);
        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
    }
#endif
    prev->has_cpu = 0;
    next->has_cpu = 1;

    schedule_data[this_cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[this_cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[this_cpu].s_timer);

    spin_unlock_irq(&schedule_data[this_cpu].lock);

    /* done, switch tasks */
    if ( unlikely(prev == next) )
    {
        /* We won't go through the normal tail, so do this by hand */
        update_dom_time(prev->shared_info);
        return;
    }

    perfc_incrc(sched_ctx);
#ifdef SCHED_HISTO
    {
        ulong diff;    /* should fit in 32 bits */
        if ( !is_idle_task(next) && next->wokenup )
        {
            diff  = (ulong)(now - next->wokenup);
            diff /= (ulong)MILLISECS(1);
            if ( diff <= BUCKETS - 2 )
                schedule_data[this_cpu].hist[diff]++;
            else
                schedule_data[this_cpu].hist[BUCKETS - 1]++;
        }
        next->wokenup = (s_time_t)0;
    }
#endif

    switch_to(prev, next);

    if ( unlikely(prev->state == TASK_DYING) )
        put_task_struct(prev);

    update_dom_time(next->shared_info);

    schedule_tail(next);

    BUG();
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct task_struct *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}
/*
 * The scheduler timer.
 */
static void sched_timer(unsigned long foo)
{
    int cpu = smp_processor_id();
    struct task_struct *curr = schedule_data[cpu].curr;
    /* cause a reschedule */
    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
    perfc_incrc(sched_irq);
}
/*
 * The domain virtual-time timer
 */
static void virt_timer(unsigned long foo)
{
    unsigned long cpu_mask = 0;
    struct task_struct *p;
    s_time_t now;

    /* send virtual timer interrupt */
    read_lock(&tasklist_lock);
    p = &idle0_task;
    do {
        if ( is_idle_task(p) )
            continue;
        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
        if ( p->has_cpu )
            update_dom_time(p->shared_info);
    } while ( (p = p->next_task) != &idle0_task );
    read_unlock(&tasklist_lock);
    guest_event_notify(cpu_mask);

    now = NOW();
    v_timer.expires = now + MILLISECS(20);
    add_ac_timer(&v_timer);
}
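
/*
 * virt_timer() therefore delivers an _EVENT_TIMER notification to every
 * non-idle domain roughly every 20ms, independently of the per-CPU
 * scheduling timer; domains that are currently running also have their
 * shared-info time fields refreshed via update_dom_time() on each tick.
 */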
/*
 * Initialise the data structures
 */
void __init scheduler_init(void)
{
    int i;

    printk("Initialising schedulers\n");

    for ( i = 0; i < NR_CPUS; i++ )
    {
        INIT_LIST_HEAD(&schedule_data[i].runqueue);
        spin_lock_init(&schedule_data[i].lock);
        schedule_data[i].curr = &idle0_task;

        /* a timer for each CPU */
        init_ac_timer(&schedule_data[i].s_timer, i);
        schedule_data[i].s_timer.data     = 2;
        schedule_data[i].s_timer.function = &sched_timer;
    }

    schedule_data[0].idle = &idle0_task;    /* idle on CPU 0 is special */

    init_ac_timer(&v_timer, 0);
    v_timer.data     = 3;
    v_timer.function = &virt_timer;
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    printk("Start schedulers\n");
    sched_timer(0);
    virt_timer(0);
    smp_call_function((void *)sched_timer, NULL, 1, 1);
}
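
/*
 * The boot CPU kicks its own scheduler and arms the global virtual-time
 * timer directly; smp_call_function() then runs sched_timer() once on each
 * of the other CPUs. sched_timer() only flags _HYP_EVENT_NEED_RESCHED --
 * each CPU's s_timer is actually programmed the next time
 * __enter_scheduler() runs there.
 */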
static void process_timeout(unsigned long __data)
{
    struct task_struct *p = (struct task_struct *)__data;
    wake_up(p);
}
static void dump_rqueue(struct list_head *queue, char *name)
{
    struct list_head *list;
    int loop = 0;
    struct task_struct *p;

    printk("QUEUE %s %lx n: %lx, p: %lx\n", name, (unsigned long)queue,
           (unsigned long)queue->next, (unsigned long)queue->prev);
    list_for_each ( list, queue )
    {
        p = list_entry(list, struct task_struct, run_list);
        printk("%3d: %3d has=%c mcua=0x%04lX ev=0x%08X av=0x%08X c=0x%X%08X\n",
               loop++, p->domain,
               p->has_cpu ? 'T' : 'F',
               p->mcu_advance, p->evt, p->avt,
               (u32)(p->cpu_time >> 32), (u32)p->cpu_time);
        printk(" l: %lx n: %lx p: %lx\n",
               (unsigned long)list, (unsigned long)list->next,
               (unsigned long)list->prev);
    }
    return;
}
void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
{
    u_long   flags;
    s_time_t now = NOW();
    int      i;

    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
           (u32)MCU, (u32)ctx_allow, (u32)(now >> 32), (u32)now);
    for ( i = 0; i < smp_num_cpus; i++ )
    {
        spin_lock_irqsave(&schedule_data[i].lock, flags);
        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
        dump_rqueue(&schedule_data[i].runqueue, "rq");
        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
    }
    return;
}
#ifdef SCHED_HISTO
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i, j;

    for ( loop = 0; loop < smp_num_cpus; loop++ )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", loop);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[loop].hist[i] )
            {
                if ( i < BUCKETS - 1 )
                    printk("%2d:[%7u] ", i, schedule_data[loop].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[loop].hist[i]);
                j++;
                if ( !(j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int loop, i;
    for ( loop = 0; loop < smp_num_cpus; loop++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[loop].hist[i] = 0;
}
#else
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
#endif