xen/common/schedule.c @ 5517:10e9028c8e3d

bitkeeper revision 1.1718.1.10 (42b7b19aqOS_1M8I4pIOFjiTPYWV-g)

Merge bk://xenbits.xensource.com/xen-unstable.bk
into spot.cl.cam.ac.uk:C:/Documents and Settings/iap10/xen-unstable.bk

author   iap10@spot.cl.cam.ac.uk
date     Tue Jun 21 06:20:10 2005 +0000 (2005-06-21)
parents  6b7a4f646fef
children 4cadd9fa93d5

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */
/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <public/sched_ctl.h>
/* opt_sched: scheduler - default to Borrowed Virtual Time */
static char opt_sched[10] = "bvt";
string_param("sched", opt_sched);
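/*
 * Note: the active scheduler is selected by matching this string against
 * each scheduler's opt_name in scheduler_init() below; for example, booting
 * Xen with "sched=sedf" (illustrative command-line fragment) picks the sEDF
 * scheduler instead of the default BVT.
 */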
#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];
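/*
 * Per-CPU fields referenced in this file: curr and idle (current and idle
 * vcpu pointers), schedule_lock, s_timer (the preemption timer), tick (the
 * periodic-tick counter) and, when the histogram options above are enabled,
 * hist[BUCKETS].
 */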
extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};
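/*
 * A minimal sketch of how a further scheduler would be hooked in (the name
 * sched_foo_def is hypothetical): define a struct scheduler in its own
 * source file, declare it extern here, and add it to the array above, e.g.
 *
 *     extern struct scheduler sched_foo_def;
 *     static struct scheduler *schedulers[] = {
 *         &sched_bvt_def,
 *         &sched_sedf_def,
 *         &sched_foo_def,
 *         NULL
 *     };
 *
 * scheduler_init() then selects it when opt_sched matches its opt_name.
 */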
static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
        (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )       \
         : (typeof(ops.fn(__VA_ARGS__)))0 )
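/*
 * Illustrative use of SCHED_OP: SCHED_OP(alloc_task, v) calls
 * ops.alloc_task(v) if the active scheduler provides that hook, and
 * otherwise evaluates to a zero of the hook's return type (here int), so
 * callers need not NULL-check optional hooks themselves.
 */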
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

void free_domain_struct(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);
    for (i = 0; i < MAX_VIRT_CPUS; i++)
        if ( d->vcpu[i] )
            arch_free_vcpu_struct(d->vcpu[i]);

    xfree(d);
}
struct vcpu *alloc_vcpu_struct(
    struct domain *d, unsigned long vcpu)
{
    struct vcpu *v, *vc;

    ASSERT( d->vcpu[vcpu] == NULL );

    if ( (v = arch_alloc_vcpu_struct()) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    d->vcpu[vcpu] = v;
    v->domain = d;
    v->vcpu_id = vcpu;

    if ( SCHED_OP(alloc_task, v) < 0 )
        goto out;

    if ( vcpu != 0 )
    {
        v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];

        for_each_vcpu( d, vc )
        {
            if ( (vc->next_in_list == NULL) ||
                 (vc->next_in_list->vcpu_id > vcpu) )
                break;
        }
        v->next_in_list  = vc->next_in_list;
        vc->next_in_list = v;

        if (test_bit(_VCPUF_cpu_pinned, &vc->vcpu_flags)) {
            v->processor = (vc->processor + 1) % num_online_cpus();
            set_bit(_VCPUF_cpu_pinned, &v->vcpu_flags);
        } else {
            v->processor = (vc->processor + 1) % num_online_cpus();
        }
    }

    return v;

 out:
    d->vcpu[vcpu] = NULL;
    arch_free_vcpu_struct(v);

    return NULL;
}
struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_vcpu_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    xfree(d);
    return NULL;
}
/*
 * Add and remove a domain
 */
void sched_add_domain(struct vcpu *v)
{
    struct domain *d = v->domain;

    /* Initialise the per-domain timer. */
    init_ac_timer(&v->timer, dom_timer_fn, v, v->processor);

    if ( is_idle_task(d) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }
    else
    {
        /* Must be unpaused by control software to start execution. */
        set_bit(_VCPUF_ctrl_pause, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, d->domain_id, v->vcpu_id);
}
void sched_rem_domain(struct vcpu *v)
{
    rem_ac_timer(&v->timer);
    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}
void domain_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(!domain_runnable(v)) )
        SCHED_OP(sleep, v);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void domain_sleep_sync(struct vcpu *v)
{
    domain_sleep_nosync(v);

    while ( test_bit(_VCPUF_running, &v->vcpu_flags) && !domain_runnable(v) )
        cpu_relax();

    if ( cpu_isset(v->processor, v->domain->cpumask) )
        sync_lazy_execstate_cpu(v->processor);
}
void domain_wake(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(domain_runnable(v)) )
    {
        SCHED_OP(wake, v);
#ifdef WAKE_HISTO
        v->wokenup = NOW();
#endif
    }
    clear_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}
/* Block the currently-executing domain until a pertinent event occurs. */
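/*
 * The ordering below matters: event delivery is unmasked and _VCPUF_blocked
 * is set before event_pending() is checked, so an event that arrives in the
 * window is caught by the check (and the blocked flag cleared again) rather
 * than leaving the vcpu blocked with an event already pending.
 */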
long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}
/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}
/*
 * Demultiplex scheduler-related hypercalls.
 */
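/*
 * Sketch of the op encoding as used below (constants come from the public
 * headers; the particular reason value is illustrative only): the command is
 * held in the bits covered by SCHEDOP_cmdmask and, for SCHEDOP_shutdown, the
 * shutdown reason is packed above SCHEDOP_reasonshift, e.g.
 *
 *     op = SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift);
 */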
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 (op >> SCHEDOP_reasonshift));
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
/* Per-domain one-shot-timer hypercall. */
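/*
 * The timeout is an absolute system time (an s_time_t); passing a timeout
 * of zero cancels any pending timer rather than requesting an immediate
 * expiry.
 */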
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        rem_ac_timer(&v->timer);
    else
        set_ac_timer(&v->timer, timeout);

    return 0;
}
/** sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}
/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;
    int cpu;
#if NR_CPUS <= 32
    unsigned long have_lock;
#else
    unsigned long long have_lock;
#endif
    int succ;

#define __set_cpu_bit(cpu, data) data |= ((typeof(data))1)<<cpu
#define __get_cpu_bit(cpu, data) (data & ((typeof(data))1)<<cpu)
#define __clear_cpu_bits(data) data = ((typeof(data))0)

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /* acquire locks on all CPUs on which vcpus of this domain run */
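    /*
     * Lock-acquisition sketch: try-lock each needed schedule_lock in turn;
     * if any try-lock fails, drop every lock already held and restart from
     * scratch. Never blocking while holding another schedule_lock avoids
     * deadlock against other CPUs taking the same locks in a different
     * order. have_lock serves as a bitmap of the CPUs currently locked.
     */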
    do {
        succ = 0;
        __clear_cpu_bits(have_lock);
        for_each_vcpu(d, v) {
            cpu = v->processor;
            if (!__get_cpu_bit(cpu, have_lock)) {
                /* if we don't have a lock on this CPU: acquire it */
                if (spin_trylock(&schedule_data[cpu].schedule_lock)) {
                    /* we have this lock! */
                    __set_cpu_bit(cpu, have_lock);
                    succ = 1;
                } else {
                    /* we didn't get this lock -> free all other locks too! */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (__get_cpu_bit(cpu, have_lock))
                            spin_unlock(&schedule_data[cpu].schedule_lock);
                    /* and start from the beginning! */
                    succ = 0;
                    /* leave the for_each_vcpu loop */
                    break;
                }
            }
        }
    } while (!succ);
    //spin_lock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    //spin_unlock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock);
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (__get_cpu_bit(cpu, have_lock))
            spin_unlock(&schedule_data[cpu].schedule_lock);
    __clear_cpu_bits(have_lock);

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
    put_domain(d);
    return 0;
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu        *prev = current, *next = NULL;
    int                 cpu = prev->processor;
    s_time_t            now;
    struct task_slice   next_slice;
    s32                 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);

    /* Must be protected by the schedule_lock! */
    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    if ( unlikely(prev == next) )
        return continue_running(prev);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next->domain) && next->wokenup ) {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next->domain) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct vcpu *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}
/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu  *v   = current;
    unsigned int  cpu = v->processor;

    schedule_data[cpu].tick++;

    if ( !is_idle_task(v->domain) )
    {
        update_dom_time(v);
        send_guest_virq(v, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    set_ac_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_dom_time(v);
    send_guest_virq(v, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_ac_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_ac_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    schedule_data[0].curr = idle_task[0];
    schedule_data[0].idle = idle_task[0];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0);
    sched_add_domain(idle_task[0]);
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for_each_online_cpu ( k )
    {
        j = 0;
        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < NR_CPUS; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */