ia64/xen-unstable

view xen/common/schedule.c @ 8609:85d693e6f61a

Arch-specific per-vcpu info should be initialised to zero
when allocating a new vcpu structure, not copied from
CPU0's idle VCPU. Especially now that the idle VCPU itself
is dynamically allocated.

This should fix assertions people have been seeing in
getdomain_info_ctxt() relating to IOPL in eflags.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Jan 14 21:26:40 2006 +0100 (2006-01-14)
parents c1840ac1f05d
children 1ccc28e075ba
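
The change described above concerns how a new vcpu's arch-specific state is
initialised. A minimal sketch of the intent, for illustration only (the body
below is an assumption, not the actual patch; the real code lives in the
arch-specific alloc_vcpu_struct() implementation):

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    /* Zero the whole structure, including the arch-specific fields, rather
     * than copying them from CPU0's idle VCPU: the idle VCPU is now
     * dynamically allocated, and copied state (e.g. IOPL bits in eflags)
     * can trip later assertions. */
    memset(v, 0, sizeof(*v));

    return v;
}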
line source
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <public/sched.h>
#include <public/sched_ctl.h>
extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);
/* opt_sched: scheduler - default to SEDF */
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/
#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);
/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct timer t_timer[NR_CPUS];
void free_domain(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);

    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
        if ( d->vcpu[i] != NULL )
            free_vcpu_struct(d->vcpu[i]);

    xfree(d);
}

struct vcpu *alloc_vcpu(
    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
{
    struct vcpu *v;

    BUG_ON(d->vcpu[vcpu_id] != NULL);

    if ( (v = alloc_vcpu_struct(d, vcpu_id)) == NULL )
        return NULL;

    v->domain = d;
    v->vcpu_id = vcpu_id;
    v->processor = cpu_id;
    atomic_set(&v->pausecnt, 0);

    v->cpu_affinity = is_idle_domain(d) ?
        cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;

    d->vcpu[vcpu_id] = v;

    if ( SCHED_OP(alloc_task, v) < 0 )
    {
        d->vcpu[vcpu_id] = NULL;
        free_vcpu_struct(v);
        return NULL;
    }

    sched_add_domain(v);

    if ( vcpu_id != 0 )
    {
        v->vcpu_info = &d->shared_info->vcpu_info[vcpu_id];
        d->vcpu[v->vcpu_id-1]->next_in_list = v;
        set_bit(_VCPUF_down, &v->vcpu_flags);
    }

    return v;
}
struct domain *alloc_domain(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) != NULL )
        memset(d, 0, sizeof(*d));

    return d;
}

/*
 * Add and remove a domain
 */
void sched_add_domain(struct vcpu *v)
{
    /* Initialise the per-domain timer. */
    init_timer(&v->timer, dom_timer_fn, v, v->processor);

    if ( is_idle_vcpu(v) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
}

void sched_rem_domain(struct vcpu *v)
{
    kill_timer(&v->timer);
    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);
    if ( likely(!vcpu_runnable(v)) )
        SCHED_OP(sleep, v);
    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && test_bit(_VCPUF_running, &v->vcpu_flags) )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);
    if ( likely(vcpu_runnable(v)) )
    {
        SCHED_OP(wake, v);
        v->wokenup = NOW();
    }
    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    return SCHED_OP(set_affinity, v, affinity);
}
/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

long do_sched_op(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        stop_timer(&v->timer);
    else
        set_timer(&v->timer, timeout);

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}
/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;

    if ( (cmd->sched_id != ops.sched_id) ||
         ((cmd->direction != SCHED_INFO_PUT) &&
          (cmd->direction != SCHED_INFO_GET)) )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     */
    for_each_vcpu ( d, v )
    {
        if ( v == current )
            vcpu_schedule_lock_irq(v);
        else
            vcpu_pause(v);
    }

    SCHED_OP(adjdom, d, cmd);

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    for_each_vcpu ( d, v )
    {
        if ( v == current )
            vcpu_schedule_unlock_irq(v);
        else
            vcpu_unpause(v);
    }

    put_domain(d);

    return 0;
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu        *prev = current, *next = NULL;
    int                 cpu = smp_processor_id();
    s_time_t            now = NOW();
    struct task_slice   next_slice;
    s32                 r_time;     /* time for new dom to run */

    ASSERT(!in_irq());

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    stop_timer(&schedule_data[cpu].s_timer);

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    set_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id, now - prev->lastschd);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id, now - next->wokenup, r_time);
    /*
     * Logic of wokenup field in domain struct:
     * Used to calculate "waiting time", which is the time that a domain
     * spends being "runnable", but not actually running. wokenup is set
     * whenever a domain wakes from sleeping. However, if wokenup is not
     * also set here then a preempted runnable domain will get a screwed up
     * "waiting time" value next time it is scheduled.
     */
    prev->wokenup = now;
#if defined(WAKE_HISTO)
    if ( !is_idle_vcpu(next) && next->wokenup )
    {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_vcpu(next) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    perfc_incrc(sched_ctx);

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_vcpu(next) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}
/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu  *v   = current;
    unsigned int  cpu = smp_processor_id();

    schedule_data[cpu].tick++;

    if ( !is_idle_vcpu(v) )
    {
        update_dom_time(v);
        send_guest_virq(v, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_dom_time(v);
    send_guest_virq(v, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i, rc;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( idle_vcpu[0] != NULL )
    {
        schedule_data[0].curr = idle_vcpu[0];
        schedule_data[0].idle = idle_vcpu[0];

        rc = SCHED_OP(alloc_task, idle_vcpu[0]);
        BUG_ON(rc < 0);

        sched_add_domain(idle_vcpu[0]);
    }
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state,i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for_each_online_cpu ( k )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < NR_CPUS; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */