int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);
+/* Number of vcpus per struct sched_unit. */
+static unsigned int __read_mostly sched_granularity = 1;
+
/* Common lock for free cpus. */
static DEFINE_SPINLOCK(sched_free_cpu_lock);
if ( IS_ERR(domdata) )
return PTR_ERR(domdata);
- /* TODO: fix array size with multiple vcpus per unit. */
- unit_priv = xzalloc_array(void *, d->max_vcpus);
+ unit_priv = xzalloc_array(void *,
+ DIV_ROUND_UP(d->max_vcpus, sched_granularity));
if ( unit_priv == NULL )
{
sched_free_domdata(c->sched, domdata);
spin_unlock(&v->periodic_timer_lock);
}
-/*
- * The main function
- * - deschedule the current domain (scheduler independent).
- * - pick a new domain (scheduler dependent).
- */
-static void schedule(void)
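+/*
+ * Switch the scheduled unit of a scheduling resource: update the runstate
+ * accounting of prev and next and mark next as the running unit.
+ * Called with the scheduler lock held.
+ */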
+static void sched_switch_units(struct sched_resource *sr,
+ struct sched_unit *next, struct sched_unit *prev,
+ s_time_t now)
{
- struct sched_unit *prev = current->sched_unit, *next = NULL;
- s_time_t now;
- struct scheduler *sched;
- unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
- bool tasklet_work_scheduled = false;
- struct sched_resource *sd;
- spinlock_t *lock;
- int cpu = smp_processor_id();
+ sr->curr = next;
- ASSERT_NOT_IN_ATOMIC();
+ TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
+ now - prev->state_entry_time);
+ TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
+ (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
+ (now - next->state_entry_time) : 0, prev->next_time);
- SCHED_STAT_CRANK(sched_run);
+ ASSERT(prev->vcpu_list->runstate.state == RUNSTATE_running);
+
+ TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
+ next->domain->domain_id, next->unit_id);
+
+ sched_unit_runstate_change(prev, false, now);
+
+ ASSERT(next->vcpu_list->runstate.state != RUNSTATE_running);
+ sched_unit_runstate_change(next, true, now);
- sd = get_sched_res(cpu);
+ /*
+ * NB. Don't add any trace records from here until the actual context
+ * switch, else lost_records resume will not work properly.
+ */
+
+ ASSERT(!next->is_running);
+ next->vcpu_list->is_running = 1;
+ next->is_running = true;
+ next->state_entry_time = now;
+}
+
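+/* Check (and update) the tasklet scheduling status of a single cpu. */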
+static bool sched_tasklet_check_cpu(unsigned int cpu)
+{
+ unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
- /* Update tasklet scheduling status. */
switch ( *tasklet_work )
{
case TASKLET_enqueued:
set_bit(_TASKLET_scheduled, tasklet_work);
/* fallthrough */
case TASKLET_enqueued|TASKLET_scheduled:
- tasklet_work_scheduled = true;
+ return true;
break;
case TASKLET_scheduled:
clear_bit(_TASKLET_scheduled, tasklet_work);
+ /* fallthrough */
case 0:
- /*tasklet_work_scheduled = false;*/
+ /* return false; */
break;
default:
BUG();
}
- lock = pcpu_schedule_lock_irq(cpu);
+ return false;
+}
- now = NOW();
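+/* Check for pending tasklet work on any cpu of the scheduling resource. */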
+static bool sched_tasklet_check(unsigned int cpu)
+{
+ bool tasklet_work_scheduled = false;
+ const cpumask_t *mask = get_sched_res(cpu)->cpus;
+ unsigned int cpu_iter;
+
+ for_each_cpu ( cpu_iter, mask )
+ if ( sched_tasklet_check_cpu(cpu_iter) )
+ tasklet_work_scheduled = true;
- stop_timer(&sd->s_timer);
+ return tasklet_work_scheduled;
+}
+
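+/*
+ * Ask the scheduler for a decision on the scheduling resource and perform
+ * the unit switch in case a unit different from the current one was chosen.
+ */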
+static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
+ unsigned int cpu)
+{
+ struct scheduler *sched = per_cpu(scheduler, cpu);
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct sched_unit *next;
/* get policy-specific decision on scheduling... */
- sched = this_cpu(scheduler);
- sched->do_schedule(sched, prev, now, tasklet_work_scheduled);
+ sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
next = prev->next_task;
- sd->curr = next;
-
if ( prev->next_time >= 0 ) /* -ve means no limit */
- set_timer(&sd->s_timer, now + prev->next_time);
+ set_timer(&sr->s_timer, now + prev->next_time);
+
+ if ( likely(prev != next) )
+ sched_switch_units(sr, next, prev, now);
+
+ return next;
+}
+
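+/*
+ * Finish descheduling of the previous vcpu: clear its running flags after
+ * the context has been saved, notify the scheduler and finish a possibly
+ * pending migration of the unit.
+ */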
+static void context_saved(struct vcpu *prev)
+{
+ struct sched_unit *unit = prev->sched_unit;
+
+ /* Clear running flag /after/ writing context to memory. */
+ smp_wmb();
+
+ prev->is_running = 0;
+ unit->is_running = false;
+ unit->state_entry_time = NOW();
+
+ /* Check for migration request /after/ clearing running flag. */
+ smp_mb();
+
+ sched_context_saved(vcpu_scheduler(prev), unit);
- if ( unlikely(prev == next) )
+ sched_unit_migrate_finish(unit);
+}
+
+/*
+ * Rendezvous on end of context switch.
+ * As no lock is protecting this rendezvous function, we need to use atomic
+ * access functions on the counter.
+ * The counter will be 0 in case no rendezvous is needed. For the rendezvous
+ * case it is initialised to the number of cpus to rendezvous plus 1. Each
+ * member entering decrements the counter. The last one will decrement it to
+ * 1, perform the final needed action (a call of context_saved() if the vcpu
+ * was switched) and then set the counter to zero. The other members will
+ * spin until the counter becomes zero before they proceed.
+ */
+void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
+{
+ struct sched_unit *next = vnext->sched_unit;
+
+ if ( atomic_read(&next->rendezvous_out_cnt) )
+ {
+ int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
+
+ /* Call context_saved() before releasing other waiters. */
+ if ( cnt == 1 )
+ {
+ if ( vprev != vnext )
+ context_saved(vprev);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+ else
+ while ( atomic_read(&next->rendezvous_out_cnt) )
+ cpu_relax();
+ }
+ else if ( vprev != vnext )
+ context_saved(vprev);
+}
+
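+/*
+ * Switch to the selected vcpu: either continue running the previous one
+ * (in case no switch is needed) or perform the actual context switch.
+ */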
+static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
+ s_time_t now)
+{
+ if ( unlikely(vprev == vnext) )
{
- pcpu_schedule_unlock_irq(lock, cpu);
TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
- next->domain->domain_id, next->unit_id,
- now - prev->state_entry_time,
- prev->next_time);
- trace_continue_running(next->vcpu_list);
- return continue_running(prev->vcpu_list);
+ vnext->domain->domain_id, vnext->sched_unit->unit_id,
+ now - vprev->runstate.state_entry_time,
+ vprev->sched_unit->next_time);
+ sched_context_switched(vprev, vnext);
+ trace_continue_running(vnext);
+ return continue_running(vprev);
}
- TRACE_3D(TRC_SCHED_SWITCH_INFPREV,
- prev->domain->domain_id, prev->unit_id,
- now - prev->state_entry_time);
- TRACE_4D(TRC_SCHED_SWITCH_INFNEXT,
- next->domain->domain_id, next->unit_id,
- (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
- (now - next->state_entry_time) : 0,
- prev->next_time);
+ SCHED_STAT_CRANK(sched_ctx);
- ASSERT(prev->vcpu_list->runstate.state == RUNSTATE_running);
+ stop_timer(&vprev->periodic_timer);
- TRACE_4D(TRC_SCHED_SWITCH,
- prev->domain->domain_id, prev->unit_id,
- next->domain->domain_id, next->unit_id);
+ if ( vnext->sched_unit->migrated )
+ vcpu_move_irqs(vnext);
- sched_unit_runstate_change(prev, false, now);
+ vcpu_periodic_timer_work(vnext);
- ASSERT(next->vcpu_list->runstate.state != RUNSTATE_running);
- sched_unit_runstate_change(next, true, now);
+ context_switch(vprev, vnext);
+}
- /*
- * NB. Don't add any trace records from here until the actual context
- * switch, else lost_records resume will not work properly.
- */
+/*
+ * Rendezvous before taking a scheduling decision.
+ * Called with the schedule lock held, so all accesses to the rendezvous
+ * counter can be normal ones (no atomic accesses needed).
+ * The counter is initialised to the number of cpus to rendezvous. Each cpu
+ * entering will decrement the counter. When the counter reaches zero,
+ * do_schedule() is called and the rendezvous counter for leaving
+ * context_switch() is set. All other members will wait until the counter
+ * becomes zero, dropping and re-taking the schedule lock while waiting.
+ */
+static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
+ spinlock_t **lock, int cpu,
+ s_time_t now)
+{
+ struct sched_unit *next;
- ASSERT(!next->is_running);
- next->vcpu_list->is_running = 1;
- next->is_running = true;
- next->state_entry_time = now;
+ if ( !--prev->rendezvous_in_cnt )
+ {
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, sched_granularity + 1);
+ return next;
+ }
- pcpu_schedule_unlock_irq(lock, cpu);
+ while ( prev->rendezvous_in_cnt )
+ {
+ /*
+ * A cpu coming from idle might have tasklet work to do.
+ * In order to avoid deadlocks we can't do that here, but have to
+ * continue the idle loop.
+ * Undo the rendezvous_in_cnt decrement and schedule another call of
+ * sched_slave().
+ */
+ if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) )
+ {
+ struct vcpu *vprev = current;
- SCHED_STAT_CRANK(sched_ctx);
+ prev->rendezvous_in_cnt++;
+ atomic_set(&prev->rendezvous_out_cnt, 0);
+
+ pcpu_schedule_unlock_irq(*lock, cpu);
+
+ raise_softirq(SCHED_SLAVE_SOFTIRQ);
+ sched_context_switch(vprev, vprev, now);
+
+ return NULL; /* ARM only: context switching returns to the caller there. */
+ }
- stop_timer(&prev->vcpu_list->periodic_timer);
+ pcpu_schedule_unlock_irq(*lock, cpu);
- if ( next->migrated )
- vcpu_move_irqs(next->vcpu_list);
+ cpu_relax();
- vcpu_periodic_timer_work(next->vcpu_list);
+ *lock = pcpu_schedule_lock_irq(cpu);
+ }
- context_switch(prev->vcpu_list, next->vcpu_list);
+ return prev->next_task;
}
-void context_saved(struct vcpu *prev)
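+/*
+ * Handler for SCHED_SLAVE_SOFTIRQ: take part in an already started
+ * rendezvous of the cpu's scheduling resource and switch to the selected
+ * vcpu afterwards.
+ */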
+static void sched_slave(void)
{
- /* Clear running flag /after/ writing context to memory. */
- smp_wmb();
+ struct vcpu *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next;
+ s_time_t now;
+ spinlock_t *lock;
+ unsigned int cpu = smp_processor_id();
- prev->is_running = 0;
- prev->sched_unit->is_running = false;
- prev->sched_unit->state_entry_time = NOW();
+ ASSERT_NOT_IN_ATOMIC();
- /* Check for migration request /after/ clearing running flag. */
- smp_mb();
+ lock = pcpu_schedule_lock_irq(cpu);
- sched_context_saved(vcpu_scheduler(prev), prev->sched_unit);
+ now = NOW();
+
+ if ( !prev->rendezvous_in_cnt )
+ {
+ pcpu_schedule_unlock_irq(lock, cpu);
+ return;
+ }
+
+ stop_timer(&get_sched_res(cpu)->s_timer);
+
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+
+ pcpu_schedule_unlock_irq(lock, cpu);
- sched_unit_migrate_finish(prev->sched_unit);
+ sched_context_switch(vprev, next->vcpu_list, now);
+}
+
+/*
+ * The main scheduling function:
+ * - deschedule the currently running unit (scheduler independent).
+ * - pick a new unit to run (scheduler dependent).
+ */
+static void schedule(void)
+{
+ struct vcpu *vnext, *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next = NULL;
+ s_time_t now;
+ struct sched_resource *sr;
+ spinlock_t *lock;
+ int cpu = smp_processor_id();
+
+ ASSERT_NOT_IN_ATOMIC();
+
+ SCHED_STAT_CRANK(sched_run);
+
+ sr = get_sched_res(cpu);
+
+ lock = pcpu_schedule_lock_irq(cpu);
+
+ if ( prev->rendezvous_in_cnt )
+ {
+ /*
+ * We have a race: another cpu of our scheduling resource has already
+ * started a rendezvous, so sched_slave() has to run here instead. Do that
+ * now and raise the schedule softirq in order to re-enter schedule() later.
+ */
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ return sched_slave();
+ }
+
+ stop_timer(&sr->s_timer);
+
+ now = NOW();
+
+ if ( sched_granularity > 1 )
+ {
+ cpumask_t mask;
+
+ prev->rendezvous_in_cnt = sched_granularity;
+ cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu));
+ cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ);
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+ }
+ else
+ {
+ prev->rendezvous_in_cnt = 0;
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ vnext = next->vcpu_list;
+ sched_context_switch(vprev, vnext, now);
}
/* The scheduler timer: force a run through the scheduler */
if ( sr == NULL )
return -ENOMEM;
sr->master_cpu = cpu;
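+ /* Initially a scheduling resource covers just its master cpu. */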
+ sr->cpus = cpumask_of(cpu);
set_sched_res(cpu, sr);
per_cpu(scheduler, cpu) = &sched_idle_ops;
if ( idle_vcpu[cpu] == NULL )
return -ENOMEM;
+ idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
+
/*
* No need to allocate any scheduler data, as cpus coming online are
* free initially and the idle scheduler doesn't need any data areas
int i;
open_softirq(SCHEDULE_SOFTIRQ, schedule);
+ open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
for ( i = 0; i < NUM_SCHEDULERS; i++)
{