Move sched*.c and cpupool.c to a new directory common/sched. The scheduler
sources lose their "sched_" prefix in the process (e.g. sched_arinc653.c
becomes arinc653.c) and schedule.c becomes core.c.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Dario Faggioli <dfaggioli@suse.com>
M: Stewart Hildebrand <stewart.hildebrand@dornerworks.com>
S: Supported
L: xen-devel@dornerworks.com
-F: xen/common/sched_arinc653.c
+F: xen/common/sched/arinc653.c
F: tools/libxc/xc_arinc653.c
ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE
M: Juergen Gross <jgross@suse.com>
M: Dario Faggioli <dfaggioli@suse.com>
S: Supported
-F: xen/common/cpupool.c
+F: xen/common/sched/cpupool.c
DEVICE TREE
M: Stefano Stabellini <sstabellini@kernel.org>
M: Dario Faggioli <dfaggioli@suse.com>
M: Meng Xu <mengxu@cis.upenn.edu>
S: Supported
-F: xen/common/sched_rt.c
+F: xen/common/sched/rt.c
SCHEDULING
M: George Dunlap <george.dunlap@eu.citrix.com>
M: Dario Faggioli <dfaggioli@suse.com>
S: Supported
-F: xen/common/sched*
+F: xen/common/sched/
SEABIOS UPSTREAM
M: Wei Liu <wl@xen.org>
If unsure, say N.
-menu "Schedulers"
- visible if EXPERT = "y"
-
-config SCHED_CREDIT
- bool "Credit scheduler support"
- default y
- ---help---
- The traditional credit scheduler is a general purpose scheduler.
-
-config SCHED_CREDIT2
- bool "Credit2 scheduler support"
- default y
- ---help---
- The credit2 scheduler is a general purpose scheduler that is
- optimized for lower latency and higher VM density.
-
-config SCHED_RTDS
- bool "RTDS scheduler support (EXPERIMENTAL)"
- default y
- ---help---
- The RTDS scheduler is a soft and firm real-time scheduler for
- multicore, targeted for embedded, automotive, graphics and gaming
- in the cloud, and general low-latency workloads.
-
-config SCHED_ARINC653
- bool "ARINC653 scheduler support (EXPERIMENTAL)"
- default DEBUG
- ---help---
- The ARINC653 scheduler is a hard real-time scheduler for single
- cores, targeted for avionics, drones, and medical devices.
-
-config SCHED_NULL
- bool "Null scheduler support (EXPERIMENTAL)"
- default y
- ---help---
- The null scheduler is a static, zero overhead scheduler,
- for when there always are less vCPUs than pCPUs, typically
- in embedded or HPC scenarios.
-
-choice
- prompt "Default Scheduler?"
- default SCHED_CREDIT2_DEFAULT
-
- config SCHED_CREDIT_DEFAULT
- bool "Credit Scheduler" if SCHED_CREDIT
- config SCHED_CREDIT2_DEFAULT
- bool "Credit2 Scheduler" if SCHED_CREDIT2
- config SCHED_RTDS_DEFAULT
- bool "RT Scheduler" if SCHED_RTDS
- config SCHED_ARINC653_DEFAULT
- bool "ARINC653 Scheduler" if SCHED_ARINC653
- config SCHED_NULL_DEFAULT
- bool "Null Scheduler" if SCHED_NULL
-endchoice
-
-config SCHED_DEFAULT
- string
- default "credit" if SCHED_CREDIT_DEFAULT
- default "credit2" if SCHED_CREDIT2_DEFAULT
- default "rtds" if SCHED_RTDS_DEFAULT
- default "arinc653" if SCHED_ARINC653_DEFAULT
- default "null" if SCHED_NULL_DEFAULT
- default "credit2"
-
-endmenu
+source "common/sched/Kconfig"
config CRYPTO
bool
obj-y += bsearch.o
obj-$(CONFIG_CORE_PARKING) += core_parking.o
obj-y += cpu.o
-obj-y += cpupool.o
obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o
obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o
obj-y += domctl.o
obj-y += rbtree.o
obj-y += rcupdate.o
obj-y += rwlock.o
-obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o
-obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o
-obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o
-obj-$(CONFIG_SCHED_RTDS) += sched_rt.o
-obj-$(CONFIG_SCHED_NULL) += sched_null.o
-obj-y += schedule.o
obj-y += shutdown.o
obj-y += softirq.o
obj-y += sort.o
extra-y := symbols-dummy.o
subdir-$(CONFIG_COVERAGE) += coverage
+subdir-y += sched
subdir-$(CONFIG_UBSAN) += ubsan
subdir-$(CONFIG_NEEDS_LIBELF) += libelf
--- a/xen/common/compat/schedule.c
+++ /dev/null
-/****************************************************************************
- * schedule.c
- *
- */
-
-#include <compat/sched.h>
-
-#define COMPAT
-#define ret_t int
-
-#define do_sched_op compat_sched_op
-
-#define xen_sched_pin_override sched_pin_override
-CHECK_sched_pin_override;
-#undef xen_sched_pin_override
-
-#define xen_sched_shutdown sched_shutdown
-CHECK_sched_shutdown;
-#undef xen_sched_shutdown
-
-#define xen_sched_remote_shutdown sched_remote_shutdown
-CHECK_sched_remote_shutdown;
-#undef xen_sched_remote_shutdown
-
-static int compat_poll(struct compat_sched_poll *compat)
-{
- struct sched_poll native;
-
-#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
- guest_from_compat_handle((_d_)->ports, (_s_)->ports)
- XLAT_sched_poll(&native, compat);
-#undef XLAT_sched_poll_HNDL_ports
-
- return do_poll(&native);
-}
-
-#define do_poll compat_poll
-#define sched_poll compat_sched_poll
-
-#include "../schedule.c"
-
-int compat_set_timer_op(u32 lo, s32 hi)
-{
- return do_set_timer_op(((s64)hi << 32) | lo);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
--- a/xen/common/cpupool.c
+++ /dev/null
-/******************************************************************************
- * cpupool.c
- *
- * Generic cpupool-handling functions.
- *
- * Cpupools are a feature to have configurable scheduling domains. Each
- * cpupool runs an own scheduler on a dedicated set of physical cpus.
- * A domain is bound to one cpupool at any time, but it can be moved to
- * another cpupool.
- *
- * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
- */
-
-#include <xen/lib.h>
-#include <xen/init.h>
-#include <xen/cpumask.h>
-#include <xen/percpu.h>
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/warning.h>
-#include <xen/keyhandler.h>
-#include <xen/cpu.h>
-
-#define for_each_cpupool(ptr) \
- for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
-
-struct cpupool *cpupool0; /* Initial cpupool with Dom0 */
-cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */
-
-static struct cpupool *cpupool_list; /* linked list, sorted by poolid */
-
-static int cpupool_moving_cpu = -1;
-static struct cpupool *cpupool_cpu_moving = NULL;
-static cpumask_t cpupool_locked_cpus;
-
-static DEFINE_SPINLOCK(cpupool_lock);
-
-static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu;
-static unsigned int __read_mostly sched_granularity = 1;
-
-#ifdef CONFIG_HAS_SCHED_GRANULARITY
-static int __init sched_select_granularity(const char *str)
-{
- if ( strcmp("cpu", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_cpu;
- else if ( strcmp("core", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_core;
- else if ( strcmp("socket", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_socket;
- else
- return -EINVAL;
-
- return 0;
-}
-custom_param("sched-gran", sched_select_granularity);
-#endif
-
-static unsigned int __init cpupool_check_granularity(void)
-{
- unsigned int cpu;
- unsigned int siblings, gran = 0;
-
- if ( opt_sched_granularity == SCHED_GRAN_cpu )
- return 1;
-
- for_each_online_cpu ( cpu )
- {
- siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity,
- cpu));
- if ( gran == 0 )
- gran = siblings;
- else if ( gran != siblings )
- return 0;
- }
-
- sched_disable_smt_switching = true;
-
- return gran;
-}
-
-/* Setup data for selected scheduler granularity. */
-static void __init cpupool_gran_init(void)
-{
- unsigned int gran = 0;
- const char *fallback = NULL;
-
- while ( gran == 0 )
- {
- gran = cpupool_check_granularity();
-
- if ( gran == 0 )
- {
- switch ( opt_sched_granularity )
- {
- case SCHED_GRAN_core:
- opt_sched_granularity = SCHED_GRAN_cpu;
- fallback = "Asymmetric cpu configuration.\n"
- "Falling back to sched-gran=cpu.\n";
- break;
- case SCHED_GRAN_socket:
- opt_sched_granularity = SCHED_GRAN_core;
- fallback = "Asymmetric cpu configuration.\n"
- "Falling back to sched-gran=core.\n";
- break;
- default:
- ASSERT_UNREACHABLE();
- break;
- }
- }
- }
-
- if ( fallback )
- warning_add(fallback);
-
- sched_granularity = gran;
-}
-
-unsigned int cpupool_get_granularity(const struct cpupool *c)
-{
- return c ? sched_granularity : 1;
-}
-
-static void free_cpupool_struct(struct cpupool *c)
-{
- if ( c )
- {
- free_cpumask_var(c->res_valid);
- free_cpumask_var(c->cpu_valid);
- }
- xfree(c);
-}
-
-static struct cpupool *alloc_cpupool_struct(void)
-{
- struct cpupool *c = xzalloc(struct cpupool);
-
- if ( !c )
- return NULL;
-
- if ( !zalloc_cpumask_var(&c->cpu_valid) ||
- !zalloc_cpumask_var(&c->res_valid) )
- {
- free_cpupool_struct(c);
- c = NULL;
- }
-
- return c;
-}
-
-/*
- * find a cpupool by it's id. to be called with cpupool lock held
- * if exact is not specified, the first cpupool with an id larger or equal to
- * the searched id is returned
- * returns NULL if not found.
- */
-static struct cpupool *__cpupool_find_by_id(int id, int exact)
-{
- struct cpupool **q;
-
- ASSERT(spin_is_locked(&cpupool_lock));
-
- for_each_cpupool(q)
- if ( (*q)->cpupool_id >= id )
- break;
-
- return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL;
-}
-
-static struct cpupool *cpupool_find_by_id(int poolid)
-{
- return __cpupool_find_by_id(poolid, 1);
-}
-
-static struct cpupool *__cpupool_get_by_id(int poolid, int exact)
-{
- struct cpupool *c;
- spin_lock(&cpupool_lock);
- c = __cpupool_find_by_id(poolid, exact);
- if ( c != NULL )
- atomic_inc(&c->refcnt);
- spin_unlock(&cpupool_lock);
- return c;
-}
-
-struct cpupool *cpupool_get_by_id(int poolid)
-{
- return __cpupool_get_by_id(poolid, 1);
-}
-
-static struct cpupool *cpupool_get_next_by_id(int poolid)
-{
- return __cpupool_get_by_id(poolid, 0);
-}
-
-void cpupool_put(struct cpupool *pool)
-{
- if ( !atomic_dec_and_test(&pool->refcnt) )
- return;
- scheduler_free(pool->sched);
- free_cpupool_struct(pool);
-}
-
-/*
- * create a new cpupool with specified poolid and scheduler
- * returns pointer to new cpupool structure if okay, NULL else
- * possible failures:
- * - no memory
- * - poolid already used
- * - unknown scheduler
- */
-static struct cpupool *cpupool_create(
- int poolid, unsigned int sched_id, int *perr)
-{
- struct cpupool *c;
- struct cpupool **q;
- int last = 0;
-
- *perr = -ENOMEM;
- if ( (c = alloc_cpupool_struct()) == NULL )
- return NULL;
-
- /* One reference for caller, one reference for cpupool_destroy(). */
- atomic_set(&c->refcnt, 2);
-
- debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id);
-
- spin_lock(&cpupool_lock);
-
- for_each_cpupool(q)
- {
- last = (*q)->cpupool_id;
- if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
- break;
- }
- if ( *q != NULL )
- {
- if ( (*q)->cpupool_id == poolid )
- {
- *perr = -EEXIST;
- goto err;
- }
- c->next = *q;
- }
-
- c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
- if ( poolid == 0 )
- {
- c->sched = scheduler_get_default();
- }
- else
- {
- c->sched = scheduler_alloc(sched_id, perr);
- if ( c->sched == NULL )
- goto err;
- }
- c->gran = opt_sched_granularity;
-
- *q = c;
-
- spin_unlock(&cpupool_lock);
-
- debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n",
- c->cpupool_id, c->sched->name, c->sched->opt_name);
-
- *perr = 0;
- return c;
-
- err:
- spin_unlock(&cpupool_lock);
- free_cpupool_struct(c);
- return NULL;
-}
-/*
- * destroys the given cpupool
- * returns 0 on success, 1 else
- * possible failures:
- * - pool still in use
- * - cpus still assigned to pool
- * - pool not in list
- */
-static int cpupool_destroy(struct cpupool *c)
-{
- struct cpupool **q;
-
- spin_lock(&cpupool_lock);
- for_each_cpupool(q)
- if ( *q == c )
- break;
- if ( *q != c )
- {
- spin_unlock(&cpupool_lock);
- return -ENOENT;
- }
- if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) )
- {
- spin_unlock(&cpupool_lock);
- return -EBUSY;
- }
- *q = c->next;
- spin_unlock(&cpupool_lock);
-
- cpupool_put(c);
-
- debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id);
- return 0;
-}
-
-/*
- * Move domain to another cpupool
- */
-static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c)
-{
- int ret;
-
- if ( unlikely(d->cpupool == c) )
- return 0;
-
- d->cpupool->n_dom--;
- ret = sched_move_domain(d, c);
- if ( ret )
- d->cpupool->n_dom++;
- else
- c->n_dom++;
-
- return ret;
-}
-int cpupool_move_domain(struct domain *d, struct cpupool *c)
-{
- int ret;
-
- spin_lock(&cpupool_lock);
-
- ret = cpupool_move_domain_locked(d, c);
-
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-/*
- * assign a specific cpu to a cpupool
- * cpupool_lock must be held
- */
-static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
-{
- int ret;
- struct domain *d;
- const cpumask_t *cpus;
-
- cpus = sched_get_opt_cpumask(c->gran, cpu);
-
- if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
- return -EADDRNOTAVAIL;
- ret = schedule_cpu_add(cpumask_first(cpus), c);
- if ( ret )
- return ret;
-
- rcu_read_lock(&sched_res_rculock);
-
- cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
- if (cpupool_moving_cpu == cpu)
- {
- cpupool_moving_cpu = -1;
- cpupool_put(cpupool_cpu_moving);
- cpupool_cpu_moving = NULL;
- }
- cpumask_or(c->cpu_valid, c->cpu_valid, cpus);
- cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
- rcu_read_unlock(&sched_res_rculock);
-
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool(d, c)
- {
- domain_update_node_affinity(d);
- }
- rcu_read_unlock(&domlist_read_lock);
-
- return 0;
-}
-
-static int cpupool_unassign_cpu_finish(struct cpupool *c)
-{
- int cpu = cpupool_moving_cpu;
- const cpumask_t *cpus;
- struct domain *d;
- int ret;
-
- if ( c != cpupool_cpu_moving )
- return -EADDRNOTAVAIL;
-
- /*
- * We need this for scanning the domain list, both in
- * cpu_disable_scheduler(), and at the bottom of this function.
- */
- rcu_read_lock(&domlist_read_lock);
- ret = cpu_disable_scheduler(cpu);
-
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
- cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
-
- /*
- * cpu_disable_scheduler() returning an error doesn't require resetting
- * cpupool_free_cpus' cpu bit. All error cases should be of temporary
- * nature and tools will retry the operation. Even if the number of
- * retries may be limited, the in-between state can easily be repaired
- * by adding the cpu to the cpupool again.
- */
- if ( !ret )
- {
- ret = schedule_cpu_rm(cpu);
- if ( ret )
- cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
- else
- {
- cpupool_moving_cpu = -1;
- cpupool_put(cpupool_cpu_moving);
- cpupool_cpu_moving = NULL;
- }
- }
- rcu_read_unlock(&sched_res_rculock);
-
- for_each_domain_in_cpupool(d, c)
- {
- domain_update_node_affinity(d);
- }
- rcu_read_unlock(&domlist_read_lock);
-
- return ret;
-}
-
-static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu)
-{
- int ret;
- struct domain *d;
- const cpumask_t *cpus;
-
- spin_lock(&cpupool_lock);
- ret = -EADDRNOTAVAIL;
- if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid))
- && (cpu != cpupool_moving_cpu) )
- goto out;
-
- ret = 0;
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
-
- if ( (c->n_dom > 0) &&
- (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) &&
- (cpu != cpupool_moving_cpu) )
- {
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool(d, c)
- {
- if ( !d->is_dying && system_state == SYS_STATE_active )
- {
- ret = -EBUSY;
- break;
- }
- ret = cpupool_move_domain_locked(d, cpupool0);
- if ( ret )
- break;
- }
- rcu_read_unlock(&domlist_read_lock);
- if ( ret )
- goto out;
- }
- cpupool_moving_cpu = cpu;
- atomic_inc(&c->refcnt);
- cpupool_cpu_moving = c;
- cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus);
- cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
- rcu_read_unlock(&domlist_read_lock);
-out:
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-static long cpupool_unassign_cpu_helper(void *info)
-{
- struct cpupool *c = info;
- long ret;
-
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
- cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
- spin_lock(&cpupool_lock);
-
- ret = cpupool_unassign_cpu_finish(c);
-
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);
-
- return ret;
-}
-
-/*
- * unassign a specific cpu from a cpupool
- * we must be sure not to run on the cpu to be unassigned! to achieve this
- * the main functionality is performed via continue_hypercall_on_cpu on a
- * specific cpu.
- * if the cpu to be removed is the last one of the cpupool no active domain
- * must be bound to the cpupool. dying domains are moved to cpupool0 as they
- * might be zombies.
- * possible failures:
- * - last cpu and still active domains in cpupool
- * - cpu just being unplugged
- */
-static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
-{
- int work_cpu;
- int ret;
- unsigned int master_cpu;
-
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
- c->cpupool_id, cpu);
-
- master_cpu = sched_get_resource_cpu(cpu);
- ret = cpupool_unassign_cpu_start(c, master_cpu);
- if ( ret )
- {
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
- c->cpupool_id, cpu, ret);
- return ret;
- }
-
- work_cpu = sched_get_resource_cpu(smp_processor_id());
- if ( work_cpu == master_cpu )
- {
- work_cpu = cpumask_first(cpupool0->cpu_valid);
- if ( work_cpu == master_cpu )
- work_cpu = cpumask_last(cpupool0->cpu_valid);
- }
- return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
-}
-
-/*
- * add a new domain to a cpupool
- * possible failures:
- * - pool does not exist
- * - no cpu assigned to pool
- */
-int cpupool_add_domain(struct domain *d, int poolid)
-{
- struct cpupool *c;
- int rc;
- int n_dom = 0;
-
- if ( poolid == CPUPOOLID_NONE )
- return 0;
- spin_lock(&cpupool_lock);
- c = cpupool_find_by_id(poolid);
- if ( c == NULL )
- rc = -ESRCH;
- else if ( !cpumask_weight(c->cpu_valid) )
- rc = -ENODEV;
- else
- {
- c->n_dom++;
- n_dom = c->n_dom;
- d->cpupool = c;
- rc = 0;
- }
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n",
- d->domain_id, poolid, n_dom, rc);
- return rc;
-}
-
-/*
- * remove a domain from a cpupool
- */
-void cpupool_rm_domain(struct domain *d)
-{
- int cpupool_id;
- int n_dom;
-
- if ( d->cpupool == NULL )
- return;
- spin_lock(&cpupool_lock);
- cpupool_id = d->cpupool->cpupool_id;
- d->cpupool->n_dom--;
- n_dom = d->cpupool->n_dom;
- d->cpupool = NULL;
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
- d->domain_id, cpupool_id, n_dom);
- return;
-}
-
-/*
- * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
- * as they must have been in there when unplugged.
- */
-static int cpupool_cpu_add(unsigned int cpu)
-{
- int ret = 0;
- const cpumask_t *cpus;
-
- spin_lock(&cpupool_lock);
- cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
- cpumask_set_cpu(cpu, &cpupool_free_cpus);
-
- /*
- * If we are not resuming, we are hot-plugging cpu, and in which case
- * we add it to pool0, as it certainly was there when hot-unplagged
- * (or unplugging would have failed) and that is the default behavior
- * anyway.
- */
- rcu_read_lock(&sched_res_rculock);
- get_sched_res(cpu)->cpupool = NULL;
-
- cpus = sched_get_opt_cpumask(cpupool0->gran, cpu);
- if ( cpumask_subset(cpus, &cpupool_free_cpus) )
- ret = cpupool_assign_cpu_locked(cpupool0, cpu);
-
- rcu_read_unlock(&sched_res_rculock);
-
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-/*
- * This function is called in stop_machine context, so we can be sure no
- * non-idle vcpu is active on the system.
- */
-static void cpupool_cpu_remove(unsigned int cpu)
-{
- int ret;
-
- ASSERT(is_idle_vcpu(current));
-
- if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
- {
- ret = cpupool_unassign_cpu_finish(cpupool0);
- BUG_ON(ret);
- }
- cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-}
-
-/*
- * Called before a CPU is being removed from the system.
- * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved
- * to free cpus actually before removing them).
- * The CPU is locked, to forbid adding it again to another cpupool.
- */
-static int cpupool_cpu_remove_prologue(unsigned int cpu)
-{
- int ret = 0;
- cpumask_t *cpus;
- unsigned int master_cpu;
-
- spin_lock(&cpupool_lock);
-
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
- master_cpu = sched_get_resource_cpu(cpu);
- if ( cpumask_intersects(cpus, &cpupool_locked_cpus) )
- ret = -EBUSY;
- else
- cpumask_set_cpu(cpu, &cpupool_locked_cpus);
- rcu_read_unlock(&sched_res_rculock);
-
- spin_unlock(&cpupool_lock);
-
- if ( ret )
- return ret;
-
- if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) )
- {
- /* Cpupool0 is populated only after all cpus are up. */
- ASSERT(system_state == SYS_STATE_active);
-
- ret = cpupool_unassign_cpu_start(cpupool0, master_cpu);
- }
- else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) )
- ret = -ENODEV;
-
- return ret;
-}
-
-/*
- * Called during resume for all cpus which didn't come up again. The cpu must
- * be removed from the cpupool it is assigned to. In case a cpupool will be
- * left without cpu we move all domains of that cpupool to cpupool0.
- * As we are called with all domains still frozen there is no need to take the
- * cpupool lock here.
- */
-static void cpupool_cpu_remove_forced(unsigned int cpu)
-{
- struct cpupool **c;
- int ret;
- unsigned int master_cpu = sched_get_resource_cpu(cpu);
-
- for_each_cpupool ( c )
- {
- if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) )
- {
- ret = cpupool_unassign_cpu_start(*c, master_cpu);
- BUG_ON(ret);
- ret = cpupool_unassign_cpu_finish(*c);
- BUG_ON(ret);
- }
- }
-
- cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-
- rcu_read_lock(&sched_res_rculock);
- sched_rm_cpu(cpu);
- rcu_read_unlock(&sched_res_rculock);
-}
-
-/*
- * do cpupool related sysctl operations
- */
-int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
-{
- int ret;
- struct cpupool *c;
-
- switch ( op->op )
- {
-
- case XEN_SYSCTL_CPUPOOL_OP_CREATE:
- {
- int poolid;
-
- poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
- CPUPOOLID_NONE: op->cpupool_id;
- c = cpupool_create(poolid, op->sched_id, &ret);
- if ( c != NULL )
- {
- op->cpupool_id = c->cpupool_id;
- cpupool_put(c);
- }
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
- {
- c = cpupool_get_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- ret = cpupool_destroy(c);
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_INFO:
- {
- c = cpupool_get_next_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- op->cpupool_id = c->cpupool_id;
- op->sched_id = c->sched->sched_id;
- op->n_dom = c->n_dom;
- ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
- {
- unsigned cpu;
- const cpumask_t *cpus;
-
- cpu = op->cpu;
- debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n",
- op->cpupool_id, cpu);
-
- spin_lock(&cpupool_lock);
-
- c = cpupool_find_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- goto addcpu_out;
- if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
- {
- for_each_cpu ( cpu, &cpupool_free_cpus )
- {
- cpus = sched_get_opt_cpumask(c->gran, cpu);
- if ( cpumask_subset(cpus, &cpupool_free_cpus) )
- break;
- }
- ret = -ENODEV;
- if ( cpu >= nr_cpu_ids )
- goto addcpu_out;
- }
- ret = -EINVAL;
- if ( cpu >= nr_cpu_ids )
- goto addcpu_out;
- ret = -ENODEV;
- cpus = sched_get_opt_cpumask(c->gran, cpu);
- if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
- cpumask_intersects(cpus, &cpupool_locked_cpus) )
- goto addcpu_out;
- ret = cpupool_assign_cpu_locked(c, cpu);
-
- addcpu_out:
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
- op->cpupool_id, cpu, ret);
-
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
- {
- unsigned cpu;
-
- c = cpupool_get_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- cpu = op->cpu;
- if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
- cpu = cpumask_last(c->cpu_valid);
- ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL;
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
- {
- struct domain *d;
-
- ret = rcu_lock_remote_domain_by_id(op->domid, &d);
- if ( ret )
- break;
- if ( d->cpupool == NULL )
- {
- ret = -EINVAL;
- rcu_unlock_domain(d);
- break;
- }
- if ( op->cpupool_id == d->cpupool->cpupool_id )
- {
- ret = 0;
- rcu_unlock_domain(d);
- break;
- }
- debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n",
- d->domain_id, op->cpupool_id);
- ret = -ENOENT;
- spin_lock(&cpupool_lock);
-
- c = cpupool_find_by_id(op->cpupool_id);
- if ( (c != NULL) && cpumask_weight(c->cpu_valid) )
- ret = cpupool_move_domain_locked(d, c);
-
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n",
- d->domain_id, op->cpupool_id, ret);
- rcu_unlock_domain(d);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
- {
- ret = cpumask_to_xenctl_bitmap(
- &op->cpumap, &cpupool_free_cpus);
- }
- break;
-
- default:
- ret = -ENOSYS;
- break;
- }
-
- return ret;
-}
-
-void dump_runq(unsigned char key)
-{
- unsigned long flags;
- s_time_t now = NOW();
- struct cpupool **c;
-
- spin_lock(&cpupool_lock);
- local_irq_save(flags);
-
- printk("sched_smt_power_savings: %s\n",
- sched_smt_power_savings? "enabled":"disabled");
- printk("NOW=%"PRI_stime"\n", now);
-
- printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map));
- if ( !cpumask_empty(&cpupool_free_cpus) )
- {
- printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus));
- schedule_dump(NULL);
- }
-
- for_each_cpupool(c)
- {
- printk("Cpupool %d:\n", (*c)->cpupool_id);
- printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid));
- schedule_dump(*c);
- }
-
- local_irq_restore(flags);
- spin_unlock(&cpupool_lock);
-}
-
-static int cpu_callback(
- struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int rc = 0;
-
- switch ( action )
- {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- if ( system_state <= SYS_STATE_active )
- rc = cpupool_cpu_add(cpu);
- break;
- case CPU_DOWN_PREPARE:
- /* Suspend/Resume don't change assignments of cpus to cpupools. */
- if ( system_state <= SYS_STATE_active )
- rc = cpupool_cpu_remove_prologue(cpu);
- break;
- case CPU_DYING:
- /* Suspend/Resume don't change assignments of cpus to cpupools. */
- if ( system_state <= SYS_STATE_active )
- cpupool_cpu_remove(cpu);
- break;
- case CPU_RESUME_FAILED:
- cpupool_cpu_remove_forced(cpu);
- break;
- default:
- break;
- }
-
- return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
-static int __init cpupool_init(void)
-{
- unsigned int cpu;
- int err;
-
- cpupool_gran_init();
-
- cpupool0 = cpupool_create(0, 0, &err);
- BUG_ON(cpupool0 == NULL);
- cpupool_put(cpupool0);
- register_cpu_notifier(&cpu_nfb);
-
- spin_lock(&cpupool_lock);
-
- cpumask_copy(&cpupool_free_cpus, &cpu_online_map);
-
- for_each_cpu ( cpu, &cpupool_free_cpus )
- cpupool_assign_cpu_locked(cpupool0, cpu);
-
- spin_unlock(&cpupool_lock);
-
- return 0;
-}
-__initcall(cpupool_init);
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
--- /dev/null
+++ b/xen/common/sched/Kconfig
+menu "Schedulers"
+ visible if EXPERT = "y"
+
+config SCHED_CREDIT
+ bool "Credit scheduler support"
+ default y
+ ---help---
+ The traditional credit scheduler is a general purpose scheduler.
+
+config SCHED_CREDIT2
+ bool "Credit2 scheduler support"
+ default y
+ ---help---
+ The credit2 scheduler is a general purpose scheduler that is
+ optimized for lower latency and higher VM density.
+
+config SCHED_RTDS
+ bool "RTDS scheduler support (EXPERIMENTAL)"
+ default y
+ ---help---
+ The RTDS scheduler is a soft and firm real-time scheduler for
+ multicore, targeted for embedded, automotive, graphics and gaming
+ in the cloud, and general low-latency workloads.
+
+config SCHED_ARINC653
+ bool "ARINC653 scheduler support (EXPERIMENTAL)"
+ default DEBUG
+ ---help---
+ The ARINC653 scheduler is a hard real-time scheduler for single
+ cores, targeted for avionics, drones, and medical devices.
+
+config SCHED_NULL
+ bool "Null scheduler support (EXPERIMENTAL)"
+ default y
+ ---help---
+	  The null scheduler is a static, zero overhead scheduler for cases
+	  where there are always fewer vCPUs than pCPUs, typically in
+	  embedded or HPC scenarios.
+
+choice
+ prompt "Default Scheduler?"
+ default SCHED_CREDIT2_DEFAULT
+
+ config SCHED_CREDIT_DEFAULT
+ bool "Credit Scheduler" if SCHED_CREDIT
+ config SCHED_CREDIT2_DEFAULT
+ bool "Credit2 Scheduler" if SCHED_CREDIT2
+ config SCHED_RTDS_DEFAULT
+ bool "RT Scheduler" if SCHED_RTDS
+ config SCHED_ARINC653_DEFAULT
+ bool "ARINC653 Scheduler" if SCHED_ARINC653
+ config SCHED_NULL_DEFAULT
+ bool "Null Scheduler" if SCHED_NULL
+endchoice
+
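+# SCHED_DEFAULT is the scheduler used when no "sched=" command line
+# option is given; it is derived from the choice above.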
+config SCHED_DEFAULT
+ string
+ default "credit" if SCHED_CREDIT_DEFAULT
+ default "credit2" if SCHED_CREDIT2_DEFAULT
+ default "rtds" if SCHED_RTDS_DEFAULT
+ default "arinc653" if SCHED_ARINC653_DEFAULT
+ default "null" if SCHED_NULL_DEFAULT
+ default "credit2"
+
+endmenu
--- /dev/null
+++ b/xen/common/sched/Makefile
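+# cpupool handling and the scheduler core are always built; the
+# individual schedulers are selected via Kconfig in this directory.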
+obj-y += cpupool.o
+obj-$(CONFIG_SCHED_ARINC653) += arinc653.o
+obj-$(CONFIG_SCHED_CREDIT) += credit.o
+obj-$(CONFIG_SCHED_CREDIT2) += credit2.o
+obj-$(CONFIG_SCHED_RTDS) += rt.o
+obj-$(CONFIG_SCHED_NULL) += null.o
+obj-y += core.o
--- /dev/null
+++ b/xen/common/sched/arinc653.c
+/******************************************************************************
+ * arinc653.c
+ *
+ * An ARINC653-compatible scheduling algorithm for use in Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2010, DornerWorks, Ltd. <DornerWorks.com>
+ */
+
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/timer.h>
+#include <xen/softirq.h>
+#include <xen/time.h>
+#include <xen/errno.h>
+#include <xen/list.h>
+#include <xen/guest_access.h>
+#include <public/sysctl.h>
+
+/**************************************************************************
+ * Private Macros *
+ **************************************************************************/
+
+/**
+ * Default timeslice for domain 0.
+ */
+#define DEFAULT_TIMESLICE MILLISECS(10)
+
+/**
+ * Retrieve the idle UNIT for a given physical CPU
+ */
+#define IDLETASK(cpu) (sched_idle_unit(cpu))
+
+/**
+ * Return a pointer to the ARINC 653-specific scheduler data information
+ * associated with the given UNIT (unit)
+ */
+#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv)
+
+/**
+ * Return the global scheduler private data given the scheduler ops pointer
+ */
+#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data))
+
+/**************************************************************************
+ * Private Type Definitions *
+ **************************************************************************/
+
+/**
+ * The arinc653_unit_t structure holds ARINC 653-scheduler-specific
+ * information for all non-idle UNITs
+ */
+typedef struct arinc653_unit_s
+{
+ /* unit points to Xen's struct sched_unit so we can get to it from an
+ * arinc653_unit_t pointer. */
+ struct sched_unit * unit;
+ /* awake holds whether the UNIT has been woken with vcpu_wake() */
+ bool_t awake;
+ /* list holds the linked list information for the list this UNIT
+ * is stored in */
+ struct list_head list;
+} arinc653_unit_t;
+
+/**
+ * The sched_entry_t structure holds a single entry of the
+ * ARINC 653 schedule.
+ */
+typedef struct sched_entry_s
+{
+ /* dom_handle holds the handle ("UUID") for the domain that this
+ * schedule entry refers to. */
+ xen_domain_handle_t dom_handle;
+ /* unit_id holds the UNIT number for the UNIT that this schedule
+ * entry refers to. */
+ int unit_id;
+ /* runtime holds the number of nanoseconds that the UNIT for this
+ * schedule entry should be allowed to run per major frame. */
+ s_time_t runtime;
+ /* unit holds a pointer to the Xen sched_unit structure */
+ struct sched_unit * unit;
+} sched_entry_t;
+
+/**
+ * This structure defines data that is global to an instance of the scheduler
+ */
+typedef struct a653sched_priv_s
+{
+ /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
+ spinlock_t lock;
+
+ /**
+ * This array holds the active ARINC 653 schedule.
+ *
+ * When the system tries to start a new UNIT, this schedule is scanned
+ * to look for a matching (handle, UNIT #) pair. If both the handle (UUID)
+ * and UNIT number match, then the UNIT is allowed to run. Its run time
+ * (per major frame) is given in the third entry of the schedule.
+ */
+ sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
+
+ /**
+ * This variable holds the number of entries that are valid in
+ * the arinc653_schedule table.
+ *
+ * This is not necessarily the same as the number of domains in the
+ * schedule. A domain could be listed multiple times within the schedule,
+ * or a domain with multiple UNITs could have a different
+ * schedule entry for each UNIT.
+ */
+ unsigned int num_schedule_entries;
+
+ /**
+ * the major frame time for the ARINC 653 schedule.
+ */
+ s_time_t major_frame;
+
+ /**
+ * the time that the next major frame starts
+ */
+ s_time_t next_major_frame;
+
+ /**
+ * pointers to all Xen UNIT structures for iterating through
+ */
+ struct list_head unit_list;
+} a653sched_priv_t;
+
+/**************************************************************************
+ * Helper functions *
+ **************************************************************************/
+
+/**
+ * This function compares two domain handles.
+ *
+ * @param h1 Pointer to handle 1
+ * @param h2 Pointer to handle 2
+ *
+ * @return <ul>
+ * <li> <0: handle 1 is less than handle 2
+ * <li> 0: handle 1 is equal to handle 2
+ * <li> >0: handle 1 is greater than handle 2
+ * </ul>
+ */
+static int dom_handle_cmp(const xen_domain_handle_t h1,
+ const xen_domain_handle_t h2)
+{
+ return memcmp(h1, h2, sizeof(xen_domain_handle_t));
+}
+
+/**
+ * This function searches the unit list to find a UNIT that matches
+ * the domain handle and UNIT ID specified.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param handle Pointer to handler
+ * @param unit_id UNIT ID
+ *
+ * @return <ul>
+ * <li> Pointer to the matching UNIT if one is found
+ * <li> NULL otherwise
+ * </ul>
+ */
+static struct sched_unit *find_unit(
+ const struct scheduler *ops,
+ xen_domain_handle_t handle,
+ int unit_id)
+{
+ arinc653_unit_t *aunit;
+
+ /* loop through the unit_list looking for the specified UNIT */
+ list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list )
+ if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0)
+ && (unit_id == aunit->unit->unit_id) )
+ return aunit->unit;
+
+ return NULL;
+}
+
+/**
+ * This function updates the pointer to the Xen UNIT structure for each entry
+ * in the ARINC 653 schedule.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @return <None>
+ */
+static void update_schedule_units(const struct scheduler *ops)
+{
+ unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries;
+
+ for ( i = 0; i < n_entries; i++ )
+ SCHED_PRIV(ops)->schedule[i].unit =
+ find_unit(ops,
+ SCHED_PRIV(ops)->schedule[i].dom_handle,
+ SCHED_PRIV(ops)->schedule[i].unit_id);
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to put
+ * in place a new ARINC653 schedule.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ *
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+arinc653_sched_set(
+ const struct scheduler *ops,
+ struct xen_sysctl_arinc653_schedule *schedule)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ s_time_t total_runtime = 0;
+ unsigned int i;
+ unsigned long flags;
+ int rc = -EINVAL;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ /* Check for valid major frame and number of schedule entries. */
+ if ( (schedule->major_frame <= 0)
+ || (schedule->num_sched_entries < 1)
+ || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) )
+ goto fail;
+
+ for ( i = 0; i < schedule->num_sched_entries; i++ )
+ {
+ /* Check for a valid run time. */
+ if ( schedule->sched_entries[i].runtime <= 0 )
+ goto fail;
+
+ /* Add this entry's run time to total run time. */
+ total_runtime += schedule->sched_entries[i].runtime;
+ }
+
+ /*
+ * Error if the major frame is not large enough to run all entries as
+ * indicated by comparing the total run time to the major frame length.
+ */
+ if ( total_runtime > schedule->major_frame )
+ goto fail;
+
+ /* Copy the new schedule into place. */
+ sched_priv->num_schedule_entries = schedule->num_sched_entries;
+ sched_priv->major_frame = schedule->major_frame;
+ for ( i = 0; i < schedule->num_sched_entries; i++ )
+ {
+ memcpy(sched_priv->schedule[i].dom_handle,
+ schedule->sched_entries[i].dom_handle,
+ sizeof(sched_priv->schedule[i].dom_handle));
+ sched_priv->schedule[i].unit_id =
+ schedule->sched_entries[i].vcpu_id;
+ sched_priv->schedule[i].runtime =
+ schedule->sched_entries[i].runtime;
+ }
+ update_schedule_units(ops);
+
+ /*
+ * The newly-installed schedule takes effect immediately. We do not even
+ * wait for the current major frame to expire.
+ *
+ * Signal a new major frame to begin. The next major frame is set up by
+ * the do_schedule callback function when it is next invoked.
+ */
+ sched_priv->next_major_frame = NOW();
+
+ rc = 0;
+
+ fail:
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+ return rc;
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to read the
+ * current ARINC 653 schedule
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+arinc653_sched_get(
+ const struct scheduler *ops,
+ struct xen_sysctl_arinc653_schedule *schedule)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ unsigned int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ schedule->num_sched_entries = sched_priv->num_schedule_entries;
+ schedule->major_frame = sched_priv->major_frame;
+ for ( i = 0; i < sched_priv->num_schedule_entries; i++ )
+ {
+ memcpy(schedule->sched_entries[i].dom_handle,
+ sched_priv->schedule[i].dom_handle,
+ sizeof(sched_priv->schedule[i].dom_handle));
+ schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id;
+ schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime;
+ }
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ return 0;
+}
+
+/**************************************************************************
+ * Scheduler callback functions *
+ **************************************************************************/
+
+/**
+ * This function performs initialization for an instance of the scheduler.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ *
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+a653sched_init(struct scheduler *ops)
+{
+ a653sched_priv_t *prv;
+
+ prv = xzalloc(a653sched_priv_t);
+ if ( prv == NULL )
+ return -ENOMEM;
+
+ ops->sched_data = prv;
+
+ prv->next_major_frame = 0;
+ spin_lock_init(&prv->lock);
+ INIT_LIST_HEAD(&prv->unit_list);
+
+ return 0;
+}
+
+/**
+ * This function performs deinitialization for an instance of the scheduler
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_deinit(struct scheduler *ops)
+{
+ xfree(SCHED_PRIV(ops));
+ ops->sched_data = NULL;
+}
+
+/**
+ * This function allocates scheduler-specific data for a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ *
+ * @return Pointer to the allocated data
+ */
+static void *
+a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ arinc653_unit_t *svc;
+ unsigned int entry;
+ unsigned long flags;
+
+ /*
+ * Allocate memory for the ARINC 653-specific scheduler data information
+ * associated with the given UNIT (unit).
+ */
+ svc = xmalloc(arinc653_unit_t);
+ if ( svc == NULL )
+ return NULL;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ /*
+ * Add every one of dom0's units to the schedule, as long as there are
+ * slots available.
+ */
+ if ( unit->domain->domain_id == 0 )
+ {
+ entry = sched_priv->num_schedule_entries;
+
+ if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE )
+ {
+ sched_priv->schedule[entry].dom_handle[0] = '\0';
+ sched_priv->schedule[entry].unit_id = unit->unit_id;
+ sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE;
+ sched_priv->schedule[entry].unit = unit;
+
+ sched_priv->major_frame += DEFAULT_TIMESLICE;
+ ++sched_priv->num_schedule_entries;
+ }
+ }
+
+ /*
+ * Initialize our ARINC 653 scheduler-specific information for the UNIT.
+ * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it
+ * will call the vcpu_wake scheduler callback function and our scheduler
+ * will mark the UNIT awake.
+ */
+ svc->unit = unit;
+ svc->awake = 0;
+ if ( !is_idle_unit(unit) )
+ list_add(&svc->list, &SCHED_PRIV(ops)->unit_list);
+ update_schedule_units(ops);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ return svc;
+}
+
+/**
+ * This function frees scheduler-specific UNIT data
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_free_udata(const struct scheduler *ops, void *priv)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ arinc653_unit_t *av = priv;
+ unsigned long flags;
+
+    if ( av == NULL )
+ return;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ if ( !is_idle_unit(av->unit) )
+ list_del(&av->list);
+
+ xfree(av);
+ update_schedule_units(ops);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+}
+
+/**
+ * Xen scheduler callback function to sleep a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+ if ( AUNIT(unit) != NULL )
+ AUNIT(unit)->awake = 0;
+
+ /*
+ * If the UNIT being put to sleep is the same one that is currently
+ * running, raise a softirq to invoke the scheduler to switch domains.
+ */
+ if ( get_sched_res(sched_unit_master(unit))->curr == unit )
+ cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to wake up a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+ if ( AUNIT(unit) != NULL )
+ AUNIT(unit)->awake = 1;
+
+ cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to select a UNIT to run.
+ * This is the main scheduler routine.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param now Current time
+ */
+static void
+a653sched_do_schedule(
+ const struct scheduler *ops,
+ struct sched_unit *prev,
+ s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ struct sched_unit *new_task = NULL;
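+    /*
+     * NB: the position within the current major frame is kept in static
+     * variables; the ARINC 653 scheduler supports a single pCPU only
+     * (see the Kconfig help text).
+     */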
+ static unsigned int sched_index = 0;
+ static s_time_t next_switch_time;
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ const unsigned int cpu = sched_get_resource_cpu(smp_processor_id());
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ if ( sched_priv->num_schedule_entries < 1 )
+ sched_priv->next_major_frame = now + DEFAULT_TIMESLICE;
+ else if ( now >= sched_priv->next_major_frame )
+ {
+ /* time to enter a new major frame
+ * the first time this function is called, this will be true */
+ /* start with the first domain in the schedule */
+ sched_index = 0;
+ sched_priv->next_major_frame = now + sched_priv->major_frame;
+ next_switch_time = now + sched_priv->schedule[0].runtime;
+ }
+ else
+ {
+ while ( (now >= next_switch_time)
+ && (sched_index < sched_priv->num_schedule_entries) )
+ {
+ /* time to switch to the next domain in this major frame */
+ sched_index++;
+ next_switch_time += sched_priv->schedule[sched_index].runtime;
+ }
+ }
+
+ /*
+ * If we exhausted the domains in the schedule and still have time left
+ * in the major frame then switch next at the next major frame.
+ */
+ if ( sched_index >= sched_priv->num_schedule_entries )
+ next_switch_time = sched_priv->next_major_frame;
+
+ /*
+ * If there are more domains to run in the current major frame, set
+ * new_task equal to the address of next domain's sched_unit structure.
+ * Otherwise, set new_task equal to the address of the idle task's
+ * sched_unit structure.
+ */
+ new_task = (sched_index < sched_priv->num_schedule_entries)
+ ? sched_priv->schedule[sched_index].unit
+ : IDLETASK(cpu);
+
+ /* Check to see if the new task can be run (awake & runnable). */
+ if ( !((new_task != NULL)
+ && (AUNIT(new_task) != NULL)
+ && AUNIT(new_task)->awake
+ && unit_runnable_state(new_task)) )
+ new_task = IDLETASK(cpu);
+ BUG_ON(new_task == NULL);
+
+ /*
+ * Check to make sure we did not miss a major frame.
+ * This is a good test for robust partitioning.
+ */
+ BUG_ON(now >= sched_priv->next_major_frame);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ /* Tasklet work (which runs in idle UNIT context) overrides all else. */
+ if ( tasklet_work_scheduled )
+ new_task = IDLETASK(cpu);
+
+ /* Running this task would result in a migration */
+ if ( !is_idle_unit(new_task)
+ && (sched_unit_master(new_task) != cpu) )
+ new_task = IDLETASK(cpu);
+
+ /*
+ * Return the amount of time the next domain has to run and the address
+ * of the selected task's UNIT structure.
+ */
+ prev->next_time = next_switch_time - now;
+ prev->next_task = new_task;
+ new_task->migrated = false;
+
+ BUG_ON(prev->next_time <= 0);
+}
+
+/**
+ * Xen scheduler callback function to select a resource for the UNIT to run on
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ *
+ * @return Scheduler resource to run on
+ */
+static struct sched_resource *
+a653sched_pick_resource(const struct scheduler *ops,
+ const struct sched_unit *unit)
+{
+ cpumask_t *online;
+ unsigned int cpu;
+
+ /*
+ * If present, prefer unit's current processor, else
+     * just pick the first valid pCPU.
+ */
+ online = cpupool_domain_master_cpumask(unit->domain);
+
+ cpu = cpumask_first(online);
+
+ if ( cpumask_test_cpu(sched_unit_master(unit), online)
+ || (cpu >= nr_cpu_ids) )
+ cpu = sched_unit_master(unit);
+
+ return get_sched_res(cpu);
+}
+
+/**
+ * Xen scheduler callback to change the scheduler of a cpu
+ *
+ * @param new_ops Pointer to this instance of the scheduler structure
+ * @param cpu The cpu that is changing scheduler
+ * @param pdata scheduler specific PCPU data (we don't have any)
+ * @param vdata scheduler specific UNIT data of the idle unit
+ */
+static spinlock_t *
+a653_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ arinc653_unit_t *svc = vdata;
+
+ ASSERT(!pdata && svc && is_idle_unit(svc->unit));
+
+ sched_idle_unit(cpu)->priv = vdata;
+
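+    /* No private per-cpu lock: keep using the resource's default lock. */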
+ return &sr->_lock;
+}
+
+/**
+ * Xen scheduler callback function to perform a global (not domain-specific)
+ * adjustment. It is used by the ARINC 653 scheduler to put in place a new
+ * ARINC 653 schedule or to retrieve the schedule currently in place.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param sc Pointer to the scheduler operation specified by Domain 0
+ */
+static int
+a653sched_adjust_global(const struct scheduler *ops,
+ struct xen_sysctl_scheduler_op *sc)
+{
+ struct xen_sysctl_arinc653_schedule local_sched;
+ int rc = -EINVAL;
+
+ switch ( sc->cmd )
+ {
+ case XEN_SYSCTL_SCHEDOP_putinfo:
+ if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ rc = arinc653_sched_set(ops, &local_sched);
+ break;
+ case XEN_SYSCTL_SCHEDOP_getinfo:
+ memset(&local_sched, -1, sizeof(local_sched));
+ rc = arinc653_sched_get(ops, &local_sched);
+ if ( rc )
+ break;
+
+ if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) )
+ rc = -EFAULT;
+ break;
+ }
+
+ return rc;
+}
+
+/**
+ * This structure defines our scheduler for Xen.
+ * The entries tell Xen where to find our scheduler-specific
+ * callback functions.
+ * The symbol must be visible to the rest of Xen at link time.
+ */
+static const struct scheduler sched_arinc653_def = {
+ .name = "ARINC 653 Scheduler",
+ .opt_name = "arinc653",
+ .sched_id = XEN_SCHEDULER_ARINC653,
+ .sched_data = NULL,
+
+ .init = a653sched_init,
+ .deinit = a653sched_deinit,
+
+ .free_udata = a653sched_free_udata,
+ .alloc_udata = a653sched_alloc_udata,
+
+ .insert_unit = NULL,
+ .remove_unit = NULL,
+
+ .sleep = a653sched_unit_sleep,
+ .wake = a653sched_unit_wake,
+ .yield = NULL,
+ .context_saved = NULL,
+
+ .do_schedule = a653sched_do_schedule,
+
+ .pick_resource = a653sched_pick_resource,
+
+ .switch_sched = a653_switch_sched,
+
+ .adjust = NULL,
+ .adjust_global = a653sched_adjust_global,
+
+ .dump_settings = NULL,
+ .dump_cpu_state = NULL,
+};
+
+REGISTER_SCHEDULER(sched_arinc653_def);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ b/xen/common/sched/compat.c
+/****************************************************************************
+ * compat.c
+ *
+ * 32-bit compat wrappers for the scheduling hypercalls.
+ */
+
+#include <compat/sched.h>
+
+#define COMPAT
+#define ret_t int
+
+#define do_sched_op compat_sched_op
+
+#define xen_sched_pin_override sched_pin_override
+CHECK_sched_pin_override;
+#undef xen_sched_pin_override
+
+#define xen_sched_shutdown sched_shutdown
+CHECK_sched_shutdown;
+#undef xen_sched_shutdown
+
+#define xen_sched_remote_shutdown sched_remote_shutdown
+CHECK_sched_remote_shutdown;
+#undef xen_sched_remote_shutdown
+
+static int compat_poll(struct compat_sched_poll *compat)
+{
+ struct sched_poll native;
+
+#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
+ guest_from_compat_handle((_d_)->ports, (_s_)->ports)
+ XLAT_sched_poll(&native, compat);
+#undef XLAT_sched_poll_HNDL_ports
+
+ return do_poll(&native);
+}
+
+#define do_poll compat_poll
+#define sched_poll compat_sched_poll
+
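+/*
+ * Including core.c here, with COMPAT and the redefinitions above in
+ * effect, compiles the scheduling hypercall bodies a second time
+ * against the 32-bit compat structures.
+ */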
+#include "core.c"
+
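+/* Reassemble the 64-bit deadline from the two 32-bit hypercall arguments. */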
+int compat_set_timer_op(u32 lo, s32 hi)
+{
+ return do_set_timer_op(((s64)hi << 32) | lo);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ b/xen/common/sched/core.c
+/****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: common/sched/core.c
+ * Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
+ *
+ * Description: Generic CPU scheduling code
+ * implements support functionality for the Xen scheduler API.
+ *
+ */
+
+#ifndef COMPAT
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <xen/mm.h>
+#include <xen/err.h>
+#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <xen/multicall.h>
+#include <xen/cpu.h>
+#include <xen/preempt.h>
+#include <xen/event.h>
+#include <public/sched.h>
+#include <xsm/xsm.h>
+#include <xen/err.h>
+
+#ifdef CONFIG_XEN_GUEST
+#include <asm/guest.h>
+#else
+#define pv_shim false
+#endif
+
+/* opt_sched: scheduler - default to configured value */
+static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
+string_param("sched", opt_sched);
+
+/*
+ * If sched_smt_power_savings is set, the scheduler will give preference
+ * to a partially idle package over a fully idle one when picking a pCPU
+ * to schedule a vCPU on.
+ */
+bool_t sched_smt_power_savings = 0;
+boolean_param("sched_smt_power_savings", sched_smt_power_savings);
+
+/*
+ * Default scheduling rate limit: 1ms.
+ * The behavior when sched_ratelimit_us is greater than
+ * sched_credit_tslice_ms is undefined.
+ */
+int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
+integer_param("sched_ratelimit_us", sched_ratelimit_us);
+
+/*
+ * Set when the scheduling granularity spans SMT siblings; switching SMT
+ * on or off is not allowed in that case.
+ */
+bool __read_mostly sched_disable_smt_switching;
+cpumask_t sched_res_mask;
+
+/* Common lock for free cpus. */
+static DEFINE_SPINLOCK(sched_free_cpu_lock);
+
+/* Various timer handlers. */
+static void s_timer_fn(void *unused);
+static void vcpu_periodic_timer_fn(void *data);
+static void vcpu_singleshot_timer_fn(void *data);
+static void poll_timer_fn(void *data);
+
+/* This is global for now so that private implementations can reach it */
+DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
+DEFINE_RCU_READ_LOCK(sched_res_rculock);
+
+/* Scratch space for cpumasks. */
+DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
+
+/* How many urgent vcpus. */
+DEFINE_PER_CPU(atomic_t, sched_urgent_count);
+
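+/*
+ * Array of all built-in schedulers, collected at link time from the
+ * entries emitted by REGISTER_SCHEDULER() in the individual scheduler
+ * sources.
+ */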
+extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
+#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
+#define schedulers __start_schedulers_array
+
+static struct scheduler __read_mostly ops;
+
+static bool scheduler_active;
+
+static void sched_set_affinity(
+ struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
+
+static struct sched_resource *
+sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ return unit->res;
+}
+
+static void *
+sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ /* Any non-NULL pointer is fine here. */
+ return ZERO_BLOCK_PTR;
+}
+
+static void
+sched_idle_free_udata(const struct scheduler *ops, void *priv)
+{
+}
+
+static void sched_idle_schedule(
+ const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ const unsigned int cpu = smp_processor_id();
+
+ unit->next_time = -1;
+ unit->next_task = sched_idle_unit(cpu);
+}
+
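+/*
+ * Dummy scheduler which only ever runs the idle unit; used for pCPUs
+ * which are not assigned to any cpupool.
+ */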
+static struct scheduler sched_idle_ops = {
+ .name = "Idle Scheduler",
+ .opt_name = "idle",
+ .sched_data = NULL,
+
+ .pick_resource = sched_idle_res_pick,
+ .do_schedule = sched_idle_schedule,
+
+ .alloc_udata = sched_idle_alloc_udata,
+ .free_udata = sched_idle_free_udata,
+};
+
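+/*
+ * Map a scheduling unit and a pCPU to the vCPU of that unit which runs
+ * on the given pCPU (a unit spans multiple vCPUs when the scheduling
+ * granularity is core or socket).
+ */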
+static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
+ unsigned int cpu)
+{
+ unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
+ const struct domain *d = unit->domain;
+
+ return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
+}
+
+static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
+ unsigned int cpu)
+{
+ struct vcpu *v = unit2vcpu_cpu(unit, cpu);
+
+ return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
+}
+
+static inline struct scheduler *dom_scheduler(const struct domain *d)
+{
+ if ( likely(d->cpupool != NULL) )
+ return d->cpupool->sched;
+
+ /*
+ * If d->cpupool is NULL, this is the idle domain. This is special
+ * because the idle domain does not really belong to any cpupool, and,
+ * hence, does not really have a scheduler.
+ *
+ * This is (should be!) only called like this for allocating the idle
+ * vCPUs for the first time, during boot, in which case what we want
+ * is the default scheduler that has been chosen at boot.
+ */
+ ASSERT(is_idle_domain(d));
+ return &ops;
+}
+
+static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
+{
+ struct domain *d = unit->domain;
+
+ if ( likely(d->cpupool != NULL) )
+ return d->cpupool->sched;
+
+ /*
+ * If d->cpupool is NULL, this is a unit of the idle domain. And this
+ * case is special because the idle domain does not really belong to
+     * a cpupool and, hence, doesn't really have a scheduler. In fact, its
+ * units (may) run on pCPUs which are in different pools, with different
+ * schedulers.
+ *
+ * What we want, in this case, is the scheduler of the pCPU where this
+ * particular idle unit is running. And, since unit->res never changes
+ * for idle units, it is safe to use it, with no locks, to figure that out.
+ */
+
+ ASSERT(is_idle_domain(d));
+ return unit->res->scheduler;
+}
+
+static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
+{
+ return unit_scheduler(v->sched_unit);
+}
+#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
+
+static inline void trace_runstate_change(struct vcpu *v, int new_state)
+{
+ struct { uint32_t vcpu:16, domain:16; } d;
+ uint32_t event;
+
+ if ( likely(!tb_init_done) )
+ return;
+
+ d.vcpu = v->vcpu_id;
+ d.domain = v->domain->domain_id;
+
+ event = TRC_SCHED_RUNSTATE_CHANGE;
+ event |= ( v->runstate.state & 0x3 ) << 8;
+ event |= ( new_state & 0x3 ) << 4;
+
+ __trace_var(event, 1/*tsc*/, sizeof(d), &d);
+}
+
+static inline void trace_continue_running(struct vcpu *v)
+{
+ struct { uint32_t vcpu:16, domain:16; } d;
+
+ if ( likely(!tb_init_done) )
+ return;
+
+ d.vcpu = v->vcpu_id;
+ d.domain = v->domain->domain_id;
+
+ __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
+}
+
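+/*
+ * A vcpu is "urgent" while it is blocked polling an event channel (i.e. its
+ * bit is set in the domain's poll_mask). Keep the per-cpu sched_urgent_count
+ * of its processor up to date here, so that e.g. deep sleep states can be
+ * avoided while such a vcpu is waiting (cf. the comment in
+ * sched_guest_idle()).
+ */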
+static inline void vcpu_urgent_count_update(struct vcpu *v)
+{
+ if ( is_idle_vcpu(v) )
+ return;
+
+ if ( unlikely(v->is_urgent) )
+ {
+ if ( !(v->pause_flags & VPF_blocked) ||
+ !test_bit(v->vcpu_id, v->domain->poll_mask) )
+ {
+ v->is_urgent = 0;
+ atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+ }
+ }
+ else
+ {
+ if ( unlikely(v->pause_flags & VPF_blocked) &&
+ unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
+ {
+ v->is_urgent = 1;
+ atomic_inc(&per_cpu(sched_urgent_count, v->processor));
+ }
+ }
+}
+
+static inline void vcpu_runstate_change(
+ struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+ s_time_t delta;
+ struct sched_unit *unit = v->sched_unit;
+
+ ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+ if ( v->runstate.state == new_state )
+ return;
+
+ vcpu_urgent_count_update(v);
+
+ trace_runstate_change(v, new_state);
+
+ if ( !is_idle_vcpu(v) )
+ {
+ unit->runstate_cnt[v->runstate.state]--;
+ unit->runstate_cnt[new_state]++;
+ }
+
+ delta = new_entry_time - v->runstate.state_entry_time;
+ if ( delta > 0 )
+ {
+ v->runstate.time[v->runstate.state] += delta;
+ v->runstate.state_entry_time = new_entry_time;
+ }
+
+ v->runstate.state = new_state;
+}
+
+void sched_guest_idle(void (*idle) (void), unsigned int cpu)
+{
+ /*
+ * Another vcpu of the unit is active in guest context while this one is
+ * idle. In case of a scheduling event we don't want to have high latencies
+ * due to a cpu needing to wake up from deep C state for joining the
+ * rendezvous, so avoid those deep C states by incrementing the urgent
+ * count of the cpu.
+ */
+ atomic_inc(&per_cpu(sched_urgent_count, cpu));
+ idle();
+ atomic_dec(&per_cpu(sched_urgent_count, cpu));
+}
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+ spinlock_t *lock;
+ s_time_t delta;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit);
+ memcpy(runstate, &v->runstate, sizeof(*runstate));
+ delta = NOW() - runstate->state_entry_time;
+ if ( delta > 0 )
+ runstate->time[runstate->state] += delta;
+
+ if ( unlikely(lock != NULL) )
+ unit_schedule_unlock_irq(lock, v->sched_unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+uint64_t get_cpu_idle_time(unsigned int cpu)
+{
+ struct vcpu_runstate_info state = { 0 };
+ struct vcpu *v = idle_vcpu[cpu];
+
+ if ( cpu_online(cpu) && v )
+ vcpu_runstate_get(v, &state);
+
+ return state.time[RUNSTATE_running];
+}
+
+/*
+ * If locks are different, take the one with the lower address first.
+ * This avoids dead- or live-locks when this code is running on both
+ * cpus at the same time.
+ */
+static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long *flags)
+{
+ if ( lock1 == lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ }
+ else if ( lock1 < lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ spin_lock(lock2);
+ }
+ else
+ {
+ spin_lock_irqsave(lock2, *flags);
+ spin_lock(lock1);
+ }
+}
+
+static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long flags)
+{
+ if ( lock1 != lock2 )
+ spin_unlock(lock2);
+ spin_unlock_irqrestore(lock1, flags);
+}
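+/*
+ * Sketch of the intended use of the two helpers above (see e.g.
+ * sched_unit_migrate_finish() below):
+ *
+ *     sched_spin_lock_double(lock_a, lock_b, &flags);
+ *     ... access data protected by both locks ...
+ *     sched_spin_unlock_double(lock_a, lock_b, flags);
+ */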
+
+static void sched_free_unit_mem(struct sched_unit *unit)
+{
+ struct sched_unit *prev_unit;
+ struct domain *d = unit->domain;
+
+ if ( d->sched_unit_list == unit )
+ d->sched_unit_list = unit->next_in_list;
+ else
+ {
+ for_each_sched_unit ( d, prev_unit )
+ {
+ if ( prev_unit->next_in_list == unit )
+ {
+ prev_unit->next_in_list = unit->next_in_list;
+ break;
+ }
+ }
+ }
+
+ free_cpumask_var(unit->cpu_hard_affinity);
+ free_cpumask_var(unit->cpu_hard_affinity_saved);
+ free_cpumask_var(unit->cpu_soft_affinity);
+
+ xfree(unit);
+}
+
+static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
+{
+ struct vcpu *vunit;
+ unsigned int cnt = 0;
+
+    /* Don't count the vcpu being released; it may not be in the list yet. */
+ for_each_sched_unit_vcpu ( unit, vunit )
+ if ( vunit != v )
+ cnt++;
+
+ v->sched_unit = NULL;
+ unit->runstate_cnt[v->runstate.state]--;
+
+ if ( unit->vcpu_list == v )
+ unit->vcpu_list = v->next_in_list;
+
+ if ( !cnt )
+ sched_free_unit_mem(unit);
+}
+
+static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
+{
+ v->sched_unit = unit;
+
+ /* All but idle vcpus are allocated with sequential vcpu_id. */
+ if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
+ {
+ unit->vcpu_list = v;
+ /*
+ * unit_id is always the same as lowest vcpu_id of unit.
+         * This is used for stopping the for_each_sched_unit_vcpu() loop and
+ * order to support cpupools with different granularities.
+ */
+ unit->unit_id = v->vcpu_id;
+ }
+ unit->runstate_cnt[v->runstate.state]++;
+}
+
+static struct sched_unit *sched_alloc_unit_mem(void)
+{
+ struct sched_unit *unit;
+
+ unit = xzalloc(struct sched_unit);
+ if ( !unit )
+ return NULL;
+
+ if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
+ !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
+ !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
+ {
+ sched_free_unit_mem(unit);
+ unit = NULL;
+ }
+
+ return unit;
+}
+
+static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
+{
+ struct sched_unit **prev_unit;
+
+ unit->domain = d;
+
+ for ( prev_unit = &d->sched_unit_list; *prev_unit;
+ prev_unit = &(*prev_unit)->next_in_list )
+ if ( (*prev_unit)->next_in_list &&
+ (*prev_unit)->next_in_list->unit_id > unit->unit_id )
+ break;
+
+ unit->next_in_list = *prev_unit;
+ *prev_unit = unit;
+}
+
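+/*
+ * Illustrative example: with a scheduling granularity of 2, vcpus 0 and 1 of
+ * a domain share the unit with unit_id 0, vcpus 2 and 3 the unit with
+ * unit_id 2, and so on (unit_id always being the lowest vcpu_id of the unit,
+ * cf. sched_unit_add_vcpu()).
+ */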
+static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+{
+ struct sched_unit *unit;
+ struct domain *d = v->domain;
+ unsigned int gran = cpupool_get_granularity(d->cpupool);
+
+ for_each_sched_unit ( d, unit )
+ if ( unit->unit_id / gran == v->vcpu_id / gran )
+ break;
+
+ if ( unit )
+ {
+ sched_unit_add_vcpu(unit, v);
+ return unit;
+ }
+
+ if ( (unit = sched_alloc_unit_mem()) == NULL )
+ return NULL;
+
+ sched_unit_add_vcpu(unit, v);
+ sched_domain_insert_unit(unit, d);
+
+ return unit;
+}
+
+static unsigned int sched_select_initial_cpu(const struct vcpu *v)
+{
+ const struct domain *d = v->domain;
+ nodeid_t node;
+ spinlock_t *lock;
+ unsigned long flags;
+ unsigned int cpu_ret, cpu = smp_processor_id();
+ cpumask_t *cpus = cpumask_scratch_cpu(cpu);
+
+ lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+ cpumask_clear(cpus);
+ for_each_node_mask ( node, d->node_affinity )
+ cpumask_or(cpus, cpus, &node_to_cpumask(node));
+ cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
+ if ( cpumask_empty(cpus) )
+ cpumask_copy(cpus, d->cpupool->cpu_valid);
+
+ if ( v->vcpu_id == 0 )
+ cpu_ret = cpumask_first(cpus);
+ else
+ {
+ /* We can rely on previous vcpu being available. */
+ ASSERT(!is_idle_domain(d));
+
+ cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
+ }
+
+ pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
+
+ return cpu_ret;
+}
+
+int sched_init_vcpu(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct sched_unit *unit;
+ unsigned int processor;
+
+ if ( (unit = sched_alloc_unit(v)) == NULL )
+ return 1;
+
+ if ( is_idle_domain(d) )
+ processor = v->vcpu_id;
+ else
+ processor = sched_select_initial_cpu(v);
+
+ /* Initialise the per-vcpu timers. */
+ spin_lock_init(&v->periodic_timer_lock);
+ init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
+ init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
+ init_timer(&v->poll_timer, poll_timer_fn, v, processor);
+
+ /* If this is not the first vcpu of the unit we are done. */
+ if ( unit->priv != NULL )
+ {
+ v->processor = processor;
+ return 0;
+ }
+
+ rcu_read_lock(&sched_res_rculock);
+
+ /* The first vcpu of an unit can be set via sched_set_res(). */
+ sched_set_res(unit, get_sched_res(processor));
+
+ unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
+ if ( unit->priv == NULL )
+ {
+ sched_free_unit(unit, v);
+ rcu_read_unlock(&sched_res_rculock);
+ return 1;
+ }
+
+ /*
+ * Initialize affinity settings. The idler, and potentially
+ * domain-0 VCPUs, are pinned onto their respective physical CPUs.
+ */
+ if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
+ sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
+ else
+ sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+ /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
+ if ( is_idle_domain(d) )
+ {
+ get_sched_res(v->processor)->curr = unit;
+ get_sched_res(v->processor)->sched_unit_idle = unit;
+ v->is_running = 1;
+ unit->is_running = true;
+ unit->state_entry_time = NOW();
+ }
+ else
+ {
+ sched_insert_unit(dom_scheduler(d), unit);
+ }
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return 0;
+}
+
+static void vcpu_move_irqs(struct vcpu *v)
+{
+ arch_move_irqs(v);
+ evtchn_move_pirqs(v);
+}
+
+static void sched_move_irqs(const struct sched_unit *unit)
+{
+ struct vcpu *v;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ vcpu_move_irqs(v);
+}
+
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+ struct vcpu *v;
+ struct sched_unit *unit;
+ unsigned int new_p, unit_idx;
+ void **unit_priv;
+ void *domdata;
+ void *unitdata;
+ struct scheduler *old_ops;
+ void *old_domdata;
+ unsigned int gran = cpupool_get_granularity(c);
+ int ret = 0;
+
+ for_each_vcpu ( d, v )
+ {
+ if ( v->affinity_broken )
+ return -EBUSY;
+ }
+
+ rcu_read_lock(&sched_res_rculock);
+
+ domdata = sched_alloc_domdata(c->sched, d);
+ if ( IS_ERR(domdata) )
+ {
+ ret = PTR_ERR(domdata);
+ goto out;
+ }
+
+ unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
+ if ( unit_priv == NULL )
+ {
+ sched_free_domdata(c->sched, domdata);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ unit_idx = 0;
+ for_each_sched_unit ( d, unit )
+ {
+ unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
+ if ( unit_priv[unit_idx] == NULL )
+ {
+ for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
+ sched_free_udata(c->sched, unit_priv[unit_idx]);
+ xfree(unit_priv);
+ sched_free_domdata(c->sched, domdata);
+ ret = -ENOMEM;
+ goto out;
+ }
+ unit_idx++;
+ }
+
+ domain_pause(d);
+
+ old_ops = dom_scheduler(d);
+ old_domdata = d->sched_priv;
+
+ for_each_sched_unit ( d, unit )
+ {
+ sched_remove_unit(old_ops, unit);
+ }
+
+ d->cpupool = c;
+ d->sched_priv = domdata;
+
+ new_p = cpumask_first(c->cpu_valid);
+ unit_idx = 0;
+ for_each_sched_unit ( d, unit )
+ {
+ spinlock_t *lock;
+ unsigned int unit_p = new_p;
+
+ unitdata = unit->priv;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ {
+ migrate_timer(&v->periodic_timer, new_p);
+ migrate_timer(&v->singleshot_timer, new_p);
+ migrate_timer(&v->poll_timer, new_p);
+ new_p = cpumask_cycle(new_p, c->cpu_valid);
+ }
+
+ lock = unit_schedule_lock_irq(unit);
+
+ sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+ sched_set_res(unit, get_sched_res(unit_p));
+ /*
+ * With v->processor modified we must not
+ * - make any further changes assuming we hold the scheduler lock,
+ * - use unit_schedule_unlock_irq().
+ */
+ spin_unlock_irq(lock);
+
+ unit->priv = unit_priv[unit_idx];
+ if ( !d->is_dying )
+ sched_move_irqs(unit);
+
+ sched_insert_unit(c->sched, unit);
+
+ sched_free_udata(old_ops, unitdata);
+
+ unit_idx++;
+ }
+
+ domain_update_node_affinity(d);
+
+ domain_unpause(d);
+
+ sched_free_domdata(old_ops, old_domdata);
+
+ xfree(unit_priv);
+
+out:
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+void sched_destroy_vcpu(struct vcpu *v)
+{
+ struct sched_unit *unit = v->sched_unit;
+
+ kill_timer(&v->periodic_timer);
+ kill_timer(&v->singleshot_timer);
+ kill_timer(&v->poll_timer);
+ if ( test_and_clear_bool(v->is_urgent) )
+ atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+ /*
+ * Vcpus are being destroyed top-down. So being the first vcpu of an unit
+ * is the same as being the only one.
+ */
+ if ( unit->vcpu_list == v )
+ {
+ rcu_read_lock(&sched_res_rculock);
+
+ sched_remove_unit(vcpu_scheduler(v), unit);
+ sched_free_udata(vcpu_scheduler(v), unit->priv);
+ sched_free_unit(unit, v);
+
+ rcu_read_unlock(&sched_res_rculock);
+ }
+}
+
+int sched_init_domain(struct domain *d, int poolid)
+{
+ void *sdom;
+ int ret;
+
+ ASSERT(d->cpupool == NULL);
+ ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+ if ( (ret = cpupool_add_domain(d, poolid)) )
+ return ret;
+
+ SCHED_STAT_CRANK(dom_init);
+ TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sdom = sched_alloc_domdata(dom_scheduler(d), d);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ if ( IS_ERR(sdom) )
+ return PTR_ERR(sdom);
+
+ d->sched_priv = sdom;
+
+ return 0;
+}
+
+void sched_destroy_domain(struct domain *d)
+{
+ ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+ if ( d->cpupool )
+ {
+ SCHED_STAT_CRANK(dom_destroy);
+ TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sched_free_domdata(dom_scheduler(d), d->sched_priv);
+ d->sched_priv = NULL;
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ cpupool_rm_domain(d);
+ }
+}
+
+static void vcpu_sleep_nosync_locked(struct vcpu *v)
+{
+ struct sched_unit *unit = v->sched_unit;
+
+ ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+
+ if ( likely(!vcpu_runnable(v)) )
+ {
+ if ( v->runstate.state == RUNSTATE_runnable )
+ vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
+        /* Only put the unit to sleep if none of its vcpus is runnable. */
+ if ( likely(!unit_runnable(unit)) )
+ sched_sleep(unit_scheduler(unit), unit);
+ else if ( unit_running(unit) > 1 && v->is_running &&
+ !v->force_context_switch )
+ {
+ v->force_context_switch = true;
+ cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+ }
+ }
+}
+
+void vcpu_sleep_nosync(struct vcpu *v)
+{
+ unsigned long flags;
+ spinlock_t *lock;
+
+ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
+
+ vcpu_sleep_nosync_locked(v);
+
+ unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+void vcpu_sleep_sync(struct vcpu *v)
+{
+ vcpu_sleep_nosync(v);
+
+ while ( !vcpu_runnable(v) && v->is_running )
+ cpu_relax();
+
+ sync_vcpu_execstate(v);
+}
+
+void vcpu_wake(struct vcpu *v)
+{
+ unsigned long flags;
+ spinlock_t *lock;
+ struct sched_unit *unit = v->sched_unit;
+
+ TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irqsave(unit, &flags);
+
+ if ( likely(vcpu_runnable(v)) )
+ {
+ if ( v->runstate.state >= RUNSTATE_blocked )
+ vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
+ /*
+ * Call sched_wake() unconditionally, even if unit is running already.
+ * We might have not been de-scheduled after vcpu_sleep_nosync_locked()
+ * and are now to be woken up again.
+ */
+ sched_wake(unit_scheduler(unit), unit);
+ if ( unit->is_running && !v->is_running && !v->force_context_switch )
+ {
+ v->force_context_switch = true;
+ cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+ }
+ }
+ else if ( !(v->pause_flags & VPF_blocked) )
+ {
+ if ( v->runstate.state == RUNSTATE_blocked )
+ vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+ }
+
+ unit_schedule_unlock_irqrestore(lock, flags, unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+void vcpu_unblock(struct vcpu *v)
+{
+ if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
+ return;
+
+ /* Polling period ends when a VCPU is unblocked. */
+ if ( unlikely(v->poll_evtchn != 0) )
+ {
+ v->poll_evtchn = 0;
+ /*
+ * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
+ * this VCPU (and it then going back to sleep on poll_mask).
+         * Test-and-clear is idiomatic and ensures clear_bit is not reordered.
+ */
+ if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+ clear_bit(_VPF_blocked, &v->pause_flags);
+ }
+
+ vcpu_wake(v);
+}
+
+/*
+ * Do the actual movement of an unit from old to new CPU. Locks for *both*
+ * CPUs need to have been taken already when calling this!
+ */
+static void sched_unit_move_locked(struct sched_unit *unit,
+ unsigned int new_cpu)
+{
+ unsigned int old_cpu = unit->res->master_cpu;
+ struct vcpu *v;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ /*
+ * Transfer urgency status to new CPU before switching CPUs, as
+ * once the switch occurs, v->is_urgent is no longer protected by
+ * the per-CPU scheduler lock we are holding.
+ */
+ for_each_sched_unit_vcpu ( unit, v )
+ {
+ if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
+ {
+ atomic_inc(&per_cpu(sched_urgent_count, new_cpu));
+ atomic_dec(&per_cpu(sched_urgent_count, old_cpu));
+ }
+ }
+
+ /*
+ * Actual CPU switch to new CPU. This is safe because the lock
+ * pointer can't change while the current lock is held.
+ */
+ sched_migrate(unit_scheduler(unit), unit, new_cpu);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+/*
+ * Initiating migration
+ *
+ * In order to migrate, we need the unit in question to have stopped
+ * running and have called sched_sleep() (to take it off any
+ * runqueues, for instance); and if it is currently running, it needs
+ * to be scheduled out. Finally, we need to hold the scheduling locks
+ * for both the processor we're migrating from, and the processor
+ * we're migrating to.
+ *
+ * In order to avoid deadlock while satisfying the final requirement,
+ * we must release any scheduling lock we hold, then try to grab both
+ * locks we want, then double-check to make sure that what we started
+ * to do hasn't been changed in the mean time.
+ *
+ * These steps are encapsulated in the following two functions; they
+ * should be called like this:
+ *
+ * lock = unit_schedule_lock_irq(unit);
+ * sched_unit_migrate_start(unit);
+ * unit_schedule_unlock_irq(lock, unit)
+ * sched_unit_migrate_finish(unit);
+ *
+ * sched_unit_migrate_finish() will do the work now if it can, or simply
+ * return if it can't (because unit is still running); in that case
+ * sched_unit_migrate_finish() will be called by unit_context_saved().
+ */
+static void sched_unit_migrate_start(struct sched_unit *unit)
+{
+ struct vcpu *v;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ {
+ set_bit(_VPF_migrating, &v->pause_flags);
+ vcpu_sleep_nosync_locked(v);
+ }
+}
+
+static void sched_unit_migrate_finish(struct sched_unit *unit)
+{
+ unsigned long flags;
+ unsigned int old_cpu, new_cpu;
+ spinlock_t *old_lock, *new_lock;
+ bool_t pick_called = 0;
+ struct vcpu *v;
+
+ /*
+ * If the unit is currently running, this will be handled by
+ * unit_context_saved(); and in any case, if the bit is cleared, then
+ * someone else has already done the work so we don't need to.
+ */
+ if ( unit->is_running )
+ return;
+ for_each_sched_unit_vcpu ( unit, v )
+ if ( !test_bit(_VPF_migrating, &v->pause_flags) )
+ return;
+
+ old_cpu = new_cpu = unit->res->master_cpu;
+ for ( ; ; )
+ {
+ /*
+ * We need another iteration if the pre-calculated lock addresses
+ * are not correct any longer after evaluating old and new cpu holding
+ * the locks.
+ */
+ old_lock = get_sched_res(old_cpu)->schedule_lock;
+ new_lock = get_sched_res(new_cpu)->schedule_lock;
+
+ sched_spin_lock_double(old_lock, new_lock, &flags);
+
+ old_cpu = unit->res->master_cpu;
+ if ( old_lock == get_sched_res(old_cpu)->schedule_lock )
+ {
+ /*
+             * If we selected a CPU on the previous iteration, check if it
+ * remains suitable for running this vCPU.
+ */
+ if ( pick_called &&
+ (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
+ cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) &&
+ cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
+ break;
+
+ /* Select a new CPU. */
+ new_cpu = sched_pick_resource(unit_scheduler(unit),
+ unit)->master_cpu;
+ if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
+ cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
+ break;
+ pick_called = 1;
+ }
+ else
+ {
+ /*
+ * We do not hold the scheduler lock appropriate for this vCPU.
+ * Thus we cannot select a new CPU on this iteration. Try again.
+ */
+ pick_called = 0;
+ }
+
+ sched_spin_unlock_double(old_lock, new_lock, flags);
+ }
+
+ /*
+     * NB. Check of unit->is_running happens /after/ setting migration flag
+ * because they both happen in (different) spinlock regions, and those
+ * regions are strictly serialised.
+ */
+ if ( unit->is_running )
+ {
+ sched_spin_unlock_double(old_lock, new_lock, flags);
+ return;
+ }
+ for_each_sched_unit_vcpu ( unit, v )
+ {
+ if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
+ {
+ sched_spin_unlock_double(old_lock, new_lock, flags);
+ return;
+ }
+ }
+
+ sched_unit_move_locked(unit, new_cpu);
+
+ sched_spin_unlock_double(old_lock, new_lock, flags);
+
+ if ( old_cpu != new_cpu )
+ {
+ /* Vcpus are moved to other pcpus, commit their states to memory. */
+ for_each_sched_unit_vcpu ( unit, v )
+ sync_vcpu_execstate(v);
+ sched_move_irqs(unit);
+ }
+
+ /* Wake on new CPU. */
+ for_each_sched_unit_vcpu ( unit, v )
+ vcpu_wake(v);
+}
+
+static bool sched_check_affinity_broken(const struct sched_unit *unit)
+{
+ const struct vcpu *v;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ if ( v->affinity_broken )
+ return true;
+
+ return false;
+}
+
+static void sched_reset_affinity_broken(struct sched_unit *unit)
+{
+ struct vcpu *v;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ v->affinity_broken = false;
+}
+
+void restore_vcpu_affinity(struct domain *d)
+{
+ unsigned int cpu = smp_processor_id();
+ struct sched_unit *unit;
+
+ ASSERT(system_state == SYS_STATE_resume);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ for_each_sched_unit ( d, unit )
+ {
+ spinlock_t *lock;
+ unsigned int old_cpu = sched_unit_master(unit);
+ struct sched_resource *res;
+
+ ASSERT(!unit_runnable(unit));
+
+ /*
+ * Re-assign the initial processor as after resume we have no
+ * guarantee the old processor has come back to life again.
+ *
+ * Therefore, here, before actually unpausing the domains, we should
+ * set v->processor of each of their vCPUs to something that will
+         * make sense for the scheduler of the cpupool they are in.
+ */
+ lock = unit_schedule_lock_irq(unit);
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(d));
+ if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
+ {
+ if ( sched_check_affinity_broken(unit) )
+ {
+ sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
+ sched_reset_affinity_broken(unit);
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(d));
+ }
+
+ if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
+ {
+ /* Affinity settings of one vcpu are for the complete unit. */
+ printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
+ unit->vcpu_list);
+ sched_set_affinity(unit, &cpumask_all, NULL);
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(d));
+ }
+ }
+
+ res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu)));
+ sched_set_res(unit, res);
+
+ spin_unlock_irq(lock);
+
+ /* v->processor might have changed, so reacquire the lock. */
+ lock = unit_schedule_lock_irq(unit);
+ res = sched_pick_resource(unit_scheduler(unit), unit);
+ sched_set_res(unit, res);
+ spin_unlock_irq(lock);
+
+ if ( old_cpu != sched_unit_master(unit) )
+ sched_move_irqs(unit);
+ }
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ domain_update_node_affinity(d);
+}
+
+/*
+ * This function is used by cpu_hotplug code via cpu notifier chain
+ * and from cpupools to switch schedulers on a cpu.
+ * Caller must get domlist_read_lock.
+ */
+int cpu_disable_scheduler(unsigned int cpu)
+{
+ struct domain *d;
+ struct cpupool *c;
+ cpumask_t online_affinity;
+ int ret = 0;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ c = get_sched_res(cpu)->cpupool;
+ if ( c == NULL )
+ goto out;
+
+ for_each_domain_in_cpupool ( d, c )
+ {
+ struct sched_unit *unit;
+
+ for_each_sched_unit ( d, unit )
+ {
+ unsigned long flags;
+ spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
+
+ cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid);
+ if ( cpumask_empty(&online_affinity) &&
+ cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
+ {
+ if ( sched_check_affinity_broken(unit) )
+ {
+ /* The unit is temporarily pinned, can't move it. */
+ unit_schedule_unlock_irqrestore(lock, flags, unit);
+ ret = -EADDRINUSE;
+ break;
+ }
+
+ printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
+ unit->vcpu_list);
+
+ sched_set_affinity(unit, &cpumask_all, NULL);
+ }
+
+ if ( unit->res != get_sched_res(cpu) )
+ {
+ /* The unit is not on this cpu, so we can move on. */
+ unit_schedule_unlock_irqrestore(lock, flags, unit);
+ continue;
+ }
+
+            /*
+             * If it is on this cpu, we must send it away.
+             * We are doing some cpupool manipulations:
+             *  * we want to call the scheduler, and let it re-evaluate
+             *    the placement of the vcpu, taking into account the new
+             *    cpupool configuration;
+             *  * the scheduler will always find a suitable solution, or
+             *    things would have failed before getting in here.
+ */
+ sched_unit_migrate_start(unit);
+ unit_schedule_unlock_irqrestore(lock, flags, unit);
+ sched_unit_migrate_finish(unit);
+
+ /*
+             * The only caveat, in this case, is a vcpu active in the
+             * hypervisor which isn't migratable. In that case, the caller
+             * should try again after releasing and reacquiring all locks.
+ */
+ if ( unit->res == get_sched_res(cpu) )
+ ret = -EAGAIN;
+ }
+ }
+
+out:
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+static int cpu_disable_scheduler_check(unsigned int cpu)
+{
+ struct domain *d;
+ struct vcpu *v;
+ struct cpupool *c;
+
+ c = get_sched_res(cpu)->cpupool;
+ if ( c == NULL )
+ return 0;
+
+ for_each_domain_in_cpupool ( d, c )
+ for_each_vcpu ( d, v )
+ if ( v->affinity_broken )
+ return -EADDRINUSE;
+
+ return 0;
+}
+
+/*
+ * In general, this must be called with the scheduler lock held, because the
+ * adjust_affinity hook may want to modify the vCPU state. However, when the
+ * vCPU is being initialized (either for dom0 or domU) there is no risk of
+ * races, and it's fine to not take the lock (we're talking about
+ * sched_setup_dom0_vcpus() and sched_init_vcpu()).
+ */
+static void sched_set_affinity(
+ struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft)
+{
+ rcu_read_lock(&sched_res_rculock);
+ sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft);
+ rcu_read_unlock(&sched_res_rculock);
+
+ if ( hard )
+ cpumask_copy(unit->cpu_hard_affinity, hard);
+ if ( soft )
+ cpumask_copy(unit->cpu_soft_affinity, soft);
+
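+    /*
+     * Soft affinity is only "effective" if it actually restricts placement
+     * within the hard affinity, i.e. if it intersects the hard affinity
+     * without covering all of it.
+     */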
+ unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity,
+ unit->cpu_soft_affinity) &&
+ cpumask_intersects(unit->cpu_soft_affinity,
+ unit->cpu_hard_affinity);
+}
+
+static int vcpu_set_affinity(
+ struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which)
+{
+ struct sched_unit *unit = v->sched_unit;
+ spinlock_t *lock;
+ int ret = 0;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irq(unit);
+
+ if ( v->affinity_broken )
+ ret = -EBUSY;
+ else
+ {
+ /*
+         * Tell the scheduler we changed something about affinity,
+ * and ask to re-evaluate vcpu placement.
+ */
+ if ( which == unit->cpu_hard_affinity )
+ {
+ sched_set_affinity(unit, affinity, NULL);
+ }
+ else
+ {
+ ASSERT(which == unit->cpu_soft_affinity);
+ sched_set_affinity(unit, NULL, affinity);
+ }
+ sched_unit_migrate_start(unit);
+ }
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ domain_update_node_affinity(v->domain);
+
+ sched_unit_migrate_finish(unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
+{
+ cpumask_t online_affinity;
+ cpumask_t *online;
+
+ online = VCPU2ONLINE(v);
+ cpumask_and(&online_affinity, affinity, online);
+ if ( cpumask_empty(&online_affinity) )
+ return -EINVAL;
+
+ return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity);
+}
+
+int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
+{
+ return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity);
+}
+
+/* Block the currently-executing domain until a pertinent event occurs. */
+void vcpu_block(void)
+{
+ struct vcpu *v = current;
+
+ set_bit(_VPF_blocked, &v->pause_flags);
+
+ arch_vcpu_block(v);
+
+ /* Check for events /after/ blocking: avoids wakeup waiting race. */
+ if ( local_events_need_delivery() )
+ {
+ clear_bit(_VPF_blocked, &v->pause_flags);
+ }
+ else
+ {
+ TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ }
+}
+
+static void vcpu_block_enable_events(void)
+{
+ local_event_delivery_enable();
+ vcpu_block();
+}
+
+static long do_poll(struct sched_poll *sched_poll)
+{
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
+ evtchn_port_t port = 0;
+ long rc;
+ unsigned int i;
+
+ /* Fairly arbitrary limit. */
+ if ( sched_poll->nr_ports > 128 )
+ return -EINVAL;
+
+ if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
+ return -EFAULT;
+
+ set_bit(_VPF_blocked, &v->pause_flags);
+ v->poll_evtchn = -1;
+ set_bit(v->vcpu_id, d->poll_mask);
+
+ arch_vcpu_block(v);
+
+#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
+ /* Check for events /after/ setting flags: avoids wakeup waiting race. */
+ smp_mb();
+
+ /*
+ * Someone may have seen we are blocked but not that we are polling, or
+ * vice versa. We are certainly being woken, so clean up and bail. Beyond
+ * this point others can be guaranteed to clean up for us if they wake us.
+ */
+ rc = 0;
+ if ( (v->poll_evtchn == 0) ||
+ !test_bit(_VPF_blocked, &v->pause_flags) ||
+ !test_bit(v->vcpu_id, d->poll_mask) )
+ goto out;
+#endif
+
+ rc = 0;
+ if ( local_events_need_delivery() )
+ goto out;
+
+ for ( i = 0; i < sched_poll->nr_ports; i++ )
+ {
+ rc = -EFAULT;
+ if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
+ goto out;
+
+ rc = -EINVAL;
+ if ( port >= d->max_evtchns )
+ goto out;
+
+ rc = 0;
+ if ( evtchn_port_is_pending(d, port) )
+ goto out;
+ }
+
+ if ( sched_poll->nr_ports == 1 )
+ v->poll_evtchn = port;
+
+ if ( sched_poll->timeout != 0 )
+ set_timer(&v->poll_timer, sched_poll->timeout);
+
+ TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
+ raise_softirq(SCHEDULE_SOFTIRQ);
+
+ return 0;
+
+ out:
+ v->poll_evtchn = 0;
+ clear_bit(v->vcpu_id, d->poll_mask);
+ clear_bit(_VPF_blocked, &v->pause_flags);
+ return rc;
+}
+
+/* Voluntarily yield the processor for this allocation. */
+long vcpu_yield(void)
+{
+    struct vcpu *v = current;
+ spinlock_t *lock;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irq(v->sched_unit);
+ sched_yield(vcpu_scheduler(v), v->sched_unit);
+ unit_schedule_unlock_irq(lock, v->sched_unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ SCHED_STAT_CRANK(vcpu_yield);
+
+ TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ return 0;
+}
+
+static void domain_watchdog_timeout(void *data)
+{
+ struct domain *d = data;
+
+ if ( d->is_shutting_down || d->is_dying )
+ return;
+
+ printk("Watchdog timer fired for domain %u\n", d->domain_id);
+ domain_shutdown(d, SHUTDOWN_watchdog);
+}
+
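+/*
+ * Arm or disarm one of the domain's watchdog timers:
+ * - id == 0: allocate a free slot, arm it with the given timeout and return
+ *   the new handle (slot number + 1), or -ENOSPC if all slots are in use;
+ * - id != 0: re-arm the timer behind handle id, or, if timeout is 0, stop
+ *   it and release the slot again.
+ */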
+static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
+{
+ if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
+ return -EINVAL;
+
+ spin_lock(&d->watchdog_lock);
+
+ if ( id == 0 )
+ {
+ for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
+ {
+ if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
+ continue;
+ set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
+ break;
+ }
+ spin_unlock(&d->watchdog_lock);
+ return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
+ }
+
+ id -= 1;
+ if ( !test_bit(id, &d->watchdog_inuse_map) )
+ {
+ spin_unlock(&d->watchdog_lock);
+ return -EINVAL;
+ }
+
+ if ( timeout == 0 )
+ {
+ stop_timer(&d->watchdog_timer[id]);
+ clear_bit(id, &d->watchdog_inuse_map);
+ }
+ else
+ {
+ set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
+ }
+
+ spin_unlock(&d->watchdog_lock);
+ return 0;
+}
+
+void watchdog_domain_init(struct domain *d)
+{
+ unsigned int i;
+
+ spin_lock_init(&d->watchdog_lock);
+
+ d->watchdog_inuse_map = 0;
+
+ for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
+ init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
+}
+
+void watchdog_domain_destroy(struct domain *d)
+{
+ unsigned int i;
+
+ for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
+ kill_timer(&d->watchdog_timer[i]);
+}
+
+/*
+ * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
+ * cpu is NR_CPUS).
+ * Temporary pinning can be done for two reasons, which may be nested:
+ * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
+ *   of a conflict (e.g. in case the cpupool doesn't include the requested
+ *   CPU, or another conflicting temporary pinning is already in effect).
+ * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the
+ * CPU it is just running on. Can't fail if used properly.
+ */
+int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
+{
+ struct sched_unit *unit = v->sched_unit;
+ spinlock_t *lock;
+ int ret = -EINVAL;
+ bool migrate;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irq(unit);
+
+ if ( cpu == NR_CPUS )
+ {
+ if ( v->affinity_broken & reason )
+ {
+ ret = 0;
+ v->affinity_broken &= ~reason;
+ }
+ if ( !ret && !sched_check_affinity_broken(unit) )
+ sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
+ }
+ else if ( cpu < nr_cpu_ids )
+ {
+ if ( (v->affinity_broken & reason) ||
+ (sched_check_affinity_broken(unit) && v->processor != cpu) )
+ ret = -EBUSY;
+ else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
+ {
+ if ( !sched_check_affinity_broken(unit) )
+ {
+ cpumask_copy(unit->cpu_hard_affinity_saved,
+ unit->cpu_hard_affinity);
+ sched_set_affinity(unit, cpumask_of(cpu), NULL);
+ }
+ v->affinity_broken |= reason;
+ ret = 0;
+ }
+ }
+
+ migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity);
+ if ( migrate )
+ sched_unit_migrate_start(unit);
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ if ( migrate )
+ sched_unit_migrate_finish(unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
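+/*
+ * do_sched_op() below is built for the compat hypercall entry as well: the
+ * typedef is inside the !COMPAT section, so the compat build is expected to
+ * supply its own ret_t definition.
+ */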
+typedef long ret_t;
+
+#endif /* !COMPAT */
+
+ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
+{
+ ret_t ret = 0;
+
+ switch ( cmd )
+ {
+ case SCHEDOP_yield:
+ {
+ ret = vcpu_yield();
+ break;
+ }
+
+ case SCHEDOP_block:
+ {
+ vcpu_block_enable_events();
+ break;
+ }
+
+ case SCHEDOP_shutdown:
+ {
+ struct sched_shutdown sched_shutdown;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_shutdown, arg, 1) )
+ break;
+
+ TRACE_3D(TRC_SCHED_SHUTDOWN,
+ current->domain->domain_id, current->vcpu_id,
+ sched_shutdown.reason);
+ ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
+
+ break;
+ }
+
+ case SCHEDOP_shutdown_code:
+ {
+ struct sched_shutdown sched_shutdown;
+ struct domain *d = current->domain;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_shutdown, arg, 1) )
+ break;
+
+ TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
+ d->domain_id, current->vcpu_id, sched_shutdown.reason);
+
+ spin_lock(&d->shutdown_lock);
+ if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
+ d->shutdown_code = (u8)sched_shutdown.reason;
+ spin_unlock(&d->shutdown_lock);
+
+ ret = 0;
+ break;
+ }
+
+ case SCHEDOP_poll:
+ {
+ struct sched_poll sched_poll;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_poll, arg, 1) )
+ break;
+
+ ret = do_poll(&sched_poll);
+
+ break;
+ }
+
+ case SCHEDOP_remote_shutdown:
+ {
+ struct domain *d;
+ struct sched_remote_shutdown sched_remote_shutdown;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
+ if ( d == NULL )
+ break;
+
+ ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
+ if ( likely(!ret) )
+ domain_shutdown(d, sched_remote_shutdown.reason);
+
+ rcu_unlock_domain(d);
+
+ break;
+ }
+
+ case SCHEDOP_watchdog:
+ {
+ struct sched_watchdog sched_watchdog;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_watchdog, arg, 1) )
+ break;
+
+ ret = domain_watchdog(
+ current->domain, sched_watchdog.id, sched_watchdog.timeout);
+ break;
+ }
+
+ case SCHEDOP_pin_override:
+ {
+ struct sched_pin_override sched_pin_override;
+ unsigned int cpu;
+
+ ret = -EPERM;
+ if ( !is_hardware_domain(current->domain) )
+ break;
+
+ ret = -EFAULT;
+ if ( copy_from_guest(&sched_pin_override, arg, 1) )
+ break;
+
+ ret = -EINVAL;
+ if ( sched_pin_override.pcpu >= NR_CPUS )
+ break;
+
+ cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu;
+ ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE);
+
+ break;
+ }
+
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+#ifndef COMPAT
+
+/* Per-vcpu oneshot-timer hypercall. */
+long do_set_timer_op(s_time_t timeout)
+{
+ struct vcpu *v = current;
+ s_time_t offset = timeout - NOW();
+
+ if ( timeout == 0 )
+ {
+ stop_timer(&v->singleshot_timer);
+ }
+ else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
+ unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
+ {
+ /*
+ * Linux workaround: occasionally we will see timeouts a long way in
+ * the future due to wrapping in Linux's jiffy time handling. We check
+ * for timeouts wrapped negative, and for positive timeouts more than
+ * about 13 days in the future (2^50ns). The correct fix is to trigger
+ * an interrupt immediately (since Linux in fact has pending work to
+ * do in this situation). However, older guests also set a long timeout
+ * when they have *no* pending timers at all: setting an immediate
+ * timeout in this case can burn a lot of CPU. We therefore go for a
+ * reasonable middleground of triggering a timer event in 100ms.
+ */
+ gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
+ timeout);
+ set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
+ }
+ else
+ {
+ migrate_timer(&v->singleshot_timer, smp_processor_id());
+ set_timer(&v->singleshot_timer, timeout);
+ }
+
+ return 0;
+}
+
+/* sched_id - fetch ID of current scheduler */
+int sched_id(void)
+{
+ return ops.sched_id;
+}
+
+/* Adjust scheduling parameter for a given domain. */
+long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
+{
+ long ret;
+
+ ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
+ if ( ret )
+ return ret;
+
+ if ( op->sched_id != dom_scheduler(d)->sched_id )
+ return -EINVAL;
+
+ switch ( op->cmd )
+ {
+ case XEN_DOMCTL_SCHEDOP_putinfo:
+ case XEN_DOMCTL_SCHEDOP_getinfo:
+ case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
+ case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+    /*
+     * NB: the pluggable scheduler code needs to take care of locking by
+     * itself.
+     */
+ rcu_read_lock(&sched_res_rculock);
+
+ if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 )
+ TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
+{
+ struct cpupool *pool;
+ int rc;
+
+ rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
+ if ( rc )
+ return rc;
+
+ if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
+ (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
+ return -EINVAL;
+
+ pool = cpupool_get_by_id(op->cpupool_id);
+ if ( pool == NULL )
+ return -ESRCH;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ rc = ((op->sched_id == pool->sched->sched_id)
+ ? sched_adjust_cpupool(pool->sched, op) : -EINVAL);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ cpupool_put(pool);
+
+ return rc;
+}
+
+static void vcpu_periodic_timer_work_locked(struct vcpu *v)
+{
+ s_time_t now;
+ s_time_t periodic_next_event;
+
+ now = NOW();
+ periodic_next_event = v->periodic_last_event + v->periodic_period;
+
+ if ( now >= periodic_next_event )
+ {
+ send_timer_event(v);
+ v->periodic_last_event = now;
+ periodic_next_event = now + v->periodic_period;
+ }
+
+ migrate_timer(&v->periodic_timer, v->processor);
+ set_timer(&v->periodic_timer, periodic_next_event);
+}
+
+static void vcpu_periodic_timer_work(struct vcpu *v)
+{
+ if ( v->periodic_period == 0 )
+ return;
+
+ spin_lock(&v->periodic_timer_lock);
+ if ( v->periodic_period )
+ vcpu_periodic_timer_work_locked(v);
+ spin_unlock(&v->periodic_timer_lock);
+}
+
+/*
+ * Set the periodic timer of a vcpu; a value of 0 stops it.
+ */
+void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value)
+{
+ spin_lock(&v->periodic_timer_lock);
+
+ stop_timer(&v->periodic_timer);
+
+ v->periodic_period = value;
+ if ( value )
+ vcpu_periodic_timer_work_locked(v);
+
+ spin_unlock(&v->periodic_timer_lock);
+}
+
+static void sched_switch_units(struct sched_resource *sr,
+ struct sched_unit *next, struct sched_unit *prev,
+ s_time_t now)
+{
+ unsigned int cpu;
+
+ ASSERT(unit_running(prev));
+
+ if ( prev != next )
+ {
+ sr->curr = next;
+ sr->prev = prev;
+
+ TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
+ prev->unit_id, now - prev->state_entry_time);
+ TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
+ next->unit_id,
+ (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
+ (now - next->state_entry_time) : 0, prev->next_time);
+ TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
+ next->domain->domain_id, next->unit_id);
+
+ ASSERT(!unit_running(next));
+
+ /*
+ * NB. Don't add any trace records from here until the actual context
+ * switch, else lost_records resume will not work properly.
+ */
+
+ ASSERT(!next->is_running);
+ next->is_running = true;
+ next->state_entry_time = now;
+
+ if ( is_idle_unit(prev) )
+ {
+ prev->runstate_cnt[RUNSTATE_running] = 0;
+ prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity;
+ }
+ if ( is_idle_unit(next) )
+ {
+ next->runstate_cnt[RUNSTATE_running] = sr->granularity;
+ next->runstate_cnt[RUNSTATE_runnable] = 0;
+ }
+ }
+
+ for_each_cpu ( cpu, sr->cpus )
+ {
+ struct vcpu *vprev = get_cpu_current(cpu);
+ struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
+
+ if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
+ {
+ vcpu_runstate_change(vprev,
+ ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
+ (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+ now);
+ vcpu_runstate_change(vnext, vnext->new_state, now);
+ }
+
+ vnext->is_running = 1;
+
+ if ( is_idle_vcpu(vnext) )
+ vnext->sched_unit = next;
+ }
+}
+
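+/*
+ * Check whether there is tasklet work to be done on this cpu and keep the
+ * TASKLET_scheduled flag in sync:
+ * - enqueued             -> mark as scheduled, report work to do;
+ * - enqueued + scheduled -> report work to do;
+ * - scheduled only       -> all work done, clear the flag;
+ * - neither              -> nothing to do.
+ */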
+static bool sched_tasklet_check_cpu(unsigned int cpu)
+{
+ unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
+
+ switch ( *tasklet_work )
+ {
+ case TASKLET_enqueued:
+ set_bit(_TASKLET_scheduled, tasklet_work);
+ /* fallthrough */
+ case TASKLET_enqueued|TASKLET_scheduled:
+ return true;
+ break;
+ case TASKLET_scheduled:
+ clear_bit(_TASKLET_scheduled, tasklet_work);
+ /* fallthrough */
+ case 0:
+ /* return false; */
+ break;
+ default:
+ BUG();
+ }
+
+ return false;
+}
+
+static bool sched_tasklet_check(unsigned int cpu)
+{
+ bool tasklet_work_scheduled = false;
+ const cpumask_t *mask = get_sched_res(cpu)->cpus;
+ unsigned int cpu_iter;
+
+ for_each_cpu ( cpu_iter, mask )
+ if ( sched_tasklet_check_cpu(cpu_iter) )
+ tasklet_work_scheduled = true;
+
+ return tasklet_work_scheduled;
+}
+
+static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
+ unsigned int cpu)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct scheduler *sched = sr->scheduler;
+ struct sched_unit *next;
+
+ /* get policy-specific decision on scheduling... */
+ sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
+
+ next = prev->next_task;
+
+ if ( prev->next_time >= 0 ) /* -ve means no limit */
+ set_timer(&sr->s_timer, now + prev->next_time);
+
+ sched_switch_units(sr, next, prev, now);
+
+ return next;
+}
+
+static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
+{
+ /* Clear running flag /after/ writing context to memory. */
+ smp_wmb();
+
+ if ( vprev != vnext )
+ vprev->is_running = 0;
+}
+
+static void unit_context_saved(struct sched_resource *sr)
+{
+ struct sched_unit *unit = sr->prev;
+
+ if ( !unit )
+ return;
+
+ unit->is_running = false;
+ unit->state_entry_time = NOW();
+ sr->prev = NULL;
+
+ /* Check for migration request /after/ clearing running flag. */
+ smp_mb();
+
+ sched_context_saved(unit_scheduler(unit), unit);
+
+ /* Idle never migrates and idle vcpus might belong to other units. */
+ if ( !is_idle_unit(unit) )
+ sched_unit_migrate_finish(unit);
+}
+
+/*
+ * Rendezvous on end of context switch.
+ * As no lock is protecting this rendezvous function we need to use atomic
+ * access functions on the counter.
+ * The counter will be 0 in case no rendezvous is needed. For the rendezvous
+ * case it is initialised to the number of cpus to rendezvous plus 1. Each
+ * member entering decrements the counter. The last one will decrement it to
+ * 1 and perform the final needed action in that case (call of
+ * unit_context_saved()), and then set the counter to zero. The other members
+ * will wait until the counter becomes zero before they proceed.
+ */
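+/*
+ * Illustrative example: with 2 cpus per scheduling resource the counter is
+ * set to 3 (gran + 1) before the context switch. The two cpus passing
+ * through here decrement it to 2 and 1 respectively; the cpu reaching 1
+ * calls unit_context_saved() and then sets the counter to 0, releasing the
+ * other cpu, which is spinning on it.
+ */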
+void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
+{
+ struct sched_unit *next = vnext->sched_unit;
+ struct sched_resource *sr;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(smp_processor_id());
+
+ if ( atomic_read(&next->rendezvous_out_cnt) )
+ {
+ int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
+
+ vcpu_context_saved(vprev, vnext);
+
+ /* Call unit_context_saved() before releasing other waiters. */
+ if ( cnt == 1 )
+ {
+ unit_context_saved(sr);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+ else
+ while ( atomic_read(&next->rendezvous_out_cnt) )
+ cpu_relax();
+ }
+ else
+ {
+ vcpu_context_saved(vprev, vnext);
+ if ( sr->granularity == 1 )
+ unit_context_saved(sr);
+ }
+
+ if ( is_idle_vcpu(vprev) && vprev != vnext )
+ vprev->sched_unit = sr->sched_unit_idle;
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
+ bool reset_idle_unit, s_time_t now)
+{
+ if ( unlikely(vprev == vnext) )
+ {
+ TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
+ vnext->domain->domain_id, vnext->sched_unit->unit_id,
+ now - vprev->runstate.state_entry_time,
+ vprev->sched_unit->next_time);
+ sched_context_switched(vprev, vnext);
+
+ /*
+ * We are switching from a non-idle to an idle unit.
+ * A vcpu of the idle unit might have been running before due to
+ * the guest vcpu being blocked. We must adjust the unit of the idle
+ * vcpu which might have been set to the guest's one.
+ */
+ if ( reset_idle_unit )
+ vnext->sched_unit =
+ get_sched_res(smp_processor_id())->sched_unit_idle;
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ trace_continue_running(vnext);
+ return continue_running(vprev);
+ }
+
+ SCHED_STAT_CRANK(sched_ctx);
+
+ stop_timer(&vprev->periodic_timer);
+
+ if ( vnext->sched_unit->migrated )
+ vcpu_move_irqs(vnext);
+
+ vcpu_periodic_timer_work(vnext);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ context_switch(vprev, vnext);
+}
+
+/*
+ * Force a context switch of a single vcpu of an unit.
+ * Might be called either if a vcpu of an already running unit is woken up
+ * or if a vcpu of a running unit is put asleep with other vcpus of the same
+ * unit still running.
+ * Returns either NULL if v is already in the correct state or the vcpu to
+ * run next.
+ */
+static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
+ struct vcpu *v,
+ unsigned int cpu, s_time_t now)
+{
+ v->force_context_switch = false;
+
+ if ( vcpu_runnable(v) == v->is_running )
+ return NULL;
+
+ if ( vcpu_runnable(v) )
+ {
+ if ( is_idle_vcpu(vprev) )
+ {
+ vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
+ vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
+ }
+ vcpu_runstate_change(v, RUNSTATE_running, now);
+ }
+ else
+ {
+ /* Make sure not to switch last vcpu of an unit away. */
+ if ( unit_running(v->sched_unit) == 1 )
+ return NULL;
+
+ v->new_state = vcpu_runstate_blocked(v);
+ vcpu_runstate_change(v, v->new_state, now);
+ v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
+ if ( v != vprev )
+ {
+ if ( is_idle_vcpu(vprev) )
+ {
+ vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
+ vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
+ }
+ else
+ {
+ v->sched_unit = vprev->sched_unit;
+ vcpu_runstate_change(v, RUNSTATE_running, now);
+ }
+ }
+ }
+
+ /* This vcpu will be switched to. */
+ v->is_running = true;
+
+    /* Make sure not to lose another slave call. */
+ raise_softirq(SCHED_SLAVE_SOFTIRQ);
+
+ return v;
+}
+
+/*
+ * Rendezvous before taking a scheduling decision.
+ * Called with schedule lock held, so all accesses to the rendezvous counter
+ * can be normal ones (no atomic accesses needed).
+ * The counter is initialized to the number of cpus to rendezvous.
+ * Each cpu entering will decrement the counter. When the counter reaches
+ * zero, do_schedule() is called and the rendezvous counter for leaving
+ * context_switch() is set. All other members will wait until the counter
+ * becomes zero, dropping the schedule lock in between.
+ */
+static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
+ spinlock_t **lock, int cpu,
+ s_time_t now)
+{
+ struct sched_unit *next;
+ struct vcpu *v;
+ unsigned int gran = get_sched_res(cpu)->granularity;
+
+ if ( !--prev->rendezvous_in_cnt )
+ {
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, gran + 1);
+ return next;
+ }
+
+ v = unit2vcpu_cpu(prev, cpu);
+ while ( prev->rendezvous_in_cnt )
+ {
+ if ( v && v->force_context_switch )
+ {
+ struct vcpu *vprev = current;
+
+ v = sched_force_context_switch(vprev, v, cpu, now);
+
+ if ( v )
+ {
+ /* We'll come back another time, so adjust rendezvous_in_cnt. */
+ prev->rendezvous_in_cnt++;
+ atomic_set(&prev->rendezvous_out_cnt, 0);
+
+ pcpu_schedule_unlock_irq(*lock, cpu);
+
+ sched_context_switch(vprev, v, false, now);
+
+ return NULL; /* ARM only. */
+ }
+
+ v = unit2vcpu_cpu(prev, cpu);
+ }
+ /*
+ * Coming from idle might need to do tasklet work.
+ * In order to avoid deadlocks we can't do that here, but have to
+ * continue the idle loop.
+ * Undo the rendezvous_in_cnt decrement and schedule another call of
+ * sched_slave().
+ */
+ if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) )
+ {
+ struct vcpu *vprev = current;
+
+ prev->rendezvous_in_cnt++;
+ atomic_set(&prev->rendezvous_out_cnt, 0);
+
+ pcpu_schedule_unlock_irq(*lock, cpu);
+
+ raise_softirq(SCHED_SLAVE_SOFTIRQ);
+ sched_context_switch(vprev, vprev, false, now);
+
+ return NULL; /* ARM only. */
+ }
+
+ pcpu_schedule_unlock_irq(*lock, cpu);
+
+ cpu_relax();
+
+ *lock = pcpu_schedule_lock_irq(cpu);
+
+ if ( unlikely(!scheduler_active) )
+ {
+ ASSERT(is_idle_unit(prev));
+ atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
+ prev->rendezvous_in_cnt = 0;
+ }
+ }
+
+ return prev->next_task;
+}
+
+static void sched_slave(void)
+{
+ struct vcpu *v, *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next;
+ s_time_t now;
+ spinlock_t *lock;
+ bool do_softirq = false;
+ unsigned int cpu = smp_processor_id();
+
+ ASSERT_NOT_IN_ATOMIC();
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = pcpu_schedule_lock_irq(cpu);
+
+ now = NOW();
+
+ v = unit2vcpu_cpu(prev, cpu);
+ if ( v && v->force_context_switch )
+ {
+ v = sched_force_context_switch(vprev, v, cpu, now);
+
+ if ( v )
+ {
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ sched_context_switch(vprev, v, false, now);
+
+ return;
+ }
+
+ do_softirq = true;
+ }
+
+ if ( !prev->rendezvous_in_cnt )
+ {
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ /* Check for failed forced context switch. */
+ if ( do_softirq )
+ raise_softirq(SCHEDULE_SOFTIRQ);
+
+ return;
+ }
+
+ stop_timer(&get_sched_res(cpu)->s_timer);
+
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
+ is_idle_unit(next) && !is_idle_unit(prev), now);
+}
+
+/*
+ * The main function
+ * - deschedule the current domain (scheduler independent).
+ * - pick a new domain (scheduler dependent).
+ */
+static void schedule(void)
+{
+ struct vcpu *vnext, *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next = NULL;
+ s_time_t now;
+ struct sched_resource *sr;
+ spinlock_t *lock;
+ int cpu = smp_processor_id();
+ unsigned int gran;
+
+ ASSERT_NOT_IN_ATOMIC();
+
+ SCHED_STAT_CRANK(sched_run);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(cpu);
+ gran = sr->granularity;
+
+ lock = pcpu_schedule_lock_irq(cpu);
+
+ if ( prev->rendezvous_in_cnt )
+ {
+ /*
+ * We have a race: sched_slave() should be called, so raise a softirq
+ * in order to re-enter schedule() later and call sched_slave() now.
+ */
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ return sched_slave();
+ }
+
+ stop_timer(&sr->s_timer);
+
+ now = NOW();
+
+ if ( gran > 1 )
+ {
+ cpumask_t mask;
+
+ prev->rendezvous_in_cnt = gran;
+ cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu));
+ cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ);
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+ }
+ else
+ {
+ prev->rendezvous_in_cnt = 0;
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ vnext = sched_unit2vcpu_cpu(next, cpu);
+ sched_context_switch(vprev, vnext,
+ !is_idle_unit(prev) && is_idle_unit(next), now);
+}
+
+/* The scheduler timer: force a run through the scheduler */
+static void s_timer_fn(void *unused)
+{
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ SCHED_STAT_CRANK(sched_irq);
+}
+
+/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
+static void vcpu_periodic_timer_fn(void *data)
+{
+ struct vcpu *v = data;
+ vcpu_periodic_timer_work(v);
+}
+
+/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
+static void vcpu_singleshot_timer_fn(void *data)
+{
+ struct vcpu *v = data;
+ send_timer_event(v);
+}
+
+/* SCHEDOP_poll timeout callback. */
+static void poll_timer_fn(void *data)
+{
+ struct vcpu *v = data;
+
+ if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+ vcpu_unblock(v);
+}
+
+static struct sched_resource *sched_alloc_res(void)
+{
+ struct sched_resource *sr;
+
+ sr = xzalloc(struct sched_resource);
+ if ( sr == NULL )
+ return NULL;
+ if ( !zalloc_cpumask_var(&sr->cpus) )
+ {
+ xfree(sr);
+ return NULL;
+ }
+ return sr;
+}
+
+static int cpu_schedule_up(unsigned int cpu)
+{
+ struct sched_resource *sr;
+
+ sr = sched_alloc_res();
+ if ( sr == NULL )
+ return -ENOMEM;
+
+ sr->master_cpu = cpu;
+ cpumask_copy(sr->cpus, cpumask_of(cpu));
+ set_sched_res(cpu, sr);
+
+ sr->scheduler = &sched_idle_ops;
+ spin_lock_init(&sr->_lock);
+ sr->schedule_lock = &sched_free_cpu_lock;
+ init_timer(&sr->s_timer, s_timer_fn, NULL, cpu);
+ atomic_set(&per_cpu(sched_urgent_count, cpu), 0);
+
+ /* We start with cpu granularity. */
+ sr->granularity = 1;
+
+ cpumask_set_cpu(cpu, &sched_res_mask);
+
+ /* Boot CPU is dealt with later in scheduler_init(). */
+ if ( cpu == 0 )
+ return 0;
+
+ if ( idle_vcpu[cpu] == NULL )
+ vcpu_create(idle_vcpu[0]->domain, cpu);
+ else
+ idle_vcpu[cpu]->sched_unit->res = sr;
+
+ if ( idle_vcpu[cpu] == NULL )
+ return -ENOMEM;
+
+ idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
+
+ /*
+ * No need to allocate any scheduler data, as cpus coming online are
+ * free initially and the idle scheduler doesn't need any data areas
+ * allocated.
+ */
+
+ sr->curr = idle_vcpu[cpu]->sched_unit;
+ sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
+
+ sr->sched_priv = NULL;
+
+ return 0;
+}
+
+static void sched_res_free(struct rcu_head *head)
+{
+ struct sched_resource *sr = container_of(head, struct sched_resource, rcu);
+
+ free_cpumask_var(sr->cpus);
+ if ( sr->sched_unit_idle )
+ sched_free_unit_mem(sr->sched_unit_idle);
+ xfree(sr);
+}
+
+static void cpu_schedule_down(unsigned int cpu)
+{
+ struct sched_resource *sr;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(cpu);
+
+ kill_timer(&sr->s_timer);
+
+ cpumask_clear_cpu(cpu, &sched_res_mask);
+ set_sched_res(cpu, NULL);
+
+ /* Keep idle unit. */
+ sr->sched_unit_idle = NULL;
+ call_rcu(&sr->rcu, sched_res_free);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+void sched_rm_cpu(unsigned int cpu)
+{
+ int rc;
+
+ rcu_read_lock(&domlist_read_lock);
+ rc = cpu_disable_scheduler(cpu);
+ BUG_ON(rc);
+ rcu_read_unlock(&domlist_read_lock);
+ cpu_schedule_down(cpu);
+}
+
+static int cpu_schedule_callback(
+ struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ int rc = 0;
+
+ /*
+ * All scheduler related suspend/resume handling needed is done in
+ * cpupool.c.
+ */
+ if ( system_state > SYS_STATE_active )
+ return NOTIFY_DONE;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ /*
+ * From the scheduler perspective, bringing up a pCPU requires
+ * allocating and initializing the per-pCPU scheduler specific data,
+ * as well as "registering" this pCPU to the scheduler (which may
+ * involve modifying some scheduler wide data structures).
+ * As new pCPUs always start as "free" cpus with the minimal idle
+ * scheduler being in charge, we don't need any of that.
+ *
+ * On the other hand, at teardown, we need to reverse what has been done
+ * during initialization, and then free the per-pCPU specific data. A
+ * pCPU brought down is not forced through "free" cpus, so here we need to
+ * use the appropriate hooks.
+ *
+ * This happens by calling the deinit_pdata and free_pdata hooks, in this
+ * order. If no per-pCPU memory was allocated, there is no need to
+ * provide an implementation of free_pdata. deinit_pdata may, however,
+ * be necessary/useful in this case too (e.g., it can undo something done
+ * on scheduler wide data structure during init_pdata). Both deinit_pdata
+ * and free_pdata are called during CPU_DEAD.
+ *
+     * If something goes wrong during bringup, we go through CPU_UP_CANCELED.
+ */
+ switch ( action )
+ {
+ case CPU_UP_PREPARE:
+ rc = cpu_schedule_up(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcu_read_lock(&domlist_read_lock);
+ rc = cpu_disable_scheduler_check(cpu);
+ rcu_read_unlock(&domlist_read_lock);
+ break;
+ case CPU_DEAD:
+ sched_rm_cpu(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ cpu_schedule_down(cpu);
+ break;
+ default:
+ break;
+ }
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
+}
+
+static struct notifier_block cpu_schedule_nfb = {
+ .notifier_call = cpu_schedule_callback
+};
+
+const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu)
+{
+ const cpumask_t *mask;
+
+ switch ( opt )
+ {
+ case SCHED_GRAN_cpu:
+ mask = cpumask_of(cpu);
+ break;
+ case SCHED_GRAN_core:
+ mask = per_cpu(cpu_sibling_mask, cpu);
+ break;
+ case SCHED_GRAN_socket:
+ mask = per_cpu(cpu_core_mask, cpu);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ return NULL;
+ }
+
+ return mask;
+}
+
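+/*
+ * While the scheduler is disabled, the scheduling softirqs are diverted to a
+ * dummy handler which only checks for pending tasklet work.
+ */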
+static void schedule_dummy(void)
+{
+ sched_tasklet_check_cpu(smp_processor_id());
+}
+
+void scheduler_disable(void)
+{
+ scheduler_active = false;
+ open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy);
+ open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy);
+}
+
+void scheduler_enable(void)
+{
+ open_softirq(SCHEDULE_SOFTIRQ, schedule);
+ open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
+ scheduler_active = true;
+}
+
+/* Initialise the data structures. */
+void __init scheduler_init(void)
+{
+ struct domain *idle_domain;
+ int i;
+
+ scheduler_enable();
+
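+    /*
+     * Sanity check the registered schedulers: drop any scheduler missing a
+     * mandatory hook or failing its global initialization, and pick the one
+     * matching opt_sched as the system wide default.
+     */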
+ for ( i = 0; i < NUM_SCHEDULERS; i++)
+ {
+#define sched_test_func(f) \
+ if ( !schedulers[i]->f ) \
+ { \
+ printk("scheduler %s misses .%s, dropped\n", \
+ schedulers[i]->opt_name, #f); \
+ schedulers[i] = NULL; \
+ }
+
+ sched_test_func(init);
+ sched_test_func(deinit);
+ sched_test_func(pick_resource);
+ sched_test_func(alloc_udata);
+ sched_test_func(free_udata);
+ sched_test_func(switch_sched);
+ sched_test_func(do_schedule);
+
+#undef sched_test_func
+
+ if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
+ {
+ printk("scheduler %s failed initialization, dropped\n",
+ schedulers[i]->opt_name);
+ schedulers[i] = NULL;
+ }
+
+ if ( schedulers[i] && !ops.name &&
+ !strcmp(schedulers[i]->opt_name, opt_sched) )
+ ops = *schedulers[i];
+ }
+
+ if ( !ops.name )
+ {
+ printk("Could not find scheduler: %s\n", opt_sched);
+ for ( i = 0; i < NUM_SCHEDULERS; i++ )
+ if ( schedulers[i] &&
+ !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
+ {
+ ops = *schedulers[i];
+ break;
+ }
+ BUG_ON(!ops.name);
+ printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
+ }
+
+ if ( cpu_schedule_up(0) )
+ BUG();
+ register_cpu_notifier(&cpu_schedule_nfb);
+
+ printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+ if ( sched_init(&ops) )
+ panic("scheduler returned error on init\n");
+
+ if ( sched_ratelimit_us &&
+ (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
+ || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
+ {
+ printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
+ " Resetting to default %u\n",
+ XEN_SYSCTL_SCHED_RATELIMIT_MIN,
+ XEN_SYSCTL_SCHED_RATELIMIT_MAX,
+ SCHED_DEFAULT_RATELIMIT_US);
+ sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
+ }
+
+ idle_domain = domain_create(DOMID_IDLE, NULL, false);
+ BUG_ON(IS_ERR(idle_domain));
+ BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu));
+ idle_domain->vcpu = idle_vcpu;
+ idle_domain->max_vcpus = nr_cpu_ids;
+ if ( vcpu_create(idle_domain, 0) == NULL )
+ BUG();
+
+ rcu_read_lock(&sched_res_rculock);
+
+ get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
+ get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+/*
+ * Move a pCPU from free cpus (running the idle scheduler) to a cpupool
+ * using any "real" scheduler.
+ * The cpu is still marked as "free" and not yet valid for its cpupool.
+ */
+int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
+{
+ struct vcpu *idle;
+ void *ppriv, *vpriv;
+ struct scheduler *new_ops = c->sched;
+ struct sched_resource *sr;
+ spinlock_t *old_lock, *new_lock;
+ unsigned long flags;
+ int ret = 0;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(cpu);
+
+ ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
+ ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid));
+ ASSERT(get_sched_res(cpu)->cpupool == NULL);
+
+ /*
+ * To setup the cpu for the new scheduler we need:
+ * - a valid instance of per-CPU scheduler specific data, as it is
+ * allocated by sched_alloc_pdata(). Note that we do not want to
+ * initialize it yet (i.e., we are not calling sched_init_pdata()).
+ * That will be done by the target scheduler, in sched_switch_sched(),
+ * in proper ordering and with locking.
+ * - a valid instance of per-vCPU scheduler specific data, for the idle
+ * vCPU of cpu. That is what the target scheduler will use for the
+ * sched_priv field of the per-vCPU info of the idle domain.
+ */
+ idle = idle_vcpu[cpu];
+ ppriv = sched_alloc_pdata(new_ops, cpu);
+ if ( IS_ERR(ppriv) )
+ {
+ ret = PTR_ERR(ppriv);
+ goto out;
+ }
+
+ vpriv = sched_alloc_udata(new_ops, idle->sched_unit,
+ idle->domain->sched_priv);
+ if ( vpriv == NULL )
+ {
+ sched_free_pdata(new_ops, ppriv, cpu);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * The actual switch, including the rerouting of the scheduler lock to
+ * whatever new_ops prefers, needs to happen in one critical section,
+ * protected by old_ops' lock, or races are possible.
+ * It is, in fact, the lock of the idle scheduler that we are taking.
+     * But that is ok, as anyone trying to schedule on this cpu will spin until
+     * we release that lock (bottom of this function). When they finally get the
+     * lock --thanks to the loop inside the *_schedule_lock() functions-- they
+     * will notice that the lock itself changed, and retry acquiring the new one
+     * (which will be the correct, remapped one, at that point).
+ */
+ old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+ if ( cpupool_get_granularity(c) > 1 )
+ {
+ const cpumask_t *mask;
+ unsigned int cpu_iter, idx = 0;
+ struct sched_unit *old_unit, *master_unit;
+ struct sched_resource *sr_old;
+
+ /*
+ * We need to merge multiple idle_vcpu units and sched_resource structs
+ * into one. As the free cpus all share the same lock we are fine doing
+ * that now. The worst which could happen would be someone waiting for
+ * the lock, thus dereferencing sched_res->schedule_lock. This is the
+ * reason we are freeing struct sched_res via call_rcu() to avoid the
+ * lock pointer suddenly disappearing.
+ */
+ mask = sched_get_opt_cpumask(c->gran, cpu);
+ master_unit = idle_vcpu[cpu]->sched_unit;
+
+ for_each_cpu ( cpu_iter, mask )
+ {
+ if ( idx )
+ cpumask_clear_cpu(cpu_iter, &sched_res_mask);
+
+ per_cpu(sched_res_idx, cpu_iter) = idx++;
+
+ if ( cpu == cpu_iter )
+ continue;
+
+ old_unit = idle_vcpu[cpu_iter]->sched_unit;
+ sr_old = get_sched_res(cpu_iter);
+ kill_timer(&sr_old->s_timer);
+ idle_vcpu[cpu_iter]->sched_unit = master_unit;
+ master_unit->runstate_cnt[RUNSTATE_running]++;
+ set_sched_res(cpu_iter, sr);
+ cpumask_set_cpu(cpu_iter, sr->cpus);
+
+ call_rcu(&sr_old->rcu, sched_res_free);
+ }
+ }
+
+ new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
+
+ sr->scheduler = new_ops;
+ sr->sched_priv = ppriv;
+
+ /*
+ * Reroute the lock to the per pCPU lock as /last/ thing. In fact,
+ * if it is free (and it can be) we want that anyone that manages
+ * taking it, finds all the initializations we've done above in place.
+ */
+ smp_wmb();
+ sr->schedule_lock = new_lock;
+
+ /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
+ spin_unlock_irqrestore(old_lock, flags);
+
+ sr->granularity = cpupool_get_granularity(c);
+ sr->cpupool = c;
+ /* The cpu is added to a pool, trigger it to go pick up some work */
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+
+out:
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+/*
+ * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
+ * (the idle scheduler).
+ * The cpu is already marked as "free" and not valid any longer for its
+ * cpupool.
+ */
+int schedule_cpu_rm(unsigned int cpu)
+{
+ void *ppriv_old, *vpriv_old;
+ struct sched_resource *sr, **sr_new = NULL;
+ struct sched_unit *unit;
+ struct scheduler *old_ops;
+ spinlock_t *old_lock;
+ unsigned long flags;
+ int idx, ret = -ENOMEM;
+ unsigned int cpu_iter;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(cpu);
+ old_ops = sr->scheduler;
+
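+    /*
+     * If the resource spans multiple cpus, pre-allocate one new resource
+     * (including an idle unit) per sibling cpu that will be split off below.
+     */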
+ if ( sr->granularity > 1 )
+ {
+ sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
+ if ( !sr_new )
+ goto out;
+ for ( idx = 0; idx < sr->granularity - 1; idx++ )
+ {
+ sr_new[idx] = sched_alloc_res();
+ if ( sr_new[idx] )
+ {
+ sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
+ if ( !sr_new[idx]->sched_unit_idle )
+ {
+ sched_res_free(&sr_new[idx]->rcu);
+ sr_new[idx] = NULL;
+ }
+ }
+ if ( !sr_new[idx] )
+ {
+ for ( idx--; idx >= 0; idx-- )
+ sched_res_free(&sr_new[idx]->rcu);
+ goto out;
+ }
+ sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
+ sr_new[idx]->scheduler = &sched_idle_ops;
+ sr_new[idx]->granularity = 1;
+
+ /* We want the lock not to change when replacing the resource. */
+ sr_new[idx]->schedule_lock = sr->schedule_lock;
+ }
+ }
+
+ ret = 0;
+ ASSERT(sr->cpupool != NULL);
+ ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
+ ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
+
+ /* See comment in schedule_cpu_add() regarding lock switching. */
+ old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+ vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
+ ppriv_old = sr->sched_priv;
+
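+    /*
+     * Undo the resource merging done at schedule_cpu_add() time: each
+     * sibling cpu gets its own sched_resource and idle unit again, while
+     * cpu itself keeps the original (now shrunk) resource.
+     */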
+ idx = 0;
+ for_each_cpu ( cpu_iter, sr->cpus )
+ {
+ per_cpu(sched_res_idx, cpu_iter) = 0;
+ if ( cpu_iter == cpu )
+ {
+ idle_vcpu[cpu_iter]->sched_unit->priv = NULL;
+ }
+ else
+ {
+ /* Initialize unit. */
+ unit = sr_new[idx]->sched_unit_idle;
+ unit->res = sr_new[idx];
+ unit->is_running = true;
+ sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
+ sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
+
+ /* Adjust cpu masks of resources (old and new). */
+ cpumask_clear_cpu(cpu_iter, sr->cpus);
+ cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
+
+ /* Init timer. */
+ init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
+
+ /* Last resource initializations and insert resource pointer. */
+ sr_new[idx]->master_cpu = cpu_iter;
+ set_sched_res(cpu_iter, sr_new[idx]);
+
+ /* Last action: set the new lock pointer. */
+ smp_mb();
+ sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
+
+ idx++;
+ }
+ }
+ sr->scheduler = &sched_idle_ops;
+ sr->sched_priv = NULL;
+
+ smp_mb();
+ sr->schedule_lock = &sched_free_cpu_lock;
+
+ /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
+ spin_unlock_irqrestore(old_lock, flags);
+
+ sched_deinit_pdata(old_ops, ppriv_old, cpu);
+
+ sched_free_udata(old_ops, vpriv_old);
+ sched_free_pdata(old_ops, ppriv_old, cpu);
+
+ sr->granularity = 1;
+ sr->cpupool = NULL;
+
+out:
+ rcu_read_unlock(&sched_res_rculock);
+ xfree(sr_new);
+
+ return ret;
+}
+
+struct scheduler *scheduler_get_default(void)
+{
+ return &ops;
+}
+
+struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
+{
+ int i;
+ struct scheduler *sched;
+
+ for ( i = 0; i < NUM_SCHEDULERS; i++ )
+ if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
+ goto found;
+ *perr = -ENOENT;
+ return NULL;
+
+ found:
+ *perr = -ENOMEM;
+ if ( (sched = xmalloc(struct scheduler)) == NULL )
+ return NULL;
+ memcpy(sched, schedulers[i], sizeof(*sched));
+ if ( (*perr = sched_init(sched)) != 0 )
+ {
+ xfree(sched);
+ sched = NULL;
+ }
+
+ return sched;
+}
+
+void scheduler_free(struct scheduler *sched)
+{
+ BUG_ON(sched == &ops);
+ sched_deinit(sched);
+ xfree(sched);
+}
+
+void schedule_dump(struct cpupool *c)
+{
+ unsigned int i;
+ struct scheduler *sched;
+ cpumask_t *cpus;
+
+    /* Locking, if necessary, must be handled within each scheduler. */
+
+ rcu_read_lock(&sched_res_rculock);
+
+ if ( c != NULL )
+ {
+ sched = c->sched;
+ cpus = c->cpu_valid;
+ printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+ sched_dump_settings(sched);
+ }
+ else
+ {
+ sched = &ops;
+ cpus = &cpupool_free_cpus;
+ }
+
+ if ( sched->dump_cpu_state != NULL )
+ {
+ printk("CPUs info:\n");
+ for_each_cpu (i, cpus)
+ sched_dump_cpu_state(sched, i);
+ }
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+void sched_tick_suspend(void)
+{
+ rcu_idle_enter(smp_processor_id());
+ rcu_idle_timer_start();
+}
+
+void sched_tick_resume(void)
+{
+ rcu_idle_timer_stop();
+ rcu_idle_exit(smp_processor_id());
+}
+
+void wait(void)
+{
+ schedule();
+}
+
+#ifdef CONFIG_X86
+void __init sched_setup_dom0_vcpus(struct domain *d)
+{
+ unsigned int i;
+ struct sched_unit *unit;
+
+ for ( i = 1; i < d->max_vcpus; i++ )
+ vcpu_create(d, i);
+
+ /*
+ * PV-shim: vcpus are pinned 1:1.
+ * Initially only 1 cpu is online, others will be dealt with when
+ * onlining them. This avoids pinning a vcpu to a not yet online cpu here.
+ */
+ if ( pv_shim )
+ sched_set_affinity(d->vcpu[0]->sched_unit,
+ cpumask_of(0), cpumask_of(0));
+ else
+ {
+ for_each_sched_unit ( d, unit )
+ {
+ if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed )
+ sched_set_affinity(unit, &dom0_cpus, NULL);
+ sched_set_affinity(unit, NULL, &dom0_cpus);
+ }
+ }
+
+ domain_update_node_affinity(d);
+}
+#endif
+
+#ifdef CONFIG_COMPAT
+#include "compat.c"
+#endif
+
+#endif /* !COMPAT */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * cpupool.c
+ *
+ * Generic cpupool-handling functions.
+ *
+ * Cpupools are a feature to have configurable scheduling domains. Each
+ * cpupool runs its own scheduler on a dedicated set of physical cpus.
+ * A domain is bound to one cpupool at any time, but it can be moved to
+ * another cpupool.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/warning.h>
+#include <xen/keyhandler.h>
+#include <xen/cpu.h>
+
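+/*
+ * Iterate over the cpupool list via a pointer to the "next" pointer, so the
+ * current element can be unlinked (or a new one inserted) through *ptr while
+ * walking the list.
+ */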
+#define for_each_cpupool(ptr) \
+ for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0; /* Initial cpupool with Dom0 */
+cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */
+
+static struct cpupool *cpupool_list; /* linked list, sorted by poolid */
+
+static int cpupool_moving_cpu = -1;
+static struct cpupool *cpupool_cpu_moving = NULL;
+static cpumask_t cpupool_locked_cpus;
+
+static DEFINE_SPINLOCK(cpupool_lock);
+
+static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu;
+static unsigned int __read_mostly sched_granularity = 1;
+
+#ifdef CONFIG_HAS_SCHED_GRANULARITY
+static int __init sched_select_granularity(const char *str)
+{
+ if ( strcmp("cpu", str) == 0 )
+ opt_sched_granularity = SCHED_GRAN_cpu;
+ else if ( strcmp("core", str) == 0 )
+ opt_sched_granularity = SCHED_GRAN_core;
+ else if ( strcmp("socket", str) == 0 )
+ opt_sched_granularity = SCHED_GRAN_socket;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+custom_param("sched-gran", sched_select_granularity);
+#endif
+
+static unsigned int __init cpupool_check_granularity(void)
+{
+ unsigned int cpu;
+ unsigned int siblings, gran = 0;
+
+ if ( opt_sched_granularity == SCHED_GRAN_cpu )
+ return 1;
+
+ for_each_online_cpu ( cpu )
+ {
+ siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity,
+ cpu));
+ if ( gran == 0 )
+ gran = siblings;
+ else if ( gran != siblings )
+ return 0;
+ }
+
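+    /*
+     * With core or socket granularity, siblings are scheduled together, so
+     * the sibling topology must not change at runtime: disable SMT
+     * switching.
+     */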
+ sched_disable_smt_switching = true;
+
+ return gran;
+}
+
+/* Setup data for selected scheduler granularity. */
+static void __init cpupool_gran_init(void)
+{
+ unsigned int gran = 0;
+ const char *fallback = NULL;
+
+ while ( gran == 0 )
+ {
+ gran = cpupool_check_granularity();
+
+ if ( gran == 0 )
+ {
+ switch ( opt_sched_granularity )
+ {
+ case SCHED_GRAN_core:
+ opt_sched_granularity = SCHED_GRAN_cpu;
+ fallback = "Asymmetric cpu configuration.\n"
+ "Falling back to sched-gran=cpu.\n";
+ break;
+ case SCHED_GRAN_socket:
+ opt_sched_granularity = SCHED_GRAN_core;
+ fallback = "Asymmetric cpu configuration.\n"
+ "Falling back to sched-gran=core.\n";
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+ }
+ }
+
+ if ( fallback )
+ warning_add(fallback);
+
+ sched_granularity = gran;
+}
+
+unsigned int cpupool_get_granularity(const struct cpupool *c)
+{
+ return c ? sched_granularity : 1;
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+ if ( c )
+ {
+ free_cpumask_var(c->res_valid);
+ free_cpumask_var(c->cpu_valid);
+ }
+ xfree(c);
+}
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+ struct cpupool *c = xzalloc(struct cpupool);
+
+ if ( !c )
+ return NULL;
+
+ if ( !zalloc_cpumask_var(&c->cpu_valid) ||
+ !zalloc_cpumask_var(&c->res_valid) )
+ {
+ free_cpupool_struct(c);
+ c = NULL;
+ }
+
+ return c;
+}
+
+/*
+ * Find a cpupool by its id. To be called with the cpupool lock held.
+ * If exact is not specified, the first cpupool with an id larger than or
+ * equal to the searched id is returned.
+ * Returns NULL if not found.
+ */
+static struct cpupool *__cpupool_find_by_id(int id, int exact)
+{
+ struct cpupool **q;
+
+ ASSERT(spin_is_locked(&cpupool_lock));
+
+ for_each_cpupool(q)
+ if ( (*q)->cpupool_id >= id )
+ break;
+
+ return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL;
+}
+
+static struct cpupool *cpupool_find_by_id(int poolid)
+{
+ return __cpupool_find_by_id(poolid, 1);
+}
+
+static struct cpupool *__cpupool_get_by_id(int poolid, int exact)
+{
+ struct cpupool *c;
+ spin_lock(&cpupool_lock);
+ c = __cpupool_find_by_id(poolid, exact);
+ if ( c != NULL )
+ atomic_inc(&c->refcnt);
+ spin_unlock(&cpupool_lock);
+ return c;
+}
+
+struct cpupool *cpupool_get_by_id(int poolid)
+{
+ return __cpupool_get_by_id(poolid, 1);
+}
+
+static struct cpupool *cpupool_get_next_by_id(int poolid)
+{
+ return __cpupool_get_by_id(poolid, 0);
+}
+
+void cpupool_put(struct cpupool *pool)
+{
+ if ( !atomic_dec_and_test(&pool->refcnt) )
+ return;
+ scheduler_free(pool->sched);
+ free_cpupool_struct(pool);
+}
+
+/*
+ * Create a new cpupool with the specified poolid and scheduler.
+ * Returns a pointer to the new cpupool structure on success, NULL otherwise.
+ * Possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+static struct cpupool *cpupool_create(
+ int poolid, unsigned int sched_id, int *perr)
+{
+ struct cpupool *c;
+ struct cpupool **q;
+ int last = 0;
+
+ *perr = -ENOMEM;
+ if ( (c = alloc_cpupool_struct()) == NULL )
+ return NULL;
+
+ /* One reference for caller, one reference for cpupool_destroy(). */
+ atomic_set(&c->refcnt, 2);
+
+ debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id);
+
+ spin_lock(&cpupool_lock);
+
+ for_each_cpupool(q)
+ {
+ last = (*q)->cpupool_id;
+ if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+ break;
+ }
+ if ( *q != NULL )
+ {
+ if ( (*q)->cpupool_id == poolid )
+ {
+ *perr = -EEXIST;
+ goto err;
+ }
+ c->next = *q;
+ }
+
+ c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+ if ( poolid == 0 )
+ {
+ c->sched = scheduler_get_default();
+ }
+ else
+ {
+ c->sched = scheduler_alloc(sched_id, perr);
+ if ( c->sched == NULL )
+ goto err;
+ }
+ c->gran = opt_sched_granularity;
+
+ *q = c;
+
+ spin_unlock(&cpupool_lock);
+
+ debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n",
+ c->cpupool_id, c->sched->name, c->sched->opt_name);
+
+ *perr = 0;
+ return c;
+
+ err:
+ spin_unlock(&cpupool_lock);
+ free_cpupool_struct(c);
+ return NULL;
+}
+/*
+ * Destroy the given cpupool.
+ * Returns 0 on success, a negative errno value otherwise.
+ * Possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+static int cpupool_destroy(struct cpupool *c)
+{
+ struct cpupool **q;
+
+ spin_lock(&cpupool_lock);
+ for_each_cpupool(q)
+ if ( *q == c )
+ break;
+ if ( *q != c )
+ {
+ spin_unlock(&cpupool_lock);
+ return -ENOENT;
+ }
+ if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) )
+ {
+ spin_unlock(&cpupool_lock);
+ return -EBUSY;
+ }
+ *q = c->next;
+ spin_unlock(&cpupool_lock);
+
+ cpupool_put(c);
+
+ debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id);
+ return 0;
+}
+
+/*
+ * Move domain to another cpupool
+ */
+static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c)
+{
+ int ret;
+
+ if ( unlikely(d->cpupool == c) )
+ return 0;
+
+ d->cpupool->n_dom--;
+ ret = sched_move_domain(d, c);
+ if ( ret )
+ d->cpupool->n_dom++;
+ else
+ c->n_dom++;
+
+ return ret;
+}
+int cpupool_move_domain(struct domain *d, struct cpupool *c)
+{
+ int ret;
+
+ spin_lock(&cpupool_lock);
+
+ ret = cpupool_move_domain_locked(d, c);
+
+ spin_unlock(&cpupool_lock);
+
+ return ret;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ * cpupool_lock must be held
+ */
+static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+ int ret;
+ struct domain *d;
+ const cpumask_t *cpus;
+
+ cpus = sched_get_opt_cpumask(c->gran, cpu);
+
+ if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
+ return -EADDRNOTAVAIL;
+ ret = schedule_cpu_add(cpumask_first(cpus), c);
+ if ( ret )
+ return ret;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
+    if ( cpupool_moving_cpu == cpu )
+ {
+ cpupool_moving_cpu = -1;
+ cpupool_put(cpupool_cpu_moving);
+ cpupool_cpu_moving = NULL;
+ }
+ cpumask_or(c->cpu_valid, c->cpu_valid, cpus);
+ cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ rcu_read_lock(&domlist_read_lock);
+ for_each_domain_in_cpupool(d, c)
+ {
+ domain_update_node_affinity(d);
+ }
+ rcu_read_unlock(&domlist_read_lock);
+
+ return 0;
+}
+
+static int cpupool_unassign_cpu_finish(struct cpupool *c)
+{
+ int cpu = cpupool_moving_cpu;
+ const cpumask_t *cpus;
+ struct domain *d;
+ int ret;
+
+ if ( c != cpupool_cpu_moving )
+ return -EADDRNOTAVAIL;
+
+ /*
+ * We need this for scanning the domain list, both in
+ * cpu_disable_scheduler(), and at the bottom of this function.
+ */
+ rcu_read_lock(&domlist_read_lock);
+ ret = cpu_disable_scheduler(cpu);
+
+ rcu_read_lock(&sched_res_rculock);
+ cpus = get_sched_res(cpu)->cpus;
+ cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
+
+ /*
+ * cpu_disable_scheduler() returning an error doesn't require resetting
+ * cpupool_free_cpus' cpu bit. All error cases should be of temporary
+ * nature and tools will retry the operation. Even if the number of
+ * retries may be limited, the in-between state can easily be repaired
+ * by adding the cpu to the cpupool again.
+ */
+ if ( !ret )
+ {
+ ret = schedule_cpu_rm(cpu);
+ if ( ret )
+ cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
+ else
+ {
+ cpupool_moving_cpu = -1;
+ cpupool_put(cpupool_cpu_moving);
+ cpupool_cpu_moving = NULL;
+ }
+ }
+ rcu_read_unlock(&sched_res_rculock);
+
+ for_each_domain_in_cpupool(d, c)
+ {
+ domain_update_node_affinity(d);
+ }
+ rcu_read_unlock(&domlist_read_lock);
+
+ return ret;
+}
+
+static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu)
+{
+ int ret;
+ struct domain *d;
+ const cpumask_t *cpus;
+
+ spin_lock(&cpupool_lock);
+ ret = -EADDRNOTAVAIL;
+ if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid))
+ && (cpu != cpupool_moving_cpu) )
+ goto out;
+
+ ret = 0;
+ rcu_read_lock(&sched_res_rculock);
+ cpus = get_sched_res(cpu)->cpus;
+
+ if ( (c->n_dom > 0) &&
+ (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) &&
+ (cpu != cpupool_moving_cpu) )
+ {
+ rcu_read_lock(&domlist_read_lock);
+ for_each_domain_in_cpupool(d, c)
+ {
+ if ( !d->is_dying && system_state == SYS_STATE_active )
+ {
+ ret = -EBUSY;
+ break;
+ }
+ ret = cpupool_move_domain_locked(d, cpupool0);
+ if ( ret )
+ break;
+ }
+ rcu_read_unlock(&domlist_read_lock);
+ if ( ret )
+ goto out;
+ }
+ cpupool_moving_cpu = cpu;
+ atomic_inc(&c->refcnt);
+ cpupool_cpu_moving = c;
+ cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus);
+ cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
+
+    rcu_read_unlock(&sched_res_rculock);
+out:
+ spin_unlock(&cpupool_lock);
+
+ return ret;
+}
+
+static long cpupool_unassign_cpu_helper(void *info)
+{
+ struct cpupool *c = info;
+ long ret;
+
+ debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
+ cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
+ spin_lock(&cpupool_lock);
+
+ ret = cpupool_unassign_cpu_finish(c);
+
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);
+
+ return ret;
+}
+
+/*
+ * Unassign a specific cpu from a cpupool.
+ * We must be sure not to run on the cpu to be unassigned! To achieve this,
+ * the main functionality is performed via continue_hypercall_on_cpu() on a
+ * specific cpu.
+ * If the cpu to be removed is the last one of the cpupool, no active domain
+ * must be bound to the cpupool. Dying domains are moved to cpupool0 as they
+ * might be zombies.
+ * Possible failures:
+ * - last cpu and still active domains in cpupool
+ * - cpu just being unplugged
+ */
+static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+ int work_cpu;
+ int ret;
+ unsigned int master_cpu;
+
+ debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
+ c->cpupool_id, cpu);
+
+ master_cpu = sched_get_resource_cpu(cpu);
+ ret = cpupool_unassign_cpu_start(c, master_cpu);
+ if ( ret )
+ {
+ debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
+ c->cpupool_id, cpu, ret);
+ return ret;
+ }
+
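+    /*
+     * Pick a cpu to perform the actual removal on: it must not be the cpu
+     * being unassigned, so fall back to another cpu of Pool-0 if needed.
+     */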
+ work_cpu = sched_get_resource_cpu(smp_processor_id());
+ if ( work_cpu == master_cpu )
+ {
+ work_cpu = cpumask_first(cpupool0->cpu_valid);
+ if ( work_cpu == master_cpu )
+ work_cpu = cpumask_last(cpupool0->cpu_valid);
+ }
+ return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+ struct cpupool *c;
+ int rc;
+ int n_dom = 0;
+
+ if ( poolid == CPUPOOLID_NONE )
+ return 0;
+ spin_lock(&cpupool_lock);
+ c = cpupool_find_by_id(poolid);
+ if ( c == NULL )
+ rc = -ESRCH;
+ else if ( !cpumask_weight(c->cpu_valid) )
+ rc = -ENODEV;
+ else
+ {
+ c->n_dom++;
+ n_dom = c->n_dom;
+ d->cpupool = c;
+ rc = 0;
+ }
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n",
+ d->domain_id, poolid, n_dom, rc);
+ return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+ int cpupool_id;
+ int n_dom;
+
+ if ( d->cpupool == NULL )
+ return;
+ spin_lock(&cpupool_lock);
+ cpupool_id = d->cpupool->cpupool_id;
+ d->cpupool->n_dom--;
+ n_dom = d->cpupool->n_dom;
+ d->cpupool = NULL;
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
+ d->domain_id, cpupool_id, n_dom);
+ return;
+}
+
+/*
+ * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
+ * as they must have been in there when unplugged.
+ */
+static int cpupool_cpu_add(unsigned int cpu)
+{
+ int ret = 0;
+ const cpumask_t *cpus;
+
+ spin_lock(&cpupool_lock);
+ cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
+ cpumask_set_cpu(cpu, &cpupool_free_cpus);
+
+ /*
+     * If we are not resuming, we are hot-plugging the cpu, in which case
+     * we add it to Pool-0, as it certainly was there when it was hot-unplugged
+     * (or unplugging would have failed), and that is the default behavior
+     * anyway.
+ */
+ rcu_read_lock(&sched_res_rculock);
+ get_sched_res(cpu)->cpupool = NULL;
+
+ cpus = sched_get_opt_cpumask(cpupool0->gran, cpu);
+ if ( cpumask_subset(cpus, &cpupool_free_cpus) )
+ ret = cpupool_assign_cpu_locked(cpupool0, cpu);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ spin_unlock(&cpupool_lock);
+
+ return ret;
+}
+
+/*
+ * This function is called in stop_machine context, so we can be sure no
+ * non-idle vcpu is active on the system.
+ */
+static void cpupool_cpu_remove(unsigned int cpu)
+{
+ int ret;
+
+ ASSERT(is_idle_vcpu(current));
+
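+    /*
+     * A cpu not yet marked as free is still in Pool-0; its removal has been
+     * started by cpupool_cpu_remove_prologue() already, so just finish it
+     * here.
+     */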
+ if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
+ {
+ ret = cpupool_unassign_cpu_finish(cpupool0);
+ BUG_ON(ret);
+ }
+ cpumask_clear_cpu(cpu, &cpupool_free_cpus);
+}
+
+/*
+ * Called before a CPU is being removed from the system.
+ * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved
+ * to free cpus actually before removing them).
+ * The CPU is locked, to forbid adding it again to another cpupool.
+ */
+static int cpupool_cpu_remove_prologue(unsigned int cpu)
+{
+ int ret = 0;
+ cpumask_t *cpus;
+ unsigned int master_cpu;
+
+ spin_lock(&cpupool_lock);
+
+ rcu_read_lock(&sched_res_rculock);
+ cpus = get_sched_res(cpu)->cpus;
+ master_cpu = sched_get_resource_cpu(cpu);
+ if ( cpumask_intersects(cpus, &cpupool_locked_cpus) )
+ ret = -EBUSY;
+ else
+ cpumask_set_cpu(cpu, &cpupool_locked_cpus);
+ rcu_read_unlock(&sched_res_rculock);
+
+ spin_unlock(&cpupool_lock);
+
+ if ( ret )
+ return ret;
+
+ if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) )
+ {
+ /* Cpupool0 is populated only after all cpus are up. */
+ ASSERT(system_state == SYS_STATE_active);
+
+ ret = cpupool_unassign_cpu_start(cpupool0, master_cpu);
+ }
+ else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) )
+ ret = -ENODEV;
+
+ return ret;
+}
+
+/*
+ * Called during resume for all cpus which didn't come up again. The cpu must
+ * be removed from the cpupool it is assigned to. In case a cpupool would be
+ * left without any cpu, we move all domains of that cpupool to cpupool0.
+ * As we are called with all domains still frozen there is no need to take the
+ * cpupool lock here.
+ */
+static void cpupool_cpu_remove_forced(unsigned int cpu)
+{
+ struct cpupool **c;
+ int ret;
+ unsigned int master_cpu = sched_get_resource_cpu(cpu);
+
+ for_each_cpupool ( c )
+ {
+ if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) )
+ {
+ ret = cpupool_unassign_cpu_start(*c, master_cpu);
+ BUG_ON(ret);
+ ret = cpupool_unassign_cpu_finish(*c);
+ BUG_ON(ret);
+ }
+ }
+
+ cpumask_clear_cpu(cpu, &cpupool_free_cpus);
+
+ rcu_read_lock(&sched_res_rculock);
+ sched_rm_cpu(cpu);
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+/*
+ * do cpupool related sysctl operations
+ */
+int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
+{
+ int ret;
+ struct cpupool *c;
+
+ switch ( op->op )
+ {
+
+ case XEN_SYSCTL_CPUPOOL_OP_CREATE:
+ {
+ int poolid;
+
+ poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
+ CPUPOOLID_NONE: op->cpupool_id;
+ c = cpupool_create(poolid, op->sched_id, &ret);
+ if ( c != NULL )
+ {
+ op->cpupool_id = c->cpupool_id;
+ cpupool_put(c);
+ }
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
+ {
+ c = cpupool_get_by_id(op->cpupool_id);
+ ret = -ENOENT;
+ if ( c == NULL )
+ break;
+ ret = cpupool_destroy(c);
+ cpupool_put(c);
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_INFO:
+ {
+ c = cpupool_get_next_by_id(op->cpupool_id);
+ ret = -ENOENT;
+ if ( c == NULL )
+ break;
+ op->cpupool_id = c->cpupool_id;
+ op->sched_id = c->sched->sched_id;
+ op->n_dom = c->n_dom;
+ ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
+ cpupool_put(c);
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
+ {
+ unsigned cpu;
+ const cpumask_t *cpus;
+
+ cpu = op->cpu;
+ debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n",
+ op->cpupool_id, cpu);
+
+ spin_lock(&cpupool_lock);
+
+ c = cpupool_find_by_id(op->cpupool_id);
+ ret = -ENOENT;
+ if ( c == NULL )
+ goto addcpu_out;
+ if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
+ {
+ for_each_cpu ( cpu, &cpupool_free_cpus )
+ {
+ cpus = sched_get_opt_cpumask(c->gran, cpu);
+ if ( cpumask_subset(cpus, &cpupool_free_cpus) )
+ break;
+ }
+ ret = -ENODEV;
+ if ( cpu >= nr_cpu_ids )
+ goto addcpu_out;
+ }
+ ret = -EINVAL;
+ if ( cpu >= nr_cpu_ids )
+ goto addcpu_out;
+ ret = -ENODEV;
+ cpus = sched_get_opt_cpumask(c->gran, cpu);
+ if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
+ cpumask_intersects(cpus, &cpupool_locked_cpus) )
+ goto addcpu_out;
+ ret = cpupool_assign_cpu_locked(c, cpu);
+
+ addcpu_out:
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
+ op->cpupool_id, cpu, ret);
+
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
+ {
+ unsigned cpu;
+
+ c = cpupool_get_by_id(op->cpupool_id);
+ ret = -ENOENT;
+ if ( c == NULL )
+ break;
+ cpu = op->cpu;
+ if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
+ cpu = cpumask_last(c->cpu_valid);
+ ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL;
+ cpupool_put(c);
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
+ {
+ struct domain *d;
+
+ ret = rcu_lock_remote_domain_by_id(op->domid, &d);
+ if ( ret )
+ break;
+ if ( d->cpupool == NULL )
+ {
+ ret = -EINVAL;
+ rcu_unlock_domain(d);
+ break;
+ }
+ if ( op->cpupool_id == d->cpupool->cpupool_id )
+ {
+ ret = 0;
+ rcu_unlock_domain(d);
+ break;
+ }
+ debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n",
+ d->domain_id, op->cpupool_id);
+ ret = -ENOENT;
+ spin_lock(&cpupool_lock);
+
+ c = cpupool_find_by_id(op->cpupool_id);
+ if ( (c != NULL) && cpumask_weight(c->cpu_valid) )
+ ret = cpupool_move_domain_locked(d, c);
+
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n",
+ d->domain_id, op->cpupool_id, ret);
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
+ {
+ ret = cpumask_to_xenctl_bitmap(
+ &op->cpumap, &cpupool_free_cpus);
+ }
+ break;
+
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ return ret;
+}
+
+void dump_runq(unsigned char key)
+{
+ unsigned long flags;
+ s_time_t now = NOW();
+ struct cpupool **c;
+
+ spin_lock(&cpupool_lock);
+ local_irq_save(flags);
+
+ printk("sched_smt_power_savings: %s\n",
+ sched_smt_power_savings? "enabled":"disabled");
+ printk("NOW=%"PRI_stime"\n", now);
+
+ printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map));
+ if ( !cpumask_empty(&cpupool_free_cpus) )
+ {
+ printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus));
+ schedule_dump(NULL);
+ }
+
+ for_each_cpupool(c)
+ {
+ printk("Cpupool %d:\n", (*c)->cpupool_id);
+ printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid));
+ schedule_dump(*c);
+ }
+
+ local_irq_restore(flags);
+ spin_unlock(&cpupool_lock);
+}
+
+static int cpu_callback(
+ struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ int rc = 0;
+
+ switch ( action )
+ {
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ if ( system_state <= SYS_STATE_active )
+ rc = cpupool_cpu_add(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ /* Suspend/Resume don't change assignments of cpus to cpupools. */
+ if ( system_state <= SYS_STATE_active )
+ rc = cpupool_cpu_remove_prologue(cpu);
+ break;
+ case CPU_DYING:
+ /* Suspend/Resume don't change assignments of cpus to cpupools. */
+ if ( system_state <= SYS_STATE_active )
+ cpupool_cpu_remove(cpu);
+ break;
+ case CPU_RESUME_FAILED:
+ cpupool_cpu_remove_forced(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
+}
+
+static struct notifier_block cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+static int __init cpupool_init(void)
+{
+ unsigned int cpu;
+ int err;
+
+ cpupool_gran_init();
+
+ cpupool0 = cpupool_create(0, 0, &err);
+ BUG_ON(cpupool0 == NULL);
+ cpupool_put(cpupool0);
+ register_cpu_notifier(&cpu_nfb);
+
+ spin_lock(&cpupool_lock);
+
+ cpumask_copy(&cpupool_free_cpus, &cpu_online_map);
+
+ for_each_cpu ( cpu, &cpupool_free_cpus )
+ cpupool_assign_cpu_locked(cpupool0, cpu);
+
+ spin_unlock(&cpupool_lock);
+
+ return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/****************************************************************************
+ * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
+ ****************************************************************************
+ *
+ * File: common/sched/credit.c
+ * Author: Emmanuel Ackaouy
+ *
+ * Description: Credit-based SMP CPU scheduler
+ */
+
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+#include <xen/errno.h>
+#include <xen/keyhandler.h>
+#include <xen/trace.h>
+#include <xen/err.h>
+
+
+/*
+ * Locking:
+ * - Scheduler-lock (a.k.a. runqueue lock):
+ * + is per-runqueue, and there is one runqueue per-cpu;
+ * + serializes all runqueue manipulation operations;
+ * - Private data lock (a.k.a. private scheduler lock):
+ * + serializes accesses to the scheduler global state (weight,
+ * credit, balance_credit, etc);
+ * + serializes updates to the domains' scheduling parameters.
+ *
+ * Ordering is "private lock always comes first":
+ * + if we need both locks, we must acquire the private
+ *    scheduler lock first;
+ * + if we already own a runqueue lock, we must never acquire
+ * the private scheduler lock.
+ */
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT 256
+#define CSCHED_TICKS_PER_TSLICE 3
+/* Default timeslice: 30ms */
+#define CSCHED_DEFAULT_TSLICE_MS 30
+#define CSCHED_CREDITS_PER_MSEC 10
+/* Never set a timer shorter than this value. */
+#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN
+
+
+/*
+ * Priorities
+ */
+#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */
+#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */
+#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */
+#define CSCHED_PRI_IDLE -64 /* idle */
+
+
+/*
+ * Flags
+ *
+ * Note that svc->flags (where these flags live) is protected by an
+ * inconsistent set of locks. Therefore atomic-safe bit operations must
+ * be used for accessing it.
+ */
+#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */
+#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */
+#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */
+#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PRIV(_ops) \
+ ((struct csched_private *)((_ops)->sched_data))
+#define CSCHED_PCPU(_c) \
+ ((struct csched_pcpu *)get_sched_res(_c)->sched_priv)
+#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv)
+#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
+#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
+
+
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic per-unit counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console).
+ */
+#ifdef SCHED_STATS
+
+#define CSCHED_STATS
+
+#define SCHED_UNIT_STATS_RESET(_V) \
+ do \
+ { \
+ memset(&(_V)->stats, 0, sizeof((_V)->stats)); \
+ } while ( 0 )
+
+#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++)
+
+#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y))
+
+#else /* !SCHED_STATS */
+
+#undef CSCHED_STATS
+
+#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 )
+#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 )
+#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 )
+
+#endif /* SCHED_STATS */
+
+
+/*
+ * Credit tracing events ("only" 512 available!). Check
+ * include/public/trace.h for more details.
+ */
+#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1)
+#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2)
+#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3)
+#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4)
+#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5)
+#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6)
+#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7)
+#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8)
+#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9)
+#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10)
+#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11)
+
+/*
+ * Boot parameters
+ */
+static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
+integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms);
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+ struct list_head runq;
+ uint32_t runq_sort_last;
+
+ unsigned int idle_bias;
+ unsigned int nr_runnable;
+
+ unsigned int tick;
+ struct timer ticker;
+};
+
+/*
+ * Virtual UNIT
+ */
+struct csched_unit {
+ struct list_head runq_elem;
+ struct list_head active_unit_elem;
+
+ /* Up-pointers */
+ struct csched_dom *sdom;
+ struct sched_unit *unit;
+
+ s_time_t start_time; /* When we were scheduled (used for credit) */
+ unsigned flags;
+ int pri;
+
+ atomic_t credit;
+ unsigned int residual;
+
+ s_time_t last_sched_time;
+
+#ifdef CSCHED_STATS
+ struct {
+ int credit_last;
+ uint32_t credit_incr;
+ uint32_t state_active;
+ uint32_t state_idle;
+ uint32_t migrate_q;
+ uint32_t migrate_r;
+ uint32_t kicked_away;
+ } stats;
+#endif
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+ struct list_head active_unit;
+ struct list_head active_sdom_elem;
+ struct domain *dom;
+ uint16_t active_unit_count;
+ uint16_t weight;
+ uint16_t cap;
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+ /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
+ spinlock_t lock;
+
+ cpumask_var_t idlers;
+ cpumask_var_t cpus;
+ uint32_t *balance_bias;
+ uint32_t runq_sort;
+ uint32_t ncpus;
+
+    /* Period of the per-cpu tick (in microseconds) and ticks per timeslice */
+ unsigned int tick_period_us, ticks_per_tslice;
+ s_time_t ratelimit, tslice, unit_migr_delay;
+
+ struct list_head active_sdom;
+ uint32_t weight;
+ uint32_t credit;
+ int credit_balance;
+ unsigned int credits_per_tslice;
+
+ unsigned int master;
+ struct timer master_ticker;
+};
+
+static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
+
+static inline int
+__unit_on_runq(struct csched_unit *svc)
+{
+ return !list_empty(&svc->runq_elem);
+}
+
+static inline struct csched_unit *
+__runq_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct csched_unit, runq_elem);
+}
+
+/* Is the first element of cpu's runq (if any) cpu's idle unit? */
+static inline bool_t is_runq_idle(unsigned int cpu)
+{
+ /*
+     * We're peeking at cpu's runq, so we must hold the proper lock.
+ */
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ return list_empty(RUNQ(cpu)) ||
+ is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit);
+}
+
+static inline void
+inc_nr_runnable(unsigned int cpu)
+{
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+ CSCHED_PCPU(cpu)->nr_runnable++;
+
+}
+
+static inline void
+dec_nr_runnable(unsigned int cpu)
+{
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+ ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
+ CSCHED_PCPU(cpu)->nr_runnable--;
+}
+
+static inline void
+__runq_insert(struct csched_unit *svc)
+{
+ unsigned int cpu = sched_unit_master(svc->unit);
+ const struct list_head * const runq = RUNQ(cpu);
+ struct list_head *iter;
+
+ BUG_ON( __unit_on_runq(svc) );
+
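+    /*
+     * The runq is kept sorted by priority: stop at the first entry with a
+     * strictly lower priority, so that list_add_tail() below inserts svc
+     * right in front of it, preserving FIFO order among equal priorities.
+     */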
+ list_for_each( iter, runq )
+ {
+ const struct csched_unit * const iter_svc = __runq_elem(iter);
+ if ( svc->pri > iter_svc->pri )
+ break;
+ }
+
+ /* If the unit yielded, try to put it behind one lower-priority
+ * runnable unit if we can. The next runq_sort will bring it forward
+     * within 30ms if the queue is too long. */
+ if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags)
+ && __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
+ {
+        iter = iter->next;
+
+ /* Some sanity checks */
+ BUG_ON(iter == runq);
+ }
+
+ list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+runq_insert(struct csched_unit *svc)
+{
+ __runq_insert(svc);
+ inc_nr_runnable(sched_unit_master(svc->unit));
+}
+
+static inline void
+__runq_remove(struct csched_unit *svc)
+{
+ BUG_ON( !__unit_on_runq(svc) );
+ list_del_init(&svc->runq_elem);
+}
+
+static inline void
+runq_remove(struct csched_unit *svc)
+{
+ dec_nr_runnable(sched_unit_master(svc->unit));
+ __runq_remove(svc);
+}
+
+static void burn_credits(struct csched_unit *svc, s_time_t now)
+{
+ s_time_t delta;
+ uint64_t val;
+ unsigned int credits;
+
+ /* Assert svc is current */
+ ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) );
+
+ if ( (delta = now - svc->start_time) <= 0 )
+ return;
+
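+    /*
+     * Convert the time spent running (delta, in ns) into credits to burn:
+     * CSCHED_CREDITS_PER_MSEC per millisecond, keeping the sub-millisecond
+     * remainder in svc->residual for the next invocation.
+     */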
+ val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual;
+ svc->residual = do_div(val, MILLISECS(1));
+ credits = val;
+ ASSERT(credits == val); /* make sure we haven't truncated val */
+ atomic_sub(credits, &svc->credit);
+ svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
+}
+
+static bool_t __read_mostly opt_tickle_one_idle = 1;
+boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle);
+
+DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
+
+static inline void __runq_tickle(struct csched_unit *new)
+{
+ unsigned int cpu = sched_unit_master(new->unit);
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct sched_unit *unit = new->unit;
+ struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu));
+ struct csched_private *prv = CSCHED_PRIV(sr->scheduler);
+ cpumask_t mask, idle_mask, *online;
+ int balance_step, idlers_empty;
+
+ ASSERT(cur);
+ cpumask_clear(&mask);
+
+ online = cpupool_domain_master_cpumask(new->sdom->dom);
+ cpumask_and(&idle_mask, prv->idlers, online);
+ idlers_empty = cpumask_empty(&idle_mask);
+
+ /*
+ * Exclusive pinning is when a unit has hard-affinity with only one
+ * cpu, and there is no other unit that has hard-affinity with that
+     * same cpu. This is infrequent, but if it happens, it is for achieving
+     * the most possible determinism, and the least possible overhead, for
+ * the units in question.
+ *
+ * Try to identify the vast majority of these situations, and deal
+ * with them quickly.
+ */
+ if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) &&
+ cpumask_test_cpu(cpu, &idle_mask)) )
+ {
+ ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu);
+ SCHED_STAT_CRANK(tickled_idle_cpu_excl);
+ __cpumask_set_cpu(cpu, &mask);
+ goto tickle;
+ }
+
+ /*
+ * If the pcpu is idle, or there are no idlers and the new
+ * unit is a higher priority than the old unit, run it here.
+ *
+ * If there are idle cpus, first try to find one suitable to run
+ * new, so we can avoid preempting cur. If we cannot find a
+ * suitable idler on which to run new, run it here, but try to
+ * find a suitable idler on which to run cur instead.
+ */
+ if ( cur->pri == CSCHED_PRI_IDLE
+ || (idlers_empty && new->pri > cur->pri) )
+ {
+ if ( cur->pri != CSCHED_PRI_IDLE )
+ SCHED_STAT_CRANK(tickled_busy_cpu);
+ else
+ SCHED_STAT_CRANK(tickled_idle_cpu);
+ __cpumask_set_cpu(cpu, &mask);
+ }
+ else if ( !idlers_empty )
+ {
+ /*
+ * Soft and hard affinity balancing loop. For units without
+ * a useful soft affinity, consider hard affinity only.
+ */
+ for_each_affinity_balance_step( balance_step )
+ {
+ int new_idlers_empty;
+
+ if ( balance_step == BALANCE_SOFT_AFFINITY
+ && !has_soft_affinity(unit) )
+ continue;
+
+ /* Are there idlers suitable for new (for this balance step)? */
+ affinity_balance_cpumask(unit, balance_step,
+ cpumask_scratch_cpu(cpu));
+ cpumask_and(cpumask_scratch_cpu(cpu),
+ cpumask_scratch_cpu(cpu), &idle_mask);
+ new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu));
+
+ /*
+ * Let's not be too harsh! If there aren't idlers suitable
+ * for new in its soft affinity mask, make sure we check its
+ * hard affinity as well, before taking final decisions.
+ */
+ if ( new_idlers_empty
+ && balance_step == BALANCE_SOFT_AFFINITY )
+ continue;
+
+ /*
+ * If there are no suitable idlers for new, and it's higher
+ * priority than cur, check whether we can migrate cur away.
+ * We have to do it indirectly, via _VPF_migrating (instead
+ * of just tickling any idler suitable for cur) because cur
+ * is running.
+ *
+ * If there are suitable idlers for new, no matter priorities,
+ * leave cur alone (as it is running and is, likely, cache-hot)
+ * and wake some of them (which is waking up and so is, likely,
+ * cache cold anyway).
+ */
+ if ( new_idlers_empty && new->pri > cur->pri )
+ {
+ if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) )
+ {
+ SCHED_UNIT_STAT_CRANK(cur, kicked_away);
+ SCHED_UNIT_STAT_CRANK(cur, migrate_r);
+ SCHED_STAT_CRANK(migrate_kicked_away);
+ sched_set_pause_flags_atomic(cur->unit, _VPF_migrating);
+ }
+ /* Tickle cpu anyway, to let new preempt cur. */
+ SCHED_STAT_CRANK(tickled_busy_cpu);
+ __cpumask_set_cpu(cpu, &mask);
+ }
+ else if ( !new_idlers_empty )
+ {
+ /* Which of the idlers suitable for new shall we wake up? */
+ SCHED_STAT_CRANK(tickled_idle_cpu);
+ if ( opt_tickle_one_idle )
+ {
+ this_cpu(last_tickle_cpu) =
+ cpumask_cycle(this_cpu(last_tickle_cpu),
+ cpumask_scratch_cpu(cpu));
+ __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
+ }
+ else
+ cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu));
+ }
+
+ /* Did we find anyone? */
+ if ( !cpumask_empty(&mask) )
+ break;
+ }
+ }
+
+ tickle:
+ if ( !cpumask_empty(&mask) )
+ {
+ if ( unlikely(tb_init_done) )
+ {
+ /* Avoid TRACE_*: saves checking !tb_init_done each step */
+ for_each_cpu(cpu, &mask)
+ __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu);
+ }
+
+ /*
+ * Mark the designated CPUs as busy and send them all the scheduler
+ * interrupt. We need the for_each_cpu for dealing with the
+ * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and
+ * can't use cpumask_andnot(), because prv->idlers needs atomic access.
+ *
+             * In the default (and most common) case, when opt_tickle_one_idle is
+ * true, the loop does only one step, and only one bit is cleared.
+ */
+ for_each_cpu(cpu, &mask)
+ cpumask_clear_cpu(cpu, prv->idlers);
+ cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ);
+ }
+ else
+ SCHED_STAT_CRANK(tickled_no_cpu);
+}
+
+static void
+csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ struct csched_private *prv = CSCHED_PRIV(ops);
+
+ /*
+ * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're
+     * being called from CPU_UP_CANCELED, because bringing up a pCPU failed
+ * very early. xfree() does not really mind, but we want to be sure that,
+ * when we get here, either init_pdata has never been called, or
+ * deinit_pdata has been called already.
+ */
+ ASSERT(!cpumask_test_cpu(cpu, prv->cpus));
+
+ xfree(pcpu);
+}
+
+static void
+csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ struct csched_pcpu *spc = pcpu;
+ unsigned int node = cpu_to_node(cpu);
+ unsigned long flags;
+
+ /*
+     * Scheduler specific data for this pCPU must still be there and be
+ * valid. In fact, if we are here:
+ * 1. alloc_pdata must have been called for this cpu, and free_pdata
+ * must not have been called on it before us,
+ * 2. init_pdata must have been called on this cpu, and deinit_pdata
+ * (us!) must not have been called on it already.
+ */
+ ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus));
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ prv->credit -= prv->credits_per_tslice;
+ prv->ncpus--;
+ cpumask_clear_cpu(cpu, prv->idlers);
+ cpumask_clear_cpu(cpu, prv->cpus);
+ if ( (prv->master == cpu) && (prv->ncpus > 0) )
+ {
+ prv->master = cpumask_first(prv->cpus);
+ migrate_timer(&prv->master_ticker, prv->master);
+ }
+ if ( prv->balance_bias[node] == cpu )
+ {
+ cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node));
+ if ( !cpumask_empty(cpumask_scratch) )
+ prv->balance_bias[node] = cpumask_first(cpumask_scratch);
+ }
+ kill_timer(&spc->ticker);
+ if ( prv->ncpus == 0 )
+ kill_timer(&prv->master_ticker);
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void *
+csched_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+ struct csched_pcpu *spc;
+
+ /* Allocate per-PCPU info */
+ spc = xzalloc(struct csched_pcpu);
+ if ( spc == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ return spc;
+}
+
+static void
+init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
+{
+ ASSERT(spin_is_locked(&prv->lock));
+ /* cpu data needs to be allocated, but STILL uninitialized. */
+ ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL);
+
+ /* Initialize/update system-wide config */
+ prv->credit += prv->credits_per_tslice;
+ prv->ncpus++;
+ cpumask_set_cpu(cpu, prv->cpus);
+ if ( prv->ncpus == 1 )
+ {
+ prv->master = cpu;
+ init_timer(&prv->master_ticker, csched_acct, prv, cpu);
+ set_timer(&prv->master_ticker, NOW() + prv->tslice);
+ }
+
+ cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu)));
+ if ( cpumask_weight(cpumask_scratch) == 1 )
+ prv->balance_bias[cpu_to_node(cpu)] = cpu;
+
+ init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+ set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
+
+ INIT_LIST_HEAD(&spc->runq);
+ spc->runq_sort_last = prv->runq_sort;
+ spc->idle_bias = nr_cpu_ids - 1;
+
+ /* Start off idling... */
+ BUG_ON(!is_idle_unit(curr_on_cpu(cpu)));
+ cpumask_set_cpu(cpu, prv->idlers);
+ spc->nr_runnable = 0;
+}
+
+static void
+csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
+{
+ unsigned long flags;
+ struct csched_private *prv = CSCHED_PRIV(ops);
+
+ spin_lock_irqsave(&prv->lock, flags);
+ init_pdata(prv, pdata, cpu);
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+/* Change the scheduler of cpu to us (Credit). */
+static spinlock_t *
+csched_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct csched_private *prv = CSCHED_PRIV(new_ops);
+ struct csched_unit *svc = vdata;
+
+ ASSERT(svc && is_idle_unit(svc->unit));
+
+ sched_idle_unit(cpu)->priv = vdata;
+
+ /*
+ * We are holding the runqueue lock already (it's been taken in
+ * schedule_cpu_switch()). It actually may or may not be the 'right'
+ * one for this cpu, but that is ok for preventing races.
+ */
+ ASSERT(!local_irq_is_enabled());
+ spin_lock(&prv->lock);
+ init_pdata(prv, pdata, cpu);
+ spin_unlock(&prv->lock);
+
+ return &sr->_lock;
+}
+
+#ifndef NDEBUG
+static inline void
+__csched_unit_check(struct sched_unit *unit)
+{
+ struct csched_unit * const svc = CSCHED_UNIT(unit);
+ struct csched_dom * const sdom = svc->sdom;
+
+ BUG_ON( svc->unit != unit );
+ BUG_ON( sdom != CSCHED_DOM(unit->domain) );
+ if ( sdom )
+ {
+ BUG_ON( is_idle_unit(unit) );
+ BUG_ON( sdom->dom != unit->domain );
+ }
+ else
+ {
+ BUG_ON( !is_idle_unit(unit) );
+ }
+
+ SCHED_STAT_CRANK(unit_check);
+}
+#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit))
+#else
+#define CSCHED_UNIT_CHECK(unit)
+#endif
+
+/*
+ * Delay, in microseconds, between migrations of a UNIT between PCPUs.
+ * This prevents rapid fluttering of a UNIT between CPUs, and reduces the
+ * implicit overheads such as cache-warming. 1ms (1000) has been measured
+ * as a good value.
+ */
+static unsigned int vcpu_migration_delay_us;
+integer_param("vcpu_migration_delay", vcpu_migration_delay_us);
+
+static inline bool
+__csched_vcpu_is_cache_hot(const struct csched_private *prv,
+ const struct csched_unit *svc)
+{
+ bool hot = prv->unit_migr_delay &&
+ (NOW() - svc->last_sched_time) < prv->unit_migr_delay;
+
+ if ( hot )
+ SCHED_STAT_CRANK(unit_hot);
+
+ return hot;
+}
+
+static inline int
+__csched_unit_is_migrateable(const struct csched_private *prv,
+ struct sched_unit *unit,
+ int dest_cpu, cpumask_t *mask)
+{
+ const struct csched_unit *svc = CSCHED_UNIT(unit);
+ /*
+ * Don't pick up work that's hot on peer PCPU, or that can't (or
+ * would prefer not to) run on cpu.
+ *
+ * The caller is supposed to have already checked that unit is also
+ * not running.
+ */
+ ASSERT(!unit->is_running);
+
+ return !__csched_vcpu_is_cache_hot(prv, svc) &&
+ cpumask_test_cpu(dest_cpu, mask);
+}
+
+static int
+_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit,
+ bool_t commit)
+{
+ int cpu = sched_unit_master(unit);
+ /* We must always use cpu's scratch space */
+ cpumask_t *cpus = cpumask_scratch_cpu(cpu);
+ cpumask_t idlers;
+ cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
+ struct csched_pcpu *spc = NULL;
+ int balance_step;
+
+ for_each_affinity_balance_step( balance_step )
+ {
+ affinity_balance_cpumask(unit, balance_step, cpus);
+ cpumask_and(cpus, online, cpus);
+ /*
+ * We want to pick up a pcpu among the ones that are online and
+ * can accommodate vc. As far as hard affinity is concerned, there
+ * always will be at least one of these pcpus in the scratch cpumask,
+ * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below
+ * are ok.
+ *
+ * On the other hand, when considering soft affinity, it is possible
+ * that the mask is empty (for instance, if the domain has been put
+ * in a cpupool that does not contain any of the pcpus in its soft
+ * affinity), which would result in the ASSERT()-s inside cpumask_*()
+ * operations triggering (in debug builds).
+ *
+ * Therefore, if that is the case, we just skip the soft affinity
+ * balancing step altogether.
+ */
+ if ( balance_step == BALANCE_SOFT_AFFINITY &&
+ (!has_soft_affinity(unit) || cpumask_empty(cpus)) )
+ continue;
+
+ /* If present, prefer vc's current processor */
+ cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
+ ? sched_unit_master(unit)
+ : cpumask_cycle(sched_unit_master(unit), cpus);
+ ASSERT(cpumask_test_cpu(cpu, cpus));
+
+ /*
+ * Try to find an idle processor within the above constraints.
+ *
+ * In multi-core and multi-threaded CPUs, not all idle execution
+ * vehicles are equal!
+ *
+ * We give preference to the idle execution vehicle with the most
+ * idling neighbours in its grouping. This distributes work across
+ * distinct cores first and guarantees we don't do something stupid
+ * like run two UNITs on co-hyperthreads while there are idle cores
+ * or sockets.
+ *
+ * Notice that, when computing the "idleness" of cpu, we may want to
+ * discount unit. That is, iff unit is the currently running and the
+ * only runnable unit on cpu, we add cpu to the idlers.
+ */
+ cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers);
+ if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) )
+ __cpumask_set_cpu(cpu, &idlers);
+ cpumask_and(cpus, &idlers, cpus);
+
+ /*
+ * It is important that cpu points to an idle processor, if a suitable
+ * one exists (and we can use cpus to check and, possibly, choose a new
+ * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and
+ * cpu points to a busy thread with an idle sibling, both the threads
+ * will be considered the same, from the "idleness" calculation point
+ * of view, preventing unit from being moved to the thread that is
+ * actually idle.
+ *
+ * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so
+ * we check for it first.
+ */
+ if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) )
+ cpu = cpumask_cycle(cpu, cpus);
+ __cpumask_clear_cpu(cpu, cpus);
+
+ while ( !cpumask_empty(cpus) )
+ {
+ cpumask_t cpu_idlers;
+ cpumask_t nxt_idlers;
+ int nxt, weight_cpu, weight_nxt;
+ int migrate_factor;
+
+ nxt = cpumask_cycle(cpu, cpus);
+
+ if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) )
+ {
+ /* We're on the same socket, so check the busy-ness of threads.
+ * Migrate if nxt's core has even one more idle thread than ours. */
+ ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
+ migrate_factor = 1;
+ cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask,
+ cpu));
+ cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask,
+ nxt));
+ }
+ else
+ {
+ /* We're on different sockets, so check the busy-ness of cores.
+ * Migrate only if the other core is twice as idle */
+ ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
+ migrate_factor = 2;
+ cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu));
+ cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt));
+ }
+
+ weight_cpu = cpumask_weight(&cpu_idlers);
+ weight_nxt = cpumask_weight(&nxt_idlers);
+ /* smt_power_savings: consolidate work rather than spreading it */
+ if ( sched_smt_power_savings ?
+ weight_cpu > weight_nxt :
+ weight_cpu * migrate_factor < weight_nxt )
+ {
+ cpumask_and(&nxt_idlers, &nxt_idlers, cpus);
+ spc = CSCHED_PCPU(nxt);
+ cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers);
+ cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu));
+ }
+ else
+ {
+ cpumask_andnot(cpus, cpus, &nxt_idlers);
+ }
+ }
+
+ /* Stop if cpu is idle */
+ if ( cpumask_test_cpu(cpu, &idlers) )
+ break;
+ }
+
+ if ( commit && spc )
+ spc->idle_bias = cpu;
+
+ TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id,
+ cpu);
+
+ return cpu;
+}
+
+static struct sched_resource *
+csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ struct csched_unit *svc = CSCHED_UNIT(unit);
+
+ /*
+ * We have been called by vcpu_migrate() (in schedule.c), as part
+ * of the process of seeing if vc can be migrated to another pcpu.
+ * We make a note about this in svc->flags so that later, in
+ * csched_unit_wake() (still called from vcpu_migrate()) we won't
+ * get boosted, which we don't deserve as we are "only" migrating.
+ */
+ set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags);
+ return get_sched_res(_csched_cpu_pick(ops, unit, 1));
+}
+
+static inline void
+__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc)
+{
+ struct csched_dom * const sdom = svc->sdom;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ if ( list_empty(&svc->active_unit_elem) )
+ {
+ SCHED_UNIT_STAT_CRANK(svc, state_active);
+ SCHED_STAT_CRANK(acct_unit_active);
+
+ sdom->active_unit_count++;
+ list_add(&svc->active_unit_elem, &sdom->active_unit);
+ /* Make weight per-unit */
+ prv->weight += sdom->weight;
+ if ( list_empty(&sdom->active_sdom_elem) )
+ {
+ list_add(&sdom->active_sdom_elem, &prv->active_sdom);
+ }
+ }
+
+ TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id,
+ svc->unit->unit_id, sdom->active_unit_count);
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static inline void
+__csched_unit_acct_stop_locked(struct csched_private *prv,
+ struct csched_unit *svc)
+{
+ struct csched_dom * const sdom = svc->sdom;
+
+ BUG_ON( list_empty(&svc->active_unit_elem) );
+
+ SCHED_UNIT_STAT_CRANK(svc, state_idle);
+ SCHED_STAT_CRANK(acct_unit_idle);
+
+ BUG_ON( prv->weight < sdom->weight );
+ sdom->active_unit_count--;
+ list_del_init(&svc->active_unit_elem);
+ prv->weight -= sdom->weight;
+ if ( list_empty(&sdom->active_unit) )
+ {
+ list_del_init(&sdom->active_sdom_elem);
+ }
+
+ TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id,
+ svc->unit->unit_id, sdom->active_unit_count);
+}
+
+static void
+csched_unit_acct(struct csched_private *prv, unsigned int cpu)
+{
+ struct sched_unit *currunit = current->sched_unit;
+ struct csched_unit * const svc = CSCHED_UNIT(currunit);
+ struct sched_resource *sr = get_sched_res(cpu);
+ const struct scheduler *ops = sr->scheduler;
+
+ ASSERT( sched_unit_master(currunit) == cpu );
+ ASSERT( svc->sdom != NULL );
+ ASSERT( !is_idle_unit(svc->unit) );
+
+ /*
+ * If this UNIT's priority was boosted when it last awoke, reset it.
+ * If the UNIT is found here, then it's consuming a non-negligible
+ * amount of CPU resources and should no longer be boosted.
+ */
+ if ( svc->pri == CSCHED_PRI_TS_BOOST )
+ {
+ svc->pri = CSCHED_PRI_TS_UNDER;
+ TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id,
+ svc->unit->unit_id);
+ }
+
+ /*
+ * Update credits
+ */
+ burn_credits(svc, NOW());
+
+ /*
+ * Put this UNIT and domain back on the active list if it was
+ * idling.
+ */
+ if ( list_empty(&svc->active_unit_elem) )
+ {
+ __csched_unit_acct_start(prv, svc);
+ }
+ else
+ {
+ unsigned int new_cpu;
+ unsigned long flags;
+ spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags);
+
+ /*
+ * If it's been active a while, check if we'd be better off
+ * migrating it to run elsewhere (see multi-core and multi-thread
+ * support in csched_res_pick()).
+ */
+ new_cpu = _csched_cpu_pick(ops, currunit, 0);
+
+ unit_schedule_unlock_irqrestore(lock, flags, currunit);
+
+ if ( new_cpu != cpu )
+ {
+ SCHED_UNIT_STAT_CRANK(svc, migrate_r);
+ SCHED_STAT_CRANK(migrate_running);
+ sched_set_pause_flags_atomic(currunit, _VPF_migrating);
+ /*
+ * As we are about to tickle cpu, we should clear its bit in
+ * idlers. But, if we are here, it means there is someone running
+ * on it, and hence the bit must be zero already.
+ */
+ ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers));
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ }
+ }
+}
+
+static void *
+csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ struct csched_unit *svc;
+
+ /* Allocate per-UNIT info */
+ svc = xzalloc(struct csched_unit);
+ if ( svc == NULL )
+ return NULL;
+
+ INIT_LIST_HEAD(&svc->runq_elem);
+ INIT_LIST_HEAD(&svc->active_unit_elem);
+ svc->sdom = dd;
+ svc->unit = unit;
+ svc->pri = is_idle_unit(unit) ?
+ CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+ SCHED_UNIT_STATS_RESET(svc);
+ SCHED_STAT_CRANK(unit_alloc);
+ return svc;
+}
+
+static void
+csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched_unit *svc = unit->priv;
+ spinlock_t *lock;
+
+ BUG_ON( is_idle_unit(unit) );
+
+ /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */
+ lock = unit_schedule_lock_irq(unit);
+
+ sched_set_res(unit, csched_res_pick(ops, unit));
+
+ spin_unlock_irq(lock);
+
+ lock = unit_schedule_lock_irq(unit);
+
+ if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running )
+ runq_insert(svc);
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ SCHED_STAT_CRANK(unit_insert);
+}
+
+static void
+csched_free_udata(const struct scheduler *ops, void *priv)
+{
+ struct csched_unit *svc = priv;
+
+ BUG_ON( !list_empty(&svc->runq_elem) );
+
+ xfree(svc);
+}
+
+static void
+csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ struct csched_unit * const svc = CSCHED_UNIT(unit);
+ struct csched_dom * const sdom = svc->sdom;
+
+ SCHED_STAT_CRANK(unit_remove);
+
+ ASSERT(!__unit_on_runq(svc));
+
+ if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
+ {
+ SCHED_STAT_CRANK(unit_unpark);
+ sched_unit_unpause(svc->unit);
+ }
+
+ spin_lock_irq(&prv->lock);
+
+ if ( !list_empty(&svc->active_unit_elem) )
+ __csched_unit_acct_stop_locked(prv, svc);
+
+ spin_unlock_irq(&prv->lock);
+
+ BUG_ON( sdom == NULL );
+}
+
+static void
+csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched_unit * const svc = CSCHED_UNIT(unit);
+ unsigned int cpu = sched_unit_master(unit);
+ struct sched_resource *sr = get_sched_res(cpu);
+
+ SCHED_STAT_CRANK(unit_sleep);
+
+ BUG_ON( is_idle_unit(unit) );
+
+ if ( curr_on_cpu(cpu) == unit )
+ {
+ /*
+ * We are about to tickle cpu, so we should clear its bit in idlers.
+ * But, we are here because unit is going to sleep while running on cpu,
+ * so the bit must be zero already.
+ */
+ ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers));
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ }
+ else if ( __unit_on_runq(svc) )
+ runq_remove(svc);
+}
+
+static void
+csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched_unit * const svc = CSCHED_UNIT(unit);
+ bool_t migrating;
+
+ BUG_ON( is_idle_unit(unit) );
+
+ if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
+ {
+ SCHED_STAT_CRANK(unit_wake_running);
+ return;
+ }
+ if ( unlikely(__unit_on_runq(svc)) )
+ {
+ SCHED_STAT_CRANK(unit_wake_onrunq);
+ return;
+ }
+
+ if ( likely(unit_runnable(unit)) )
+ SCHED_STAT_CRANK(unit_wake_runnable);
+ else
+ SCHED_STAT_CRANK(unit_wake_not_runnable);
+
+ /*
+ * We temporarily boost the priority of awaking UNITs!
+ *
+ * If this UNIT consumes a non-negligible amount of CPU, it
+ * will eventually find itself in the credit accounting code
+ * path where its priority will be reset to normal.
+ *
+ * If on the other hand the UNIT consumes little CPU and is
+ * blocking and awoken a lot (doing I/O for example), its
+ * priority will remain boosted, optimizing its wake-to-run
+ * latencies.
+ *
+ * This allows wake-to-run latency sensitive UNITs to preempt
+ * more CPU resource intensive UNITs without impacting overall
+ * system fairness.
+ *
+ * There are two cases when we don't want to boost:
+ * - UNITs that are waking up after a migration, rather than
+ * after having blocked;
+ * - UNITs of capped domains unpausing after earning credits
+ * they had overspent.
+ */
+ migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags);
+
+ if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER &&
+ !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
+ {
+ TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id,
+ unit->unit_id);
+ SCHED_STAT_CRANK(unit_boost);
+ svc->pri = CSCHED_PRI_TS_BOOST;
+ }
+
+ /* Put the UNIT on the runq and tickle CPUs */
+ runq_insert(svc);
+ __runq_tickle(svc);
+}
+
+static void
+csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched_unit * const svc = CSCHED_UNIT(unit);
+
+ /* Let the scheduler know that this vcpu is trying to yield */
+ set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags);
+}
+
+static int
+csched_dom_cntl(
+ const struct scheduler *ops,
+ struct domain *d,
+ struct xen_domctl_scheduler_op *op)
+{
+ struct csched_dom * const sdom = CSCHED_DOM(d);
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ unsigned long flags;
+ int rc = 0;
+
+ /* Protect both get and put branches with the pluggable scheduler
+ * lock. Runq lock not needed anywhere in here. */
+ spin_lock_irqsave(&prv->lock, flags);
+
+ switch ( op->cmd )
+ {
+ case XEN_DOMCTL_SCHEDOP_getinfo:
+ op->u.credit.weight = sdom->weight;
+ op->u.credit.cap = sdom->cap;
+ break;
+ case XEN_DOMCTL_SCHEDOP_putinfo:
+ if ( op->u.credit.weight != 0 )
+ {
+ if ( !list_empty(&sdom->active_sdom_elem) )
+ {
+ prv->weight -= sdom->weight * sdom->active_unit_count;
+ prv->weight += op->u.credit.weight * sdom->active_unit_count;
+ }
+ sdom->weight = op->u.credit.weight;
+ }
+
+ if ( op->u.credit.cap != (uint16_t)~0U )
+ sdom->cap = op->u.credit.cap;
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ return rc;
+}
+
+static void
+csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit,
+ const cpumask_t *hard, const cpumask_t *soft)
+{
+ struct csched_unit *svc = CSCHED_UNIT(unit);
+
+ if ( !hard )
+ return;
+
+ /* Are we becoming exclusively pinned? */
+ if ( cpumask_weight(hard) == 1 )
+ set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags);
+ else
+ clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags);
+}
+
+static inline void
+__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms)
+{
+ prv->tslice = MILLISECS(timeslice_ms);
+ prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
+ if ( timeslice_ms < prv->ticks_per_tslice )
+ prv->ticks_per_tslice = 1;
+ prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice;
+ prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms;
+ prv->credit = prv->credits_per_tslice * prv->ncpus;
+}
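+
+/*
+ * Purely illustrative (not part of the original logic): with the commonly
+ * used defaults -- a 30ms timeslice, CSCHED_TICKS_PER_TSLICE == 3 and
+ * CSCHED_CREDITS_PER_MSEC == 10, values assumed here only for the sake of
+ * the example -- the function above resolves to:
+ *
+ *   tslice             = MILLISECS(30)
+ *   ticks_per_tslice   = 3
+ *   tick_period_us     = 30 * 1000 / 3 = 10000   (one tick every 10ms)
+ *   credits_per_tslice = 10 * 30 = 300
+ *
+ * If the timeslice is shorter than CSCHED_TICKS_PER_TSLICE milliseconds
+ * (e.g. 1ms), the code above falls back to a single tick per timeslice.
+ */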
+
+static int
+csched_sys_cntl(const struct scheduler *ops,
+ struct xen_sysctl_scheduler_op *sc)
+{
+ int rc = -EINVAL;
+ struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit;
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ unsigned long flags;
+
+ switch ( sc->cmd )
+ {
+ case XEN_SYSCTL_SCHEDOP_putinfo:
+ if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX
+ || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN
+ || (params->ratelimit_us
+ && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
+ || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
+ || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms)
+ || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US )
+ goto out;
+
+ spin_lock_irqsave(&prv->lock, flags);
+ __csched_set_tslice(prv, params->tslice_ms);
+ if ( !prv->ratelimit && params->ratelimit_us )
+ printk(XENLOG_INFO "Enabling context switch rate limiting\n");
+ else if ( prv->ratelimit && !params->ratelimit_us )
+ printk(XENLOG_INFO "Disabling context switch rate limiting\n");
+ prv->ratelimit = MICROSECS(params->ratelimit_us);
+ prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us);
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ /* FALLTHRU */
+ case XEN_SYSCTL_SCHEDOP_getinfo:
+ params->tslice_ms = prv->tslice / MILLISECS(1);
+ params->ratelimit_us = prv->ratelimit / MICROSECS(1);
+ params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1);
+ rc = 0;
+ break;
+ }
+ out:
+ return rc;
+}
+
+static void *
+csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+ struct csched_dom *sdom;
+
+ sdom = xzalloc(struct csched_dom);
+ if ( sdom == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ /* Initialize credit and weight */
+ INIT_LIST_HEAD(&sdom->active_unit);
+ INIT_LIST_HEAD(&sdom->active_sdom_elem);
+ sdom->dom = dom;
+ sdom->weight = CSCHED_DEFAULT_WEIGHT;
+
+ return sdom;
+}
+
+static void
+csched_free_domdata(const struct scheduler *ops, void *data)
+{
+ xfree(data);
+}
+
+/*
+ * This is an O(n) optimized sort of the runq.
+ *
+ * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk
+ * through the runq and move up any UNDERs that are preceded by OVERS. We
+ * remember the last UNDER to make the move up operation O(1).
+ */
+static void
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
+{
+ struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+ struct list_head *runq, *elem, *next, *last_under;
+ struct csched_unit *svc_elem;
+ spinlock_t *lock;
+ unsigned long flags;
+ int sort_epoch;
+
+ sort_epoch = prv->runq_sort;
+ if ( sort_epoch == spc->runq_sort_last )
+ return;
+
+ spc->runq_sort_last = sort_epoch;
+
+ lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+ runq = &spc->runq;
+ elem = runq->next;
+ last_under = runq;
+
+ while ( elem != runq )
+ {
+ next = elem->next;
+ svc_elem = __runq_elem(elem);
+
+ if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
+ {
+ /* does elem need to move up the runq? */
+ if ( elem->prev != last_under )
+ {
+ list_del(elem);
+ list_add(elem, last_under);
+ }
+ last_under = elem;
+ }
+
+ elem = next;
+ }
+
+ pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
+}
+
+static void
+csched_acct(void* dummy)
+{
+ struct csched_private *prv = dummy;
+ unsigned long flags;
+ struct list_head *iter_unit, *next_unit;
+ struct list_head *iter_sdom, *next_sdom;
+ struct csched_unit *svc;
+ struct csched_dom *sdom;
+ uint32_t credit_total;
+ uint32_t weight_total;
+ uint32_t weight_left;
+ uint32_t credit_fair;
+ uint32_t credit_peak;
+ uint32_t credit_cap;
+ int credit_balance;
+ int credit_xtra;
+ int credit;
+
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ weight_total = prv->weight;
+ credit_total = prv->credit;
+
+ /* Converge balance towards 0 when it drops negative */
+ if ( prv->credit_balance < 0 )
+ {
+ credit_total -= prv->credit_balance;
+ SCHED_STAT_CRANK(acct_balance);
+ }
+
+ if ( unlikely(weight_total == 0) )
+ {
+ prv->credit_balance = 0;
+ spin_unlock_irqrestore(&prv->lock, flags);
+ SCHED_STAT_CRANK(acct_no_work);
+ goto out;
+ }
+
+ SCHED_STAT_CRANK(acct_run);
+
+ weight_left = weight_total;
+ credit_balance = 0;
+ credit_xtra = 0;
+ credit_cap = 0U;
+
+ list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
+ {
+ sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+
+ BUG_ON( is_idle_domain(sdom->dom) );
+ BUG_ON( sdom->active_unit_count == 0 );
+ BUG_ON( sdom->weight == 0 );
+ BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left );
+
+ weight_left -= ( sdom->weight * sdom->active_unit_count );
+
+ /*
+ * A domain's fair share is computed using its weight in competition
+ * with that of all other active domains.
+ *
+ * At most, a domain can use credits to run all its active UNITs
+ * for one full accounting period. We allow a domain to earn more
+ * only when the system-wide credit balance is negative.
+ */
+ credit_peak = sdom->active_unit_count * prv->credits_per_tslice;
+ if ( prv->credit_balance < 0 )
+ {
+ credit_peak += ( ( -prv->credit_balance
+ * sdom->weight
+ * sdom->active_unit_count) +
+ (weight_total - 1)
+ ) / weight_total;
+ }
+
+ if ( sdom->cap != 0U )
+ {
+ credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100;
+ if ( credit_cap < credit_peak )
+ credit_peak = credit_cap;
+
+ /* FIXME -- set cap per-unit as well...? */
+ credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 )
+ ) / sdom->active_unit_count;
+ }
+
+ credit_fair = ( ( credit_total
+ * sdom->weight
+ * sdom->active_unit_count )
+ + (weight_total - 1)
+ ) / weight_total;
+
+ if ( credit_fair < credit_peak )
+ {
+ credit_xtra = 1;
+ }
+ else
+ {
+ if ( weight_left != 0U )
+ {
+ /* Give other domains a chance at unused credits */
+ credit_total += ( ( ( credit_fair - credit_peak
+ ) * weight_total
+ ) + ( weight_left - 1 )
+ ) / weight_left;
+ }
+
+ if ( credit_xtra )
+ {
+ /*
+ * Lazily keep domains with extra credits at the head of
+ * the queue to give others a chance at them in future
+ * accounting periods.
+ */
+ SCHED_STAT_CRANK(acct_reorder);
+ list_del(&sdom->active_sdom_elem);
+ list_add(&sdom->active_sdom_elem, &prv->active_sdom);
+ }
+
+ credit_fair = credit_peak;
+ }
+
+ /* Compute fair share per UNIT */
+ credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 )
+ ) / sdom->active_unit_count;
+
+
+ list_for_each_safe( iter_unit, next_unit, &sdom->active_unit )
+ {
+ svc = list_entry(iter_unit, struct csched_unit, active_unit_elem);
+ BUG_ON( sdom != svc->sdom );
+
+ /* Increment credit */
+ atomic_add(credit_fair, &svc->credit);
+ credit = atomic_read(&svc->credit);
+
+ /*
+ * Recompute priority or, if UNIT is idling, remove it from
+ * the active list.
+ */
+ if ( credit < 0 )
+ {
+ svc->pri = CSCHED_PRI_TS_OVER;
+
+ /* Park running UNITs of capped-out domains */
+ if ( sdom->cap != 0U &&
+ credit < -credit_cap &&
+ !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
+ {
+ SCHED_STAT_CRANK(unit_park);
+ sched_unit_pause_nosync(svc->unit);
+ }
+
+ /* Lower bound on credits */
+ if ( credit < -prv->credits_per_tslice )
+ {
+ SCHED_STAT_CRANK(acct_min_credit);
+ credit = -prv->credits_per_tslice;
+ atomic_set(&svc->credit, credit);
+ }
+ }
+ else
+ {
+ svc->pri = CSCHED_PRI_TS_UNDER;
+
+ /* Unpark any capped domains whose credits go positive */
+ if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
+ {
+ /*
+ * It's important to unset the flag AFTER the unpause()
+ * call to make sure the UNIT's priority is not boosted
+ * if it is woken up here.
+ */
+ SCHED_STAT_CRANK(unit_unpark);
+ sched_unit_unpause(svc->unit);
+ clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags);
+ }
+
+ /* Upper bound on credits means UNIT stops earning */
+ if ( credit > prv->credits_per_tslice )
+ {
+ __csched_unit_acct_stop_locked(prv, svc);
+ /* Divide credits in half, so that when it starts
+ * accounting again, it starts a little bit "ahead" */
+ credit /= 2;
+ atomic_set(&svc->credit, credit);
+ }
+ }
+
+ SCHED_UNIT_STAT_SET(svc, credit_last, credit);
+ SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair);
+ credit_balance += credit;
+ }
+ }
+
+ prv->credit_balance = credit_balance;
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ /* Inform each CPU that its runq needs to be sorted */
+ prv->runq_sort++;
+
+out:
+ set_timer( &prv->master_ticker, NOW() + prv->tslice);
+}
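+
+/*
+ * Worked example for the accounting above (the numbers are illustrative
+ * assumptions, not defaults taken from the code): two active domains with
+ * one active unit each, weights 256 and 512, no caps, one pCPU with
+ * credits_per_tslice = 300 and a non-negative credit balance. Then
+ * weight_total = 768, credit_total = 300 and, rounding up:
+ *
+ *   credit_fair(dom1) = (300 * 256 * 1 + 767) / 768 = 100
+ *   credit_fair(dom2) = (300 * 512 * 1 + 767) / 768 = 200
+ *
+ * Both are below the per-domain credit_peak of 300 (1 unit *
+ * credits_per_tslice), so each unit simply receives its fair share.
+ */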
+
+static void
+csched_tick(void *_cpu)
+{
+ unsigned int cpu = (unsigned long)_cpu;
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+ struct csched_private *prv = CSCHED_PRIV(sr->scheduler);
+
+ spc->tick++;
+
+ /*
+ * Accounting for running UNIT
+ */
+ if ( !is_idle_unit(current->sched_unit) )
+ csched_unit_acct(prv, cpu);
+
+ /*
+ * Check if runq needs to be sorted
+ *
+ * Every physical CPU resorts the runq after the accounting master has
+ * modified priorities. This is a special O(n) sort and runs at most
+ * once per accounting period (currently 30 milliseconds).
+ */
+ csched_runq_sort(prv, cpu);
+
+ set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
+}
+
+static struct csched_unit *
+csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler);
+ const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
+ struct csched_unit *speer;
+ struct list_head *iter;
+ struct sched_unit *unit;
+
+ ASSERT(peer_pcpu != NULL);
+
+ /*
+ * Don't steal from an idle CPU's runq because it's about to
+ * pick up work from it itself.
+ */
+ if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) )
+ goto out;
+
+ list_for_each( iter, &peer_pcpu->runq )
+ {
+ speer = __runq_elem(iter);
+
+ /*
+ * If next available UNIT here is not of strictly higher
+ * priority than ours, this PCPU is useless to us.
+ */
+ if ( speer->pri <= pri )
+ break;
+
+ /* Is this UNIT runnable on our PCPU? */
+ unit = speer->unit;
+ BUG_ON( is_idle_unit(unit) );
+
+ /*
+ * If the unit is still in peer_cpu's scheduling tail, or if it
+ * has no useful soft affinity, skip it.
+ *
+ * In fact, what we want is to check if we have any "soft-affine
+ * work" to steal, before starting to look at "hard-affine work".
+ *
+ * Notice that, if not even one unit on this runq has a useful
+ * soft affinity, we could have avoided considering this runq for
+ * a soft balancing step in the first place. This, for instance,
+ * can be implemented by keeping note of which runqueues have
+ * units with useful soft affinities in some sort of bitmap
+ * or counter.
+ */
+ if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY &&
+ !has_soft_affinity(unit)) )
+ continue;
+
+ affinity_balance_cpumask(unit, balance_step, cpumask_scratch);
+ if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) )
+ {
+ /* We got a candidate. Grab it! */
+ TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu,
+ unit->domain->domain_id, unit->unit_id);
+ SCHED_UNIT_STAT_CRANK(speer, migrate_q);
+ SCHED_STAT_CRANK(migrate_queued);
+ runq_remove(speer);
+ sched_set_res(unit, get_sched_res(cpu));
+ /*
+ * speer will start executing directly on cpu, without having to
+ * go through runq_insert(). So we must update the runnable count
+ * for cpu here.
+ */
+ inc_nr_runnable(cpu);
+ return speer;
+ }
+ }
+ out:
+ SCHED_STAT_CRANK(steal_peer_idle);
+ return NULL;
+}
+
+static struct csched_unit *
+csched_load_balance(struct csched_private *prv, int cpu,
+ struct csched_unit *snext, bool *stolen)
+{
+ struct cpupool *c = get_sched_res(cpu)->cpupool;
+ struct csched_unit *speer;
+ cpumask_t workers;
+ cpumask_t *online = c->res_valid;
+ int peer_cpu, first_cpu, peer_node, bstep;
+ int node = cpu_to_node(cpu);
+
+ BUG_ON(get_sched_res(cpu) != snext->unit->res);
+
+ /*
+ * If this CPU is going offline, or is not (yet) part of any cpupool
+ * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
+ */
+ if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
+ goto out;
+
+ if ( snext->pri == CSCHED_PRI_IDLE )
+ SCHED_STAT_CRANK(load_balance_idle);
+ else if ( snext->pri == CSCHED_PRI_TS_OVER )
+ SCHED_STAT_CRANK(load_balance_over);
+ else
+ SCHED_STAT_CRANK(load_balance_other);
+
+ /*
+ * Let's look around for work to steal, taking both hard affinity
+ * and soft affinity into account. More specifically, we check all
+ * the non-idle CPUs' runq, looking for:
+ * 1. any "soft-affine work" to steal first,
+ * 2. if not finding anything, any "hard-affine work" to steal.
+ */
+ for_each_affinity_balance_step( bstep )
+ {
+ /*
+ * We peek at the non-idling CPUs in a node-wise fashion. In fact,
+ * it is more likely that we find some affine work on our same
+ * node, not to mention that migrating units within the same node
+ * could well be expected to be cheaper than across nodes (memory
+ * stays local, there might be some node-wide cache[s], etc.).
+ */
+ peer_node = node;
+ do
+ {
+ /* Select the pCPUs in this node that have work we can steal. */
+ cpumask_andnot(&workers, online, prv->idlers);
+ cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
+ __cpumask_clear_cpu(cpu, &workers);
+
+ first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers);
+ if ( first_cpu >= nr_cpu_ids )
+ goto next_node;
+ peer_cpu = first_cpu;
+ do
+ {
+ spinlock_t *lock;
+
+ /*
+ * If there is only one runnable unit on peer_cpu, it means
+ * there's no one to be stolen in its runqueue, so skip it.
+ *
+ * Checking this without holding the lock is racy... But that's
+ * the whole point of this optimization!
+ *
+ * In more details:
+ * - if we race with dec_nr_runnable(), we may try to take the
+ * lock and call csched_runq_steal() for no reason. This is
+ * not a functional issue, and should be infrequent enough.
+ * And we can avoid that by re-checking nr_runnable after
+ * having grabbed the lock, if we want;
+ * - if we race with inc_nr_runnable(), we skip a pCPU that may
+ * have runnable units in its runqueue, but that's not a
+ * problem because:
+ * + if racing with csched_unit_insert() or csched_unit_wake(),
+ * __runq_tickle() will be called afterwards, so the unit
+ * won't get stuck in the runqueue for too long;
+ * + if racing with csched_runq_steal(), it may be that a
+ * unit that we could have picked up stays in a runqueue
+ * until someone else tries to steal it again. But this is
+ * no worse than what can happen already (without this
+ * optimization), if the pCPU schedules right after we have
+ * taken the lock, making us block on it.
+ */
+ if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
+ {
+ TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
+ goto next_cpu;
+ }
+
+ /*
+ * Get ahold of the scheduler lock for this peer CPU.
+ *
+ * Note: We don't spin on this lock but simply try it. Spinning
+ * could cause a deadlock if the peer CPU is also load
+ * balancing and trying to lock this CPU.
+ */
+ lock = pcpu_schedule_trylock(peer_cpu);
+ SCHED_STAT_CRANK(steal_trylock);
+ if ( !lock )
+ {
+ SCHED_STAT_CRANK(steal_trylock_failed);
+ TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
+ goto next_cpu;
+ }
+
+ TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
+
+ /* Any work over there to steal? */
+ speer = cpumask_test_cpu(peer_cpu, online) ?
+ csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL;
+ pcpu_schedule_unlock(lock, peer_cpu);
+
+ /* As soon as one unit is found, balancing ends */
+ if ( speer != NULL )
+ {
+ *stolen = true;
+ /*
+ * Next time we'll look for work to steal on this node, we
+ * will start from the next pCPU, with respect to this one,
+ * so we don't risk stealing always from the same ones.
+ */
+ prv->balance_bias[peer_node] = peer_cpu;
+ return speer;
+ }
+
+ next_cpu:
+ peer_cpu = cpumask_cycle(peer_cpu, &workers);
+
+ } while( peer_cpu != first_cpu );
+
+ next_node:
+ peer_node = cycle_node(peer_node, node_online_map);
+ } while( peer_node != node );
+ }
+
+ out:
+ /* Failed to find more important work elsewhere... */
+ __runq_remove(snext);
+ return snext;
+}
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static void csched_schedule(
+ const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ const unsigned int cur_cpu = smp_processor_id();
+ const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
+ struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu);
+ struct list_head * const runq = RUNQ(sched_cpu);
+ struct csched_unit * const scurr = CSCHED_UNIT(unit);
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ struct csched_unit *snext;
+ s_time_t runtime, tslice;
+ bool migrated = false;
+
+ SCHED_STAT_CRANK(schedule);
+ CSCHED_UNIT_CHECK(unit);
+
+ /*
+ * Here in Credit1 code, we usually just call TRACE_nD() helpers, and
+ * don't care about packing. But scheduling happens very often, so it
+ * actually is important that the record is as small as possible.
+ */
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned cpu:16, tasklet:8, idle:8;
+ } d;
+ d.cpu = cur_cpu;
+ d.tasklet = tasklet_work_scheduled;
+ d.idle = is_idle_unit(unit);
+ __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ runtime = now - unit->state_entry_time;
+ if ( runtime < 0 ) /* Does this ever happen? */
+ runtime = 0;
+
+ if ( !is_idle_unit(unit) )
+ {
+ /* Update credits of a non-idle UNIT. */
+ burn_credits(scurr, now);
+ scurr->start_time -= now;
+ scurr->last_sched_time = now;
+ }
+ else
+ {
+ /* Re-instate a boosted idle UNIT as normal-idle. */
+ scurr->pri = CSCHED_PRI_IDLE;
+ }
+
+ /* Choices, choices:
+ * - If we have a tasklet, we need to run the idle unit no matter what.
+ * - If sched rate limiting is in effect, and the current unit has
+ * run for less than that amount of time, continue the current one,
+ * but with a shorter timeslice and return it immediately
+ * - Otherwise, choose the one with the highest priority (which may
+ * be the one currently running)
+ * - If the currently running one is TS_OVER, see if there
+ * is a higher priority one waiting on the runqueue of another
+ * cpu and steal it.
+ */
+
+ /*
+ * If we have schedule rate limiting enabled, check to see
+ * how long we've run for.
+ *
+ * If scurr is yielding, however, we don't let rate limiting kick in.
+ * In fact, it may be the case that scurr is about to spin, and there's
+ * no point forcing it to do so until rate limiting expires.
+ */
+ if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags)
+ && !tasklet_work_scheduled
+ && prv->ratelimit
+ && unit_runnable_state(unit)
+ && !is_idle_unit(unit)
+ && runtime < prv->ratelimit )
+ {
+ snext = scurr;
+ snext->start_time += now;
+ perfc_incr(delay_ms);
+ /*
+ * Next timeslice must last just until we'll have executed for
+ * ratelimit. However, to avoid setting a really short timer, which
+ * will most likely be inaccurate and counterproductive, we never go
+ * below CSCHED_MIN_TIMER.
+ */
+ tslice = prv->ratelimit - runtime;
+ if ( unlikely(runtime < CSCHED_MIN_TIMER) )
+ tslice = CSCHED_MIN_TIMER;
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned runtime;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.runtime = runtime;
+ __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ goto out;
+ }
+ tslice = prv->tslice;
+
+ /*
+ * Select next runnable local UNIT (ie top of local runq)
+ */
+ if ( unit_runnable(unit) )
+ __runq_insert(scurr);
+ else
+ {
+ BUG_ON( is_idle_unit(unit) || list_empty(runq) );
+ /* Current has blocked. Update the runnable counter for this cpu. */
+ dec_nr_runnable(sched_cpu);
+ }
+
+ /*
+ * Clear YIELD flag before scheduling out
+ */
+ clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags);
+
+ do {
+ snext = __runq_elem(runq->next);
+
+ /* Tasklet work (which runs in idle UNIT context) overrides all else. */
+ if ( tasklet_work_scheduled )
+ {
+ TRACE_0D(TRC_CSCHED_SCHED_TASKLET);
+ snext = CSCHED_UNIT(sched_idle_unit(sched_cpu));
+ snext->pri = CSCHED_PRI_TS_BOOST;
+ }
+
+ /*
+ * SMP Load balance:
+ *
+ * If the next highest priority local runnable UNIT has already eaten
+ * through its credits, look on other PCPUs to see if we have more
+ * urgent work... If not, csched_load_balance() will return snext, but
+ * already removed from the runq.
+ */
+ if ( snext->pri > CSCHED_PRI_TS_OVER )
+ __runq_remove(snext);
+ else
+ snext = csched_load_balance(prv, sched_cpu, snext, &migrated);
+
+ } while ( !unit_runnable_state(snext->unit) );
+
+ /*
+ * Update idlers mask if necessary. When we're idling, other CPUs
+ * will tickle us when they get extra work.
+ */
+ if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE )
+ {
+ if ( !cpumask_test_cpu(sched_cpu, prv->idlers) )
+ cpumask_set_cpu(sched_cpu, prv->idlers);
+ }
+ else if ( cpumask_test_cpu(sched_cpu, prv->idlers) )
+ {
+ cpumask_clear_cpu(sched_cpu, prv->idlers);
+ }
+
+ if ( !is_idle_unit(snext->unit) )
+ snext->start_time += now;
+
+out:
+ /*
+ * Return task to run next...
+ */
+ unit->next_time = (is_idle_unit(snext->unit) ?
+ -1 : tslice);
+ unit->next_task = snext->unit;
+ snext->unit->migrated = migrated;
+
+ /* Stop credit tick when going to idle, restart it when coming from idle. */
+ if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) )
+ stop_timer(&spc->ticker);
+ if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) )
+ set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us)
+ - now % MICROSECS(prv->tick_period_us) );
+
+ CSCHED_UNIT_CHECK(unit->next_task);
+}
+
+static void
+csched_dump_unit(struct csched_unit *svc)
+{
+ struct csched_dom * const sdom = svc->sdom;
+
+ printk("[%i.%i] pri=%i flags=%x cpu=%i",
+ svc->unit->domain->domain_id,
+ svc->unit->unit_id,
+ svc->pri,
+ svc->flags,
+ sched_unit_master(svc->unit));
+
+ if ( sdom )
+ {
+ printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit),
+ sdom->weight, sdom->cap);
+#ifdef CSCHED_STATS
+ printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}",
+ svc->stats.credit_last,
+ svc->stats.credit_incr,
+ svc->stats.state_active,
+ svc->stats.state_idle,
+ svc->stats.migrate_q,
+ svc->stats.migrate_r,
+ svc->stats.kicked_away);
+#endif
+ }
+
+ printk("\n");
+}
+
+static void
+csched_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+ struct list_head *runq, *iter;
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ struct csched_pcpu *spc;
+ struct csched_unit *svc;
+ spinlock_t *lock;
+ unsigned long flags;
+ int loop;
+
+ /*
+ * We need both locks:
+ * - csched_dump_unit() wants to access domains' scheduling
+ * parameters, which are protected by the private scheduler lock;
+ * - we scan through the runqueue, so we need the proper runqueue
+ * lock (the one of the runqueue of this cpu).
+ */
+ spin_lock_irqsave(&prv->lock, flags);
+ lock = pcpu_schedule_lock(cpu);
+
+ spc = CSCHED_PCPU(cpu);
+ runq = &spc->runq;
+
+ printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n",
+ cpu, spc->nr_runnable, spc->runq_sort_last,
+ CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
+ CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
+
+ /* current UNIT (nothing to say if that's the idle unit). */
+ svc = CSCHED_UNIT(curr_on_cpu(cpu));
+ if ( svc && !is_idle_unit(svc->unit) )
+ {
+ printk("\trun: ");
+ csched_dump_unit(svc);
+ }
+
+ loop = 0;
+ list_for_each( iter, runq )
+ {
+ svc = __runq_elem(iter);
+ if ( svc )
+ {
+ printk("\t%3d: ", ++loop);
+ csched_dump_unit(svc);
+ }
+ }
+
+ pcpu_schedule_unlock(lock, cpu);
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void
+csched_dump(const struct scheduler *ops)
+{
+ struct list_head *iter_sdom, *iter_svc;
+ struct csched_private *prv = CSCHED_PRIV(ops);
+ int loop;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ printk("info:\n"
+ "\tncpus = %u\n"
+ "\tmaster = %u\n"
+ "\tcredit = %u\n"
+ "\tcredit balance = %d\n"
+ "\tweight = %u\n"
+ "\trunq_sort = %u\n"
+ "\tdefault-weight = %d\n"
+ "\ttslice = %"PRI_stime"ms\n"
+ "\tratelimit = %"PRI_stime"us\n"
+ "\tcredits per msec = %d\n"
+ "\tticks per tslice = %d\n"
+ "\tmigration delay = %"PRI_stime"us\n",
+ prv->ncpus,
+ prv->master,
+ prv->credit,
+ prv->credit_balance,
+ prv->weight,
+ prv->runq_sort,
+ CSCHED_DEFAULT_WEIGHT,
+ prv->tslice / MILLISECS(1),
+ prv->ratelimit / MICROSECS(1),
+ CSCHED_CREDITS_PER_MSEC,
+ prv->ticks_per_tslice,
+ prv->unit_migr_delay / MICROSECS(1));
+
+ printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers));
+
+ printk("active units:\n");
+ loop = 0;
+ list_for_each( iter_sdom, &prv->active_sdom )
+ {
+ struct csched_dom *sdom;
+ sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+
+ list_for_each( iter_svc, &sdom->active_unit )
+ {
+ struct csched_unit *svc;
+ spinlock_t *lock;
+
+ svc = list_entry(iter_svc, struct csched_unit, active_unit_elem);
+ lock = unit_schedule_lock(svc->unit);
+
+ printk("\t%3d: ", ++loop);
+ csched_dump_unit(svc);
+
+ unit_schedule_unlock(lock, svc->unit);
+ }
+ }
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static int __init
+csched_global_init(void)
+{
+ if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX ||
+ sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN )
+ {
+ printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n"
+ " Resetting to default %u\n",
+ XEN_SYSCTL_CSCHED_TSLICE_MIN,
+ XEN_SYSCTL_CSCHED_TSLICE_MAX,
+ CSCHED_DEFAULT_TSLICE_MS);
+ sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
+ }
+
+ if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
+ printk("WARNING: sched_ratelimit_us > sched_credit_tslice_ms is undefined\n"
+ "Setting ratelimit to tslice\n");
+
+ if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US )
+ {
+ vcpu_migration_delay_us = 0;
+ printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n"
+ "Resetting to default: %u\n",
+ XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us);
+ }
+
+ return 0;
+}
+
+static int
+csched_init(struct scheduler *ops)
+{
+ struct csched_private *prv;
+
+ prv = xzalloc(struct csched_private);
+ if ( prv == NULL )
+ return -ENOMEM;
+
+ prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES);
+ if ( prv->balance_bias == NULL )
+ {
+ xfree(prv);
+ return -ENOMEM;
+ }
+
+ if ( !zalloc_cpumask_var(&prv->cpus) ||
+ !zalloc_cpumask_var(&prv->idlers) )
+ {
+ free_cpumask_var(prv->cpus);
+ xfree(prv->balance_bias);
+ xfree(prv);
+ return -ENOMEM;
+ }
+
+ ops->sched_data = prv;
+ spin_lock_init(&prv->lock);
+ INIT_LIST_HEAD(&prv->active_sdom);
+ prv->master = UINT_MAX;
+
+ __csched_set_tslice(prv, sched_credit_tslice_ms);
+
+ if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
+ prv->ratelimit = prv->tslice;
+ else
+ prv->ratelimit = MICROSECS(sched_ratelimit_us);
+
+ prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us);
+
+ return 0;
+}
+
+static void
+csched_deinit(struct scheduler *ops)
+{
+ struct csched_private *prv;
+
+ prv = CSCHED_PRIV(ops);
+ if ( prv != NULL )
+ {
+ ops->sched_data = NULL;
+ free_cpumask_var(prv->cpus);
+ free_cpumask_var(prv->idlers);
+ xfree(prv->balance_bias);
+ xfree(prv);
+ }
+}
+
+static const struct scheduler sched_credit_def = {
+ .name = "SMP Credit Scheduler",
+ .opt_name = "credit",
+ .sched_id = XEN_SCHEDULER_CREDIT,
+ .sched_data = NULL,
+
+ .global_init = csched_global_init,
+
+ .insert_unit = csched_unit_insert,
+ .remove_unit = csched_unit_remove,
+
+ .sleep = csched_unit_sleep,
+ .wake = csched_unit_wake,
+ .yield = csched_unit_yield,
+
+ .adjust = csched_dom_cntl,
+ .adjust_affinity= csched_aff_cntl,
+ .adjust_global = csched_sys_cntl,
+
+ .pick_resource = csched_res_pick,
+ .do_schedule = csched_schedule,
+
+ .dump_cpu_state = csched_dump_pcpu,
+ .dump_settings = csched_dump,
+ .init = csched_init,
+ .deinit = csched_deinit,
+ .alloc_udata = csched_alloc_udata,
+ .free_udata = csched_free_udata,
+ .alloc_pdata = csched_alloc_pdata,
+ .init_pdata = csched_init_pdata,
+ .deinit_pdata = csched_deinit_pdata,
+ .free_pdata = csched_free_pdata,
+ .switch_sched = csched_switch_sched,
+ .alloc_domdata = csched_alloc_domdata,
+ .free_domdata = csched_free_domdata,
+};
+
+REGISTER_SCHEDULER(sched_credit_def);
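+
+/*
+ * Usage note (added for illustration; not part of the original file): this
+ * scheduler is selected system-wide by booting Xen with "sched=credit"
+ * (matching the opt_name above), or per-cpupool by creating a cpupool that
+ * uses the "credit" scheduler, e.g. something along the lines of:
+ *
+ *   xl cpupool-create name="pool-credit" sched="credit"
+ *
+ * (the exact xl invocation may differ depending on the toolstack version).
+ */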
--- /dev/null
+
+/****************************************************************************
+ * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
+ ****************************************************************************
+ *
+ * File: common/sched_credit2.c
+ * Author: George Dunlap
+ *
+ * Description: Credit-based SMP CPU scheduler
+ * Based on an earlier version by Emmanuel Ackaouy.
+ */
+
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/div64.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/cpu.h>
+#include <xen/keyhandler.h>
+
+/* Meant only for helping developers during debugging. */
+/* #define d2printk printk */
+#define d2printk(x...)
+
+
+/*
+ * Credit2 tracing events ("only" 512 available!). Check
+ * include/public/trace.h for more details.
+ */
+#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1)
+#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2)
+#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3)
+#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4)
+#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5)
+#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6)
+#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7)
+#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8)
+#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9)
+#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10)
+#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11)
+#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12)
+#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13)
+#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14)
+#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15)
+#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16)
+#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17)
+#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19)
+#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20)
+#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21)
+#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22)
+#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23)
+
+/*
+ * TODO:
+ * + Hyperthreading
+ * - "Discount" time run on a thread with busy siblings
+ * + Algorithm:
+ * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
+ * a flash animation in the background), can we schedule it with low enough latency
+ * so that audio doesn't skip?
+ * + Optimizing
+ * - Profiling, making new algorithms, making math more efficient (no long division)
+ */
+
+/*
+ * Design:
+ *
+ * VMs "burn" credits based on their weight; higher weight means
+ * credits burn more slowly. The highest weight unit burns credits at
+ * a rate of 1 credit per nanosecond. Others burn proportionally
+ * more.
+ *
+ * units are inserted into the runqueue by credit order.
+ *
+ * Credits are "reset" when the next unit in the runqueue is less than
+ * or equal to zero. At that point, everyone's credits are "clipped"
+ * to a small value, and a fixed credit is added to everyone.
+ */
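+
+/*
+ * Illustrative numbers only: if the highest weight in the system is 512 and
+ * a unit has weight 256, that unit burns credits at roughly twice the rate
+ * of the highest-weight unit (about 2 credits per nanosecond instead of 1),
+ * so, under full contention, it ends up with roughly half as much CPU time
+ * between credit resets.
+ */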
+
+/*
+ * Utilization cap:
+ *
+ * Setting a pCPU utilization cap for a domain means the following:
+ *
+ * - a domain can have a cap, expressed in terms of % of physical CPU time.
+ * A domain that must not use more than 1/4 of _one_ physical CPU, will
+ * be given a cap of 25%; a domain that must not use more than 1+1/2 of
+ * physical CPU time, will be given a cap of 150%;
+ *
+ * - caps are per-domain (not per-unit). If a domain has only 1 unit, and
+ * a 40% cap, that one unit will use 40% of one pCPU. If a domain has 4
+ * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be
+ * split among the 4 units. How much each of the units will actually get,
+ * during any given interval of time, is unspecified (as it depends on
+ * various aspects: workload, system load, etc.). For instance, it is
+ * possible that, during a given time interval, 2 units use 100% each,
+ * and the other two use nothing; while during another time interval,
+ * two units use 80%, one uses 10% and the other 30%; or that each use
+ * 50% (and so on and so forth).
+ *
+ * For implementing this, we use the following approach:
+ *
+ * - each domain is given a 'budget', and each domain has a timer, which
+ * replenishes the domain's budget periodically. The budget is the amount
+ * of time the units of the domain can use every 'period';
+ *
+ * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
+ * (but each domain has its own timer; so they are all periodic with the same
+ * period, but replenishments of the budgets of the various domains, at
+ * period boundaries, are not synchronous);
+ *
+ * - when units run, they consume budget. When they don't run, they don't
+ * consume budget. If there is no budget left for the domain, no unit of
+ * that domain can run. If a unit tries to run and finds that there is no
+ * budget, it blocks.
+ * At whatever time a unit wants to run, it must check the domain's budget,
+ * and if there is some, it can use it.
+ *
+ * - budget is replenished to the top of the capacity for the domain once
+ * per period. Even if there was some leftover budget from the previous
+ * period, though, the budget after a replenishment will always be at most
+ * equal to the total capacity of the domain ('tot_budget');
+ *
+ * - when a budget replenishment occurs, if there are units that had been
+ * blocked because of lack of budget, they'll be unblocked, and they will
+ * (potentially) be able to run again.
+ *
+ * Finally, some even more implementation related detail:
+ *
+ * - budget is stored in a domain-wide pool. Units of the domain that want
+ * to run go to such pool, and grab some. When they do so, the amount
+ * they grabbed is _immediately_ removed from the pool. This happens in
+ * unit_grab_budget();
+ *
+ * - when units stop running, if they've not consumed all the budget they
+ * took, the leftover is put back in the pool. This happens in
+ * unit_return_budget();
+ *
+ * - the above means that a unit can find out that there is no budget and
+ * block, not only if the cap has actually been reached (for this period),
+ * but also if some other units, in order to run, have grabbed a certain
+ * quota of budget, no matter whether they've already used it all or not.
+ * A unit blocking because of (any form of) lack of budget is said to be
+ * "parked", and such blocking happens in park_unit();
+ *
+ * - when a unit stops running, and puts back some budget in the domain pool,
+ * we need to check whether there is someone which has been parked and that
+ * can be unparked. This happens in unpark_parked_units(), called from
+ * csched2_context_saved();
+ *
+ * - of course, unparking happens also as a consequence of the domain's budget
+ * being replenished by the periodic timer. This also occurs by means of
+ * calling csched2_context_saved() (but from replenish_domain_budget());
+ *
+ * - parked units of a domain are kept in a (per-domain) list, called
+ * 'parked_units'. Manipulation of the list and of the domain-wide budget
+ * pool, must occur only when holding the 'budget_lock'.
+ */
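+
+/*
+ * Worked example of the above (the figures are illustrative assumptions):
+ * a domain with 4 units, a 200% cap and a 10ms replenishment period gets
+ * tot_budget = 200% of 10ms = 20ms of budget per period. All 4 units draw
+ * from that single pool: they may use 5ms each, or one may grab most of it
+ * while the others run little. Once the 20ms are gone, any unit of the
+ * domain that wants to run is parked until the periodic timer refills the
+ * pool.
+ */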
+
+/*
+ * Locking:
+ *
+ * - runqueue lock
+ * + it is per-runqueue, so:
+ * * cpus in a runqueue take the runqueue lock, when using
+ * pcpu_schedule_lock() / unit_schedule_lock() (and friends),
+ * * a cpu may (try to) take a "remote" runqueue lock, e.g., for
+ * load balancing;
+ * + serializes runqueue operations (removing and inserting units);
+ * + protects runqueue-wide data in csched2_runqueue_data;
+ * + protects unit parameters in csched2_unit for the unit in the
+ * runqueue.
+ *
+ * - Private scheduler lock
+ * + protects scheduler-wide data in csched2_private, such as:
+ * * the list of domains active in this scheduler,
+ * * what cpus and what runqueues are active and in what
+ * runqueue each cpu is;
+ * + serializes the operation of changing the weights of domains;
+ *
+ * - Budget lock
+ * + it is per-domain;
+ * + protects, in domains that have a utilization cap:
+ * * manipulation of the total budget of the domain (as it is shared
+ * among all units of the domain),
+ * * manipulation of the list of units that are blocked waiting for
+ * some budget to be available.
+ *
+ * - Type:
+ * + runqueue locks are 'regular' spinlocks;
+ * + the private scheduler lock can be an rwlock. In fact, data
+ * it protects is modified only during initialization, cpupool
+ * manipulation and when changing weights, and read in all
+ * other cases (e.g., during load balancing);
+ * + budget locks are 'regular' spinlocks.
+ *
+ * Ordering:
+ * + trylock must be used when wanting to take a runqueue lock,
+ * if we already hold another one;
+ * + if taking both a runqueue lock and the private scheduler
+ * lock, the latter must always be taken first;
+ * + if taking both a runqueue lock and a budget lock, the former
+ * must always be taken first.
+ */
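+
+/*
+ * Minimal sketch of the ordering rules above (pseudocode only, not lifted
+ * from the functions below):
+ *
+ *   read_lock(&prv->lock);            <- private scheduler lock first ...
+ *   lock = pcpu_schedule_lock(cpu);   <- ... then a runqueue lock
+ *   ... work on the runqueue ...
+ *   pcpu_schedule_unlock(lock, cpu);
+ *   read_unlock(&prv->lock);
+ *
+ * Likewise, a runqueue lock is taken before a budget lock, and a second
+ * runqueue lock is only ever acquired via trylock while another one is
+ * already held.
+ */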
+
+/*
+ * Basic constants
+ */
+/* Default weight: How much a new domain starts with. */
+#define CSCHED2_DEFAULT_WEIGHT 256
+/*
+ * Min timer: Minimum length for which a timer will be set, to
+ * achieve efficiency.
+ */
+#define CSCHED2_MIN_TIMER MICROSECS(500)
+/*
+ * Amount of credit VMs begin with, and are reset to.
+ * ATM, set so that highest-weight VMs can only run for 10ms
+ * before a reset event.
+ */
+#define CSCHED2_CREDIT_INIT MILLISECS(10)
+/*
+ * Amount of credit the idle units have. It never changes, as idle
+ * units do not consume credits, and it must be lower than whatever
+ * amount of credit a 'regular' unit would end up with.
+ */
+#define CSCHED2_IDLE_CREDIT (-(1U<<30))
+/*
+ * Carryover: How much "extra" credit may be carried over after
+ * a reset.
+ */
+#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER
+/*
+ * Stickiness: Cross-L2 migration resistance. Should be less than
+ * MIN_TIMER.
+ */
+#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1))
+/* How much to "compensate" a unit for L2 migration. */
+#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
+/* How tolerant we should be when peeking at runtime of units on other cpus */
+#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
+/* Reset: Value below which credit will be reset. */
+#define CSCHED2_CREDIT_RESET 0
+/* Max timer: Maximum time a guest can be run for. */
+#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT
+/* Period of the cap replenishment timer. */
+#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1))
+
+/*
+ * Flags
+ */
+/*
+ * CSFLAG_scheduled: Is this unit either running on, or context-switching off,
+ * a physical cpu?
+ * + Accessed only with runqueue lock held
+ * + Set when chosen as next in csched2_schedule().
+ * + Cleared after context switch has been saved in csched2_context_saved()
+ * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
+ * set CSFLAG_delayed_runq_add
+ * + Checked to be false in runq_insert.
+ */
+#define __CSFLAG_scheduled 1
+#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled)
+/*
+ * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done
+ * being context switched out?
+ * + Set when scheduling out in csched2_schedule() if prev is runnable
+ * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set
+ * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and
+ * clears the bit.
+ */
+#define __CSFLAG_delayed_runq_add 2
+#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add)
+/*
+ * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a
+ * credit2-initiated runq migrate request; migrate it to the runqueue indicated
+ * in the svc struct.
+ */
+#define __CSFLAG_runq_migrate_request 3
+#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request)
+/*
+ * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The
+ * scheduler is invoked to see if we can give the cpu to someone else, and
+ * get back to the yielding unit in a while.
+ */
+#define __CSFLAG_unit_yield 4
+#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield)
+/*
+ * CSFLAG_pinned: this unit is currently 'pinned', i.e., has its hard
+ * affinity set to one and only one cpu (and, hence, can only run there).
+ */
+#define __CSFLAG_pinned 5
+#define CSFLAG_pinned (1U<<__CSFLAG_pinned)
+
+static unsigned int __read_mostly opt_migrate_resist = 500;
+integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
+
+/*
+ * Load tracking and load balancing
+ *
+ * Load history of runqueues and units is accounted for by using an
+ * exponentially weighted moving average algorithm. However, instead of using
+ * fractions, we shift everything to the left by the number of bits we want to
+ * use for representing the fractional part (Q-format).
+ *
+ * We may also want to reduce the precision of time accounting, to
+ * accommodate 'longer windows'. So, if that is the case, we just need to
+ * shift all time samples to the right.
+ *
+ * The details of the formulas used for load tracking are explained close to
+ * update_runq_load(). Let's just say here that, with full nanosecond time
+ * granularity, a 30 bits wide 'decaying window' is ~1 second long.
+ *
+ * We want to consider the following equations:
+ *
+ * avg[0] = load*P
+ * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W
+ *
+ * where W is the length of the window, P the multiplier for transitioning into
+ * Q-format fixed point arithmetic and load is the instantaneous load of a
+ * runqueue, which basically is the number of runnable units there are on the
+ * runqueue (for the meaning of the other terms, look at the doc comment to
+ * update_runq_load()).
+ *
+ * So, again, with full nanosecond granularity, and 1 second window, we have:
+ *
+ * W = 2^30
+ * P = 2^18
+ *
+ * The maximum possible value for the average load, which we want to store in
+ * s_time_t type variables (i.e., we have 63 bits available) is load*P. This
+ * means that, with P 18 bits wide, load can occupy 45 bits. This in turn
+ * means we can have 2^45 units in each runqueue, before overflow occurs!
+ *
+ * However, it can happen that, at step j+1, if:
+ *
+ * avg[j] = load*P
+ * delta = W
+ *
+ * then:
+ *
+ * avg[j+1] = avg[j] + W*load*P/W - W*load*P/W
+ *
+ * So we must be able to deal with W*load*P. This means load can't be higher
+ * than:
+ *
+ * 2^(63 - 30 - 18) = 2^15 = 32768
+ *
+ * So 32768 is the maximum number of units that we can have in a runqueue,
+ * at any given time, and still not have problems with the load tracking
+ * calculations... and this is more than fine.
+ *
+ * As a matter of fact, since we are using microseconds granularity, we have
+ * W=2^20. So, still with 18 fractional bits and a 1 second long window, there
+ * may be 2^25 = 33554432 units in a runq before we have to start thinking
+ * about overflow.
+ */
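+
+/*
+ * Purely illustrative worked instance of the formula above (numbers are
+ * made up, assuming the defaults discussed here, i.e., W = 2^20 and
+ * P = 2^18): if the current average is 2.0 (avg[i] = 2*P), the
+ * instantaneous load is 4 runnable units, and half a window has passed
+ * (delta = W/2 = 2^19), then:
+ *
+ * avg[i+1] = 2*P + (2^19 * 4 * 2^18)/2^20 - (2^19 * 2 * 2^18)/2^20
+ * = 2*P + 2*P - P = 3*P
+ *
+ * i.e., the average moves exactly half way from 2.0 towards 4.0, as one
+ * would expect for delta = W/2.
+ */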
+
+/* If >0, decreases the granularity of time samples used for load tracking. */
+#define LOADAVG_GRANULARITY_SHIFT (10)
+/* Time window during which we still give value to previous load history. */
+#define LOADAVG_WINDOW_SHIFT (30)
+/* 18 bits by default (and not less than 4) for decimals. */
+#define LOADAVG_PRECISION_SHIFT (18)
+#define LOADAVG_PRECISION_SHIFT_MIN (4)
+
+/*
+ * Both the length of the window and the number of fractional bits can be
+ * decided with boot parameters.
+ *
+ * The length of the window is always expressed in nanoseconds. The actual
+ * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT.
+ */
+static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
+integer_param("credit2_load_window_shift", opt_load_window_shift);
+static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT;
+integer_param("credit2_load_precision_shift", opt_load_precision_shift);
+
+static int __read_mostly opt_underload_balance_tolerance = 0;
+integer_param("credit2_balance_under", opt_underload_balance_tolerance);
+static int __read_mostly opt_overload_balance_tolerance = -3;
+integer_param("credit2_balance_over", opt_overload_balance_tolerance);
+/*
+ * Domains subject to a cap receive a replenishment of their runtime budget
+ * once every opt_cap_period interval. Default is 10 ms. The amount of budget
+ * they receive depends on their cap. For instance, a domain with a 50% cap
+ * will receive 50% of 10 ms, so 5 ms.
+ */
+static unsigned int __read_mostly opt_cap_period = 10; /* ms */
+integer_param("credit2_cap_period_ms", opt_cap_period);
+
+/*
+ * Runqueue organization.
+ *
+ * The various cpus are to be assigned each one to a runqueue, and we
+ * want that to happen based on topology. At the moment, it is possible
+ * to choose to arrange runqueues to be:
+ *
+ * - per-cpu: meaning that there will be one runqueue per logical cpu. This
+ * will happen if the opt_runqueue parameter is set to 'cpu';
+ *
+ * - per-core: meaning that there will be one runqueue per each physical
+ * core of the host. This will happen if the opt_runqueue
+ * parameter is set to 'core';
+ *
+ * - per-socket: meaning that there will be one runqueue per each physical
+ * socket (AKA package, which often, but not always, also
+ * matches a NUMA node) of the host. This will happen if
+ * the opt_runqueue parameter is set to 'socket';
+ *
+ * - per-node: meaning that there will be one runqueue per each physical
+ * NUMA node of the host. This will happen if the opt_runqueue
+ * parameter is set to 'node';
+ *
+ * - global: meaning that there will be only one runqueue to which all the
+ * (logical) processors of the host belong. This will happen if
+ * the opt_runqueue parameter is set to 'all'.
+ *
+ * Depending on the value of opt_runqueue, therefore, cpus that are part of
+ * either the same physical core, the same physical socket, the same NUMA
+ * node, or just all of them, will be put together to form runqueues.
+ */
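+/*
+ * For instance (hypothetical, symmetric topology): on a host with 2 sockets,
+ * 4 cores per socket and 2 threads per core (16 logical cpus in total),
+ * 'cpu' gives 16 runqueues of 1 cpu each, 'core' gives 8 runqueues of 2
+ * cpus, 'socket' (the default) gives 2 runqueues of 8 cpus, and 'all'
+ * gives a single runqueue containing all 16 cpus.
+ */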
+#define OPT_RUNQUEUE_CPU 0
+#define OPT_RUNQUEUE_CORE 1
+#define OPT_RUNQUEUE_SOCKET 2
+#define OPT_RUNQUEUE_NODE 3
+#define OPT_RUNQUEUE_ALL 4
+static const char *const opt_runqueue_str[] = {
+ [OPT_RUNQUEUE_CPU] = "cpu",
+ [OPT_RUNQUEUE_CORE] = "core",
+ [OPT_RUNQUEUE_SOCKET] = "socket",
+ [OPT_RUNQUEUE_NODE] = "node",
+ [OPT_RUNQUEUE_ALL] = "all"
+};
+static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET;
+
+static int __init parse_credit2_runqueue(const char *s)
+{
+ unsigned int i;
+
+ for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ )
+ {
+ if ( !strcmp(s, opt_runqueue_str[i]) )
+ {
+ opt_runqueue = i;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+custom_param("credit2_runqueue", parse_credit2_runqueue);
+
+/*
+ * Per-runqueue data
+ */
+struct csched2_runqueue_data {
+ spinlock_t lock; /* Lock for this runqueue */
+
+ struct list_head runq; /* Ordered list of runnable units */
+ unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */
+ int id; /* ID of this runqueue (-1 if invalid) */
+
+ int load; /* Instantaneous load (num of non-idle units) */
+ s_time_t load_last_update; /* Last time average was updated */
+ s_time_t avgload; /* Decaying queue load */
+ s_time_t b_avgload; /* Decaying queue load modified by balancing */
+
+ cpumask_t active, /* CPUs enabled for this runqueue */
+ smt_idle, /* Fully idle-and-untickled cores (see below) */
+ tickled, /* Have been asked to go through schedule */
+ idle; /* Currently idle pcpus */
+
+ struct list_head svc; /* List of all units assigned to the runqueue */
+ unsigned int max_weight; /* Max weight of the units in this runqueue */
+ unsigned int pick_bias; /* Last picked pcpu. Start from it next time */
+};
+
+/*
+ * System-wide private data
+ */
+struct csched2_private {
+ rwlock_t lock; /* Private scheduler lock */
+
+ unsigned int load_precision_shift; /* Precision of load calculations */
+ unsigned int load_window_shift; /* Length of load decaying window */
+ unsigned int ratelimit_us; /* Rate limiting for this scheduler */
+
+ cpumask_t active_queues; /* Runqueues with (maybe) active cpus */
+ struct csched2_runqueue_data *rqd; /* Data of the various runqueues */
+
+ cpumask_t initialized; /* CPUs part of this scheduler */
+ struct list_head sdom; /* List of domains (for debug key) */
+};
+
+/*
+ * Physical CPU
+ */
+struct csched2_pcpu {
+ cpumask_t sibling_mask; /* Siblings in the same runqueue */
+ int runq_id;
+};
+
+/*
+ * Schedule Unit
+ */
+struct csched2_unit {
+ struct csched2_dom *sdom; /* Up-pointer to domain */
+ struct sched_unit *unit; /* Up-pointer, to schedule unit */
+ struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */
+
+ int credit; /* Current amount of credit */
+ unsigned int weight; /* Weight of this unit */
+ unsigned int residual; /* Remainder of div(max_weight/weight) */
+ unsigned flags; /* Status flags (16 bits would be ok, */
+ /* but clear_bit() does not like that) */
+ s_time_t budget; /* Current budget (if domain has cap) */
+ s_time_t budget_quota; /* Budget to which unit is entitled */
+
+ s_time_t start_time; /* Time we were scheduled (for credit) */
+
+ /* Individual contribution to load */
+ s_time_t load_last_update; /* Last time average was updated */
+ s_time_t avgload; /* Decaying queue load */
+
+ struct list_head runq_elem; /* On the runqueue (rqd->runq) */
+ struct list_head parked_elem; /* On the parked_units list */
+ struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */
+ struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */
+ int tickled_cpu; /* Cpu that will pick us (-1 if none) */
+};
+
+/*
+ * Domain
+ */
+struct csched2_dom {
+ struct domain *dom; /* Up-pointer to domain */
+
+ spinlock_t budget_lock; /* Serialized budget calculations */
+ s_time_t tot_budget; /* Total amount of budget */
+ s_time_t budget; /* Currently available budget */
+
+ struct timer repl_timer; /* Timer for periodic replenishment of budget */
+ s_time_t next_repl; /* Time at which next replenishment occurs */
+ struct list_head parked_units; /* List of units waiting for budget */
+
+ struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */
+ uint16_t weight; /* User specified weight */
+ uint16_t cap; /* User specified cap */
+ uint16_t nr_units; /* Number of units of this domain */
+};
+
+/*
+ * Accessor helper functions.
+ */
+static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
+{
+ return ops->sched_data;
+}
+
+static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
+{
+ return get_sched_res(cpu)->sched_priv;
+}
+
+static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit)
+{
+ return unit->priv;
+}
+
+static inline struct csched2_dom *csched2_dom(const struct domain *d)
+{
+ return d->sched_priv;
+}
+
+/* CPU to runq_id helper */
+static inline int c2r(unsigned int cpu)
+{
+ return csched2_pcpu(cpu)->runq_id;
+}
+
+/* CPU to runqueue struct helper */
+static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
+ unsigned int cpu)
+{
+ return &csched2_priv(ops)->rqd[c2r(cpu)];
+}
+
+/* Does the domain of this unit have a cap? */
+static inline bool has_cap(const struct csched2_unit *svc)
+{
+ return svc->budget != STIME_MAX;
+}
+
+/*
+ * Hyperthreading (SMT) support.
+ *
+ * We use a special per-runq mask (smt_idle) and update it according to the
+ * following logic:
+ * - when _all_ the SMT siblings in a core are idle, all their corresponding
+ * bits are set in the smt_idle mask;
+ * - when even _just_one_ of the SMT siblings in a core is not idle, all the
+ * bits corresponding to it and to all its siblings are clear in the
+ * smt_idle mask.
+ *
+ * Once we have such a mask, it is easy to implement a policy that, either:
+ * - uses fully idle cores first: it is enough to try to schedule the units
+ * on pcpus from smt_idle mask first. This is what happens if
+ * sched_smt_power_savings was not set at boot (default), and it maximizes
+ * true parallelism, and hence performance;
+ * - uses already busy cores first: it is enough to try to schedule the units
+ * on pcpus that are idle, but are not in smt_idle. This is what happens if
+ * sched_smt_power_savings is set at boot, and it allows as many cores as
+ * possible to stay in low power states, minimizing power consumption.
+ *
+ * This logic is entirely implemented in runq_tickle(), and that is enough.
+ * In fact, in this scheduler, placement of a unit on one of the pcpus of a
+ * runq, _always_ happens by means of tickling:
+ * - when a unit wakes up, it calls csched2_unit_wake(), which calls
+ * runq_tickle();
+ * - when a migration is initiated in schedule.c, we call csched2_res_pick(),
+ * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake().
+ * csched2_res_pick() looks for the least loaded runq and returns just any
+ * of its processors. Then, csched2_unit_migrate() just moves the unit to
+ * the chosen runq, and it is again runq_tickle(), called by
+ * csched2_unit_wake(), that actually decides what pcpu to use within the
+ * chosen runq;
+ * - when a migration is initiated in this file, by calling migrate()
+ * directly, that again temporarily uses a random pcpu from the new runq,
+ * and then calls runq_tickle() by itself.
+ */
+
+/*
+ * If all the siblings of cpu (including cpu itself) are both idle and
+ * untickled, set all their bits in mask.
+ *
+ * NB that rqd->smt_idle is different than rqd->idle. rqd->idle
+ * records pcpus that are merely idle (i.e., at the moment do not
+ * have a unit running on them). But you have to manually filter out
+ * which pcpus have been tickled in order to find cores that are not
+ * going to be busy soon. Filtering out tickled cpus pairwise is a
+ * lot of extra pain; so for rqd->smt_idle, we explicitly make it so that
+ * the bits of a pcpu are set only if all the threads on its core are
+ * both idle *and* untickled.
+ *
+ * This means changing the mask when either rqd->idle or rqd->tickled
+ * changes.
+ */
+static inline
+void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers,
+ cpumask_t *mask)
+{
+ const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask;
+
+ if ( cpumask_subset(cpu_siblings, idlers) )
+ cpumask_or(mask, mask, cpu_siblings);
+}
+
+/*
+ * Clear the bits of all the siblings of cpu from mask (if necessary).
+ */
+static inline
+void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask)
+{
+ const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask;
+
+ if ( cpumask_subset(cpu_siblings, mask) )
+ cpumask_andnot(mask, mask, cpu_siblings);
+}
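+
+/*
+ * Illustrative example (hypothetical 2-thread core made of cpus 4 and 5):
+ * when smt_idle_mask_set() is called with the idle-and-untickled cpus as
+ * 'idlers', and both 4 and 5 are in there, bits 4 and 5 get set in
+ * rqd->smt_idle. As soon as, say, cpu 5 is tickled or starts running a
+ * unit, smt_idle_mask_clear(5, &rqd->smt_idle) clears both bits, as the
+ * core is no longer fully idle and untickled.
+ */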
+
+/*
+ * In csched2_res_pick(), it may not be possible to actually look at remote
+ * runqueues (the trylock-s on their spinlocks can fail!). If that happens,
+ * we pick, in order of decreasing preference:
+ * 1) svc's current pcpu, if it is part of svc's soft affinity;
+ * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity;
+ * 3) svc's current pcpu, if it is part of svc's hard affinity;
+ * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity;
+ * 5) just one valid pcpu from svc's hard affinity
+ *
+ * Of course, 1 and 2 make sense only if svc has a soft affinity. Also
+ * note that step 5 is guaranteed to _always_ return at least one pcpu.
+ */
+static int get_fallback_cpu(struct csched2_unit *svc)
+{
+ struct sched_unit *unit = svc->unit;
+ unsigned int bs;
+
+ SCHED_STAT_CRANK(need_fallback_cpu);
+
+ for_each_affinity_balance_step( bs )
+ {
+ int cpu = sched_unit_master(unit);
+
+ if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
+ continue;
+
+ affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ cpupool_domain_master_cpumask(unit->domain));
+
+ /*
+ * This is case 1 or 3 (depending on bs): if the processor is (still)
+ * in our affinity, go for it, for better cache locality.
+ */
+ if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
+ return cpu;
+
+ /*
+ * This is case 2 or 4 (depending on bs): our processor isn't there
+ * any longer, so check if we can at least stay in our current runq.
+ */
+ if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu),
+ &svc->rqd->active)) )
+ {
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &svc->rqd->active);
+ return cpumask_first(cpumask_scratch_cpu(cpu));
+ }
+
+ /*
+ * We may well pick any valid pcpu from our soft-affinity, outside
+ * of our current runqueue, but we decide not to. In fact, changing
+ * runqueue is slow, affects load distribution, and is a source of
+ * overhead for the units running on the other runqueue (we need the
+ * lock). So, better do that as a consequence of a well informed
+ * decision (or if we really don't have any other chance, as we will,
+ * at step 5, if we get to there).
+ *
+ * Also, being here, looking for a fallback, is an unfortunate and
+ * infrequent event, while the decision of putting us in the runqueue
+ * where we are was (likely) made taking all the relevant factors
+ * into account. So let's not disrupt that, just for the sake of
+ * soft-affinity, and let's wait here to be able to make (hopefully
+ * soon) another similar well informed decision.
+ */
+ if ( bs == BALANCE_SOFT_AFFINITY )
+ continue;
+
+ /*
+ * This is case 5: last stand, just one valid pcpu from our hard
+ * affinity. It's guaranteed that there is at least one valid cpu,
+ * and therefore we are sure that we return it, and never really
+ * exit the loop.
+ */
+ ASSERT(bs == BALANCE_HARD_AFFINITY &&
+ !cpumask_empty(cpumask_scratch_cpu(cpu)));
+ cpu = cpumask_first(cpumask_scratch_cpu(cpu));
+ if ( likely(cpu < nr_cpu_ids) )
+ return cpu;
+ }
+ ASSERT_UNREACHABLE();
+ /*
+ * We can't be here. But if that somehow happens (in non-debug builds),
+ * at least return something which is both online and in our hard-affinity.
+ */
+ return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit)));
+}
+
+/*
+ * Time-to-credit, credit-to-time.
+ *
+ * We keep track of the "residual" time to make sure that frequent short
+ * schedules still get accounted for in the end.
+ *
+ * FIXME: Do pre-calculated division?
+ */
+static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time,
+ struct csched2_unit *svc)
+{
+ uint64_t val = time * rqd->max_weight + svc->residual;
+
+ svc->residual = do_div(val, svc->weight);
+ svc->credit -= val;
+}
+
+static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc)
+{
+ return credit * svc->weight / rqd->max_weight;
+}
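+
+/*
+ * Worked example (numbers picked purely for illustration): with
+ * rqd->max_weight = 512 and svc->weight = 256, one millisecond of run time
+ * passed to t2c_update() burns (1ms * 512) / 256 = 2ms worth of credit,
+ * with the remainder of the division carried in svc->residual for the
+ * next update. Conversely, c2t() turns 2ms of credit back into 1ms of
+ * run time.
+ */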
+
+/*
+ * Runqueue related code.
+ */
+
+static inline int unit_on_runq(struct csched2_unit *svc)
+{
+ return !list_empty(&svc->runq_elem);
+}
+
+static inline struct csched2_unit * runq_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct csched2_unit, runq_elem);
+}
+
+static void activate_runqueue(struct csched2_private *prv, int rqi)
+{
+ struct csched2_runqueue_data *rqd;
+
+ rqd = prv->rqd + rqi;
+
+ BUG_ON(!cpumask_empty(&rqd->active));
+
+ rqd->max_weight = 1;
+ rqd->id = rqi;
+ INIT_LIST_HEAD(&rqd->svc);
+ INIT_LIST_HEAD(&rqd->runq);
+ spin_lock_init(&rqd->lock);
+
+ __cpumask_set_cpu(rqi, &prv->active_queues);
+}
+
+static void deactivate_runqueue(struct csched2_private *prv, int rqi)
+{
+ struct csched2_runqueue_data *rqd;
+
+ rqd = prv->rqd + rqi;
+
+ BUG_ON(!cpumask_empty(&rqd->active));
+
+ rqd->id = -1;
+
+ __cpumask_clear_cpu(rqi, &prv->active_queues);
+}
+
+static inline bool same_node(unsigned int cpua, unsigned int cpub)
+{
+ return cpu_to_node(cpua) == cpu_to_node(cpub);
+}
+
+static inline bool same_socket(unsigned int cpua, unsigned int cpub)
+{
+ return cpu_to_socket(cpua) == cpu_to_socket(cpub);
+}
+
+static inline bool same_core(unsigned int cpua, unsigned int cpub)
+{
+ return same_socket(cpua, cpub) &&
+ cpu_to_core(cpua) == cpu_to_core(cpub);
+}
+
+static unsigned int
+cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu)
+{
+ struct csched2_runqueue_data *rqd;
+ unsigned int rqi;
+
+ for ( rqi = 0; rqi < nr_cpu_ids; rqi++ )
+ {
+ unsigned int peer_cpu;
+
+ /*
+ * As soon as we come across an uninitialized runqueue, use it.
+ * In fact, either:
+ * - we are initializing the first cpu, and we assign it to
+ * runqueue 0. This is handy, especially if we are dealing
+ * with the boot cpu (if credit2 is the default scheduler),
+ * as we would not be able to use cpu_to_socket() and similar
+ * helpers anyway (the result of which is not reliable yet);
+ * - we have gone through all the active runqueues, and have not
+ * found anyone whose cpus' topology matches the one we are
+ * dealing with, so activating a new runqueue is what we want.
+ */
+ if ( prv->rqd[rqi].id == -1 )
+ break;
+
+ rqd = prv->rqd + rqi;
+ BUG_ON(cpumask_empty(&rqd->active));
+
+ peer_cpu = cpumask_first(&rqd->active);
+ BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID ||
+ cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID);
+
+ if ( opt_runqueue == OPT_RUNQUEUE_CPU )
+ continue;
+ if ( opt_runqueue == OPT_RUNQUEUE_ALL ||
+ (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) ||
+ (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) ||
+ (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) )
+ break;
+ }
+
+ /* We really expect to be able to assign each cpu to a runqueue. */
+ BUG_ON(rqi >= nr_cpu_ids);
+
+ return rqi;
+}
+
+/* Find the domain with the highest weight. */
+static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
+ int old_weight)
+{
+ /*
+ * Try to avoid brute-force search:
+ * - If new_weight is larger, max_weight <- new_weight
+ * - If old_weight != max_weight, someone else is still max_weight
+ * (No action required)
+ * - If old_weight == max_weight, brute-force search for max weight
+ */
+ if ( new_weight > rqd->max_weight )
+ {
+ rqd->max_weight = new_weight;
+ SCHED_STAT_CRANK(upd_max_weight_quick);
+ }
+ else if ( old_weight == rqd->max_weight )
+ {
+ struct list_head *iter;
+ int max_weight = 1;
+
+ list_for_each( iter, &rqd->svc )
+ {
+ struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem);
+
+ if ( svc->weight > max_weight )
+ max_weight = svc->weight;
+ }
+
+ rqd->max_weight = max_weight;
+ SCHED_STAT_CRANK(upd_max_weight_full);
+ }
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned rqi:16, max_weight:16;
+ } d;
+ d.rqi = rqd->id;
+ d.max_weight = rqd->max_weight;
+ __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+/* Add and remove from runqueue assignment (not active run queue) */
+static void
+_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd)
+{
+
+ svc->rqd = rqd;
+ list_add_tail(&svc->rqd_elem, &svc->rqd->svc);
+
+ update_max_weight(svc->rqd, svc->weight, 0);
+
+ /* Expected new load based on adding this unit */
+ rqd->b_avgload += svc->avgload;
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned rqi:16;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.rqi = rqd->id;
+ __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+}
+
+static void
+runq_assign(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit *svc = unit->priv;
+
+ ASSERT(svc->rqd == NULL);
+
+ _runq_assign(svc, c2rqd(ops, sched_unit_master(unit)));
+}
+
+static void
+_runq_deassign(struct csched2_unit *svc)
+{
+ struct csched2_runqueue_data *rqd = svc->rqd;
+
+ ASSERT(!unit_on_runq(svc));
+ ASSERT(!(svc->flags & CSFLAG_scheduled));
+
+ list_del_init(&svc->rqd_elem);
+ update_max_weight(rqd, 0, svc->weight);
+
+ /* Expected new load based on removing this unit */
+ rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
+
+ svc->rqd = NULL;
+}
+
+static void
+runq_deassign(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit *svc = unit->priv;
+
+ ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
+
+ _runq_deassign(svc);
+}
+
+/*
+ * Track the runq load by gathering instantaneous load samples, and using
+ * an exponentially weighted moving average (EWMA) for the 'decaying'.
+ *
+ * We consider a window of length W=2^(prv->load_window_shift) nsecs
+ * (which takes LOADAVG_GRANULARITY_SHIFT into account).
+ *
+ * If load is the instantaneous load, the formula for EWMA looks as follows,
+ * for the i-eth sample:
+ *
+ * avg[i] = a*load + (1 - a)*avg[i-1]
+ *
+ * where avg[i] is the new value of the average load, avg[i-1] is the value
+ * of the average load calculated so far, and a is a coefficient less or
+ * equal to 1.
+ *
+ * So, for us, it becomes:
+ *
+ * avgload = a*load + (1 - a)*avgload
+ *
+ * For determining a, we consider _when_ we are doing the load update, wrt
+ * the length of the window. We define delta as follows:
+ *
+ * delta = t - load_last_update
+ *
+ * where t is current time (i.e., time at which we are both sampling and
+ * updating the load average) and load_last_update is the last time we did
+ * that.
+ *
+ * There are two possible situations:
+ *
+ * a) delta <= W
+ * this means that, during the last window of length W, the runqueue load
+ * was avgload for (W - delta) time, and load for delta time:
+ *
+ * |----------- W ---------|
+ * | |
+ * | load_last_update t
+ * -------------------------|---------|---
+ * | | |
+ * \__W - delta__/\_delta__/
+ * | | |
+ * |___avgload___|__load___|
+ *
+ * So, what about using delta/W as our smoothing coefficient a? If we do,
+ * here's what happens:
+ *
+ * a = delta / W
+ * 1 - a = 1 - (delta / W) = (W - delta) / W
+ *
+ * Which matches the above description of what happened in the last
+ * window of length W.
+ *
+ * Note that this also means that the weight that we assign to both the
+ * latest load sample, and to previous history, varies at each update.
+ * The longer the latest load sample has been in effect, within the last
+ * window, the more it weighs (and the less the previous history
+ * weighs).
+ *
+ * This is some sort of extension of plain EWMA to fit even better to our
+ * use case.
+ *
+ * b) delta > W
+ * this means more than a full window has passed since the last update:
+ *
+ * |----------- W ---------|
+ * | |
+ * load_last_update t
+ * ----|------------------------------|---
+ * | |
+ * \_________________delta________/
+ *
+ * Basically, it means the last load sample has been in effect for more
+ * than W time, and hence we should just use it, and forget everything
+ * before that.
+ *
+ * This can be seen as a 'reset condition', occurring when, for whatever
+ * reason, load has not been updated for longer than we expected. (It is
+ * also how avgload is assigned its first value.)
+ *
+ * The formula for avgload then becomes:
+ *
+ * avgload = (delta/W)*load + (W - delta)*avgload/W
+ * avgload = delta*load/W + W*avgload/W - delta*avgload/W
+ * avgload = avgload + delta*load/W - delta*avgload/W
+ *
+ * So, final form is:
+ *
+ * avgload_0 = load
+ * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W
+ *
+ * As a confirmation, let's look at the extremes, when delta is 0 (i.e.,
+ * what happens if we update the load twice, at the same time instant?):
+ *
+ * avgload = avgload + 0*load/W - 0*avgload/W
+ * avgload = avgload
+ *
+ * and when delta is W (i.e., what happens if we update at the last
+ * possible instant before the window 'expires'?):
+ *
+ * avgload = avgload + W*load/W - W*avgload/W
+ * avgload = avgload + load - avgload
+ * avgload = load
+ *
+ * Which, in both cases, is what we expect.
+ */
+static void
+update_runq_load(const struct scheduler *ops,
+ struct csched2_runqueue_data *rqd, int change, s_time_t now)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ s_time_t delta, load = rqd->load;
+ unsigned int P, W;
+
+ W = prv->load_window_shift;
+ P = prv->load_precision_shift;
+ now >>= LOADAVG_GRANULARITY_SHIFT;
+
+ /*
+ * To avoid using fractions, we shift to the left by load_precision_shift,
+ * and use the last load_precision_shift bits as the fractional part.
+ * Looking back at the formula we want to use, we now have:
+ *
+ * P = 2^(load_precision_shift)
+ * P*avgload = P*(avgload + delta*load/W - delta*avgload/W)
+ * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W
+ *
+ * And if we are ok storing and using P*avgload, we can rewrite this as:
+ *
+ * P*avgload = avgload'
+ * avgload' = avgload' + delta*P*load/W - delta*avgload'/W
+ *
+ * Coupled with, of course:
+ *
+ * avgload_0' = P*load
+ */
+
+ if ( rqd->load_last_update + (1ULL << W) < now )
+ {
+ rqd->avgload = load << P;
+ rqd->b_avgload = load << P;
+ }
+ else
+ {
+ delta = now - rqd->load_last_update;
+ if ( unlikely(delta < 0) )
+ {
+ d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
+ __func__, now, rqd->load_last_update);
+ delta = 0;
+ }
+
+ /*
+ * Note that, if we were to enforce (or check) some relationship
+ * between P and W, we may save one shift. E.g., if we are sure
+ * that P < W, we could write:
+ *
+ * (delta * (load << P)) >> W
+ *
+ * as:
+ *
+ * (delta * load) >> (W - P)
+ */
+ rqd->avgload = rqd->avgload +
+ ((delta * (load << P)) >> W) -
+ ((delta * rqd->avgload) >> W);
+ rqd->b_avgload = rqd->b_avgload +
+ ((delta * (load << P)) >> W) -
+ ((delta * rqd->b_avgload) >> W);
+ }
+ rqd->load += change;
+ rqd->load_last_update = now;
+
+ /* Overflow, capable of making the load look negative, must not occur. */
+ ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint64_t rq_avgload, b_avgload;
+ unsigned rq_load:16, rq_id:8, shift:8;
+ } d;
+ d.rq_id = rqd->id;
+ d.rq_load = rqd->load;
+ d.rq_avgload = rqd->avgload;
+ d.b_avgload = rqd->b_avgload;
+ d.shift = P;
+ __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+static void
+update_svc_load(const struct scheduler *ops,
+ struct csched2_unit *svc, int change, s_time_t now)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ s_time_t delta, unit_load;
+ unsigned int P, W;
+
+ if ( change == -1 )
+ unit_load = 1;
+ else if ( change == 1 )
+ unit_load = 0;
+ else
+ unit_load = unit_runnable(svc->unit);
+
+ W = prv->load_window_shift;
+ P = prv->load_precision_shift;
+ now >>= LOADAVG_GRANULARITY_SHIFT;
+
+ if ( svc->load_last_update + (1ULL << W) < now )
+ {
+ svc->avgload = unit_load << P;
+ }
+ else
+ {
+ delta = now - svc->load_last_update;
+ if ( unlikely(delta < 0) )
+ {
+ d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
+ __func__, now, svc->load_last_update);
+ delta = 0;
+ }
+
+ svc->avgload = svc->avgload +
+ ((delta * (unit_load << P)) >> W) -
+ ((delta * svc->avgload) >> W);
+ }
+ svc->load_last_update = now;
+
+ /* Overflow, capable of making the load look negative, must not occur. */
+ ASSERT(svc->avgload >= 0);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint64_t v_avgload;
+ unsigned unit:16, dom:16;
+ unsigned shift;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.v_avgload = svc->avgload;
+ d.shift = P;
+ __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+static void
+update_load(const struct scheduler *ops,
+ struct csched2_runqueue_data *rqd,
+ struct csched2_unit *svc, int change, s_time_t now)
+{
+ trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL);
+
+ update_runq_load(ops, rqd, change, now);
+ if ( svc )
+ update_svc_load(ops, svc, change, now);
+}
+
+static void
+runq_insert(const struct scheduler *ops, struct csched2_unit *svc)
+{
+ struct list_head *iter;
+ unsigned int cpu = sched_unit_master(svc->unit);
+ struct list_head * runq = &c2rqd(ops, cpu)->runq;
+ int pos = 0;
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ ASSERT(!unit_on_runq(svc));
+ ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit)));
+
+ ASSERT(&svc->rqd->runq == runq);
+ ASSERT(!is_idle_unit(svc->unit));
+ ASSERT(!svc->unit->is_running);
+ ASSERT(!(svc->flags & CSFLAG_scheduled));
+
+ list_for_each( iter, runq )
+ {
+ struct csched2_unit * iter_svc = runq_elem(iter);
+
+ if ( svc->credit > iter_svc->credit )
+ break;
+
+ pos++;
+ }
+ list_add_tail(&svc->runq_elem, iter);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned pos;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.pos = pos;
+ __trace_var(TRC_CSCHED2_RUNQ_POS, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+static inline void runq_remove(struct csched2_unit *svc)
+{
+ ASSERT(unit_on_runq(svc));
+ list_del_init(&svc->runq_elem);
+}
+
+void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t);
+
+static inline void
+tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
+{
+ __cpumask_set_cpu(cpu, &rqd->tickled);
+ smt_idle_mask_clear(cpu, &rqd->smt_idle);
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+}
+
+/*
+ * What we want to know is whether svc, which we assume to be running on some
+ * pcpu, can be interrupted and preempted (which, so far, basically means
+ * whether or not it has already run for more than the ratelimit, to which we
+ * apply some tolerance).
+ */
+static inline bool is_preemptable(const struct csched2_unit *svc,
+ s_time_t now, s_time_t ratelimit)
+{
+ if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE )
+ return true;
+
+ ASSERT(svc->unit->is_running);
+ return now - svc->unit->state_entry_time >
+ ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
+}
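+
+/*
+ * For illustration (values are hypothetical): with a ratelimit of 1000us
+ * and the 50us tolerance above, a unit that has been running for 960us is
+ * considered preemptable (960 > 1000 - 50), while one that has run for
+ * only 900us is not. With a ratelimit not above the tolerance (e.g., rate
+ * limiting disabled), everything is always preemptable.
+ */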
+
+/*
+ * Score to preempt the target cpu. Return a negative number if the
+ * credit isn't high enough; if it is, favor a preemption on cpu in
+ * this order:
+ * - cpu is in new's soft-affinity, not in cur's soft-affinity
+ * (2 x CSCHED2_CREDIT_INIT score bonus);
+ * - cpu is in new's soft-affinity and cur's soft-affinity, or
+ * cpu is not in new's soft-affinity, nor in cur's soft-affinity
+ * (1x CSCHED2_CREDIT_INIT score bonus);
+ * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity
+ * (no bonus).
+ *
+ * Within the same class, the highest difference of credit.
+ */
+static s_time_t tickle_score(const struct scheduler *ops, s_time_t now,
+ struct csched2_unit *new, unsigned int cpu)
+{
+ struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
+ struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu));
+ struct csched2_private *prv = csched2_priv(ops);
+ s_time_t score;
+
+ /*
+ * We are dealing with cpus that are marked non-idle (i.e., that are not
+ * in rqd->idle). However, some of them may be running their idle unit,
+ * if taking care of tasklets. In that case, we want to leave it alone.
+ */
+ if ( unlikely(is_idle_unit(cur->unit) ||
+ !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) )
+ return -1;
+
+ burn_credits(rqd, cur, now);
+
+ score = new->credit - cur->credit;
+ if ( sched_unit_master(new->unit) != cpu )
+ score -= CSCHED2_MIGRATE_RESIST;
+
+ /*
+ * If score is positive, it means new has enough credits (i.e.,
+ * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST).
+ *
+ * Let's compute the bonuses for soft-affinities.
+ */
+ if ( score > 0 )
+ {
+ if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) )
+ score += CSCHED2_CREDIT_INIT;
+
+ if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) )
+ score += CSCHED2_CREDIT_INIT;
+ }
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ int credit, score;
+ } d;
+ d.dom = cur->unit->domain->domain_id;
+ d.unit = cur->unit->unit_id;
+ d.credit = cur->credit;
+ d.score = score;
+ __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ return score;
+}
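+
+/*
+ * Illustrative scoring example (hypothetical numbers, default parameters):
+ * say new has 8ms of credit, the unit currently running on cpu has 6ms,
+ * and cpu is not where new last ran. The base score is
+ * 8ms - 6ms - CSCHED2_MIGRATE_RESIST (500us by default) = 1.5ms. Being
+ * positive, the soft-affinity bonuses apply: if cpu is in new's soft
+ * affinity but not in cur's, two CSCHED2_CREDIT_INIT bonuses (10ms each)
+ * are added, for a final score of 21.5ms.
+ */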
+
+/*
+ * Check what processor it is best to 'wake', for picking up an unit that has
+ * just been put (back) in the runqueue. Logic is as follows:
+ * 1. if there are idle processors in the runq, wake one of them;
+ * 2. if there are no idle processors, check the one where the unit was
+ * running before to see if we can preempt what's running there now
+ * (and hence doing just one migration);
+ * 3. last stand: check all processors and see if the unit has the right
+ * to preempt any of the other units running on them (this requires
+ * two migrations, and that's indeed why it is left as the last stand).
+ *
+ * Note that when we say 'idle processors' what we really mean is (pretty
+ * much always) both _idle_ and _not_already_tickled_. In fact, if a
+ * processor has been tickled, it will run csched2_schedule() shortly, and
+ * pick up some work, so it would be wrong to consider it idle.
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now)
+{
+ int i, ipid = -1;
+ s_time_t max = 0;
+ struct sched_unit *unit = new->unit;
+ unsigned int bs, cpu = sched_unit_master(unit);
+ struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
+ cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
+ cpumask_t mask;
+
+ ASSERT(new->rqd == rqd);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned processor;
+ int credit;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.processor = cpu;
+ d.credit = new->credit;
+ __trace_var(TRC_CSCHED2_TICKLE_NEW, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ /*
+ * Exclusive pinning is when a unit has hard-affinity with only one
+ * cpu, and there is no other unit that has hard-affinity with that
+ * same cpu. This is infrequent but, if it happens, it is for achieving
+ * the most possible determinism, and least possible overhead, for
+ * the units in question.
+ *
+ * Try to identify the vast majority of these situations, and deal
+ * with them quickly.
+ */
+ if ( unlikely((new->flags & CSFLAG_pinned) &&
+ cpumask_test_cpu(cpu, &rqd->idle) &&
+ !cpumask_test_cpu(cpu, &rqd->tickled)) )
+ {
+ ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu);
+ SCHED_STAT_CRANK(tickled_idle_cpu_excl);
+ ipid = cpu;
+ goto tickle;
+ }
+
+ for_each_affinity_balance_step( bs )
+ {
+ /* Just skip first step, if we don't have a soft affinity */
+ if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
+ continue;
+
+ affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
+
+ /*
+ * First of all, consider idle cpus, checking if we can just
+ * re-use the pcpu where we were running before.
+ *
+ * If there are cores where all the siblings are idle, consider
+ * them first, honoring whatever the spreading-vs-consolidation
+ * SMT policy wants us to do.
+ */
+ if ( unlikely(sched_smt_power_savings) )
+ {
+ cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle);
+ cpumask_and(&mask, &mask, online);
+ }
+ else
+ cpumask_and(&mask, &rqd->smt_idle, online);
+ cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
+ i = cpumask_test_or_cycle(cpu, &mask);
+ if ( i < nr_cpu_ids )
+ {
+ SCHED_STAT_CRANK(tickled_idle_cpu);
+ ipid = i;
+ goto tickle;
+ }
+
+ /*
+ * If there are no fully idle cores, check all idlers, after
+ * having filtered out pcpus that have been tickled but haven't
+ * gone through the scheduler yet.
+ */
+ cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online);
+ cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
+ i = cpumask_test_or_cycle(cpu, &mask);
+ if ( i < nr_cpu_ids )
+ {
+ SCHED_STAT_CRANK(tickled_idle_cpu);
+ ipid = i;
+ goto tickle;
+ }
+ }
+
+ /*
+ * Note that, if we are here, it means we have done the hard-affinity
+ * balancing step of the loop, and hence what we have in cpumask_scratch
+ * is what we put there for last, i.e., new's unit_hard_affinity & online
+ * which is exactly what we need for the next part of the function.
+ */
+
+ /*
+ * Otherwise, look for the non-idle (and non-tickled) processors with
+ * the lowest credit, among the ones new is allowed to run on. Again,
+ * the cpu it was running on would be the best candidate.
+ *
+ * For deciding which cpu to tickle, we use tickle_score(), which will
+ * factor in both new's soft-affinity, and the soft-affinity of the
+ * unit running on each cpu that we consider.
+ */
+ cpumask_andnot(&mask, &rqd->active, &rqd->idle);
+ cpumask_andnot(&mask, &mask, &rqd->tickled);
+ cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
+ if ( __cpumask_test_and_clear_cpu(cpu, &mask) )
+ {
+ s_time_t score = tickle_score(ops, now, new, cpu);
+
+ if ( score > max )
+ {
+ max = score;
+ ipid = cpu;
+
+ /* If this is in new's soft affinity, just take it */
+ if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) )
+ {
+ SCHED_STAT_CRANK(tickled_busy_cpu);
+ goto tickle;
+ }
+ }
+ }
+
+ for_each_cpu(i, &mask)
+ {
+ s_time_t score;
+
+ /* Already looked at this one above */
+ ASSERT(i != cpu);
+
+ score = tickle_score(ops, now, new, i);
+
+ if ( score > max )
+ {
+ max = score;
+ ipid = i;
+ }
+ }
+
+ if ( ipid == -1 )
+ {
+ SCHED_STAT_CRANK(tickled_no_cpu);
+ return;
+ }
+
+ ASSERT(!is_idle_unit(curr_on_cpu(ipid)));
+ SCHED_STAT_CRANK(tickled_busy_cpu);
+ tickle:
+ BUG_ON(ipid == -1);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned cpu:16, pad:16;
+ } d;
+ d.cpu = ipid; d.pad = 0;
+ __trace_var(TRC_CSCHED2_TICKLE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ tickle_cpu(ipid, rqd);
+
+ if ( unlikely(new->tickled_cpu != -1) )
+ SCHED_STAT_CRANK(tickled_cpu_overwritten);
+ new->tickled_cpu = ipid;
+}
+
+/*
+ * Credit-related code
+ */
+static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
+ struct csched2_unit *snext)
+{
+ struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
+ struct list_head *iter;
+ int m;
+
+ /*
+ * Under normal circumstances, snext->credit should never be less
+ * than -CSCHED2_MIN_TIMER. However, under some circumstances, an
+ * unit with low credits may be allowed to run long enough that
+ * its credits are actually less than -CSCHED2_CREDIT_INIT.
+ * (Instances have been observed, for example, where a unit with
+ * 200us of credit was allowed to run for 11ms, giving it -10.8ms
+ * of credit. Thus it was still negative even after the reset.)
+ *
+ * If this is the case for snext, we simply want to keep moving
+ * everyone up until it is in the black again. This is fair because
+ * none of the other units want to run at the moment.
+ *
+ * Rather than looping, however, we just calculate a multiplier,
+ * avoiding an integer division and multiplication in the common
+ * case.
+ */
+ m = 1;
+ if ( snext->credit < -CSCHED2_CREDIT_INIT )
+ m += (-snext->credit) / CSCHED2_CREDIT_INIT;
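+
+ /*
+ * Example (numbers are only illustrative): if snext->credit is -25ms and
+ * CSCHED2_CREDIT_INIT is the default 10ms, then m = 1 + 25/10 = 3, so
+ * every unit on the runqueue gets +30ms and snext ends up at +5ms,
+ * i.e., back in the black with a single pass.
+ */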
+
+ list_for_each( iter, &rqd->svc )
+ {
+ unsigned int svc_cpu;
+ struct csched2_unit * svc;
+ int start_credit;
+
+ svc = list_entry(iter, struct csched2_unit, rqd_elem);
+ svc_cpu = sched_unit_master(svc->unit);
+
+ ASSERT(!is_idle_unit(svc->unit));
+ ASSERT(svc->rqd == rqd);
+
+ /*
+ * If svc is running, it is our responsibility to make sure, here,
+ * that the credit it has spent so far gets accounted.
+ */
+ if ( svc->unit == curr_on_cpu(svc_cpu) )
+ {
+ burn_credits(rqd, svc, now);
+ /*
+ * And, similarly, in case it has run out of budget, as a
+ * consequence of this round of accounting, we also must inform
+ * its pCPU that it's time to park it, and pick up someone else.
+ */
+ if ( unlikely(svc->budget <= 0) )
+ tickle_cpu(svc_cpu, rqd);
+ }
+
+ start_credit = svc->credit;
+
+ /*
+ * Add INIT * m, avoiding integer multiplication in the common case.
+ */
+ if ( likely(m == 1) )
+ svc->credit += CSCHED2_CREDIT_INIT;
+ else
+ svc->credit += m * CSCHED2_CREDIT_INIT;
+
+ /* "Clip" credits to max carryover */
+ if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX )
+ svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX;
+
+ svc->start_time = now;
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ int credit_start, credit_end;
+ unsigned multiplier;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.credit_start = start_credit;
+ d.credit_end = svc->credit;
+ d.multiplier = m;
+ __trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+ }
+
+ SCHED_STAT_CRANK(credit_reset);
+
+ /* No need to resort runqueue, as everyone's order should be the same. */
+}
+
+void burn_credits(struct csched2_runqueue_data *rqd,
+ struct csched2_unit *svc, s_time_t now)
+{
+ s_time_t delta;
+
+ ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit))));
+
+ if ( unlikely(is_idle_unit(svc->unit)) )
+ {
+ ASSERT(svc->credit == CSCHED2_IDLE_CREDIT);
+ return;
+ }
+
+ delta = now - svc->start_time;
+
+ if ( unlikely(delta <= 0) )
+ {
+ if ( unlikely(delta < 0) )
+ d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
+ " start_time %"PRI_stime"\n", __func__, now,
+ svc->start_time);
+ goto out;
+ }
+
+ SCHED_STAT_CRANK(burn_credits_t2c);
+ t2c_update(rqd, delta, svc);
+
+ if ( has_cap(svc) )
+ svc->budget -= delta;
+
+ svc->start_time = now;
+
+ out:
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ int credit, budget;
+ int delta;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.credit = svc->credit;
+ d.budget = has_cap(svc) ? svc->budget : INT_MIN;
+ d.delta = delta;
+ __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+/*
+ * Budget-related code.
+ */
+
+static void park_unit(struct csched2_unit *svc)
+{
+ struct sched_unit *unit = svc->unit;
+
+ ASSERT(spin_is_locked(&svc->sdom->budget_lock));
+
+ /*
+ * It was impossible to find budget for this unit, so it has to be
+ * "parked". This implies it is not runnable, so we mark it as such in
+ * its pause_flags. If the unit is currently scheduled (which means we
+ * are here after being called from within csched_schedule()), flagging
+ * is enough, as we'll choose someone else, and then context_saved()
+ * will take care of updating the load properly.
+ *
+ * If, OTOH, the unit is sitting in the runqueue (which means we are here
+ * after being called from within runq_candidate()), we must go all the
+ * way down to taking it out of there, and updating the load accordingly.
+ *
+ * In both cases, we also add it to the list of parked units of the domain.
+ */
+ sched_set_pause_flags(unit, _VPF_parked);
+ if ( unit_on_runq(svc) )
+ {
+ runq_remove(svc);
+ update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
+ }
+ list_add(&svc->parked_elem, &svc->sdom->parked_units);
+}
+
+static bool unit_grab_budget(struct csched2_unit *svc)
+{
+ struct csched2_dom *sdom = svc->sdom;
+ unsigned int cpu = sched_unit_master(svc->unit);
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ if ( svc->budget > 0 )
+ return true;
+
+ /* budget_lock nests inside runqueue lock. */
+ spin_lock(&sdom->budget_lock);
+
+ /*
+ * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
+ * above!). That basically means the unit has overrun a bit (for various
+ * reasons) and we want to take that into account. With the +=,
+ * we are actually subtracting the amount of budget the unit has
+ * overconsumed, from the total domain budget.
+ */
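+ /*
+ * E.g. (illustrative numbers only): if the unit overran by 100us,
+ * svc->budget is -100us here, and the += below charges those 100us to
+ * the domain's pool before a fresh quota is handed out.
+ */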
+ sdom->budget += svc->budget;
+
+ if ( sdom->budget > 0 )
+ {
+ s_time_t budget;
+
+ /* Get our quota, if there's at least as much budget */
+ if ( likely(sdom->budget >= svc->budget_quota) )
+ budget = svc->budget_quota;
+ else
+ budget = sdom->budget;
+
+ svc->budget = budget;
+ sdom->budget -= budget;
+ }
+ else
+ {
+ svc->budget = 0;
+ park_unit(svc);
+ }
+
+ spin_unlock(&sdom->budget_lock);
+
+ return svc->budget > 0;
+}
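+
+/*
+ * Illustrative example (hypothetical numbers): with 5ms left in
+ * sdom->budget and a per-unit quota of 2ms, unit_grab_budget() hands the
+ * unit 2ms and leaves 3ms in the pool. If only 1.5ms were left, the unit
+ * would get just those 1.5ms; if the pool were empty (or negative), the
+ * unit would get nothing and be parked until the next replenishment.
+ */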
+
+static void
+unit_return_budget(struct csched2_unit *svc, struct list_head *parked)
+{
+ struct csched2_dom *sdom = svc->sdom;
+ unsigned int cpu = sched_unit_master(svc->unit);
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+ ASSERT(list_empty(parked));
+
+ /* budget_lock nests inside runqueue lock. */
+ spin_lock(&sdom->budget_lock);
+
+ /*
+ * The unit has stopped running (e.g., because it's blocking, or it has
+ * been preempted). If it hasn't consumed all the budget it got when
+ * starting to run, put that remaining amount back in the domain's budget
+ * pool.
+ */
+ sdom->budget += svc->budget;
+ svc->budget = 0;
+
+ /*
+ * Making budget available again to the domain means that parked units
+ * may be unparked and run. They are, if any, in the domain's parked_units
+ * list, so we want to go through that and unpark them (so they can try
+ * to get some budget).
+ *
+ * Touching the list requires the budget_lock, which we hold. Let's
+ * therefore put everyone in that list in another, temporary list, which
+ * then the caller will traverse, unparking the units it finds there.
+ *
+ * In fact, we can't do the actual unparking here, because that requires
+ * taking the runqueue lock of the units being unparked, and we can't
+ * take any runqueue locks while we hold a budget_lock.
+ */
+ if ( sdom->budget > 0 )
+ list_splice_init(&sdom->parked_units, parked);
+
+ spin_unlock(&sdom->budget_lock);
+}
+
+static void
+unpark_parked_units(const struct scheduler *ops, struct list_head *units)
+{
+ struct csched2_unit *svc, *tmp;
+ spinlock_t *lock;
+
+ list_for_each_entry_safe ( svc, tmp, units, parked_elem )
+ {
+ unsigned long flags;
+ s_time_t now;
+
+ lock = unit_schedule_lock_irqsave(svc->unit, &flags);
+
+ sched_clear_pause_flags(svc->unit, _VPF_parked);
+ if ( unlikely(svc->flags & CSFLAG_scheduled) )
+ {
+ /*
+ * We end here if a budget replenishment arrived between
+ * csched2_schedule() (and, in particular, after a call to
+ * unit_grab_budget() that returned false), and
+ * context_saved(). By setting __CSFLAG_delayed_runq_add,
+ * we tell context_saved() to put the unit back in the
+ * runqueue, from where it will compete with the others
+ * for the newly replenished budget.
+ */
+ ASSERT( svc->rqd != NULL );
+ ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd );
+ __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+ }
+ else if ( unit_runnable(svc->unit) )
+ {
+ /*
+ * The unit should go back to the runqueue, and compete for
+ * the newly replenished budget, but only if it is actually
+ * runnable (and was therefore offline only because of the
+ * lack of budget).
+ */
+ now = NOW();
+ update_load(ops, svc->rqd, svc, 1, now);
+ runq_insert(ops, svc);
+ runq_tickle(ops, svc, now);
+ }
+ list_del_init(&svc->parked_elem);
+
+ unit_schedule_unlock_irqrestore(lock, flags, svc->unit);
+ }
+}
+
+static inline void do_replenish(struct csched2_dom *sdom)
+{
+ sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
+ sdom->budget += sdom->tot_budget;
+}
+
+static void replenish_domain_budget(void* data)
+{
+ struct csched2_dom *sdom = data;
+ unsigned long flags;
+ s_time_t now;
+ LIST_HEAD(parked);
+
+ spin_lock_irqsave(&sdom->budget_lock, flags);
+
+ now = NOW();
+
+ /*
+ * Let's do the replenishment. Note, though, that a domain may overrun,
+ * which means the budget would have gone below 0 (reasons may be system
+ * overbooking, accounting issues, etc.). It also may happen that we are
+ * handling the replenishment (much) later than we should (reasons may
+ * again be overbooking, or issues with timers).
+ *
+ * Even in cases of overrun or delay, however, we expect that in 99% of
+ * cases, doing just one replenishment will be good enough for being able
+ * to unpark the units that are waiting for some budget.
+ */
+ do_replenish(sdom);
+
+ /*
+ * And now, the special cases:
+ * 1) if we are late enough to have skipped (at least) one full period,
+ * what we must do is perform more replenishments. Note that, however,
+ * every time we add tot_budget to the budget, we also move next_repl
+ * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always
+ * respected.
+ */
+ if ( unlikely(sdom->next_repl <= now) )
+ {
+ do
+ do_replenish(sdom);
+ while ( sdom->next_repl <= now );
+ }
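+
+ /*
+ * Numerical illustration (hypothetical): with the default 10ms period,
+ * if this handler runs 35ms after the scheduled next_repl, the
+ * replenishment above plus three more iterations of the loop bring
+ * next_repl back into the future, and the min() below then clips the
+ * accumulated budget to tot_budget, so the cap is still honoured.
+ */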
+ /*
+ * 2) if we overrun by more than tot_budget, then budget+tot_budget is
+ * still < 0, which means that we can't unpark the units. Let's bail,
+ * and wait for future replenishments.
+ */
+ if ( unlikely(sdom->budget <= 0) )
+ {
+ spin_unlock_irqrestore(&sdom->budget_lock, flags);
+ goto out;
+ }
+
+ /* Since we may do more replenishments, make sure we didn't overshoot. */
+ sdom->budget = min(sdom->budget, sdom->tot_budget);
+
+ /*
+ * As above, let's prepare the temporary list, out of the domain's
+ * parked_units list, now that we hold the budget_lock. Then, drop such
+ * lock, and pass the list to the unparking function.
+ */
+ list_splice_init(&sdom->parked_units, &parked);
+
+ spin_unlock_irqrestore(&sdom->budget_lock, flags);
+
+ unpark_parked_units(sdom->dom->cpupool->sched, &parked);
+
+ out:
+ set_timer(&sdom->repl_timer, sdom->next_repl);
+}
+
+#ifndef NDEBUG
+static inline void
+csched2_unit_check(struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+ struct csched2_dom * const sdom = svc->sdom;
+
+ BUG_ON( svc->unit != unit );
+ BUG_ON( sdom != csched2_dom(unit->domain) );
+ if ( sdom )
+ {
+ BUG_ON( is_idle_unit(unit) );
+ BUG_ON( sdom->dom != unit->domain );
+ }
+ else
+ {
+ BUG_ON( !is_idle_unit(unit) );
+ }
+ SCHED_STAT_CRANK(unit_check);
+}
+#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit))
+#else
+#define CSCHED2_UNIT_CHECK(unit)
+#endif
+
+static void *
+csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ struct csched2_unit *svc;
+
+ /* Allocate per-UNIT info */
+ svc = xzalloc(struct csched2_unit);
+ if ( svc == NULL )
+ return NULL;
+
+ INIT_LIST_HEAD(&svc->rqd_elem);
+ INIT_LIST_HEAD(&svc->runq_elem);
+
+ svc->sdom = dd;
+ svc->unit = unit;
+ svc->flags = 0U;
+
+ if ( ! is_idle_unit(unit) )
+ {
+ ASSERT(svc->sdom != NULL);
+ svc->credit = CSCHED2_CREDIT_INIT;
+ svc->weight = svc->sdom->weight;
+ /* Starting load of 50% */
+ svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1);
+ svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT;
+ }
+ else
+ {
+ ASSERT(svc->sdom == NULL);
+ svc->credit = CSCHED2_IDLE_CREDIT;
+ svc->weight = 0;
+ }
+ svc->tickled_cpu = -1;
+
+ svc->budget = STIME_MAX;
+ svc->budget_quota = 0;
+ INIT_LIST_HEAD(&svc->parked_elem);
+
+ SCHED_STAT_CRANK(unit_alloc);
+
+ return svc;
+}
+
+static void
+csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+
+ ASSERT(!is_idle_unit(unit));
+ SCHED_STAT_CRANK(unit_sleep);
+
+ if ( curr_on_cpu(sched_unit_master(unit)) == unit )
+ {
+ tickle_cpu(sched_unit_master(unit), svc->rqd);
+ }
+ else if ( unit_on_runq(svc) )
+ {
+ ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
+ update_load(ops, svc->rqd, svc, -1, NOW());
+ runq_remove(svc);
+ }
+ else
+ __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+}
+
+static void
+csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+ unsigned int cpu = sched_unit_master(unit);
+ s_time_t now;
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ ASSERT(!is_idle_unit(unit));
+
+ if ( unlikely(curr_on_cpu(cpu) == unit) )
+ {
+ SCHED_STAT_CRANK(unit_wake_running);
+ goto out;
+ }
+
+ if ( unlikely(unit_on_runq(svc)) )
+ {
+ SCHED_STAT_CRANK(unit_wake_onrunq);
+ goto out;
+ }
+
+ if ( likely(unit_runnable(unit)) )
+ SCHED_STAT_CRANK(unit_wake_runnable);
+ else
+ SCHED_STAT_CRANK(unit_wake_not_runnable);
+
+ /*
+ * If the context hasn't been saved for this unit yet, we can't put it
+ * on another runqueue. Instead, we set a flag so that it will be put
+ * on the runqueue after the context has been saved.
+ */
+ if ( unlikely(svc->flags & CSFLAG_scheduled) )
+ {
+ __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+ goto out;
+ }
+
+ /* Add into the new runqueue if necessary */
+ if ( svc->rqd == NULL )
+ runq_assign(ops, unit);
+ else
+ ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd );
+
+ now = NOW();
+
+ update_load(ops, svc->rqd, svc, 1, now);
+
+ /* Put the UNIT on the runq */
+ runq_insert(ops, svc);
+ runq_tickle(ops, svc, now);
+
+out:
+ return;
+}
+
+static void
+csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+
+ __set_bit(__CSFLAG_unit_yield, &svc->flags);
+}
+
+static void
+csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+ spinlock_t *lock = unit_schedule_lock_irq(unit);
+ s_time_t now = NOW();
+ LIST_HEAD(were_parked);
+
+ BUG_ON( !is_idle_unit(unit) &&
+ svc->rqd != c2rqd(ops, sched_unit_master(unit)));
+ ASSERT(is_idle_unit(unit) ||
+ svc->rqd == c2rqd(ops, sched_unit_master(unit)));
+
+ /* This unit is now eligible to be put on the runqueue again */
+ __clear_bit(__CSFLAG_scheduled, &svc->flags);
+
+ if ( unlikely(has_cap(svc) && svc->budget > 0) )
+ unit_return_budget(svc, &were_parked);
+
+ /* If someone wants it on the runqueue, put it there. */
+ /*
+ * NB: We can get rid of CSFLAG_scheduled by checking for
+ * vc->is_running and unit_on_runq(svc) here. However,
+ * since we're accessing the flags cacheline anyway,
+ * it seems a bit pointless; especially as we have plenty of
+ * bits free.
+ */
+ if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
+ && likely(unit_runnable(unit)) )
+ {
+ ASSERT(!unit_on_runq(svc));
+
+ runq_insert(ops, svc);
+ runq_tickle(ops, svc, now);
+ }
+ else if ( !is_idle_unit(unit) )
+ update_load(ops, svc->rqd, svc, -1, now);
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ unpark_parked_units(ops, &were_parked);
+}
+
+#define MAX_LOAD (STIME_MAX)
+static struct sched_resource *
+csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ int i, min_rqi = -1, min_s_rqi = -1;
+ unsigned int new_cpu, cpu = sched_unit_master(unit);
+ struct csched2_unit *svc = csched2_unit(unit);
+ s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD;
+ bool has_soft;
+
+ ASSERT(!cpumask_empty(&prv->active_queues));
+
+ SCHED_STAT_CRANK(pick_resource);
+
+ /* Locking:
+ * - Runqueue lock of vc->processor is already locked
+ * - Need to grab prv lock to make sure active runqueues don't
+ * change
+ * - Need to grab locks for other runqueues while checking
+ * avgload
+ * Locking constraint is:
+ * - Lock prv before runqueue locks
+ * - Trylock between runqueue locks (no ordering)
+ *
+ * Since one of the runqueue locks is already held, we can't
+ * just grab the prv lock. Instead, we'll have to trylock, and
+ * do something else reasonable if we fail.
+ */
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ if ( !read_trylock(&prv->lock) )
+ {
+ /* We may be here because someone requested us to migrate. */
+ __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
+ new_cpu = get_fallback_cpu(svc);
+ /*
+ * Tracing of runq and its load won't be accurate, since we could
+ * not get the lock, but at least we will output the chosen pcpu.
+ */
+ goto out;
+ }
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(unit->domain));
+
+ /*
+ * First check to see if we're here because someone else suggested a place
+ * for us to move.
+ */
+ if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
+ {
+ if ( unlikely(svc->migrate_rqd->id < 0) )
+ {
+ printk(XENLOG_WARNING "%s: target runqueue disappeared!\n",
+ __func__);
+ }
+ else if ( cpumask_intersects(cpumask_scratch_cpu(cpu),
+ &svc->migrate_rqd->active) )
+ {
+ /*
+ * If we've been asked to move to migrate_rqd, we should just do
+ * that, which we actually do by returning one cpu from that runq.
+ * There is no need to take care of soft affinity, as that will
+ * happen in runq_tickle().
+ */
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &svc->migrate_rqd->active);
+ new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias,
+ cpumask_scratch_cpu(cpu));
+
+ svc->migrate_rqd->pick_bias = new_cpu;
+ goto out_up;
+ }
+ /* Fall-through to normal cpu pick */
+ }
+
+ /*
+ * What we want is:
+ * - if we have soft affinity, the runqueue with the lowest average
+ * load, among the ones that contain cpus in our soft affinity; this
+ * represents the best runq on which we would want to run.
+ * - the runqueue with the lowest average load among the ones that
+     *    contain cpus in our hard affinity; this represents the best runq
+ * on which we can run.
+ *
+ * Find both runqueues in one pass.
+ */
+ has_soft = has_soft_affinity(unit);
+ for_each_cpu(i, &prv->active_queues)
+ {
+ struct csched2_runqueue_data *rqd;
+ s_time_t rqd_avgload = MAX_LOAD;
+
+ rqd = prv->rqd + i;
+
+ /*
+ * If none of the cpus of this runqueue is in svc's hard-affinity,
+ * skip the runqueue.
+ *
+ * Note that, in case svc's hard-affinity has changed, this is the
+ * first time when we see such change, so it is indeed possible
+ * that we end up skipping svc's current runqueue.
+ */
+ if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) )
+ continue;
+
+ /*
+ * If checking a different runqueue, grab the lock, read the avg,
+ * and then release the lock.
+ *
+ * If on our own runqueue, don't grab or release the lock;
+ * but subtract our own load from the runqueue load to simulate
+ * impartiality.
+ */
+ if ( rqd == svc->rqd )
+ {
+ rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
+ }
+ else if ( spin_trylock(&rqd->lock) )
+ {
+ rqd_avgload = rqd->b_avgload;
+ spin_unlock(&rqd->lock);
+ }
+
+ /*
+ * if svc has a soft-affinity, and some cpus of rqd are part of it,
+ * see if we need to update the "soft-affinity minimum".
+ */
+ if ( has_soft &&
+ rqd_avgload < min_s_avgload )
+ {
+ cpumask_t mask;
+
+ cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active);
+ if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) )
+ {
+ min_s_avgload = rqd_avgload;
+ min_s_rqi = i;
+ }
+ }
+ /* In any case, keep the "hard-affinity minimum" updated too. */
+ if ( rqd_avgload < min_avgload )
+ {
+ min_avgload = rqd_avgload;
+ min_rqi = i;
+ }
+ }
+
+ if ( has_soft && min_s_rqi != -1 )
+ {
+ /*
+ * We have soft affinity, and we have a candidate runq, so go for it.
+ *
+         * Note that, to obtain the soft-affinity mask, we "just" AND what we
+         * have in cpumask_scratch with unit->cpu_soft_affinity. This is
+         * ok because:
+         * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have
+         *   a non-empty intersection (because has_soft is true);
+         * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask()
+         *   already in cpumask_scratch, so we save a lot by doing it like this.
+ *
+ * It's kind of like open coding affinity_balance_cpumask() but, in
+ * this specific case, calling that would mean a lot of (unnecessary)
+ * cpumask operations.
+ */
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ unit->cpu_soft_affinity);
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &prv->rqd[min_s_rqi].active);
+ }
+ else if ( min_rqi != -1 )
+ {
+ /*
+ * Either we don't have soft-affinity, or we do, but we did not find
+ * any suitable runq. But we did find one when considering hard
+ * affinity, so go for it.
+ *
+ * cpumask_scratch already has unit->cpu_hard_affinity &
+ * cpupool_domain_master_cpumask() in it, so it's enough that we filter
+ * with the cpus of the runq.
+ */
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &prv->rqd[min_rqi].active);
+ }
+ else
+ {
+ /*
+ * We didn't find anyone at all (most likely because of spinlock
+ * contention).
+ */
+ new_cpu = get_fallback_cpu(svc);
+ min_rqi = c2r(new_cpu);
+ min_avgload = prv->rqd[min_rqi].b_avgload;
+ goto out_up;
+ }
+
+ new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias,
+ cpumask_scratch_cpu(cpu));
+ prv->rqd[min_rqi].pick_bias = new_cpu;
+ BUG_ON(new_cpu >= nr_cpu_ids);
+
+ out_up:
+ read_unlock(&prv->lock);
+ out:
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint64_t b_avgload;
+ unsigned unit:16, dom:16;
+ unsigned rq_id:16, new_cpu:16;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.rq_id = min_rqi;
+ d.b_avgload = min_avgload;
+ d.new_cpu = new_cpu;
+ __trace_var(TRC_CSCHED2_PICKED_CPU, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ return get_sched_res(new_cpu);
+}
+
+/* Working state of the load-balancing algorithm */
+typedef struct {
+ /* NB: Modified by consider() */
+ s_time_t load_delta;
+ struct csched2_unit * best_push_svc, *best_pull_svc;
+ /* NB: Read by consider() */
+ struct csched2_runqueue_data *lrqd;
+ struct csched2_runqueue_data *orqd;
+} balance_state_t;
+
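+/*
+ * Illustrative example of what consider() computes (arbitrary fixed-point
+ * loads, written as decimals): with lrqd->b_avgload = 2.0 and
+ * orqd->b_avgload = 0.5, pushing a unit whose avgload is 0.75 gives
+ * |1.25 - 1.25| = 0, while pulling a unit whose avgload is 0.25 gives
+ * |2.25 - 0.25| = 2.0. The push candidate would thus be recorded as
+ * best_push_svc, since it minimizes st->load_delta.
+ */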
+static void consider(balance_state_t *st,
+ struct csched2_unit *push_svc,
+ struct csched2_unit *pull_svc)
+{
+ s_time_t l_load, o_load, delta;
+
+ l_load = st->lrqd->b_avgload;
+ o_load = st->orqd->b_avgload;
+ if ( push_svc )
+ {
+ /* What happens to the load on both if we push? */
+ l_load -= push_svc->avgload;
+ o_load += push_svc->avgload;
+ }
+ if ( pull_svc )
+ {
+ /* What happens to the load on both if we pull? */
+ l_load += pull_svc->avgload;
+ o_load -= pull_svc->avgload;
+ }
+
+ delta = l_load - o_load;
+ if ( delta < 0 )
+ delta = -delta;
+
+ if ( delta < st->load_delta )
+ {
+ st->load_delta = delta;
+        st->best_push_svc = push_svc;
+        st->best_pull_svc = pull_svc;
+ }
+}
+
+
+static void migrate(const struct scheduler *ops,
+ struct csched2_unit *svc,
+ struct csched2_runqueue_data *trqd,
+ s_time_t now)
+{
+ struct sched_unit *unit = svc->unit;
+ int cpu = sched_unit_master(unit);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned rqi:16, trqi:16;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.rqi = svc->rqd->id;
+ d.trqi = trqd->id;
+ __trace_var(TRC_CSCHED2_MIGRATE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ if ( svc->flags & CSFLAG_scheduled )
+ {
+ /* It's running; mark it to migrate. */
+ svc->migrate_rqd = trqd;
+ sched_set_pause_flags(unit, _VPF_migrating);
+ __set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
+ SCHED_STAT_CRANK(migrate_requested);
+ tickle_cpu(cpu, svc->rqd);
+ }
+ else
+ {
+ int on_runq = 0;
+ /* It's not running; just move it */
+ if ( unit_on_runq(svc) )
+ {
+ runq_remove(svc);
+ update_load(ops, svc->rqd, NULL, -1, now);
+ on_runq = 1;
+ }
+ _runq_deassign(svc);
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(unit->domain));
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &trqd->active);
+ sched_set_res(unit,
+ get_sched_res(cpumask_cycle(trqd->pick_bias,
+ cpumask_scratch_cpu(cpu))));
+ trqd->pick_bias = sched_unit_master(unit);
+ ASSERT(sched_unit_master(unit) < nr_cpu_ids);
+
+ _runq_assign(svc, trqd);
+ if ( on_runq )
+ {
+ update_load(ops, svc->rqd, NULL, 1, now);
+ runq_insert(ops, svc);
+ runq_tickle(ops, svc, now);
+ SCHED_STAT_CRANK(migrate_on_runq);
+ }
+ else
+ SCHED_STAT_CRANK(migrate_no_runq);
+ }
+}
+
+/*
+ * It makes sense considering migrating svc to rqd, if:
+ * - svc is not already flagged to migrate,
+ * - if svc is allowed to run on at least one of the pcpus of rqd.
+ */
+static bool unit_is_migrateable(struct csched2_unit *svc,
+ struct csched2_runqueue_data *rqd)
+{
+ struct sched_unit *unit = svc->unit;
+ int cpu = sched_unit_master(unit);
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(unit->domain));
+
+ return !(svc->flags & CSFLAG_runq_migrate_request) &&
+ cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active);
+}
+
+static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ int i, max_delta_rqi;
+ struct list_head *push_iter, *pull_iter;
+    bool inner_load_updated = false;
+
+ balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL };
+
+ /*
+ * Basic algorithm: Push, pull, or swap.
+ * - Find the runqueue with the furthest load distance
+ * - Find a pair that makes the difference the least (where one
+ * on either side may be empty).
+ */
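+    /*
+     * In code terms: scan prv->active_queues for the runqueue whose
+     * b_avgload differs most from ours, bail out if that difference is
+     * within the balance tolerance, and otherwise run consider() on the
+     * push/pull candidates of the two runqueues, keeping the combination
+     * that minimizes the resulting difference.
+     */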
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+ st.lrqd = c2rqd(ops, cpu);
+
+ update_runq_load(ops, st.lrqd, 0, now);
+
+retry:
+ max_delta_rqi = -1;
+ if ( !read_trylock(&prv->lock) )
+ return;
+
+ st.load_delta = 0;
+
+ for_each_cpu(i, &prv->active_queues)
+ {
+ s_time_t delta;
+
+ st.orqd = prv->rqd + i;
+
+ if ( st.orqd == st.lrqd
+ || !spin_trylock(&st.orqd->lock) )
+ continue;
+
+ update_runq_load(ops, st.orqd, 0, now);
+
+ delta = st.lrqd->b_avgload - st.orqd->b_avgload;
+ if ( delta < 0 )
+ delta = -delta;
+
+ if ( delta > st.load_delta )
+ {
+ st.load_delta = delta;
+ max_delta_rqi = i;
+ }
+
+ spin_unlock(&st.orqd->lock);
+ }
+
+ /* Minimize holding the private scheduler lock. */
+ read_unlock(&prv->lock);
+ if ( max_delta_rqi == -1 )
+ goto out;
+
+ {
+ s_time_t load_max;
+ int cpus_max;
+
+ load_max = st.lrqd->b_avgload;
+ if ( st.orqd->b_avgload > load_max )
+ load_max = st.orqd->b_avgload;
+
+ cpus_max = st.lrqd->nr_cpus;
+ i = st.orqd->nr_cpus;
+ if ( i > cpus_max )
+ cpus_max = i;
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned lrq_id:16, orq_id:16;
+ unsigned load_delta;
+ } d;
+ d.lrq_id = st.lrqd->id;
+ d.orq_id = st.orqd->id;
+ d.load_delta = st.load_delta;
+ __trace_var(TRC_CSCHED2_LOAD_CHECK, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+        /*
+         * If we're under 100% capacity, only balance if the load difference
+         * is above 1.0; otherwise, only balance if it is above 12.5%
+         * (both thresholds are tunable via the balance tolerances).
+         */
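+        /*
+         * In fixed point, each threshold is 1 << (load_precision_shift +
+         * tolerance). For instance, with load_precision_shift = 18, a
+         * tolerance of 0 gives 1 << 18, i.e. a load of 1.0, while a
+         * tolerance of -3 gives 1 << 15, i.e. 0.125 (12.5%).
+         */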
+ if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) )
+ {
+ if ( st.load_delta < (1ULL << (prv->load_precision_shift +
+ opt_underload_balance_tolerance)) )
+ goto out;
+ }
+ else
+ if ( st.load_delta < (1ULL << (prv->load_precision_shift +
+ opt_overload_balance_tolerance)) )
+ goto out;
+ }
+
+ /* Try to grab the other runqueue lock; if it's been taken in the
+ * meantime, try the process over again. This can't deadlock
+ * because if it doesn't get any other rqd locks, it will simply
+ * give up and return. */
+ st.orqd = prv->rqd + max_delta_rqi;
+ if ( !spin_trylock(&st.orqd->lock) )
+ goto retry;
+
+    /*
+     * Make sure the runqueue hasn't been deactivated since we released
+     * prv->lock.
+     */
+ if ( unlikely(st.orqd->id < 0) )
+ goto out_up;
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint64_t lb_avgload, ob_avgload;
+ unsigned lrq_id:16, orq_id:16;
+ } d;
+ d.lrq_id = st.lrqd->id;
+ d.lb_avgload = st.lrqd->b_avgload;
+ d.orq_id = st.orqd->id;
+ d.ob_avgload = st.orqd->b_avgload;
+ __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ SCHED_STAT_CRANK(acct_load_balance);
+
+ /* Look for "swap" which gives the best load average
+ * FIXME: O(n^2)! */
+
+ /* Reuse load delta (as we're trying to minimize it) */
+ list_for_each( push_iter, &st.lrqd->svc )
+ {
+ struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem);
+
+ update_svc_load(ops, push_svc, 0, now);
+
+ if ( !unit_is_migrateable(push_svc, st.orqd) )
+ continue;
+
+ list_for_each( pull_iter, &st.orqd->svc )
+ {
+ struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem);
+
+ if ( !inner_load_updated )
+ update_svc_load(ops, pull_svc, 0, now);
+
+ if ( !unit_is_migrateable(pull_svc, st.lrqd) )
+ continue;
+
+ consider(&st, push_svc, pull_svc);
+ }
+
+ inner_load_updated = 1;
+
+ /* Consider push only */
+ consider(&st, push_svc, NULL);
+ }
+
+ list_for_each( pull_iter, &st.orqd->svc )
+ {
+ struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem);
+
+ if ( !unit_is_migrateable(pull_svc, st.lrqd) )
+ continue;
+
+ /* Consider pull only */
+ consider(&st, NULL, pull_svc);
+ }
+
+ /* OK, now we have some candidates; do the moving */
+ if ( st.best_push_svc )
+ migrate(ops, st.best_push_svc, st.orqd, now);
+ if ( st.best_pull_svc )
+ migrate(ops, st.best_pull_svc, st.lrqd, now);
+
+ out_up:
+ spin_unlock(&st.orqd->lock);
+ out:
+ return;
+}
+
+static void
+csched2_unit_migrate(
+ const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu)
+{
+ struct domain *d = unit->domain;
+ struct csched2_unit * const svc = csched2_unit(unit);
+ struct csched2_runqueue_data *trqd;
+ s_time_t now = NOW();
+
+ /*
+ * Being passed a target pCPU which is outside of our cpupool is only
+ * valid if we are shutting down (or doing ACPI suspend), and we are
+ * moving everyone to BSP, no matter whether or not BSP is inside our
+ * cpupool.
+ *
+ * And since there indeed is the chance that it is not part of it, all
+ * we must do is remove _and_ unassign the unit from any runqueue, as
+ * well as updating v->processor with the target, so that the suspend
+ * process can continue.
+ *
+ * It will then be during resume that a new, meaningful, value for
+ * v->processor will be chosen, and during actual domain unpause that
+ * the unit will be assigned to and added to the proper runqueue.
+ */
+ if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) )
+ {
+ ASSERT(system_state == SYS_STATE_suspend);
+ if ( unit_on_runq(svc) )
+ {
+ runq_remove(svc);
+ update_load(ops, svc->rqd, NULL, -1, now);
+ }
+ _runq_deassign(svc);
+ sched_set_res(unit, get_sched_res(new_cpu));
+ return;
+ }
+
+ /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */
+ ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized));
+ ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity));
+
+ trqd = c2rqd(ops, new_cpu);
+
+ /*
+ * Do the actual movement toward new_cpu, and update vc->processor.
+ * If we are changing runqueue, migrate() takes care of everything.
+ * If we are not changing runqueue, we need to update vc->processor
+ * here. In fact, if, for instance, we are here because the unit's
+ * hard affinity changed, we don't want to risk leaving vc->processor
+ * pointing to a pcpu where we can't run any longer.
+ */
+ if ( trqd != svc->rqd )
+ migrate(ops, svc, trqd, now);
+ else
+ sched_set_res(unit, get_sched_res(new_cpu));
+}
+
+static int
+csched2_dom_cntl(
+ const struct scheduler *ops,
+ struct domain *d,
+ struct xen_domctl_scheduler_op *op)
+{
+ struct csched2_dom * const sdom = csched2_dom(d);
+ struct csched2_private *prv = csched2_priv(ops);
+ unsigned long flags;
+ struct sched_unit *unit;
+ int rc = 0;
+
+ /*
+ * Locking:
+ * - we must take the private lock for accessing the weights of the
+ * units of d, and/or the cap;
+ * - in the putinfo case, we also need the runqueue lock(s), for
+     *    updating the max weight of the runqueue(s).
+ * If changing the cap, we also need the budget_lock, for updating
+ * the value of the domain budget pool (and the runqueue lock,
+ * for adjusting the parameters and rescheduling any unit that is
+ * running at the time of the change).
+ */
+ switch ( op->cmd )
+ {
+ case XEN_DOMCTL_SCHEDOP_getinfo:
+ read_lock_irqsave(&prv->lock, flags);
+ op->u.credit2.weight = sdom->weight;
+ op->u.credit2.cap = sdom->cap;
+ read_unlock_irqrestore(&prv->lock, flags);
+ break;
+ case XEN_DOMCTL_SCHEDOP_putinfo:
+ write_lock_irqsave(&prv->lock, flags);
+ /* Weight */
+ if ( op->u.credit2.weight != 0 )
+ {
+ int old_weight;
+
+ old_weight = sdom->weight;
+
+ sdom->weight = op->u.credit2.weight;
+
+ /* Update weights for units, and max_weight for runqueues on which they reside */
+ for_each_sched_unit ( d, unit )
+ {
+ struct csched2_unit *svc = csched2_unit(unit);
+ spinlock_t *lock = unit_schedule_lock(unit);
+
+ ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
+
+ svc->weight = sdom->weight;
+ update_max_weight(svc->rqd, svc->weight, old_weight);
+
+ unit_schedule_unlock(lock, unit);
+ }
+ }
+ /* Cap */
+ if ( op->u.credit2.cap != 0 )
+ {
+ struct csched2_unit *svc;
+ spinlock_t *lock;
+
+            /* Cap is only valid if it's at most 100 * nr_of_units */
+ if ( op->u.credit2.cap > 100 * sdom->nr_units )
+ {
+ rc = -EINVAL;
+ write_unlock_irqrestore(&prv->lock, flags);
+ break;
+ }
+
+ spin_lock(&sdom->budget_lock);
+ sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap);
+ sdom->tot_budget /= 100;
+ spin_unlock(&sdom->budget_lock);
+
+ /*
+ * When trying to get some budget and run, each unit will grab
+ * from the pool 1/N (with N = nr of units of the domain) of
+ * the total budget. Roughly speaking, this means each unit will
+ * have at least one chance to run during every period.
+ */
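+            /*
+             * Worked example (illustrative numbers): with a 10ms
+             * replenishment period, a cap of 400 and 4 units, tot_budget
+             * is 10ms * 400 / 100 = 40ms per period, so each unit's
+             * budget_quota is 10ms (assuming CSCHED2_MIN_TIMER is smaller
+             * than that).
+             */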
+ for_each_sched_unit ( d, unit )
+ {
+ svc = csched2_unit(unit);
+ lock = unit_schedule_lock(unit);
+ /*
+ * Too small quotas would in theory cause a lot of overhead,
+ * which then won't happen because, in csched2_runtime(),
+ * CSCHED2_MIN_TIMER is what would be used anyway.
+ */
+ svc->budget_quota = max(sdom->tot_budget / sdom->nr_units,
+ CSCHED2_MIN_TIMER);
+ unit_schedule_unlock(lock, unit);
+ }
+
+ if ( sdom->cap == 0 )
+ {
+ /*
+ * We give to the domain the budget to which it is entitled,
+ * and queue its first replenishment event.
+ *
+ * Since cap is currently disabled for this domain, we
+ * know no unit is messing with the domain's budget, and
+ * the replenishment timer is still off.
+ * For these reasons, it is safe to do the following without
+ * taking the budget_lock.
+ */
+ sdom->budget = sdom->tot_budget;
+ sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD;
+ set_timer(&sdom->repl_timer, sdom->next_repl);
+
+ /*
+ * Now, let's enable budget accounting for all the units.
+ * For making sure that they will start to honour the domain's
+ * cap, we set their budget to 0.
+ * This way, as soon as they will try to run, they will have
+ * to get some budget.
+ *
+ * For the units that are already running, we trigger the
+ * scheduler on their pCPU. When, as a consequence of this,
+ * csched2_schedule() will run, it will figure out there is
+ * no budget, and the unit will try to get some (and be parked,
+ * if there's none, and we'll switch to someone else).
+ */
+ for_each_sched_unit ( d, unit )
+ {
+ svc = csched2_unit(unit);
+ lock = unit_schedule_lock(unit);
+ if ( unit->is_running )
+ {
+ unsigned int cpu = sched_unit_master(unit);
+ struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
+
+ ASSERT(curr_on_cpu(cpu) == unit);
+
+ /*
+ * We are triggering a reschedule on the unit's
+ * pCPU. That will run burn_credits() and, since
+ * the unit is capped now, it would charge all the
+ * execution time of this last round as budget as
+ * well. That will make the unit budget go negative,
+ * potentially by a large amount, and it's unfair.
+ *
+                     * To avoid that, call burn_credits() here, to do the
+                     * accounting of this current running instance now,
+                     * with budgeting still disabled. This does not
+ * prevent some small amount of budget being charged
+ * to the unit (i.e., the amount of time it runs from
+ * now, to when scheduling happens). The budget will
+ * also go below 0, but a lot less than how it would
+ * if we don't do this.
+ */
+ burn_credits(rqd, svc, NOW());
+ __cpumask_set_cpu(cpu, &rqd->tickled);
+ ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle));
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ }
+ svc->budget = 0;
+ unit_schedule_unlock(lock, unit);
+ }
+ }
+
+ sdom->cap = op->u.credit2.cap;
+ }
+ else if ( sdom->cap != 0 )
+ {
+ LIST_HEAD(parked);
+
+ stop_timer(&sdom->repl_timer);
+
+ /* Disable budget accounting for all the units. */
+ for_each_sched_unit ( d, unit )
+ {
+ struct csched2_unit *svc = csched2_unit(unit);
+ spinlock_t *lock = unit_schedule_lock(unit);
+
+ svc->budget = STIME_MAX;
+ svc->budget_quota = 0;
+
+ unit_schedule_unlock(lock, unit);
+ }
+ sdom->cap = 0;
+ /*
+ * We are disabling the cap for this domain, which may have
+ * units waiting for a replenishment, so we unpark them all.
+ * Note that, since we have already disabled budget accounting
+ * for all the units of the domain, no currently running unit
+ * will be added to the parked units list any longer.
+ */
+ spin_lock(&sdom->budget_lock);
+ list_splice_init(&sdom->parked_units, &parked);
+ spin_unlock(&sdom->budget_lock);
+
+ unpark_parked_units(ops, &parked);
+ }
+ write_unlock_irqrestore(&prv->lock, flags);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ return rc;
+}
+
+static void
+csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit,
+ const cpumask_t *hard, const cpumask_t *soft)
+{
+ struct csched2_unit *svc = csched2_unit(unit);
+
+ if ( !hard )
+ return;
+
+ /* Are we becoming exclusively pinned? */
+ if ( cpumask_weight(hard) == 1 )
+ __set_bit(__CSFLAG_pinned, &svc->flags);
+ else
+ __clear_bit(__CSFLAG_pinned, &svc->flags);
+}
+
+static int csched2_sys_cntl(const struct scheduler *ops,
+ struct xen_sysctl_scheduler_op *sc)
+{
+ struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2;
+ struct csched2_private *prv = csched2_priv(ops);
+ unsigned long flags;
+
+    switch ( sc->cmd )
+ {
+ case XEN_SYSCTL_SCHEDOP_putinfo:
+ if ( params->ratelimit_us &&
+ (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX ||
+ params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN ))
+ return -EINVAL;
+
+ write_lock_irqsave(&prv->lock, flags);
+ if ( !prv->ratelimit_us && params->ratelimit_us )
+ printk(XENLOG_INFO "Enabling context switch rate limiting\n");
+ else if ( prv->ratelimit_us && !params->ratelimit_us )
+ printk(XENLOG_INFO "Disabling context switch rate limiting\n");
+ prv->ratelimit_us = params->ratelimit_us;
+ write_unlock_irqrestore(&prv->lock, flags);
+
+ /* FALLTHRU */
+ case XEN_SYSCTL_SCHEDOP_getinfo:
+ params->ratelimit_us = prv->ratelimit_us;
+ break;
+ }
+
+ return 0;
+}
+
+static void *
+csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ struct csched2_dom *sdom;
+ unsigned long flags;
+
+ sdom = xzalloc(struct csched2_dom);
+ if ( sdom == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ /* Initialize credit, cap and weight */
+ INIT_LIST_HEAD(&sdom->sdom_elem);
+ sdom->dom = dom;
+ sdom->weight = CSCHED2_DEFAULT_WEIGHT;
+ sdom->cap = 0U;
+ sdom->nr_units = 0;
+
+ init_timer(&sdom->repl_timer, replenish_domain_budget, sdom,
+ cpumask_any(cpupool_domain_master_cpumask(dom)));
+ spin_lock_init(&sdom->budget_lock);
+ INIT_LIST_HEAD(&sdom->parked_units);
+
+ write_lock_irqsave(&prv->lock, flags);
+
+ list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);
+
+ write_unlock_irqrestore(&prv->lock, flags);
+
+ return sdom;
+}
+
+static void
+csched2_free_domdata(const struct scheduler *ops, void *data)
+{
+ struct csched2_dom *sdom = data;
+ struct csched2_private *prv = csched2_priv(ops);
+
+ if ( sdom )
+ {
+ unsigned long flags;
+
+ kill_timer(&sdom->repl_timer);
+
+ write_lock_irqsave(&prv->lock, flags);
+ list_del_init(&sdom->sdom_elem);
+ write_unlock_irqrestore(&prv->lock, flags);
+
+ xfree(sdom);
+ }
+}
+
+static void
+csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit *svc = unit->priv;
+ struct csched2_dom * const sdom = svc->sdom;
+ spinlock_t *lock;
+
+ ASSERT(!is_idle_unit(unit));
+ ASSERT(list_empty(&svc->runq_elem));
+
+ /* csched2_res_pick() expects the pcpu lock to be held */
+ lock = unit_schedule_lock_irq(unit);
+
+ sched_set_res(unit, csched2_res_pick(ops, unit));
+
+ spin_unlock_irq(lock);
+
+ lock = unit_schedule_lock_irq(unit);
+
+ /* Add unit to runqueue of initial processor */
+ runq_assign(ops, unit);
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ sdom->nr_units++;
+
+ SCHED_STAT_CRANK(unit_insert);
+
+ CSCHED2_UNIT_CHECK(unit);
+}
+
+static void
+csched2_free_udata(const struct scheduler *ops, void *priv)
+{
+ struct csched2_unit *svc = priv;
+
+ xfree(svc);
+}
+
+static void
+csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct csched2_unit * const svc = csched2_unit(unit);
+ spinlock_t *lock;
+
+ ASSERT(!is_idle_unit(unit));
+ ASSERT(list_empty(&svc->runq_elem));
+
+ SCHED_STAT_CRANK(unit_remove);
+
+ /* Remove from runqueue */
+ lock = unit_schedule_lock_irq(unit);
+
+ runq_deassign(ops, unit);
+
+ unit_schedule_unlock_irq(lock, unit);
+
+ svc->sdom->nr_units--;
+}
+
+/* How long should we let this unit run for? */
+static s_time_t
+csched2_runtime(const struct scheduler *ops, int cpu,
+ struct csched2_unit *snext, s_time_t now)
+{
+ s_time_t time, min_time;
+ int rt_credit; /* Proposed runtime measured in credits */
+ struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
+ struct list_head *runq = &rqd->runq;
+ struct csched2_private *prv = csched2_priv(ops);
+
+ /*
+ * If we're idle, just stay so. Others (or external events)
+ * will poke us when necessary.
+ */
+ if ( is_idle_unit(snext->unit) )
+ return -1;
+
+ /* General algorithm:
+ * 1) Run until snext's credit will be 0.
+ * 2) But if someone is waiting, run until snext's credit is equal
+ * to his.
+ * 3) But, if we are capped, never run more than our budget.
+ * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or
+ * the ratelimit time.
+ */
+
+ /* Calculate mintime */
+ min_time = CSCHED2_MIN_TIMER;
+ if ( prv->ratelimit_us )
+ {
+ s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us);
+ if ( snext->unit->is_running )
+ ratelimit_min = snext->unit->state_entry_time +
+ MICROSECS(prv->ratelimit_us) - now;
+ if ( ratelimit_min > min_time )
+ min_time = ratelimit_min;
+ }
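+    /*
+     * E.g. (illustrative numbers): with a 1000us ratelimit and a unit that
+     * has already been running for 300us, ratelimit_min is 700us and, if
+     * that exceeds CSCHED2_MIN_TIMER, it becomes the lower bound applied to
+     * the runtime computed below.
+     */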
+
+ /* 1) Run until snext's credit will be 0. */
+ rt_credit = snext->credit;
+
+ /*
+ * 2) If there's someone waiting whose credit is positive,
+ * run until your credit ~= his.
+ */
+ if ( ! list_empty(runq) )
+ {
+ struct csched2_unit *swait = runq_elem(runq->next);
+
+ if ( ! is_idle_unit(swait->unit)
+ && swait->credit > 0 )
+ {
+ rt_credit = snext->credit - swait->credit;
+ }
+ }
+
+ /*
+ * The next guy on the runqueue may actually have a higher credit,
+ * if we've tried to avoid migrating him from a different cpu.
+ * Setting time=0 will ensure the minimum timeslice is chosen.
+ *
+ * FIXME: See if we can eliminate this conversion if we know time
+ * will be outside (MIN,MAX). Probably requires pre-calculating
+ * credit values of MIN,MAX per unit, since each unit burns credit
+ * at a different rate.
+ */
+ if ( rt_credit > 0 )
+ time = c2t(rqd, rt_credit, snext);
+ else
+ time = 0;
+
+ /*
+ * 3) But, if capped, never run more than our budget.
+ */
+ if ( has_cap(snext) )
+ time = snext->budget < time ? snext->budget : time;
+
+ /*
+ * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or
+ * the rate_limit time.
+ */
+ if ( time < min_time )
+ {
+ time = min_time;
+ SCHED_STAT_CRANK(runtime_min_timer);
+ }
+    else if ( time > CSCHED2_MAX_TIMER )
+ {
+ time = CSCHED2_MAX_TIMER;
+ SCHED_STAT_CRANK(runtime_max_timer);
+ }
+
+ return time;
+}
+
+/*
+ * Find a candidate.
+ */
+static struct csched2_unit *
+runq_candidate(struct csched2_runqueue_data *rqd,
+ struct csched2_unit *scurr,
+ int cpu, s_time_t now,
+ unsigned int *skipped)
+{
+ struct list_head *iter, *temp;
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct csched2_unit *snext = NULL;
+ struct csched2_private *prv = csched2_priv(sr->scheduler);
+ bool yield = false, soft_aff_preempt = false;
+
+ *skipped = 0;
+
+ if ( unlikely(is_idle_unit(scurr->unit)) )
+ {
+ snext = scurr;
+ goto check_runq;
+ }
+
+ yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags);
+
+ /*
+     * Return the current unit if it has executed for less than the
+     * ratelimit. The adjustment of the selected unit's credit, and the
+     * decision about how long it will run, are made in csched2_runtime().
+ *
+ * Note that, if scurr is yielding, we don't let rate limiting kick in.
+ * In fact, it may be the case that scurr is about to spin, and there's
+ * no point forcing it to do so until rate limiting expires.
+ */
+ if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) &&
+ (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) )
+ {
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned runtime;
+ } d;
+ d.dom = scurr->unit->domain->domain_id;
+ d.unit = scurr->unit->unit_id;
+ d.runtime = now - scurr->unit->state_entry_time;
+ __trace_var(TRC_CSCHED2_RATELIMIT, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+ return scurr;
+ }
+
+ /* If scurr has a soft-affinity, let's check whether cpu is part of it */
+ if ( has_soft_affinity(scurr->unit) )
+ {
+ affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY,
+ cpumask_scratch);
+ if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) )
+ {
+ cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain);
+
+ /* Ok, is any of the pcpus in scurr soft-affinity idle? */
+ cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle);
+ cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled);
+ soft_aff_preempt = cpumask_intersects(cpumask_scratch, online);
+ }
+ }
+
+ /*
+ * If scurr is runnable, and this cpu is in its soft-affinity, default to
+ * it. We also default to it, even if cpu is not in its soft-affinity, if
+ * there aren't any idle and not tickled cpu in its soft-affinity. In
+ * fact, we don't want to risk leaving scurr in the runq and this cpu idle
+ * only because scurr is running outside of its soft-affinity.
+ *
+ * On the other hand, if cpu is not in scurr's soft-affinity, and there
+ * looks to be better options, go for them. That happens by defaulting to
+ * idle here, which means scurr will be preempted, put back in runq, and
+ * one of those idle and not tickled cpus from its soft-affinity will be
+ * tickled to pick it up.
+ *
+ * Finally, if scurr does not have a valid soft-affinity, we also let it
+ * continue to run here (in fact, soft_aff_preempt will still be false,
+ * in this case).
+ *
+     * Of course, we also default to idle if scurr is not runnable.
+ */
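+    /*
+     * Concrete example (illustrative): scurr has soft-affinity {2,3} but is
+     * running here on cpu 5, while cpu 2 is idle, not tickled and online.
+     * In that case soft_aff_preempt is true: we default to the idle unit,
+     * and scurr goes back to the runqueue so that cpu 2 can pick it up.
+     */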
+ if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt )
+ snext = scurr;
+ else
+ snext = csched2_unit(sched_idle_unit(cpu));
+
+ check_runq:
+ list_for_each_safe( iter, temp, &rqd->runq )
+ {
+ struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ /* Only consider units that are allowed to run on this processor. */
+ if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) )
+ {
+ (*skipped)++;
+ continue;
+ }
+
+ /*
+         * If a unit is meant to be picked up by another processor, and such
+         * processor has not scheduled yet, leave it in the runqueue for that
+         * processor.
+ */
+ if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
+ cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
+ {
+ (*skipped)++;
+ SCHED_STAT_CRANK(deferred_to_tickled_cpu);
+ continue;
+ }
+
+ /*
+ * If this is on a different processor, don't pull it unless
+ * its credit is at least CSCHED2_MIGRATE_RESIST higher.
+ */
+ if ( sched_unit_master(svc->unit) != cpu
+ && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
+ {
+ (*skipped)++;
+ SCHED_STAT_CRANK(migrate_resisted);
+ continue;
+ }
+
+ /*
+ * If the one in the runqueue has more credit than current (or idle,
+ * if current is not runnable), or if current is yielding, and also
+ * if the one in runqueue either is not capped, or is capped but has
+ * some budget, then choose it.
+ */
+ if ( (yield || svc->credit > snext->credit) &&
+ (!has_cap(svc) || unit_grab_budget(svc)) &&
+ unit_runnable_state(svc->unit) )
+ snext = svc;
+
+ /* In any case, if we got this far, break. */
+ break;
+ }
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned unit:16, dom:16;
+ unsigned tickled_cpu, skipped;
+ int credit;
+ } d;
+ d.dom = snext->unit->domain->domain_id;
+ d.unit = snext->unit->unit_id;
+ d.credit = snext->credit;
+ d.tickled_cpu = snext->tickled_cpu;
+ d.skipped = *skipped;
+ __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
+ SCHED_STAT_CRANK(tickled_cpu_overridden);
+
+ /*
+ * If snext is from a capped domain, it must have budget (or it
+ * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
+ * which still is >= 0.
+ */
+ ASSERT(snext->budget >= 0);
+
+ return snext;
+}
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static void csched2_schedule(
+ const struct scheduler *ops, struct sched_unit *currunit, s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ const unsigned int cur_cpu = smp_processor_id();
+ const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
+ struct csched2_runqueue_data *rqd;
+ struct csched2_unit * const scurr = csched2_unit(currunit);
+ struct csched2_unit *snext = NULL;
+ unsigned int skipped_units = 0;
+ bool tickled;
+ bool migrated = false;
+
+ SCHED_STAT_CRANK(schedule);
+ CSCHED2_UNIT_CHECK(currunit);
+
+ BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized));
+
+ rqd = c2rqd(ops, sched_cpu);
+ BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active));
+
+ ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock));
+
+ BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd);
+
+ /* Clear "tickled" bit now that we've been scheduled */
+ tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled);
+ if ( tickled )
+ {
+ __cpumask_clear_cpu(sched_cpu, &rqd->tickled);
+ cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
+ smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle);
+ }
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ unsigned cpu:16, rq_id:16;
+ unsigned tasklet:8, idle:8, smt_idle:8, tickled:8;
+ } d;
+ d.cpu = cur_cpu;
+ d.rq_id = c2r(sched_cpu);
+ d.tasklet = tasklet_work_scheduled;
+ d.idle = is_idle_unit(currunit);
+ d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle);
+ d.tickled = tickled;
+ __trace_var(TRC_CSCHED2_SCHEDULE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ /* Update credits (and budget, if necessary). */
+ burn_credits(rqd, scurr, now);
+
+ /*
+ * Below 0, means that we are capped and we have overrun our budget.
+ * Let's try to get some more but, if we fail (e.g., because of the
+ * other running units), we will be parked.
+ */
+ if ( unlikely(scurr->budget <= 0) )
+ unit_grab_budget(scurr);
+
+ /*
+     * Select next runnable local UNIT (i.e. top of local runq).
+     *
+     * If the current unit is runnable, and has higher credit than
+     * the next guy on the queue (or there is no one else), we want to
+     * run him again.
+     *
+     * If there's tasklet work to do, we want to choose the idle unit
+     * for this processor, and mark the current for delayed runqueue
+     * add.
+     *
+     * If the current unit is runnable, and there's another runnable
+     * candidate, we want to mark current for delayed runqueue add,
+     * and remove the next guy from the queue.
+     *
+     * If the current unit is not runnable, we want to choose the idle
+     * unit for this processor.
+ */
+ if ( tasklet_work_scheduled )
+ {
+ __clear_bit(__CSFLAG_unit_yield, &scurr->flags);
+ trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL);
+ snext = csched2_unit(sched_idle_unit(sched_cpu));
+ }
+ else
+ snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units);
+
+ /* If switching from a non-idle runnable unit, put it
+ * back on the runqueue. */
+ if ( snext != scurr
+ && !is_idle_unit(currunit)
+ && unit_runnable(currunit) )
+ __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
+
+ /* Accounting for non-idle tasks */
+ if ( !is_idle_unit(snext->unit) )
+ {
+ /* If switching, remove this from the runqueue and mark it scheduled */
+ if ( snext != scurr )
+ {
+ ASSERT(snext->rqd == rqd);
+ ASSERT(!snext->unit->is_running);
+
+ runq_remove(snext);
+ __set_bit(__CSFLAG_scheduled, &snext->flags);
+ }
+
+ /* Clear the idle mask if necessary */
+ if ( cpumask_test_cpu(sched_cpu, &rqd->idle) )
+ {
+ __cpumask_clear_cpu(sched_cpu, &rqd->idle);
+ smt_idle_mask_clear(sched_cpu, &rqd->smt_idle);
+ }
+
+ /*
+ * The reset condition is "has a scheduler epoch come to an end?".
+ * The way this is enforced is checking whether the unit at the top
+ * of the runqueue has negative credits. This means the epochs have
+         * variable length, as an epoch expires when:
+ * 1) the unit at the top of the runqueue has executed for
+ * around 10 ms (with default parameters);
+ * 2) no other unit with higher credits wants to run.
+ *
+ * Here, where we want to check for reset, we need to make sure the
+         * proper unit is being used. In fact, runq_candidate() may have
+ * not returned the first unit in the runqueue, for various reasons
+ * (e.g., affinity). Only trigger a reset when it does.
+ */
+ if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
+ {
+ reset_credit(ops, sched_cpu, now, snext);
+ balance_load(ops, sched_cpu, now);
+ }
+
+ snext->start_time = now;
+ snext->tickled_cpu = -1;
+
+ /* Safe because lock for old processor is held */
+ if ( sched_unit_master(snext->unit) != sched_cpu )
+ {
+ snext->credit += CSCHED2_MIGRATE_COMPENSATION;
+ sched_set_res(snext->unit, get_sched_res(sched_cpu));
+ SCHED_STAT_CRANK(migrated);
+ migrated = true;
+ }
+ }
+ else
+ {
+ /*
+ * Update the idle mask if necessary. Note that, if we're scheduling
+ * idle in order to carry on some tasklet work, we want to play busy!
+ */
+ if ( tasklet_work_scheduled )
+ {
+ if ( cpumask_test_cpu(sched_cpu, &rqd->idle) )
+ {
+ __cpumask_clear_cpu(sched_cpu, &rqd->idle);
+ smt_idle_mask_clear(sched_cpu, &rqd->smt_idle);
+ }
+ }
+ else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) )
+ {
+ __cpumask_set_cpu(sched_cpu, &rqd->idle);
+ cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
+ smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle);
+ }
+ /* Make sure avgload gets updated periodically even
+ * if there's no activity */
+ update_load(ops, rqd, NULL, 0, now);
+ }
+
+ /*
+ * Return task to run next...
+ */
+ currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now);
+ currunit->next_task = snext->unit;
+ snext->unit->migrated = migrated;
+
+ CSCHED2_UNIT_CHECK(currunit->next_task);
+}
+
+static void
+csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc)
+{
+ printk("[%i.%i] flags=%x cpu=%i",
+ svc->unit->domain->domain_id,
+ svc->unit->unit_id,
+ svc->flags,
+ sched_unit_master(svc->unit));
+
+ printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
+
+ if ( has_cap(svc) )
+ printk(" budget=%"PRI_stime"(%"PRI_stime")",
+ svc->budget, svc->budget_quota);
+
+ printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
+ (svc->avgload * 100) >> prv->load_precision_shift);
+
+ printk("\n");
+}
+
+static inline void
+dump_pcpu(const struct scheduler *ops, int cpu)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ struct csched2_unit *svc;
+
+ printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n",
+ cpu, c2r(cpu),
+ CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
+ CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
+
+ /* current UNIT (nothing to say if that's the idle unit) */
+ svc = csched2_unit(curr_on_cpu(cpu));
+ if ( svc && !is_idle_unit(svc->unit) )
+ {
+ printk("\trun: ");
+ csched2_dump_unit(prv, svc);
+ }
+}
+
+static void
+csched2_dump(const struct scheduler *ops)
+{
+ struct list_head *iter_sdom;
+ struct csched2_private *prv = csched2_priv(ops);
+ unsigned long flags;
+ unsigned int i, j, loop;
+
+ /*
+ * We need the private scheduler lock as we access global
+ * scheduler data and (below) the list of active domains.
+ */
+ read_lock_irqsave(&prv->lock, flags);
+
+ printk("Active queues: %d\n"
+ "\tdefault-weight = %d\n",
+ cpumask_weight(&prv->active_queues),
+ CSCHED2_DEFAULT_WEIGHT);
+ for_each_cpu(i, &prv->active_queues)
+ {
+ s_time_t fraction;
+
+ fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift;
+
+ printk("Runqueue %d:\n"
+ "\tncpus = %u\n"
+ "\tcpus = %*pbl\n"
+ "\tmax_weight = %u\n"
+ "\tpick_bias = %u\n"
+ "\tinstload = %d\n"
+ "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n",
+ i,
+ prv->rqd[i].nr_cpus,
+ CPUMASK_PR(&prv->rqd[i].active),
+ prv->rqd[i].max_weight,
+ prv->rqd[i].pick_bias,
+ prv->rqd[i].load,
+ prv->rqd[i].avgload,
+ fraction);
+
+ printk("\tidlers: %*pb\n"
+ "\ttickled: %*pb\n"
+ "\tfully idle cores: %*pb\n",
+ CPUMASK_PR(&prv->rqd[i].idle),
+ CPUMASK_PR(&prv->rqd[i].tickled),
+ CPUMASK_PR(&prv->rqd[i].smt_idle));
+ }
+
+ printk("Domain info:\n");
+ loop = 0;
+ list_for_each( iter_sdom, &prv->sdom )
+ {
+ struct csched2_dom *sdom;
+ struct sched_unit *unit;
+
+ sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
+
+ printk("\tDomain: %d w %d c %u v %d\n",
+ sdom->dom->domain_id,
+ sdom->weight,
+ sdom->cap,
+ sdom->nr_units);
+
+ for_each_sched_unit ( sdom->dom, unit )
+ {
+ struct csched2_unit * const svc = csched2_unit(unit);
+ spinlock_t *lock;
+
+ lock = unit_schedule_lock(unit);
+
+ printk("\t%3d: ", ++loop);
+ csched2_dump_unit(prv, svc);
+
+ unit_schedule_unlock(lock, unit);
+ }
+ }
+
+ for_each_cpu(i, &prv->active_queues)
+ {
+ struct csched2_runqueue_data *rqd = prv->rqd + i;
+ struct list_head *iter, *runq = &rqd->runq;
+ int loop = 0;
+
+ /* We need the lock to scan the runqueue. */
+ spin_lock(&rqd->lock);
+
+ printk("Runqueue %d:\n", i);
+
+ for_each_cpu(j, &rqd->active)
+ dump_pcpu(ops, j);
+
+ printk("RUNQ:\n");
+ list_for_each( iter, runq )
+ {
+ struct csched2_unit *svc = runq_elem(iter);
+
+ if ( svc )
+ {
+ printk("\t%3d: ", loop++);
+ csched2_dump_unit(prv, svc);
+ }
+ }
+ spin_unlock(&rqd->lock);
+ }
+
+ read_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void *
+csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+ struct csched2_pcpu *spc;
+
+ spc = xzalloc(struct csched2_pcpu);
+ if ( spc == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ /* Not in any runqueue yet */
+ spc->runq_id = -1;
+
+ return spc;
+}
+
+/* Returns the ID of the runqueue the cpu is assigned to. */
+static unsigned
+init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc,
+ unsigned int cpu)
+{
+ struct csched2_runqueue_data *rqd;
+ unsigned int rcpu;
+
+ ASSERT(rw_is_write_locked(&prv->lock));
+ ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+ /* CPU data needs to be allocated, but still uninitialized. */
+ ASSERT(spc && spc->runq_id == -1);
+
+ /* Figure out which runqueue to put it in */
+ spc->runq_id = cpu_to_runqueue(prv, cpu);
+
+ rqd = prv->rqd + spc->runq_id;
+
+ printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
+ if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
+ {
+ printk(XENLOG_INFO " First cpu on runqueue, activating\n");
+ activate_runqueue(prv, spc->runq_id);
+ }
+
+ __cpumask_set_cpu(cpu, &spc->sibling_mask);
+
+ if ( rqd->nr_cpus > 0 )
+ for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) )
+ if ( cpumask_test_cpu(rcpu, &rqd->active) )
+ {
+ __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask);
+ __cpumask_set_cpu(rcpu, &spc->sibling_mask);
+ }
+
+ __cpumask_set_cpu(cpu, &rqd->idle);
+ __cpumask_set_cpu(cpu, &rqd->active);
+ __cpumask_set_cpu(cpu, &prv->initialized);
+ __cpumask_set_cpu(cpu, &rqd->smt_idle);
+
+ rqd->nr_cpus++;
+ ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus);
+
+ if ( rqd->nr_cpus == 1 )
+ rqd->pick_bias = cpu;
+
+ return spc->runq_id;
+}
+
+static void
+csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
+{
+ struct csched2_private *prv = csched2_priv(ops);
+ spinlock_t *old_lock;
+ unsigned long flags;
+ unsigned rqi;
+
+ write_lock_irqsave(&prv->lock, flags);
+ old_lock = pcpu_schedule_lock(cpu);
+
+ rqi = init_pdata(prv, pdata, cpu);
+ /* Move the scheduler lock to the new runq lock. */
+ get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock;
+
+ /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
+ spin_unlock(old_lock);
+ write_unlock_irqrestore(&prv->lock, flags);
+}
+
+/* Change the scheduler of cpu to us (Credit2). */
+static spinlock_t *
+csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct csched2_private *prv = csched2_priv(new_ops);
+ struct csched2_unit *svc = vdata;
+ unsigned rqi;
+
+ ASSERT(pdata && svc && is_idle_unit(svc->unit));
+
+ /*
+ * We own one runqueue lock already (from schedule_cpu_switch()). This
+ * looks like it violates this scheduler's locking rules, but it does
+ * not, as what we own is the lock of another scheduler, that hence has
+ * no particular (ordering) relationship with our private global lock.
+ * And owning exactly that one (the lock of the old scheduler of this
+ * cpu) is what is necessary to prevent races.
+ */
+ ASSERT(!local_irq_is_enabled());
+ write_lock(&prv->lock);
+
+ sched_idle_unit(cpu)->priv = vdata;
+
+ rqi = init_pdata(prv, pdata, cpu);
+
+ /*
+ * Now that we know what runqueue we'll go in, double check what's said
+ * above: the lock we already hold is not the one of this runqueue of
+ * this scheduler, and so it's safe to have taken it /before/ our
+ * private global lock.
+ */
+ ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock);
+
+ write_unlock(&prv->lock);
+
+ return &prv->rqd[rqi].lock;
+}
+
+static void
+csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ unsigned long flags;
+ struct csched2_private *prv = csched2_priv(ops);
+ struct csched2_runqueue_data *rqd;
+ struct csched2_pcpu *spc = pcpu;
+ unsigned int rcpu;
+
+ write_lock_irqsave(&prv->lock, flags);
+
+    /*
+     * Scheduler specific data for this pCPU must still be there and be
+     * valid. In fact, if we are here:
+ * 1. alloc_pdata must have been called for this cpu, and free_pdata
+ * must not have been called on it before us,
+ * 2. init_pdata must have been called on this cpu, and deinit_pdata
+ * (us!) must not have been called on it already.
+ */
+ ASSERT(spc && spc->runq_id != -1);
+ ASSERT(cpumask_test_cpu(cpu, &prv->initialized));
+
+ /* Find the old runqueue and remove this cpu from it */
+ rqd = prv->rqd + spc->runq_id;
+
+ /* No need to save IRQs here, they're already disabled */
+ spin_lock(&rqd->lock);
+
+ printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id);
+
+ __cpumask_clear_cpu(cpu, &rqd->idle);
+ __cpumask_clear_cpu(cpu, &rqd->smt_idle);
+ __cpumask_clear_cpu(cpu, &rqd->active);
+
+ for_each_cpu ( rcpu, &rqd->active )
+ __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask);
+
+ rqd->nr_cpus--;
+ ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus);
+
+ if ( rqd->nr_cpus == 0 )
+ {
+ printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
+ deactivate_runqueue(prv, spc->runq_id);
+ }
+ else if ( rqd->pick_bias == cpu )
+ rqd->pick_bias = cpumask_first(&rqd->active);
+
+ spc->runq_id = -1;
+
+ spin_unlock(&rqd->lock);
+
+ __cpumask_clear_cpu(cpu, &prv->initialized);
+
+ write_unlock_irqrestore(&prv->lock, flags);
+
+ return;
+}
+
+static void
+csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ struct csched2_pcpu *spc = pcpu;
+
+ /*
+ * pcpu either points to a valid struct csched2_pcpu, or is NULL (if
+     * CPU bringup failed, and we're being called from CPU_UP_CANCELLED).
+ * xfree() does not really mind, but we want to be sure that either
+ * init_pdata has never been called, or deinit_pdata has been called
+ * already.
+ */
+ ASSERT(!pcpu || spc->runq_id == -1);
+ ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));
+
+ xfree(pcpu);
+}
+
+static int __init
+csched2_global_init(void)
+{
+ if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
+ {
+ printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n",
+ __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN);
+ opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN;
+ }
+
+ if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT )
+ {
+ printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n",
+ __func__, opt_load_window_shift);
+ opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
+ }
+
+ if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
+ {
+ printk("WARNING: %s: opt_cap_period %u too small, resetting\n",
+ __func__, opt_cap_period);
+ opt_cap_period = 10; /* ms */
+ }
+
+ return 0;
+}
+
+static int
+csched2_init(struct scheduler *ops)
+{
+ int i;
+ struct csched2_private *prv;
+
+ printk("Initializing Credit2 scheduler\n");
+
+ printk(XENLOG_INFO " load_precision_shift: %d\n"
+ XENLOG_INFO " load_window_shift: %d\n"
+ XENLOG_INFO " underload_balance_tolerance: %d\n"
+ XENLOG_INFO " overload_balance_tolerance: %d\n"
+ XENLOG_INFO " runqueues arrangement: %s\n"
+ XENLOG_INFO " cap enforcement granularity: %dms\n",
+ opt_load_precision_shift,
+ opt_load_window_shift,
+ opt_underload_balance_tolerance,
+ opt_overload_balance_tolerance,
+ opt_runqueue_str[opt_runqueue],
+ opt_cap_period);
+
+ printk(XENLOG_INFO "load tracking window length %llu ns\n",
+ 1ULL << opt_load_window_shift);
+
+ /*
+ * Basically no CPU information is available at this point; just
+ * set up basic structures, and a callback when the CPU info is
+ * available.
+ */
+
+ prv = xzalloc(struct csched2_private);
+ if ( prv == NULL )
+ return -ENOMEM;
+ ops->sched_data = prv;
+
+ rwlock_init(&prv->lock);
+ INIT_LIST_HEAD(&prv->sdom);
+
+ /* Allocate all runqueues and mark them as un-initialized */
+ prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids);
+ if ( !prv->rqd )
+ {
+ xfree(prv);
+ return -ENOMEM;
+ }
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ prv->rqd[i].id = -1;
+
+ /* initialize ratelimit */
+ prv->ratelimit_us = sched_ratelimit_us;
+
+ prv->load_precision_shift = opt_load_precision_shift;
+ prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT;
+ ASSERT(opt_load_window_shift > 0);
+
+ return 0;
+}
+
+static void
+csched2_deinit(struct scheduler *ops)
+{
+ struct csched2_private *prv;
+
+ prv = csched2_priv(ops);
+ ops->sched_data = NULL;
+ if ( prv )
+ xfree(prv->rqd);
+ xfree(prv);
+}
+
+static const struct scheduler sched_credit2_def = {
+ .name = "SMP Credit Scheduler rev2",
+ .opt_name = "credit2",
+ .sched_id = XEN_SCHEDULER_CREDIT2,
+ .sched_data = NULL,
+
+ .global_init = csched2_global_init,
+
+ .insert_unit = csched2_unit_insert,
+ .remove_unit = csched2_unit_remove,
+
+ .sleep = csched2_unit_sleep,
+ .wake = csched2_unit_wake,
+ .yield = csched2_unit_yield,
+
+ .adjust = csched2_dom_cntl,
+ .adjust_affinity= csched2_aff_cntl,
+ .adjust_global = csched2_sys_cntl,
+
+ .pick_resource = csched2_res_pick,
+ .migrate = csched2_unit_migrate,
+ .do_schedule = csched2_schedule,
+ .context_saved = csched2_context_saved,
+
+ .dump_settings = csched2_dump,
+ .init = csched2_init,
+ .deinit = csched2_deinit,
+ .alloc_udata = csched2_alloc_udata,
+ .free_udata = csched2_free_udata,
+ .alloc_pdata = csched2_alloc_pdata,
+ .init_pdata = csched2_init_pdata,
+ .deinit_pdata = csched2_deinit_pdata,
+ .free_pdata = csched2_free_pdata,
+ .switch_sched = csched2_switch_sched,
+ .alloc_domdata = csched2_alloc_domdata,
+ .free_domdata = csched2_free_domdata,
+};
+
+REGISTER_SCHEDULER(sched_credit2_def);
--- /dev/null
+/*
+ * xen/common/sched_null.c
+ *
+ * Copyright (c) 2017, Dario Faggioli, Citrix Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * The 'null' scheduler always chooses to run, on each pCPU, either nothing
+ * (i.e., the pCPU stays idle) or always the same unit.
+ *
+ * It is aimed at supporting static scenarios, where there are always
+ * fewer units than pCPUs (and the units don't need to move among pCPUs
+ * for any reason), with the least possible overhead.
+ *
+ * Typical use cases are embedded applications, but also HPC, especially
+ * when the scheduler is used inside a cpupool.
+ */
+
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+
+/*
+ * null tracing events. Check include/public/trace.h for more details.
+ */
+#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1)
+#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2)
+#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
+#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4)
+#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5)
+#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6)
+
+/*
+ * Locking:
+ * - Scheduler-lock (a.k.a. runqueue lock):
+ * + is per-pCPU;
+ * + serializes assignment and deassignment of units to a pCPU.
+ * - Private data lock (a.k.a. private scheduler lock):
+ * + is scheduler-wide;
+ * + serializes accesses to the list of domains in this scheduler.
+ * - Waitqueue lock:
+ * + is scheduler-wide;
+ *   + serializes accesses to the list of units waiting to be assigned
+ * to pCPUs.
+ *
+ * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH,
+ * waitqueue lock nests inside runqueue lock which nests inside private
+ * lock. More specifically:
+ *  + if we need both runqueue and private locks, we must acquire the
+ *    private lock first;
+ *  + if we need both runqueue and waitqueue locks, we must acquire
+ *    the runqueue lock first;
+ *  + if we need both private and waitqueue locks, we must acquire
+ *    the private lock first;
+ * + if we already own a runqueue lock, we must never acquire
+ * the private lock;
+ * + if we already own the waitqueue lock, we must never acquire
+ * the runqueue lock or the private lock.
+ */
+
+/*
+ * System-wide private data
+ */
+struct null_private {
+ spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */
+ struct list_head ndom; /* Domains of this scheduler */
+ struct list_head waitq; /* units not assigned to any pCPU */
+ spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */
+ cpumask_t cpus_free; /* CPUs without a unit associated to them */
+};
+
+/*
+ * Physical CPU
+ */
+struct null_pcpu {
+ struct sched_unit *unit;   /* unit currently assigned here; NULL if the pCPU is free */
+};
+DEFINE_PER_CPU(struct null_pcpu, npc);
+
+/*
+ * Schedule unit
+ */
+struct null_unit {
+ struct list_head waitq_elem;   /* on the waitqueue, if not assigned to a pCPU */
+ struct sched_unit *unit;       /* the sched_unit this private data belongs to */
+};
+
+/*
+ * Domain
+ */
+struct null_dom {
+ struct list_head ndom_elem;    /* on the null_private.ndom list */
+ struct domain *dom;            /* the domain this private data belongs to */
+};
+
+/*
+ * Accessor helper functions
+ */
+static inline struct null_private *null_priv(const struct scheduler *ops)
+{
+ return ops->sched_data;
+}
+
+static inline struct null_unit *null_unit(const struct sched_unit *unit)
+{
+ return unit->priv;
+}
+
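+/*
+ * Check whether @cpu is a suitable pCPU for @unit at the given affinity
+ * balance step, i.e., whether it lies in the intersection of the unit's
+ * (hard or soft) affinity and the online pCPUs of the unit's cpupool.
+ */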
+static inline bool unit_check_affinity(struct sched_unit *unit,
+ unsigned int cpu,
+ unsigned int balance_step)
+{
+ affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu));
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ cpupool_domain_master_cpumask(unit->domain));
+
+ return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
+}
+
+static int null_init(struct scheduler *ops)
+{
+ struct null_private *prv;
+
+ printk("Initializing null scheduler\n"
+ "WARNING: This is experimental software in development.\n"
+ "Use at your own risk.\n");
+
+ prv = xzalloc(struct null_private);
+ if ( prv == NULL )
+ return -ENOMEM;
+
+ spin_lock_init(&prv->lock);
+ spin_lock_init(&prv->waitq_lock);
+ INIT_LIST_HEAD(&prv->ndom);
+ INIT_LIST_HEAD(&prv->waitq);
+
+ ops->sched_data = prv;
+
+ return 0;
+}
+
+static void null_deinit(struct scheduler *ops)
+{
+ xfree(ops->sched_data);
+ ops->sched_data = NULL;
+}
+
+static void init_pdata(struct null_private *prv, unsigned int cpu)
+{
+ /* Mark the pCPU as free, and with no unit assigned */
+ cpumask_set_cpu(cpu, &prv->cpus_free);
+ per_cpu(npc, cpu).unit = NULL;
+}
+
+static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
+{
+ struct null_private *prv = null_priv(ops);
+
+ /* alloc_pdata is not implemented, so we want this to be NULL. */
+ ASSERT(!pdata);
+
+ init_pdata(prv, cpu);
+}
+
+static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ struct null_private *prv = null_priv(ops);
+
+ /* alloc_pdata not implemented, so this must have stayed NULL */
+ ASSERT(!pcpu);
+
+ cpumask_clear_cpu(cpu, &prv->cpus_free);
+ per_cpu(npc, cpu).unit = NULL;
+}
+
+static void *null_alloc_udata(const struct scheduler *ops,
+ struct sched_unit *unit, void *dd)
+{
+ struct null_unit *nvc;
+
+ nvc = xzalloc(struct null_unit);
+ if ( nvc == NULL )
+ return NULL;
+
+ INIT_LIST_HEAD(&nvc->waitq_elem);
+ nvc->unit = unit;
+
+ SCHED_STAT_CRANK(unit_alloc);
+
+ return nvc;
+}
+
+static void null_free_udata(const struct scheduler *ops, void *priv)
+{
+ struct null_unit *nvc = priv;
+
+ xfree(nvc);
+}
+
+static void * null_alloc_domdata(const struct scheduler *ops,
+ struct domain *d)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_dom *ndom;
+ unsigned long flags;
+
+ ndom = xzalloc(struct null_dom);
+ if ( ndom == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ ndom->dom = d;
+
+ spin_lock_irqsave(&prv->lock, flags);
+ list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ return ndom;
+}
+
+static void null_free_domdata(const struct scheduler *ops, void *data)
+{
+ struct null_dom *ndom = data;
+ struct null_private *prv = null_priv(ops);
+
+ if ( ndom )
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+ list_del_init(&ndom->ndom_elem);
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ xfree(ndom);
+ }
+}
+
+/*
+ * unit to pCPU assignment and placement. This _only_ happens:
+ * - on insert,
+ * - on migrate.
+ *
+ * Insert occurs when a unit joins this scheduler for the first time
+ * (e.g., when the domain it's part of is moved to the scheduler's
+ * cpupool).
+ *
+ * Migration may be necessary if a pCPU (with a unit assigned to it)
+ * is removed from the scheduler's cpupool.
+ *
+ * So this is not part of any hot path.
+ */
+static struct sched_resource *
+pick_res(struct null_private *prv, const struct sched_unit *unit)
+{
+ unsigned int bs;
+ unsigned int cpu = sched_unit_master(unit), new_cpu;
+ cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain);
+
+ ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
+
+ for_each_affinity_balance_step( bs )
+ {
+ if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
+ continue;
+
+ affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);
+
+ /*
+ * If our processor is free, or we are assigned to it, and it is also
+ * still valid and part of our affinity, just go for it.
+ * (Note that we may call unit_check_affinity(), but we deliberately
+ * don't, so we get to keep in the scratch cpumask what we have just
+ * put in it.)
+ */
+ if ( likely((per_cpu(npc, cpu).unit == NULL ||
+ per_cpu(npc, cpu).unit == unit)
+ && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
+ {
+ new_cpu = cpu;
+ goto out;
+ }
+
+ /* If not, just go for a free pCPU, within our affinity, if any */
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &prv->cpus_free);
+ new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));
+
+ if ( likely(new_cpu != nr_cpu_ids) )
+ goto out;
+ }
+
+ /*
+ * If we didn't find any free pCPU, just pick any valid pcpu, even if
+ * it has another unit assigned. This will happen during shutdown and
+ * suspend/resume, but it may also happen during "normal operation", if
+ * all the pCPUs are busy.
+ *
+ * In fact, there must always be something sane in v->processor, or
+ * unit_schedule_lock() and friends won't work. This is not a problem,
+ * as we will actually assign the unit to the pCPU we return from here,
+ * only if the pCPU is free.
+ */
+ cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity);
+ new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));
+
+ out:
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint16_t unit, dom;
+ uint32_t new_cpu;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.new_cpu = new_cpu;
+ __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d);
+ }
+
+ return get_sched_res(new_cpu);
+}
+
+static void unit_assign(struct null_private *prv, struct sched_unit *unit,
+ unsigned int cpu)
+{
+ ASSERT(is_unit_online(unit));
+
+ per_cpu(npc, cpu).unit = unit;
+ sched_set_res(unit, get_sched_res(cpu));
+ cpumask_clear_cpu(cpu, &prv->cpus_free);
+
+ dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint16_t unit, dom;
+ uint32_t cpu;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.cpu = cpu;
+ __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d);
+ }
+}
+
+/* Returns true if a cpu was tickled */
+static bool unit_deassign(struct null_private *prv, struct sched_unit *unit)
+{
+ unsigned int bs;
+ unsigned int cpu = sched_unit_master(unit);
+ struct null_unit *wvc;
+
+ ASSERT(list_empty(&null_unit(unit)->waitq_elem));
+ ASSERT(per_cpu(npc, cpu).unit == unit);
+ ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free));
+
+ per_cpu(npc, cpu).unit = NULL;
+ cpumask_set_cpu(cpu, &prv->cpus_free);
+
+ dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain,
+ unit->unit_id);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint16_t unit, dom;
+ uint32_t cpu;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.cpu = cpu;
+ __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d);
+ }
+
+ spin_lock(&prv->waitq_lock);
+
+ /*
+ * The unit was assigned to this pCPU; now that the pCPU is free, let's
+ * see if there is someone waiting, suitable to be assigned to it
+ * (prioritizing units that have soft-affinity with cpu).
+ */
+ for_each_affinity_balance_step( bs )
+ {
+ list_for_each_entry( wvc, &prv->waitq, waitq_elem )
+ {
+ if ( bs == BALANCE_SOFT_AFFINITY &&
+ !has_soft_affinity(wvc->unit) )
+ continue;
+
+ if ( unit_check_affinity(wvc->unit, cpu, bs) )
+ {
+ list_del_init(&wvc->waitq_elem);
+ unit_assign(prv, wvc->unit, cpu);
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ spin_unlock(&prv->waitq_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&prv->waitq_lock);
+
+ return false;
+}
+
+/* Change the scheduler of cpu to us (null). */
+static spinlock_t *null_switch_sched(struct scheduler *new_ops,
+ unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct null_private *prv = null_priv(new_ops);
+ struct null_unit *nvc = vdata;
+
+ ASSERT(nvc && is_idle_unit(nvc->unit));
+
+ sched_idle_unit(cpu)->priv = vdata;
+
+ /*
+ * We are holding the runqueue lock already (it's been taken in
+ * schedule_cpu_switch()). It actually may or may not be the 'right'
+ * one for this cpu, but that is ok for preventing races.
+ */
+ ASSERT(!local_irq_is_enabled());
+
+ init_pdata(prv, cpu);
+
+ return &sr->_lock;
+}
+
+static void null_unit_insert(const struct scheduler *ops,
+ struct sched_unit *unit)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *nvc = null_unit(unit);
+ unsigned int cpu;
+ spinlock_t *lock;
+
+ ASSERT(!is_idle_unit(unit));
+
+ lock = unit_schedule_lock_irq(unit);
+
+ if ( unlikely(!is_unit_online(unit)) )
+ {
+ unit_schedule_unlock_irq(lock, unit);
+ return;
+ }
+
+ retry:
+ sched_set_res(unit, pick_res(prv, unit));
+ cpu = sched_unit_master(unit);
+
+ spin_unlock(lock);
+
+ lock = unit_schedule_lock(unit);
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(unit->domain));
+
+ /* If the pCPU is free, we assign unit to it */
+ if ( likely(per_cpu(npc, cpu).unit == NULL) )
+ {
+ /*
+ * Insert is followed by vcpu_wake(), so there's no need to poke
+ * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
+ */
+ unit_assign(prv, unit, cpu);
+ }
+ else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
+ {
+ /*
+ * If the pCPU is not free (e.g., because we raced with another
+ * insert or a migrate), but there are other free pCPUs, we can
+ * try to pick again.
+ */
+ goto retry;
+ }
+ else
+ {
+ /*
+ * If the pCPU is not free, and there aren't any (valid) others,
+ * we have no alternative but to go into the waitqueue.
+ */
+ spin_lock(&prv->waitq_lock);
+ list_add_tail(&nvc->waitq_elem, &prv->waitq);
+ dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n",
+ unit->domain, unit->unit_id);
+ spin_unlock(&prv->waitq_lock);
+ }
+ spin_unlock_irq(lock);
+
+ SCHED_STAT_CRANK(unit_insert);
+}
+
+static void null_unit_remove(const struct scheduler *ops,
+ struct sched_unit *unit)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *nvc = null_unit(unit);
+ spinlock_t *lock;
+
+ ASSERT(!is_idle_unit(unit));
+
+ lock = unit_schedule_lock_irq(unit);
+
+ /* If offline, the unit shouldn't be assigned, nor in the waitqueue */
+ if ( unlikely(!is_unit_online(unit)) )
+ {
+ ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit);
+ ASSERT(list_empty(&nvc->waitq_elem));
+ goto out;
+ }
+
+ /* If unit is in waitqueue, just get it out of there and bail */
+ if ( unlikely(!list_empty(&nvc->waitq_elem)) )
+ {
+ spin_lock(&prv->waitq_lock);
+ list_del_init(&nvc->waitq_elem);
+ spin_unlock(&prv->waitq_lock);
+
+ goto out;
+ }
+
+ unit_deassign(prv, unit);
+
+ out:
+ unit_schedule_unlock_irq(lock, unit);
+
+ SCHED_STAT_CRANK(unit_remove);
+}
+
+static void null_unit_wake(const struct scheduler *ops,
+ struct sched_unit *unit)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *nvc = null_unit(unit);
+ unsigned int cpu = sched_unit_master(unit);
+
+ ASSERT(!is_idle_unit(unit));
+
+ if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
+ {
+ SCHED_STAT_CRANK(unit_wake_running);
+ return;
+ }
+
+ if ( unlikely(!list_empty(&nvc->waitq_elem)) )
+ {
+ /* Not exactly "on runq", but close enough for reusing the counter */
+ SCHED_STAT_CRANK(unit_wake_onrunq);
+ return;
+ }
+
+ if ( likely(unit_runnable(unit)) )
+ SCHED_STAT_CRANK(unit_wake_runnable);
+ else
+ SCHED_STAT_CRANK(unit_wake_not_runnable);
+
+ if ( likely(per_cpu(npc, cpu).unit == unit) )
+ {
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ return;
+ }
+
+ /*
+ * If a unit is neither on a pCPU nor in the waitqueue, it means it was
+ * offline, and that it is now coming back online. If we're lucky,
+ * and its previous resource is free (and affinities match), we can just
+ * assign the unit to it (we own the proper lock already) and be done.
+ */
+ if ( per_cpu(npc, cpu).unit == NULL &&
+ unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) )
+ {
+ if ( !has_soft_affinity(unit) ||
+ unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) )
+ {
+ unit_assign(prv, unit, cpu);
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ return;
+ }
+ }
+
+ /*
+ * If the resource is not free (or affinities do not match) we need
+ * to assign unit to some other one, but we can't do it here, as:
+ * - we don't own the proper lock,
+ * - we can't change v->processor under vcpu_wake()'s feet.
+ * So we add it to the waitqueue, and tickle all the free CPUs (if any)
+ * on which unit can run. The first one that schedules will pick it up.
+ */
+ spin_lock(&prv->waitq_lock);
+ list_add_tail(&nvc->waitq_elem, &prv->waitq);
+ spin_unlock(&prv->waitq_lock);
+
+ cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+ cpupool_domain_master_cpumask(unit->domain));
+ cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ &prv->cpus_free);
+
+ if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
+ dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
+ unit->domain->domain_id, unit->unit_id);
+ else
+ cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ);
+}
+
+static void null_unit_sleep(const struct scheduler *ops,
+ struct sched_unit *unit)
+{
+ struct null_private *prv = null_priv(ops);
+ unsigned int cpu = sched_unit_master(unit);
+ bool tickled = false;
+
+ ASSERT(!is_idle_unit(unit));
+
+ /*
+ * Check if the unit is in the process of being offlined. If yes,
+ * we need to remove it from either its pCPU or the waitqueue.
+ */
+ if ( unlikely(!is_unit_online(unit)) )
+ {
+ struct null_unit *nvc = null_unit(unit);
+
+ if ( unlikely(!list_empty(&nvc->waitq_elem)) )
+ {
+ spin_lock(&prv->waitq_lock);
+ list_del_init(&nvc->waitq_elem);
+ spin_unlock(&prv->waitq_lock);
+ }
+ else if ( per_cpu(npc, cpu).unit == unit )
+ tickled = unit_deassign(prv, unit);
+ }
+
+ /* If unit is not assigned to a pCPU, or is not running, no need to bother */
+ if ( likely(!tickled && curr_on_cpu(cpu) == unit) )
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+
+ SCHED_STAT_CRANK(unit_sleep);
+}
+
+static struct sched_resource *
+null_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ ASSERT(!is_idle_unit(unit));
+ return pick_res(null_priv(ops), unit);
+}
+
+static void null_unit_migrate(const struct scheduler *ops,
+ struct sched_unit *unit, unsigned int new_cpu)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *nvc = null_unit(unit);
+
+ ASSERT(!is_idle_unit(unit));
+
+ if ( sched_unit_master(unit) == new_cpu )
+ return;
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint16_t unit, dom;
+ uint16_t cpu, new_cpu;
+ } d;
+ d.dom = unit->domain->domain_id;
+ d.unit = unit->unit_id;
+ d.cpu = sched_unit_master(unit);
+ d.new_cpu = new_cpu;
+ __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d);
+ }
+
+ /*
+ * If unit is assigned to a pCPU, then such pCPU becomes free, and we
+ * should look in the waitqueue if anyone else can be assigned to it.
+ */
+ if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) )
+ {
+ unit_deassign(prv, unit);
+ SCHED_STAT_CRANK(migrate_running);
+ }
+ else if ( !list_empty(&nvc->waitq_elem) )
+ SCHED_STAT_CRANK(migrate_on_runq);
+
+ SCHED_STAT_CRANK(migrated);
+
+ /*
+ * If a unit is (going) offline, we want it to be neither assigned
+ * to a pCPU, nor in the waitqueue.
+ *
+ * If it was on a cpu, we've removed it from there above. If it is
+ * in the waitqueue, we remove it from there now. And then we bail.
+ */
+ if ( unlikely(!is_unit_online(unit)) )
+ {
+ spin_lock(&prv->waitq_lock);
+ list_del_init(&nvc->waitq_elem);
+ spin_unlock(&prv->waitq_lock);
+ goto out;
+ }
+
+ /*
+ * Let's now consider new_cpu, which is where unit is being sent. It can be
+ * either free, or have a unit already assigned to it.
+ *
+ * In the former case we should assign unit to it, and try to get it to run,
+ * if possible, according to affinity.
+ *
+ * In the latter case, all we can do is park the unit in the waitqueue.
+ */
+ if ( per_cpu(npc, new_cpu).unit == NULL &&
+ unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) )
+ {
+ /* unit might have been in the waitqueue, so remove it */
+ spin_lock(&prv->waitq_lock);
+ list_del_init(&nvc->waitq_elem);
+ spin_unlock(&prv->waitq_lock);
+
+ unit_assign(prv, unit, new_cpu);
+ }
+ else
+ {
+ /* Put unit in the waitqueue, if it wasn't there already */
+ spin_lock(&prv->waitq_lock);
+ if ( list_empty(&nvc->waitq_elem) )
+ {
+ list_add_tail(&nvc->waitq_elem, &prv->waitq);
+ dprintk(XENLOG_G_WARNING,
+ "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain,
+ unit->unit_id);
+ }
+ spin_unlock(&prv->waitq_lock);
+ }
+
+ /*
+ * Whatever the outcome of all the above, we always at least override
+ * v->processor (i.e., the unit's assigned resource).
+ * This is especially important on the shutdown or suspend/resume paths,
+ * where we must let our caller (cpu_disable_scheduler()) know that the
+ * migration did happen, to the best of our possibilities at least.
+ * In case of suspend, any temporary inconsistency caused by this will
+ * be fixed up during resume.
+ */
+ out:
+ sched_set_res(unit, get_sched_res(new_cpu));
+}
+
+#ifndef NDEBUG
+static inline void null_unit_check(struct sched_unit *unit)
+{
+ struct null_unit * const nvc = null_unit(unit);
+ struct null_dom * const ndom = unit->domain->sched_priv;
+
+ BUG_ON(nvc->unit != unit);
+
+ if ( ndom )
+ BUG_ON(is_idle_unit(unit));
+ else
+ BUG_ON(!is_idle_unit(unit));
+
+ SCHED_STAT_CRANK(unit_check);
+}
+#define NULL_UNIT_CHECK(unit) (null_unit_check(unit))
+#else
+#define NULL_UNIT_CHECK(unit)
+#endif
+
+
+/*
+ * The simplest scheduling function of all time! We either return:
+ * - the unit assigned to the pCPU, if there's one and it can run;
+ * - the idle unit, otherwise.
+ */
+static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
+ s_time_t now, bool tasklet_work_scheduled)
+{
+ unsigned int bs;
+ const unsigned int cur_cpu = smp_processor_id();
+ const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *wvc;
+
+ SCHED_STAT_CRANK(schedule);
+ NULL_UNIT_CHECK(current->sched_unit);
+
+ if ( unlikely(tb_init_done) )
+ {
+ struct {
+ uint16_t tasklet, cpu;
+ int16_t unit, dom;
+ } d;
+ d.cpu = cur_cpu;
+ d.tasklet = tasklet_work_scheduled;
+ if ( per_cpu(npc, sched_cpu).unit == NULL )
+ {
+ d.unit = d.dom = -1;
+ }
+ else
+ {
+ d.unit = per_cpu(npc, sched_cpu).unit->unit_id;
+ d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id;
+ }
+ __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d);
+ }
+
+ if ( tasklet_work_scheduled )
+ {
+ trace_var(TRC_SNULL_TASKLET, 1, 0, NULL);
+ prev->next_task = sched_idle_unit(sched_cpu);
+ }
+ else
+ prev->next_task = per_cpu(npc, sched_cpu).unit;
+ prev->next_time = -1;
+
+ /*
+ * We may be new in the cpupool, or just coming back online. In that
+ * case, there may be units in the waitqueue that we can assign to this
+ * pCPU and run.
+ */
+ if ( unlikely(prev->next_task == NULL) )
+ {
+ bool unit_found;
+
+ spin_lock(&prv->waitq_lock);
+
+ if ( list_empty(&prv->waitq) )
+ goto unlock;
+
+ /*
+ * We scan the waitqueue twice, for prioritizing units that have
+ * soft-affinity with cpu. This may look like something expensive to
+ * do here in null_schedule(), but it's actually fine, because we do
+ * it only in cases where a pcpu has no unit associated (e.g., as
+ * said above, the cpu has just joined a cpupool).
+ */
+ unit_found = false;
+ for_each_affinity_balance_step( bs )
+ {
+ list_for_each_entry( wvc, &prv->waitq, waitq_elem )
+ {
+ if ( bs == BALANCE_SOFT_AFFINITY &&
+ !has_soft_affinity(wvc->unit) )
+ continue;
+
+ if ( unit_check_affinity(wvc->unit, sched_cpu, bs) )
+ {
+ spinlock_t *lock;
+
+ unit_found = true;
+
+ /*
+ * If the unit in the waitqueue has just come up online,
+ * we risk racing with vcpu_wake(). To avoid this, sync
+ * on the spinlock that vcpu_wake() holds, but only with
+ * trylock, to avoid deadlock.
+ */
+ lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit));
+
+ /*
+ * We know the vcpu's lock is not this resource's lock. In
+ * fact, if it were, since this cpu is free, vcpu_wake()
+ * would have assigned the unit to here directly.
+ */
+ ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock);
+
+ if ( lock )
+ {
+ unit_assign(prv, wvc->unit, sched_cpu);
+ list_del_init(&wvc->waitq_elem);
+ prev->next_task = wvc->unit;
+ spin_unlock(lock);
+ goto unlock;
+ }
+ }
+ }
+ }
+ /*
+ * If we did find a unit with suitable affinity in the waitqueue, but
+ * we could not pick it up (due to lock contention), and hence we are
+ * still free, plan for another try. In fact, we don't want such unit
+ * to be stuck in the waitqueue, when there are free cpus where it
+ * could run.
+ */
+ if ( unlikely( unit_found && prev->next_task == NULL &&
+ !list_empty(&prv->waitq)) )
+ cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ);
+ unlock:
+ spin_unlock(&prv->waitq_lock);
+
+ if ( prev->next_task == NULL &&
+ !cpumask_test_cpu(sched_cpu, &prv->cpus_free) )
+ cpumask_set_cpu(sched_cpu, &prv->cpus_free);
+ }
+
+ if ( unlikely(prev->next_task == NULL ||
+ !unit_runnable_state(prev->next_task)) )
+ prev->next_task = sched_idle_unit(sched_cpu);
+
+ NULL_UNIT_CHECK(prev->next_task);
+
+ prev->next_task->migrated = false;
+}
+
+static inline void dump_unit(struct null_private *prv, struct null_unit *nvc)
+{
+ printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id,
+ nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ?
+ sched_unit_master(nvc->unit) : -1);
+}
+
+static void null_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+ struct null_private *prv = null_priv(ops);
+ struct null_unit *nvc;
+ spinlock_t *lock;
+ unsigned long flags;
+
+ lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+ printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}",
+ cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
+ CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
+ if ( per_cpu(npc, cpu).unit != NULL )
+ printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain,
+ per_cpu(npc, cpu).unit->unit_id);
+ printk("\n");
+
+ /* current unit (nothing to say if that's the idle unit) */
+ nvc = null_unit(curr_on_cpu(cpu));
+ if ( nvc && !is_idle_unit(nvc->unit) )
+ {
+ printk("\trun: ");
+ dump_unit(prv, nvc);
+ printk("\n");
+ }
+
+ pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
+}
+
+static void null_dump(const struct scheduler *ops)
+{
+ struct null_private *prv = null_priv(ops);
+ struct list_head *iter;
+ unsigned long flags;
+ unsigned int loop;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free));
+
+ printk("Domain info:\n");
+ loop = 0;
+ list_for_each( iter, &prv->ndom )
+ {
+ struct null_dom *ndom;
+ struct sched_unit *unit;
+
+ ndom = list_entry(iter, struct null_dom, ndom_elem);
+
+ printk("\tDomain: %d\n", ndom->dom->domain_id);
+ for_each_sched_unit( ndom->dom, unit )
+ {
+ struct null_unit * const nvc = null_unit(unit);
+ spinlock_t *lock;
+
+ lock = unit_schedule_lock(unit);
+
+ printk("\t%3d: ", ++loop);
+ dump_unit(prv, nvc);
+ printk("\n");
+
+ unit_schedule_unlock(lock, unit);
+ }
+ }
+
+ printk("Waitqueue: ");
+ loop = 0;
+ spin_lock(&prv->waitq_lock);
+ list_for_each( iter, &prv->waitq )
+ {
+ struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem);
+
+ if ( loop++ != 0 )
+ printk(", ");
+ if ( loop % 24 == 0 )
+ printk("\n\t");
+ printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id);
+ }
+ printk("\n");
+ spin_unlock(&prv->waitq_lock);
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static const struct scheduler sched_null_def = {
+ .name = "null Scheduler",
+ .opt_name = "null",
+ .sched_id = XEN_SCHEDULER_NULL,
+ .sched_data = NULL,
+
+ .init = null_init,
+ .deinit = null_deinit,
+ .init_pdata = null_init_pdata,
+ .switch_sched = null_switch_sched,
+ .deinit_pdata = null_deinit_pdata,
+
+ .alloc_udata = null_alloc_udata,
+ .free_udata = null_free_udata,
+ .alloc_domdata = null_alloc_domdata,
+ .free_domdata = null_free_domdata,
+
+ .insert_unit = null_unit_insert,
+ .remove_unit = null_unit_remove,
+
+ .wake = null_unit_wake,
+ .sleep = null_unit_sleep,
+ .pick_resource = null_res_pick,
+ .migrate = null_unit_migrate,
+ .do_schedule = null_schedule,
+
+ .dump_cpu_state = null_dump_pcpu,
+ .dump_settings = null_dump,
+};
+
+REGISTER_SCHEDULER(sched_null_def);
--- /dev/null
+/*****************************************************************************
+ * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
+ * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
+ *
+ * by Sisu Xi, 2013, Washington University in Saint Louis
+ * Meng Xu, 2014-2016, University of Pennsylvania
+ *
+ * Conversion toward event driven model by Tianyang Chen
+ * and Dagaen Golomb, 2016, University of Pennsylvania
+ *
+ * Based on the code of the credit scheduler.
+ */
+
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/cpu.h>
+#include <xen/keyhandler.h>
+#include <xen/err.h>
+#include <xen/guest_access.h>
+
+/*
+ * TODO:
+ *
+ * Migration compensation and resist like credit2 to better use cache;
+ * Lock Holder Problem, using yield?
+ * Self switch problem: UNITs of the same domain may preempt each other;
+ */
+
+/*
+ * Design:
+ *
+ * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
+ * theory from the real-time field.
+ * At any scheduling point, the UNIT with the earlier deadline has higher
+ * priority. The scheduler always picks the highest-priority UNIT to run on a
+ * feasible PCPU. A PCPU is feasible if the UNIT can run on it and (the PCPU
+ * is idle or has a lower-priority UNIT running on it).
+ *
+ * Each UNIT has a dedicated period, a budget and an extratime flag.
+ * The deadline of a UNIT is at the end of each period;
+ * A UNIT has its budget replenished at the beginning of each period;
+ * While scheduled, a UNIT burns its budget.
+ * The UNIT needs to finish its budget before its deadline in each period;
+ * The UNIT discards its unused budget at the end of each period.
+ * When a UNIT runs out of budget in a period, if its extratime flag is set,
+ * the UNIT increases its priority_level by 1 and refills its budget; otherwise,
+ * it has to wait until the next period.
+ *
+ * Each UNIT is implemented as a deferrable server.
+ * When a UNIT has a task running on it, its budget is continuously burned;
+ * When a UNIT has no task but budget left, its budget is preserved.
+ *
+ * Queue scheme:
+ * A global runqueue and a global depletedqueue for each CPU pool.
+ * The runqueue holds all runnable UNITs with budget,
+ * sorted by priority_level and deadline;
+ * The depletedqueue holds all UNITs without budget, unsorted;
+ *
+ * Note: cpumasks and cpupools are supported.
+ */
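+
+/*
+ * Illustrative example (numbers not taken from the code): a UNIT with
+ * period 10 ms, budget 4 ms and extratime clear may consume at most 4 ms
+ * of CPU time in every 10 ms window; once its budget is depleted, it sits
+ * on the DepletedQ until the replenishment at the start of its next period.
+ */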
+
+/*
+ * Locking:
+ * A global system lock is used to protect the RunQ and DepletedQ.
+ * The global lock is referenced by sched_res->schedule_lock
+ * from all physical cpus.
+ *
+ * The lock is already grabbed when calling the wake/sleep/schedule functions
+ * in schedule.c.
+ *
+ * The functions that involve the RunQ and need to grab the locks are:
+ * unit_insert, unit_remove, context_saved, runq_insert
+ */
+
+
+/*
+ * Default parameters:
+ * The default period and budget are 10 ms and 4 ms, respectively.
+ */
+#define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
+#define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
+
+/*
+ * Max period: max delta of the time type, because the period is added to the
+ * time a unit activates, so this must not overflow.
+ * Min period: 10 us, considering the scheduling overhead (when period is
+ * too low, scheduling is invoked too frequently, causing high overhead).
+ */
+#define RTDS_MAX_PERIOD (STIME_DELTA_MAX)
+#define RTDS_MIN_PERIOD (MICROSECS(10))
+
+/*
+ * Min budget: 10 us, considering the scheduling overhead (when budget is
+ * consumed too fast, scheduling is invoked too frequently, causing
+ * high overhead).
+ */
+#define RTDS_MIN_BUDGET (MICROSECS(10))
+
+/*
+ * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
+ * the next deadline, performing addition could be faster if the difference
+ * between cur_deadline and now is small. If the difference is bigger than
+ * 1024 * period, use multiplication.
+ */
+#define UPDATE_LIMIT_SHIFT 10
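+
+/*
+ * Purely as an illustration: with a 10 ms period, the repeated-addition
+ * path in rt_update_deadline() is used while the deadline lags 'now' by
+ * less than 1024 * 10 ms (about 10.24 s); larger gaps take the division
+ * path instead.
+ */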
+
+/*
+ * Flags
+ */
+/*
+ * RTDS_scheduled: Is this unit either running on, or context-switching off,
+ * a physical cpu?
+ * + Accessed only with global lock held.
+ * + Set when chosen as next in rt_schedule().
+ * + Cleared after context switch has been saved in rt_context_saved()
+ * + Checked in unit_wake to see if we can add to the Runqueue, or if we should
+ * set RTDS_delayed_runq_add
+ * + Checked to be false in runq_insert.
+ */
+#define __RTDS_scheduled 1
+#define RTDS_scheduled (1<<__RTDS_scheduled)
+/*
+ * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
+ * once it's done being context switching out?
+ * + Set when scheduling out in rt_schedule() if prev is runnable
+ * + Set in rt_unit_wake if it finds RTDS_scheduled set
+ * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
+ * and clears the bit.
+ */
+#define __RTDS_delayed_runq_add 2
+#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
+
+/*
+ * RTDS_depleted: Has this unit run out of budget?
+ * This flag is
+ * + set in burn_budget() if a unit has zero budget left;
+ * + cleared and checked in the replenishment handler,
+ * for the units that are being replenished.
+ */
+#define __RTDS_depleted 3
+#define RTDS_depleted (1<<__RTDS_depleted)
+
+/*
+ * RTDS_extratime: Can the unit run in the time that is
+ * not part of any real-time reservation, and would therefore
+ * be otherwise left idle?
+ */
+#define __RTDS_extratime 4
+#define RTDS_extratime (1<<__RTDS_extratime)
+
+/*
+ * rt tracing events ("only" 512 available!). Check
+ * include/public/trace.h for more details.
+ */
+#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
+#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
+#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
+#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
+#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
+#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6)
+
+static void repl_timer_handler(void *data);
+
+/*
+ * System-wide private data, including the global RunQueue/DepletedQ.
+ * Global lock is referenced by sched_res->schedule_lock from all
+ * physical cpus. It can be grabbed via unit_schedule_lock_irq()
+ */
+struct rt_private {
+ spinlock_t lock; /* the global coarse-grained lock */
+ struct list_head sdom; /* list of available domains, used for dump */
+
+ struct list_head runq; /* ordered list of runnable units */
+ struct list_head depletedq; /* unordered list of depleted units */
+
+ struct timer repl_timer; /* replenishment timer */
+ struct list_head replq; /* ordered list of units that need replenishment */
+
+ cpumask_t tickled; /* cpus been tickled */
+};
+
+/*
+ * Scheduling unit (UNIT)
+ */
+struct rt_unit {
+ struct list_head q_elem; /* on the runq/depletedq list */
+ struct list_head replq_elem; /* on the replenishment events list */
+
+ /* UNIT parameters, in nanoseconds */
+ s_time_t period;
+ s_time_t budget;
+
+ /* UNIT current information, in nanoseconds */
+ s_time_t cur_budget; /* current budget */
+ s_time_t last_start; /* last start time */
+ s_time_t cur_deadline; /* current deadline for EDF */
+
+ /* Up-pointers */
+ struct rt_dom *sdom;
+ struct sched_unit *unit;
+
+ unsigned priority_level;    /* extratime refills so far in this period; higher means lower priority */
+
+ unsigned flags; /* marks __RTDS_scheduled, etc. */
+};
+
+/*
+ * Domain
+ */
+struct rt_dom {
+ struct list_head sdom_elem; /* link on rt_priv's sdom list */
+ struct domain *dom; /* pointer to upper domain */
+};
+
+/*
+ * Useful inline functions
+ */
+static inline struct rt_private *rt_priv(const struct scheduler *ops)
+{
+ return ops->sched_data;
+}
+
+static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
+{
+ return unit->priv;
+}
+
+static inline struct list_head *rt_runq(const struct scheduler *ops)
+{
+ return &rt_priv(ops)->runq;
+}
+
+static inline struct list_head *rt_depletedq(const struct scheduler *ops)
+{
+ return &rt_priv(ops)->depletedq;
+}
+
+static inline struct list_head *rt_replq(const struct scheduler *ops)
+{
+ return &rt_priv(ops)->replq;
+}
+
+static inline bool has_extratime(const struct rt_unit *svc)
+{
+ return svc->flags & RTDS_extratime;
+}
+
+/*
+ * Helper functions for manipulating the runqueue, the depleted queue,
+ * and the replenishment events queue.
+ */
+static int
+unit_on_q(const struct rt_unit *svc)
+{
+ return !list_empty(&svc->q_elem);
+}
+
+static struct rt_unit *
+q_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct rt_unit, q_elem);
+}
+
+static struct rt_unit *
+replq_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct rt_unit, replq_elem);
+}
+
+static int
+unit_on_replq(const struct rt_unit *svc)
+{
+ return !list_empty(&svc->replq_elem);
+}
+
+/*
+ * Returns a positive value if v1 has strictly higher priority than v2
+ * (lower priority_level, or same level and earlier deadline), zero if
+ * they are equal, and a negative value otherwise.
+ */
+static s_time_t
+compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
+{
+ int prio = v2->priority_level - v1->priority_level;
+
+ if ( prio == 0 )
+ return v2->cur_deadline - v1->cur_deadline;
+
+ return prio;
+}
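+
+/*
+ * Example (illustrative): with equal priority_level, comparing a unit
+ * whose cur_deadline is 20 ms against one at 30 ms returns 30 - 20 > 0,
+ * i.e., the earlier-deadline unit is treated as higher priority, as EDF
+ * requires.
+ */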
+
+/*
+ * Debug related code, dump unit/cpu information
+ */
+static void
+rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
+{
+ cpumask_t *cpupool_mask, *mask;
+
+ ASSERT(svc != NULL);
+ /* idle unit */
+ if ( svc->sdom == NULL )
+ {
+ printk("\n");
+ return;
+ }
+
+ /*
+ * We can't just use 'cpumask_scratch' because the dumping can
+ * happen from a pCPU outside of this scheduler's cpupool, and
+ * hence it's not right to use its pCPU's scratch mask.
+ * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
+ * own scratch space, since we hold the runqueue lock.
+ */
+ mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));
+
+ cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
+ cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
+ printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
+ " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
+ " \t\t priority_level=%d has_extratime=%d\n"
+ " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
+ svc->unit->domain->domain_id,
+ svc->unit->unit_id,
+ sched_unit_master(svc->unit),
+ svc->period,
+ svc->budget,
+ svc->cur_budget,
+ svc->cur_deadline,
+ svc->last_start,
+ svc->priority_level,
+ has_extratime(svc),
+ unit_on_q(svc),
+ unit_runnable(svc->unit),
+ svc->flags, CPUMASK_PR(mask));
+}
+
+static void
+rt_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct rt_unit *svc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+ printk("CPU[%02d]\n", cpu);
+ /* current UNIT (nothing to say if that's the idle unit). */
+ svc = rt_unit(curr_on_cpu(cpu));
+ if ( svc && !is_idle_unit(svc->unit) )
+ {
+ rt_dump_unit(ops, svc);
+ }
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void
+rt_dump(const struct scheduler *ops)
+{
+ struct list_head *runq, *depletedq, *replq, *iter;
+ struct rt_private *prv = rt_priv(ops);
+ struct rt_unit *svc;
+ struct rt_dom *sdom;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ if ( list_empty(&prv->sdom) )
+ goto out;
+
+ runq = rt_runq(ops);
+ depletedq = rt_depletedq(ops);
+ replq = rt_replq(ops);
+
+ printk("Global RunQueue info:\n");
+ list_for_each ( iter, runq )
+ {
+ svc = q_elem(iter);
+ rt_dump_unit(ops, svc);
+ }
+
+ printk("Global DepletedQueue info:\n");
+ list_for_each ( iter, depletedq )
+ {
+ svc = q_elem(iter);
+ rt_dump_unit(ops, svc);
+ }
+
+ printk("Global Replenishment Events info:\n");
+ list_for_each ( iter, replq )
+ {
+ svc = replq_elem(iter);
+ rt_dump_unit(ops, svc);
+ }
+
+ printk("Domain info:\n");
+ list_for_each ( iter, &prv->sdom )
+ {
+ struct sched_unit *unit;
+
+ sdom = list_entry(iter, struct rt_dom, sdom_elem);
+ printk("\tdomain: %d\n", sdom->dom->domain_id);
+
+ for_each_sched_unit ( sdom->dom, unit )
+ {
+ svc = rt_unit(unit);
+ rt_dump_unit(ops, svc);
+ }
+ }
+
+ out:
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+/*
+ * Update deadline and budget when now >= cur_deadline;
+ * the deadline needs to be updated to that of the current period.
+ */
+static void
+rt_update_deadline(s_time_t now, struct rt_unit *svc)
+{
+ ASSERT(now >= svc->cur_deadline);
+ ASSERT(svc->period != 0);
+
+ if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
+ {
+ do
+ svc->cur_deadline += svc->period;
+ while ( svc->cur_deadline <= now );
+ }
+ else
+ {
+ long count = ((now - svc->cur_deadline) / svc->period) + 1;
+ svc->cur_deadline += count * svc->period;
+ }
+
+ /*
+ * svc may be scheduled to run immediately after it misses its deadline.
+ * Then rt_update_deadline() is called before rt_schedule(), which
+ * should only deduct the time spent in the current period from the budget.
+ */
+ svc->last_start = now;
+ svc->cur_budget = svc->budget;
+ svc->priority_level = 0;
+
+ /* TRACE */
+ {
+ struct __packed {
+ unsigned unit:16, dom:16;
+ unsigned priority_level;
+ uint64_t cur_deadline, cur_budget;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.priority_level = svc->priority_level;
+ d.cur_deadline = (uint64_t) svc->cur_deadline;
+ d.cur_budget = (uint64_t) svc->cur_budget;
+ trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
+ sizeof(d),
+ (unsigned char *) &d);
+ }
+
+ return;
+}
+
+/*
+ * Helpers for removing and inserting an unit in a queue
+ * that is being kept ordered by the units' deadlines (as EDF
+ * mandates).
+ *
+ * For callers' convenience, the unit removing helper returns
+ * true if the unit removed was the one at the front of the
+ * queue; similarly, the inserting helper returns true if the
+ * inserted one ended up at the front of the queue (i.e., in both
+ * cases, if the unit with the earliest deadline is what we
+ * are dealing with).
+ */
+static inline bool
+deadline_queue_remove(struct list_head *queue, struct list_head *elem)
+{
+ int pos = 0;
+
+ if ( queue->next != elem )
+ pos = 1;
+
+ list_del_init(elem);
+ return !pos;
+}
+
+static inline bool
+deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *),
+ struct rt_unit *svc, struct list_head *elem,
+ struct list_head *queue)
+{
+ struct list_head *iter;
+ int pos = 0;
+
+ list_for_each ( iter, queue )
+ {
+ struct rt_unit * iter_svc = (*qelem)(iter);
+ if ( compare_unit_priority(svc, iter_svc) > 0 )
+ break;
+ pos++;
+ }
+ list_add_tail(elem, iter);
+ return !pos;
+}
+#define deadline_runq_insert(...) \
+ deadline_queue_insert(&q_elem, ##__VA_ARGS__)
+#define deadline_replq_insert(...) \
+ deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
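+/*
+ * The two wrappers above just pass the proper element accessor (q_elem()
+ * or replq_elem()) to deadline_queue_insert(), so the same ordered
+ * insertion logic serves both the RunQ and the replenishment queue.
+ */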
+
+static inline void
+q_remove(struct rt_unit *svc)
+{
+ ASSERT( unit_on_q(svc) );
+ list_del_init(&svc->q_elem);
+}
+
+static inline void
+replq_remove(const struct scheduler *ops, struct rt_unit *svc)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct list_head *replq = rt_replq(ops);
+
+ ASSERT( unit_on_replq(svc) );
+
+ if ( deadline_queue_remove(replq, &svc->replq_elem) )
+ {
+ /*
+ * The replenishment timer needs to be set to fire when a
+ * replenishment for the unit at the front of the replenishment
+ * queue is due. If it is such unit that we just removed, we may
+ * need to reprogram the timer.
+ */
+ if ( !list_empty(replq) )
+ {
+ struct rt_unit *svc_next = replq_elem(replq->next);
+ set_timer(&prv->repl_timer, svc_next->cur_deadline);
+ }
+ else
+ stop_timer(&prv->repl_timer);
+ }
+}
+
+/*
+ * Insert svc with budget in RunQ according to EDF:
+ * units with smaller deadlines go first.
+ * Insert svc without budget in DepletedQ unsorted;
+ */
+static void
+runq_insert(const struct scheduler *ops, struct rt_unit *svc)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct list_head *runq = rt_runq(ops);
+
+ ASSERT( spin_is_locked(&prv->lock) );
+ ASSERT( !unit_on_q(svc) );
+ ASSERT( unit_on_replq(svc) );
+
+ /* add svc to runq if svc still has budget or its extratime is set */
+ if ( svc->cur_budget > 0 ||
+ has_extratime(svc) )
+ deadline_runq_insert(svc, &svc->q_elem, runq);
+ else
+ list_add(&svc->q_elem, &prv->depletedq);
+}
+
+static void
+replq_insert(const struct scheduler *ops, struct rt_unit *svc)
+{
+ struct list_head *replq = rt_replq(ops);
+ struct rt_private *prv = rt_priv(ops);
+
+ ASSERT( !unit_on_replq(svc) );
+
+ /*
+ * The timer may be re-programmed if svc is inserted
+ * at the front of the event list.
+ */
+ if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
+ set_timer(&prv->repl_timer, svc->cur_deadline);
+}
+
+/*
+ * Removes and re-inserts an event to the replenishment queue.
+ * The aim is to update its position inside the queue, as its
+ * deadline (and hence its replenishment time) could have
+ * changed.
+ */
+static void
+replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
+{
+ struct list_head *replq = rt_replq(ops);
+ struct rt_unit *rearm_svc = svc;
+ bool_t rearm = 0;
+
+ ASSERT( unit_on_replq(svc) );
+
+ /*
+ * If svc was at the front of the replenishment queue, we certainly
+ * need to re-program the timer, and we want to use the deadline of
+ * the unit which is now at the front of the queue (which may still
+ * be svc or not).
+ *
+ * We may also need to re-program, if svc has been put at the front
+ * of the replenishment queue when being re-inserted.
+ */
+ if ( deadline_queue_remove(replq, &svc->replq_elem) )
+ {
+ deadline_replq_insert(svc, &svc->replq_elem, replq);
+ rearm_svc = replq_elem(replq->next);
+ rearm = 1;
+ }
+ else
+ rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
+
+ if ( rearm )
+ set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
+}
+
+/*
+ * Pick a valid resource for the unit.
+ * A valid resource for a unit is the intersection of the unit's affinity
+ * and the available resources.
+ */
+static struct sched_resource *
+rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ cpumask_t cpus;
+ cpumask_t *online;
+ int cpu;
+
+ online = cpupool_domain_master_cpumask(unit->domain);
+ cpumask_and(&cpus, online, unit->cpu_hard_affinity);
+
+ cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus)
+ ? sched_unit_master(unit)
+ : cpumask_cycle(sched_unit_master(unit), &cpus);
+ ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
+
+ return get_sched_res(cpu);
+}
+
+/*
+ * Init/Free related code
+ */
+static int
+rt_init(struct scheduler *ops)
+{
+ int rc = -ENOMEM;
+ struct rt_private *prv = xzalloc(struct rt_private);
+
+ printk("Initializing RTDS scheduler\n"
+ "WARNING: This is experimental software in development.\n"
+ "Use at your own risk.\n");
+
+ if ( prv == NULL )
+ goto err;
+
+ spin_lock_init(&prv->lock);
+ INIT_LIST_HEAD(&prv->sdom);
+ INIT_LIST_HEAD(&prv->runq);
+ INIT_LIST_HEAD(&prv->depletedq);
+ INIT_LIST_HEAD(&prv->replq);
+
+ ops->sched_data = prv;
+ rc = 0;
+
+ err:
+ if ( rc )
+ xfree(prv);
+
+ return rc;
+}
+
+static void
+rt_deinit(struct scheduler *ops)
+{
+ struct rt_private *prv = rt_priv(ops);
+
+ ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
+ prv->repl_timer.status == TIMER_STATUS_killed);
+
+ ops->sched_data = NULL;
+ xfree(prv);
+}
+
+/*
+ * Point the per_cpu spinlock to the global system lock;
+ * all cpus share the same global system lock.
+ */
+static void
+rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
+{
+ struct rt_private *prv = rt_priv(ops);
+ spinlock_t *old_lock;
+ unsigned long flags;
+
+ old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+ /*
+ * TIMER_STATUS_invalid means we are the first cpu that sees the timer
+ * allocated but not initialized, and so it's up to us to initialize it.
+ */
+ if ( prv->repl_timer.status == TIMER_STATUS_invalid )
+ {
+ init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu);
+ dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
+ }
+
+ /* Move the scheduler lock to our global runqueue lock. */
+ get_sched_res(cpu)->schedule_lock = &prv->lock;
+
+ /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
+ spin_unlock_irqrestore(old_lock, flags);
+}
+
+/* Change the scheduler of cpu to us (RTDS). */
+static spinlock_t *
+rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct rt_private *prv = rt_priv(new_ops);
+ struct rt_unit *svc = vdata;
+
+ ASSERT(!pdata && svc && is_idle_unit(svc->unit));
+
+ /*
+ * We are holding the runqueue lock already (it's been taken in
+ * schedule_cpu_switch()). It's actually the runqueue lock of
+ * another scheduler, but that is how things need to be, for
+ * preventing races.
+ */
+ ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);
+
+ /*
+ * If we are the absolute first cpu being switched toward this
+ * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
+ * first one that is added back to the cpupool that had all its cpus
+ * removed (in which case we'll see TIMER_STATUS_killed), it's our
+ * job to (re)initialize the timer.
+ */
+ if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
+ prv->repl_timer.status == TIMER_STATUS_killed )
+ {
+ init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
+ dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
+ }
+
+ sched_idle_unit(cpu)->priv = vdata;
+
+ return &prv->lock;
+}
+
+static void
+rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ unsigned long flags;
+ struct rt_private *prv = rt_priv(ops);
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ if ( prv->repl_timer.cpu == cpu )
+ {
+ cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid;
+ unsigned int new_cpu = cpumask_cycle(cpu, online);
+
+ /*
+ * Make sure the timer runs on one of the cpus that are still available
+ * to this scheduler. If there aren't any left, it's time to just
+ * kill it.
+ */
+ if ( new_cpu >= nr_cpu_ids )
+ {
+ kill_timer(&prv->repl_timer);
+ dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
+ }
+ else
+ {
+ migrate_timer(&prv->repl_timer, new_cpu);
+ }
+ }
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void *
+rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+ unsigned long flags;
+ struct rt_dom *sdom;
+ struct rt_private * prv = rt_priv(ops);
+
+ sdom = xzalloc(struct rt_dom);
+ if ( sdom == NULL )
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&sdom->sdom_elem);
+ sdom->dom = dom;
+
+ /* spinlock here to insert the dom */
+ spin_lock_irqsave(&prv->lock, flags);
+ list_add_tail(&sdom->sdom_elem, &(prv->sdom));
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ return sdom;
+}
+
+static void
+rt_free_domdata(const struct scheduler *ops, void *data)
+{
+ struct rt_dom *sdom = data;
+ struct rt_private *prv = rt_priv(ops);
+
+ if ( sdom )
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+ list_del_init(&sdom->sdom_elem);
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ xfree(sdom);
+ }
+}
+
+static void *
+rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
+{
+ struct rt_unit *svc;
+
+ /* Allocate per-UNIT info */
+ svc = xzalloc(struct rt_unit);
+ if ( svc == NULL )
+ return NULL;
+
+ INIT_LIST_HEAD(&svc->q_elem);
+ INIT_LIST_HEAD(&svc->replq_elem);
+ svc->flags = 0U;
+ svc->sdom = dd;
+ svc->unit = unit;
+ svc->last_start = 0;
+
+ __set_bit(__RTDS_extratime, &svc->flags);
+ svc->priority_level = 0;
+ svc->period = RTDS_DEFAULT_PERIOD;
+ if ( !is_idle_unit(unit) )
+ svc->budget = RTDS_DEFAULT_BUDGET;
+
+ SCHED_STAT_CRANK(unit_alloc);
+
+ return svc;
+}
+
+static void
+rt_free_udata(const struct scheduler *ops, void *priv)
+{
+ struct rt_unit *svc = priv;
+
+ xfree(svc);
+}
+
+/*
+ * It is called by sched_move_domain() and sched_init_vcpu()
+ * in schedule.c, e.g. when moving a domain to a new cpupool.
+ * It inserts the units of the moving domain into the RunQ of the
+ * destination cpupool's scheduler.
+ */
+static void
+rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct rt_unit *svc = rt_unit(unit);
+ s_time_t now;
+ spinlock_t *lock;
+
+ BUG_ON( is_idle_unit(unit) );
+
+ /* This is safe because unit isn't yet being scheduled */
+ sched_set_res(unit, rt_res_pick(ops, unit));
+
+ lock = unit_schedule_lock_irq(unit);
+
+ now = NOW();
+ if ( now >= svc->cur_deadline )
+ rt_update_deadline(now, svc);
+
+ if ( !unit_on_q(svc) && unit_runnable(unit) )
+ {
+ replq_insert(ops, svc);
+
+ if ( !unit->is_running )
+ runq_insert(ops, svc);
+ }
+ unit_schedule_unlock_irq(lock, unit);
+
+ SCHED_STAT_CRANK(unit_insert);
+}
+
+/*
+ * Remove rt_unit svc from the old scheduler in source cpupool.
+ */
+static void
+rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct rt_unit * const svc = rt_unit(unit);
+ struct rt_dom * const sdom = svc->sdom;
+ spinlock_t *lock;
+
+ SCHED_STAT_CRANK(unit_remove);
+
+ BUG_ON( sdom == NULL );
+
+ lock = unit_schedule_lock_irq(unit);
+ if ( unit_on_q(svc) )
+ q_remove(svc);
+
+ if ( unit_on_replq(svc) )
+ replq_remove(ops,svc);
+
+ unit_schedule_unlock_irq(lock, unit);
+}
+
+/*
+ * Burn budget in nanosecond granularity
+ */
+static void
+burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
+{
+ s_time_t delta;
+
+ /* don't burn budget for idle UNIT */
+ if ( is_idle_unit(svc->unit) )
+ return;
+
+ /* burn at nanoseconds level */
+ delta = now - svc->last_start;
+ /*
+ * delta < 0 only happens in nested virtualization;
+ * TODO: how should we handle delta < 0 in a better way?
+ */
+ if ( delta < 0 )
+ {
+ printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
+ __func__, delta);
+ svc->last_start = now;
+ return;
+ }
+
+ svc->cur_budget -= delta;
+ svc->last_start = now;
+
+ if ( svc->cur_budget <= 0 )
+ {
+ if ( has_extratime(svc) )
+ {
+ svc->priority_level++;
+ svc->cur_budget = svc->budget;
+ }
+ else
+ {
+ svc->cur_budget = 0;
+ __set_bit(__RTDS_depleted, &svc->flags);
+ }
+ }
+
+ /* TRACE */
+ {
+ struct __packed {
+ unsigned unit:16, dom:16;
+ uint64_t cur_budget;
+ int delta;
+ unsigned priority_level;
+ bool has_extratime;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.cur_budget = (uint64_t) svc->cur_budget;
+ d.delta = delta;
+ d.priority_level = svc->priority_level;
+ d.has_extratime = svc->flags & RTDS_extratime;
+ trace_var(TRC_RTDS_BUDGET_BURN, 1,
+ sizeof(d),
+ (unsigned char *) &d);
+ }
+}
+
+/*
+ * The RunQ is sorted. Pick the first unit within the cpumask; if there is
+ * none, return NULL. The lock is grabbed before calling this function.
+ */
+static struct rt_unit *
+runq_pick(const struct scheduler *ops, const cpumask_t *mask)
+{
+ struct list_head *runq = rt_runq(ops);
+ struct list_head *iter;
+ struct rt_unit *svc = NULL;
+ struct rt_unit *iter_svc = NULL;
+ cpumask_t cpu_common;
+ cpumask_t *online;
+
+ list_for_each ( iter, runq )
+ {
+ iter_svc = q_elem(iter);
+
+ /* mask cpu_hard_affinity & cpupool & mask */
+ online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
+ cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity);
+ cpumask_and(&cpu_common, mask, &cpu_common);
+ if ( cpumask_empty(&cpu_common) )
+ continue;
+
+ ASSERT( iter_svc->cur_budget > 0 );
+
+ svc = iter_svc;
+ break;
+ }
+
+ /* TRACE */
+ {
+ if( svc != NULL )
+ {
+ struct __packed {
+ unsigned unit:16, dom:16;
+ uint64_t cur_deadline, cur_budget;
+ } d;
+ d.dom = svc->unit->domain->domain_id;
+ d.unit = svc->unit->unit_id;
+ d.cur_deadline = (uint64_t) svc->cur_deadline;
+ d.cur_budget = (uint64_t) svc->cur_budget;
+ trace_var(TRC_RTDS_RUNQ_PICK, 1,
+ sizeof(d),
+ (unsigned char *) &d);
+ }
+ }
+
+ return svc;
+}
+
+/*
+ * schedule function for rt scheduler.
+ * The lock is already grabbed in schedule.c, no need to lock here
+ */
+static void
+rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
+ s_time_t now, bool tasklet_work_scheduled)
+{
+ const unsigned int cur_cpu = smp_processor_id();
+ const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
+ struct rt_private *prv = rt_priv(ops);
+ struct rt_unit *const scurr = rt_unit(currunit);
+ struct rt_unit *snext = NULL;
+ bool migrated = false;
+
+ /* TRACE */
+ {
+ struct __packed {
+ unsigned cpu:16, tasklet:8, tickled:4, idle:4;
+ } d;
+ d.cpu = cur_cpu;
+ d.tasklet = tasklet_work_scheduled;
+ d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled);
+ d.idle = is_idle_unit(currunit);
+ trace_var(TRC_RTDS_SCHEDULE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ /* clear tickled bit now that we've been scheduled */
+ cpumask_clear_cpu(sched_cpu, &prv->tickled);
+
+ /* burn_budget() returns early for the idle UNIT */
+ burn_budget(ops, scurr, now);
+
+ if ( tasklet_work_scheduled )
+ {
+ trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL);
+ snext = rt_unit(sched_idle_unit(sched_cpu));
+ }
+ else
+ {
+ snext = runq_pick(ops, cpumask_of(sched_cpu));
+
+ if ( snext == NULL )
+ snext = rt_unit(sched_idle_unit(sched_cpu));
+ else if ( !unit_runnable_state(snext->unit) )
+ {
+ q_remove(snext);
+ snext = rt_unit(sched_idle_unit(sched_cpu));
+ }
+
+ /* if scurr has higher priority and budget, still pick scurr */
+ if ( !is_idle_unit(currunit) &&
+ unit_runnable_state(currunit) &&
+ scurr->cur_budget > 0 &&
+ ( is_idle_unit(snext->unit) ||
+ compare_unit_priority(scurr, snext) > 0 ) )
+ snext = scurr;
+ }
+
+ if ( snext != scurr &&
+ !is_idle_unit(currunit) &&
+ unit_runnable(currunit) )
+ __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
+
+ snext->last_start = now;
+ currunit->next_time = -1; /* if an idle unit is picked */
+ if ( !is_idle_unit(snext->unit) )
+ {
+ if ( snext != scurr )
+ {
+ q_remove(snext);
+ __set_bit(__RTDS_scheduled, &snext->flags);
+ }
+ if ( sched_unit_master(snext->unit) != sched_cpu )
+ {
+ sched_set_res(snext->unit, get_sched_res(sched_cpu));
+ migrated = true;
+ }
+ /* Invoke the scheduler next time. */
+ currunit->next_time = snext->cur_budget;
+ }
+ currunit->next_task = snext->unit;
+ snext->unit->migrated = migrated;
+}
+
+/*
+ * Remove UNIT from RunQ
+ * The lock is already grabbed in schedule.c, no need to lock here
+ */
+static void
+rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct rt_unit * const svc = rt_unit(unit);
+
+ BUG_ON( is_idle_unit(unit) );
+ SCHED_STAT_CRANK(unit_sleep);
+
+ if ( curr_on_cpu(sched_unit_master(unit)) == unit )
+ cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+ else if ( unit_on_q(svc) )
+ {
+ q_remove(svc);
+ replq_remove(ops, svc);
+ }
+ else if ( svc->flags & RTDS_delayed_runq_add )
+ __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
+}
+
+/*
+ * Pick a cpu on which to run a unit,
+ * possibly kicking out the unit running there.
+ * Called by wake() and context_saved().
+ * We have a running candidate here, the kick logic is:
+ * Among all the cpus that are within the cpu affinity:
+ * 1) if there are any idle CPUs, kick one.
+ * For cache benefit, we check new->cpu first;
+ * 2) now all pcpus are busy;
+ * among all the running units, pick the lowest-priority one;
+ * if snext has higher priority, kick it.
+ *
+ * TODO:
+ * 1) what if these two units belong to the same domain?
+ * replacing a unit belonging to the same domain introduces more overhead
+ *
+ * The lock is grabbed before calling this function.
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_unit *new)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
+ struct rt_unit *iter_svc;
+ struct sched_unit *iter_unit;
+ int cpu = 0, cpu_to_tickle = 0;
+ cpumask_t not_tickled;
+ cpumask_t *online;
+
+ if ( new == NULL || is_idle_unit(new->unit) )
+ return;
+
+ online = cpupool_domain_master_cpumask(new->unit->domain);
+ cpumask_and(¬_tickled, online, new->unit->cpu_hard_affinity);
+ cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled);
+
+ /*
+ * 1) If there are any idle CPUs, kick one.
+ * For cache benefit, we first search new->cpu.
+ * The same loop also finds the one with the lowest priority.
+ */
+ cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), ¬_tickled);
+ while ( cpu != nr_cpu_ids )
+ {
+ iter_unit = curr_on_cpu(cpu);
+ if ( is_idle_unit(iter_unit) )
+ {
+ SCHED_STAT_CRANK(tickled_idle_cpu);
+ cpu_to_tickle = cpu;
+ goto out;
+ }
+ iter_svc = rt_unit(iter_unit);
+ if ( latest_deadline_unit == NULL ||
+ compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
+ latest_deadline_unit = iter_svc;
+
+ cpumask_clear_cpu(cpu, ¬_tickled);
+ cpu = cpumask_cycle(cpu, ¬_tickled);
+ }
+
+ /* 2) candidate has higher priority, kick out the lowest priority unit */
+ if ( latest_deadline_unit != NULL &&
+ compare_unit_priority(latest_deadline_unit, new) < 0 )
+ {
+ SCHED_STAT_CRANK(tickled_busy_cpu);
+ cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
+ goto out;
+ }
+
+ /* didn't tickle any cpu */
+ SCHED_STAT_CRANK(tickled_no_cpu);
+ return;
+ out:
+ /* TRACE */
+ {
+ struct {
+ unsigned cpu:16, pad:16;
+ } d;
+ d.cpu = cpu_to_tickle;
+ d.pad = 0;
+ trace_var(TRC_RTDS_TICKLE, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
+ cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
+ return;
+}
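runq_tickle() leans on compare_unit_priority(), whose RTDS semantics are essentially earliest-deadline-first (the real function also accounts for depleted budget and the extratime flag, which are left out here). Below is a self-contained worked sketch of just the EDF core; the toy_* names are made up for illustration.

/* Simplified EDF comparison, mirroring the "earlier deadline == higher
 * priority" rule runq_tickle depends on. */
#include <stdint.h>
#include <stdio.h>

typedef int64_t s_time_t;

struct toy_rt_unit { s_time_t cur_deadline; };

/* >0 if a has higher priority than b, <0 if lower, 0 if equal. */
static int toy_compare_priority(const struct toy_rt_unit *a,
                                const struct toy_rt_unit *b)
{
    if ( a->cur_deadline < b->cur_deadline )
        return 1;
    if ( a->cur_deadline > b->cur_deadline )
        return -1;
    return 0;
}

int main(void)
{
    struct toy_rt_unit new = { .cur_deadline = 5000000 };     /* 5ms */
    struct toy_rt_unit running = { .cur_deadline = 9000000 }; /* 9ms */

    /* The running unit has the later deadline (lower priority); since
     * "new" beats it, runq_tickle would kick that pCPU. */
    printf("kick running unit: %s\n",
           toy_compare_priority(&running, &new) < 0 ? "yes" : "no");
    return 0;
}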
+
+/*
+ * Should always wake up a runnable unit and put it back on the RunQ.
+ * Check priority to decide whether to raise an interrupt.
+ * The lock is already grabbed in schedule.c, no need to lock here.
+ * TODO: what if these two units belong to the same domain?
+ */
+static void
+rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct rt_unit * const svc = rt_unit(unit);
+ s_time_t now;
+ bool_t missed;
+
+ BUG_ON( is_idle_unit(unit) );
+
+ if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
+ {
+ SCHED_STAT_CRANK(unit_wake_running);
+ return;
+ }
+
+ /* on RunQ/DepletedQ, just updating the info is enough */
+ if ( unlikely(unit_on_q(svc)) )
+ {
+ SCHED_STAT_CRANK(unit_wake_onrunq);
+ return;
+ }
+
+ if ( likely(unit_runnable(unit)) )
+ SCHED_STAT_CRANK(unit_wake_runnable);
+ else
+ SCHED_STAT_CRANK(unit_wake_not_runnable);
+
+ /*
+ * If a deadline passed while svc was asleep/blocked, we need new
+ * scheduling parameters (a new deadline and full budget).
+ */
+ now = NOW();
+
+ missed = ( now >= svc->cur_deadline );
+ if ( missed )
+ rt_update_deadline(now, svc);
+
+ /*
+ * If context hasn't been saved for this unit yet, we can't put it on
+ * the run-queue/depleted-queue. Instead, we set the appropriate flag,
+ * the unit will be put back on queue after the context has been saved
+ * (in rt_context_save()).
+ */
+ if ( unlikely(svc->flags & RTDS_scheduled) )
+ {
+ __set_bit(__RTDS_delayed_runq_add, &svc->flags);
+ /*
+ * The unit is waking up already, and we didn't even have the time to
+ * remove its next replenishment event from the replenishment queue
+ * when it blocked! No big deal. If we did not miss the deadline in
+ * the meantime, let's just leave it there. If we did, let's remove it
+ * and queue a new one (to occur at our new deadline).
+ */
+ if ( missed )
+ replq_reinsert(ops, svc);
+ return;
+ }
+
+ /* Replenishment event got cancelled when we blocked. Add it back. */
+ replq_insert(ops, svc);
+ /* insert svc to runq/depletedq because svc is not in queue now */
+ runq_insert(ops, svc);
+
+ runq_tickle(ops, svc);
+}
+
+/*
+ * scurr has finished context switch, insert it back to the RunQ,
+ * and then pick the highest priority unit from runq to run
+ */
+static void
+rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
+{
+ struct rt_unit *svc = rt_unit(unit);
+ spinlock_t *lock = unit_schedule_lock_irq(unit);
+
+ __clear_bit(__RTDS_scheduled, &svc->flags);
+ /* do not insert the idle unit into the runq */
+ if ( is_idle_unit(unit) )
+ goto out;
+
+ if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
+ likely(unit_runnable(unit)) )
+ {
+ runq_insert(ops, svc);
+ runq_tickle(ops, svc);
+ }
+ else
+ replq_remove(ops, svc);
+
+out:
+ unit_schedule_unlock_irq(lock, unit);
+}
+
+/*
+ * set/get each unit info of each domain
+ */
+static int
+rt_dom_cntl(
+ const struct scheduler *ops,
+ struct domain *d,
+ struct xen_domctl_scheduler_op *op)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct rt_unit *svc;
+ struct sched_unit *unit;
+ unsigned long flags;
+ int rc = 0;
+ struct xen_domctl_schedparam_vcpu local_sched;
+ s_time_t period, budget;
+ uint32_t index = 0;
+
+ switch ( op->cmd )
+ {
+ case XEN_DOMCTL_SCHEDOP_getinfo:
+ /* Return the default parameters. */
+ op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
+ op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
+ break;
+ case XEN_DOMCTL_SCHEDOP_putinfo:
+ if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
+ {
+ rc = -EINVAL;
+ break;
+ }
+ spin_lock_irqsave(&prv->lock, flags);
+ for_each_sched_unit ( d, unit )
+ {
+ svc = rt_unit(unit);
+ svc->period = MICROSECS(op->u.rtds.period); /* convert to nanoseconds */
+ svc->budget = MICROSECS(op->u.rtds.budget);
+ }
+ spin_unlock_irqrestore(&prv->lock, flags);
+ break;
+ case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
+ case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
+ while ( index < op->u.v.nr_vcpus )
+ {
+ if ( copy_from_guest_offset(&local_sched,
+ op->u.v.vcpus, index, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+ if ( local_sched.vcpuid >= d->max_vcpus ||
+ d->vcpu[local_sched.vcpuid] == NULL )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
+ {
+ spin_lock_irqsave(&prv->lock, flags);
+ svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
+ local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
+ local_sched.u.rtds.period = svc->period / MICROSECS(1);
+ if ( has_extratime(svc) )
+ local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
+ else
+ local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ if ( copy_to_guest_offset(op->u.v.vcpus, index,
+ &local_sched, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+ }
+ else
+ {
+ period = MICROSECS(local_sched.u.rtds.period);
+ budget = MICROSECS(local_sched.u.rtds.budget);
+ if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
+ budget > period || period < RTDS_MIN_PERIOD )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ spin_lock_irqsave(&prv->lock, flags);
+ svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
+ svc->period = period;
+ svc->budget = budget;
+ if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
+ __set_bit(__RTDS_extratime, &svc->flags);
+ else
+ __clear_bit(__RTDS_extratime, &svc->flags);
+ spin_unlock_irqrestore(&prv->lock, flags);
+ }
+ /* Process at most 64 vCPUs without checking for preemption. */
+ if ( (++index > 63) && hypercall_preempt_check() )
+ break;
+ }
+ if ( !rc )
+ /* Notify the caller how many vCPUs have been processed. */
+ op->u.v.nr_vcpus = index;
+ break;
+ }
+
+ return rc;
+}
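The putvcpuinfo path above validates each vCPU's period and budget (both in microseconds) against RTDS_MIN_PERIOD, RTDS_MAX_PERIOD and RTDS_MIN_BUDGET, and requires budget <= period. A caller can apply the same sanity check before issuing the domctl; the sketch below mirrors that logic, but the TOY_* bounds are placeholders, not the hypervisor's actual limits.

/* Caller-side pre-check mirroring the hypervisor's validation above.
 * The MIN/MAX values are illustrative placeholders only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_MIN_PERIOD_US 100         /* placeholder */
#define TOY_MAX_PERIOD_US 10000000    /* placeholder */
#define TOY_MIN_BUDGET_US 100         /* placeholder */

static bool rtds_params_ok(uint64_t period_us, uint64_t budget_us)
{
    return period_us >= TOY_MIN_PERIOD_US &&
           period_us <= TOY_MAX_PERIOD_US &&
           budget_us >= TOY_MIN_BUDGET_US &&
           budget_us <= period_us;      /* utilization <= 100% */
}

int main(void)
{
    /* 40% utilization: 4ms of budget every 10ms. */
    printf("10000/4000 valid: %d\n", rtds_params_ok(10000, 4000));
    /* Invalid: budget larger than period. */
    printf("10000/12000 valid: %d\n", rtds_params_ok(10000, 12000));
    return 0;
}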
+
+/*
+ * The replenishment timer handler picks units
+ * from the replq and does the actual replenishment.
+ */
+static void repl_timer_handler(void *data)
+{
+ s_time_t now;
+ struct scheduler *ops = data;
+ struct rt_private *prv = rt_priv(ops);
+ struct list_head *replq = rt_replq(ops);
+ struct list_head *runq = rt_runq(ops);
+ struct list_head *iter, *tmp;
+ struct rt_unit *svc;
+ LIST_HEAD(tmp_replq);
+
+ spin_lock_irq(&prv->lock);
+
+ now = NOW();
+
+ /*
+ * Do the replenishment and move replenished units
+ * to the temporary list to tickle.
+ * If svc is on run queue, we need to put it at
+ * the correct place since its deadline changes.
+ */
+ list_for_each_safe ( iter, tmp, replq )
+ {
+ svc = replq_elem(iter);
+
+ if ( now < svc->cur_deadline )
+ break;
+
+ list_del(&svc->replq_elem);
+ rt_update_deadline(now, svc);
+ list_add(&svc->replq_elem, &tmp_replq);
+
+ if ( unit_on_q(svc) )
+ {
+ q_remove(svc);
+ runq_insert(ops, svc);
+ }
+ }
+
+ /*
+ * Iterate through the list of updated units.
+ * If an updated unit is running, tickle the head of the
+ * runqueue if it has a higher priority.
+ * If an updated unit was depleted and on the runqueue, tickle it.
+ * Finally, reinsert the units back into the replenishment events list.
+ */
+ list_for_each_safe ( iter, tmp, &tmp_replq )
+ {
+ svc = replq_elem(iter);
+
+ if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
+ !list_empty(runq) )
+ {
+ struct rt_unit *next_on_runq = q_elem(runq->next);
+
+ if ( compare_unit_priority(svc, next_on_runq) < 0 )
+ runq_tickle(ops, next_on_runq);
+ }
+ else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
+ unit_on_q(svc) )
+ runq_tickle(ops, svc);
+
+ list_del(&svc->replq_elem);
+ deadline_replq_insert(svc, &svc->replq_elem, replq);
+ }
+
+ /*
+ * If there are units left in the replenishment event list,
+ * set the next replenishment to happen at the deadline of
+ * the one in the front.
+ */
+ if ( !list_empty(replq) )
+ set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);
+
+ spin_unlock_irq(&prv->lock);
+}
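The handler above works because the replenishment queue is kept ordered by cur_deadline: it can stop at the first not-yet-due element and re-arm the timer for whatever sits at the front. A standalone sketch of such a deadline-ordered insert follows; it is a simplified stand-in for deadline_replq_insert(), using a fixed-size array instead of Xen's list_head lists, and all names are invented.

/* Keep events sorted by deadline so only the queue head has to be
 * inspected when (re)arming the replenishment timer. */
#include <stdint.h>
#include <stdio.h>

typedef int64_t s_time_t;

struct toy_repl_q {
    s_time_t deadline[8];
    unsigned int nr;
};

static void toy_replq_insert(struct toy_repl_q *q, s_time_t d)
{
    unsigned int i = q->nr;

    /* Shift later deadlines right until the slot for d is found. */
    while ( i > 0 && q->deadline[i - 1] > d )
    {
        q->deadline[i] = q->deadline[i - 1];
        i--;
    }
    q->deadline[i] = d;
    q->nr++;
}

int main(void)
{
    struct toy_repl_q q = { .nr = 0 };

    toy_replq_insert(&q, 9000000);
    toy_replq_insert(&q, 3000000);
    toy_replq_insert(&q, 6000000);

    /* The timer would be set to the earliest deadline, at the front. */
    printf("next replenishment at %lld ns\n", (long long)q.deadline[0]);
    return 0;
}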
+
+static const struct scheduler sched_rtds_def = {
+ .name = "SMP RTDS Scheduler",
+ .opt_name = "rtds",
+ .sched_id = XEN_SCHEDULER_RTDS,
+ .sched_data = NULL,
+
+ .dump_cpu_state = rt_dump_pcpu,
+ .dump_settings = rt_dump,
+ .init = rt_init,
+ .deinit = rt_deinit,
+ .init_pdata = rt_init_pdata,
+ .switch_sched = rt_switch_sched,
+ .deinit_pdata = rt_deinit_pdata,
+ .alloc_domdata = rt_alloc_domdata,
+ .free_domdata = rt_free_domdata,
+ .alloc_udata = rt_alloc_udata,
+ .free_udata = rt_free_udata,
+ .insert_unit = rt_unit_insert,
+ .remove_unit = rt_unit_remove,
+
+ .adjust = rt_dom_cntl,
+
+ .pick_resource = rt_res_pick,
+ .do_schedule = rt_schedule,
+ .sleep = rt_unit_sleep,
+ .wake = rt_unit_wake,
+ .context_saved = rt_context_saved,
+};
+
+REGISTER_SCHEDULER(sched_rtds_def);
+++ /dev/null
-/******************************************************************************
- * sched_arinc653.c
- *
- * An ARINC653-compatible scheduling algorithm for use in Xen.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Copyright (c) 2010, DornerWorks, Ltd. <DornerWorks.com>
- */
-
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/timer.h>
-#include <xen/softirq.h>
-#include <xen/time.h>
-#include <xen/errno.h>
-#include <xen/list.h>
-#include <xen/guest_access.h>
-#include <public/sysctl.h>
-
-/**************************************************************************
- * Private Macros *
- **************************************************************************/
-
-/**
- * Default timeslice for domain 0.
- */
-#define DEFAULT_TIMESLICE MILLISECS(10)
-
-/**
- * Retrieve the idle UNIT for a given physical CPU
- */
-#define IDLETASK(cpu) (sched_idle_unit(cpu))
-
-/**
- * Return a pointer to the ARINC 653-specific scheduler data information
- * associated with the given UNIT (unit)
- */
-#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv)
-
-/**
- * Return the global scheduler private data given the scheduler ops pointer
- */
-#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data))
-
-/**************************************************************************
- * Private Type Definitions *
- **************************************************************************/
-
-/**
- * The arinc653_unit_t structure holds ARINC 653-scheduler-specific
- * information for all non-idle UNITs
- */
-typedef struct arinc653_unit_s
-{
- /* unit points to Xen's struct sched_unit so we can get to it from an
- * arinc653_unit_t pointer. */
- struct sched_unit * unit;
- /* awake holds whether the UNIT has been woken with vcpu_wake() */
- bool_t awake;
- /* list holds the linked list information for the list this UNIT
- * is stored in */
- struct list_head list;
-} arinc653_unit_t;
-
-/**
- * The sched_entry_t structure holds a single entry of the
- * ARINC 653 schedule.
- */
-typedef struct sched_entry_s
-{
- /* dom_handle holds the handle ("UUID") for the domain that this
- * schedule entry refers to. */
- xen_domain_handle_t dom_handle;
- /* unit_id holds the UNIT number for the UNIT that this schedule
- * entry refers to. */
- int unit_id;
- /* runtime holds the number of nanoseconds that the UNIT for this
- * schedule entry should be allowed to run per major frame. */
- s_time_t runtime;
- /* unit holds a pointer to the Xen sched_unit structure */
- struct sched_unit * unit;
-} sched_entry_t;
-
-/**
- * This structure defines data that is global to an instance of the scheduler
- */
-typedef struct a653sched_priv_s
-{
- /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
- spinlock_t lock;
-
- /**
- * This array holds the active ARINC 653 schedule.
- *
- * When the system tries to start a new UNIT, this schedule is scanned
- * to look for a matching (handle, UNIT #) pair. If both the handle (UUID)
- * and UNIT number match, then the UNIT is allowed to run. Its run time
- * (per major frame) is given in the third entry of the schedule.
- */
- sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
-
- /**
- * This variable holds the number of entries that are valid in
- * the arinc653_schedule table.
- *
- * This is not necessarily the same as the number of domains in the
- * schedule. A domain could be listed multiple times within the schedule,
- * or a domain with multiple UNITs could have a different
- * schedule entry for each UNIT.
- */
- unsigned int num_schedule_entries;
-
- /**
- * the major frame time for the ARINC 653 schedule.
- */
- s_time_t major_frame;
-
- /**
- * the time that the next major frame starts
- */
- s_time_t next_major_frame;
-
- /**
- * pointers to all Xen UNIT structures for iterating through
- */
- struct list_head unit_list;
-} a653sched_priv_t;
-
-/**************************************************************************
- * Helper functions *
- **************************************************************************/
-
-/**
- * This function compares two domain handles.
- *
- * @param h1 Pointer to handle 1
- * @param h2 Pointer to handle 2
- *
- * @return <ul>
- * <li> <0: handle 1 is less than handle 2
- * <li> 0: handle 1 is equal to handle 2
- * <li> >0: handle 1 is greater than handle 2
- * </ul>
- */
-static int dom_handle_cmp(const xen_domain_handle_t h1,
- const xen_domain_handle_t h2)
-{
- return memcmp(h1, h2, sizeof(xen_domain_handle_t));
-}
-
-/**
- * This function searches the unit list to find a UNIT that matches
- * the domain handle and UNIT ID specified.
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param handle Pointer to handler
- * @param unit_id UNIT ID
- *
- * @return <ul>
- * <li> Pointer to the matching UNIT if one is found
- * <li> NULL otherwise
- * </ul>
- */
-static struct sched_unit *find_unit(
- const struct scheduler *ops,
- xen_domain_handle_t handle,
- int unit_id)
-{
- arinc653_unit_t *aunit;
-
- /* loop through the unit_list looking for the specified UNIT */
- list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list )
- if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0)
- && (unit_id == aunit->unit->unit_id) )
- return aunit->unit;
-
- return NULL;
-}
-
-/**
- * This function updates the pointer to the Xen UNIT structure for each entry
- * in the ARINC 653 schedule.
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @return <None>
- */
-static void update_schedule_units(const struct scheduler *ops)
-{
- unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries;
-
- for ( i = 0; i < n_entries; i++ )
- SCHED_PRIV(ops)->schedule[i].unit =
- find_unit(ops,
- SCHED_PRIV(ops)->schedule[i].dom_handle,
- SCHED_PRIV(ops)->schedule[i].unit_id);
-}
-
-/**
- * This function is called by the adjust_global scheduler hook to put
- * in place a new ARINC653 schedule.
- *
- * @param ops Pointer to this instance of the scheduler structure
- *
- * @return <ul>
- * <li> 0 = success
- * <li> !0 = error
- * </ul>
- */
-static int
-arinc653_sched_set(
- const struct scheduler *ops,
- struct xen_sysctl_arinc653_schedule *schedule)
-{
- a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
- s_time_t total_runtime = 0;
- unsigned int i;
- unsigned long flags;
- int rc = -EINVAL;
-
- spin_lock_irqsave(&sched_priv->lock, flags);
-
- /* Check for valid major frame and number of schedule entries. */
- if ( (schedule->major_frame <= 0)
- || (schedule->num_sched_entries < 1)
- || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) )
- goto fail;
-
- for ( i = 0; i < schedule->num_sched_entries; i++ )
- {
- /* Check for a valid run time. */
- if ( schedule->sched_entries[i].runtime <= 0 )
- goto fail;
-
- /* Add this entry's run time to total run time. */
- total_runtime += schedule->sched_entries[i].runtime;
- }
-
- /*
- * Error if the major frame is not large enough to run all entries as
- * indicated by comparing the total run time to the major frame length.
- */
- if ( total_runtime > schedule->major_frame )
- goto fail;
-
- /* Copy the new schedule into place. */
- sched_priv->num_schedule_entries = schedule->num_sched_entries;
- sched_priv->major_frame = schedule->major_frame;
- for ( i = 0; i < schedule->num_sched_entries; i++ )
- {
- memcpy(sched_priv->schedule[i].dom_handle,
- schedule->sched_entries[i].dom_handle,
- sizeof(sched_priv->schedule[i].dom_handle));
- sched_priv->schedule[i].unit_id =
- schedule->sched_entries[i].vcpu_id;
- sched_priv->schedule[i].runtime =
- schedule->sched_entries[i].runtime;
- }
- update_schedule_units(ops);
-
- /*
- * The newly-installed schedule takes effect immediately. We do not even
- * wait for the current major frame to expire.
- *
- * Signal a new major frame to begin. The next major frame is set up by
- * the do_schedule callback function when it is next invoked.
- */
- sched_priv->next_major_frame = NOW();
-
- rc = 0;
-
- fail:
- spin_unlock_irqrestore(&sched_priv->lock, flags);
- return rc;
-}
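arinc653_sched_set() only accepts a schedule whose per-entry runtimes sum to no more than the major frame. The sketch below shows the same check applied on the toolstack side before submitting a schedule; the field names (num_sched_entries, major_frame, sched_entries[].runtime) follow the sysctl structure used above, but the toy_* types, the entry limit and the example values are illustrative stand-ins, not the real xen_sysctl_arinc653_schedule.

/* Toolstack-side sanity check mirroring the validation above.
 * Times are in nanoseconds; values are made up for illustration. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_ENTRIES 64

struct toy_a653_sched {
    int64_t major_frame;
    unsigned int num_sched_entries;
    struct { int unit_id; int64_t runtime; } sched_entries[TOY_MAX_ENTRIES];
};

static bool toy_a653_sched_valid(const struct toy_a653_sched *s)
{
    int64_t total = 0;
    unsigned int i;

    if ( s->major_frame <= 0 || s->num_sched_entries < 1 ||
         s->num_sched_entries > TOY_MAX_ENTRIES )
        return false;

    for ( i = 0; i < s->num_sched_entries; i++ )
    {
        if ( s->sched_entries[i].runtime <= 0 )
            return false;
        total += s->sched_entries[i].runtime;
    }

    /* All minor frames must fit inside the major frame. */
    return total <= s->major_frame;
}

int main(void)
{
    /* 30ms major frame split into three 10ms minor frames. */
    struct toy_a653_sched s = {
        .major_frame = 30000000,
        .num_sched_entries = 3,
        .sched_entries = {
            { .unit_id = 0, .runtime = 10000000 },
            { .unit_id = 1, .runtime = 10000000 },
            { .unit_id = 2, .runtime = 10000000 },
        },
    };

    printf("schedule valid: %d\n", toy_a653_sched_valid(&s));
    return 0;
}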
-
-/**
- * This function is called by the adjust_global scheduler hook to read the
- * current ARINC 653 schedule
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @return <ul>
- * <li> 0 = success
- * <li> !0 = error
- * </ul>
- */
-static int
-arinc653_sched_get(
- const struct scheduler *ops,
- struct xen_sysctl_arinc653_schedule *schedule)
-{
- a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
- unsigned int i;
- unsigned long flags;
-
- spin_lock_irqsave(&sched_priv->lock, flags);
-
- schedule->num_sched_entries = sched_priv->num_schedule_entries;
- schedule->major_frame = sched_priv->major_frame;
- for ( i = 0; i < sched_priv->num_schedule_entries; i++ )
- {
- memcpy(schedule->sched_entries[i].dom_handle,
- sched_priv->schedule[i].dom_handle,
- sizeof(sched_priv->schedule[i].dom_handle));
- schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id;
- schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime;
- }
-
- spin_unlock_irqrestore(&sched_priv->lock, flags);
-
- return 0;
-}
-
-/**************************************************************************
- * Scheduler callback functions *
- **************************************************************************/
-
-/**
- * This function performs initialization for an instance of the scheduler.
- *
- * @param ops Pointer to this instance of the scheduler structure
- *
- * @return <ul>
- * <li> 0 = success
- * <li> !0 = error
- * </ul>
- */
-static int
-a653sched_init(struct scheduler *ops)
-{
- a653sched_priv_t *prv;
-
- prv = xzalloc(a653sched_priv_t);
- if ( prv == NULL )
- return -ENOMEM;
-
- ops->sched_data = prv;
-
- prv->next_major_frame = 0;
- spin_lock_init(&prv->lock);
- INIT_LIST_HEAD(&prv->unit_list);
-
- return 0;
-}
-
-/**
- * This function performs deinitialization for an instance of the scheduler
- *
- * @param ops Pointer to this instance of the scheduler structure
- */
-static void
-a653sched_deinit(struct scheduler *ops)
-{
- xfree(SCHED_PRIV(ops));
- ops->sched_data = NULL;
-}
-
-/**
- * This function allocates scheduler-specific data for a UNIT
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param unit Pointer to struct sched_unit
- *
- * @return Pointer to the allocated data
- */
-static void *
-a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
- void *dd)
-{
- a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
- arinc653_unit_t *svc;
- unsigned int entry;
- unsigned long flags;
-
- /*
- * Allocate memory for the ARINC 653-specific scheduler data information
- * associated with the given UNIT (unit).
- */
- svc = xmalloc(arinc653_unit_t);
- if ( svc == NULL )
- return NULL;
-
- spin_lock_irqsave(&sched_priv->lock, flags);
-
- /*
- * Add every one of dom0's units to the schedule, as long as there are
- * slots available.
- */
- if ( unit->domain->domain_id == 0 )
- {
- entry = sched_priv->num_schedule_entries;
-
- if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE )
- {
- sched_priv->schedule[entry].dom_handle[0] = '\0';
- sched_priv->schedule[entry].unit_id = unit->unit_id;
- sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE;
- sched_priv->schedule[entry].unit = unit;
-
- sched_priv->major_frame += DEFAULT_TIMESLICE;
- ++sched_priv->num_schedule_entries;
- }
- }
-
- /*
- * Initialize our ARINC 653 scheduler-specific information for the UNIT.
- * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it
- * will call the vcpu_wake scheduler callback function and our scheduler
- * will mark the UNIT awake.
- */
- svc->unit = unit;
- svc->awake = 0;
- if ( !is_idle_unit(unit) )
- list_add(&svc->list, &SCHED_PRIV(ops)->unit_list);
- update_schedule_units(ops);
-
- spin_unlock_irqrestore(&sched_priv->lock, flags);
-
- return svc;
-}
-
-/**
- * This function frees scheduler-specific UNIT data
- *
- * @param ops Pointer to this instance of the scheduler structure
- */
-static void
-a653sched_free_udata(const struct scheduler *ops, void *priv)
-{
- a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
- arinc653_unit_t *av = priv;
- unsigned long flags;
-
- if (av == NULL)
- return;
-
- spin_lock_irqsave(&sched_priv->lock, flags);
-
- if ( !is_idle_unit(av->unit) )
- list_del(&av->list);
-
- xfree(av);
- update_schedule_units(ops);
-
- spin_unlock_irqrestore(&sched_priv->lock, flags);
-}
-
-/**
- * Xen scheduler callback function to sleep a UNIT
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param unit Pointer to struct sched_unit
- */
-static void
-a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
-{
- if ( AUNIT(unit) != NULL )
- AUNIT(unit)->awake = 0;
-
- /*
- * If the UNIT being put to sleep is the same one that is currently
- * running, raise a softirq to invoke the scheduler to switch domains.
- */
- if ( get_sched_res(sched_unit_master(unit))->curr == unit )
- cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
-}
-
-/**
- * Xen scheduler callback function to wake up a UNIT
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param unit Pointer to struct sched_unit
- */
-static void
-a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
-{
- if ( AUNIT(unit) != NULL )
- AUNIT(unit)->awake = 1;
-
- cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
-}
-
-/**
- * Xen scheduler callback function to select a UNIT to run.
- * This is the main scheduler routine.
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param now Current time
- */
-static void
-a653sched_do_schedule(
- const struct scheduler *ops,
- struct sched_unit *prev,
- s_time_t now,
- bool tasklet_work_scheduled)
-{
- struct sched_unit *new_task = NULL;
- static unsigned int sched_index = 0;
- static s_time_t next_switch_time;
- a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
- const unsigned int cpu = sched_get_resource_cpu(smp_processor_id());
- unsigned long flags;
-
- spin_lock_irqsave(&sched_priv->lock, flags);
-
- if ( sched_priv->num_schedule_entries < 1 )
- sched_priv->next_major_frame = now + DEFAULT_TIMESLICE;
- else if ( now >= sched_priv->next_major_frame )
- {
- /* time to enter a new major frame
- * the first time this function is called, this will be true */
- /* start with the first domain in the schedule */
- sched_index = 0;
- sched_priv->next_major_frame = now + sched_priv->major_frame;
- next_switch_time = now + sched_priv->schedule[0].runtime;
- }
- else
- {
- while ( (now >= next_switch_time)
- && (sched_index < sched_priv->num_schedule_entries) )
- {
- /* time to switch to the next domain in this major frame */
- sched_index++;
- next_switch_time += sched_priv->schedule[sched_index].runtime;
- }
- }
-
- /*
- * If we exhausted the domains in the schedule and still have time left
- * in the major frame then switch next at the next major frame.
- */
- if ( sched_index >= sched_priv->num_schedule_entries )
- next_switch_time = sched_priv->next_major_frame;
-
- /*
- * If there are more domains to run in the current major frame, set
- * new_task equal to the address of next domain's sched_unit structure.
- * Otherwise, set new_task equal to the address of the idle task's
- * sched_unit structure.
- */
- new_task = (sched_index < sched_priv->num_schedule_entries)
- ? sched_priv->schedule[sched_index].unit
- : IDLETASK(cpu);
-
- /* Check to see if the new task can be run (awake & runnable). */
- if ( !((new_task != NULL)
- && (AUNIT(new_task) != NULL)
- && AUNIT(new_task)->awake
- && unit_runnable_state(new_task)) )
- new_task = IDLETASK(cpu);
- BUG_ON(new_task == NULL);
-
- /*
- * Check to make sure we did not miss a major frame.
- * This is a good test for robust partitioning.
- */
- BUG_ON(now >= sched_priv->next_major_frame);
-
- spin_unlock_irqrestore(&sched_priv->lock, flags);
-
- /* Tasklet work (which runs in idle UNIT context) overrides all else. */
- if ( tasklet_work_scheduled )
- new_task = IDLETASK(cpu);
-
- /* Running this task would result in a migration */
- if ( !is_idle_unit(new_task)
- && (sched_unit_master(new_task) != cpu) )
- new_task = IDLETASK(cpu);
-
- /*
- * Return the amount of time the next domain has to run and the address
- * of the selected task's UNIT structure.
- */
- prev->next_time = next_switch_time - now;
- prev->next_task = new_task;
- new_task->migrated = false;
-
- BUG_ON(prev->next_time <= 0);
-}
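To make the minor-frame bookkeeping above concrete: with three 10ms entries in a 30ms major frame, a do_schedule invocation 14ms into the major frame lands in entry 1 and the next switch is due at the 20ms mark. The standalone sketch below walks the same runtime-accumulation logic in isolation (relative offsets instead of the absolute times and static state used by a653sched_do_schedule(); all names invented).

/* Stand-alone walk of the minor-frame lookup: accumulate runtimes
 * until 'offset' (time since the start of the major frame) is covered. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int64_t runtime[] = { 10000000, 10000000, 10000000 }; /* 3 x 10ms */
    const unsigned int entries = 3;
    const int64_t offset = 14000000;   /* 14ms into the major frame */

    int64_t next_switch = 0;
    unsigned int idx = 0;

    while ( idx < entries && offset >= next_switch + runtime[idx] )
        next_switch += runtime[idx++];
    if ( idx < entries )
        next_switch += runtime[idx];

    /* Expect entry 1, switching at the 20ms mark. */
    printf("entry %u, next switch at %lld ns\n", idx, (long long)next_switch);
    return 0;
}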
-
-/**
- * Xen scheduler callback function to select a resource for the UNIT to run on
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param unit Pointer to struct sched_unit
- *
- * @return Scheduler resource to run on
- */
-static struct sched_resource *
-a653sched_pick_resource(const struct scheduler *ops,
- const struct sched_unit *unit)
-{
- cpumask_t *online;
- unsigned int cpu;
-
- /*
- * If present, prefer unit's current processor, else
- * just find the first valid unit.
- */
- online = cpupool_domain_master_cpumask(unit->domain);
-
- cpu = cpumask_first(online);
-
- if ( cpumask_test_cpu(sched_unit_master(unit), online)
- || (cpu >= nr_cpu_ids) )
- cpu = sched_unit_master(unit);
-
- return get_sched_res(cpu);
-}
-
-/**
- * Xen scheduler callback to change the scheduler of a cpu
- *
- * @param new_ops Pointer to this instance of the scheduler structure
- * @param cpu The cpu that is changing scheduler
- * @param pdata scheduler specific PCPU data (we don't have any)
- * @param vdata scheduler specific UNIT data of the idle unit
- */
-static spinlock_t *
-a653_switch_sched(struct scheduler *new_ops, unsigned int cpu,
- void *pdata, void *vdata)
-{
- struct sched_resource *sr = get_sched_res(cpu);
- arinc653_unit_t *svc = vdata;
-
- ASSERT(!pdata && svc && is_idle_unit(svc->unit));
-
- sched_idle_unit(cpu)->priv = vdata;
-
- return &sr->_lock;
-}
-
-/**
- * Xen scheduler callback function to perform a global (not domain-specific)
- * adjustment. It is used by the ARINC 653 scheduler to put in place a new
- * ARINC 653 schedule or to retrieve the schedule currently in place.
- *
- * @param ops Pointer to this instance of the scheduler structure
- * @param sc Pointer to the scheduler operation specified by Domain 0
- */
-static int
-a653sched_adjust_global(const struct scheduler *ops,
- struct xen_sysctl_scheduler_op *sc)
-{
- struct xen_sysctl_arinc653_schedule local_sched;
- int rc = -EINVAL;
-
- switch ( sc->cmd )
- {
- case XEN_SYSCTL_SCHEDOP_putinfo:
- if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) )
- {
- rc = -EFAULT;
- break;
- }
-
- rc = arinc653_sched_set(ops, &local_sched);
- break;
- case XEN_SYSCTL_SCHEDOP_getinfo:
- memset(&local_sched, -1, sizeof(local_sched));
- rc = arinc653_sched_get(ops, &local_sched);
- if ( rc )
- break;
-
- if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) )
- rc = -EFAULT;
- break;
- }
-
- return rc;
-}
-
-/**
- * This structure defines our scheduler for Xen.
- * The entries tell Xen where to find our scheduler-specific
- * callback functions.
- * The symbol must be visible to the rest of Xen at link time.
- */
-static const struct scheduler sched_arinc653_def = {
- .name = "ARINC 653 Scheduler",
- .opt_name = "arinc653",
- .sched_id = XEN_SCHEDULER_ARINC653,
- .sched_data = NULL,
-
- .init = a653sched_init,
- .deinit = a653sched_deinit,
-
- .free_udata = a653sched_free_udata,
- .alloc_udata = a653sched_alloc_udata,
-
- .insert_unit = NULL,
- .remove_unit = NULL,
-
- .sleep = a653sched_unit_sleep,
- .wake = a653sched_unit_wake,
- .yield = NULL,
- .context_saved = NULL,
-
- .do_schedule = a653sched_do_schedule,
-
- .pick_resource = a653sched_pick_resource,
-
- .switch_sched = a653_switch_sched,
-
- .adjust = NULL,
- .adjust_global = a653sched_adjust_global,
-
- .dump_settings = NULL,
- .dump_cpu_state = NULL,
-};
-
-REGISTER_SCHEDULER(sched_arinc653_def);
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/****************************************************************************
- * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
- ****************************************************************************
- *
- * File: common/csched_credit.c
- * Author: Emmanuel Ackaouy
- *
- * Description: Credit-based SMP CPU scheduler
- */
-
-#include <xen/init.h>
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/domain.h>
-#include <xen/delay.h>
-#include <xen/event.h>
-#include <xen/time.h>
-#include <xen/sched-if.h>
-#include <xen/softirq.h>
-#include <asm/atomic.h>
-#include <asm/div64.h>
-#include <xen/errno.h>
-#include <xen/keyhandler.h>
-#include <xen/trace.h>
-#include <xen/err.h>
-
-
-/*
- * Locking:
- * - Scheduler-lock (a.k.a. runqueue lock):
- * + is per-runqueue, and there is one runqueue per-cpu;
- * + serializes all runqueue manipulation operations;
- * - Private data lock (a.k.a. private scheduler lock):
- * + serializes accesses to the scheduler global state (weight,
- * credit, balance_credit, etc);
- * + serializes updates to the domains' scheduling parameters.
- *
- * Ordering is "private lock always comes first":
- * + if we need both locks, we must acquire the private
- * scheduler lock first;
- * + if we already own a runqueue lock, we must never acquire
- * the private scheduler lock.
- */
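The rule documented above ("private lock always comes first") is the standard deadlock-avoidance discipline: fix one acquisition order and never take the outer lock while already holding the inner one. A tiny generic illustration follows; it is not Xen code, just pthread mutexes standing in for the private and runqueue locks.

/* Generic illustration of the lock ordering rule documented above:
 * the private (outer) lock is always taken before a runqueue (inner)
 * lock, never the other way around. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t priv_lock = PTHREAD_MUTEX_INITIALIZER;  /* outer */
static pthread_mutex_t runq_lock = PTHREAD_MUTEX_INITIALIZER;  /* inner */
static int global_weight, runq_len;

static void adjust_weight_and_queue(int weight_delta, int queue_delta)
{
    /* Correct order: private lock first, then the runqueue lock. */
    pthread_mutex_lock(&priv_lock);
    global_weight += weight_delta;

    pthread_mutex_lock(&runq_lock);
    runq_len += queue_delta;
    pthread_mutex_unlock(&runq_lock);

    pthread_mutex_unlock(&priv_lock);
}

int main(void)
{
    adjust_weight_and_queue(256, 1);
    printf("weight=%d runq_len=%d\n", global_weight, runq_len);
    return 0;
}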
-
-/*
- * Basic constants
- */
-#define CSCHED_DEFAULT_WEIGHT 256
-#define CSCHED_TICKS_PER_TSLICE 3
-/* Default timeslice: 30ms */
-#define CSCHED_DEFAULT_TSLICE_MS 30
-#define CSCHED_CREDITS_PER_MSEC 10
-/* Never set a timer shorter than this value. */
-#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN
-
-
-/*
- * Priorities
- */
-#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */
-#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */
-#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */
-#define CSCHED_PRI_IDLE -64 /* idle */
-
-
-/*
- * Flags
- *
- * Note that svc->flags (where these flags live) is protected by an
- * inconsistent set of locks. Therefore atomic-safe bit operations must
- * be used for accessing it.
- */
-#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */
-#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */
-#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */
-#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */
-
-
-/*
- * Useful macros
- */
-#define CSCHED_PRIV(_ops) \
- ((struct csched_private *)((_ops)->sched_data))
-#define CSCHED_PCPU(_c) \
- ((struct csched_pcpu *)get_sched_res(_c)->sched_priv)
-#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv)
-#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
-#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
-
-
-/*
- * CSCHED_STATS
- *
- * Manage very basic per-unit counters and stats.
- *
- * Useful for debugging live systems. The stats are displayed
- * with runq dumps ('r' on the Xen console).
- */
-#ifdef SCHED_STATS
-
-#define CSCHED_STATS
-
-#define SCHED_UNIT_STATS_RESET(_V) \
- do \
- { \
- memset(&(_V)->stats, 0, sizeof((_V)->stats)); \
- } while ( 0 )
-
-#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++)
-
-#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y))
-
-#else /* !SCHED_STATS */
-
-#undef CSCHED_STATS
-
-#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 )
-#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 )
-#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 )
-
-#endif /* SCHED_STATS */
-
-
-/*
- * Credit tracing events ("only" 512 available!). Check
- * include/public/trace.h for more details.
- */
-#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1)
-#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2)
-#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3)
-#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4)
-#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5)
-#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6)
-#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7)
-#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8)
-#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9)
-#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10)
-#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11)
-
-/*
- * Boot parameters
- */
-static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
-integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms);
-
-/*
- * Physical CPU
- */
-struct csched_pcpu {
- struct list_head runq;
- uint32_t runq_sort_last;
-
- unsigned int idle_bias;
- unsigned int nr_runnable;
-
- unsigned int tick;
- struct timer ticker;
-};
-
-/*
- * Virtual UNIT
- */
-struct csched_unit {
- struct list_head runq_elem;
- struct list_head active_unit_elem;
-
- /* Up-pointers */
- struct csched_dom *sdom;
- struct sched_unit *unit;
-
- s_time_t start_time; /* When we were scheduled (used for credit) */
- unsigned flags;
- int pri;
-
- atomic_t credit;
- unsigned int residual;
-
- s_time_t last_sched_time;
-
-#ifdef CSCHED_STATS
- struct {
- int credit_last;
- uint32_t credit_incr;
- uint32_t state_active;
- uint32_t state_idle;
- uint32_t migrate_q;
- uint32_t migrate_r;
- uint32_t kicked_away;
- } stats;
-#endif
-};
-
-/*
- * Domain
- */
-struct csched_dom {
- struct list_head active_unit;
- struct list_head active_sdom_elem;
- struct domain *dom;
- uint16_t active_unit_count;
- uint16_t weight;
- uint16_t cap;
-};
-
-/*
- * System-wide private data
- */
-struct csched_private {
- /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
- spinlock_t lock;
-
- cpumask_var_t idlers;
- cpumask_var_t cpus;
- uint32_t *balance_bias;
- uint32_t runq_sort;
- uint32_t ncpus;
-
- /* Period of master and tick in milliseconds */
- unsigned int tick_period_us, ticks_per_tslice;
- s_time_t ratelimit, tslice, unit_migr_delay;
-
- struct list_head active_sdom;
- uint32_t weight;
- uint32_t credit;
- int credit_balance;
- unsigned int credits_per_tslice;
-
- unsigned int master;
- struct timer master_ticker;
-};
-
-static void csched_tick(void *_cpu);
-static void csched_acct(void *dummy);
-
-static inline int
-__unit_on_runq(struct csched_unit *svc)
-{
- return !list_empty(&svc->runq_elem);
-}
-
-static inline struct csched_unit *
-__runq_elem(struct list_head *elem)
-{
- return list_entry(elem, struct csched_unit, runq_elem);
-}
-
-/* Is the first element of cpu's runq (if any) cpu's idle unit? */
-static inline bool_t is_runq_idle(unsigned int cpu)
-{
- /*
- * We're peeking at cpu's runq, we must hold the proper lock.
- */
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- return list_empty(RUNQ(cpu)) ||
- is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit);
-}
-
-static inline void
-inc_nr_runnable(unsigned int cpu)
-{
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
- CSCHED_PCPU(cpu)->nr_runnable++;
-
-}
-
-static inline void
-dec_nr_runnable(unsigned int cpu)
-{
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
- ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
- CSCHED_PCPU(cpu)->nr_runnable--;
-}
-
-static inline void
-__runq_insert(struct csched_unit *svc)
-{
- unsigned int cpu = sched_unit_master(svc->unit);
- const struct list_head * const runq = RUNQ(cpu);
- struct list_head *iter;
-
- BUG_ON( __unit_on_runq(svc) );
-
- list_for_each( iter, runq )
- {
- const struct csched_unit * const iter_svc = __runq_elem(iter);
- if ( svc->pri > iter_svc->pri )
- break;
- }
-
- /* If the unit yielded, try to put it behind one lower-priority
- * runnable unit if we can. The next runq_sort will bring it forward
- * within 30ms if the queue is too long. */
- if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags)
- && __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
- {
- iter=iter->next;
-
- /* Some sanity checks */
- BUG_ON(iter == runq);
- }
-
- list_add_tail(&svc->runq_elem, iter);
-}
-
-static inline void
-runq_insert(struct csched_unit *svc)
-{
- __runq_insert(svc);
- inc_nr_runnable(sched_unit_master(svc->unit));
-}
-
-static inline void
-__runq_remove(struct csched_unit *svc)
-{
- BUG_ON( !__unit_on_runq(svc) );
- list_del_init(&svc->runq_elem);
-}
-
-static inline void
-runq_remove(struct csched_unit *svc)
-{
- dec_nr_runnable(sched_unit_master(svc->unit));
- __runq_remove(svc);
-}
-
-static void burn_credits(struct csched_unit *svc, s_time_t now)
-{
- s_time_t delta;
- uint64_t val;
- unsigned int credits;
-
- /* Assert svc is current */
- ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) );
-
- if ( (delta = now - svc->start_time) <= 0 )
- return;
-
- val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual;
- svc->residual = do_div(val, MILLISECS(1));
- credits = val;
- ASSERT(credits == val); /* make sure we haven't truncated val */
- atomic_sub(credits, &svc->credit);
- svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
-}
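burn_credits() converts elapsed runtime into credits at CSCHED_CREDITS_PER_MSEC and keeps the sub-millisecond remainder in svc->residual, so rounding never loses time across calls. A worked standalone example of the same arithmetic (plain division instead of do_div; no Xen types):

/* Same arithmetic as burn_credits(): credits are consumed at
 * CREDITS_PER_MSEC, with the remainder carried in 'residual'. */
#include <stdint.h>
#include <stdio.h>

#define CREDITS_PER_MSEC 10
#define NSEC_PER_MSEC    1000000ULL

static unsigned int burn(uint64_t delta_ns, uint64_t *residual)
{
    uint64_t val = delta_ns * CREDITS_PER_MSEC + *residual;

    *residual = val % NSEC_PER_MSEC;
    return (unsigned int)(val / NSEC_PER_MSEC);
}

int main(void)
{
    uint64_t residual = 0;
    unsigned int credits;

    /* 2.35ms burns 23 credits; the half-credit remainder is carried. */
    credits = burn(2350000, &residual);
    printf("credits=%u residual=%llu\n", credits, (unsigned long long)residual);

    /* Another 0.65ms: 6.5 credits plus the carried 0.5 make exactly 7. */
    credits = burn(650000, &residual);
    printf("credits=%u residual=%llu\n", credits, (unsigned long long)residual);
    return 0;
}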
-
-static bool_t __read_mostly opt_tickle_one_idle = 1;
-boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle);
-
-DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
-
-static inline void __runq_tickle(struct csched_unit *new)
-{
- unsigned int cpu = sched_unit_master(new->unit);
- struct sched_resource *sr = get_sched_res(cpu);
- struct sched_unit *unit = new->unit;
- struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu));
- struct csched_private *prv = CSCHED_PRIV(sr->scheduler);
- cpumask_t mask, idle_mask, *online;
- int balance_step, idlers_empty;
-
- ASSERT(cur);
- cpumask_clear(&mask);
-
- online = cpupool_domain_master_cpumask(new->sdom->dom);
- cpumask_and(&idle_mask, prv->idlers, online);
- idlers_empty = cpumask_empty(&idle_mask);
-
- /*
- * Exclusive pinning is when a unit has hard-affinity with only one
- * cpu, and there is no other unit that has hard-affinity with that
- * same cpu. This is infrequent, but if it happens, is for achieving
- * the most possible determinism, and least possible overhead for
- * the units in question.
- *
- * Try to identify the vast majority of these situations, and deal
- * with them quickly.
- */
- if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) &&
- cpumask_test_cpu(cpu, &idle_mask)) )
- {
- ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu);
- SCHED_STAT_CRANK(tickled_idle_cpu_excl);
- __cpumask_set_cpu(cpu, &mask);
- goto tickle;
- }
-
- /*
- * If the pcpu is idle, or there are no idlers and the new
- * unit is a higher priority than the old unit, run it here.
- *
- * If there are idle cpus, first try to find one suitable to run
- * new, so we can avoid preempting cur. If we cannot find a
- * suitable idler on which to run new, run it here, but try to
- * find a suitable idler on which to run cur instead.
- */
- if ( cur->pri == CSCHED_PRI_IDLE
- || (idlers_empty && new->pri > cur->pri) )
- {
- if ( cur->pri != CSCHED_PRI_IDLE )
- SCHED_STAT_CRANK(tickled_busy_cpu);
- else
- SCHED_STAT_CRANK(tickled_idle_cpu);
- __cpumask_set_cpu(cpu, &mask);
- }
- else if ( !idlers_empty )
- {
- /*
- * Soft and hard affinity balancing loop. For units without
- * a useful soft affinity, consider hard affinity only.
- */
- for_each_affinity_balance_step( balance_step )
- {
- int new_idlers_empty;
-
- if ( balance_step == BALANCE_SOFT_AFFINITY
- && !has_soft_affinity(unit) )
- continue;
-
- /* Are there idlers suitable for new (for this balance step)? */
- affinity_balance_cpumask(unit, balance_step,
- cpumask_scratch_cpu(cpu));
- cpumask_and(cpumask_scratch_cpu(cpu),
- cpumask_scratch_cpu(cpu), &idle_mask);
- new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu));
-
- /*
- * Let's not be too harsh! If there aren't idlers suitable
- * for new in its soft affinity mask, make sure we check its
- * hard affinity as well, before taking final decisions.
- */
- if ( new_idlers_empty
- && balance_step == BALANCE_SOFT_AFFINITY )
- continue;
-
- /*
- * If there are no suitable idlers for new, and it's higher
- * priority than cur, check whether we can migrate cur away.
- * We have to do it indirectly, via _VPF_migrating (instead
- * of just tickling any idler suitable for cur) because cur
- * is running.
- *
- * If there are suitable idlers for new, no matter priorities,
- * leave cur alone (as it is running and is, likely, cache-hot)
- * and wake some of them (which is waking up and so is, likely,
- * cache cold anyway).
- */
- if ( new_idlers_empty && new->pri > cur->pri )
- {
- if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) )
- {
- SCHED_UNIT_STAT_CRANK(cur, kicked_away);
- SCHED_UNIT_STAT_CRANK(cur, migrate_r);
- SCHED_STAT_CRANK(migrate_kicked_away);
- sched_set_pause_flags_atomic(cur->unit, _VPF_migrating);
- }
- /* Tickle cpu anyway, to let new preempt cur. */
- SCHED_STAT_CRANK(tickled_busy_cpu);
- __cpumask_set_cpu(cpu, &mask);
- }
- else if ( !new_idlers_empty )
- {
- /* Which of the idlers suitable for new shall we wake up? */
- SCHED_STAT_CRANK(tickled_idle_cpu);
- if ( opt_tickle_one_idle )
- {
- this_cpu(last_tickle_cpu) =
- cpumask_cycle(this_cpu(last_tickle_cpu),
- cpumask_scratch_cpu(cpu));
- __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
- }
- else
- cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu));
- }
-
- /* Did we find anyone? */
- if ( !cpumask_empty(&mask) )
- break;
- }
- }
-
- tickle:
- if ( !cpumask_empty(&mask) )
- {
- if ( unlikely(tb_init_done) )
- {
- /* Avoid TRACE_*: saves checking !tb_init_done each step */
- for_each_cpu(cpu, &mask)
- __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu);
- }
-
- /*
- * Mark the designated CPUs as busy and send them all the scheduler
- * interrupt. We need the for_each_cpu for dealing with the
- * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and
- * can't use cpumask_andnot(), because prv->idlers needs atomic access.
- *
- * In the default (and most common) case, when opt_tickle_one_idle is
- * true, the loop does only one step, and only one bit is cleared.
- */
- for_each_cpu(cpu, &mask)
- cpumask_clear_cpu(cpu, prv->idlers);
- cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ);
- }
- else
- SCHED_STAT_CRANK(tickled_no_cpu);
-}
-
-static void
-csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- struct csched_private *prv = CSCHED_PRIV(ops);
-
- /*
- * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're
- * being called from CPU_UP_CANCELLED, because bringing up a pCPU failed
- * very early. xfree() does not really mind, but we want to be sure that,
- * when we get here, either init_pdata has never been called, or
- * deinit_pdata has been called already.
- */
- ASSERT(!cpumask_test_cpu(cpu, prv->cpus));
-
- xfree(pcpu);
-}
-
-static void
-csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- struct csched_private *prv = CSCHED_PRIV(ops);
- struct csched_pcpu *spc = pcpu;
- unsigned int node = cpu_to_node(cpu);
- unsigned long flags;
-
- /*
- * Scheduler specific data for this pCPU must still be there and be
- * valid. In fact, if we are here:
- * 1. alloc_pdata must have been called for this cpu, and free_pdata
- * must not have been called on it before us,
- * 2. init_pdata must have been called on this cpu, and deinit_pdata
- * (us!) must not have been called on it already.
- */
- ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus));
-
- spin_lock_irqsave(&prv->lock, flags);
-
- prv->credit -= prv->credits_per_tslice;
- prv->ncpus--;
- cpumask_clear_cpu(cpu, prv->idlers);
- cpumask_clear_cpu(cpu, prv->cpus);
- if ( (prv->master == cpu) && (prv->ncpus > 0) )
- {
- prv->master = cpumask_first(prv->cpus);
- migrate_timer(&prv->master_ticker, prv->master);
- }
- if ( prv->balance_bias[node] == cpu )
- {
- cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node));
- if ( !cpumask_empty(cpumask_scratch) )
- prv->balance_bias[node] = cpumask_first(cpumask_scratch);
- }
- kill_timer(&spc->ticker);
- if ( prv->ncpus == 0 )
- kill_timer(&prv->master_ticker);
-
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static void *
-csched_alloc_pdata(const struct scheduler *ops, int cpu)
-{
- struct csched_pcpu *spc;
-
- /* Allocate per-PCPU info */
- spc = xzalloc(struct csched_pcpu);
- if ( spc == NULL )
- return ERR_PTR(-ENOMEM);
-
- return spc;
-}
-
-static void
-init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
-{
- ASSERT(spin_is_locked(&prv->lock));
- /* cpu data needs to be allocated, but STILL uninitialized. */
- ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL);
-
- /* Initialize/update system-wide config */
- prv->credit += prv->credits_per_tslice;
- prv->ncpus++;
- cpumask_set_cpu(cpu, prv->cpus);
- if ( prv->ncpus == 1 )
- {
- prv->master = cpu;
- init_timer(&prv->master_ticker, csched_acct, prv, cpu);
- set_timer(&prv->master_ticker, NOW() + prv->tslice);
- }
-
- cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu)));
- if ( cpumask_weight(cpumask_scratch) == 1 )
- prv->balance_bias[cpu_to_node(cpu)] = cpu;
-
- init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
- set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
-
- INIT_LIST_HEAD(&spc->runq);
- spc->runq_sort_last = prv->runq_sort;
- spc->idle_bias = nr_cpu_ids - 1;
-
- /* Start off idling... */
- BUG_ON(!is_idle_unit(curr_on_cpu(cpu)));
- cpumask_set_cpu(cpu, prv->idlers);
- spc->nr_runnable = 0;
-}
-
-static void
-csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
-{
- unsigned long flags;
- struct csched_private *prv = CSCHED_PRIV(ops);
-
- spin_lock_irqsave(&prv->lock, flags);
- init_pdata(prv, pdata, cpu);
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-/* Change the scheduler of cpu to us (Credit). */
-static spinlock_t *
-csched_switch_sched(struct scheduler *new_ops, unsigned int cpu,
- void *pdata, void *vdata)
-{
- struct sched_resource *sr = get_sched_res(cpu);
- struct csched_private *prv = CSCHED_PRIV(new_ops);
- struct csched_unit *svc = vdata;
-
- ASSERT(svc && is_idle_unit(svc->unit));
-
- sched_idle_unit(cpu)->priv = vdata;
-
- /*
- * We are holding the runqueue lock already (it's been taken in
- * schedule_cpu_switch()). It actually may or may not be the 'right'
- * one for this cpu, but that is ok for preventing races.
- */
- ASSERT(!local_irq_is_enabled());
- spin_lock(&prv->lock);
- init_pdata(prv, pdata, cpu);
- spin_unlock(&prv->lock);
-
- return &sr->_lock;
-}
-
-#ifndef NDEBUG
-static inline void
-__csched_unit_check(struct sched_unit *unit)
-{
- struct csched_unit * const svc = CSCHED_UNIT(unit);
- struct csched_dom * const sdom = svc->sdom;
-
- BUG_ON( svc->unit != unit );
- BUG_ON( sdom != CSCHED_DOM(unit->domain) );
- if ( sdom )
- {
- BUG_ON( is_idle_unit(unit) );
- BUG_ON( sdom->dom != unit->domain );
- }
- else
- {
- BUG_ON( !is_idle_unit(unit) );
- }
-
- SCHED_STAT_CRANK(unit_check);
-}
-#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit))
-#else
-#define CSCHED_UNIT_CHECK(unit)
-#endif
-
-/*
- * Delay, in microseconds, between migrations of a UNIT between PCPUs.
- * This prevents rapid fluttering of a UNIT between CPUs, and reduces the
- * implicit overheads such as cache-warming. 1ms (1000) has been measured
- * as a good value.
- */
-static unsigned int vcpu_migration_delay_us;
-integer_param("vcpu_migration_delay", vcpu_migration_delay_us);
-
-static inline bool
-__csched_vcpu_is_cache_hot(const struct csched_private *prv,
- const struct csched_unit *svc)
-{
- bool hot = prv->unit_migr_delay &&
- (NOW() - svc->last_sched_time) < prv->unit_migr_delay;
-
- if ( hot )
- SCHED_STAT_CRANK(unit_hot);
-
- return hot;
-}
-
-static inline int
-__csched_unit_is_migrateable(const struct csched_private *prv,
- struct sched_unit *unit,
- int dest_cpu, cpumask_t *mask)
-{
- const struct csched_unit *svc = CSCHED_UNIT(unit);
- /*
- * Don't pick up work that's hot on peer PCPU, or that can't (or
- * would prefer not to) run on cpu.
- *
- * The caller is supposed to have already checked that unit is also
- * not running.
- */
- ASSERT(!unit->is_running);
-
- return !__csched_vcpu_is_cache_hot(prv, svc) &&
- cpumask_test_cpu(dest_cpu, mask);
-}
-
-static int
-_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit,
- bool_t commit)
-{
- int cpu = sched_unit_master(unit);
- /* We must always use cpu's scratch space */
- cpumask_t *cpus = cpumask_scratch_cpu(cpu);
- cpumask_t idlers;
- cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
- struct csched_pcpu *spc = NULL;
- int balance_step;
-
- for_each_affinity_balance_step( balance_step )
- {
- affinity_balance_cpumask(unit, balance_step, cpus);
- cpumask_and(cpus, online, cpus);
- /*
- * We want to pick up a pcpu among the ones that are online and
- * can accommodate vc. As far as hard affinity is concerned, there
- * always will be at least one of these pcpus in the scratch cpumask,
- * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below
- * are ok.
- *
- * On the other hand, when considering soft affinity, it is possible
- * that the mask is empty (for instance, if the domain has been put
- * in a cpupool that does not contain any of the pcpus in its soft
- * affinity), which would result in the ASSERT()-s inside cpumask_*()
- * operations triggering (in debug builds).
- *
- * Therefore, if that is the case, we just skip the soft affinity
- * balancing step all together.
- */
- if ( balance_step == BALANCE_SOFT_AFFINITY &&
- (!has_soft_affinity(unit) || cpumask_empty(cpus)) )
- continue;
-
- /* If present, prefer vc's current processor */
- cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
- ? sched_unit_master(unit)
- : cpumask_cycle(sched_unit_master(unit), cpus);
- ASSERT(cpumask_test_cpu(cpu, cpus));
-
- /*
- * Try to find an idle processor within the above constraints.
- *
- * In multi-core and multi-threaded CPUs, not all idle execution
- * vehicles are equal!
- *
- * We give preference to the idle execution vehicle with the most
- * idling neighbours in its grouping. This distributes work across
- * distinct cores first and guarantees we don't do something stupid
- * like run two UNITs on co-hyperthreads while there are idle cores
- * or sockets.
- *
- * Notice that, when computing the "idleness" of cpu, we may want to
- * discount unit. That is, iff unit is the currently running and the
- * only runnable unit on cpu, we add cpu to the idlers.
- */
- cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers);
- if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) )
- __cpumask_set_cpu(cpu, &idlers);
- cpumask_and(cpus, &idlers, cpus);
-
- /*
- * It is important that cpu points to an idle processor, if a suitable
- * one exists (and we can use cpus to check and, possibly, choose a new
- * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and
- * cpu points to a busy thread with an idle sibling, both the threads
- * will be considered the same, from the "idleness" calculation point
- * of view", preventing unit from being moved to the thread that is
- * actually idle.
- *
- * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so
- * we check for it first.
- */
- if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) )
- cpu = cpumask_cycle(cpu, cpus);
- __cpumask_clear_cpu(cpu, cpus);
-
- while ( !cpumask_empty(cpus) )
- {
- cpumask_t cpu_idlers;
- cpumask_t nxt_idlers;
- int nxt, weight_cpu, weight_nxt;
- int migrate_factor;
-
- nxt = cpumask_cycle(cpu, cpus);
-
- if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) )
- {
- /* We're on the same socket, so check the busy-ness of threads.
- * Migrate if # of idlers is less at all */
- ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
- migrate_factor = 1;
- cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask,
- cpu));
- cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask,
- nxt));
- }
- else
- {
- /* We're on different sockets, so check the busy-ness of cores.
- * Migrate only if the other core is twice as idle */
- ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
- migrate_factor = 2;
- cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu));
- cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt));
- }
-
- weight_cpu = cpumask_weight(&cpu_idlers);
- weight_nxt = cpumask_weight(&nxt_idlers);
- /* smt_power_savings: consolidate work rather than spreading it */
- if ( sched_smt_power_savings ?
- weight_cpu > weight_nxt :
- weight_cpu * migrate_factor < weight_nxt )
- {
- cpumask_and(&nxt_idlers, &nxt_idlers, cpus);
- spc = CSCHED_PCPU(nxt);
- cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers);
- cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu));
- }
- else
- {
- cpumask_andnot(cpus, cpus, &nxt_idlers);
- }
- }
-
- /* Stop if cpu is idle */
- if ( cpumask_test_cpu(cpu, &idlers) )
- break;
- }
-
- if ( commit && spc )
- spc->idle_bias = cpu;
-
- TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id,
- cpu);
-
- return cpu;
-}
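The idler-weight comparison in _csched_cpu_pick() means: within a socket, migrate whenever the candidate group has strictly more idle siblings; across sockets, only when the other side is at least twice as idle; with sched_smt_power_savings the goal flips to consolidation. The sketch below evaluates that inequality in isolation (the helper name and the sample weights are illustrative only).

/* The idler-weight comparison used when picking a pCPU: prefer 'nxt'
 * if its group has enough more idlers than 'cpu''s group.
 * migrate_factor is 1 within a socket, 2 across sockets. */
#include <stdbool.h>
#include <stdio.h>

static bool prefer_nxt(int weight_cpu, int weight_nxt,
                       int migrate_factor, bool smt_power_savings)
{
    return smt_power_savings ? weight_cpu > weight_nxt
                             : weight_cpu * migrate_factor < weight_nxt;
}

int main(void)
{
    /* Same socket: 1 idle sibling here vs 2 there -> migrate. */
    printf("same socket: %d\n", prefer_nxt(1, 2, 1, false));
    /* Different socket: 1 vs 2 is not "twice as idle" -> stay. */
    printf("cross socket: %d\n", prefer_nxt(1, 2, 2, false));
    /* Power savings: consolidate, so move toward the less idle group. */
    printf("power saving: %d\n", prefer_nxt(3, 1, 1, true));
    return 0;
}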
-
-static struct sched_resource *
-csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
-{
- struct csched_unit *svc = CSCHED_UNIT(unit);
-
- /*
- * We have been called by vcpu_migrate() (in schedule.c), as part
- * of the process of seeing if vc can be migrated to another pcpu.
- * We make a note about this in svc->flags so that later, in
- * csched_unit_wake() (still called from vcpu_migrate()) we won't
- * get boosted, which we don't deserve as we are "only" migrating.
- */
- set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags);
- return get_sched_res(_csched_cpu_pick(ops, unit, 1));
-}
-
-static inline void
-__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc)
-{
- struct csched_dom * const sdom = svc->sdom;
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
-
- if ( list_empty(&svc->active_unit_elem) )
- {
- SCHED_UNIT_STAT_CRANK(svc, state_active);
- SCHED_STAT_CRANK(acct_unit_active);
-
- sdom->active_unit_count++;
- list_add(&svc->active_unit_elem, &sdom->active_unit);
- /* Make weight per-unit */
- prv->weight += sdom->weight;
- if ( list_empty(&sdom->active_sdom_elem) )
- {
- list_add(&sdom->active_sdom_elem, &prv->active_sdom);
- }
- }
-
- TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id,
- svc->unit->unit_id, sdom->active_unit_count);
-
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static inline void
-__csched_unit_acct_stop_locked(struct csched_private *prv,
- struct csched_unit *svc)
-{
- struct csched_dom * const sdom = svc->sdom;
-
- BUG_ON( list_empty(&svc->active_unit_elem) );
-
- SCHED_UNIT_STAT_CRANK(svc, state_idle);
- SCHED_STAT_CRANK(acct_unit_idle);
-
- BUG_ON( prv->weight < sdom->weight );
- sdom->active_unit_count--;
- list_del_init(&svc->active_unit_elem);
- prv->weight -= sdom->weight;
- if ( list_empty(&sdom->active_unit) )
- {
- list_del_init(&sdom->active_sdom_elem);
- }
-
- TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id,
- svc->unit->unit_id, sdom->active_unit_count);
-}
-
-static void
-csched_unit_acct(struct csched_private *prv, unsigned int cpu)
-{
- struct sched_unit *currunit = current->sched_unit;
- struct csched_unit * const svc = CSCHED_UNIT(currunit);
- struct sched_resource *sr = get_sched_res(cpu);
- const struct scheduler *ops = sr->scheduler;
-
- ASSERT( sched_unit_master(currunit) == cpu );
- ASSERT( svc->sdom != NULL );
- ASSERT( !is_idle_unit(svc->unit) );
-
- /*
- * If this UNIT's priority was boosted when it last awoke, reset it.
-     * If the UNIT is found here, then it's consuming a non-negligible
- * amount of CPU resources and should no longer be boosted.
- */
- if ( svc->pri == CSCHED_PRI_TS_BOOST )
- {
- svc->pri = CSCHED_PRI_TS_UNDER;
- TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id,
- svc->unit->unit_id);
- }
-
- /*
- * Update credits
- */
- burn_credits(svc, NOW());
-
- /*
- * Put this UNIT and domain back on the active list if it was
- * idling.
- */
- if ( list_empty(&svc->active_unit_elem) )
- {
- __csched_unit_acct_start(prv, svc);
- }
- else
- {
- unsigned int new_cpu;
- unsigned long flags;
- spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags);
-
- /*
- * If it's been active a while, check if we'd be better off
- * migrating it to run elsewhere (see multi-core and multi-thread
- * support in csched_res_pick()).
- */
- new_cpu = _csched_cpu_pick(ops, currunit, 0);
-
- unit_schedule_unlock_irqrestore(lock, flags, currunit);
-
- if ( new_cpu != cpu )
- {
- SCHED_UNIT_STAT_CRANK(svc, migrate_r);
- SCHED_STAT_CRANK(migrate_running);
- sched_set_pause_flags_atomic(currunit, _VPF_migrating);
- /*
- * As we are about to tickle cpu, we should clear its bit in
- * idlers. But, if we are here, it means there is someone running
- * on it, and hence the bit must be zero already.
- */
- ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers));
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- }
- }
-}
-
-static void *
-csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
- void *dd)
-{
- struct csched_unit *svc;
-
- /* Allocate per-UNIT info */
- svc = xzalloc(struct csched_unit);
- if ( svc == NULL )
- return NULL;
-
- INIT_LIST_HEAD(&svc->runq_elem);
- INIT_LIST_HEAD(&svc->active_unit_elem);
- svc->sdom = dd;
- svc->unit = unit;
- svc->pri = is_idle_unit(unit) ?
- CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
- SCHED_UNIT_STATS_RESET(svc);
- SCHED_STAT_CRANK(unit_alloc);
- return svc;
-}
-
-static void
-csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched_unit *svc = unit->priv;
- spinlock_t *lock;
-
- BUG_ON( is_idle_unit(unit) );
-
-    /* csched_res_pick() looks at the unit's current runq; we need the lock. */
- lock = unit_schedule_lock_irq(unit);
-
- sched_set_res(unit, csched_res_pick(ops, unit));
-
- spin_unlock_irq(lock);
-
- lock = unit_schedule_lock_irq(unit);
-
- if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running )
- runq_insert(svc);
-
- unit_schedule_unlock_irq(lock, unit);
-
- SCHED_STAT_CRANK(unit_insert);
-}
-
-static void
-csched_free_udata(const struct scheduler *ops, void *priv)
-{
- struct csched_unit *svc = priv;
-
- BUG_ON( !list_empty(&svc->runq_elem) );
-
- xfree(svc);
-}
-
-static void
-csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched_private *prv = CSCHED_PRIV(ops);
- struct csched_unit * const svc = CSCHED_UNIT(unit);
- struct csched_dom * const sdom = svc->sdom;
-
- SCHED_STAT_CRANK(unit_remove);
-
- ASSERT(!__unit_on_runq(svc));
-
- if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
- {
- SCHED_STAT_CRANK(unit_unpark);
- sched_unit_unpause(svc->unit);
- }
-
- spin_lock_irq(&prv->lock);
-
- if ( !list_empty(&svc->active_unit_elem) )
- __csched_unit_acct_stop_locked(prv, svc);
-
- spin_unlock_irq(&prv->lock);
-
- BUG_ON( sdom == NULL );
-}
-
-static void
-csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched_unit * const svc = CSCHED_UNIT(unit);
- unsigned int cpu = sched_unit_master(unit);
- struct sched_resource *sr = get_sched_res(cpu);
-
- SCHED_STAT_CRANK(unit_sleep);
-
- BUG_ON( is_idle_unit(unit) );
-
- if ( curr_on_cpu(cpu) == unit )
- {
- /*
- * We are about to tickle cpu, so we should clear its bit in idlers.
- * But, we are here because unit is going to sleep while running on cpu,
- * so the bit must be zero already.
- */
- ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers));
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- }
- else if ( __unit_on_runq(svc) )
- runq_remove(svc);
-}
-
-static void
-csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched_unit * const svc = CSCHED_UNIT(unit);
- bool_t migrating;
-
- BUG_ON( is_idle_unit(unit) );
-
- if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
- {
- SCHED_STAT_CRANK(unit_wake_running);
- return;
- }
- if ( unlikely(__unit_on_runq(svc)) )
- {
- SCHED_STAT_CRANK(unit_wake_onrunq);
- return;
- }
-
- if ( likely(unit_runnable(unit)) )
- SCHED_STAT_CRANK(unit_wake_runnable);
- else
- SCHED_STAT_CRANK(unit_wake_not_runnable);
-
- /*
- * We temporarily boost the priority of awaking UNITs!
- *
-     * If this UNIT consumes a non-negligible amount of CPU, it
- * will eventually find itself in the credit accounting code
- * path where its priority will be reset to normal.
- *
- * If on the other hand the UNIT consumes little CPU and is
- * blocking and awoken a lot (doing I/O for example), its
-     * priority will remain boosted, optimizing its wake-to-run
- * latencies.
- *
- * This allows wake-to-run latency sensitive UNITs to preempt
- * more CPU resource intensive UNITs without impacting overall
- * system fairness.
- *
- * There are two cases, when we don't want to boost:
- * - UNITs that are waking up after a migration, rather than
-     *    after having blocked;
- * - UNITs of capped domains unpausing after earning credits
- * they had overspent.
- */
- migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags);
-
- if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER &&
- !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
- {
- TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id,
- unit->unit_id);
- SCHED_STAT_CRANK(unit_boost);
- svc->pri = CSCHED_PRI_TS_BOOST;
- }
-
- /* Put the UNIT on the runq and tickle CPUs */
- runq_insert(svc);
- __runq_tickle(svc);
-}
-
-static void
-csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched_unit * const svc = CSCHED_UNIT(unit);
-
-    /* Let the scheduler know that this unit is trying to yield */
- set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags);
-}
-
-static int
-csched_dom_cntl(
- const struct scheduler *ops,
- struct domain *d,
- struct xen_domctl_scheduler_op *op)
-{
- struct csched_dom * const sdom = CSCHED_DOM(d);
- struct csched_private *prv = CSCHED_PRIV(ops);
- unsigned long flags;
- int rc = 0;
-
- /* Protect both get and put branches with the pluggable scheduler
- * lock. Runq lock not needed anywhere in here. */
- spin_lock_irqsave(&prv->lock, flags);
-
- switch ( op->cmd )
- {
- case XEN_DOMCTL_SCHEDOP_getinfo:
- op->u.credit.weight = sdom->weight;
- op->u.credit.cap = sdom->cap;
- break;
- case XEN_DOMCTL_SCHEDOP_putinfo:
- if ( op->u.credit.weight != 0 )
- {
- if ( !list_empty(&sdom->active_sdom_elem) )
- {
- prv->weight -= sdom->weight * sdom->active_unit_count;
- prv->weight += op->u.credit.weight * sdom->active_unit_count;
- }
- sdom->weight = op->u.credit.weight;
- }
-
- if ( op->u.credit.cap != (uint16_t)~0U )
- sdom->cap = op->u.credit.cap;
- break;
- default:
- rc = -EINVAL;
- break;
- }
-
- spin_unlock_irqrestore(&prv->lock, flags);
-
- return rc;
-}
-
-static void
-csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit,
- const cpumask_t *hard, const cpumask_t *soft)
-{
- struct csched_unit *svc = CSCHED_UNIT(unit);
-
- if ( !hard )
- return;
-
- /* Are we becoming exclusively pinned? */
- if ( cpumask_weight(hard) == 1 )
- set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags);
- else
- clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags);
-}
-
-static inline void
-__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms)
-{
- prv->tslice = MILLISECS(timeslice_ms);
- prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
- if ( timeslice_ms < prv->ticks_per_tslice )
- prv->ticks_per_tslice = 1;
- prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice;
- prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms;
- prv->credit = prv->credits_per_tslice * prv->ncpus;
-}
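-
-/*
- * Illustrative sketch, not part of the original file: the tick arithmetic
- * of __csched_set_tslice() on plain integers, with hypothetical names, so
- * the defaults are easy to check: a 30ms timeslice with 3 ticks per
- * timeslice gives a 10000us tick period, while a timeslice shorter than
- * the number of ticks falls back to a single tick.
- */
-static inline unsigned int sketch_tick_period_us(unsigned int timeslice_ms,
-                                                 unsigned int ticks_per_tslice)
-{
-    if ( timeslice_ms < ticks_per_tslice )
-        ticks_per_tslice = 1;
-    return timeslice_ms * 1000 / ticks_per_tslice;
-}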
-
-static int
-csched_sys_cntl(const struct scheduler *ops,
- struct xen_sysctl_scheduler_op *sc)
-{
- int rc = -EINVAL;
- struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit;
- struct csched_private *prv = CSCHED_PRIV(ops);
- unsigned long flags;
-
- switch ( sc->cmd )
- {
- case XEN_SYSCTL_SCHEDOP_putinfo:
- if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX
- || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN
- || (params->ratelimit_us
- && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
- || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
- || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms)
- || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US )
- goto out;
-
- spin_lock_irqsave(&prv->lock, flags);
- __csched_set_tslice(prv, params->tslice_ms);
- if ( !prv->ratelimit && params->ratelimit_us )
- printk(XENLOG_INFO "Enabling context switch rate limiting\n");
- else if ( prv->ratelimit && !params->ratelimit_us )
- printk(XENLOG_INFO "Disabling context switch rate limiting\n");
- prv->ratelimit = MICROSECS(params->ratelimit_us);
- prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us);
- spin_unlock_irqrestore(&prv->lock, flags);
-
- /* FALLTHRU */
- case XEN_SYSCTL_SCHEDOP_getinfo:
- params->tslice_ms = prv->tslice / MILLISECS(1);
- params->ratelimit_us = prv->ratelimit / MICROSECS(1);
- params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1);
- rc = 0;
- break;
- }
- out:
- return rc;
-}
-
-static void *
-csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
-{
- struct csched_dom *sdom;
-
- sdom = xzalloc(struct csched_dom);
- if ( sdom == NULL )
- return ERR_PTR(-ENOMEM);
-
- /* Initialize credit and weight */
- INIT_LIST_HEAD(&sdom->active_unit);
- INIT_LIST_HEAD(&sdom->active_sdom_elem);
- sdom->dom = dom;
- sdom->weight = CSCHED_DEFAULT_WEIGHT;
-
- return sdom;
-}
-
-static void
-csched_free_domdata(const struct scheduler *ops, void *data)
-{
- xfree(data);
-}
-
-/*
- * This is an O(n) optimized sort of the runq.
- *
- * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk
- * through the runq and move up any UNDERs that are preceded by OVERs. We
- * remember the last UNDER to make the move up operation O(1).
- */
-static void
-csched_runq_sort(struct csched_private *prv, unsigned int cpu)
-{
- struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
- struct list_head *runq, *elem, *next, *last_under;
- struct csched_unit *svc_elem;
- spinlock_t *lock;
- unsigned long flags;
- int sort_epoch;
-
- sort_epoch = prv->runq_sort;
- if ( sort_epoch == spc->runq_sort_last )
- return;
-
- spc->runq_sort_last = sort_epoch;
-
- lock = pcpu_schedule_lock_irqsave(cpu, &flags);
-
- runq = &spc->runq;
- elem = runq->next;
- last_under = runq;
-
- while ( elem != runq )
- {
- next = elem->next;
- svc_elem = __runq_elem(elem);
-
- if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
- {
- /* does elem need to move up the runq? */
- if ( elem->prev != last_under )
- {
- list_del(elem);
- list_add(elem, last_under);
- }
- last_under = elem;
- }
-
- elem = next;
- }
-
- pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
-}
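-
-/*
- * Illustrative sketch, not part of the original file: the same single-pass
- * "move UNDERs in front of OVERs" idea on a generic list of hypothetical
- * nodes carrying a priority. Relative order within each priority class is
- * preserved, and each element is moved at most once.
- */
-struct sketch_node {
-    struct list_head elem;
-    int pri;                   /* e.g. CSCHED_PRI_TS_UNDER / _OVER */
-};
-
-static void sketch_partition_by_pri(struct list_head *q, int under_pri)
-{
-    struct list_head *elem = q->next, *last_under = q;
-
-    while ( elem != q )
-    {
-        struct list_head *next = elem->next;
-
-        if ( list_entry(elem, struct sketch_node, elem)->pri >= under_pri )
-        {
-            /* Move it right behind the last UNDER seen so far. */
-            if ( elem->prev != last_under )
-            {
-                list_del(elem);
-                list_add(elem, last_under);
-            }
-            last_under = elem;
-        }
-        elem = next;
-    }
-}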
-
-static void
-csched_acct(void* dummy)
-{
- struct csched_private *prv = dummy;
- unsigned long flags;
- struct list_head *iter_unit, *next_unit;
- struct list_head *iter_sdom, *next_sdom;
- struct csched_unit *svc;
- struct csched_dom *sdom;
- uint32_t credit_total;
- uint32_t weight_total;
- uint32_t weight_left;
- uint32_t credit_fair;
- uint32_t credit_peak;
- uint32_t credit_cap;
- int credit_balance;
- int credit_xtra;
- int credit;
-
-
- spin_lock_irqsave(&prv->lock, flags);
-
- weight_total = prv->weight;
- credit_total = prv->credit;
-
- /* Converge balance towards 0 when it drops negative */
- if ( prv->credit_balance < 0 )
- {
- credit_total -= prv->credit_balance;
- SCHED_STAT_CRANK(acct_balance);
- }
-
- if ( unlikely(weight_total == 0) )
- {
- prv->credit_balance = 0;
- spin_unlock_irqrestore(&prv->lock, flags);
- SCHED_STAT_CRANK(acct_no_work);
- goto out;
- }
-
- SCHED_STAT_CRANK(acct_run);
-
- weight_left = weight_total;
- credit_balance = 0;
- credit_xtra = 0;
- credit_cap = 0U;
-
- list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
- {
- sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
-
- BUG_ON( is_idle_domain(sdom->dom) );
- BUG_ON( sdom->active_unit_count == 0 );
- BUG_ON( sdom->weight == 0 );
- BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left );
-
- weight_left -= ( sdom->weight * sdom->active_unit_count );
-
- /*
- * A domain's fair share is computed using its weight in competition
- * with that of all other active domains.
- *
- * At most, a domain can use credits to run all its active UNITs
- * for one full accounting period. We allow a domain to earn more
- * only when the system-wide credit balance is negative.
- */
- credit_peak = sdom->active_unit_count * prv->credits_per_tslice;
- if ( prv->credit_balance < 0 )
- {
- credit_peak += ( ( -prv->credit_balance
- * sdom->weight
- * sdom->active_unit_count) +
- (weight_total - 1)
- ) / weight_total;
- }
-
- if ( sdom->cap != 0U )
- {
- credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100;
- if ( credit_cap < credit_peak )
- credit_peak = credit_cap;
-
- /* FIXME -- set cap per-unit as well...? */
- credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 )
- ) / sdom->active_unit_count;
- }
-
- credit_fair = ( ( credit_total
- * sdom->weight
- * sdom->active_unit_count )
- + (weight_total - 1)
- ) / weight_total;
-
- if ( credit_fair < credit_peak )
- {
- credit_xtra = 1;
- }
- else
- {
- if ( weight_left != 0U )
- {
- /* Give other domains a chance at unused credits */
- credit_total += ( ( ( credit_fair - credit_peak
- ) * weight_total
- ) + ( weight_left - 1 )
- ) / weight_left;
- }
-
- if ( credit_xtra )
- {
- /*
- * Lazily keep domains with extra credits at the head of
- * the queue to give others a chance at them in future
- * accounting periods.
- */
- SCHED_STAT_CRANK(acct_reorder);
- list_del(&sdom->active_sdom_elem);
- list_add(&sdom->active_sdom_elem, &prv->active_sdom);
- }
-
- credit_fair = credit_peak;
- }
-
- /* Compute fair share per UNIT */
- credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 )
- ) / sdom->active_unit_count;
-
-
- list_for_each_safe( iter_unit, next_unit, &sdom->active_unit )
- {
- svc = list_entry(iter_unit, struct csched_unit, active_unit_elem);
- BUG_ON( sdom != svc->sdom );
-
- /* Increment credit */
- atomic_add(credit_fair, &svc->credit);
- credit = atomic_read(&svc->credit);
-
- /*
- * Recompute priority or, if UNIT is idling, remove it from
- * the active list.
- */
- if ( credit < 0 )
- {
- svc->pri = CSCHED_PRI_TS_OVER;
-
- /* Park running UNITs of capped-out domains */
- if ( sdom->cap != 0U &&
- credit < -credit_cap &&
- !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
- {
- SCHED_STAT_CRANK(unit_park);
- sched_unit_pause_nosync(svc->unit);
- }
-
- /* Lower bound on credits */
- if ( credit < -prv->credits_per_tslice )
- {
- SCHED_STAT_CRANK(acct_min_credit);
- credit = -prv->credits_per_tslice;
- atomic_set(&svc->credit, credit);
- }
- }
- else
- {
- svc->pri = CSCHED_PRI_TS_UNDER;
-
- /* Unpark any capped domains whose credits go positive */
- if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) )
- {
- /*
- * It's important to unset the flag AFTER the unpause()
- * call to make sure the UNIT's priority is not boosted
- * if it is woken up here.
- */
- SCHED_STAT_CRANK(unit_unpark);
- sched_unit_unpause(svc->unit);
- clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags);
- }
-
- /* Upper bound on credits means UNIT stops earning */
- if ( credit > prv->credits_per_tslice )
- {
- __csched_unit_acct_stop_locked(prv, svc);
- /* Divide credits in half, so that when it starts
- * accounting again, it starts a little bit "ahead" */
- credit /= 2;
- atomic_set(&svc->credit, credit);
- }
- }
-
- SCHED_UNIT_STAT_SET(svc, credit_last, credit);
- SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair);
- credit_balance += credit;
- }
- }
-
- prv->credit_balance = credit_balance;
-
- spin_unlock_irqrestore(&prv->lock, flags);
-
- /* Inform each CPU that its runq needs to be sorted */
- prv->runq_sort++;
-
-out:
- set_timer( &prv->master_ticker, NOW() + prv->tslice);
-}
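-
-/*
- * Illustrative sketch, not part of the original file: the fair share
- * computation used above, in isolation, with hypothetical names. A domain's
- * share of the per-timeslice credit pool is proportional to
- * weight * active_units, rounded up (hence the "+ weight_total - 1"). For
- * example, two domains with weights 256 and 512, one active unit each,
- * split 300 credits 100/200.
- */
-static inline uint32_t sketch_credit_fair(uint32_t credit_total,
-                                          uint32_t weight,
-                                          uint32_t active_units,
-                                          uint32_t weight_total)
-{
-    return (credit_total * weight * active_units + (weight_total - 1))
-           / weight_total;
-}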
-
-static void
-csched_tick(void *_cpu)
-{
- unsigned int cpu = (unsigned long)_cpu;
- struct sched_resource *sr = get_sched_res(cpu);
- struct csched_pcpu *spc = CSCHED_PCPU(cpu);
- struct csched_private *prv = CSCHED_PRIV(sr->scheduler);
-
- spc->tick++;
-
- /*
- * Accounting for running UNIT
- */
- if ( !is_idle_unit(current->sched_unit) )
- csched_unit_acct(prv, cpu);
-
- /*
- * Check if runq needs to be sorted
- *
- * Every physical CPU resorts the runq after the accounting master has
- * modified priorities. This is a special O(n) sort and runs at most
- * once per accounting period (currently 30 milliseconds).
- */
- csched_runq_sort(prv, cpu);
-
- set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
-}
-
-static struct csched_unit *
-csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
-{
- struct sched_resource *sr = get_sched_res(cpu);
- const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler);
- const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
- struct csched_unit *speer;
- struct list_head *iter;
- struct sched_unit *unit;
-
- ASSERT(peer_pcpu != NULL);
-
- /*
- * Don't steal from an idle CPU's runq because it's about to
- * pick up work from it itself.
- */
- if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) )
- goto out;
-
- list_for_each( iter, &peer_pcpu->runq )
- {
- speer = __runq_elem(iter);
-
- /*
- * If next available UNIT here is not of strictly higher
- * priority than ours, this PCPU is useless to us.
- */
- if ( speer->pri <= pri )
- break;
-
- /* Is this UNIT runnable on our PCPU? */
- unit = speer->unit;
- BUG_ON( is_idle_unit(unit) );
-
- /*
- * If the unit is still in peer_cpu's scheduling tail, or if it
- * has no useful soft affinity, skip it.
- *
- * In fact, what we want is to check if we have any "soft-affine
- * work" to steal, before starting to look at "hard-affine work".
- *
- * Notice that, if not even one unit on this runq has a useful
-         * soft affinity, we could have avoided considering this runq for
-         * a soft balancing step in the first place. This, for instance,
-         * can be implemented by taking note of which runqueues have
- * units with useful soft affinities in some sort of bitmap
- * or counter.
- */
- if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY &&
- !has_soft_affinity(unit)) )
- continue;
-
- affinity_balance_cpumask(unit, balance_step, cpumask_scratch);
- if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) )
- {
- /* We got a candidate. Grab it! */
- TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu,
- unit->domain->domain_id, unit->unit_id);
- SCHED_UNIT_STAT_CRANK(speer, migrate_q);
- SCHED_STAT_CRANK(migrate_queued);
- runq_remove(speer);
- sched_set_res(unit, get_sched_res(cpu));
- /*
- * speer will start executing directly on cpu, without having to
- * go through runq_insert(). So we must update the runnable count
- * for cpu here.
- */
- inc_nr_runnable(cpu);
- return speer;
- }
- }
- out:
- SCHED_STAT_CRANK(steal_peer_idle);
- return NULL;
-}
-
-static struct csched_unit *
-csched_load_balance(struct csched_private *prv, int cpu,
- struct csched_unit *snext, bool *stolen)
-{
- struct cpupool *c = get_sched_res(cpu)->cpupool;
- struct csched_unit *speer;
- cpumask_t workers;
- cpumask_t *online = c->res_valid;
- int peer_cpu, first_cpu, peer_node, bstep;
- int node = cpu_to_node(cpu);
-
- BUG_ON(get_sched_res(cpu) != snext->unit->res);
-
- /*
- * If this CPU is going offline, or is not (yet) part of any cpupool
- * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
- */
- if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
- goto out;
-
- if ( snext->pri == CSCHED_PRI_IDLE )
- SCHED_STAT_CRANK(load_balance_idle);
- else if ( snext->pri == CSCHED_PRI_TS_OVER )
- SCHED_STAT_CRANK(load_balance_over);
- else
- SCHED_STAT_CRANK(load_balance_other);
-
- /*
- * Let's look around for work to steal, taking both hard affinity
- * and soft affinity into account. More specifically, we check all
- * the non-idle CPUs' runq, looking for:
- * 1. any "soft-affine work" to steal first,
- * 2. if not finding anything, any "hard-affine work" to steal.
- */
- for_each_affinity_balance_step( bstep )
- {
- /*
- * We peek at the non-idling CPUs in a node-wise fashion. In fact,
- * it is more likely that we find some affine work on our same
- * node, not to mention that migrating units within the same node
-         * could well be expected to be cheaper than across nodes (memory
- * stays local, there might be some node-wide cache[s], etc.).
- */
- peer_node = node;
- do
- {
- /* Select the pCPUs in this node that have work we can steal. */
- cpumask_andnot(&workers, online, prv->idlers);
- cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
- __cpumask_clear_cpu(cpu, &workers);
-
- first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers);
- if ( first_cpu >= nr_cpu_ids )
- goto next_node;
- peer_cpu = first_cpu;
- do
- {
- spinlock_t *lock;
-
- /*
- * If there is only one runnable unit on peer_cpu, it means
- * there's no one to be stolen in its runqueue, so skip it.
- *
- * Checking this without holding the lock is racy... But that's
- * the whole point of this optimization!
- *
- * In more details:
- * - if we race with dec_nr_runnable(), we may try to take the
- * lock and call csched_runq_steal() for no reason. This is
- * not a functional issue, and should be infrequent enough.
- * And we can avoid that by re-checking nr_runnable after
- * having grabbed the lock, if we want;
- * - if we race with inc_nr_runnable(), we skip a pCPU that may
- * have runnable units in its runqueue, but that's not a
- * problem because:
- * + if racing with csched_unit_insert() or csched_unit_wake(),
-                 *     __runq_tickle() will be called afterwards, so the unit
-                 *     won't get stuck in the runqueue for too long;
-                 *   + if racing with csched_runq_steal(), it may be that an
-                 *     unit that we could have picked up, stays in a runqueue
-                 *     until someone else tries to steal it again. But this is
-                 *     no worse than what can already happen (without this
-                 *     optimization), if the pCPU schedules right after we
-                 *     have taken the lock, and hence blocks on it.
- */
- if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
- {
- TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
- goto next_cpu;
- }
-
- /*
- * Get ahold of the scheduler lock for this peer CPU.
- *
- * Note: We don't spin on this lock but simply try it. Spinning
- * could cause a deadlock if the peer CPU is also load
- * balancing and trying to lock this CPU.
- */
- lock = pcpu_schedule_trylock(peer_cpu);
- SCHED_STAT_CRANK(steal_trylock);
- if ( !lock )
- {
- SCHED_STAT_CRANK(steal_trylock_failed);
- TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
- goto next_cpu;
- }
-
- TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
-
- /* Any work over there to steal? */
- speer = cpumask_test_cpu(peer_cpu, online) ?
- csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL;
- pcpu_schedule_unlock(lock, peer_cpu);
-
- /* As soon as one unit is found, balancing ends */
- if ( speer != NULL )
- {
- *stolen = true;
- /*
- * Next time we'll look for work to steal on this node, we
- * will start from the next pCPU, with respect to this one,
- * so we don't risk stealing always from the same ones.
- */
- prv->balance_bias[peer_node] = peer_cpu;
- return speer;
- }
-
- next_cpu:
- peer_cpu = cpumask_cycle(peer_cpu, &workers);
-
- } while( peer_cpu != first_cpu );
-
- next_node:
- peer_node = cycle_node(peer_node, node_online_map);
- } while( peer_node != node );
- }
-
- out:
- /* Failed to find more important work elsewhere... */
- __runq_remove(snext);
- return snext;
-}
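-
-/*
- * Illustrative sketch, not part of the original file, with hypothetical
- * names: why the stealing loop above only *tries* the remote lock. Two CPUs
- * balancing at the same time each hold their own runqueue lock; if either
- * spun on the other's lock they could deadlock (ABBA), so on failure we
- * just skip that peer and move on.
- */
-static bool sketch_try_steal(spinlock_t *my_lock, spinlock_t *peer_lock)
-{
-    bool stolen = false;
-
-    ASSERT(spin_is_locked(my_lock));  /* we already hold our own lock */
-
-    if ( !spin_trylock(peer_lock) )   /* don't spin: peer may want ours */
-        return false;
-
-    /* ... inspect the peer runqueue and steal a unit here ... */
-
-    spin_unlock(peer_lock);
-    return stolen;
-}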
-
-/*
- * This function is in the critical path. It is designed to be simple and
- * fast for the common case.
- */
-static void csched_schedule(
- const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
- bool tasklet_work_scheduled)
-{
- const unsigned int cur_cpu = smp_processor_id();
- const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
- struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu);
- struct list_head * const runq = RUNQ(sched_cpu);
- struct csched_unit * const scurr = CSCHED_UNIT(unit);
- struct csched_private *prv = CSCHED_PRIV(ops);
- struct csched_unit *snext;
- s_time_t runtime, tslice;
- bool migrated = false;
-
- SCHED_STAT_CRANK(schedule);
- CSCHED_UNIT_CHECK(unit);
-
- /*
- * Here in Credit1 code, we usually just call TRACE_nD() helpers, and
- * don't care about packing. But scheduling happens very often, so it
- * actually is important that the record is as small as possible.
- */
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned cpu:16, tasklet:8, idle:8;
- } d;
- d.cpu = cur_cpu;
- d.tasklet = tasklet_work_scheduled;
- d.idle = is_idle_unit(unit);
- __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d),
- (unsigned char *)&d);
- }
-
- runtime = now - unit->state_entry_time;
- if ( runtime < 0 ) /* Does this ever happen? */
- runtime = 0;
-
- if ( !is_idle_unit(unit) )
- {
- /* Update credits of a non-idle UNIT. */
- burn_credits(scurr, now);
- scurr->start_time -= now;
- scurr->last_sched_time = now;
- }
- else
- {
- /* Re-instate a boosted idle UNIT as normal-idle. */
- scurr->pri = CSCHED_PRI_IDLE;
- }
-
- /* Choices, choices:
- * - If we have a tasklet, we need to run the idle unit no matter what.
- * - If sched rate limiting is in effect, and the current unit has
- * run for less than that amount of time, continue the current one,
- * but with a shorter timeslice and return it immediately
-     * - Otherwise, choose the one with the highest priority (which may
- * be the one currently running)
- * - If the currently running one is TS_OVER, see if there
- * is a higher priority one waiting on the runqueue of another
- * cpu and steal it.
- */
-
- /*
- * If we have schedule rate limiting enabled, check to see
- * how long we've run for.
- *
- * If scurr is yielding, however, we don't let rate limiting kick in.
- * In fact, it may be the case that scurr is about to spin, and there's
- * no point forcing it to do so until rate limiting expires.
- */
- if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags)
- && !tasklet_work_scheduled
- && prv->ratelimit
- && unit_runnable_state(unit)
- && !is_idle_unit(unit)
- && runtime < prv->ratelimit )
- {
- snext = scurr;
- snext->start_time += now;
- perfc_incr(delay_ms);
- /*
- * Next timeslice must last just until we'll have executed for
- * ratelimit. However, to avoid setting a really short timer, which
- * will most likely be inaccurate and counterproductive, we never go
- * below CSCHED_MIN_TIMER.
- */
- tslice = prv->ratelimit - runtime;
-        if ( unlikely(tslice < CSCHED_MIN_TIMER) )
- tslice = CSCHED_MIN_TIMER;
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned runtime;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.runtime = runtime;
- __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d),
- (unsigned char *)&d);
- }
-
- goto out;
- }
- tslice = prv->tslice;
-
- /*
- * Select next runnable local UNIT (ie top of local runq)
- */
- if ( unit_runnable(unit) )
- __runq_insert(scurr);
- else
- {
- BUG_ON( is_idle_unit(unit) || list_empty(runq) );
- /* Current has blocked. Update the runnable counter for this cpu. */
- dec_nr_runnable(sched_cpu);
- }
-
- /*
- * Clear YIELD flag before scheduling out
- */
- clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags);
-
- do {
- snext = __runq_elem(runq->next);
-
- /* Tasklet work (which runs in idle UNIT context) overrides all else. */
- if ( tasklet_work_scheduled )
- {
- TRACE_0D(TRC_CSCHED_SCHED_TASKLET);
- snext = CSCHED_UNIT(sched_idle_unit(sched_cpu));
- snext->pri = CSCHED_PRI_TS_BOOST;
- }
-
- /*
- * SMP Load balance:
- *
- * If the next highest priority local runnable UNIT has already eaten
- * through its credits, look on other PCPUs to see if we have more
- * urgent work... If not, csched_load_balance() will return snext, but
- * already removed from the runq.
- */
- if ( snext->pri > CSCHED_PRI_TS_OVER )
- __runq_remove(snext);
- else
- snext = csched_load_balance(prv, sched_cpu, snext, &migrated);
-
- } while ( !unit_runnable_state(snext->unit) );
-
- /*
- * Update idlers mask if necessary. When we're idling, other CPUs
- * will tickle us when they get extra work.
- */
- if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE )
- {
- if ( !cpumask_test_cpu(sched_cpu, prv->idlers) )
- cpumask_set_cpu(sched_cpu, prv->idlers);
- }
- else if ( cpumask_test_cpu(sched_cpu, prv->idlers) )
- {
- cpumask_clear_cpu(sched_cpu, prv->idlers);
- }
-
- if ( !is_idle_unit(snext->unit) )
- snext->start_time += now;
-
-out:
- /*
- * Return task to run next...
- */
- unit->next_time = (is_idle_unit(snext->unit) ?
- -1 : tslice);
- unit->next_task = snext->unit;
- snext->unit->migrated = migrated;
-
- /* Stop credit tick when going to idle, restart it when coming from idle. */
- if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) )
- stop_timer(&spc->ticker);
- if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) )
- set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us)
- - now % MICROSECS(prv->tick_period_us) );
-
- CSCHED_UNIT_CHECK(unit->next_task);
-}
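-
-/*
- * Illustrative sketch, not part of the original file: the rate limiting
- * slice length described above, reduced to plain arithmetic with
- * hypothetical names. A unit that has run for less than the ratelimit keeps
- * the CPU for the remainder of the ratelimit, but never for a slice shorter
- * than the minimum timer granularity.
- */
-static inline s_time_t sketch_ratelimit_tslice(s_time_t runtime,
-                                               s_time_t ratelimit,
-                                               s_time_t min_timer)
-{
-    s_time_t tslice = ratelimit - runtime;
-
-    return tslice < min_timer ? min_timer : tslice;
-}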
-
-static void
-csched_dump_unit(struct csched_unit *svc)
-{
- struct csched_dom * const sdom = svc->sdom;
-
- printk("[%i.%i] pri=%i flags=%x cpu=%i",
- svc->unit->domain->domain_id,
- svc->unit->unit_id,
- svc->pri,
- svc->flags,
- sched_unit_master(svc->unit));
-
- if ( sdom )
- {
- printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit),
- sdom->weight, sdom->cap);
-#ifdef CSCHED_STATS
- printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}",
- svc->stats.credit_last,
- svc->stats.credit_incr,
- svc->stats.state_active,
- svc->stats.state_idle,
- svc->stats.migrate_q,
- svc->stats.migrate_r,
- svc->stats.kicked_away);
-#endif
- }
-
- printk("\n");
-}
-
-static void
-csched_dump_pcpu(const struct scheduler *ops, int cpu)
-{
- struct list_head *runq, *iter;
- struct csched_private *prv = CSCHED_PRIV(ops);
- struct csched_pcpu *spc;
- struct csched_unit *svc;
- spinlock_t *lock;
- unsigned long flags;
- int loop;
-
- /*
- * We need both locks:
- * - csched_dump_unit() wants to access domains' scheduling
- * parameters, which are protected by the private scheduler lock;
- * - we scan through the runqueue, so we need the proper runqueue
- * lock (the one of the runqueue of this cpu).
- */
- spin_lock_irqsave(&prv->lock, flags);
- lock = pcpu_schedule_lock(cpu);
-
- spc = CSCHED_PCPU(cpu);
- runq = &spc->runq;
-
- printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n",
- cpu, spc->nr_runnable, spc->runq_sort_last,
- CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
- CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
-
- /* current UNIT (nothing to say if that's the idle unit). */
- svc = CSCHED_UNIT(curr_on_cpu(cpu));
- if ( svc && !is_idle_unit(svc->unit) )
- {
- printk("\trun: ");
- csched_dump_unit(svc);
- }
-
- loop = 0;
- list_for_each( iter, runq )
- {
- svc = __runq_elem(iter);
- if ( svc )
- {
- printk("\t%3d: ", ++loop);
- csched_dump_unit(svc);
- }
- }
-
- pcpu_schedule_unlock(lock, cpu);
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static void
-csched_dump(const struct scheduler *ops)
-{
- struct list_head *iter_sdom, *iter_svc;
- struct csched_private *prv = CSCHED_PRIV(ops);
- int loop;
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
-
- printk("info:\n"
- "\tncpus = %u\n"
- "\tmaster = %u\n"
- "\tcredit = %u\n"
- "\tcredit balance = %d\n"
- "\tweight = %u\n"
- "\trunq_sort = %u\n"
- "\tdefault-weight = %d\n"
- "\ttslice = %"PRI_stime"ms\n"
- "\tratelimit = %"PRI_stime"us\n"
- "\tcredits per msec = %d\n"
- "\tticks per tslice = %d\n"
- "\tmigration delay = %"PRI_stime"us\n",
- prv->ncpus,
- prv->master,
- prv->credit,
- prv->credit_balance,
- prv->weight,
- prv->runq_sort,
- CSCHED_DEFAULT_WEIGHT,
- prv->tslice / MILLISECS(1),
- prv->ratelimit / MICROSECS(1),
- CSCHED_CREDITS_PER_MSEC,
- prv->ticks_per_tslice,
- prv->unit_migr_delay/ MICROSECS(1));
-
- printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers));
-
- printk("active units:\n");
- loop = 0;
- list_for_each( iter_sdom, &prv->active_sdom )
- {
- struct csched_dom *sdom;
- sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
-
- list_for_each( iter_svc, &sdom->active_unit )
- {
- struct csched_unit *svc;
- spinlock_t *lock;
-
- svc = list_entry(iter_svc, struct csched_unit, active_unit_elem);
- lock = unit_schedule_lock(svc->unit);
-
- printk("\t%3d: ", ++loop);
- csched_dump_unit(svc);
-
- unit_schedule_unlock(lock, svc->unit);
- }
- }
-
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static int __init
-csched_global_init(void)
-{
- if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX ||
- sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN )
- {
- printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n"
- " Resetting to default %u\n",
- XEN_SYSCTL_CSCHED_TSLICE_MIN,
- XEN_SYSCTL_CSCHED_TSLICE_MAX,
- CSCHED_DEFAULT_TSLICE_MS);
- sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
- }
-
- if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
- printk("WARNING: sched_ratelimit_us >"
- "sched_credit_tslice_ms is undefined\n"
- "Setting ratelimit to tslice\n");
-
- if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US )
- {
- vcpu_migration_delay_us = 0;
- printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n"
- "Resetting to default: %u\n",
- XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us);
- }
-
- return 0;
-}
-
-static int
-csched_init(struct scheduler *ops)
-{
- struct csched_private *prv;
-
- prv = xzalloc(struct csched_private);
- if ( prv == NULL )
- return -ENOMEM;
-
- prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES);
- if ( prv->balance_bias == NULL )
- {
- xfree(prv);
- return -ENOMEM;
- }
-
- if ( !zalloc_cpumask_var(&prv->cpus) ||
- !zalloc_cpumask_var(&prv->idlers) )
- {
- free_cpumask_var(prv->cpus);
- xfree(prv->balance_bias);
- xfree(prv);
- return -ENOMEM;
- }
-
- ops->sched_data = prv;
- spin_lock_init(&prv->lock);
- INIT_LIST_HEAD(&prv->active_sdom);
- prv->master = UINT_MAX;
-
- __csched_set_tslice(prv, sched_credit_tslice_ms);
-
- if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
- prv->ratelimit = prv->tslice;
- else
- prv->ratelimit = MICROSECS(sched_ratelimit_us);
-
- prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us);
-
- return 0;
-}
-
-static void
-csched_deinit(struct scheduler *ops)
-{
- struct csched_private *prv;
-
- prv = CSCHED_PRIV(ops);
- if ( prv != NULL )
- {
- ops->sched_data = NULL;
- free_cpumask_var(prv->cpus);
- free_cpumask_var(prv->idlers);
- xfree(prv->balance_bias);
- xfree(prv);
- }
-}
-
-static const struct scheduler sched_credit_def = {
- .name = "SMP Credit Scheduler",
- .opt_name = "credit",
- .sched_id = XEN_SCHEDULER_CREDIT,
- .sched_data = NULL,
-
- .global_init = csched_global_init,
-
- .insert_unit = csched_unit_insert,
- .remove_unit = csched_unit_remove,
-
- .sleep = csched_unit_sleep,
- .wake = csched_unit_wake,
- .yield = csched_unit_yield,
-
- .adjust = csched_dom_cntl,
- .adjust_affinity= csched_aff_cntl,
- .adjust_global = csched_sys_cntl,
-
- .pick_resource = csched_res_pick,
- .do_schedule = csched_schedule,
-
- .dump_cpu_state = csched_dump_pcpu,
- .dump_settings = csched_dump,
- .init = csched_init,
- .deinit = csched_deinit,
- .alloc_udata = csched_alloc_udata,
- .free_udata = csched_free_udata,
- .alloc_pdata = csched_alloc_pdata,
- .init_pdata = csched_init_pdata,
- .deinit_pdata = csched_deinit_pdata,
- .free_pdata = csched_free_pdata,
- .switch_sched = csched_switch_sched,
- .alloc_domdata = csched_alloc_domdata,
- .free_domdata = csched_free_domdata,
-};
-
-REGISTER_SCHEDULER(sched_credit_def);
--- a/xen/common/sched_credit2.c
+++ /dev/null
-
-/****************************************************************************
- * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
- ****************************************************************************
- *
- * File: common/sched_credit2.c
- * Author: George Dunlap
- *
- * Description: Credit-based SMP CPU scheduler
- * Based on an earlier version by Emmanuel Ackaouy.
- */
-
-#include <xen/init.h>
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/domain.h>
-#include <xen/delay.h>
-#include <xen/event.h>
-#include <xen/time.h>
-#include <xen/perfc.h>
-#include <xen/sched-if.h>
-#include <xen/softirq.h>
-#include <asm/div64.h>
-#include <xen/errno.h>
-#include <xen/trace.h>
-#include <xen/cpu.h>
-#include <xen/keyhandler.h>
-
-/* Meant only for helping developers during debugging. */
-/* #define d2printk printk */
-#define d2printk(x...)
-
-
-/*
- * Credit2 tracing events ("only" 512 available!). Check
- * include/public/trace.h for more details.
- */
-#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1)
-#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2)
-#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3)
-#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4)
-#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5)
-#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6)
-#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7)
-#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8)
-#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9)
-#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10)
-#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11)
-#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12)
-#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13)
-#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14)
-#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15)
-#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16)
-#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17)
-#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19)
-#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20)
-#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21)
-#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22)
-#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23)
-
-/*
- * TODO:
- * + Hyperthreading
- * - "Discount" time run on a thread with busy siblings
- * + Algorithm:
- * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
- * a flash animation in the background) can we schedule it with low enough latency
- * so that audio doesn't skip?
- * + Optimizing
- * - Profiling, making new algorithms, making math more efficient (no long division)
- */
-
-/*
- * Design:
- *
- * VMs "burn" credits based on their weight; higher weight means
- * credits burn more slowly. The highest weight unit burns credits at
- * a rate of 1 credit per nanosecond. Others burn proportionally
- * more.
- *
- * units are inserted into the runqueue by credit order.
- *
- * Credits are "reset" when the next unit in the runqueue is less than
- * or equal to zero. At that point, everyone's credits are "clipped"
- * to a small value, and a fixed credit is added to everyone.
- */
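-
-/*
- * Illustrative sketch, not part of the original file: the burn and reset
- * rules described above, on hypothetical plain integers. The heaviest unit
- * burns one credit per nanosecond of runtime; a unit with half the maximum
- * weight burns twice as fast. On reset, leftover credit is clipped and a
- * fixed amount is added to everyone.
- */
-static inline int64_t sketch_burn(int64_t credit, int64_t runtime_ns,
-                                  unsigned int weight, unsigned int max_weight)
-{
-    return credit - runtime_ns * max_weight / weight;
-}
-
-static inline int64_t sketch_reset(int64_t credit, int64_t carryover_max,
-                                   int64_t credit_init)
-{
-    if ( credit > carryover_max )      /* clip the leftover... */
-        credit = carryover_max;
-    return credit + credit_init;       /* ...and add the fixed refill */
-}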
-
-/*
- * Utilization cap:
- *
- * Setting a pCPU utilization cap for a domain means the following:
- *
- * - a domain can have a cap, expressed in terms of % of physical CPU time.
- * A domain that must not use more than 1/4 of _one_ physical CPU, will
- * be given a cap of 25%; a domain that must not use more than 1+1/2 of
- * physical CPU time, will be given a cap of 150%;
- *
- * - caps are per-domain (not per-unit). If a domain has only 1 unit, and
- *   a 40% cap, that one unit will use 40% of one pCPU. If a domain has 4
- *   units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be
- *   split among the 4 units. How much each of the units will actually get,
- * during any given interval of time, is unspecified (as it depends on
- * various aspects: workload, system load, etc.). For instance, it is
- * possible that, during a given time interval, 2 units use 100% each,
- * and the other two use nothing; while during another time interval,
- * two units use 80%, one uses 10% and the other 30%; or that each use
- * 50% (and so on and so forth).
- *
- * For implementing this, we use the following approach:
- *
- * - each domain is given a 'budget', and each domain has a timer, which
- * replenishes the domain's budget periodically. The budget is the amount
- * of time the units of the domain can use every 'period';
- *
- * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
- *   (but each domain has its own timer; so they are all periodic with the
- *   same period, but replenishment of the budgets of the various domains,
- *   at period boundaries, is not synchronous);
- *
- * - when units run, they consume budget. When they don't run, they don't
- * consume budget. If there is no budget left for the domain, no unit of
- * that domain can run. If an unit tries to run and finds that there is no
- * budget, it blocks.
- * At whatever time an unit wants to run, it must check the domain's budget,
- * and if there is some, it can use it.
- *
- * - budget is replenished to the top of the capacity for the domain once
- *   per period. Even if there was some leftover budget from the previous
- *   period, the budget after a replenishment will always be at most equal
- *   to the total capacity of the domain ('tot_budget');
- *
- * - when a budget replenishment occurs, if there are units that had been
- * blocked because of lack of budget, they'll be unblocked, and they will
- * (potentially) be able to run again.
- *
- * Finally, some even more implementation related detail:
- *
- * - budget is stored in a domain-wide pool. Units of the domain that want
- *   to run go to such pool, and grab some. When they do so, the amount
- * they grabbed is _immediately_ removed from the pool. This happens in
- * unit_grab_budget();
- *
- * - when units stop running, if they've not consumed all the budget they
- * took, the leftover is put back in the pool. This happens in
- * unit_return_budget();
- *
- * - the above means that an unit can find out that there is no budget and
- * block, not only if the cap has actually been reached (for this period),
- * but also if some other units, in order to run, have grabbed a certain
- * quota of budget, no matter whether they've already used it all or not.
- * An unit blocking because (any form of) lack of budget is said to be
- * "parked", and such blocking happens in park_unit();
- *
- * - when an unit stops running, and puts back some budget in the domain pool,
- * we need to check whether there is someone which has been parked and that
- * can be unparked. This happens in unpark_parked_units(), called from
- * csched2_context_saved();
- *
- * - of course, unparking happens also as a consequence of the domain's budget
- * being replenished by the periodic timer. This also occurs by means of
- * calling csched2_context_saved() (but from replenish_domain_budget());
- *
- * - parked units of a domain are kept in a (per-domain) list, called
- * 'parked_units'). Manipulation of the list and of the domain-wide budget
- * pool, must occur only when holding the 'budget_lock'.
- */
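-
-/*
- * Illustrative sketch, not part of the original file: the domain-wide
- * budget pool described above, with hypothetical names. Units grab a quota
- * before running and return whatever they did not consume; a return value
- * of 0 from the grab means the caller should park the unit.
- */
-struct sketch_budget_pool {
-    spinlock_t lock;
-    s_time_t budget;          /* what is left for the current period */
-};
-
-static s_time_t sketch_grab_budget(struct sketch_budget_pool *p, s_time_t want)
-{
-    s_time_t got;
-
-    spin_lock(&p->lock);
-    got = min(want, p->budget);
-    p->budget -= got;         /* removed from the pool immediately */
-    spin_unlock(&p->lock);
-
-    return got;
-}
-
-static void sketch_return_budget(struct sketch_budget_pool *p, s_time_t unused)
-{
-    spin_lock(&p->lock);
-    p->budget += unused;      /* leftover goes back into the pool */
-    spin_unlock(&p->lock);
-}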
-
-/*
- * Locking:
- *
- * - runqueue lock
- * + it is per-runqueue, so:
- * * cpus in a runqueue take the runqueue lock, when using
- * pcpu_schedule_lock() / unit_schedule_lock() (and friends),
- * * a cpu may (try to) take a "remote" runqueue lock, e.g., for
- * load balancing;
- * + serializes runqueue operations (removing and inserting units);
- * + protects runqueue-wide data in csched2_runqueue_data;
- * + protects unit parameters in csched2_unit for the unit in the
- * runqueue.
- *
- * - Private scheduler lock
- * + protects scheduler-wide data in csched2_private, such as:
- * * the list of domains active in this scheduler,
- * * what cpus and what runqueues are active and in what
- * runqueue each cpu is;
- * + serializes the operation of changing the weights of domains;
- *
- * - Budget lock
- * + it is per-domain;
- * + protects, in domains that have an utilization cap;
- * * manipulation of the total budget of the domain (as it is shared
- * among all units of the domain),
- * * manipulation of the list of units that are blocked waiting for
- * some budget to be available.
- *
- * - Type:
- * + runqueue locks are 'regular' spinlocks;
- * + the private scheduler lock can be an rwlock. In fact, data
- * it protects is modified only during initialization, cpupool
- * manipulation and when changing weights, and read in all
- * other cases (e.g., during load balancing);
- * + budget locks are 'regular' spinlocks.
- *
- * Ordering:
- *  + trylock must be used when wanting to take a runqueue lock,
- *    if we already hold another one;
- *  + if taking both a runqueue lock and the private scheduler
- *    lock, the latter must always be taken first;
- *  + if taking both a runqueue lock and a budget lock, the former
- *    must always be taken first.
- */
-
-/*
- * Basic constants
- */
-/* Default weight: How much a new domain starts with. */
-#define CSCHED2_DEFAULT_WEIGHT 256
-/*
- * Min timer: Minimum length a timer will be set, to
- * achieve efficiency.
- */
-#define CSCHED2_MIN_TIMER MICROSECS(500)
-/*
- * Amount of credit VMs begin with, and are reset to.
- * ATM, set so that highest-weight VMs can only run for 10ms
- * before a reset event.
- */
-#define CSCHED2_CREDIT_INIT MILLISECS(10)
-/*
- * Amount of credit the idle units have. It never changes, as idle
- * units do not consume credits, and it must be lower than whatever
- * amount of credit a 'regular' unit would end up with.
- */
-#define CSCHED2_IDLE_CREDIT (-(1U<<30))
-/*
- * Carryover: How much "extra" credit may be carried over after
- * a reset.
- */
-#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER
-/*
- * Stickiness: Cross-L2 migration resistance. Should be less than
- * MIN_TIMER.
- */
-#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1))
-/* How much to "compensate" an unit for L2 migration. */
-#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
-/* How tolerant we should be when peeking at runtime of units on other cpus */
-#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
-/* Reset: Value below which credit will be reset. */
-#define CSCHED2_CREDIT_RESET 0
-/* Max timer: Maximum time a guest can be run for. */
-#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT
-/* Period of the cap replenishment timer. */
-#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1))
-
-/*
- * Flags
- */
-/*
- * CSFLAG_scheduled: Is this unit either running on, or context-switching off,
- * a physical cpu?
- * + Accessed only with runqueue lock held
- * + Set when chosen as next in csched2_schedule().
- * + Cleared after context switch has been saved in csched2_context_saved()
- * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
- * set CSFLAG_delayed_runq_add
- * + Checked to be false in runq_insert.
- */
-#define __CSFLAG_scheduled 1
-#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled)
-/*
- * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done
- * being context switched out?
- * + Set when scheduling out in csched2_schedule() if prev is runnable
- * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set
- * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and
- * clears the bit.
- */
-#define __CSFLAG_delayed_runq_add 2
-#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add)
-/*
- * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a
- * credit2-initiated runq migrate request; migrate it to the runqueue indicated
- * in the svc struct.
- */
-#define __CSFLAG_runq_migrate_request 3
-#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request)
-/*
- * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The
- * scheduler is invoked to see if we can give the cpu to someone else, and
- * get back to the yielding unit in a while.
- */
-#define __CSFLAG_unit_yield 4
-#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield)
-/*
- * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard
- * affinity set to one and only 1 cpu (and, hence, can only run there).
- */
-#define __CSFLAG_pinned 5
-#define CSFLAG_pinned (1U<<__CSFLAG_pinned)
-
-static unsigned int __read_mostly opt_migrate_resist = 500;
-integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
-
-/*
- * Load tracking and load balancing
- *
- * Load history of runqueues and units is accounted for by using an
- * exponential weighted moving average algorithm. However, instead of using
- * fractions, we shift everything to the left by the number of bits we want to
- * use for representing the fractional part (Q-format).
- *
- * We may also want to reduce the precision of time accounting, to
- * accommodate 'longer windows'. So, if that is the case, we just need to
- * shift all time samples to the right.
- *
- * The details of the formulas used for load tracking are explained close to
- * update_runq_load(). Let's just say here that, with full nanosecond time
- * granularity, a 30 bits wide 'decaying window' is ~1 second long.
- *
- * We want to consider the following equations:
- *
- * avg[0] = load*P
- * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W
- *
- * where W is the length of the window, P the multiplier for transitioning into
- * Q-format fixed point arithmetic and load is the instantaneous load of a
- * runqueue, which basically is the number of runnable units there are on the
- * runqueue (for the meaning of the other terms, look at the doc comment to
- * update_runq_load()).
- *
- * So, again, with full nanosecond granularity, and 1 second window, we have:
- *
- * W = 2^30
- * P = 2^18
- *
- * The maximum possible value for the average load, which we want to store in
- * s_time_t type variables (i.e., we have 63 bits available) is load*P. This
- * means that, with P 18 bits wide, load can occupy 45 bits. This in turn
- * means we can have 2^45 units in each runqueue, before overflow occurs!
- *
- * However, it can happen that, at step j+1, if:
- *
- * avg[j] = load*P
- * delta = W
- *
- * then:
- *
- *   avg[j+1] = avg[j] + W*load*P/W - W*load*P/W
- *
- * So we must be able to deal with W*load*P. This means load can't be higher
- * than:
- *
- * 2^(63 - 30 - 18) = 2^15 = 32768
- *
- * So 32768 is the maximum number of units that we can have in a runqueue,
- * at any given time, and still not have problems with the load tracking
- * calculations... and this is more than fine.
- *
- * As a matter of fact, since we are using microseconds granularity, we have
- * W=2^20. So, still with 18 fractional bits and a 1 second long window, there
- * may be 2^25 = 33554432 units in a runq before we have to start thinking
- * about overflow.
- */
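-
-/*
- * Illustrative sketch, not part of the original file: one step of the
- * shifted-average update described above, with hypothetical names and with
- * W and P expressed as shifts. For delta >= W the history has fully decayed
- * and the average is simply load << P.
- */
-static inline s_time_t sketch_update_avgload(s_time_t avg, s_time_t load,
-                                             s_time_t delta,
-                                             unsigned int window_shift,
-                                             unsigned int precision_shift)
-{
-    if ( delta >= (1LL << window_shift) )
-        return load << precision_shift;
-
-    /* avg += delta*load*P/W - delta*avg/W */
-    return avg + ((delta * (load << precision_shift)) >> window_shift)
-               - ((delta * avg) >> window_shift);
-}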
-
-/* If >0, decreases the granularity of time samples used for load tracking. */
-#define LOADAVG_GRANULARITY_SHIFT (10)
-/* Time window during which we still give value to previous load history. */
-#define LOADAVG_WINDOW_SHIFT (30)
-/* 18 bits by default (and not less than 4) for decimals. */
-#define LOADAVG_PRECISION_SHIFT (18)
-#define LOADAVG_PRECISION_SHIFT_MIN (4)
-
-/*
- * Both the length of the window and the number of fractional bits can be
- * decided with boot parameters.
- *
- * The length of the window is always expressed in nanoseconds. The actual
- * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT.
- */
-static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
-integer_param("credit2_load_window_shift", opt_load_window_shift);
-static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT;
-integer_param("credit2_load_precision_shift", opt_load_precision_shift);
-
-static int __read_mostly opt_underload_balance_tolerance = 0;
-integer_param("credit2_balance_under", opt_underload_balance_tolerance);
-static int __read_mostly opt_overload_balance_tolerance = -3;
-integer_param("credit2_balance_over", opt_overload_balance_tolerance);
-/*
- * Domains subject to a cap receive a replenishment of their runtime budget
- * once every opt_cap_period interval. Default is 10 ms. The amount of budget
- * they receive depends on their cap. For instance, a domain with a 50% cap
- * will receive 50% of 10 ms, so 5 ms.
- */
-static unsigned int __read_mostly opt_cap_period = 10; /* ms */
-integer_param("credit2_cap_period_ms", opt_cap_period);
-
-/*
- * Runqueue organization.
- *
- * The various cpus are to be assigned each one to a runqueue, and we
- * want that to happen basing on topology. At the moment, it is possible
- * to choose to arrange runqueues to be:
- *
- * - per-cpu: meaning that there will be one runqueue per logical cpu. This
- *   will happen if the opt_runqueue parameter is set to 'cpu';
- *
- * - per-core: meaning that there will be one runqueue per each physical
- * core of the host. This will happen if the opt_runqueue
- * parameter is set to 'core';
- *
- * - per-socket: meaning that there will be one runqueue per each physical
- * socket (AKA package, which often, but not always, also
- * matches a NUMA node) of the host; This will happen if
- * the opt_runqueue parameter is set to 'socket';
- *
- * - per-node: meaning that there will be one runqueue per each physical
- * NUMA node of the host. This will happen if the opt_runqueue
- * parameter is set to 'node';
- *
- * - global: meaning that there will be only one runqueue to which all the
- * (logical) processors of the host belong. This will happen if
- * the opt_runqueue parameter is set to 'all'.
- *
- * Depending on the value of opt_runqueue, therefore, cpus that are part of
- * either the same physical core, the same physical socket, the same NUMA
- * node, or just all of them, will be put together to form runqueues.
- */
-#define OPT_RUNQUEUE_CPU 0
-#define OPT_RUNQUEUE_CORE 1
-#define OPT_RUNQUEUE_SOCKET 2
-#define OPT_RUNQUEUE_NODE 3
-#define OPT_RUNQUEUE_ALL 4
-static const char *const opt_runqueue_str[] = {
- [OPT_RUNQUEUE_CPU] = "cpu",
- [OPT_RUNQUEUE_CORE] = "core",
- [OPT_RUNQUEUE_SOCKET] = "socket",
- [OPT_RUNQUEUE_NODE] = "node",
- [OPT_RUNQUEUE_ALL] = "all"
-};
-static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET;
-
-static int __init parse_credit2_runqueue(const char *s)
-{
- unsigned int i;
-
- for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ )
- {
- if ( !strcmp(s, opt_runqueue_str[i]) )
- {
- opt_runqueue = i;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-custom_param("credit2_runqueue", parse_credit2_runqueue);
-
-/*
- * Per-runqueue data
- */
-struct csched2_runqueue_data {
- spinlock_t lock; /* Lock for this runqueue */
-
- struct list_head runq; /* Ordered list of runnable vms */
- unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */
- int id; /* ID of this runqueue (-1 if invalid) */
-
- int load; /* Instantaneous load (num of non-idle units) */
- s_time_t load_last_update; /* Last time average was updated */
- s_time_t avgload; /* Decaying queue load */
- s_time_t b_avgload; /* Decaying queue load modified by balancing */
-
- cpumask_t active, /* CPUs enabled for this runqueue */
- smt_idle, /* Fully idle-and-untickled cores (see below) */
- tickled, /* Have been asked to go through schedule */
- idle; /* Currently idle pcpus */
-
- struct list_head svc; /* List of all units assigned to the runqueue */
- unsigned int max_weight; /* Max weight of the units in this runqueue */
- unsigned int pick_bias; /* Last picked pcpu. Start from it next time */
-};
-
-/*
- * System-wide private data
- */
-struct csched2_private {
- rwlock_t lock; /* Private scheduler lock */
-
- unsigned int load_precision_shift; /* Precision of load calculations */
- unsigned int load_window_shift; /* Length of load decaying window */
- unsigned int ratelimit_us; /* Rate limiting for this scheduler */
-
- cpumask_t active_queues; /* Runqueues with (maybe) active cpus */
- struct csched2_runqueue_data *rqd; /* Data of the various runqueues */
-
- cpumask_t initialized; /* CPUs part of this scheduler */
- struct list_head sdom; /* List of domains (for debug key) */
-};
-
-/*
- * Physical CPU
- */
-struct csched2_pcpu {
- cpumask_t sibling_mask; /* Siblings in the same runqueue */
- int runq_id;
-};
-
-/*
- * Schedule Unit
- */
-struct csched2_unit {
- struct csched2_dom *sdom; /* Up-pointer to domain */
- struct sched_unit *unit; /* Up-pointer, to schedule unit */
- struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */
-
- int credit; /* Current amount of credit */
- unsigned int weight; /* Weight of this unit */
- unsigned int residual; /* Remainder of div(max_weight/weight) */
- unsigned flags; /* Status flags (16 bits would be ok, */
- /* but clear_bit() does not like that) */
- s_time_t budget; /* Current budget (if domain has a cap) */
- s_time_t budget_quota; /* Budget to which unit is entitled */
-
- s_time_t start_time; /* Time we were scheduled (for credit) */
-
- /* Individual contribution to load */
- s_time_t load_last_update; /* Last time average was updated */
- s_time_t avgload; /* Decaying queue load */
-
- struct list_head runq_elem; /* On the runqueue (rqd->runq) */
- struct list_head parked_elem; /* On the parked_units list */
- struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */
- struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */
- int tickled_cpu; /* Cpu that will pick us (-1 if none) */
-};
-
-/*
- * Domain
- */
-struct csched2_dom {
- struct domain *dom; /* Up-pointer to domain */
-
- spinlock_t budget_lock; /* Serialized budget calculations */
- s_time_t tot_budget; /* Total amount of budget */
- s_time_t budget; /* Currently available budget */
-
- struct timer repl_timer; /* Timer for periodic replenishment of budget */
- s_time_t next_repl; /* Time at which next replenishment occurs */
- struct list_head parked_units; /* List of CPUs waiting for budget */
-
- struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */
- uint16_t weight; /* User specified weight */
- uint16_t cap; /* User specified cap */
- uint16_t nr_units; /* Number of units of this domain */
-};
-
-/*
- * Accessor helpers functions.
- */
-static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
-{
- return ops->sched_data;
-}
-
-static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
-{
- return get_sched_res(cpu)->sched_priv;
-}
-
-static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit)
-{
- return unit->priv;
-}
-
-static inline struct csched2_dom *csched2_dom(const struct domain *d)
-{
- return d->sched_priv;
-}
-
-/* CPU to runq_id macro */
-static inline int c2r(unsigned int cpu)
-{
- return csched2_pcpu(cpu)->runq_id;
-}
-
-/* CPU to runqueue struct macro */
-static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
- unsigned int cpu)
-{
- return &csched2_priv(ops)->rqd[c2r(cpu)];
-}
-
-/* Does the domain of this unit have a cap? */
-static inline bool has_cap(const struct csched2_unit *svc)
-{
- return svc->budget != STIME_MAX;
-}
-
-/*
- * Hyperthreading (SMT) support.
- *
- * We use a special per-runq mask (smt_idle) and update it according to the
- * following logic:
- * - when _all_ the SMT siblings in a core are idle, all their corresponding
- * bits are set in the smt_idle mask;
- * - when even _just_one_ of the SMT siblings in a core is not idle, all the
- * bits corresponding to it and to all its siblings are clear in the
- * smt_idle mask.
- *
- * Once we have such a mask, it is easy to implement a policy that, either:
- * - uses fully idle cores first: it is enough to try to schedule the units
- * on pcpus from smt_idle mask first. This is what happens if
- * sched_smt_power_savings was not set at boot (default), and it maximizes
- * true parallelism, and hence performance;
- * - uses already busy cores first: it is enough to try to schedule the units
- * on pcpus that are idle, but are not in smt_idle. This is what happens if
- * sched_smt_power_savings is set at boot, and it allows as many cores as
- * possible to stay in low power states, minimizing power consumption.
- *
- * This logic is entirely implemented in runq_tickle(), and that is enough.
- * In fact, in this scheduler, placement of a unit on one of the pcpus of a
- * runq _always_ happens by means of tickling:
- * - when a unit wakes up, it calls csched2_unit_wake(), which calls
- * runq_tickle();
- * - when a migration is initiated in schedule.c, we call csched2_res_pick(),
- * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake().
- * csched2_res_pick() looks for the least loaded runq and returns just any
- * of its processors. Then, csched2_unit_migrate() just moves the unit to
- * the chosen runq, and it is again runq_tickle(), called by
- * csched2_unit_wake(), that actually decides what pcpu to use within the
- * chosen runq;
- * - when a migration is initiated in sched_credit2.c, by calling migrate()
- * directly, that again temporarily uses a random pcpu from the new runq,
- * and then calls runq_tickle() by itself.
- */
-
-/*
- * If all the siblings of cpu (including cpu itself) are both idle and
- * untickled, set all their bits in mask.
- *
- * NB that rqd->smt_idle is different than rqd->idle. rqd->idle
- * records pcpus that are merely idle (i.e., at the moment do not
- * have a unit running on them). But you have to manually filter out
- * which pcpus have been tickled in order to find cores that are not
- * going to be busy soon. Filtering out tickled cpus pairwise is a
- * lot of extra pain; so for rqd->smt_idle, we explicitly make it so that
- * the bits of a pcpu are set only if all the threads on its core are
- * both idle *and* untickled.
- *
- * This means changing the mask when either rqd->idle or rqd->tickled
- * changes.
- */
-static inline
-void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers,
- cpumask_t *mask)
-{
- const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask;
-
- if ( cpumask_subset(cpu_siblings, idlers) )
- cpumask_or(mask, mask, cpu_siblings);
-}
-
-/*
- * Clear the bits of all the siblings of cpu from mask (if necessary).
- */
-static inline
-void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask)
-{
- const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask;
-
- if ( cpumask_subset(cpu_siblings, mask) )
- cpumask_andnot(mask, mask, cpu_siblings);
-}
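
To make the invariant concrete, here is a tiny stand-alone model (plain unsigned bitmasks instead of cpumask_t, four cpus, two threads per core; everything invented for illustration) of when a core's bits may appear in smt_idle:

    #include <stdio.h>

    int main(void)
    {
        /* Four cpus, two threads per core: core A = {0,1}, core B = {2,3}. */
        unsigned int sibling_mask[4] = { 0x3, 0x3, 0xc, 0xc };
        unsigned int idlers = 0xb;   /* cpus 0, 1 and 3 are idle and untickled */
        unsigned int smt_idle = 0;
        int cpu;

        for ( cpu = 0; cpu < 4; cpu++ )
            if ( (sibling_mask[cpu] & idlers) == sibling_mask[cpu] )
                smt_idle |= sibling_mask[cpu];

        /* Only core A shows up: cpu 2 is busy, so cpu 3 is excluded too. */
        printf("smt_idle = %#x\n", smt_idle);   /* prints 0x3 */
        return 0;
    }
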
-
-/*
- * In csched2_res_pick(), it may not be possible to actually look at remote
- * runqueues (the trylock-s on their spinlocks can fail!). If that happens,
- * we pick, in order of decreasing preference:
- * 1) svc's current pcpu, if it is part of svc's soft affinity;
- * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity;
- * 3) svc's current pcpu, if it is part of svc's hard affinity;
- * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity;
- * 5) just one valid pcpu from svc's hard affinity
- *
- * Of course, 1 and 2 make sense only if svc has a soft affinity. Also
- * note that at least 5 is guaranteed to _always_ return at least one pcpu.
- */
-static int get_fallback_cpu(struct csched2_unit *svc)
-{
- struct sched_unit *unit = svc->unit;
- unsigned int bs;
-
- SCHED_STAT_CRANK(need_fallback_cpu);
-
- for_each_affinity_balance_step( bs )
- {
- int cpu = sched_unit_master(unit);
-
- if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
- continue;
-
- affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- cpupool_domain_master_cpumask(unit->domain));
-
- /*
- * This is case 1 or 3 (depending on bs): if the processor is (still)
- * in our affinity, go for it, for better cache locality.
- */
- if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
- return cpu;
-
- /*
- * This is case 2 or 4 (depending on bs): v->processor isn't there
- * any longer, check if we at least can stay in our current runq.
- */
- if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu),
- &svc->rqd->active)) )
- {
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &svc->rqd->active);
- return cpumask_first(cpumask_scratch_cpu(cpu));
- }
-
- /*
- * We may well pick any valid pcpu from our soft-affinity, outside
- * of our current runqueue, but we decide not to. In fact, changing
- * runqueue is slow, affects load distribution, and is a source of
- * overhead for the units running on the other runqueue (we need the
- * lock). So, better do that as a consequence of a well informed
- * decision (or if we really don't have any other chance, as we will,
- * at step 5, if we get to there).
- *
- * Also, being here, looking for a fallback, is an unfortunate and
- * infrequent event, while the decision of putting us in the runqueue
- * where we are was (likely) made taking all the relevant factors
- * into account. So let's not disrupt that, just for the sake of
- * soft-affinity, and let's wait here to be able to make (hopefully
- * soon) another similar well informed decision.
- */
- if ( bs == BALANCE_SOFT_AFFINITY )
- continue;
-
- /*
- * This is case 5: last stand, just one valid pcpu from our hard
- * affinity. It's guaranteed that there is at least one valid cpu,
- * and therefore we are sure that we return it, and never really
- * exit the loop.
- */
- ASSERT(bs == BALANCE_HARD_AFFINITY &&
- !cpumask_empty(cpumask_scratch_cpu(cpu)));
- cpu = cpumask_first(cpumask_scratch_cpu(cpu));
- if ( likely(cpu < nr_cpu_ids) )
- return cpu;
- }
- ASSERT_UNREACHABLE();
- /*
- * We can't be here. But if that somehow happens (in non-debug builds),
- * at least return something which is both online and in our hard-affinity.
- */
- return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit)));
-}
-
-/*
- * Time-to-credit, credit-to-time.
- *
- * We keep track of the "residual" time to make sure that frequent short
- * schedules still get accounted for in the end.
- *
- * FIXME: Do pre-calculated division?
- */
-static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time,
- struct csched2_unit *svc)
-{
- uint64_t val = time * rqd->max_weight + svc->residual;
-
- svc->residual = do_div(val, svc->weight);
- svc->credit -= val;
-}
-
-static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc)
-{
- return credit * svc->weight / rqd->max_weight;
-}
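
The point of the residual can be seen with a small stand-alone sketch (plain C, invented numbers, not part of this file): three short runs must burn exactly as much credit as one long run of the same total length.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t max_weight = 512, weight = 384, residual = 0;
        int64_t credit = 0;
        int i;

        /* Three short runs of 1000ns each, with the residual carried over. */
        for ( i = 0; i < 3; i++ )
        {
            uint64_t val = 1000 * max_weight + residual;

            residual = val % weight;
            credit -= (int64_t)(val / weight);
        }

        /* A single 3000ns run burns the very same amount (4000 units). */
        printf("%lld vs %llu\n", (long long)-credit,
               (unsigned long long)(3000 * max_weight / weight));
        return 0;
    }
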
-
-/*
- * Runqueue related code.
- */
-
-static inline int unit_on_runq(struct csched2_unit *svc)
-{
- return !list_empty(&svc->runq_elem);
-}
-
-static inline struct csched2_unit * runq_elem(struct list_head *elem)
-{
- return list_entry(elem, struct csched2_unit, runq_elem);
-}
-
-static void activate_runqueue(struct csched2_private *prv, int rqi)
-{
- struct csched2_runqueue_data *rqd;
-
- rqd = prv->rqd + rqi;
-
- BUG_ON(!cpumask_empty(&rqd->active));
-
- rqd->max_weight = 1;
- rqd->id = rqi;
- INIT_LIST_HEAD(&rqd->svc);
- INIT_LIST_HEAD(&rqd->runq);
- spin_lock_init(&rqd->lock);
-
- __cpumask_set_cpu(rqi, &prv->active_queues);
-}
-
-static void deactivate_runqueue(struct csched2_private *prv, int rqi)
-{
- struct csched2_runqueue_data *rqd;
-
- rqd = prv->rqd + rqi;
-
- BUG_ON(!cpumask_empty(&rqd->active));
-
- rqd->id = -1;
-
- __cpumask_clear_cpu(rqi, &prv->active_queues);
-}
-
-static inline bool same_node(unsigned int cpua, unsigned int cpub)
-{
- return cpu_to_node(cpua) == cpu_to_node(cpub);
-}
-
-static inline bool same_socket(unsigned int cpua, unsigned int cpub)
-{
- return cpu_to_socket(cpua) == cpu_to_socket(cpub);
-}
-
-static inline bool same_core(unsigned int cpua, unsigned int cpub)
-{
- return same_socket(cpua, cpub) &&
- cpu_to_core(cpua) == cpu_to_core(cpub);
-}
-
-static unsigned int
-cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu)
-{
- struct csched2_runqueue_data *rqd;
- unsigned int rqi;
-
- for ( rqi = 0; rqi < nr_cpu_ids; rqi++ )
- {
- unsigned int peer_cpu;
-
- /*
- * As soon as we come across an uninitialized runqueue, use it.
- * In fact, either:
- * - we are initializing the first cpu, and we assign it to
- * runqueue 0. This is handy, especially if we are dealing
- * with the boot cpu (if credit2 is the default scheduler),
- * as we would not be able to use cpu_to_socket() and similar
- * helpers anyway (the result of which is not reliable yet);
- * - we have gone through all the active runqueues, and have not
- * found anyone whose cpus' topology matches the one we are
- * dealing with, so activating a new runqueue is what we want.
- */
- if ( prv->rqd[rqi].id == -1 )
- break;
-
- rqd = prv->rqd + rqi;
- BUG_ON(cpumask_empty(&rqd->active));
-
- peer_cpu = cpumask_first(&rqd->active);
- BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID ||
- cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID);
-
- if ( opt_runqueue == OPT_RUNQUEUE_CPU )
- continue;
- if ( opt_runqueue == OPT_RUNQUEUE_ALL ||
- (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) ||
- (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) ||
- (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) )
- break;
- }
-
- /* We really expect to be able to assign each cpu to a runqueue. */
- BUG_ON(rqi >= nr_cpu_ids);
-
- return rqi;
-}
-
-/* Find the domain with the highest weight. */
-static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
- int old_weight)
-{
- /* Try to avoid brute-force search:
- * - If new_weight is larger, max_weight <- new_weight
- * - If old_weight != max_weight, someone else is still max_weight
- * (No action required)
- * - If old_weight == max_weight, brute-force search for max weight
- */
- if ( new_weight > rqd->max_weight )
- {
- rqd->max_weight = new_weight;
- SCHED_STAT_CRANK(upd_max_weight_quick);
- }
- else if ( old_weight == rqd->max_weight )
- {
- struct list_head *iter;
- int max_weight = 1;
-
- list_for_each( iter, &rqd->svc )
- {
- struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem);
-
- if ( svc->weight > max_weight )
- max_weight = svc->weight;
- }
-
- rqd->max_weight = max_weight;
- SCHED_STAT_CRANK(upd_max_weight_full);
- }
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned rqi:16, max_weight:16;
- } d;
- d.rqi = rqd->id;
- d.max_weight = rqd->max_weight;
- __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-}
-
-/* Add and remove from runqueue assignment (not active run queue) */
-static void
-_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd)
-{
-
- svc->rqd = rqd;
- list_add_tail(&svc->rqd_elem, &svc->rqd->svc);
-
- update_max_weight(svc->rqd, svc->weight, 0);
-
- /* Expected new load based on adding this unit */
- rqd->b_avgload += svc->avgload;
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned rqi:16;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.rqi = rqd->id;
- __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
-}
-
-static void
-runq_assign(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit *svc = unit->priv;
-
- ASSERT(svc->rqd == NULL);
-
- _runq_assign(svc, c2rqd(ops, sched_unit_master(unit)));
-}
-
-static void
-_runq_deassign(struct csched2_unit *svc)
-{
- struct csched2_runqueue_data *rqd = svc->rqd;
-
- ASSERT(!unit_on_runq(svc));
- ASSERT(!(svc->flags & CSFLAG_scheduled));
-
- list_del_init(&svc->rqd_elem);
- update_max_weight(rqd, 0, svc->weight);
-
- /* Expected new load based on removing this unit */
- rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
-
- svc->rqd = NULL;
-}
-
-static void
-runq_deassign(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit *svc = unit->priv;
-
- ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
-
- _runq_deassign(svc);
-}
-
-/*
- * Track the runq load by gathering instantaneous load samples, and using
- * exponentially weighted moving average (EWMA) for the 'decaying'.
- *
- * We consider a window of length W=2^(prv->load_window_shift) nsecs
- * (which takes LOADAVG_GRANULARITY_SHIFT into account).
- *
- * If load is the instantaneous load, the formula for EWMA looks as follows,
- * for the i-eth sample:
- *
- * avg[i] = a*load + (1 - a)*avg[i-1]
- *
- * where avg[i] is the new value of the average load, avg[i-1] is the value
- * of the average load calculated so far, and a is a coefficient less or
- * equal to 1.
- *
- * So, for us, it becomes:
- *
- * avgload = a*load + (1 - a)*avgload
- *
- * For determining a, we consider _when_ we are doing the load update, wrt
- * the length of the window. We define delta as follows:
- *
- * delta = t - load_last_update
- *
- * where t is current time (i.e., time at which we are both sampling and
- * updating the load average) and load_last_update is the last time we did
- * that.
- *
- * There are two possible situations:
- *
- * a) delta <= W
- * this means that, during the last window of length W, the runqueue load
- * was avgload for (W - delta) time, and load for delta time:
- *
- * |----------- W ---------|
- * | |
- * | load_last_update t
- * -------------------------|---------|---
- * | | |
- * \__W - delta__/\_delta__/
- * | | |
- * |___avgload___|__load___|
- *
- * So, what about using delta/W as our smoothing coefficient a. If we do,
- * here's what happens:
- *
- * a = delta / W
- * 1 - a = 1 - (delta / W) = (W - delta) / W
- *
- * Which matches the above description of what happened in the last
- * window of length W.
- *
- * Note that this also means that the weight that we assign to both the
- * latest load sample, and to previous history, varies at each update.
- * The longer the latest load sample has been in effect within the last
- * window, the more it weighs (and the less the previous history
- * weighs).
- *
- * This is some sort of extension of plain EWMA to fit even better to our
- * use case.
- *
- * b) delta > W
- * this means more than a full window has passed since the last update:
- *
- * |----------- W ---------|
- * | |
- * load_last_update t
- * ----|------------------------------|---
- * | |
- * \_________________delta________/
- *
- * Basically, it means the last load sample has been in effect for more
- * than W time, and hence we should just use it, and forget everything
- * before that.
- *
- * This can be seen as a 'reset condition', occurring when, for whatever
- * reason, load has not been updated for longer than we expected. (It is
- * also how avgload is assigned its first value.)
- *
- * The formula for avgload then becomes:
- *
- * avgload = (delta/W)*load + (W - delta)*avgload/W
- * avgload = delta*load/W + W*avgload/W - delta*avgload/W
- * avgload = avgload + delta*load/W - delta*avgload/W
- *
- * So, final form is:
- *
- * avgload_0 = load
- * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W
- *
- * As a confirmation, let's look at the extremes, when delta is 0 (i.e.,
- * what happens if we update the load twice, at the same time instant?):
- *
- * avgload = avgload + 0*load/W - 0*avgload/W
- * avgload = avgload
- *
- * and when delta is W (i.e., what happens if we update at the last
- * possible instant before the window 'expires'?):
- *
- * avgload = avgload + W*load/W - W*avgload/W
- * avgload = avgload + load - avgload
- * avgload = load
- *
- * Which, in both cases, is what we expect.
- */
-static void
-update_runq_load(const struct scheduler *ops,
- struct csched2_runqueue_data *rqd, int change, s_time_t now)
-{
- struct csched2_private *prv = csched2_priv(ops);
- s_time_t delta, load = rqd->load;
- unsigned int P, W;
-
- W = prv->load_window_shift;
- P = prv->load_precision_shift;
- now >>= LOADAVG_GRANULARITY_SHIFT;
-
- /*
- * To avoid using fractions, we shift to left by load_precision_shift,
- * and use the least significant load_precision_shift bits as fractional part.
- * Looking back at the formula we want to use, we now have:
- *
- * P = 2^(load_precision_shift)
- * P*avgload = P*(avgload + delta*load/W - delta*avgload/W)
- * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W
- *
- * And if we are ok storing and using P*avgload, we can rewrite this as:
- *
- * P*avgload = avgload'
- * avgload' = avgload' + delta*P*load/W - delta*avgload'/W
- *
- * Coupled with, of course:
- *
- * avgload_0' = P*load
- */
-
- if ( rqd->load_last_update + (1ULL << W) < now )
- {
- rqd->avgload = load << P;
- rqd->b_avgload = load << P;
- }
- else
- {
- delta = now - rqd->load_last_update;
- if ( unlikely(delta < 0) )
- {
- d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
- __func__, now, rqd->load_last_update);
- delta = 0;
- }
-
- /*
- * Note that, if we were to enforce (or check) some relationship
- * between P and W, we may save one shift. E.g., if we are sure
- * that P < W, we could write:
- *
- * (delta * (load << P)) >> W
- *
- * as:
- *
- * (delta * load) >> (W - P)
- */
- rqd->avgload = rqd->avgload +
- ((delta * (load << P)) >> W) -
- ((delta * rqd->avgload) >> W);
- rqd->b_avgload = rqd->b_avgload +
- ((delta * (load << P)) >> W) -
- ((delta * rqd->b_avgload) >> W);
- }
- rqd->load += change;
- rqd->load_last_update = now;
-
- /* Overflow, capable of making the load look negative, must not occur. */
- ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint64_t rq_avgload, b_avgload;
- unsigned rq_load:16, rq_id:8, shift:8;
- } d;
- d.rq_id = rqd->id;
- d.rq_load = rqd->load;
- d.rq_avgload = rqd->avgload;
- d.b_avgload = rqd->b_avgload;
- d.shift = P;
- __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-}
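
The fixed-point arithmetic above can be exercised in isolation. A minimal sketch (stand-alone C, all values invented; P and W just mirror the default shifts) showing that updating halfway through a window lands the average halfway between history and the current sample:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned int P = 18, W = 30;     /* precision / window shifts  */
        int64_t avgload = 3LL << P;            /* history: ~3 runnable units */
        int64_t load = 5;                      /* instantaneous load now     */
        int64_t delta = 1LL << (W - 1);        /* half a window has passed   */

        avgload = avgload + ((delta * (load << P)) >> W)
                          - ((delta * avgload) >> W);

        /* Halfway between 3.0 and 5.0 in P-bit fixed point, i.e. 4.0. */
        printf("%lld (expected %lld)\n",
               (long long)avgload, (long long)(4LL << P));
        return 0;
    }
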
-
-static void
-update_svc_load(const struct scheduler *ops,
- struct csched2_unit *svc, int change, s_time_t now)
-{
- struct csched2_private *prv = csched2_priv(ops);
- s_time_t delta, unit_load;
- unsigned int P, W;
-
- if ( change == -1 )
- unit_load = 1;
- else if ( change == 1 )
- unit_load = 0;
- else
- unit_load = unit_runnable(svc->unit);
-
- W = prv->load_window_shift;
- P = prv->load_precision_shift;
- now >>= LOADAVG_GRANULARITY_SHIFT;
-
- if ( svc->load_last_update + (1ULL << W) < now )
- {
- svc->avgload = unit_load << P;
- }
- else
- {
- delta = now - svc->load_last_update;
- if ( unlikely(delta < 0) )
- {
- d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
- __func__, now, svc->load_last_update);
- delta = 0;
- }
-
- svc->avgload = svc->avgload +
- ((delta * (unit_load << P)) >> W) -
- ((delta * svc->avgload) >> W);
- }
- svc->load_last_update = now;
-
- /* Overflow, capable of making the load look negative, must not occur. */
- ASSERT(svc->avgload >= 0);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint64_t v_avgload;
- unsigned unit:16, dom:16;
- unsigned shift;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.v_avgload = svc->avgload;
- d.shift = P;
- __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-}
-
-static void
-update_load(const struct scheduler *ops,
- struct csched2_runqueue_data *rqd,
- struct csched2_unit *svc, int change, s_time_t now)
-{
- trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL);
-
- update_runq_load(ops, rqd, change, now);
- if ( svc )
- update_svc_load(ops, svc, change, now);
-}
-
-static void
-runq_insert(const struct scheduler *ops, struct csched2_unit *svc)
-{
- struct list_head *iter;
- unsigned int cpu = sched_unit_master(svc->unit);
- struct list_head * runq = &c2rqd(ops, cpu)->runq;
- int pos = 0;
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- ASSERT(!unit_on_runq(svc));
- ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit)));
-
- ASSERT(&svc->rqd->runq == runq);
- ASSERT(!is_idle_unit(svc->unit));
- ASSERT(!svc->unit->is_running);
- ASSERT(!(svc->flags & CSFLAG_scheduled));
-
- list_for_each( iter, runq )
- {
- struct csched2_unit * iter_svc = runq_elem(iter);
-
- if ( svc->credit > iter_svc->credit )
- break;
-
- pos++;
- }
- list_add_tail(&svc->runq_elem, iter);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned pos;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.pos = pos;
- __trace_var(TRC_CSCHED2_RUNQ_POS, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-}
-
-static inline void runq_remove(struct csched2_unit *svc)
-{
- ASSERT(unit_on_runq(svc));
- list_del_init(&svc->runq_elem);
-}
-
-void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t);
-
-static inline void
-tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
-{
- __cpumask_set_cpu(cpu, &rqd->tickled);
- smt_idle_mask_clear(cpu, &rqd->smt_idle);
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
-}
-
-/*
- * What we want to know is whether svc, which we assume to be running on some
- * pcpu, can be interrupted and preempted (which, so far, basically means
- * whether or not it has already run for more than the ratelimit, to which we
- * apply some tolerance).
- */
-static inline bool is_preemptable(const struct csched2_unit *svc,
- s_time_t now, s_time_t ratelimit)
-{
- if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE )
- return true;
-
- ASSERT(svc->unit->is_running);
- return now - svc->unit->state_entry_time >
- ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
-}
-
-/*
- * Score to preempt the target cpu. Return a negative number if the
- * credit isn't high enough; if it is, favor a preemption on cpu in
- * this order:
- * - cpu is in new's soft-affinity, not in cur's soft-affinity
- * (2 x CSCHED2_CREDIT_INIT score bonus);
- * - cpu is in new's soft-affinity and cur's soft-affinity, or
- * cpu is not in new's soft-affinity, nor in cur's soft-affinity
- * (1x CSCHED2_CREDIT_INIT score bonus);
- * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity
- * (no bonus).
- *
- * Within the same class, the highest difference of credit.
- */
-static s_time_t tickle_score(const struct scheduler *ops, s_time_t now,
- struct csched2_unit *new, unsigned int cpu)
-{
- struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
- struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu));
- struct csched2_private *prv = csched2_priv(ops);
- s_time_t score;
-
- /*
- * We are dealing with cpus that are marked non-idle (i.e., that are not
- * in rqd->idle). However, some of them may be running their idle unit,
- * if taking care of tasklets. In that case, we want to leave it alone.
- */
- if ( unlikely(is_idle_unit(cur->unit) ||
- !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) )
- return -1;
-
- burn_credits(rqd, cur, now);
-
- score = new->credit - cur->credit;
- if ( sched_unit_master(new->unit) != cpu )
- score -= CSCHED2_MIGRATE_RESIST;
-
- /*
- * If score is positive, it means new has enough credits (i.e.,
- * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST).
- *
- * Let's compute the bonuses for soft-affinities.
- */
- if ( score > 0 )
- {
- if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) )
- score += CSCHED2_CREDIT_INIT;
-
- if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) )
- score += CSCHED2_CREDIT_INIT;
- }
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- int credit, score;
- } d;
- d.dom = cur->unit->domain->domain_id;
- d.unit = cur->unit->unit_id;
- d.credit = cur->credit;
- d.score = score;
- __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- return score;
-}
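
A throwaway illustration of how the two bonuses partition candidate cpus into the classes described above (the constants are stand-ins, not the real CSCHED2_* values):

    #include <stdbool.h>
    #include <stdio.h>

    #define FAKE_CREDIT_INIT     10000000
    #define FAKE_MIGRATE_RESIST  500

    static int score(int new_credit, int cur_credit, bool same_cpu,
                     bool in_new_soft, bool in_cur_soft)
    {
        int s = new_credit - cur_credit;

        if ( !same_cpu )
            s -= FAKE_MIGRATE_RESIST;
        if ( s > 0 )
        {
            if ( in_new_soft )
                s += FAKE_CREDIT_INIT;
            if ( !in_cur_soft )
                s += FAKE_CREDIT_INIT;
        }
        return s;
    }

    int main(void)
    {
        /* Same credit gap everywhere; only the affinity class differs. */
        printf("%d\n", score(2000, 1000, false, true,  false)); /* 2 bonuses */
        printf("%d\n", score(2000, 1000, false, true,  true));  /* 1 bonus   */
        printf("%d\n", score(2000, 1000, false, false, true));  /* no bonus  */
        return 0;
    }
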
-
-/*
- * Check what processor it is best to 'wake', for picking up a unit that has
- * just been put (back) in the runqueue. Logic is as follows:
- * 1. if there are idle processors in the runq, wake one of them;
- * 2. if there aren't idle processors, check the one where the unit was
- * running before to see if we can preempt what's running there now
- * (and hence doing just one migration);
- * 3. last stand: check all processors and see if the unit has the right
- * to preempt any of the other units running on them (this requires
- * two migrations, and that's indeed why it is left as the last stand).
- *
- * Note that when we say 'idle processors' what we really mean is (pretty
- * much always) both _idle_ and _not_already_tickled_. In fact, if a
- * processor has been tickled, it will run csched2_schedule() shortly, and
- * pick up some work, so it would be wrong to consider it idle.
- */
-static void
-runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now)
-{
- int i, ipid = -1;
- s_time_t max = 0;
- struct sched_unit *unit = new->unit;
- unsigned int bs, cpu = sched_unit_master(unit);
- struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
- cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
- cpumask_t mask;
-
- ASSERT(new->rqd == rqd);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned processor;
- int credit;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.processor = cpu;
- d.credit = new->credit;
- __trace_var(TRC_CSCHED2_TICKLE_NEW, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- /*
- * Exclusive pinning is when a unit has hard-affinity with only one
- * cpu, and there is no other unit that has hard-affinity with that
- * same cpu. This is infrequent, but if it happens, is for achieving
- * the most possible determinism, and least possible overhead for
- * the units in question.
- *
- * Try to identify the vast majority of these situations, and deal
- * with them quickly.
- */
- if ( unlikely((new->flags & CSFLAG_pinned) &&
- cpumask_test_cpu(cpu, &rqd->idle) &&
- !cpumask_test_cpu(cpu, &rqd->tickled)) )
- {
- ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu);
- SCHED_STAT_CRANK(tickled_idle_cpu_excl);
- ipid = cpu;
- goto tickle;
- }
-
- for_each_affinity_balance_step( bs )
- {
- /* Just skip first step, if we don't have a soft affinity */
- if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
- continue;
-
- affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
-
- /*
- * First of all, consider idle cpus, checking if we can just
- * re-use the pcpu where we were running before.
- *
- * If there are cores where all the siblings are idle, consider
- * them first, honoring whatever the spreading-vs-consolidation
- * SMT policy wants us to do.
- */
- if ( unlikely(sched_smt_power_savings) )
- {
- cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle);
- cpumask_and(&mask, &mask, online);
- }
- else
- cpumask_and(&mask, &rqd->smt_idle, online);
- cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
- i = cpumask_test_or_cycle(cpu, &mask);
- if ( i < nr_cpu_ids )
- {
- SCHED_STAT_CRANK(tickled_idle_cpu);
- ipid = i;
- goto tickle;
- }
-
- /*
- * If there are no fully idle cores, check all idlers, after
- * having filtered out pcpus that have been tickled but haven't
- * gone through the scheduler yet.
- */
- cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online);
- cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
- i = cpumask_test_or_cycle(cpu, &mask);
- if ( i < nr_cpu_ids )
- {
- SCHED_STAT_CRANK(tickled_idle_cpu);
- ipid = i;
- goto tickle;
- }
- }
-
- /*
- * Note that, if we are here, it means we have done the hard-affinity
- * balancing step of the loop, and hence what we have in cpumask_scratch
- * is what we put there for last, i.e., new's unit_hard_affinity & online
- * which is exactly what we need for the next part of the function.
- */
-
- /*
- * Otherwise, look for the non-idle (and non-tickled) processors with
- * the lowest credit, among the ones new is allowed to run on. Again,
- * the cpu where it was running would be the best candidate.
- *
- * For deciding which cpu to tickle, we use tickle_score(), which will
- * factor in both new's soft-affinity, and the soft-affinity of the
- * unit running on each cpu that we consider.
- */
- cpumask_andnot(&mask, &rqd->active, &rqd->idle);
- cpumask_andnot(&mask, &mask, &rqd->tickled);
- cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
- if ( __cpumask_test_and_clear_cpu(cpu, &mask) )
- {
- s_time_t score = tickle_score(ops, now, new, cpu);
-
- if ( score > max )
- {
- max = score;
- ipid = cpu;
-
- /* If this is in new's soft affinity, just take it */
- if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) )
- {
- SCHED_STAT_CRANK(tickled_busy_cpu);
- goto tickle;
- }
- }
- }
-
- for_each_cpu(i, &mask)
- {
- s_time_t score;
-
- /* Already looked at this one above */
- ASSERT(i != cpu);
-
- score = tickle_score(ops, now, new, i);
-
- if ( score > max )
- {
- max = score;
- ipid = i;
- }
- }
-
- if ( ipid == -1 )
- {
- SCHED_STAT_CRANK(tickled_no_cpu);
- return;
- }
-
- ASSERT(!is_idle_unit(curr_on_cpu(ipid)));
- SCHED_STAT_CRANK(tickled_busy_cpu);
- tickle:
- BUG_ON(ipid == -1);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned cpu:16, pad:16;
- } d;
- d.cpu = ipid; d.pad = 0;
- __trace_var(TRC_CSCHED2_TICKLE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- tickle_cpu(ipid, rqd);
-
- if ( unlikely(new->tickled_cpu != -1) )
- SCHED_STAT_CRANK(tickled_cpu_overwritten);
- new->tickled_cpu = ipid;
-}
-
-/*
- * Credit-related code
- */
-static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
- struct csched2_unit *snext)
-{
- struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
- struct list_head *iter;
- int m;
-
- /*
- * Under normal circumstances, snext->credit should never be less
- * than -CSCHED2_MIN_TIMER. However, under some circumstances, a
- * unit with low credits may be allowed to run long enough that
- * its credits are actually less than -CSCHED2_CREDIT_INIT.
- * (Instances have been observed, for example, where a unit with
- * 200us of credit was allowed to run for 11ms, giving it -10.8ms
- * of credit. Thus it was still negative even after the reset.)
- *
- * If this is the case for snext, we simply want to keep moving
- * everyone up until it is in the black again. This is fair because
- * none of the other units want to run at the moment.
- *
- * Rather than looping, however, we just calculate a multiplier,
- * avoiding an integer division and multiplication in the common
- * case.
- */
- m = 1;
- if ( snext->credit < -CSCHED2_CREDIT_INIT )
- m += (-snext->credit) / CSCHED2_CREDIT_INIT;
-
- list_for_each( iter, &rqd->svc )
- {
- unsigned int svc_cpu;
- struct csched2_unit * svc;
- int start_credit;
-
- svc = list_entry(iter, struct csched2_unit, rqd_elem);
- svc_cpu = sched_unit_master(svc->unit);
-
- ASSERT(!is_idle_unit(svc->unit));
- ASSERT(svc->rqd == rqd);
-
- /*
- * If svc is running, it is our responsibility to make sure, here,
- * that the credit it has spent so far gets accounted.
- */
- if ( svc->unit == curr_on_cpu(svc_cpu) )
- {
- burn_credits(rqd, svc, now);
- /*
- * And, similarly, in case it has run out of budget, as a
- * consequence of this round of accounting, we also must inform
- * its pCPU that it's time to park it, and pick up someone else.
- */
- if ( unlikely(svc->budget <= 0) )
- tickle_cpu(svc_cpu, rqd);
- }
-
- start_credit = svc->credit;
-
- /*
- * Add INIT * m, avoiding integer multiplication in the common case.
- */
- if ( likely(m == 1) )
- svc->credit += CSCHED2_CREDIT_INIT;
- else
- svc->credit += m * CSCHED2_CREDIT_INIT;
-
- /* "Clip" credits to max carryover */
- if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX )
- svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX;
-
- svc->start_time = now;
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- int credit_start, credit_end;
- unsigned multiplier;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.credit_start = start_credit;
- d.credit_end = svc->credit;
- d.multiplier = m;
- __trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
- }
-
- SCHED_STAT_CRANK(credit_reset);
-
- /* No need to re-sort the runqueue, as everyone's order should be the same. */
-}
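
Worked example of the multiplier, as a stand-alone sketch (the constant below is a made-up stand-in for CSCHED2_CREDIT_INIT, whose real value is defined elsewhere):

    #include <stdio.h>

    int main(void)
    {
        const int credit_init = 10000000;   /* stand-in for CSCHED2_CREDIT_INIT */
        int snext_credit = -23000000;       /* badly overdrawn                  */
        int m = 1;

        if ( snext_credit < -credit_init )
            m += -snext_credit / credit_init;       /* m ends up as 3 */

        /* One reset of m * credit_init puts snext back in the black. */
        printf("m = %d, credit after reset = %d\n",
               m, snext_credit + m * credit_init);
        return 0;
    }
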
-
-void burn_credits(struct csched2_runqueue_data *rqd,
- struct csched2_unit *svc, s_time_t now)
-{
- s_time_t delta;
-
- ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit))));
-
- if ( unlikely(is_idle_unit(svc->unit)) )
- {
- ASSERT(svc->credit == CSCHED2_IDLE_CREDIT);
- return;
- }
-
- delta = now - svc->start_time;
-
- if ( unlikely(delta <= 0) )
- {
- if ( unlikely(delta < 0) )
- d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
- " start_time %"PRI_stime"\n", __func__, now,
- svc->start_time);
- goto out;
- }
-
- SCHED_STAT_CRANK(burn_credits_t2c);
- t2c_update(rqd, delta, svc);
-
- if ( has_cap(svc) )
- svc->budget -= delta;
-
- svc->start_time = now;
-
- out:
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- int credit, budget;
- int delta;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.credit = svc->credit;
- d.budget = has_cap(svc) ? svc->budget : INT_MIN;
- d.delta = delta;
- __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-}
-
-/*
- * Budget-related code.
- */
-
-static void park_unit(struct csched2_unit *svc)
-{
- struct sched_unit *unit = svc->unit;
-
- ASSERT(spin_is_locked(&svc->sdom->budget_lock));
-
- /*
- * It was impossible to find budget for this unit, so it has to be
- * "parked". This implies it is not runnable, so we mark it as such in
- * its pause_flags. If the unit is currently scheduled (which means we
- * are here after being called from within csched_schedule()), flagging
- * is enough, as we'll choose someone else, and then context_saved()
- * will take care of updating the load properly.
- *
- * If, OTOH, the unit is sitting in the runqueue (which means we are here
- * after being called from within runq_candidate()), we must go all the
- * way down to taking it out of there, and updating the load accordingly.
- *
- * In both cases, we also add it to the list of parked units of the domain.
- */
- sched_set_pause_flags(unit, _VPF_parked);
- if ( unit_on_runq(svc) )
- {
- runq_remove(svc);
- update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
- }
- list_add(&svc->parked_elem, &svc->sdom->parked_units);
-}
-
-static bool unit_grab_budget(struct csched2_unit *svc)
-{
- struct csched2_dom *sdom = svc->sdom;
- unsigned int cpu = sched_unit_master(svc->unit);
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- if ( svc->budget > 0 )
- return true;
-
- /* budget_lock nests inside runqueue lock. */
- spin_lock(&sdom->budget_lock);
-
- /*
- * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
- * above!). That basically means the unit has overrun a bit --because of
- * various reasons-- and we want to take that into account. With the +=,
- * we are actually subtracting the amount of budget the unit has
- * overconsumed, from the total domain budget.
- */
- sdom->budget += svc->budget;
-
- if ( sdom->budget > 0 )
- {
- s_time_t budget;
-
- /* Get our quota, if there's at least as much budget */
- if ( likely(sdom->budget >= svc->budget_quota) )
- budget = svc->budget_quota;
- else
- budget = sdom->budget;
-
- svc->budget = budget;
- sdom->budget -= budget;
- }
- else
- {
- svc->budget = 0;
- park_unit(svc);
- }
-
- spin_unlock(&sdom->budget_lock);
-
- return svc->budget > 0;
-}
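
The accounting above can be restated with a small stand-alone sketch (all figures invented): the unit's possibly negative leftover is merged back first, then at most one quota is taken from the pool.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t dom_budget = 3000000;   /* what is left in the domain's pool */
        int64_t unit_budget = -500000;  /* the unit overran by 0.5ms         */
        int64_t quota = 2000000;        /* per-unit share it may grab        */

        /* Merge the (negative) leftover back, then take at most one quota. */
        dom_budget += unit_budget;
        if ( dom_budget > 0 )
        {
            unit_budget = dom_budget >= quota ? quota : dom_budget;
            dom_budget -= unit_budget;
        }
        else
            unit_budget = 0;            /* nothing available: park the unit  */

        printf("unit = %lld, pool = %lld\n",
               (long long)unit_budget, (long long)dom_budget);
        return 0;
    }
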
-
-static void
-unit_return_budget(struct csched2_unit *svc, struct list_head *parked)
-{
- struct csched2_dom *sdom = svc->sdom;
- unsigned int cpu = sched_unit_master(svc->unit);
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
- ASSERT(list_empty(parked));
-
- /* budget_lock nests inside runqueue lock. */
- spin_lock(&sdom->budget_lock);
-
- /*
- * The unit is stopping running (e.g., because it's blocking, or it has
- * been preempted). If it hasn't consumed all the budget it got when
- * starting to run, put that remaining amount back in the domain's budget
- * pool.
- */
- sdom->budget += svc->budget;
- svc->budget = 0;
-
- /*
- * Making budget available again to the domain means that parked units
- * may be unparked and run. They are, if any, in the domain's parked_units
- * list, so we want to go through that and unpark them (so they can try
- * to get some budget).
- *
- * Touching the list requires the budget_lock, which we hold. Let's
- * therefore put everyone in that list in another, temporary list, which
- * then the caller will traverse, unparking the units it finds there.
- *
- * In fact, we can't do the actual unparking here, because that requires
- * taking the runqueue lock of the units being unparked, and we can't
- * take any runqueue locks while we hold a budget_lock.
- */
- if ( sdom->budget > 0 )
- list_splice_init(&sdom->parked_units, parked);
-
- spin_unlock(&sdom->budget_lock);
-}
-
-static void
-unpark_parked_units(const struct scheduler *ops, struct list_head *units)
-{
- struct csched2_unit *svc, *tmp;
- spinlock_t *lock;
-
- list_for_each_entry_safe ( svc, tmp, units, parked_elem )
- {
- unsigned long flags;
- s_time_t now;
-
- lock = unit_schedule_lock_irqsave(svc->unit, &flags);
-
- sched_clear_pause_flags(svc->unit, _VPF_parked);
- if ( unlikely(svc->flags & CSFLAG_scheduled) )
- {
- /*
- * We end here if a budget replenishment arrived between
- * csched2_schedule() (and, in particular, after a call to
- * unit_grab_budget() that returned false), and
- * context_saved(). By setting __CSFLAG_delayed_runq_add,
- * we tell context_saved() to put the unit back in the
- * runqueue, from where it will compete with the others
- * for the newly replenished budget.
- */
- ASSERT( svc->rqd != NULL );
- ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd );
- __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
- }
- else if ( unit_runnable(svc->unit) )
- {
- /*
- * The unit should go back to the runqueue, and compete for
- * the newly replenished budget, but only if it is actually
- * runnable (and was therefore offline only because of the
- * lack of budget).
- */
- now = NOW();
- update_load(ops, svc->rqd, svc, 1, now);
- runq_insert(ops, svc);
- runq_tickle(ops, svc, now);
- }
- list_del_init(&svc->parked_elem);
-
- unit_schedule_unlock_irqrestore(lock, flags, svc->unit);
- }
-}
-
-static inline void do_replenish(struct csched2_dom *sdom)
-{
- sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
- sdom->budget += sdom->tot_budget;
-}
-
-static void replenish_domain_budget(void* data)
-{
- struct csched2_dom *sdom = data;
- unsigned long flags;
- s_time_t now;
- LIST_HEAD(parked);
-
- spin_lock_irqsave(&sdom->budget_lock, flags);
-
- now = NOW();
-
- /*
- * Let's do the replenishment. Note, though, that a domain may overrun,
- * which means the budget would have gone below 0 (reasons may be system
- * overbooking, accounting issues, etc.). It also may happen that we are
- * handling the replenishment (much) later than we should (reasons may
- * again be overbooking, or issues with timers).
- *
- * Even in cases of overrun or delay, however, we expect that in 99% of
- * cases, doing just one replenishment will be good enough for being able
- * to unpark the units that are waiting for some budget.
- */
- do_replenish(sdom);
-
- /*
- * And now, the special cases:
- * 1) if we are late enough to have skipped (at least) one full period,
- * what we must do is more replenishments. Note, however, that
- * every time we add tot_budget to the budget, we also move next_repl
- * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always
- * respected.
- */
- if ( unlikely(sdom->next_repl <= now) )
- {
- do
- do_replenish(sdom);
- while ( sdom->next_repl <= now );
- }
- /*
- * 2) if we overrun by more than tot_budget, then budget+tot_budget is
- * still < 0, which means that we can't unpark the units. Let's bail,
- * and wait for future replenishments.
- */
- if ( unlikely(sdom->budget <= 0) )
- {
- spin_unlock_irqrestore(&sdom->budget_lock, flags);
- goto out;
- }
-
- /* Since we may do more than one replenishment, make sure we didn't overshoot. */
- sdom->budget = min(sdom->budget, sdom->tot_budget);
-
- /*
- * As above, let's prepare the temporary list, out of the domain's
- * parked_units list, now that we hold the budget_lock. Then, drop such
- * lock, and pass the list to the unparking function.
- */
- list_splice_init(&sdom->parked_units, &parked);
-
- spin_unlock_irqrestore(&sdom->budget_lock, flags);
-
- unpark_parked_units(sdom->dom->cpupool->sched, &parked);
-
- out:
- set_timer(&sdom->repl_timer, sdom->next_repl);
-}
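
The catch-up and clamping behaviour, condensed into a stand-alone sketch (invented values, not Xen code): if the timer fired late, whole periods are added until next_repl is in the future again, and the budget is then clamped so the cap stays honoured.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t period = 10000000;             /* 10ms, in ns                     */
        int64_t tot_budget = 5000000;          /* e.g. a 50% cap -> 5ms per period */
        int64_t budget = -2000000;             /* the domain overran a bit        */
        int64_t next_repl = 0, now = 27000000; /* the timer fired ~3 periods late */

        do {
            budget += tot_budget;
            next_repl += period;
        } while ( next_repl <= now );

        if ( budget > tot_budget )             /* never carry more than one period */
            budget = tot_budget;

        printf("budget = %lld, next_repl = %lld\n",
               (long long)budget, (long long)next_repl);
        return 0;
    }
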
-
-#ifndef NDEBUG
-static inline void
-csched2_unit_check(struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
- struct csched2_dom * const sdom = svc->sdom;
-
- BUG_ON( svc->unit != unit );
- BUG_ON( sdom != csched2_dom(unit->domain) );
- if ( sdom )
- {
- BUG_ON( is_idle_unit(unit) );
- BUG_ON( sdom->dom != unit->domain );
- }
- else
- {
- BUG_ON( !is_idle_unit(unit) );
- }
- SCHED_STAT_CRANK(unit_check);
-}
-#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit))
-#else
-#define CSCHED2_UNIT_CHECK(unit)
-#endif
-
-static void *
-csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
- void *dd)
-{
- struct csched2_unit *svc;
-
- /* Allocate per-UNIT info */
- svc = xzalloc(struct csched2_unit);
- if ( svc == NULL )
- return NULL;
-
- INIT_LIST_HEAD(&svc->rqd_elem);
- INIT_LIST_HEAD(&svc->runq_elem);
-
- svc->sdom = dd;
- svc->unit = unit;
- svc->flags = 0U;
-
- if ( !is_idle_unit(unit) )
- {
- ASSERT(svc->sdom != NULL);
- svc->credit = CSCHED2_CREDIT_INIT;
- svc->weight = svc->sdom->weight;
- /* Starting load of 50% */
- svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1);
- svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT;
- }
- else
- {
- ASSERT(svc->sdom == NULL);
- svc->credit = CSCHED2_IDLE_CREDIT;
- svc->weight = 0;
- }
- svc->tickled_cpu = -1;
-
- svc->budget = STIME_MAX;
- svc->budget_quota = 0;
- INIT_LIST_HEAD(&svc->parked_elem);
-
- SCHED_STAT_CRANK(unit_alloc);
-
- return svc;
-}
-
-static void
-csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
-
- ASSERT(!is_idle_unit(unit));
- SCHED_STAT_CRANK(unit_sleep);
-
- if ( curr_on_cpu(sched_unit_master(unit)) == unit )
- {
- tickle_cpu(sched_unit_master(unit), svc->rqd);
- }
- else if ( unit_on_runq(svc) )
- {
- ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
- update_load(ops, svc->rqd, svc, -1, NOW());
- runq_remove(svc);
- }
- else
- __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
-}
-
-static void
-csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
- unsigned int cpu = sched_unit_master(unit);
- s_time_t now;
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- ASSERT(!is_idle_unit(unit));
-
- if ( unlikely(curr_on_cpu(cpu) == unit) )
- {
- SCHED_STAT_CRANK(unit_wake_running);
- goto out;
- }
-
- if ( unlikely(unit_on_runq(svc)) )
- {
- SCHED_STAT_CRANK(unit_wake_onrunq);
- goto out;
- }
-
- if ( likely(unit_runnable(unit)) )
- SCHED_STAT_CRANK(unit_wake_runnable);
- else
- SCHED_STAT_CRANK(unit_wake_not_runnable);
-
- /*
- * If the context hasn't been saved for this unit yet, we can't put it on
- * another runqueue. Instead, we set a flag so that it will be put on the
- * runqueue after the context has been saved.
- */
- if ( unlikely(svc->flags & CSFLAG_scheduled) )
- {
- __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
- goto out;
- }
-
- /* Add into the new runqueue if necessary */
- if ( svc->rqd == NULL )
- runq_assign(ops, unit);
- else
- ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd);
-
- now = NOW();
-
- update_load(ops, svc->rqd, svc, 1, now);
-
- /* Put the UNIT on the runq */
- runq_insert(ops, svc);
- runq_tickle(ops, svc, now);
-
-out:
- return;
-}
-
-static void
-csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
-
- __set_bit(__CSFLAG_unit_yield, &svc->flags);
-}
-
-static void
-csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
- spinlock_t *lock = unit_schedule_lock_irq(unit);
- s_time_t now = NOW();
- LIST_HEAD(were_parked);
-
- BUG_ON( !is_idle_unit(unit) &&
- svc->rqd != c2rqd(ops, sched_unit_master(unit)));
- ASSERT(is_idle_unit(unit) ||
- svc->rqd == c2rqd(ops, sched_unit_master(unit)));
-
- /* This unit is now eligible to be put on the runqueue again */
- __clear_bit(__CSFLAG_scheduled, &svc->flags);
-
- if ( unlikely(has_cap(svc) && svc->budget > 0) )
- unit_return_budget(svc, &were_parked);
-
- /* If someone wants it on the runqueue, put it there. */
- /*
- * NB: We can get rid of CSFLAG_scheduled by checking for
- * vc->is_running and unit_on_runq(svc) here. However,
- * since we're accessing the flags cacheline anyway,
- * it seems a bit pointless; especially as we have plenty of
- * bits free.
- */
- if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
- && likely(unit_runnable(unit)) )
- {
- ASSERT(!unit_on_runq(svc));
-
- runq_insert(ops, svc);
- runq_tickle(ops, svc, now);
- }
- else if ( !is_idle_unit(unit) )
- update_load(ops, svc->rqd, svc, -1, now);
-
- unit_schedule_unlock_irq(lock, unit);
-
- unpark_parked_units(ops, &were_parked);
-}
-
-#define MAX_LOAD (STIME_MAX)
-static struct sched_resource *
-csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
-{
- struct csched2_private *prv = csched2_priv(ops);
- int i, min_rqi = -1, min_s_rqi = -1;
- unsigned int new_cpu, cpu = sched_unit_master(unit);
- struct csched2_unit *svc = csched2_unit(unit);
- s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD;
- bool has_soft;
-
- ASSERT(!cpumask_empty(&prv->active_queues));
-
- SCHED_STAT_CRANK(pick_resource);
-
- /* Locking:
- * - Runqueue lock of vc->processor is already locked
- * - Need to grab prv lock to make sure active runqueues don't
- * change
- * - Need to grab locks for other runqueues while checking
- * avgload
- * Locking constraint is:
- * - Lock prv before runqueue locks
- * - Trylock between runqueue locks (no ordering)
- *
- * Since one of the runqueue locks is already held, we can't
- * just grab the prv lock. Instead, we'll have to trylock, and
- * do something else reasonable if we fail.
- */
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- if ( !read_trylock(&prv->lock) )
- {
- /* We may be here because someone requested us to migrate. */
- __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
- new_cpu = get_fallback_cpu(svc);
- /*
- * Tracing of runq and its load won't be accurate, since we could
- * not get the lock, but at least we will output the chosen pcpu.
- */
- goto out;
- }
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(unit->domain));
-
- /*
- * First check to see if we're here because someone else suggested a place
- * for us to move.
- */
- if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
- {
- if ( unlikely(svc->migrate_rqd->id < 0) )
- {
- printk(XENLOG_WARNING "%s: target runqueue disappeared!\n",
- __func__);
- }
- else if ( cpumask_intersects(cpumask_scratch_cpu(cpu),
- &svc->migrate_rqd->active) )
- {
- /*
- * If we've been asked to move to migrate_rqd, we should just do
- * that, which we actually do by returning one cpu from that runq.
- * There is no need to take care of soft affinity, as that will
- * happen in runq_tickle().
- */
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &svc->migrate_rqd->active);
- new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias,
- cpumask_scratch_cpu(cpu));
-
- svc->migrate_rqd->pick_bias = new_cpu;
- goto out_up;
- }
- /* Fall-through to normal cpu pick */
- }
-
- /*
- * What we want is:
- * - if we have soft affinity, the runqueue with the lowest average
- * load, among the ones that contain cpus in our soft affinity; this
- * represents the best runq on which we would want to run.
- * - the runqueue with the lowest average load among the ones that
- * contain cpus in our hard affinity; this represents the best runq
- * on which we can run.
- *
- * Find both runqueues in one pass.
- */
- has_soft = has_soft_affinity(unit);
- for_each_cpu(i, &prv->active_queues)
- {
- struct csched2_runqueue_data *rqd;
- s_time_t rqd_avgload = MAX_LOAD;
-
- rqd = prv->rqd + i;
-
- /*
- * If none of the cpus of this runqueue is in svc's hard-affinity,
- * skip the runqueue.
- *
- * Note that, in case svc's hard-affinity has changed, this is the
- * first time when we see such change, so it is indeed possible
- * that we end up skipping svc's current runqueue.
- */
- if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) )
- continue;
-
- /*
- * If checking a different runqueue, grab the lock, read the avg,
- * and then release the lock.
- *
- * If on our own runqueue, don't grab or release the lock;
- * but subtract our own load from the runqueue load to simulate
- * impartiality.
- */
- if ( rqd == svc->rqd )
- {
- rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
- }
- else if ( spin_trylock(&rqd->lock) )
- {
- rqd_avgload = rqd->b_avgload;
- spin_unlock(&rqd->lock);
- }
-
- /*
- * if svc has a soft-affinity, and some cpus of rqd are part of it,
- * see if we need to update the "soft-affinity minimum".
- */
- if ( has_soft &&
- rqd_avgload < min_s_avgload )
- {
- cpumask_t mask;
-
- cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active);
- if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) )
- {
- min_s_avgload = rqd_avgload;
- min_s_rqi = i;
- }
- }
- /* In any case, keep the "hard-affinity minimum" updated too. */
- if ( rqd_avgload < min_avgload )
- {
- min_avgload = rqd_avgload;
- min_rqi = i;
- }
- }
-
- if ( has_soft && min_s_rqi != -1 )
- {
- /*
- * We have soft affinity, and we have a candidate runq, so go for it.
- *
- * Note that, to obtain the soft-affinity mask, we "just" put what we
- * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is
- * ok because:
- * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have
- * a non-empty intersection (because has_soft is true);
- * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask()
- * already in cpumask_scratch, so we save a lot by doing it like this.
- *
- * It's kind of like open coding affinity_balance_cpumask() but, in
- * this specific case, calling that would mean a lot of (unnecessary)
- * cpumask operations.
- */
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- unit->cpu_soft_affinity);
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &prv->rqd[min_s_rqi].active);
- }
- else if ( min_rqi != -1 )
- {
- /*
- * Either we don't have soft-affinity, or we do, but we did not find
- * any suitable runq. But we did find one when considering hard
- * affinity, so go for it.
- *
- * cpumask_scratch already has unit->cpu_hard_affinity &
- * cpupool_domain_master_cpumask() in it, so it's enough that we filter
- * with the cpus of the runq.
- */
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &prv->rqd[min_rqi].active);
- }
- else
- {
- /*
- * We didn't find anyone at all (most likely because of spinlock
- * contention).
- */
- new_cpu = get_fallback_cpu(svc);
- min_rqi = c2r(new_cpu);
- min_avgload = prv->rqd[min_rqi].b_avgload;
- goto out_up;
- }
-
- new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias,
- cpumask_scratch_cpu(cpu));
- prv->rqd[min_rqi].pick_bias = new_cpu;
- BUG_ON(new_cpu >= nr_cpu_ids);
-
- out_up:
- read_unlock(&prv->lock);
- out:
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint64_t b_avgload;
- unsigned unit:16, dom:16;
- unsigned rq_id:16, new_cpu:16;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.rq_id = min_rqi;
- d.b_avgload = min_avgload;
- d.new_cpu = new_cpu;
- __trace_var(TRC_CSCHED2_PICKED_CPU, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- return get_sched_res(new_cpu);
-}
-
-/* Working state of the load-balancing algorithm */
-typedef struct {
- /* NB: Modified by consider() */
- s_time_t load_delta;
- struct csched2_unit * best_push_svc, *best_pull_svc;
- /* NB: Read by consider() */
- struct csched2_runqueue_data *lrqd;
- struct csched2_runqueue_data *orqd;
-} balance_state_t;
-
-static void consider(balance_state_t *st,
- struct csched2_unit *push_svc,
- struct csched2_unit *pull_svc)
-{
- s_time_t l_load, o_load, delta;
-
- l_load = st->lrqd->b_avgload;
- o_load = st->orqd->b_avgload;
- if ( push_svc )
- {
- /* What happens to the load on both if we push? */
- l_load -= push_svc->avgload;
- o_load += push_svc->avgload;
- }
- if ( pull_svc )
- {
- /* What happens to the load on both if we pull? */
- l_load += pull_svc->avgload;
- o_load -= pull_svc->avgload;
- }
-
- delta = l_load - o_load;
- if ( delta < 0 )
- delta = -delta;
-
- if ( delta < st->load_delta )
- {
- st->load_delta = delta;
-        st->best_push_svc = push_svc;
-        st->best_pull_svc = pull_svc;
- }
-}
-
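-/*
- * A rough worked example of what consider() converges on (numbers are
- * illustrative only): say lrqd->b_avgload is 6 and orqd->b_avgload is 2,
- * i.e. the current delta is 4. Pushing a unit with avgload 1.5 would give
- * loads of 4.5 and 3.5 (delta 1); additionally pulling a unit with avgload
- * 0.5 (a swap) would give 5 and 3 (delta 2). The push-only option yields
- * the smaller delta, so it ends up in st->best_push_svc, with
- * st->best_pull_svc left as NULL.
- */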
-
-static void migrate(const struct scheduler *ops,
- struct csched2_unit *svc,
- struct csched2_runqueue_data *trqd,
- s_time_t now)
-{
- struct sched_unit *unit = svc->unit;
- int cpu = sched_unit_master(unit);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned rqi:16, trqi:16;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.rqi = svc->rqd->id;
- d.trqi = trqd->id;
- __trace_var(TRC_CSCHED2_MIGRATE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- if ( svc->flags & CSFLAG_scheduled )
- {
- /* It's running; mark it to migrate. */
- svc->migrate_rqd = trqd;
- sched_set_pause_flags(unit, _VPF_migrating);
- __set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
- SCHED_STAT_CRANK(migrate_requested);
- tickle_cpu(cpu, svc->rqd);
- }
- else
- {
- int on_runq = 0;
- /* It's not running; just move it */
- if ( unit_on_runq(svc) )
- {
- runq_remove(svc);
- update_load(ops, svc->rqd, NULL, -1, now);
- on_runq = 1;
- }
- _runq_deassign(svc);
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(unit->domain));
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &trqd->active);
- sched_set_res(unit,
- get_sched_res(cpumask_cycle(trqd->pick_bias,
- cpumask_scratch_cpu(cpu))));
- trqd->pick_bias = sched_unit_master(unit);
- ASSERT(sched_unit_master(unit) < nr_cpu_ids);
-
- _runq_assign(svc, trqd);
- if ( on_runq )
- {
- update_load(ops, svc->rqd, NULL, 1, now);
- runq_insert(ops, svc);
- runq_tickle(ops, svc, now);
- SCHED_STAT_CRANK(migrate_on_runq);
- }
- else
- SCHED_STAT_CRANK(migrate_no_runq);
- }
-}
-
-/*
- * It makes sense to consider migrating svc to rqd if:
- *  - svc is not already flagged to migrate,
- *  - svc is allowed to run on at least one of the pcpus of rqd.
- */
-static bool unit_is_migrateable(struct csched2_unit *svc,
- struct csched2_runqueue_data *rqd)
-{
- struct sched_unit *unit = svc->unit;
- int cpu = sched_unit_master(unit);
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(unit->domain));
-
- return !(svc->flags & CSFLAG_runq_migrate_request) &&
- cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active);
-}
-
-static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
-{
- struct csched2_private *prv = csched2_priv(ops);
- int i, max_delta_rqi;
- struct list_head *push_iter, *pull_iter;
- bool inner_load_updated = 0;
-
- balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL };
-
- /*
- * Basic algorithm: Push, pull, or swap.
- * - Find the runqueue with the furthest load distance
- * - Find a pair that makes the difference the least (where one
- * on either side may be empty).
- */
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
- st.lrqd = c2rqd(ops, cpu);
-
- update_runq_load(ops, st.lrqd, 0, now);
-
-retry:
- max_delta_rqi = -1;
- if ( !read_trylock(&prv->lock) )
- return;
-
- st.load_delta = 0;
-
- for_each_cpu(i, &prv->active_queues)
- {
- s_time_t delta;
-
- st.orqd = prv->rqd + i;
-
- if ( st.orqd == st.lrqd
- || !spin_trylock(&st.orqd->lock) )
- continue;
-
- update_runq_load(ops, st.orqd, 0, now);
-
- delta = st.lrqd->b_avgload - st.orqd->b_avgload;
- if ( delta < 0 )
- delta = -delta;
-
- if ( delta > st.load_delta )
- {
- st.load_delta = delta;
- max_delta_rqi = i;
- }
-
- spin_unlock(&st.orqd->lock);
- }
-
- /* Minimize holding the private scheduler lock. */
- read_unlock(&prv->lock);
- if ( max_delta_rqi == -1 )
- goto out;
-
- {
- s_time_t load_max;
- int cpus_max;
-
-
- load_max = st.lrqd->b_avgload;
- if ( st.orqd->b_avgload > load_max )
- load_max = st.orqd->b_avgload;
-
- cpus_max = st.lrqd->nr_cpus;
- i = st.orqd->nr_cpus;
- if ( i > cpus_max )
- cpus_max = i;
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned lrq_id:16, orq_id:16;
- unsigned load_delta;
- } d;
- d.lrq_id = st.lrqd->id;
- d.orq_id = st.orqd->id;
- d.load_delta = st.load_delta;
- __trace_var(TRC_CSCHED2_LOAD_CHECK, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- /*
-         * If we're under 100% capacity, only shift if the load difference
-         * is greater than 1; otherwise, only shift if it exceeds 12.5%.
- */
- if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) )
- {
- if ( st.load_delta < (1ULL << (prv->load_precision_shift +
- opt_underload_balance_tolerance)) )
- goto out;
- }
- else
- if ( st.load_delta < (1ULL << (prv->load_precision_shift +
- opt_overload_balance_tolerance)) )
- goto out;
- }
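-
-    /*
-     * As a rough worked example (assuming the default tolerances of 0 and
-     * -3, and the default load_precision_shift of 18): below 100% capacity
-     * the threshold is 1ULL << 18, i.e. a whole CPU's worth of load
-     * difference, while at or above 100% capacity it is 1ULL << 15, i.e.
-     * 12.5% of a CPU's worth of load.
-     */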
-
- /* Try to grab the other runqueue lock; if it's been taken in the
- * meantime, try the process over again. This can't deadlock
- * because if it doesn't get any other rqd locks, it will simply
- * give up and return. */
- st.orqd = prv->rqd + max_delta_rqi;
- if ( !spin_trylock(&st.orqd->lock) )
- goto retry;
-
- /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
- if ( unlikely(st.orqd->id < 0) )
- goto out_up;
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint64_t lb_avgload, ob_avgload;
- unsigned lrq_id:16, orq_id:16;
- } d;
- d.lrq_id = st.lrqd->id;
- d.lb_avgload = st.lrqd->b_avgload;
- d.orq_id = st.orqd->id;
- d.ob_avgload = st.orqd->b_avgload;
- __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- SCHED_STAT_CRANK(acct_load_balance);
-
- /* Look for "swap" which gives the best load average
- * FIXME: O(n^2)! */
-
- /* Reuse load delta (as we're trying to minimize it) */
- list_for_each( push_iter, &st.lrqd->svc )
- {
- struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem);
-
- update_svc_load(ops, push_svc, 0, now);
-
- if ( !unit_is_migrateable(push_svc, st.orqd) )
- continue;
-
- list_for_each( pull_iter, &st.orqd->svc )
- {
- struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem);
-
- if ( !inner_load_updated )
- update_svc_load(ops, pull_svc, 0, now);
-
- if ( !unit_is_migrateable(pull_svc, st.lrqd) )
- continue;
-
- consider(&st, push_svc, pull_svc);
- }
-
- inner_load_updated = 1;
-
- /* Consider push only */
- consider(&st, push_svc, NULL);
- }
-
- list_for_each( pull_iter, &st.orqd->svc )
- {
- struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem);
-
- if ( !unit_is_migrateable(pull_svc, st.lrqd) )
- continue;
-
- /* Consider pull only */
- consider(&st, NULL, pull_svc);
- }
-
- /* OK, now we have some candidates; do the moving */
- if ( st.best_push_svc )
- migrate(ops, st.best_push_svc, st.orqd, now);
- if ( st.best_pull_svc )
- migrate(ops, st.best_pull_svc, st.lrqd, now);
-
- out_up:
- spin_unlock(&st.orqd->lock);
- out:
- return;
-}
-
-static void
-csched2_unit_migrate(
- const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu)
-{
- struct domain *d = unit->domain;
- struct csched2_unit * const svc = csched2_unit(unit);
- struct csched2_runqueue_data *trqd;
- s_time_t now = NOW();
-
- /*
- * Being passed a target pCPU which is outside of our cpupool is only
- * valid if we are shutting down (or doing ACPI suspend), and we are
- * moving everyone to BSP, no matter whether or not BSP is inside our
- * cpupool.
- *
- * And since there indeed is the chance that it is not part of it, all
- * we must do is remove _and_ unassign the unit from any runqueue, as
- * well as updating v->processor with the target, so that the suspend
- * process can continue.
- *
- * It will then be during resume that a new, meaningful, value for
- * v->processor will be chosen, and during actual domain unpause that
- * the unit will be assigned to and added to the proper runqueue.
- */
- if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) )
- {
- ASSERT(system_state == SYS_STATE_suspend);
- if ( unit_on_runq(svc) )
- {
- runq_remove(svc);
- update_load(ops, svc->rqd, NULL, -1, now);
- }
- _runq_deassign(svc);
- sched_set_res(unit, get_sched_res(new_cpu));
- return;
- }
-
- /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */
- ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized));
- ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity));
-
- trqd = c2rqd(ops, new_cpu);
-
- /*
- * Do the actual movement toward new_cpu, and update vc->processor.
- * If we are changing runqueue, migrate() takes care of everything.
- * If we are not changing runqueue, we need to update vc->processor
- * here. In fact, if, for instance, we are here because the unit's
- * hard affinity changed, we don't want to risk leaving vc->processor
- * pointing to a pcpu where we can't run any longer.
- */
- if ( trqd != svc->rqd )
- migrate(ops, svc, trqd, now);
- else
- sched_set_res(unit, get_sched_res(new_cpu));
-}
-
-static int
-csched2_dom_cntl(
- const struct scheduler *ops,
- struct domain *d,
- struct xen_domctl_scheduler_op *op)
-{
- struct csched2_dom * const sdom = csched2_dom(d);
- struct csched2_private *prv = csched2_priv(ops);
- unsigned long flags;
- struct sched_unit *unit;
- int rc = 0;
-
- /*
- * Locking:
- * - we must take the private lock for accessing the weights of the
- * units of d, and/or the cap;
- * - in the putinfo case, we also need the runqueue lock(s), for
-     *   updating the max weight of the runqueue(s).
- * If changing the cap, we also need the budget_lock, for updating
- * the value of the domain budget pool (and the runqueue lock,
- * for adjusting the parameters and rescheduling any unit that is
- * running at the time of the change).
- */
- switch ( op->cmd )
- {
- case XEN_DOMCTL_SCHEDOP_getinfo:
- read_lock_irqsave(&prv->lock, flags);
- op->u.credit2.weight = sdom->weight;
- op->u.credit2.cap = sdom->cap;
- read_unlock_irqrestore(&prv->lock, flags);
- break;
- case XEN_DOMCTL_SCHEDOP_putinfo:
- write_lock_irqsave(&prv->lock, flags);
- /* Weight */
- if ( op->u.credit2.weight != 0 )
- {
- int old_weight;
-
- old_weight = sdom->weight;
-
- sdom->weight = op->u.credit2.weight;
-
- /* Update weights for units, and max_weight for runqueues on which they reside */
- for_each_sched_unit ( d, unit )
- {
- struct csched2_unit *svc = csched2_unit(unit);
- spinlock_t *lock = unit_schedule_lock(unit);
-
- ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit)));
-
- svc->weight = sdom->weight;
- update_max_weight(svc->rqd, svc->weight, old_weight);
-
- unit_schedule_unlock(lock, unit);
- }
- }
- /* Cap */
- if ( op->u.credit2.cap != 0 )
- {
- struct csched2_unit *svc;
- spinlock_t *lock;
-
-            /* Cap is only valid if it's at most 100 * nr_of_units */
- if ( op->u.credit2.cap > 100 * sdom->nr_units )
- {
- rc = -EINVAL;
- write_unlock_irqrestore(&prv->lock, flags);
- break;
- }
-
- spin_lock(&sdom->budget_lock);
- sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap);
- sdom->tot_budget /= 100;
- spin_unlock(&sdom->budget_lock);
-
- /*
- * When trying to get some budget and run, each unit will grab
- * from the pool 1/N (with N = nr of units of the domain) of
- * the total budget. Roughly speaking, this means each unit will
- * have at least one chance to run during every period.
- */
- for_each_sched_unit ( d, unit )
- {
- svc = csched2_unit(unit);
- lock = unit_schedule_lock(unit);
- /*
- * Too small quotas would in theory cause a lot of overhead,
- * which then won't happen because, in csched2_runtime(),
- * CSCHED2_MIN_TIMER is what would be used anyway.
- */
- svc->budget_quota = max(sdom->tot_budget / sdom->nr_units,
- CSCHED2_MIN_TIMER);
- unit_schedule_unlock(lock, unit);
- }
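-
-            /*
-             * A quick numeric sketch (values illustrative, assuming the
-             * default 10ms replenishment period): a domain with 2 units and
-             * cap = 50 gets tot_budget = 10ms * 50 / 100 = 5ms per period,
-             * so each unit's budget_quota is 2.5ms (unless that would fall
-             * below CSCHED2_MIN_TIMER, in which case the quota is bumped up
-             * to it).
-             */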
-
- if ( sdom->cap == 0 )
- {
- /*
- * We give to the domain the budget to which it is entitled,
- * and queue its first replenishment event.
- *
- * Since cap is currently disabled for this domain, we
- * know no unit is messing with the domain's budget, and
- * the replenishment timer is still off.
- * For these reasons, it is safe to do the following without
- * taking the budget_lock.
- */
- sdom->budget = sdom->tot_budget;
- sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD;
- set_timer(&sdom->repl_timer, sdom->next_repl);
-
- /*
- * Now, let's enable budget accounting for all the units.
- * For making sure that they will start to honour the domain's
- * cap, we set their budget to 0.
- * This way, as soon as they will try to run, they will have
- * to get some budget.
- *
- * For the units that are already running, we trigger the
- * scheduler on their pCPU. When, as a consequence of this,
- * csched2_schedule() will run, it will figure out there is
- * no budget, and the unit will try to get some (and be parked,
- * if there's none, and we'll switch to someone else).
- */
- for_each_sched_unit ( d, unit )
- {
- svc = csched2_unit(unit);
- lock = unit_schedule_lock(unit);
- if ( unit->is_running )
- {
- unsigned int cpu = sched_unit_master(unit);
- struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
-
- ASSERT(curr_on_cpu(cpu) == unit);
-
- /*
- * We are triggering a reschedule on the unit's
- * pCPU. That will run burn_credits() and, since
- * the unit is capped now, it would charge all the
- * execution time of this last round as budget as
- * well. That will make the unit budget go negative,
- * potentially by a large amount, and it's unfair.
- *
- * To avoid that, call burn_credit() here, to do the
- * accounting of this current running instance now,
-                     * with budgeting still disabled. This does not
- * prevent some small amount of budget being charged
- * to the unit (i.e., the amount of time it runs from
- * now, to when scheduling happens). The budget will
- * also go below 0, but a lot less than how it would
- * if we don't do this.
- */
- burn_credits(rqd, svc, NOW());
- __cpumask_set_cpu(cpu, &rqd->tickled);
- ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle));
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- }
- svc->budget = 0;
- unit_schedule_unlock(lock, unit);
- }
- }
-
- sdom->cap = op->u.credit2.cap;
- }
- else if ( sdom->cap != 0 )
- {
- LIST_HEAD(parked);
-
- stop_timer(&sdom->repl_timer);
-
- /* Disable budget accounting for all the units. */
- for_each_sched_unit ( d, unit )
- {
- struct csched2_unit *svc = csched2_unit(unit);
- spinlock_t *lock = unit_schedule_lock(unit);
-
- svc->budget = STIME_MAX;
- svc->budget_quota = 0;
-
- unit_schedule_unlock(lock, unit);
- }
- sdom->cap = 0;
- /*
- * We are disabling the cap for this domain, which may have
- * units waiting for a replenishment, so we unpark them all.
- * Note that, since we have already disabled budget accounting
- * for all the units of the domain, no currently running unit
- * will be added to the parked units list any longer.
- */
- spin_lock(&sdom->budget_lock);
- list_splice_init(&sdom->parked_units, &parked);
- spin_unlock(&sdom->budget_lock);
-
- unpark_parked_units(ops, &parked);
- }
- write_unlock_irqrestore(&prv->lock, flags);
- break;
- default:
- rc = -EINVAL;
- break;
- }
-
-
- return rc;
-}
-
-static void
-csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit,
- const cpumask_t *hard, const cpumask_t *soft)
-{
- struct csched2_unit *svc = csched2_unit(unit);
-
- if ( !hard )
- return;
-
- /* Are we becoming exclusively pinned? */
- if ( cpumask_weight(hard) == 1 )
- __set_bit(__CSFLAG_pinned, &svc->flags);
- else
- __clear_bit(__CSFLAG_pinned, &svc->flags);
-}
-
-static int csched2_sys_cntl(const struct scheduler *ops,
- struct xen_sysctl_scheduler_op *sc)
-{
- struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2;
- struct csched2_private *prv = csched2_priv(ops);
- unsigned long flags;
-
-    switch ( sc->cmd )
- {
- case XEN_SYSCTL_SCHEDOP_putinfo:
- if ( params->ratelimit_us &&
- (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX ||
- params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN ))
- return -EINVAL;
-
- write_lock_irqsave(&prv->lock, flags);
- if ( !prv->ratelimit_us && params->ratelimit_us )
- printk(XENLOG_INFO "Enabling context switch rate limiting\n");
- else if ( prv->ratelimit_us && !params->ratelimit_us )
- printk(XENLOG_INFO "Disabling context switch rate limiting\n");
- prv->ratelimit_us = params->ratelimit_us;
- write_unlock_irqrestore(&prv->lock, flags);
-
- /* FALLTHRU */
- case XEN_SYSCTL_SCHEDOP_getinfo:
- params->ratelimit_us = prv->ratelimit_us;
- break;
- }
-
- return 0;
-}
-
-static void *
-csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
-{
- struct csched2_private *prv = csched2_priv(ops);
- struct csched2_dom *sdom;
- unsigned long flags;
-
- sdom = xzalloc(struct csched2_dom);
- if ( sdom == NULL )
- return ERR_PTR(-ENOMEM);
-
- /* Initialize credit, cap and weight */
- INIT_LIST_HEAD(&sdom->sdom_elem);
- sdom->dom = dom;
- sdom->weight = CSCHED2_DEFAULT_WEIGHT;
- sdom->cap = 0U;
- sdom->nr_units = 0;
-
- init_timer(&sdom->repl_timer, replenish_domain_budget, sdom,
- cpumask_any(cpupool_domain_master_cpumask(dom)));
- spin_lock_init(&sdom->budget_lock);
- INIT_LIST_HEAD(&sdom->parked_units);
-
- write_lock_irqsave(&prv->lock, flags);
-
- list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);
-
- write_unlock_irqrestore(&prv->lock, flags);
-
- return sdom;
-}
-
-static void
-csched2_free_domdata(const struct scheduler *ops, void *data)
-{
- struct csched2_dom *sdom = data;
- struct csched2_private *prv = csched2_priv(ops);
-
- if ( sdom )
- {
- unsigned long flags;
-
- kill_timer(&sdom->repl_timer);
-
- write_lock_irqsave(&prv->lock, flags);
- list_del_init(&sdom->sdom_elem);
- write_unlock_irqrestore(&prv->lock, flags);
-
- xfree(sdom);
- }
-}
-
-static void
-csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit *svc = unit->priv;
- struct csched2_dom * const sdom = svc->sdom;
- spinlock_t *lock;
-
- ASSERT(!is_idle_unit(unit));
- ASSERT(list_empty(&svc->runq_elem));
-
- /* csched2_res_pick() expects the pcpu lock to be held */
- lock = unit_schedule_lock_irq(unit);
-
- sched_set_res(unit, csched2_res_pick(ops, unit));
-
- spin_unlock_irq(lock);
-
- lock = unit_schedule_lock_irq(unit);
-
- /* Add unit to runqueue of initial processor */
- runq_assign(ops, unit);
-
- unit_schedule_unlock_irq(lock, unit);
-
- sdom->nr_units++;
-
- SCHED_STAT_CRANK(unit_insert);
-
- CSCHED2_UNIT_CHECK(unit);
-}
-
-static void
-csched2_free_udata(const struct scheduler *ops, void *priv)
-{
- struct csched2_unit *svc = priv;
-
- xfree(svc);
-}
-
-static void
-csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct csched2_unit * const svc = csched2_unit(unit);
- spinlock_t *lock;
-
- ASSERT(!is_idle_unit(unit));
- ASSERT(list_empty(&svc->runq_elem));
-
- SCHED_STAT_CRANK(unit_remove);
-
- /* Remove from runqueue */
- lock = unit_schedule_lock_irq(unit);
-
- runq_deassign(ops, unit);
-
- unit_schedule_unlock_irq(lock, unit);
-
- svc->sdom->nr_units--;
-}
-
-/* How long should we let this unit run for? */
-static s_time_t
-csched2_runtime(const struct scheduler *ops, int cpu,
- struct csched2_unit *snext, s_time_t now)
-{
- s_time_t time, min_time;
- int rt_credit; /* Proposed runtime measured in credits */
- struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
- struct list_head *runq = &rqd->runq;
- struct csched2_private *prv = csched2_priv(ops);
-
- /*
- * If we're idle, just stay so. Others (or external events)
- * will poke us when necessary.
- */
- if ( is_idle_unit(snext->unit) )
- return -1;
-
- /* General algorithm:
- * 1) Run until snext's credit will be 0.
- * 2) But if someone is waiting, run until snext's credit is equal
- * to his.
- * 3) But, if we are capped, never run more than our budget.
- * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or
- * the ratelimit time.
- */
-
- /* Calculate mintime */
- min_time = CSCHED2_MIN_TIMER;
- if ( prv->ratelimit_us )
- {
- s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us);
- if ( snext->unit->is_running )
- ratelimit_min = snext->unit->state_entry_time +
- MICROSECS(prv->ratelimit_us) - now;
- if ( ratelimit_min > min_time )
- min_time = ratelimit_min;
- }
-
- /* 1) Run until snext's credit will be 0. */
- rt_credit = snext->credit;
-
- /*
- * 2) If there's someone waiting whose credit is positive,
- * run until your credit ~= his.
- */
- if ( ! list_empty(runq) )
- {
- struct csched2_unit *swait = runq_elem(runq->next);
-
- if ( ! is_idle_unit(swait->unit)
- && swait->credit > 0 )
- {
- rt_credit = snext->credit - swait->credit;
- }
- }
-
- /*
- * The next guy on the runqueue may actually have a higher credit,
- * if we've tried to avoid migrating him from a different cpu.
- * Setting time=0 will ensure the minimum timeslice is chosen.
- *
- * FIXME: See if we can eliminate this conversion if we know time
- * will be outside (MIN,MAX). Probably requires pre-calculating
- * credit values of MIN,MAX per unit, since each unit burns credit
- * at a different rate.
- */
- if ( rt_credit > 0 )
- time = c2t(rqd, rt_credit, snext);
- else
- time = 0;
-
- /*
- * 3) But, if capped, never run more than our budget.
- */
- if ( has_cap(snext) )
- time = snext->budget < time ? snext->budget : time;
-
- /*
- * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or
- * the rate_limit time.
- */
- if ( time < min_time )
- {
- time = min_time;
- SCHED_STAT_CRANK(runtime_min_timer);
- }
-    else if ( time > CSCHED2_MAX_TIMER )
- {
- time = CSCHED2_MAX_TIMER;
- SCHED_STAT_CRANK(runtime_max_timer);
- }
-
- return time;
-}
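-
-/*
- * An illustrative walk through the steps above (numbers are made up): if
- * snext's credit is worth roughly 4ms of runtime at its burn rate, and the
- * unit at the head of the runqueue has credit worth roughly 1ms at that
- * same rate, rt_credit converts (via c2t()) to about 3ms. That value is
- * then capped by snext->budget if the domain has a cap, and finally
- * clamped between min_time (CSCHED2_MIN_TIMER, possibly raised by the
- * ratelimit) and CSCHED2_MAX_TIMER.
- */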
-
-/*
- * Find a candidate.
- */
-static struct csched2_unit *
-runq_candidate(struct csched2_runqueue_data *rqd,
- struct csched2_unit *scurr,
- int cpu, s_time_t now,
- unsigned int *skipped)
-{
- struct list_head *iter, *temp;
- struct sched_resource *sr = get_sched_res(cpu);
- struct csched2_unit *snext = NULL;
- struct csched2_private *prv = csched2_priv(sr->scheduler);
- bool yield = false, soft_aff_preempt = false;
-
- *skipped = 0;
-
- if ( unlikely(is_idle_unit(scurr->unit)) )
- {
- snext = scurr;
- goto check_runq;
- }
-
- yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags);
-
- /*
- * Return the current unit if it has executed for less than ratelimit.
-     * Adjustment of the selected unit's credit and the decision on
-     * how long it will run will be taken in csched2_runtime().
- *
- * Note that, if scurr is yielding, we don't let rate limiting kick in.
- * In fact, it may be the case that scurr is about to spin, and there's
- * no point forcing it to do so until rate limiting expires.
- */
- if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) &&
- (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) )
- {
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned runtime;
- } d;
- d.dom = scurr->unit->domain->domain_id;
- d.unit = scurr->unit->unit_id;
- d.runtime = now - scurr->unit->state_entry_time;
- __trace_var(TRC_CSCHED2_RATELIMIT, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
- return scurr;
- }
-
- /* If scurr has a soft-affinity, let's check whether cpu is part of it */
- if ( has_soft_affinity(scurr->unit) )
- {
- affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY,
- cpumask_scratch);
- if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) )
- {
- cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain);
-
- /* Ok, is any of the pcpus in scurr soft-affinity idle? */
- cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle);
- cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled);
- soft_aff_preempt = cpumask_intersects(cpumask_scratch, online);
- }
- }
-
- /*
- * If scurr is runnable, and this cpu is in its soft-affinity, default to
- * it. We also default to it, even if cpu is not in its soft-affinity, if
-     * there aren't any idle, non-tickled cpus in its soft-affinity. In
- * fact, we don't want to risk leaving scurr in the runq and this cpu idle
- * only because scurr is running outside of its soft-affinity.
- *
- * On the other hand, if cpu is not in scurr's soft-affinity, and there
- * looks to be better options, go for them. That happens by defaulting to
- * idle here, which means scurr will be preempted, put back in runq, and
- * one of those idle and not tickled cpus from its soft-affinity will be
- * tickled to pick it up.
- *
- * Finally, if scurr does not have a valid soft-affinity, we also let it
- * continue to run here (in fact, soft_aff_preempt will still be false,
- * in this case).
- *
- * Of course, we also default to idle also if scurr is not runnable.
- */
- if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt )
- snext = scurr;
- else
- snext = csched2_unit(sched_idle_unit(cpu));
-
- check_runq:
- list_for_each_safe( iter, temp, &rqd->runq )
- {
- struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- /* Only consider units that are allowed to run on this processor. */
- if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) )
- {
- (*skipped)++;
- continue;
- }
-
- /*
-         * If a unit is meant to be picked up by another processor, and that
-         * processor has not scheduled yet, leave it in the runqueue for it.
- */
- if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
- cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
- {
- (*skipped)++;
- SCHED_STAT_CRANK(deferred_to_tickled_cpu);
- continue;
- }
-
- /*
- * If this is on a different processor, don't pull it unless
- * its credit is at least CSCHED2_MIGRATE_RESIST higher.
- */
- if ( sched_unit_master(svc->unit) != cpu
- && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
- {
- (*skipped)++;
- SCHED_STAT_CRANK(migrate_resisted);
- continue;
- }
-
- /*
- * If the one in the runqueue has more credit than current (or idle,
- * if current is not runnable), or if current is yielding, and also
- * if the one in runqueue either is not capped, or is capped but has
- * some budget, then choose it.
- */
- if ( (yield || svc->credit > snext->credit) &&
- (!has_cap(svc) || unit_grab_budget(svc)) &&
- unit_runnable_state(svc->unit) )
- snext = svc;
-
- /* In any case, if we got this far, break. */
- break;
- }
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned unit:16, dom:16;
- unsigned tickled_cpu, skipped;
- int credit;
- } d;
- d.dom = snext->unit->domain->domain_id;
- d.unit = snext->unit->unit_id;
- d.credit = snext->credit;
- d.tickled_cpu = snext->tickled_cpu;
- d.skipped = *skipped;
- __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
- SCHED_STAT_CRANK(tickled_cpu_overridden);
-
- /*
- * If snext is from a capped domain, it must have budget (or it
- * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
- * which still is >= 0.
- */
- ASSERT(snext->budget >= 0);
-
- return snext;
-}
-
-/*
- * This function is in the critical path. It is designed to be simple and
- * fast for the common case.
- */
-static void csched2_schedule(
- const struct scheduler *ops, struct sched_unit *currunit, s_time_t now,
- bool tasklet_work_scheduled)
-{
- const unsigned int cur_cpu = smp_processor_id();
- const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
- struct csched2_runqueue_data *rqd;
- struct csched2_unit * const scurr = csched2_unit(currunit);
- struct csched2_unit *snext = NULL;
- unsigned int skipped_units = 0;
- bool tickled;
- bool migrated = false;
-
- SCHED_STAT_CRANK(schedule);
- CSCHED2_UNIT_CHECK(currunit);
-
- BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized));
-
- rqd = c2rqd(ops, sched_cpu);
- BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active));
-
- ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock));
-
- BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd);
-
- /* Clear "tickled" bit now that we've been scheduled */
- tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled);
- if ( tickled )
- {
- __cpumask_clear_cpu(sched_cpu, &rqd->tickled);
- cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
- smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle);
- }
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- unsigned cpu:16, rq_id:16;
- unsigned tasklet:8, idle:8, smt_idle:8, tickled:8;
- } d;
- d.cpu = cur_cpu;
- d.rq_id = c2r(sched_cpu);
- d.tasklet = tasklet_work_scheduled;
- d.idle = is_idle_unit(currunit);
- d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle);
- d.tickled = tickled;
- __trace_var(TRC_CSCHED2_SCHEDULE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- /* Update credits (and budget, if necessary). */
- burn_credits(rqd, scurr, now);
-
- /*
-     * Below 0 means that we are capped and have overrun our budget.
- * Let's try to get some more but, if we fail (e.g., because of the
- * other running units), we will be parked.
- */
- if ( unlikely(scurr->budget <= 0) )
- unit_grab_budget(scurr);
-
- /*
- * Select next runnable local UNIT (ie top of local runq).
- *
- * If the current unit is runnable, and has higher credit than
-     * the next guy on the queue (or there is no one else), we want to
- * run him again.
- *
-     * If there's tasklet work to do, we want to choose the idle unit
- * for this processor, and mark the current for delayed runqueue
- * add.
- *
- * If the current unit is runnable, and there's another runnable
- * candidate, we want to mark current for delayed runqueue add,
- * and remove the next guy from the queue.
- *
-     * If the current unit is not runnable, we want to choose the idle
- * unit for this processor.
- */
- if ( tasklet_work_scheduled )
- {
- __clear_bit(__CSFLAG_unit_yield, &scurr->flags);
- trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL);
- snext = csched2_unit(sched_idle_unit(sched_cpu));
- }
- else
- snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units);
-
- /* If switching from a non-idle runnable unit, put it
- * back on the runqueue. */
- if ( snext != scurr
- && !is_idle_unit(currunit)
- && unit_runnable(currunit) )
- __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
-
- /* Accounting for non-idle tasks */
- if ( !is_idle_unit(snext->unit) )
- {
- /* If switching, remove this from the runqueue and mark it scheduled */
- if ( snext != scurr )
- {
- ASSERT(snext->rqd == rqd);
- ASSERT(!snext->unit->is_running);
-
- runq_remove(snext);
- __set_bit(__CSFLAG_scheduled, &snext->flags);
- }
-
- /* Clear the idle mask if necessary */
- if ( cpumask_test_cpu(sched_cpu, &rqd->idle) )
- {
- __cpumask_clear_cpu(sched_cpu, &rqd->idle);
- smt_idle_mask_clear(sched_cpu, &rqd->smt_idle);
- }
-
- /*
- * The reset condition is "has a scheduler epoch come to an end?".
- * The way this is enforced is checking whether the unit at the top
- * of the runqueue has negative credits. This means the epochs have
-             * variable length, as in one epoch expires when:
- * 1) the unit at the top of the runqueue has executed for
- * around 10 ms (with default parameters);
- * 2) no other unit with higher credits wants to run.
- *
- * Here, where we want to check for reset, we need to make sure the
-             * proper unit is being used. In fact, runq_candidate() may not
-             * have returned the first unit in the runqueue, for various reasons
- * (e.g., affinity). Only trigger a reset when it does.
- */
- if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
- {
- reset_credit(ops, sched_cpu, now, snext);
- balance_load(ops, sched_cpu, now);
- }
-
- snext->start_time = now;
- snext->tickled_cpu = -1;
-
- /* Safe because lock for old processor is held */
- if ( sched_unit_master(snext->unit) != sched_cpu )
- {
- snext->credit += CSCHED2_MIGRATE_COMPENSATION;
- sched_set_res(snext->unit, get_sched_res(sched_cpu));
- SCHED_STAT_CRANK(migrated);
- migrated = true;
- }
- }
- else
- {
- /*
- * Update the idle mask if necessary. Note that, if we're scheduling
- * idle in order to carry on some tasklet work, we want to play busy!
- */
- if ( tasklet_work_scheduled )
- {
- if ( cpumask_test_cpu(sched_cpu, &rqd->idle) )
- {
- __cpumask_clear_cpu(sched_cpu, &rqd->idle);
- smt_idle_mask_clear(sched_cpu, &rqd->smt_idle);
- }
- }
- else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) )
- {
- __cpumask_set_cpu(sched_cpu, &rqd->idle);
- cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
- smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle);
- }
- /* Make sure avgload gets updated periodically even
- * if there's no activity */
- update_load(ops, rqd, NULL, 0, now);
- }
-
- /*
- * Return task to run next...
- */
- currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now);
- currunit->next_task = snext->unit;
- snext->unit->migrated = migrated;
-
- CSCHED2_UNIT_CHECK(currunit->next_task);
-}
-
-static void
-csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc)
-{
- printk("[%i.%i] flags=%x cpu=%i",
- svc->unit->domain->domain_id,
- svc->unit->unit_id,
- svc->flags,
- sched_unit_master(svc->unit));
-
- printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
-
- if ( has_cap(svc) )
- printk(" budget=%"PRI_stime"(%"PRI_stime")",
- svc->budget, svc->budget_quota);
-
- printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
- (svc->avgload * 100) >> prv->load_precision_shift);
-
- printk("\n");
-}
-
-static inline void
-dump_pcpu(const struct scheduler *ops, int cpu)
-{
- struct csched2_private *prv = csched2_priv(ops);
- struct csched2_unit *svc;
-
- printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n",
- cpu, c2r(cpu),
- CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
- CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
-
- /* current UNIT (nothing to say if that's the idle unit) */
- svc = csched2_unit(curr_on_cpu(cpu));
- if ( svc && !is_idle_unit(svc->unit) )
- {
- printk("\trun: ");
- csched2_dump_unit(prv, svc);
- }
-}
-
-static void
-csched2_dump(const struct scheduler *ops)
-{
- struct list_head *iter_sdom;
- struct csched2_private *prv = csched2_priv(ops);
- unsigned long flags;
- unsigned int i, j, loop;
-
- /*
- * We need the private scheduler lock as we access global
- * scheduler data and (below) the list of active domains.
- */
- read_lock_irqsave(&prv->lock, flags);
-
- printk("Active queues: %d\n"
- "\tdefault-weight = %d\n",
- cpumask_weight(&prv->active_queues),
- CSCHED2_DEFAULT_WEIGHT);
- for_each_cpu(i, &prv->active_queues)
- {
- s_time_t fraction;
-
- fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift;
-
- printk("Runqueue %d:\n"
- "\tncpus = %u\n"
- "\tcpus = %*pbl\n"
- "\tmax_weight = %u\n"
- "\tpick_bias = %u\n"
- "\tinstload = %d\n"
- "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n",
- i,
- prv->rqd[i].nr_cpus,
- CPUMASK_PR(&prv->rqd[i].active),
- prv->rqd[i].max_weight,
- prv->rqd[i].pick_bias,
- prv->rqd[i].load,
- prv->rqd[i].avgload,
- fraction);
-
- printk("\tidlers: %*pb\n"
- "\ttickled: %*pb\n"
- "\tfully idle cores: %*pb\n",
- CPUMASK_PR(&prv->rqd[i].idle),
- CPUMASK_PR(&prv->rqd[i].tickled),
- CPUMASK_PR(&prv->rqd[i].smt_idle));
- }
-
- printk("Domain info:\n");
- loop = 0;
- list_for_each( iter_sdom, &prv->sdom )
- {
- struct csched2_dom *sdom;
- struct sched_unit *unit;
-
- sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
-
- printk("\tDomain: %d w %d c %u v %d\n",
- sdom->dom->domain_id,
- sdom->weight,
- sdom->cap,
- sdom->nr_units);
-
- for_each_sched_unit ( sdom->dom, unit )
- {
- struct csched2_unit * const svc = csched2_unit(unit);
- spinlock_t *lock;
-
- lock = unit_schedule_lock(unit);
-
- printk("\t%3d: ", ++loop);
- csched2_dump_unit(prv, svc);
-
- unit_schedule_unlock(lock, unit);
- }
- }
-
- for_each_cpu(i, &prv->active_queues)
- {
- struct csched2_runqueue_data *rqd = prv->rqd + i;
- struct list_head *iter, *runq = &rqd->runq;
- int loop = 0;
-
- /* We need the lock to scan the runqueue. */
- spin_lock(&rqd->lock);
-
- printk("Runqueue %d:\n", i);
-
- for_each_cpu(j, &rqd->active)
- dump_pcpu(ops, j);
-
- printk("RUNQ:\n");
- list_for_each( iter, runq )
- {
- struct csched2_unit *svc = runq_elem(iter);
-
- if ( svc )
- {
- printk("\t%3d: ", loop++);
- csched2_dump_unit(prv, svc);
- }
- }
- spin_unlock(&rqd->lock);
- }
-
- read_unlock_irqrestore(&prv->lock, flags);
-}
-
-static void *
-csched2_alloc_pdata(const struct scheduler *ops, int cpu)
-{
- struct csched2_pcpu *spc;
-
- spc = xzalloc(struct csched2_pcpu);
- if ( spc == NULL )
- return ERR_PTR(-ENOMEM);
-
- /* Not in any runqueue yet */
- spc->runq_id = -1;
-
- return spc;
-}
-
-/* Returns the ID of the runqueue the cpu is assigned to. */
-static unsigned
-init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc,
- unsigned int cpu)
-{
- struct csched2_runqueue_data *rqd;
- unsigned int rcpu;
-
- ASSERT(rw_is_write_locked(&prv->lock));
- ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
- /* CPU data needs to be allocated, but still uninitialized. */
- ASSERT(spc && spc->runq_id == -1);
-
- /* Figure out which runqueue to put it in */
- spc->runq_id = cpu_to_runqueue(prv, cpu);
-
- rqd = prv->rqd + spc->runq_id;
-
- printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
- if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
- {
- printk(XENLOG_INFO " First cpu on runqueue, activating\n");
- activate_runqueue(prv, spc->runq_id);
- }
-
- __cpumask_set_cpu(cpu, &spc->sibling_mask);
-
- if ( rqd->nr_cpus > 0 )
- for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) )
- if ( cpumask_test_cpu(rcpu, &rqd->active) )
- {
- __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask);
- __cpumask_set_cpu(rcpu, &spc->sibling_mask);
- }
-
- __cpumask_set_cpu(cpu, &rqd->idle);
- __cpumask_set_cpu(cpu, &rqd->active);
- __cpumask_set_cpu(cpu, &prv->initialized);
- __cpumask_set_cpu(cpu, &rqd->smt_idle);
-
- rqd->nr_cpus++;
- ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus);
-
- if ( rqd->nr_cpus == 1 )
- rqd->pick_bias = cpu;
-
- return spc->runq_id;
-}
-
-static void
-csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
-{
- struct csched2_private *prv = csched2_priv(ops);
- spinlock_t *old_lock;
- unsigned long flags;
- unsigned rqi;
-
- write_lock_irqsave(&prv->lock, flags);
- old_lock = pcpu_schedule_lock(cpu);
-
- rqi = init_pdata(prv, pdata, cpu);
- /* Move the scheduler lock to the new runq lock. */
- get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock;
-
- /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
- spin_unlock(old_lock);
- write_unlock_irqrestore(&prv->lock, flags);
-}
-
-/* Change the scheduler of cpu to us (Credit2). */
-static spinlock_t *
-csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
- void *pdata, void *vdata)
-{
- struct csched2_private *prv = csched2_priv(new_ops);
- struct csched2_unit *svc = vdata;
- unsigned rqi;
-
- ASSERT(pdata && svc && is_idle_unit(svc->unit));
-
- /*
- * We own one runqueue lock already (from schedule_cpu_switch()). This
- * looks like it violates this scheduler's locking rules, but it does
- * not, as what we own is the lock of another scheduler, that hence has
- * no particular (ordering) relationship with our private global lock.
- * And owning exactly that one (the lock of the old scheduler of this
- * cpu) is what is necessary to prevent races.
- */
- ASSERT(!local_irq_is_enabled());
- write_lock(&prv->lock);
-
- sched_idle_unit(cpu)->priv = vdata;
-
- rqi = init_pdata(prv, pdata, cpu);
-
- /*
- * Now that we know what runqueue we'll go in, double check what's said
- * above: the lock we already hold is not the one of this runqueue of
- * this scheduler, and so it's safe to have taken it /before/ our
- * private global lock.
- */
- ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock);
-
- write_unlock(&prv->lock);
-
- return &prv->rqd[rqi].lock;
-}
-
-static void
-csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- unsigned long flags;
- struct csched2_private *prv = csched2_priv(ops);
- struct csched2_runqueue_data *rqd;
- struct csched2_pcpu *spc = pcpu;
- unsigned int rcpu;
-
- write_lock_irqsave(&prv->lock, flags);
-
- /*
- * alloc_pdata is not implemented, so pcpu must be NULL. On the other
- * hand, init_pdata must have been called for this pCPU.
- */
- /*
- * Scheduler specific data for this pCPU must still be there and and be
- * valid. In fact, if we are here:
- * 1. alloc_pdata must have been called for this cpu, and free_pdata
- * must not have been called on it before us,
- * 2. init_pdata must have been called on this cpu, and deinit_pdata
- * (us!) must not have been called on it already.
- */
- ASSERT(spc && spc->runq_id != -1);
- ASSERT(cpumask_test_cpu(cpu, &prv->initialized));
-
- /* Find the old runqueue and remove this cpu from it */
- rqd = prv->rqd + spc->runq_id;
-
- /* No need to save IRQs here, they're already disabled */
- spin_lock(&rqd->lock);
-
- printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id);
-
- __cpumask_clear_cpu(cpu, &rqd->idle);
- __cpumask_clear_cpu(cpu, &rqd->smt_idle);
- __cpumask_clear_cpu(cpu, &rqd->active);
-
- for_each_cpu ( rcpu, &rqd->active )
- __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask);
-
- rqd->nr_cpus--;
- ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus);
-
- if ( rqd->nr_cpus == 0 )
- {
- printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
- deactivate_runqueue(prv, spc->runq_id);
- }
- else if ( rqd->pick_bias == cpu )
- rqd->pick_bias = cpumask_first(&rqd->active);
-
- spc->runq_id = -1;
-
- spin_unlock(&rqd->lock);
-
- __cpumask_clear_cpu(cpu, &prv->initialized);
-
- write_unlock_irqrestore(&prv->lock, flags);
-
- return;
-}
-
-static void
-csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- struct csched2_pcpu *spc = pcpu;
-
- /*
- * pcpu either points to a valid struct csched2_pcpu, or is NULL (if
-     * CPU bringup failed, and we're being called from CPU_UP_CANCELLED).
- * xfree() does not really mind, but we want to be sure that either
- * init_pdata has never been called, or deinit_pdata has been called
- * already.
- */
- ASSERT(!pcpu || spc->runq_id == -1);
- ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));
-
- xfree(pcpu);
-}
-
-static int __init
-csched2_global_init(void)
-{
- if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
- {
- printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n",
- __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN);
- opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN;
- }
-
- if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT )
- {
- printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n",
- __func__, opt_load_window_shift);
- opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
- }
-
- if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
- {
- printk("WARNING: %s: opt_cap_period %u too small, resetting\n",
- __func__, opt_cap_period);
- opt_cap_period = 10; /* ms */
- }
-
- return 0;
-}
-
-static int
-csched2_init(struct scheduler *ops)
-{
- int i;
- struct csched2_private *prv;
-
- printk("Initializing Credit2 scheduler\n");
-
- printk(XENLOG_INFO " load_precision_shift: %d\n"
- XENLOG_INFO " load_window_shift: %d\n"
- XENLOG_INFO " underload_balance_tolerance: %d\n"
- XENLOG_INFO " overload_balance_tolerance: %d\n"
- XENLOG_INFO " runqueues arrangement: %s\n"
- XENLOG_INFO " cap enforcement granularity: %dms\n",
- opt_load_precision_shift,
- opt_load_window_shift,
- opt_underload_balance_tolerance,
- opt_overload_balance_tolerance,
- opt_runqueue_str[opt_runqueue],
- opt_cap_period);
-
- printk(XENLOG_INFO "load tracking window length %llu ns\n",
- 1ULL << opt_load_window_shift);
-
- /*
- * Basically no CPU information is available at this point; just
- * set up basic structures, and a callback when the CPU info is
- * available.
- */
-
- prv = xzalloc(struct csched2_private);
- if ( prv == NULL )
- return -ENOMEM;
- ops->sched_data = prv;
-
- rwlock_init(&prv->lock);
- INIT_LIST_HEAD(&prv->sdom);
-
- /* Allocate all runqueues and mark them as un-initialized */
- prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids);
- if ( !prv->rqd )
- {
- xfree(prv);
- return -ENOMEM;
- }
- for ( i = 0; i < nr_cpu_ids; i++ )
- prv->rqd[i].id = -1;
-
- /* initialize ratelimit */
- prv->ratelimit_us = sched_ratelimit_us;
-
- prv->load_precision_shift = opt_load_precision_shift;
- prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT;
- ASSERT(opt_load_window_shift > 0);
-
- return 0;
-}
-
-static void
-csched2_deinit(struct scheduler *ops)
-{
- struct csched2_private *prv;
-
- prv = csched2_priv(ops);
- ops->sched_data = NULL;
- if ( prv )
- xfree(prv->rqd);
- xfree(prv);
-}
-
-static const struct scheduler sched_credit2_def = {
- .name = "SMP Credit Scheduler rev2",
- .opt_name = "credit2",
- .sched_id = XEN_SCHEDULER_CREDIT2,
- .sched_data = NULL,
-
- .global_init = csched2_global_init,
-
- .insert_unit = csched2_unit_insert,
- .remove_unit = csched2_unit_remove,
-
- .sleep = csched2_unit_sleep,
- .wake = csched2_unit_wake,
- .yield = csched2_unit_yield,
-
- .adjust = csched2_dom_cntl,
- .adjust_affinity= csched2_aff_cntl,
- .adjust_global = csched2_sys_cntl,
-
- .pick_resource = csched2_res_pick,
- .migrate = csched2_unit_migrate,
- .do_schedule = csched2_schedule,
- .context_saved = csched2_context_saved,
-
- .dump_settings = csched2_dump,
- .init = csched2_init,
- .deinit = csched2_deinit,
- .alloc_udata = csched2_alloc_udata,
- .free_udata = csched2_free_udata,
- .alloc_pdata = csched2_alloc_pdata,
- .init_pdata = csched2_init_pdata,
- .deinit_pdata = csched2_deinit_pdata,
- .free_pdata = csched2_free_pdata,
- .switch_sched = csched2_switch_sched,
- .alloc_domdata = csched2_alloc_domdata,
- .free_domdata = csched2_free_domdata,
-};
-
-REGISTER_SCHEDULER(sched_credit2_def);
+++ /dev/null
-/*
- * xen/common/sched_null.c
- *
- * Copyright (c) 2017, Dario Faggioli, Citrix Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * The 'null' scheduler always chooses to run, on each pCPU, either nothing
- * (i.e., the pCPU stays idle) or always the same unit.
- *
- * It is aimed at supporting static scenarios, where there always are
- * fewer units than pCPUs (and the units don't need to move among pCPUs
- * for any reason) with the least possible overhead.
- *
- * Typical use cases are embedded applications, but also HPC, especially
- * if the scheduler is used inside a cpupool.
- */
-
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/softirq.h>
-#include <xen/trace.h>
-
-/*
- * null tracing events. Check include/public/trace.h for more details.
- */
-#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1)
-#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2)
-#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
-#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4)
-#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5)
-#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6)
-
-/*
- * Locking:
- * - Scheduler-lock (a.k.a. runqueue lock):
- * + is per-pCPU;
- * + serializes assignment and deassignment of units to a pCPU.
- * - Private data lock (a.k.a. private scheduler lock):
- * + is scheduler-wide;
- * + serializes accesses to the list of domains in this scheduler.
- * - Waitqueue lock:
- * + is scheduler-wide;
- *  + serializes accesses to the list of units waiting to be assigned
- * to pCPUs.
- *
- * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH,
- * waitqueue lock nests inside runqueue lock which nests inside private
- * lock. More specifically:
- * + if we need both runqueue and private locks, we must acquire the
- *    private lock first;
- *  + if we need both runqueue and waitqueue locks, we must acquire
- *    the runqueue lock first;
- *  + if we need both private and waitqueue locks, we must acquire
- *    the private lock first;
- * + if we already own a runqueue lock, we must never acquire
- * the private lock;
- * + if we already own the waitqueue lock, we must never acquire
- * the runqueue lock or the private lock.
- */
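-
-/*
- * Purely as an illustrative sketch of this ordering (not a code path that
- * exists verbatim in this file), a hypothetical section needing all three
- * locks would look like:
- *
- *     spin_lock(&prv->lock);            // private lock, outermost
- *     lock = pcpu_schedule_lock(cpu);   // runqueue lock of the pCPU
- *     spin_lock(&prv->waitq_lock);      // waitqueue lock, innermost
- *     ...
- *     spin_unlock(&prv->waitq_lock);
- *     pcpu_schedule_unlock(lock, cpu);
- *     spin_unlock(&prv->lock);
- */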
-
-/*
- * System-wide private data
- */
-struct null_private {
- spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */
- struct list_head ndom; /* Domains of this scheduler */
- struct list_head waitq; /* units not assigned to any pCPU */
- spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */
- cpumask_t cpus_free; /* CPUs without a unit associated to them */
-};
-
-/*
- * Physical CPU
- */
-struct null_pcpu {
- struct sched_unit *unit;
-};
-DEFINE_PER_CPU(struct null_pcpu, npc);
-
-/*
- * Schedule unit
- */
-struct null_unit {
- struct list_head waitq_elem;
- struct sched_unit *unit;
-};
-
-/*
- * Domain
- */
-struct null_dom {
- struct list_head ndom_elem;
- struct domain *dom;
-};
-
-/*
- * Accessor helper functions
- */
-static inline struct null_private *null_priv(const struct scheduler *ops)
-{
- return ops->sched_data;
-}
-
-static inline struct null_unit *null_unit(const struct sched_unit *unit)
-{
- return unit->priv;
-}
-
-static inline bool unit_check_affinity(struct sched_unit *unit,
- unsigned int cpu,
- unsigned int balance_step)
-{
- affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu));
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- cpupool_domain_master_cpumask(unit->domain));
-
- return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
-}
-
-static int null_init(struct scheduler *ops)
-{
- struct null_private *prv;
-
- printk("Initializing null scheduler\n"
- "WARNING: This is experimental software in development.\n"
- "Use at your own risk.\n");
-
- prv = xzalloc(struct null_private);
- if ( prv == NULL )
- return -ENOMEM;
-
- spin_lock_init(&prv->lock);
- spin_lock_init(&prv->waitq_lock);
- INIT_LIST_HEAD(&prv->ndom);
- INIT_LIST_HEAD(&prv->waitq);
-
- ops->sched_data = prv;
-
- return 0;
-}
-
-static void null_deinit(struct scheduler *ops)
-{
- xfree(ops->sched_data);
- ops->sched_data = NULL;
-}
-
-static void init_pdata(struct null_private *prv, unsigned int cpu)
-{
- /* Mark the pCPU as free, and with no unit assigned */
- cpumask_set_cpu(cpu, &prv->cpus_free);
- per_cpu(npc, cpu).unit = NULL;
-}
-
-static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
-{
- struct null_private *prv = null_priv(ops);
-
- /* alloc_pdata is not implemented, so we want this to be NULL. */
- ASSERT(!pdata);
-
- init_pdata(prv, cpu);
-}
-
-static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- struct null_private *prv = null_priv(ops);
-
- /* alloc_pdata not implemented, so this must have stayed NULL */
- ASSERT(!pcpu);
-
- cpumask_clear_cpu(cpu, &prv->cpus_free);
- per_cpu(npc, cpu).unit = NULL;
-}
-
-static void *null_alloc_udata(const struct scheduler *ops,
- struct sched_unit *unit, void *dd)
-{
- struct null_unit *nvc;
-
- nvc = xzalloc(struct null_unit);
- if ( nvc == NULL )
- return NULL;
-
- INIT_LIST_HEAD(&nvc->waitq_elem);
- nvc->unit = unit;
-
- SCHED_STAT_CRANK(unit_alloc);
-
- return nvc;
-}
-
-static void null_free_udata(const struct scheduler *ops, void *priv)
-{
- struct null_unit *nvc = priv;
-
- xfree(nvc);
-}
-
-static void * null_alloc_domdata(const struct scheduler *ops,
- struct domain *d)
-{
- struct null_private *prv = null_priv(ops);
- struct null_dom *ndom;
- unsigned long flags;
-
- ndom = xzalloc(struct null_dom);
- if ( ndom == NULL )
- return ERR_PTR(-ENOMEM);
-
- ndom->dom = d;
-
- spin_lock_irqsave(&prv->lock, flags);
- list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
- spin_unlock_irqrestore(&prv->lock, flags);
-
- return ndom;
-}
-
-static void null_free_domdata(const struct scheduler *ops, void *data)
-{
- struct null_dom *ndom = data;
- struct null_private *prv = null_priv(ops);
-
- if ( ndom )
- {
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
- list_del_init(&ndom->ndom_elem);
- spin_unlock_irqrestore(&prv->lock, flags);
-
- xfree(ndom);
- }
-}
-
-/*
- * unit to pCPU assignment and placement. This _only_ happens:
- * - on insert,
- * - on migrate.
- *
- * Insert occurs when a unit joins this scheduler for the first time
- * (e.g., when the domain it's part of is moved to the scheduler's
- * cpupool).
- *
- * Migration may be necessary if a pCPU (with a unit assigned to it)
- * is removed from the scheduler's cpupool.
- *
- * So this is not part of any hot path.
- */
-static struct sched_resource *
-pick_res(struct null_private *prv, const struct sched_unit *unit)
-{
- unsigned int bs;
- unsigned int cpu = sched_unit_master(unit), new_cpu;
- cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain);
-
- ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
-
- for_each_affinity_balance_step( bs )
- {
- if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
- continue;
-
- affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);
-
- /*
- * If our processor is free, or we are assigned to it, and it is also
- * still valid and part of our affinity, just go for it.
-         * (Note that we could call unit_check_affinity(), but we deliberately
-         * don't, so that we keep in the scratch cpumask what we have just
- * put in it.)
- */
- if ( likely((per_cpu(npc, cpu).unit == NULL ||
- per_cpu(npc, cpu).unit == unit)
- && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
- {
- new_cpu = cpu;
- goto out;
- }
-
- /* If not, just go for a free pCPU, within our affinity, if any */
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &prv->cpus_free);
- new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));
-
- if ( likely(new_cpu != nr_cpu_ids) )
- goto out;
- }
-
- /*
- * If we didn't find any free pCPU, just pick any valid pcpu, even if
- * it has another unit assigned. This will happen during shutdown and
- * suspend/resume, but it may also happen during "normal operation", if
- * all the pCPUs are busy.
- *
- * In fact, there must always be something sane in v->processor, or
- * unit_schedule_lock() and friends won't work. This is not a problem,
- * as we will actually assign the unit to the pCPU we return from here,
- * only if the pCPU is free.
- */
- cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity);
- new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));
-
- out:
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint16_t unit, dom;
- uint32_t new_cpu;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.new_cpu = new_cpu;
- __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d);
- }
-
- return get_sched_res(new_cpu);
-}
-
-static void unit_assign(struct null_private *prv, struct sched_unit *unit,
- unsigned int cpu)
-{
- ASSERT(is_unit_online(unit));
-
- per_cpu(npc, cpu).unit = unit;
- sched_set_res(unit, get_sched_res(cpu));
- cpumask_clear_cpu(cpu, &prv->cpus_free);
-
- dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint16_t unit, dom;
- uint32_t cpu;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.cpu = cpu;
- __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d);
- }
-}
-
-/* Returns true if a cpu was tickled */
-static bool unit_deassign(struct null_private *prv, struct sched_unit *unit)
-{
- unsigned int bs;
- unsigned int cpu = sched_unit_master(unit);
- struct null_unit *wvc;
-
- ASSERT(list_empty(&null_unit(unit)->waitq_elem));
- ASSERT(per_cpu(npc, cpu).unit == unit);
- ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free));
-
- per_cpu(npc, cpu).unit = NULL;
- cpumask_set_cpu(cpu, &prv->cpus_free);
-
- dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain,
- unit->unit_id);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint16_t unit, dom;
- uint32_t cpu;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.cpu = cpu;
- __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d);
- }
-
- spin_lock(&prv->waitq_lock);
-
- /*
-     * Now that the pCPU is free, let's see if there is someone waiting
-     * that is suitable to be assigned to it (prioritizing units that have
- * soft-affinity with cpu).
- */
- for_each_affinity_balance_step( bs )
- {
- list_for_each_entry( wvc, &prv->waitq, waitq_elem )
- {
- if ( bs == BALANCE_SOFT_AFFINITY &&
- !has_soft_affinity(wvc->unit) )
- continue;
-
- if ( unit_check_affinity(wvc->unit, cpu, bs) )
- {
- list_del_init(&wvc->waitq_elem);
- unit_assign(prv, wvc->unit, cpu);
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- spin_unlock(&prv->waitq_lock);
- return true;
- }
- }
- }
- spin_unlock(&prv->waitq_lock);
-
- return false;
-}
-
-/* Change the scheduler of cpu to us (null). */
-static spinlock_t *null_switch_sched(struct scheduler *new_ops,
- unsigned int cpu,
- void *pdata, void *vdata)
-{
- struct sched_resource *sr = get_sched_res(cpu);
- struct null_private *prv = null_priv(new_ops);
- struct null_unit *nvc = vdata;
-
- ASSERT(nvc && is_idle_unit(nvc->unit));
-
- sched_idle_unit(cpu)->priv = vdata;
-
- /*
- * We are holding the runqueue lock already (it's been taken in
- * schedule_cpu_switch()). It actually may or may not be the 'right'
- * one for this cpu, but that is ok for preventing races.
- */
- ASSERT(!local_irq_is_enabled());
-
- init_pdata(prv, cpu);
-
- return &sr->_lock;
-}
-
-static void null_unit_insert(const struct scheduler *ops,
- struct sched_unit *unit)
-{
- struct null_private *prv = null_priv(ops);
- struct null_unit *nvc = null_unit(unit);
- unsigned int cpu;
- spinlock_t *lock;
-
- ASSERT(!is_idle_unit(unit));
-
- lock = unit_schedule_lock_irq(unit);
-
- if ( unlikely(!is_unit_online(unit)) )
- {
- unit_schedule_unlock_irq(lock, unit);
- return;
- }
-
- retry:
- sched_set_res(unit, pick_res(prv, unit));
- cpu = sched_unit_master(unit);
-
- spin_unlock(lock);
-
- lock = unit_schedule_lock(unit);
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(unit->domain));
-
- /* If the pCPU is free, we assign unit to it */
- if ( likely(per_cpu(npc, cpu).unit == NULL) )
- {
- /*
- * Insert is followed by vcpu_wake(), so there's no need to poke
- * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
- */
- unit_assign(prv, unit, cpu);
- }
- else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
- {
- /*
- * If the pCPU is not free (e.g., because we raced with another
- * insert or a migrate), but there are other free pCPUs, we can
- * try to pick again.
- */
- goto retry;
- }
- else
- {
- /*
- * If the pCPU is not free, and there aren't any (valid) others,
-         * we have no alternative but to go into the waitqueue.
- */
- spin_lock(&prv->waitq_lock);
- list_add_tail(&nvc->waitq_elem, &prv->waitq);
- dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n",
- unit->domain, unit->unit_id);
- spin_unlock(&prv->waitq_lock);
- }
- spin_unlock_irq(lock);
-
- SCHED_STAT_CRANK(unit_insert);
-}
-
-static void null_unit_remove(const struct scheduler *ops,
- struct sched_unit *unit)
-{
- struct null_private *prv = null_priv(ops);
- struct null_unit *nvc = null_unit(unit);
- spinlock_t *lock;
-
- ASSERT(!is_idle_unit(unit));
-
- lock = unit_schedule_lock_irq(unit);
-
- /* If offline, the unit shouldn't be assigned, nor in the waitqueue */
- if ( unlikely(!is_unit_online(unit)) )
- {
- ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit);
- ASSERT(list_empty(&nvc->waitq_elem));
- goto out;
- }
-
- /* If unit is in waitqueue, just get it out of there and bail */
- if ( unlikely(!list_empty(&nvc->waitq_elem)) )
- {
- spin_lock(&prv->waitq_lock);
- list_del_init(&nvc->waitq_elem);
- spin_unlock(&prv->waitq_lock);
-
- goto out;
- }
-
- unit_deassign(prv, unit);
-
- out:
- unit_schedule_unlock_irq(lock, unit);
-
- SCHED_STAT_CRANK(unit_remove);
-}
-
-static void null_unit_wake(const struct scheduler *ops,
- struct sched_unit *unit)
-{
- struct null_private *prv = null_priv(ops);
- struct null_unit *nvc = null_unit(unit);
- unsigned int cpu = sched_unit_master(unit);
-
- ASSERT(!is_idle_unit(unit));
-
- if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
- {
- SCHED_STAT_CRANK(unit_wake_running);
- return;
- }
-
- if ( unlikely(!list_empty(&nvc->waitq_elem)) )
- {
- /* Not exactly "on runq", but close enough for reusing the counter */
- SCHED_STAT_CRANK(unit_wake_onrunq);
- return;
- }
-
- if ( likely(unit_runnable(unit)) )
- SCHED_STAT_CRANK(unit_wake_runnable);
- else
- SCHED_STAT_CRANK(unit_wake_not_runnable);
-
- if ( likely(per_cpu(npc, cpu).unit == unit) )
- {
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- return;
- }
-
- /*
- * If a unit is neither on a pCPU nor in the waitqueue, it means it was
-     * offline, and that it is now coming back online. If we're lucky,
- * and its previous resource is free (and affinities match), we can just
- * assign the unit to it (we own the proper lock already) and be done.
- */
- if ( per_cpu(npc, cpu).unit == NULL &&
- unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) )
- {
- if ( !has_soft_affinity(unit) ||
- unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) )
- {
- unit_assign(prv, unit, cpu);
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
- return;
- }
- }
-
- /*
- * If the resource is not free (or affinities do not match) we need
- * to assign unit to some other one, but we can't do it here, as:
- * - we don't own the proper lock,
- * - we can't change v->processor under vcpu_wake()'s feet.
- * So we add it to the waitqueue, and tickle all the free CPUs (if any)
- * on which unit can run. The first one that schedules will pick it up.
- */
- spin_lock(&prv->waitq_lock);
- list_add_tail(&nvc->waitq_elem, &prv->waitq);
- spin_unlock(&prv->waitq_lock);
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(unit->domain));
- cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
- &prv->cpus_free);
-
- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
- dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
- unit->domain->domain_id, unit->unit_id);
- else
- cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ);
-}
-
-static void null_unit_sleep(const struct scheduler *ops,
- struct sched_unit *unit)
-{
- struct null_private *prv = null_priv(ops);
- unsigned int cpu = sched_unit_master(unit);
- bool tickled = false;
-
- ASSERT(!is_idle_unit(unit));
-
- /*
- * Check if the unit is in the process of being offlined. If yes,
- * we need to remove it from either its pCPU or the waitqueue.
- */
- if ( unlikely(!is_unit_online(unit)) )
- {
- struct null_unit *nvc = null_unit(unit);
-
- if ( unlikely(!list_empty(&nvc->waitq_elem)) )
- {
- spin_lock(&prv->waitq_lock);
- list_del_init(&nvc->waitq_elem);
- spin_unlock(&prv->waitq_lock);
- }
- else if ( per_cpu(npc, cpu).unit == unit )
- tickled = unit_deassign(prv, unit);
- }
-
- /* If unit is not assigned to a pCPU, or is not running, no need to bother */
- if ( likely(!tickled && curr_on_cpu(cpu) == unit) )
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
-
- SCHED_STAT_CRANK(unit_sleep);
-}
-
-static struct sched_resource *
-null_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
-{
- ASSERT(!is_idle_unit(unit));
- return pick_res(null_priv(ops), unit);
-}
-
-static void null_unit_migrate(const struct scheduler *ops,
- struct sched_unit *unit, unsigned int new_cpu)
-{
- struct null_private *prv = null_priv(ops);
- struct null_unit *nvc = null_unit(unit);
-
- ASSERT(!is_idle_unit(unit));
-
- if ( sched_unit_master(unit) == new_cpu )
- return;
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint16_t unit, dom;
- uint16_t cpu, new_cpu;
- } d;
- d.dom = unit->domain->domain_id;
- d.unit = unit->unit_id;
- d.cpu = sched_unit_master(unit);
- d.new_cpu = new_cpu;
- __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d);
- }
-
- /*
- * If unit is assigned to a pCPU, then such pCPU becomes free, and we
- * should look in the waitqueue if anyone else can be assigned to it.
- */
- if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) )
- {
- unit_deassign(prv, unit);
- SCHED_STAT_CRANK(migrate_running);
- }
- else if ( !list_empty(&nvc->waitq_elem) )
- SCHED_STAT_CRANK(migrate_on_runq);
-
- SCHED_STAT_CRANK(migrated);
-
- /*
- * If a unit is (going) offline, we want it to be neither assigned
- * to a pCPU, nor in the waitqueue.
- *
- * If it was on a cpu, we've removed it from there above. If it is
- * in the waitqueue, we remove it from there now. And then we bail.
- */
- if ( unlikely(!is_unit_online(unit)) )
- {
- spin_lock(&prv->waitq_lock);
- list_del_init(&nvc->waitq_elem);
- spin_unlock(&prv->waitq_lock);
- goto out;
- }
-
- /*
- * Let's now consider new_cpu, which is where unit is being sent. It can be
- * either free, or have a unit already assigned to it.
- *
- * In the former case we should assign unit to it, and try to get it to run,
- * if possible, according to affinity.
- *
-     * In the latter, all we can do is park the unit in the waitqueue.
- */
- if ( per_cpu(npc, new_cpu).unit == NULL &&
- unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) )
- {
- /* unit might have been in the waitqueue, so remove it */
- spin_lock(&prv->waitq_lock);
- list_del_init(&nvc->waitq_elem);
- spin_unlock(&prv->waitq_lock);
-
- unit_assign(prv, unit, new_cpu);
- }
- else
- {
- /* Put unit in the waitqueue, if it wasn't there already */
- spin_lock(&prv->waitq_lock);
- if ( list_empty(&nvc->waitq_elem) )
- {
- list_add_tail(&nvc->waitq_elem, &prv->waitq);
- dprintk(XENLOG_G_WARNING,
- "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain,
- unit->unit_id);
- }
- spin_unlock(&prv->waitq_lock);
- }
-
- /*
-     * Whatever the above did, we always at least override v->processor.
-     * This is especially important for shutdown or suspend/resume paths,
-     * when it is important to let our caller (cpu_disable_scheduler())
-     * know that the migration did happen, to the best of our ability,
-     * at least. In case of suspend, any temporary inconsistency caused
-     * by this will be fixed up during resume.
- */
- out:
- sched_set_res(unit, get_sched_res(new_cpu));
-}
-
-#ifndef NDEBUG
-static inline void null_unit_check(struct sched_unit *unit)
-{
- struct null_unit * const nvc = null_unit(unit);
- struct null_dom * const ndom = unit->domain->sched_priv;
-
- BUG_ON(nvc->unit != unit);
-
- if ( ndom )
- BUG_ON(is_idle_unit(unit));
- else
- BUG_ON(!is_idle_unit(unit));
-
- SCHED_STAT_CRANK(unit_check);
-}
-#define NULL_UNIT_CHECK(unit) (null_unit_check(unit))
-#else
-#define NULL_UNIT_CHECK(unit)
-#endif
-
-
-/*
- * The simplest scheduling function of all time! We either return:
- * - the unit assigned to the pCPU, if there's one and it can run;
- * - the idle unit, otherwise.
- */
-static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
- s_time_t now, bool tasklet_work_scheduled)
-{
- unsigned int bs;
- const unsigned int cur_cpu = smp_processor_id();
- const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
- struct null_private *prv = null_priv(ops);
- struct null_unit *wvc;
-
- SCHED_STAT_CRANK(schedule);
- NULL_UNIT_CHECK(current->sched_unit);
-
- if ( unlikely(tb_init_done) )
- {
- struct {
- uint16_t tasklet, cpu;
- int16_t unit, dom;
- } d;
- d.cpu = cur_cpu;
- d.tasklet = tasklet_work_scheduled;
- if ( per_cpu(npc, sched_cpu).unit == NULL )
- {
- d.unit = d.dom = -1;
- }
- else
- {
- d.unit = per_cpu(npc, sched_cpu).unit->unit_id;
- d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id;
- }
- __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d);
- }
-
- if ( tasklet_work_scheduled )
- {
- trace_var(TRC_SNULL_TASKLET, 1, 0, NULL);
- prev->next_task = sched_idle_unit(sched_cpu);
- }
- else
- prev->next_task = per_cpu(npc, sched_cpu).unit;
- prev->next_time = -1;
-
- /*
-     * We may be new in the cpupool, or just coming back online, in which
-     * case there may be units in the waitqueue that we can assign to us
- * and run.
- */
- if ( unlikely(prev->next_task == NULL) )
- {
- bool unit_found;
-
- spin_lock(&prv->waitq_lock);
-
- if ( list_empty(&prv->waitq) )
- goto unlock;
-
- /*
- * We scan the waitqueue twice, for prioritizing units that have
- * soft-affinity with cpu. This may look like something expensive to
- * do here in null_schedule(), but it's actually fine, because we do
- * it only in cases where a pcpu has no unit associated (e.g., as
- * said above, the cpu has just joined a cpupool).
- */
- unit_found = false;
- for_each_affinity_balance_step( bs )
- {
- list_for_each_entry( wvc, &prv->waitq, waitq_elem )
- {
- if ( bs == BALANCE_SOFT_AFFINITY &&
- !has_soft_affinity(wvc->unit) )
- continue;
-
- if ( unit_check_affinity(wvc->unit, sched_cpu, bs) )
- {
- spinlock_t *lock;
-
- unit_found = true;
-
- /*
- * If the unit in the waitqueue has just come up online,
- * we risk racing with vcpu_wake(). To avoid this, sync
- * on the spinlock that vcpu_wake() holds, but only with
-                     * trylock, to avoid deadlock.
- */
- lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit));
-
- /*
- * We know the vcpu's lock is not this resource's lock. In
- * fact, if it were, since this cpu is free, vcpu_wake()
-                     * would have assigned the unit here directly.
- */
- ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock);
-
-                    if ( lock )
-                    {
- unit_assign(prv, wvc->unit, sched_cpu);
- list_del_init(&wvc->waitq_elem);
- prev->next_task = wvc->unit;
- spin_unlock(lock);
- goto unlock;
- }
- }
- }
- }
- /*
- * If we did find a unit with suitable affinity in the waitqueue, but
- * we could not pick it up (due to lock contention), and hence we are
-         * still free, plan for another try. In fact, we don't want such a
-         * unit to be stuck in the waitqueue, when there are free cpus where it
- * could run.
- */
- if ( unlikely( unit_found && prev->next_task == NULL &&
- !list_empty(&prv->waitq)) )
- cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ);
- unlock:
- spin_unlock(&prv->waitq_lock);
-
- if ( prev->next_task == NULL &&
- !cpumask_test_cpu(sched_cpu, &prv->cpus_free) )
- cpumask_set_cpu(sched_cpu, &prv->cpus_free);
- }
-
- if ( unlikely(prev->next_task == NULL ||
- !unit_runnable_state(prev->next_task)) )
- prev->next_task = sched_idle_unit(sched_cpu);
-
- NULL_UNIT_CHECK(prev->next_task);
-
- prev->next_task->migrated = false;
-}
-
-static inline void dump_unit(struct null_private *prv, struct null_unit *nvc)
-{
- printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id,
- nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ?
- sched_unit_master(nvc->unit) : -1);
-}
-
-static void null_dump_pcpu(const struct scheduler *ops, int cpu)
-{
- struct null_private *prv = null_priv(ops);
- struct null_unit *nvc;
- spinlock_t *lock;
- unsigned long flags;
-
- lock = pcpu_schedule_lock_irqsave(cpu, &flags);
-
- printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}",
- cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
- CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
- if ( per_cpu(npc, cpu).unit != NULL )
- printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain,
- per_cpu(npc, cpu).unit->unit_id);
- printk("\n");
-
- /* current unit (nothing to say if that's the idle unit) */
- nvc = null_unit(curr_on_cpu(cpu));
- if ( nvc && !is_idle_unit(nvc->unit) )
- {
- printk("\trun: ");
- dump_unit(prv, nvc);
- printk("\n");
- }
-
- pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
-}
-
-static void null_dump(const struct scheduler *ops)
-{
- struct null_private *prv = null_priv(ops);
- struct list_head *iter;
- unsigned long flags;
- unsigned int loop;
-
- spin_lock_irqsave(&prv->lock, flags);
-
- printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free));
-
- printk("Domain info:\n");
- loop = 0;
- list_for_each( iter, &prv->ndom )
- {
- struct null_dom *ndom;
- struct sched_unit *unit;
-
- ndom = list_entry(iter, struct null_dom, ndom_elem);
-
- printk("\tDomain: %d\n", ndom->dom->domain_id);
- for_each_sched_unit( ndom->dom, unit )
- {
- struct null_unit * const nvc = null_unit(unit);
- spinlock_t *lock;
-
- lock = unit_schedule_lock(unit);
-
- printk("\t%3d: ", ++loop);
- dump_unit(prv, nvc);
- printk("\n");
-
- unit_schedule_unlock(lock, unit);
- }
- }
-
- printk("Waitqueue: ");
- loop = 0;
- spin_lock(&prv->waitq_lock);
- list_for_each( iter, &prv->waitq )
- {
- struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem);
-
- if ( loop++ != 0 )
- printk(", ");
- if ( loop % 24 == 0 )
- printk("\n\t");
- printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id);
- }
- printk("\n");
- spin_unlock(&prv->waitq_lock);
-
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static const struct scheduler sched_null_def = {
- .name = "null Scheduler",
- .opt_name = "null",
- .sched_id = XEN_SCHEDULER_NULL,
- .sched_data = NULL,
-
- .init = null_init,
- .deinit = null_deinit,
- .init_pdata = null_init_pdata,
- .switch_sched = null_switch_sched,
- .deinit_pdata = null_deinit_pdata,
-
- .alloc_udata = null_alloc_udata,
- .free_udata = null_free_udata,
- .alloc_domdata = null_alloc_domdata,
- .free_domdata = null_free_domdata,
-
- .insert_unit = null_unit_insert,
- .remove_unit = null_unit_remove,
-
- .wake = null_unit_wake,
- .sleep = null_unit_sleep,
- .pick_resource = null_res_pick,
- .migrate = null_unit_migrate,
- .do_schedule = null_schedule,
-
- .dump_cpu_state = null_dump_pcpu,
- .dump_settings = null_dump,
-};
-
-REGISTER_SCHEDULER(sched_null_def);
+++ /dev/null
-/*****************************************************************************
- * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
- * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
- *
- * by Sisu Xi, 2013, Washington University in Saint Louis
- * Meng Xu, 2014-2016, University of Pennsylvania
- *
- * Conversion toward event driven model by Tianyang Chen
- * and Dagaen Golomb, 2016, University of Pennsylvania
- *
- * based on the code of credit Scheduler
- */
-
-#include <xen/init.h>
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/domain.h>
-#include <xen/delay.h>
-#include <xen/event.h>
-#include <xen/time.h>
-#include <xen/timer.h>
-#include <xen/perfc.h>
-#include <xen/sched-if.h>
-#include <xen/softirq.h>
-#include <asm/atomic.h>
-#include <xen/errno.h>
-#include <xen/trace.h>
-#include <xen/cpu.h>
-#include <xen/keyhandler.h>
-#include <xen/trace.h>
-#include <xen/err.h>
-#include <xen/guest_access.h>
-
-/*
- * TODO:
- *
- * Migration compensation and resistance, like credit2, to make better use of the cache;
- * Lock Holder Problem, using yield?
- * Self switch problem: UNITs of the same domain may preempt each other;
- */
-
-/*
- * Design:
- *
- * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
- * theory from the real-time field.
- * At any scheduling point, the UNIT with the earlier deadline has higher
- * priority. The scheduler always picks the highest priority UNIT to run on a
- * feasible PCPU.
- * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle
- * or has a lower-priority UNIT running on it).
- *
- * Each UNIT has a dedicated period, budget and an extratime flag.
- * The deadline of a UNIT is at the end of each period;
- * A UNIT has its budget replenished at the beginning of each period;
- * While scheduled, a UNIT burns its budget.
- * The UNIT needs to finish its budget before its deadline in each period;
- * The UNIT discards its unused budget at the end of each period.
- * When a UNIT runs out of budget in a period, if its extratime flag is set,
- * the UNIT increases its priority_level by 1 and refills its budget;
- * otherwise, it has to wait until the next period.
- *
- * Each UNIT is implemented as a deferrable server.
- * When a UNIT has a task running on it, its budget is continuously burned;
- * When a UNIT has no task but budget left, its budget is preserved.
- *
- * Queue scheme:
- * A global runqueue and a global depletedqueue for each CPU pool.
- * The runqueue holds all runnable UNITs with budget,
- * sorted by priority_level and deadline;
- * The depletedqueue holds all UNITs without budget, unsorted;
- *
- * Note: cpumask and cpupool are supported.
- */
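A worked example of the reservation model just described (the numbers are illustrative only): a UNIT with period = 10 ms and budget = 4 ms is guaranteed 4 ms of CPU time in every 10 ms window, i.e. a 40% reservation of one pCPU (utilization = budget / period = 4/10). If that budget runs out 6 ms into a period and the extratime flag is set, the UNIT keeps running with a refilled budget at priority_level 1, below every UNIT still inside its reservation; without extratime it simply waits for the replenishment at the next period boundary, 4 ms later.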
-
-/*
- * Locking:
- * A global system lock is used to protect the RunQ and DepletedQ.
- * The global lock is referenced by sched_res->schedule_lock
- * from all physical cpus.
- *
- * The lock is already grabbed when calling the wake/sleep/schedule functions
- * in schedule.c
- *
- * The functions that involve the RunQ and need to grab the lock are:
- * unit_insert, unit_remove, context_saved, runq_insert
- */
-
-
-/*
- * Default parameters:
- * The default period and budget are 10 ms and 4 ms, respectively.
- */
-#define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
-#define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
-
-/*
- * Max period: max delta of time type, because period is added to the time
- * a unit activates, so this must not overflow.
- * Min period: 10 us, considering the scheduling overhead (when period is
- * too low, scheduling is invoked too frequently, causing high overhead).
- */
-#define RTDS_MAX_PERIOD (STIME_DELTA_MAX)
-#define RTDS_MIN_PERIOD (MICROSECS(10))
-
-/*
- * Min budget: 10 us, considering the scheduling overhead (when budget is
- * consumed too fast, scheduling is invoked too frequently, causing
- * high overhead).
- */
-#define RTDS_MIN_BUDGET (MICROSECS(10))
-
-/*
- * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
- * the next deadline, repeated addition is faster if the difference
- * between cur_deadline and now is small. If the difference is bigger than
- * 1024 * period, a division is used instead.
- */
-#define UPDATE_LIMIT_SHIFT 10
-
-/*
- * Flags
- */
-/*
- * RTDS_scheduled: Is this unit either running on, or context-switching off,
- * a physical cpu?
- * + Accessed only with global lock held.
- * + Set when chosen as next in rt_schedule().
- * + Cleared after context switch has been saved in rt_context_saved()
- * + Checked in unit_wake to see if we can add to the Runqueue, or if we should
- * set RTDS_delayed_runq_add
- * + Checked to be false in runq_insert.
- */
-#define __RTDS_scheduled 1
-#define RTDS_scheduled (1<<__RTDS_scheduled)
-/*
- * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
- * once it's done being context switched out?
- * + Set when scheduling out in rt_schedule() if prev is runnable
- * + Set in rt_unit_wake if it finds RTDS_scheduled set
- * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
- * and clears the bit.
- */
-#define __RTDS_delayed_runq_add 2
-#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
-
-/*
- * RTDS_depleted: Has this unit run out of budget?
- * This flag is
- * + set in burn_budget() if a unit has zero budget left;
- * + cleared and checked in the replenishment handler,
- * for the units that are being replenished.
- */
-#define __RTDS_depleted 3
-#define RTDS_depleted (1<<__RTDS_depleted)
-
-/*
- * RTDS_extratime: Can the unit run in the time that is
- * not part of any real-time reservation, and would therefore
- * be otherwise left idle?
- */
-#define __RTDS_extratime 4
-#define RTDS_extratime (1<<__RTDS_extratime)
-
-/*
- * rt tracing events ("only" 512 available!). Check
- * include/public/trace.h for more details.
- */
-#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
-#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
-#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
-#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
-#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
-#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6)
-
-static void repl_timer_handler(void *data);
-
-/*
- * System-wide private data, including the global RunQueue/DepletedQ.
- * Global lock is referenced by sched_res->schedule_lock from all
- * physical cpus. It can be grabbed via unit_schedule_lock_irq()
- */
-struct rt_private {
- spinlock_t lock; /* the global coarse-grained lock */
-    struct list_head sdom;      /* list of available domains, used for dump */
-
- struct list_head runq; /* ordered list of runnable units */
- struct list_head depletedq; /* unordered list of depleted units */
-
- struct timer repl_timer; /* replenishment timer */
- struct list_head replq; /* ordered list of units that need replenishment */
-
- cpumask_t tickled; /* cpus been tickled */
-};
-
-/*
- * Virtual CPU
- */
-struct rt_unit {
- struct list_head q_elem; /* on the runq/depletedq list */
- struct list_head replq_elem; /* on the replenishment events list */
-
- /* UNIT parameters, in nanoseconds */
- s_time_t period;
- s_time_t budget;
-
-    /* UNIT current information, in nanoseconds */
- s_time_t cur_budget; /* current budget */
- s_time_t last_start; /* last start time */
- s_time_t cur_deadline; /* current deadline for EDF */
-
- /* Up-pointers */
- struct rt_dom *sdom;
- struct sched_unit *unit;
-
- unsigned priority_level;
-
- unsigned flags; /* mark __RTDS_scheduled, etc.. */
-};
-
-/*
- * Domain
- */
-struct rt_dom {
- struct list_head sdom_elem; /* link list on rt_priv */
- struct domain *dom; /* pointer to upper domain */
-};
-
-/*
- * Useful inline functions
- */
-static inline struct rt_private *rt_priv(const struct scheduler *ops)
-{
- return ops->sched_data;
-}
-
-static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
-{
- return unit->priv;
-}
-
-static inline struct list_head *rt_runq(const struct scheduler *ops)
-{
- return &rt_priv(ops)->runq;
-}
-
-static inline struct list_head *rt_depletedq(const struct scheduler *ops)
-{
- return &rt_priv(ops)->depletedq;
-}
-
-static inline struct list_head *rt_replq(const struct scheduler *ops)
-{
- return &rt_priv(ops)->replq;
-}
-
-static inline bool has_extratime(const struct rt_unit *svc)
-{
- return svc->flags & RTDS_extratime;
-}
-
-/*
- * Helper functions for manipulating the runqueue, the depleted queue,
- * and the replenishment events queue.
- */
-static int
-unit_on_q(const struct rt_unit *svc)
-{
- return !list_empty(&svc->q_elem);
-}
-
-static struct rt_unit *
-q_elem(struct list_head *elem)
-{
- return list_entry(elem, struct rt_unit, q_elem);
-}
-
-static struct rt_unit *
-replq_elem(struct list_head *elem)
-{
- return list_entry(elem, struct rt_unit, replq_elem);
-}
-
-static int
-unit_on_replq(const struct rt_unit *svc)
-{
- return !list_empty(&svc->replq_elem);
-}
-
-/*
- * Returns a positive value if v1 has higher priority than v2
- * (lower priority_level or, at equal level, earlier deadline);
- * zero if they are equal, and a negative value otherwise.
- */
-static s_time_t
-compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
-{
- int prio = v2->priority_level - v1->priority_level;
-
- if ( prio == 0 )
- return v2->cur_deadline - v1->cur_deadline;
-
- return prio;
-}
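A worked example of the convention above (hypothetical values): with v1 = {priority_level = 0, cur_deadline = 20 ms} and v2 = {priority_level = 1, cur_deadline = 5 ms}, prio = 1 - 0 = 1 > 0, so v1 is considered higher priority despite its later deadline: a lower priority_level (a UNIT still inside its reservation) always beats the extratime levels, and only at equal levels does the earlier deadline (plain EDF) decide.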
-
-/*
- * Debug related code, dump unit/cpu information
- */
-static void
-rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
-{
- cpumask_t *cpupool_mask, *mask;
-
- ASSERT(svc != NULL);
- /* idle unit */
-    if ( svc->sdom == NULL )
- {
- printk("\n");
- return;
- }
-
- /*
- * We can't just use 'cpumask_scratch' because the dumping can
- * happen from a pCPU outside of this scheduler's cpupool, and
- * hence it's not right to use its pCPU's scratch mask.
- * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
- * own scratch space, since we hold the runqueue lock.
- */
- mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));
-
- cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
- cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
- printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
- " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
- " \t\t priority_level=%d has_extratime=%d\n"
- " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
- svc->unit->domain->domain_id,
- svc->unit->unit_id,
- sched_unit_master(svc->unit),
- svc->period,
- svc->budget,
- svc->cur_budget,
- svc->cur_deadline,
- svc->last_start,
- svc->priority_level,
- has_extratime(svc),
- unit_on_q(svc),
- unit_runnable(svc->unit),
- svc->flags, CPUMASK_PR(mask));
-}
-
-static void
-rt_dump_pcpu(const struct scheduler *ops, int cpu)
-{
- struct rt_private *prv = rt_priv(ops);
- struct rt_unit *svc;
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
- printk("CPU[%02d]\n", cpu);
- /* current UNIT (nothing to say if that's the idle unit). */
- svc = rt_unit(curr_on_cpu(cpu));
- if ( svc && !is_idle_unit(svc->unit) )
- {
- rt_dump_unit(ops, svc);
- }
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static void
-rt_dump(const struct scheduler *ops)
-{
- struct list_head *runq, *depletedq, *replq, *iter;
- struct rt_private *prv = rt_priv(ops);
- struct rt_unit *svc;
- struct rt_dom *sdom;
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
-
- if ( list_empty(&prv->sdom) )
- goto out;
-
- runq = rt_runq(ops);
- depletedq = rt_depletedq(ops);
- replq = rt_replq(ops);
-
- printk("Global RunQueue info:\n");
- list_for_each ( iter, runq )
- {
- svc = q_elem(iter);
- rt_dump_unit(ops, svc);
- }
-
- printk("Global DepletedQueue info:\n");
- list_for_each ( iter, depletedq )
- {
- svc = q_elem(iter);
- rt_dump_unit(ops, svc);
- }
-
- printk("Global Replenishment Events info:\n");
- list_for_each ( iter, replq )
- {
- svc = replq_elem(iter);
- rt_dump_unit(ops, svc);
- }
-
- printk("Domain info:\n");
- list_for_each ( iter, &prv->sdom )
- {
- struct sched_unit *unit;
-
- sdom = list_entry(iter, struct rt_dom, sdom_elem);
- printk("\tdomain: %d\n", sdom->dom->domain_id);
-
- for_each_sched_unit ( sdom->dom, unit )
- {
- svc = rt_unit(unit);
- rt_dump_unit(ops, svc);
- }
- }
-
- out:
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-/*
- * Update the deadline and budget when now >= cur_deadline;
- * the deadline needs to be moved to the end of the current period.
- */
-static void
-rt_update_deadline(s_time_t now, struct rt_unit *svc)
-{
- ASSERT(now >= svc->cur_deadline);
- ASSERT(svc->period != 0);
-
- if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
- {
- do
- svc->cur_deadline += svc->period;
- while ( svc->cur_deadline <= now );
- }
- else
- {
- long count = ((now - svc->cur_deadline) / svc->period) + 1;
- svc->cur_deadline += count * svc->period;
- }
-
- /*
-     * svc may be scheduled to run immediately after it misses its deadline.
-     * Then rt_update_deadline is called before rt_schedule, which
-     * should only deduct the time spent in the current period from the budget.
- */
- svc->last_start = now;
- svc->cur_budget = svc->budget;
- svc->priority_level = 0;
-
- /* TRACE */
- {
- struct __packed {
- unsigned unit:16, dom:16;
- unsigned priority_level;
- uint64_t cur_deadline, cur_budget;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.priority_level = svc->priority_level;
- d.cur_deadline = (uint64_t) svc->cur_deadline;
- d.cur_budget = (uint64_t) svc->cur_budget;
- trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
- sizeof(d),
- (unsigned char *) &d);
- }
-
- return;
-}
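To make the arithmetic above easier to follow, here is a minimal, self-contained sketch of the same two update paths. The function name update_deadline, the int64_t stand-in for s_time_t and the numeric values are mine, for illustration only; this is not part of the moved file.

    /* Standalone illustration of rt_update_deadline()'s arithmetic. */
    #include <inttypes.h>
    #include <stdio.h>

    #define UPDATE_LIMIT_SHIFT 10

    static int64_t update_deadline(int64_t now, int64_t deadline, int64_t period)
    {
        if ( deadline + (period << UPDATE_LIMIT_SHIFT) > now )
        {
            /* Missed by less than 1024 periods: repeated addition is cheap. */
            do
                deadline += period;
            while ( deadline <= now );
        }
        else
        {
            /* Missed by a lot: one division plus one multiplication. */
            int64_t count = ((now - deadline) / period) + 1;
            deadline += count * period;
        }
        return deadline;
    }

    int main(void)
    {
        /* Period 10, deadline 100, now 125: addition path, prints 130. */
        printf("%" PRId64 "\n", update_deadline(125, 100, 10));
        /* Deadline missed by ~20000 periods: division path, prints 200010. */
        printf("%" PRId64 "\n", update_deadline(200000, 100, 10));
        return 0;
    }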
-
-/*
- * Helpers for removing and inserting a unit in a queue
- * that is being kept ordered by the units' deadlines (as EDF
- * mandates).
- *
- * For callers' convenience, the unit removing helper returns
- * true if the unit removed was the one at the front of the
- * queue; similarly, the inserting helper returns true if the
- * inserted unit ended up at the front of the queue (i.e., in both
- * cases, if the unit with the earliest deadline is what we
- * are dealing with).
- */
-static inline bool
-deadline_queue_remove(struct list_head *queue, struct list_head *elem)
-{
- int pos = 0;
-
- if ( queue->next != elem )
- pos = 1;
-
- list_del_init(elem);
- return !pos;
-}
-
-static inline bool
-deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *),
- struct rt_unit *svc, struct list_head *elem,
- struct list_head *queue)
-{
- struct list_head *iter;
- int pos = 0;
-
- list_for_each ( iter, queue )
- {
- struct rt_unit * iter_svc = (*qelem)(iter);
- if ( compare_unit_priority(svc, iter_svc) > 0 )
- break;
- pos++;
- }
- list_add_tail(elem, iter);
- return !pos;
-}
-#define deadline_runq_insert(...) \
- deadline_queue_insert(&q_elem, ##__VA_ARGS__)
-#define deadline_replq_insert(...) \
- deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
-
-static inline void
-q_remove(struct rt_unit *svc)
-{
- ASSERT( unit_on_q(svc) );
- list_del_init(&svc->q_elem);
-}
-
-static inline void
-replq_remove(const struct scheduler *ops, struct rt_unit *svc)
-{
- struct rt_private *prv = rt_priv(ops);
- struct list_head *replq = rt_replq(ops);
-
- ASSERT( unit_on_replq(svc) );
-
- if ( deadline_queue_remove(replq, &svc->replq_elem) )
- {
- /*
- * The replenishment timer needs to be set to fire when a
- * replenishment for the unit at the front of the replenishment
-         * queue is due. If the unit we just removed was that one, we may
- * need to reprogram the timer.
- */
- if ( !list_empty(replq) )
- {
- struct rt_unit *svc_next = replq_elem(replq->next);
- set_timer(&prv->repl_timer, svc_next->cur_deadline);
- }
- else
- stop_timer(&prv->repl_timer);
- }
-}
-
-/*
- * Insert svc with budget in RunQ according to EDF:
- * units with smaller deadlines go first.
- * Insert svc without budget in DepletedQ unsorted;
- */
-static void
-runq_insert(const struct scheduler *ops, struct rt_unit *svc)
-{
- struct rt_private *prv = rt_priv(ops);
- struct list_head *runq = rt_runq(ops);
-
- ASSERT( spin_is_locked(&prv->lock) );
- ASSERT( !unit_on_q(svc) );
- ASSERT( unit_on_replq(svc) );
-
- /* add svc to runq if svc still has budget or its extratime is set */
- if ( svc->cur_budget > 0 ||
- has_extratime(svc) )
- deadline_runq_insert(svc, &svc->q_elem, runq);
- else
- list_add(&svc->q_elem, &prv->depletedq);
-}
-
-static void
-replq_insert(const struct scheduler *ops, struct rt_unit *svc)
-{
- struct list_head *replq = rt_replq(ops);
- struct rt_private *prv = rt_priv(ops);
-
- ASSERT( !unit_on_replq(svc) );
-
- /*
- * The timer may be re-programmed if svc is inserted
- * at the front of the event list.
- */
- if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
- set_timer(&prv->repl_timer, svc->cur_deadline);
-}
-
-/*
- * Removes and re-inserts an event to the replenishment queue.
- * The aim is to update its position inside the queue, as its
- * deadline (and hence its replenishment time) could have
- * changed.
- */
-static void
-replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
-{
- struct list_head *replq = rt_replq(ops);
- struct rt_unit *rearm_svc = svc;
- bool_t rearm = 0;
-
- ASSERT( unit_on_replq(svc) );
-
- /*
- * If svc was at the front of the replenishment queue, we certainly
- * need to re-program the timer, and we want to use the deadline of
- * the unit which is now at the front of the queue (which may still
- * be svc or not).
- *
- * We may also need to re-program, if svc has been put at the front
- * of the replenishment queue when being re-inserted.
- */
- if ( deadline_queue_remove(replq, &svc->replq_elem) )
- {
- deadline_replq_insert(svc, &svc->replq_elem, replq);
- rearm_svc = replq_elem(replq->next);
- rearm = 1;
- }
- else
- rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
-
- if ( rearm )
- set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
-}
-
-/*
- * Pick a valid resource for the unit.
- * A valid resource for a unit is the intersection of the unit's affinity
- * and the available resources.
- */
-static struct sched_resource *
-rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
-{
- cpumask_t cpus;
- cpumask_t *online;
- int cpu;
-
- online = cpupool_domain_master_cpumask(unit->domain);
- cpumask_and(&cpus, online, unit->cpu_hard_affinity);
-
- cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus)
- ? sched_unit_master(unit)
- : cpumask_cycle(sched_unit_master(unit), &cpus);
- ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
-
- return get_sched_res(cpu);
-}
-
-/*
- * Init/Free related code
- */
-static int
-rt_init(struct scheduler *ops)
-{
- int rc = -ENOMEM;
- struct rt_private *prv = xzalloc(struct rt_private);
-
- printk("Initializing RTDS scheduler\n"
- "WARNING: This is experimental software in development.\n"
- "Use at your own risk.\n");
-
- if ( prv == NULL )
- goto err;
-
- spin_lock_init(&prv->lock);
- INIT_LIST_HEAD(&prv->sdom);
- INIT_LIST_HEAD(&prv->runq);
- INIT_LIST_HEAD(&prv->depletedq);
- INIT_LIST_HEAD(&prv->replq);
-
- ops->sched_data = prv;
- rc = 0;
-
- err:
- if ( rc )
- xfree(prv);
-
- return rc;
-}
-
-static void
-rt_deinit(struct scheduler *ops)
-{
- struct rt_private *prv = rt_priv(ops);
-
- ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
- prv->repl_timer.status == TIMER_STATUS_killed);
-
- ops->sched_data = NULL;
- xfree(prv);
-}
-
-/*
- * Point per_cpu spinlock to the global system lock;
- * All cpus share the same global system lock.
- */
-static void
-rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
-{
- struct rt_private *prv = rt_priv(ops);
- spinlock_t *old_lock;
- unsigned long flags;
-
- old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
-
- /*
- * TIMER_STATUS_invalid means we are the first cpu that sees the timer
- * allocated but not initialized, and so it's up to us to initialize it.
- */
- if ( prv->repl_timer.status == TIMER_STATUS_invalid )
- {
- init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu);
- dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
- }
-
- /* Move the scheduler lock to our global runqueue lock. */
- get_sched_res(cpu)->schedule_lock = &prv->lock;
-
- /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
- spin_unlock_irqrestore(old_lock, flags);
-}
-
-/* Change the scheduler of cpu to us (RTDS). */
-static spinlock_t *
-rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
- void *pdata, void *vdata)
-{
- struct rt_private *prv = rt_priv(new_ops);
- struct rt_unit *svc = vdata;
-
- ASSERT(!pdata && svc && is_idle_unit(svc->unit));
-
- /*
- * We are holding the runqueue lock already (it's been taken in
- * schedule_cpu_switch()). It's actually the runqueue lock of
- * another scheduler, but that is how things need to be, for
- * preventing races.
- */
- ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);
-
- /*
- * If we are the absolute first cpu being switched toward this
- * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
- * first one that is added back to the cpupool that had all its cpus
- * removed (in which case we'll see TIMER_STATUS_killed), it's our
- * job to (re)initialize the timer.
- */
- if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
- prv->repl_timer.status == TIMER_STATUS_killed )
- {
- init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
- dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
- }
-
- sched_idle_unit(cpu)->priv = vdata;
-
- return &prv->lock;
-}
-
-static void
-rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-{
- unsigned long flags;
- struct rt_private *prv = rt_priv(ops);
-
- spin_lock_irqsave(&prv->lock, flags);
-
- if ( prv->repl_timer.cpu == cpu )
- {
- cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid;
- unsigned int new_cpu = cpumask_cycle(cpu, online);
-
- /*
-         * Make sure the timer runs on one of the cpus that are still available
-         * to this scheduler. If there aren't any left, it means it's time
-         * to just kill it.
- */
- if ( new_cpu >= nr_cpu_ids )
- {
- kill_timer(&prv->repl_timer);
- dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
- }
- else
- {
- migrate_timer(&prv->repl_timer, new_cpu);
- }
- }
-
- spin_unlock_irqrestore(&prv->lock, flags);
-}
-
-static void *
-rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
-{
- unsigned long flags;
- struct rt_dom *sdom;
- struct rt_private * prv = rt_priv(ops);
-
- sdom = xzalloc(struct rt_dom);
- if ( sdom == NULL )
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&sdom->sdom_elem);
- sdom->dom = dom;
-
- /* spinlock here to insert the dom */
- spin_lock_irqsave(&prv->lock, flags);
- list_add_tail(&sdom->sdom_elem, &(prv->sdom));
- spin_unlock_irqrestore(&prv->lock, flags);
-
- return sdom;
-}
-
-static void
-rt_free_domdata(const struct scheduler *ops, void *data)
-{
- struct rt_dom *sdom = data;
- struct rt_private *prv = rt_priv(ops);
-
- if ( sdom )
- {
- unsigned long flags;
-
- spin_lock_irqsave(&prv->lock, flags);
- list_del_init(&sdom->sdom_elem);
- spin_unlock_irqrestore(&prv->lock, flags);
-
- xfree(sdom);
- }
-}
-
-static void *
-rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
-{
- struct rt_unit *svc;
-
- /* Allocate per-UNIT info */
- svc = xzalloc(struct rt_unit);
- if ( svc == NULL )
- return NULL;
-
- INIT_LIST_HEAD(&svc->q_elem);
- INIT_LIST_HEAD(&svc->replq_elem);
- svc->flags = 0U;
- svc->sdom = dd;
- svc->unit = unit;
- svc->last_start = 0;
-
- __set_bit(__RTDS_extratime, &svc->flags);
- svc->priority_level = 0;
- svc->period = RTDS_DEFAULT_PERIOD;
- if ( !is_idle_unit(unit) )
- svc->budget = RTDS_DEFAULT_BUDGET;
-
- SCHED_STAT_CRANK(unit_alloc);
-
- return svc;
-}
-
-static void
-rt_free_udata(const struct scheduler *ops, void *priv)
-{
- struct rt_unit *svc = priv;
-
- xfree(svc);
-}
-
-/*
- * Called from sched_move_domain() and sched_init_vcpu() in schedule.c,
- * when moving a domain to a new cpupool.
- * It inserts the units of the moving domain into the scheduler's RunQ in
- * the destination cpupool.
- */
-static void
-rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct rt_unit *svc = rt_unit(unit);
- s_time_t now;
- spinlock_t *lock;
-
- BUG_ON( is_idle_unit(unit) );
-
- /* This is safe because unit isn't yet being scheduled */
- sched_set_res(unit, rt_res_pick(ops, unit));
-
- lock = unit_schedule_lock_irq(unit);
-
- now = NOW();
- if ( now >= svc->cur_deadline )
- rt_update_deadline(now, svc);
-
- if ( !unit_on_q(svc) && unit_runnable(unit) )
- {
- replq_insert(ops, svc);
-
- if ( !unit->is_running )
- runq_insert(ops, svc);
- }
- unit_schedule_unlock_irq(lock, unit);
-
- SCHED_STAT_CRANK(unit_insert);
-}
-
-/*
- * Remove rt_unit svc from the old scheduler in source cpupool.
- */
-static void
-rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct rt_unit * const svc = rt_unit(unit);
- struct rt_dom * const sdom = svc->sdom;
- spinlock_t *lock;
-
- SCHED_STAT_CRANK(unit_remove);
-
- BUG_ON( sdom == NULL );
-
- lock = unit_schedule_lock_irq(unit);
- if ( unit_on_q(svc) )
- q_remove(svc);
-
- if ( unit_on_replq(svc) )
- replq_remove(ops,svc);
-
- unit_schedule_unlock_irq(lock, unit);
-}
-
-/*
- * Burn budget at nanosecond granularity
- */
-static void
-burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
-{
- s_time_t delta;
-
- /* don't burn budget for idle UNIT */
- if ( is_idle_unit(svc->unit) )
- return;
-
- /* burn at nanoseconds level */
- delta = now - svc->last_start;
- /*
- * delta < 0 only happens in nested virtualization;
- * TODO: how should we handle delta < 0 in a better way?
- */
- if ( delta < 0 )
- {
- printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
- __func__, delta);
- svc->last_start = now;
- return;
- }
-
- svc->cur_budget -= delta;
- svc->last_start = now;
-
- if ( svc->cur_budget <= 0 )
- {
- if ( has_extratime(svc) )
- {
- svc->priority_level++;
- svc->cur_budget = svc->budget;
- }
- else
- {
- svc->cur_budget = 0;
- __set_bit(__RTDS_depleted, &svc->flags);
- }
- }
-
- /* TRACE */
- {
- struct __packed {
- unsigned unit:16, dom:16;
- uint64_t cur_budget;
- int delta;
- unsigned priority_level;
- bool has_extratime;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.cur_budget = (uint64_t) svc->cur_budget;
- d.delta = delta;
- d.priority_level = svc->priority_level;
- d.has_extratime = svc->flags & RTDS_extratime;
- trace_var(TRC_RTDS_BUDGET_BURN, 1,
- sizeof(d),
- (unsigned char *) &d);
- }
-}
-
-/*
- * The RunQ is sorted. Pick the first unit within the cpumask. If none, return NULL.
- * The lock is grabbed before calling this function.
- */
-static struct rt_unit *
-runq_pick(const struct scheduler *ops, const cpumask_t *mask)
-{
- struct list_head *runq = rt_runq(ops);
- struct list_head *iter;
- struct rt_unit *svc = NULL;
- struct rt_unit *iter_svc = NULL;
- cpumask_t cpu_common;
- cpumask_t *online;
-
- list_for_each ( iter, runq )
- {
- iter_svc = q_elem(iter);
-
- /* mask cpu_hard_affinity & cpupool & mask */
- online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
- cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity);
- cpumask_and(&cpu_common, mask, &cpu_common);
- if ( cpumask_empty(&cpu_common) )
- continue;
-
- ASSERT( iter_svc->cur_budget > 0 );
-
- svc = iter_svc;
- break;
- }
-
- /* TRACE */
- {
- if( svc != NULL )
- {
- struct __packed {
- unsigned unit:16, dom:16;
- uint64_t cur_deadline, cur_budget;
- } d;
- d.dom = svc->unit->domain->domain_id;
- d.unit = svc->unit->unit_id;
- d.cur_deadline = (uint64_t) svc->cur_deadline;
- d.cur_budget = (uint64_t) svc->cur_budget;
- trace_var(TRC_RTDS_RUNQ_PICK, 1,
- sizeof(d),
- (unsigned char *) &d);
- }
- }
-
- return svc;
-}
-
-/*
- * schedule function for rt scheduler.
- * The lock is already grabbed in schedule.c, no need to lock here
- */
-static void
-rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
- s_time_t now, bool tasklet_work_scheduled)
-{
- const unsigned int cur_cpu = smp_processor_id();
- const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
- struct rt_private *prv = rt_priv(ops);
- struct rt_unit *const scurr = rt_unit(currunit);
- struct rt_unit *snext = NULL;
- bool migrated = false;
-
- /* TRACE */
- {
- struct __packed {
- unsigned cpu:16, tasklet:8, tickled:4, idle:4;
- } d;
- d.cpu = cur_cpu;
- d.tasklet = tasklet_work_scheduled;
- d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled);
- d.idle = is_idle_unit(currunit);
- trace_var(TRC_RTDS_SCHEDULE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
-    /* Clear the tickled bit now that we've been scheduled */
- cpumask_clear_cpu(sched_cpu, &prv->tickled);
-
-    /* burn_budget() returns early for the idle UNIT */
- burn_budget(ops, scurr, now);
-
- if ( tasklet_work_scheduled )
- {
- trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL);
- snext = rt_unit(sched_idle_unit(sched_cpu));
- }
- else
- {
- snext = runq_pick(ops, cpumask_of(sched_cpu));
-
- if ( snext == NULL )
- snext = rt_unit(sched_idle_unit(sched_cpu));
- else if ( !unit_runnable_state(snext->unit) )
- {
- q_remove(snext);
- snext = rt_unit(sched_idle_unit(sched_cpu));
- }
-
- /* if scurr has higher priority and budget, still pick scurr */
- if ( !is_idle_unit(currunit) &&
- unit_runnable_state(currunit) &&
- scurr->cur_budget > 0 &&
- ( is_idle_unit(snext->unit) ||
- compare_unit_priority(scurr, snext) > 0 ) )
- snext = scurr;
- }
-
- if ( snext != scurr &&
- !is_idle_unit(currunit) &&
- unit_runnable(currunit) )
- __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
-
- snext->last_start = now;
- currunit->next_time = -1; /* if an idle unit is picked */
- if ( !is_idle_unit(snext->unit) )
- {
- if ( snext != scurr )
- {
- q_remove(snext);
- __set_bit(__RTDS_scheduled, &snext->flags);
- }
- if ( sched_unit_master(snext->unit) != sched_cpu )
- {
- sched_set_res(snext->unit, get_sched_res(sched_cpu));
- migrated = true;
- }
- /* Invoke the scheduler next time. */
- currunit->next_time = snext->cur_budget;
- }
- currunit->next_task = snext->unit;
- snext->unit->migrated = migrated;
-}
-
-/*
- * Remove UNIT from RunQ
- * The lock is already grabbed in schedule.c, no need to lock here
- */
-static void
-rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct rt_unit * const svc = rt_unit(unit);
-
- BUG_ON( is_idle_unit(unit) );
- SCHED_STAT_CRANK(unit_sleep);
-
- if ( curr_on_cpu(sched_unit_master(unit)) == unit )
- cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
- else if ( unit_on_q(svc) )
- {
- q_remove(svc);
- replq_remove(ops, svc);
- }
- else if ( svc->flags & RTDS_delayed_runq_add )
- __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
-}
-
-/*
- * Pick a cpu on which to run a unit,
- * possibly kicking out the unit running there.
- * Called by wake() and context_saved().
- * We have a running candidate here; the kick logic is:
- * Among all the cpus that are within the cpu affinity:
- * 1) if there are any idle CPUs, kick one.
- *    For cache benefit, we check new->cpu first.
- * 2) if all pcpus are busy,
- *    among all the running units, pick the lowest priority one;
- *    if snext has higher priority, kick it out.
- *
- * TODO:
- * 1) what if the two units belong to the same domain?
- *    replacing a unit belonging to the same domain introduces more overhead
- *
- * The lock is grabbed before calling this function.
- */
-static void
-runq_tickle(const struct scheduler *ops, struct rt_unit *new)
-{
- struct rt_private *prv = rt_priv(ops);
- struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
- struct rt_unit *iter_svc;
- struct sched_unit *iter_unit;
- int cpu = 0, cpu_to_tickle = 0;
- cpumask_t not_tickled;
- cpumask_t *online;
-
- if ( new == NULL || is_idle_unit(new->unit) )
- return;
-
- online = cpupool_domain_master_cpumask(new->unit->domain);
-    cpumask_and(&not_tickled, online, new->unit->cpu_hard_affinity);
-    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
-
- /*
- * 1) If there are any idle CPUs, kick one.
-     *    For cache benefit, we first search new->cpu.
-     *    The same loop also finds the one with the lowest priority.
- */
-    cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), &not_tickled);
-    while ( cpu != nr_cpu_ids )
- {
- iter_unit = curr_on_cpu(cpu);
- if ( is_idle_unit(iter_unit) )
- {
- SCHED_STAT_CRANK(tickled_idle_cpu);
- cpu_to_tickle = cpu;
- goto out;
- }
- iter_svc = rt_unit(iter_unit);
- if ( latest_deadline_unit == NULL ||
- compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
- latest_deadline_unit = iter_svc;
-
-        cpumask_clear_cpu(cpu, &not_tickled);
-        cpu = cpumask_cycle(cpu, &not_tickled);
- }
-
-    /* 2) candidate has higher priority, kick out the lowest priority unit */
- if ( latest_deadline_unit != NULL &&
- compare_unit_priority(latest_deadline_unit, new) < 0 )
- {
- SCHED_STAT_CRANK(tickled_busy_cpu);
- cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
- goto out;
- }
-
- /* didn't tickle any cpu */
- SCHED_STAT_CRANK(tickled_no_cpu);
- return;
- out:
- /* TRACE */
- {
- struct {
- unsigned cpu:16, pad:16;
- } d;
- d.cpu = cpu_to_tickle;
- d.pad = 0;
- trace_var(TRC_RTDS_TICKLE, 1,
- sizeof(d),
- (unsigned char *)&d);
- }
-
- cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
- cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
- return;
-}
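A hypothetical run of the kick logic above: new has cur_deadline = 15 ms, and pCPUs 0 and 1 (both within new's affinity and not yet tickled) run UNITs with deadlines 10 ms and 30 ms, neither idle. The loop finds no idle cpu but remembers the 30 ms UNIT as latest_deadline_unit; since compare_unit_priority(latest_deadline_unit, new) < 0 (same priority_level, later deadline), pCPU 1 is tickled and the 30 ms UNIT will be preempted at its next scheduling point.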
-
-/*
- * Should always wake up a runnable unit and put it back on the RunQ.
- * Check the priority to decide whether to raise an interrupt.
- * The lock is already grabbed in schedule.c, no need to lock here.
- * TODO: what if the two units belong to the same domain?
- */
-static void
-rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct rt_unit * const svc = rt_unit(unit);
- s_time_t now;
- bool_t missed;
-
- BUG_ON( is_idle_unit(unit) );
-
- if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
- {
- SCHED_STAT_CRANK(unit_wake_running);
- return;
- }
-
-    /* If on the RunQ/DepletedQ, just updating the info is ok */
- if ( unlikely(unit_on_q(svc)) )
- {
- SCHED_STAT_CRANK(unit_wake_onrunq);
- return;
- }
-
- if ( likely(unit_runnable(unit)) )
- SCHED_STAT_CRANK(unit_wake_runnable);
- else
- SCHED_STAT_CRANK(unit_wake_not_runnable);
-
- /*
- * If a deadline passed while svc was asleep/blocked, we need new
- * scheduling parameters (a new deadline and full budget).
- */
- now = NOW();
-
- missed = ( now >= svc->cur_deadline );
- if ( missed )
- rt_update_deadline(now, svc);
-
- /*
- * If context hasn't been saved for this unit yet, we can't put it on
-     * the run-queue/depleted-queue. Instead, we set the appropriate flag;
-     * the unit will be put back on the queue after the context has been saved
-     * (in rt_context_saved()).
- */
- if ( unlikely(svc->flags & RTDS_scheduled) )
- {
- __set_bit(__RTDS_delayed_runq_add, &svc->flags);
- /*
-         * The unit is waking up already, and we didn't even have the time to
- * remove its next replenishment event from the replenishment queue
- * when it blocked! No big deal. If we did not miss the deadline in
- * the meantime, let's just leave it there. If we did, let's remove it
- * and queue a new one (to occur at our new deadline).
- */
- if ( missed )
- replq_reinsert(ops, svc);
- return;
- }
-
- /* Replenishment event got cancelled when we blocked. Add it back. */
- replq_insert(ops, svc);
-    /* Insert svc into the runq/depletedq because svc is not in a queue now */
- runq_insert(ops, svc);
-
- runq_tickle(ops, svc);
-}
-
-/*
- * scurr has finished its context switch; insert it back into the RunQ,
- * and then pick the highest priority unit from the runq to run
- */
-static void
-rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
-{
- struct rt_unit *svc = rt_unit(unit);
- spinlock_t *lock = unit_schedule_lock_irq(unit);
-
- __clear_bit(__RTDS_scheduled, &svc->flags);
-    /* Do not insert the idle unit into the runq */
- if ( is_idle_unit(unit) )
- goto out;
-
- if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
- likely(unit_runnable(unit)) )
- {
- runq_insert(ops, svc);
- runq_tickle(ops, svc);
- }
- else
- replq_remove(ops, svc);
-
-out:
- unit_schedule_unlock_irq(lock, unit);
-}
-
-/*
- * Set/get the scheduling parameters of each unit of a domain
- */
-static int
-rt_dom_cntl(
- const struct scheduler *ops,
- struct domain *d,
- struct xen_domctl_scheduler_op *op)
-{
- struct rt_private *prv = rt_priv(ops);
- struct rt_unit *svc;
- struct sched_unit *unit;
- unsigned long flags;
- int rc = 0;
- struct xen_domctl_schedparam_vcpu local_sched;
- s_time_t period, budget;
- uint32_t index = 0;
-
- switch ( op->cmd )
- {
- case XEN_DOMCTL_SCHEDOP_getinfo:
- /* Return the default parameters. */
- op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
- op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
- break;
- case XEN_DOMCTL_SCHEDOP_putinfo:
- if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
- {
- rc = -EINVAL;
- break;
- }
- spin_lock_irqsave(&prv->lock, flags);
- for_each_sched_unit ( d, unit )
- {
- svc = rt_unit(unit);
- svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */
- svc->budget = MICROSECS(op->u.rtds.budget);
- }
- spin_unlock_irqrestore(&prv->lock, flags);
- break;
- case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
- case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
- while ( index < op->u.v.nr_vcpus )
- {
- if ( copy_from_guest_offset(&local_sched,
- op->u.v.vcpus, index, 1) )
- {
- rc = -EFAULT;
- break;
- }
- if ( local_sched.vcpuid >= d->max_vcpus ||
- d->vcpu[local_sched.vcpuid] == NULL )
- {
- rc = -EINVAL;
- break;
- }
-
- if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
- {
- spin_lock_irqsave(&prv->lock, flags);
- svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
- local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
- local_sched.u.rtds.period = svc->period / MICROSECS(1);
- if ( has_extratime(svc) )
- local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
- else
- local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
- spin_unlock_irqrestore(&prv->lock, flags);
-
- if ( copy_to_guest_offset(op->u.v.vcpus, index,
- &local_sched, 1) )
- {
- rc = -EFAULT;
- break;
- }
- }
- else
- {
- period = MICROSECS(local_sched.u.rtds.period);
- budget = MICROSECS(local_sched.u.rtds.budget);
- if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
- budget > period || period < RTDS_MIN_PERIOD )
- {
- rc = -EINVAL;
- break;
- }
-
- spin_lock_irqsave(&prv->lock, flags);
- svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
- svc->period = period;
- svc->budget = budget;
- if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
- __set_bit(__RTDS_extratime, &svc->flags);
- else
- __clear_bit(__RTDS_extratime, &svc->flags);
- spin_unlock_irqrestore(&prv->lock, flags);
- }
- /* Process at most 64 vCPUs without checking for preemption. */
- if ( (++index > 63) && hypercall_preempt_check() )
- break;
- }
- if ( !rc )
- /* Notify the caller how many vCPUs have been processed. */
- op->u.v.nr_vcpus = index;
- break;
- }
-
- return rc;
-}
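A note on the vCPU loop in rt_dom_cntl() above: guest-supplied entries are processed in batches, and a pending preemption is only checked for after the first 64 entries, so a large request can be continued later instead of monopolising the CPU. Below is a minimal, self-contained sketch of that batching pattern; the helper name and callback are hypothetical and not part of the moved file, only hypercall_preempt_check() is Xen's.

/*
 * Illustrative sketch only: process up to nr elements, re-checking for a
 * pending preemption every 64 elements, and report how many were handled
 * so the caller can continue from there later.
 */
static unsigned int process_batch(unsigned int nr, void (*one)(unsigned int))
{
    unsigned int i;

    for ( i = 0; i < nr; i++ )
    {
        one(i);
        if ( ((i + 1) % 64 == 0) && hypercall_preempt_check() )
            return i + 1;   /* Processed i + 1 elements so far. */
    }

    return nr;              /* All elements processed. */
}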
-
-/*
- * The replenishment timer handler picks units
- * from the replq and does the actual replenishment.
- */
-static void repl_timer_handler(void *data)
-{
- s_time_t now;
- struct scheduler *ops = data;
- struct rt_private *prv = rt_priv(ops);
- struct list_head *replq = rt_replq(ops);
- struct list_head *runq = rt_runq(ops);
- struct list_head *iter, *tmp;
- struct rt_unit *svc;
- LIST_HEAD(tmp_replq);
-
- spin_lock_irq(&prv->lock);
-
- now = NOW();
-
- /*
- * Do the replenishment and move replenished units
- * to the temporary list to tickle.
- * If svc is on the run queue, we need to re-insert it at
- * the correct place, since its deadline changes.
- */
- list_for_each_safe ( iter, tmp, replq )
- {
- svc = replq_elem(iter);
-
- if ( now < svc->cur_deadline )
- break;
-
- list_del(&svc->replq_elem);
- rt_update_deadline(now, svc);
- list_add(&svc->replq_elem, &tmp_replq);
-
- if ( unit_on_q(svc) )
- {
- q_remove(svc);
- runq_insert(ops, svc);
- }
- }
-
- /*
- * Iterate through the list of updated units.
- * If an updated unit is running, tickle the head of the
- * runqueue if it has a higher priority.
- * If an updated unit was depleted and on the runqueue, tickle it.
- * Finally, reinsert the units into the replenishment events list.
- */
- list_for_each_safe ( iter, tmp, &tmp_replq )
- {
- svc = replq_elem(iter);
-
- if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
- !list_empty(runq) )
- {
- struct rt_unit *next_on_runq = q_elem(runq->next);
-
- if ( compare_unit_priority(svc, next_on_runq) < 0 )
- runq_tickle(ops, next_on_runq);
- }
- else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
- unit_on_q(svc) )
- runq_tickle(ops, svc);
-
- list_del(&svc->replq_elem);
- deadline_replq_insert(svc, &svc->replq_elem, replq);
- }
-
- /*
- * If there are units left in the replenishment event list,
- * set the next replenishment to happen at the deadline of
- * the one in the front.
- */
- if ( !list_empty(replq) )
- set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);
-
- spin_unlock_irq(&prv->lock);
-}
-
-static const struct scheduler sched_rtds_def = {
- .name = "SMP RTDS Scheduler",
- .opt_name = "rtds",
- .sched_id = XEN_SCHEDULER_RTDS,
- .sched_data = NULL,
-
- .dump_cpu_state = rt_dump_pcpu,
- .dump_settings = rt_dump,
- .init = rt_init,
- .deinit = rt_deinit,
- .init_pdata = rt_init_pdata,
- .switch_sched = rt_switch_sched,
- .deinit_pdata = rt_deinit_pdata,
- .alloc_domdata = rt_alloc_domdata,
- .free_domdata = rt_free_domdata,
- .alloc_udata = rt_alloc_udata,
- .free_udata = rt_free_udata,
- .insert_unit = rt_unit_insert,
- .remove_unit = rt_unit_remove,
-
- .adjust = rt_dom_cntl,
-
- .pick_resource = rt_res_pick,
- .do_schedule = rt_schedule,
- .sleep = rt_unit_sleep,
- .wake = rt_unit_wake,
- .context_saved = rt_context_saved,
-};
-
-REGISTER_SCHEDULER(sched_rtds_def);
+++ /dev/null
-/****************************************************************************
- * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2002-2003 University of Cambridge
- * (C) 2004 - Mark Williamson - Intel Research Cambridge
- ****************************************************************************
- *
- * File: common/schedule.c
- * Author: Rolf Neugebauer & Keir Fraser
- * Updated for generic API by Mark Williamson
- *
- * Description: Generic CPU scheduling code
- * implements support functionality for the Xen scheduler API.
- *
- */
-
-#ifndef COMPAT
-#include <xen/init.h>
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/domain.h>
-#include <xen/delay.h>
-#include <xen/event.h>
-#include <xen/time.h>
-#include <xen/timer.h>
-#include <xen/perfc.h>
-#include <xen/sched-if.h>
-#include <xen/softirq.h>
-#include <xen/trace.h>
-#include <xen/mm.h>
-#include <xen/err.h>
-#include <xen/guest_access.h>
-#include <xen/hypercall.h>
-#include <xen/multicall.h>
-#include <xen/cpu.h>
-#include <xen/preempt.h>
-#include <xen/event.h>
-#include <public/sched.h>
-#include <xsm/xsm.h>
-#include <xen/err.h>
-
-#ifdef CONFIG_XEN_GUEST
-#include <asm/guest.h>
-#else
-#define pv_shim false
-#endif
-
-/* opt_sched: scheduler - default to configured value */
-static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
-string_param("sched", opt_sched);
-
-/*
- * If sched_smt_power_savings is set, the scheduler will give preference to a
- * partially idle package over a fully idle package when picking a pCPU to
- * schedule a vCPU on.
- */
-bool_t sched_smt_power_savings = 0;
-boolean_param("sched_smt_power_savings", sched_smt_power_savings);
-
-/*
- * Default scheduling rate limit: 1ms.
- * The behavior when sched_ratelimit_us is greater than
- * sched_credit_tslice_ms is undefined.
- */
-int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
-integer_param("sched_ratelimit_us", sched_ratelimit_us);
-
-/* Number of vcpus per struct sched_unit. */
-bool __read_mostly sched_disable_smt_switching;
-cpumask_t sched_res_mask;
-
-/* Common lock for free cpus. */
-static DEFINE_SPINLOCK(sched_free_cpu_lock);
-
-/* Various timer handlers. */
-static void s_timer_fn(void *unused);
-static void vcpu_periodic_timer_fn(void *data);
-static void vcpu_singleshot_timer_fn(void *data);
-static void poll_timer_fn(void *data);
-
-/* This is global for now so that private implementations can reach it */
-DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
-static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
-DEFINE_RCU_READ_LOCK(sched_res_rculock);
-
-/* Scratch space for cpumasks. */
-DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
-
-/* How many urgent vcpus. */
-DEFINE_PER_CPU(atomic_t, sched_urgent_count);
-
-extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
-#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
-#define schedulers __start_schedulers_array
-
-static struct scheduler __read_mostly ops;
-
-static bool scheduler_active;
-
-static void sched_set_affinity(
- struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
-
-static struct sched_resource *
-sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
-{
- return unit->res;
-}
-
-static void *
-sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
- void *dd)
-{
- /* Any non-NULL pointer is fine here. */
- return ZERO_BLOCK_PTR;
-}
-
-static void
-sched_idle_free_udata(const struct scheduler *ops, void *priv)
-{
-}
-
-static void sched_idle_schedule(
- const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
- bool tasklet_work_scheduled)
-{
- const unsigned int cpu = smp_processor_id();
-
- unit->next_time = -1;
- unit->next_task = sched_idle_unit(cpu);
-}
-
-static struct scheduler sched_idle_ops = {
- .name = "Idle Scheduler",
- .opt_name = "idle",
- .sched_data = NULL,
-
- .pick_resource = sched_idle_res_pick,
- .do_schedule = sched_idle_schedule,
-
- .alloc_udata = sched_idle_alloc_udata,
- .free_udata = sched_idle_free_udata,
-};
-
-static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
- unsigned int cpu)
-{
- unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
- const struct domain *d = unit->domain;
-
- return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
-}
-
-static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
- unsigned int cpu)
-{
- struct vcpu *v = unit2vcpu_cpu(unit, cpu);
-
- return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
-}
-
-static inline struct scheduler *dom_scheduler(const struct domain *d)
-{
- if ( likely(d->cpupool != NULL) )
- return d->cpupool->sched;
-
- /*
- * If d->cpupool is NULL, this is the idle domain. This is special
- * because the idle domain does not really belong to any cpupool, and,
- * hence, does not really have a scheduler.
- *
- * This is (should be!) only called like this for allocating the idle
- * vCPUs for the first time, during boot, in which case what we want
- * is the default scheduler, which has been chosen at boot.
- */
- ASSERT(is_idle_domain(d));
- return &ops;
-}
-
-static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
-{
- struct domain *d = unit->domain;
-
- if ( likely(d->cpupool != NULL) )
- return d->cpupool->sched;
-
- /*
- * If d->cpupool is NULL, this is a unit of the idle domain. And this
- * case is special because the idle domain does not really belong to
- * a cpupool and, hence, doesn't really have a scheduler. In fact, its
- * units (may) run on pCPUs which are in different pools, with different
- * schedulers.
- *
- * What we want, in this case, is the scheduler of the pCPU where this
- * particular idle unit is running. And, since unit->res never changes
- * for idle units, it is safe to use it, with no locks, to figure that out.
- */
-
- ASSERT(is_idle_domain(d));
- return unit->res->scheduler;
-}
-
-static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
-{
- return unit_scheduler(v->sched_unit);
-}
-#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
-
-static inline void trace_runstate_change(struct vcpu *v, int new_state)
-{
- struct { uint32_t vcpu:16, domain:16; } d;
- uint32_t event;
-
- if ( likely(!tb_init_done) )
- return;
-
- d.vcpu = v->vcpu_id;
- d.domain = v->domain->domain_id;
-
- event = TRC_SCHED_RUNSTATE_CHANGE;
- event |= ( v->runstate.state & 0x3 ) << 8;
- event |= ( new_state & 0x3 ) << 4;
-
- __trace_var(event, 1/*tsc*/, sizeof(d), &d);
-}
-
-static inline void trace_continue_running(struct vcpu *v)
-{
- struct { uint32_t vcpu:16, domain:16; } d;
-
- if ( likely(!tb_init_done) )
- return;
-
- d.vcpu = v->vcpu_id;
- d.domain = v->domain->domain_id;
-
- __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
-}
-
-static inline void vcpu_urgent_count_update(struct vcpu *v)
-{
- if ( is_idle_vcpu(v) )
- return;
-
- if ( unlikely(v->is_urgent) )
- {
- if ( !(v->pause_flags & VPF_blocked) ||
- !test_bit(v->vcpu_id, v->domain->poll_mask) )
- {
- v->is_urgent = 0;
- atomic_dec(&per_cpu(sched_urgent_count, v->processor));
- }
- }
- else
- {
- if ( unlikely(v->pause_flags & VPF_blocked) &&
- unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
- {
- v->is_urgent = 1;
- atomic_inc(&per_cpu(sched_urgent_count, v->processor));
- }
- }
-}
-
-static inline void vcpu_runstate_change(
- struct vcpu *v, int new_state, s_time_t new_entry_time)
-{
- s_time_t delta;
- struct sched_unit *unit = v->sched_unit;
-
- ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
- if ( v->runstate.state == new_state )
- return;
-
- vcpu_urgent_count_update(v);
-
- trace_runstate_change(v, new_state);
-
- if ( !is_idle_vcpu(v) )
- {
- unit->runstate_cnt[v->runstate.state]--;
- unit->runstate_cnt[new_state]++;
- }
-
- delta = new_entry_time - v->runstate.state_entry_time;
- if ( delta > 0 )
- {
- v->runstate.time[v->runstate.state] += delta;
- v->runstate.state_entry_time = new_entry_time;
- }
-
- v->runstate.state = new_state;
-}
-
-void sched_guest_idle(void (*idle) (void), unsigned int cpu)
-{
- /*
- * Another vcpu of the unit is active in guest context while this one is
- * idle. In case of a scheduling event we don't want to have high latencies
- * due to a cpu needing to wake up from deep C state for joining the
- * rendezvous, so avoid those deep C states by incrementing the urgent
- * count of the cpu.
- */
- atomic_inc(&per_cpu(sched_urgent_count, cpu));
- idle();
- atomic_dec(&per_cpu(sched_urgent_count, cpu));
-}
-
-void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
-{
- spinlock_t *lock;
- s_time_t delta;
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit);
- memcpy(runstate, &v->runstate, sizeof(*runstate));
- delta = NOW() - runstate->state_entry_time;
- if ( delta > 0 )
- runstate->time[runstate->state] += delta;
-
- if ( unlikely(lock != NULL) )
- unit_schedule_unlock_irq(lock, v->sched_unit);
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-uint64_t get_cpu_idle_time(unsigned int cpu)
-{
- struct vcpu_runstate_info state = { 0 };
- struct vcpu *v = idle_vcpu[cpu];
-
- if ( cpu_online(cpu) && v )
- vcpu_runstate_get(v, &state);
-
- return state.time[RUNSTATE_running];
-}
-
-/*
- * If locks are different, take the one with the lower address first.
- * This avoids dead- or live-locks when this code is running on both
- * cpus at the same time.
- */
-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
- unsigned long *flags)
-{
- if ( lock1 == lock2 )
- {
- spin_lock_irqsave(lock1, *flags);
- }
- else if ( lock1 < lock2 )
- {
- spin_lock_irqsave(lock1, *flags);
- spin_lock(lock2);
- }
- else
- {
- spin_lock_irqsave(lock2, *flags);
- spin_lock(lock1);
- }
-}
-
-static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
- unsigned long flags)
-{
- if ( lock1 != lock2 )
- spin_unlock(lock2);
- spin_unlock_irqrestore(lock1, flags);
-}
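sched_spin_lock_double() above relies on a generic deadlock-avoidance rule: when two CPUs may take the same pair of locks concurrently, both must take them in one global order, here the order of the lock addresses. A minimal stand-alone sketch of the same idea follows; the helper name is hypothetical, while spinlock_t and spin_lock() are Xen's usual primitives.

/*
 * Illustrative sketch only: take two spinlocks in address order so that two
 * CPUs locking the same pair at the same time cannot deadlock.
 */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
    if ( a == b )
        spin_lock(a);      /* Same lock: take it only once. */
    else if ( a < b )
    {
        spin_lock(a);      /* Lower address first... */
        spin_lock(b);      /* ...then the higher one. */
    }
    else
    {
        spin_lock(b);
        spin_lock(a);
    }
}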
-
-static void sched_free_unit_mem(struct sched_unit *unit)
-{
- struct sched_unit *prev_unit;
- struct domain *d = unit->domain;
-
- if ( d->sched_unit_list == unit )
- d->sched_unit_list = unit->next_in_list;
- else
- {
- for_each_sched_unit ( d, prev_unit )
- {
- if ( prev_unit->next_in_list == unit )
- {
- prev_unit->next_in_list = unit->next_in_list;
- break;
- }
- }
- }
-
- free_cpumask_var(unit->cpu_hard_affinity);
- free_cpumask_var(unit->cpu_hard_affinity_saved);
- free_cpumask_var(unit->cpu_soft_affinity);
-
- xfree(unit);
-}
-
-static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
-{
- struct vcpu *vunit;
- unsigned int cnt = 0;
-
- /* Don't count the vcpu being released; it might not be in the vcpu list yet. */
- for_each_sched_unit_vcpu ( unit, vunit )
- if ( vunit != v )
- cnt++;
-
- v->sched_unit = NULL;
- unit->runstate_cnt[v->runstate.state]--;
-
- if ( unit->vcpu_list == v )
- unit->vcpu_list = v->next_in_list;
-
- if ( !cnt )
- sched_free_unit_mem(unit);
-}
-
-static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
-{
- v->sched_unit = unit;
-
- /* All but idle vcpus are allocated with sequential vcpu_id. */
- if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
- {
- unit->vcpu_list = v;
- /*
- * unit_id is always the same as lowest vcpu_id of unit.
- * This is used for stopping for_each_sched_unit_vcpu() loop and in
- * order to support cpupools with different granularities.
- */
- unit->unit_id = v->vcpu_id;
- }
- unit->runstate_cnt[v->runstate.state]++;
-}
-
-static struct sched_unit *sched_alloc_unit_mem(void)
-{
- struct sched_unit *unit;
-
- unit = xzalloc(struct sched_unit);
- if ( !unit )
- return NULL;
-
- if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
- !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
- !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
- {
- sched_free_unit_mem(unit);
- unit = NULL;
- }
-
- return unit;
-}
-
-static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
-{
- struct sched_unit **prev_unit;
-
- unit->domain = d;
-
- for ( prev_unit = &d->sched_unit_list; *prev_unit;
- prev_unit = &(*prev_unit)->next_in_list )
- if ( (*prev_unit)->next_in_list &&
- (*prev_unit)->next_in_list->unit_id > unit->unit_id )
- break;
-
- unit->next_in_list = *prev_unit;
- *prev_unit = unit;
-}
-
-static struct sched_unit *sched_alloc_unit(struct vcpu *v)
-{
- struct sched_unit *unit;
- struct domain *d = v->domain;
- unsigned int gran = cpupool_get_granularity(d->cpupool);
-
- for_each_sched_unit ( d, unit )
- if ( unit->unit_id / gran == v->vcpu_id / gran )
- break;
-
- if ( unit )
- {
- sched_unit_add_vcpu(unit, v);
- return unit;
- }
-
- if ( (unit = sched_alloc_unit_mem()) == NULL )
- return NULL;
-
- sched_unit_add_vcpu(unit, v);
- sched_domain_insert_unit(unit, d);
-
- return unit;
-}
-
-static unsigned int sched_select_initial_cpu(const struct vcpu *v)
-{
- const struct domain *d = v->domain;
- nodeid_t node;
- spinlock_t *lock;
- unsigned long flags;
- unsigned int cpu_ret, cpu = smp_processor_id();
- cpumask_t *cpus = cpumask_scratch_cpu(cpu);
-
- lock = pcpu_schedule_lock_irqsave(cpu, &flags);
- cpumask_clear(cpus);
- for_each_node_mask ( node, d->node_affinity )
- cpumask_or(cpus, cpus, &node_to_cpumask(node));
- cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
- if ( cpumask_empty(cpus) )
- cpumask_copy(cpus, d->cpupool->cpu_valid);
-
- if ( v->vcpu_id == 0 )
- cpu_ret = cpumask_first(cpus);
- else
- {
- /* We can rely on previous vcpu being available. */
- ASSERT(!is_idle_domain(d));
-
- cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
- }
-
- pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
-
- return cpu_ret;
-}
-
-int sched_init_vcpu(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct sched_unit *unit;
- unsigned int processor;
-
- if ( (unit = sched_alloc_unit(v)) == NULL )
- return 1;
-
- if ( is_idle_domain(d) )
- processor = v->vcpu_id;
- else
- processor = sched_select_initial_cpu(v);
-
- /* Initialise the per-vcpu timers. */
- spin_lock_init(&v->periodic_timer_lock);
- init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
- init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
- init_timer(&v->poll_timer, poll_timer_fn, v, processor);
-
- /* If this is not the first vcpu of the unit we are done. */
- if ( unit->priv != NULL )
- {
- v->processor = processor;
- return 0;
- }
-
- rcu_read_lock(&sched_res_rculock);
-
- /* The first vcpu of a unit can be set via sched_set_res(). */
- sched_set_res(unit, get_sched_res(processor));
-
- unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
- if ( unit->priv == NULL )
- {
- sched_free_unit(unit, v);
- rcu_read_unlock(&sched_res_rculock);
- return 1;
- }
-
- /*
- * Initialize affinity settings. The idler, and potentially
- * domain-0 VCPUs, are pinned onto their respective physical CPUs.
- */
- if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
- sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
- else
- sched_set_affinity(unit, &cpumask_all, &cpumask_all);
-
- /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
- if ( is_idle_domain(d) )
- {
- get_sched_res(v->processor)->curr = unit;
- get_sched_res(v->processor)->sched_unit_idle = unit;
- v->is_running = 1;
- unit->is_running = true;
- unit->state_entry_time = NOW();
- }
- else
- {
- sched_insert_unit(dom_scheduler(d), unit);
- }
-
- rcu_read_unlock(&sched_res_rculock);
-
- return 0;
-}
-
-static void vcpu_move_irqs(struct vcpu *v)
-{
- arch_move_irqs(v);
- evtchn_move_pirqs(v);
-}
-
-static void sched_move_irqs(const struct sched_unit *unit)
-{
- struct vcpu *v;
-
- for_each_sched_unit_vcpu ( unit, v )
- vcpu_move_irqs(v);
-}
-
-int sched_move_domain(struct domain *d, struct cpupool *c)
-{
- struct vcpu *v;
- struct sched_unit *unit;
- unsigned int new_p, unit_idx;
- void **unit_priv;
- void *domdata;
- void *unitdata;
- struct scheduler *old_ops;
- void *old_domdata;
- unsigned int gran = cpupool_get_granularity(c);
- int ret = 0;
-
- for_each_vcpu ( d, v )
- {
- if ( v->affinity_broken )
- return -EBUSY;
- }
-
- rcu_read_lock(&sched_res_rculock);
-
- domdata = sched_alloc_domdata(c->sched, d);
- if ( IS_ERR(domdata) )
- {
- ret = PTR_ERR(domdata);
- goto out;
- }
-
- unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
- if ( unit_priv == NULL )
- {
- sched_free_domdata(c->sched, domdata);
- ret = -ENOMEM;
- goto out;
- }
-
- unit_idx = 0;
- for_each_sched_unit ( d, unit )
- {
- unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
- if ( unit_priv[unit_idx] == NULL )
- {
- for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
- sched_free_udata(c->sched, unit_priv[unit_idx]);
- xfree(unit_priv);
- sched_free_domdata(c->sched, domdata);
- ret = -ENOMEM;
- goto out;
- }
- unit_idx++;
- }
-
- domain_pause(d);
-
- old_ops = dom_scheduler(d);
- old_domdata = d->sched_priv;
-
- for_each_sched_unit ( d, unit )
- {
- sched_remove_unit(old_ops, unit);
- }
-
- d->cpupool = c;
- d->sched_priv = domdata;
-
- new_p = cpumask_first(c->cpu_valid);
- unit_idx = 0;
- for_each_sched_unit ( d, unit )
- {
- spinlock_t *lock;
- unsigned int unit_p = new_p;
-
- unitdata = unit->priv;
-
- for_each_sched_unit_vcpu ( unit, v )
- {
- migrate_timer(&v->periodic_timer, new_p);
- migrate_timer(&v->singleshot_timer, new_p);
- migrate_timer(&v->poll_timer, new_p);
- new_p = cpumask_cycle(new_p, c->cpu_valid);
- }
-
- lock = unit_schedule_lock_irq(unit);
-
- sched_set_affinity(unit, &cpumask_all, &cpumask_all);
-
- sched_set_res(unit, get_sched_res(unit_p));
- /*
- * With v->processor modified we must not
- * - make any further changes assuming we hold the scheduler lock,
- * - use unit_schedule_unlock_irq().
- */
- spin_unlock_irq(lock);
-
- unit->priv = unit_priv[unit_idx];
- if ( !d->is_dying )
- sched_move_irqs(unit);
-
- sched_insert_unit(c->sched, unit);
-
- sched_free_udata(old_ops, unitdata);
-
- unit_idx++;
- }
-
- domain_update_node_affinity(d);
-
- domain_unpause(d);
-
- sched_free_domdata(old_ops, old_domdata);
-
- xfree(unit_priv);
-
-out:
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
-void sched_destroy_vcpu(struct vcpu *v)
-{
- struct sched_unit *unit = v->sched_unit;
-
- kill_timer(&v->periodic_timer);
- kill_timer(&v->singleshot_timer);
- kill_timer(&v->poll_timer);
- if ( test_and_clear_bool(v->is_urgent) )
- atomic_dec(&per_cpu(sched_urgent_count, v->processor));
- /*
- * Vcpus are being destroyed top-down. So being the first vcpu of a unit
- * is the same as being the only one.
- */
- if ( unit->vcpu_list == v )
- {
- rcu_read_lock(&sched_res_rculock);
-
- sched_remove_unit(vcpu_scheduler(v), unit);
- sched_free_udata(vcpu_scheduler(v), unit->priv);
- sched_free_unit(unit, v);
-
- rcu_read_unlock(&sched_res_rculock);
- }
-}
-
-int sched_init_domain(struct domain *d, int poolid)
-{
- void *sdom;
- int ret;
-
- ASSERT(d->cpupool == NULL);
- ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
-
- if ( (ret = cpupool_add_domain(d, poolid)) )
- return ret;
-
- SCHED_STAT_CRANK(dom_init);
- TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
-
- rcu_read_lock(&sched_res_rculock);
-
- sdom = sched_alloc_domdata(dom_scheduler(d), d);
-
- rcu_read_unlock(&sched_res_rculock);
-
- if ( IS_ERR(sdom) )
- return PTR_ERR(sdom);
-
- d->sched_priv = sdom;
-
- return 0;
-}
-
-void sched_destroy_domain(struct domain *d)
-{
- ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
-
- if ( d->cpupool )
- {
- SCHED_STAT_CRANK(dom_destroy);
- TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
-
- rcu_read_lock(&sched_res_rculock);
-
- sched_free_domdata(dom_scheduler(d), d->sched_priv);
- d->sched_priv = NULL;
-
- rcu_read_unlock(&sched_res_rculock);
-
- cpupool_rm_domain(d);
- }
-}
-
-static void vcpu_sleep_nosync_locked(struct vcpu *v)
-{
- struct sched_unit *unit = v->sched_unit;
-
- ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
-
- if ( likely(!vcpu_runnable(v)) )
- {
- if ( v->runstate.state == RUNSTATE_runnable )
- vcpu_runstate_change(v, RUNSTATE_offline, NOW());
-
- /* Only put the unit to sleep if none of its vcpus is runnable. */
- if ( likely(!unit_runnable(unit)) )
- sched_sleep(unit_scheduler(unit), unit);
- else if ( unit_running(unit) > 1 && v->is_running &&
- !v->force_context_switch )
- {
- v->force_context_switch = true;
- cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
- }
- }
-}
-
-void vcpu_sleep_nosync(struct vcpu *v)
-{
- unsigned long flags;
- spinlock_t *lock;
-
- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
-
- vcpu_sleep_nosync_locked(v);
-
- unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-void vcpu_sleep_sync(struct vcpu *v)
-{
- vcpu_sleep_nosync(v);
-
- while ( !vcpu_runnable(v) && v->is_running )
- cpu_relax();
-
- sync_vcpu_execstate(v);
-}
-
-void vcpu_wake(struct vcpu *v)
-{
- unsigned long flags;
- spinlock_t *lock;
- struct sched_unit *unit = v->sched_unit;
-
- TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = unit_schedule_lock_irqsave(unit, &flags);
-
- if ( likely(vcpu_runnable(v)) )
- {
- if ( v->runstate.state >= RUNSTATE_blocked )
- vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
- /*
- * Call sched_wake() unconditionally, even if unit is running already.
- * We might have not been de-scheduled after vcpu_sleep_nosync_locked()
- * and are now to be woken up again.
- */
- sched_wake(unit_scheduler(unit), unit);
- if ( unit->is_running && !v->is_running && !v->force_context_switch )
- {
- v->force_context_switch = true;
- cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
- }
- }
- else if ( !(v->pause_flags & VPF_blocked) )
- {
- if ( v->runstate.state == RUNSTATE_blocked )
- vcpu_runstate_change(v, RUNSTATE_offline, NOW());
- }
-
- unit_schedule_unlock_irqrestore(lock, flags, unit);
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-void vcpu_unblock(struct vcpu *v)
-{
- if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
- return;
-
- /* Polling period ends when a VCPU is unblocked. */
- if ( unlikely(v->poll_evtchn != 0) )
- {
- v->poll_evtchn = 0;
- /*
- * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
- * this VCPU (and it then going back to sleep on poll_mask).
- * Test-and-clear is idiomatic and ensures clear_bit not reordered.
- */
- if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
- clear_bit(_VPF_blocked, &v->pause_flags);
- }
-
- vcpu_wake(v);
-}
-
-/*
- * Do the actual movement of a unit from the old to the new CPU. Locks for
- * *both* CPUs need to have been taken already when calling this!
- */
-static void sched_unit_move_locked(struct sched_unit *unit,
- unsigned int new_cpu)
-{
- unsigned int old_cpu = unit->res->master_cpu;
- struct vcpu *v;
-
- rcu_read_lock(&sched_res_rculock);
-
- /*
- * Transfer urgency status to new CPU before switching CPUs, as
- * once the switch occurs, v->is_urgent is no longer protected by
- * the per-CPU scheduler lock we are holding.
- */
- for_each_sched_unit_vcpu ( unit, v )
- {
- if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
- {
- atomic_inc(&per_cpu(sched_urgent_count, new_cpu));
- atomic_dec(&per_cpu(sched_urgent_count, old_cpu));
- }
- }
-
- /*
- * Actual CPU switch to new CPU. This is safe because the lock
- * pointer can't change while the current lock is held.
- */
- sched_migrate(unit_scheduler(unit), unit, new_cpu);
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-/*
- * Initiating migration
- *
- * In order to migrate, we need the unit in question to have stopped
- * running and have called sched_sleep() (to take it off any
- * runqueues, for instance); and if it is currently running, it needs
- * to be scheduled out. Finally, we need to hold the scheduling locks
- * for both the processor we're migrating from, and the processor
- * we're migrating to.
- *
- * In order to avoid deadlock while satisfying the final requirement,
- * we must release any scheduling lock we hold, then try to grab both
- * locks we want, then double-check to make sure that what we started
- * to do hasn't been changed in the mean time.
- *
- * These steps are encapsulated in the following two functions; they
- * should be called like this:
- *
- * lock = unit_schedule_lock_irq(unit);
- * sched_unit_migrate_start(unit);
- * unit_schedule_unlock_irq(lock, unit)
- * sched_unit_migrate_finish(unit);
- *
- * sched_unit_migrate_finish() will do the work now if it can, or simply
- * return if it can't (because unit is still running); in that case
- * sched_unit_migrate_finish() will be called by unit_context_saved().
- */
-static void sched_unit_migrate_start(struct sched_unit *unit)
-{
- struct vcpu *v;
-
- for_each_sched_unit_vcpu ( unit, v )
- {
- set_bit(_VPF_migrating, &v->pause_flags);
- vcpu_sleep_nosync_locked(v);
- }
-}
-
-static void sched_unit_migrate_finish(struct sched_unit *unit)
-{
- unsigned long flags;
- unsigned int old_cpu, new_cpu;
- spinlock_t *old_lock, *new_lock;
- bool_t pick_called = 0;
- struct vcpu *v;
-
- /*
- * If the unit is currently running, this will be handled by
- * unit_context_saved(); and in any case, if the bit is cleared, then
- * someone else has already done the work so we don't need to.
- */
- if ( unit->is_running )
- return;
- for_each_sched_unit_vcpu ( unit, v )
- if ( !test_bit(_VPF_migrating, &v->pause_flags) )
- return;
-
- old_cpu = new_cpu = unit->res->master_cpu;
- for ( ; ; )
- {
- /*
- * We need another iteration if the pre-calculated lock addresses
- * are no longer correct after re-evaluating the old and new cpu while
- * holding the locks.
- */
- old_lock = get_sched_res(old_cpu)->schedule_lock;
- new_lock = get_sched_res(new_cpu)->schedule_lock;
-
- sched_spin_lock_double(old_lock, new_lock, &flags);
-
- old_cpu = unit->res->master_cpu;
- if ( old_lock == get_sched_res(old_cpu)->schedule_lock )
- {
- /*
- * If we selected a CPU on the previous iteration, check if it
- * remains suitable for running this vCPU.
- */
- if ( pick_called &&
- (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
- cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) &&
- cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
- break;
-
- /* Select a new CPU. */
- new_cpu = sched_pick_resource(unit_scheduler(unit),
- unit)->master_cpu;
- if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
- cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
- break;
- pick_called = 1;
- }
- else
- {
- /*
- * We do not hold the scheduler lock appropriate for this vCPU.
- * Thus we cannot select a new CPU on this iteration. Try again.
- */
- pick_called = 0;
- }
-
- sched_spin_unlock_double(old_lock, new_lock, flags);
- }
-
- /*
- * NB. Check of unit->is_running happens /after/ setting the migration flag
- * because they both happen in (different) spinlock regions, and those
- * regions are strictly serialised.
- */
- if ( unit->is_running )
- {
- sched_spin_unlock_double(old_lock, new_lock, flags);
- return;
- }
- for_each_sched_unit_vcpu ( unit, v )
- {
- if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
- {
- sched_spin_unlock_double(old_lock, new_lock, flags);
- return;
- }
- }
-
- sched_unit_move_locked(unit, new_cpu);
-
- sched_spin_unlock_double(old_lock, new_lock, flags);
-
- if ( old_cpu != new_cpu )
- {
- /* Vcpus are moved to other pcpus, commit their states to memory. */
- for_each_sched_unit_vcpu ( unit, v )
- sync_vcpu_execstate(v);
- sched_move_irqs(unit);
- }
-
- /* Wake on new CPU. */
- for_each_sched_unit_vcpu ( unit, v )
- vcpu_wake(v);
-}
-
-static bool sched_check_affinity_broken(const struct sched_unit *unit)
-{
- const struct vcpu *v;
-
- for_each_sched_unit_vcpu ( unit, v )
- if ( v->affinity_broken )
- return true;
-
- return false;
-}
-
-static void sched_reset_affinity_broken(struct sched_unit *unit)
-{
- struct vcpu *v;
-
- for_each_sched_unit_vcpu ( unit, v )
- v->affinity_broken = false;
-}
-
-void restore_vcpu_affinity(struct domain *d)
-{
- unsigned int cpu = smp_processor_id();
- struct sched_unit *unit;
-
- ASSERT(system_state == SYS_STATE_resume);
-
- rcu_read_lock(&sched_res_rculock);
-
- for_each_sched_unit ( d, unit )
- {
- spinlock_t *lock;
- unsigned int old_cpu = sched_unit_master(unit);
- struct sched_resource *res;
-
- ASSERT(!unit_runnable(unit));
-
- /*
- * Re-assign the initial processor as after resume we have no
- * guarantee the old processor has come back to life again.
- *
- * Therefore, here, before actually unpausing the domains, we should
- * set v->processor of each of their vCPUs to something that will
- * make sense for the scheduler of the cpupool they are in.
- */
- lock = unit_schedule_lock_irq(unit);
-
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(d));
- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
- {
- if ( sched_check_affinity_broken(unit) )
- {
- sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
- sched_reset_affinity_broken(unit);
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(d));
- }
-
- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
- {
- /* Affinity settings of one vcpu are for the complete unit. */
- printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
- unit->vcpu_list);
- sched_set_affinity(unit, &cpumask_all, NULL);
- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
- cpupool_domain_master_cpumask(d));
- }
- }
-
- res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu)));
- sched_set_res(unit, res);
-
- spin_unlock_irq(lock);
-
- /* v->processor might have changed, so reacquire the lock. */
- lock = unit_schedule_lock_irq(unit);
- res = sched_pick_resource(unit_scheduler(unit), unit);
- sched_set_res(unit, res);
- spin_unlock_irq(lock);
-
- if ( old_cpu != sched_unit_master(unit) )
- sched_move_irqs(unit);
- }
-
- rcu_read_unlock(&sched_res_rculock);
-
- domain_update_node_affinity(d);
-}
-
-/*
- * This function is used by cpu_hotplug code via cpu notifier chain
- * and from cpupools to switch schedulers on a cpu.
- * Caller must get domlist_read_lock.
- */
-int cpu_disable_scheduler(unsigned int cpu)
-{
- struct domain *d;
- struct cpupool *c;
- cpumask_t online_affinity;
- int ret = 0;
-
- rcu_read_lock(&sched_res_rculock);
-
- c = get_sched_res(cpu)->cpupool;
- if ( c == NULL )
- goto out;
-
- for_each_domain_in_cpupool ( d, c )
- {
- struct sched_unit *unit;
-
- for_each_sched_unit ( d, unit )
- {
- unsigned long flags;
- spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
-
- cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid);
- if ( cpumask_empty(&online_affinity) &&
- cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
- {
- if ( sched_check_affinity_broken(unit) )
- {
- /* The unit is temporarily pinned, can't move it. */
- unit_schedule_unlock_irqrestore(lock, flags, unit);
- ret = -EADDRINUSE;
- break;
- }
-
- printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
- unit->vcpu_list);
-
- sched_set_affinity(unit, &cpumask_all, NULL);
- }
-
- if ( unit->res != get_sched_res(cpu) )
- {
- /* The unit is not on this cpu, so we can move on. */
- unit_schedule_unlock_irqrestore(lock, flags, unit);
- continue;
- }
-
- /*
- * If it is on this cpu, we must send it away.
- * We are doing some cpupool manipulations:
- * * we want to call the scheduler, and let it re-evaluate
- * the placement of the vcpu, taking into account the new
- * cpupool configuration;
- * * the scheduler will always find a suitable solution, or
- * things would have failed before getting in here.
- */
- sched_unit_migrate_start(unit);
- unit_schedule_unlock_irqrestore(lock, flags, unit);
- sched_unit_migrate_finish(unit);
-
- /*
- * The only caveat, in this case, is a vcpu active in the
- * hypervisor that isn't migratable. In that case, the caller
- * should try again after releasing and reacquiring all locks.
- */
- if ( unit->res == get_sched_res(cpu) )
- ret = -EAGAIN;
- }
- }
-
-out:
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
-static int cpu_disable_scheduler_check(unsigned int cpu)
-{
- struct domain *d;
- struct vcpu *v;
- struct cpupool *c;
-
- c = get_sched_res(cpu)->cpupool;
- if ( c == NULL )
- return 0;
-
- for_each_domain_in_cpupool ( d, c )
- for_each_vcpu ( d, v )
- if ( v->affinity_broken )
- return -EADDRINUSE;
-
- return 0;
-}
-
-/*
- * In general, this must be called with the scheduler lock held, because the
- * adjust_affinity hook may want to modify the vCPU state. However, when the
- * vCPU is being initialized (either for dom0 or domU) there is no risk of
- * races, and it's fine to not take the lock (we're talking about
- * sched_setup_dom0_vcpus() and sched_init_vcpu()).
- */
-static void sched_set_affinity(
- struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft)
-{
- rcu_read_lock(&sched_res_rculock);
- sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft);
- rcu_read_unlock(&sched_res_rculock);
-
- if ( hard )
- cpumask_copy(unit->cpu_hard_affinity, hard);
- if ( soft )
- cpumask_copy(unit->cpu_soft_affinity, soft);
-
- unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity,
- unit->cpu_soft_affinity) &&
- cpumask_intersects(unit->cpu_soft_affinity,
- unit->cpu_hard_affinity);
-}
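To illustrate the soft_aff_effective computation above with example values (not taken from the source): with hard affinity {0-3} and soft affinity {2-5}, the hard mask is not a subset of the soft mask and the two intersect, so the soft affinity genuinely narrows the choice and the flag is true; with hard affinity {2,3} and soft affinity {0-7}, every hard CPU is already a preferred one, honouring the soft affinity changes nothing, and the flag stays false.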
-
-static int vcpu_set_affinity(
- struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which)
-{
- struct sched_unit *unit = v->sched_unit;
- spinlock_t *lock;
- int ret = 0;
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = unit_schedule_lock_irq(unit);
-
- if ( v->affinity_broken )
- ret = -EBUSY;
- else
- {
- /*
- * Tell the scheduler we changed something about affinity,
- * and ask to re-evaluate vcpu placement.
- */
- if ( which == unit->cpu_hard_affinity )
- {
- sched_set_affinity(unit, affinity, NULL);
- }
- else
- {
- ASSERT(which == unit->cpu_soft_affinity);
- sched_set_affinity(unit, NULL, affinity);
- }
- sched_unit_migrate_start(unit);
- }
-
- unit_schedule_unlock_irq(lock, unit);
-
- domain_update_node_affinity(v->domain);
-
- sched_unit_migrate_finish(unit);
-
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
-int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
-{
- cpumask_t online_affinity;
- cpumask_t *online;
-
- online = VCPU2ONLINE(v);
- cpumask_and(&online_affinity, affinity, online);
- if ( cpumask_empty(&online_affinity) )
- return -EINVAL;
-
- return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity);
-}
-
-int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
-{
- return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity);
-}
-
-/* Block the currently-executing domain until a pertinent event occurs. */
-void vcpu_block(void)
-{
- struct vcpu *v = current;
-
- set_bit(_VPF_blocked, &v->pause_flags);
-
- arch_vcpu_block(v);
-
- /* Check for events /after/ blocking: avoids wakeup waiting race. */
- if ( local_events_need_delivery() )
- {
- clear_bit(_VPF_blocked, &v->pause_flags);
- }
- else
- {
- TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
- raise_softirq(SCHEDULE_SOFTIRQ);
- }
-}
-
-static void vcpu_block_enable_events(void)
-{
- local_event_delivery_enable();
- vcpu_block();
-}
-
-static long do_poll(struct sched_poll *sched_poll)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- evtchn_port_t port = 0;
- long rc;
- unsigned int i;
-
- /* Fairly arbitrary limit. */
- if ( sched_poll->nr_ports > 128 )
- return -EINVAL;
-
- if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
- return -EFAULT;
-
- set_bit(_VPF_blocked, &v->pause_flags);
- v->poll_evtchn = -1;
- set_bit(v->vcpu_id, d->poll_mask);
-
- arch_vcpu_block(v);
-
-#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
- /* Check for events /after/ setting flags: avoids wakeup waiting race. */
- smp_mb();
-
- /*
- * Someone may have seen we are blocked but not that we are polling, or
- * vice versa. We are certainly being woken, so clean up and bail. Beyond
- * this point others can be guaranteed to clean up for us if they wake us.
- */
- rc = 0;
- if ( (v->poll_evtchn == 0) ||
- !test_bit(_VPF_blocked, &v->pause_flags) ||
- !test_bit(v->vcpu_id, d->poll_mask) )
- goto out;
-#endif
-
- rc = 0;
- if ( local_events_need_delivery() )
- goto out;
-
- for ( i = 0; i < sched_poll->nr_ports; i++ )
- {
- rc = -EFAULT;
- if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
- goto out;
-
- rc = -EINVAL;
- if ( port >= d->max_evtchns )
- goto out;
-
- rc = 0;
- if ( evtchn_port_is_pending(d, port) )
- goto out;
- }
-
- if ( sched_poll->nr_ports == 1 )
- v->poll_evtchn = port;
-
- if ( sched_poll->timeout != 0 )
- set_timer(&v->poll_timer, sched_poll->timeout);
-
- TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
- raise_softirq(SCHEDULE_SOFTIRQ);
-
- return 0;
-
- out:
- v->poll_evtchn = 0;
- clear_bit(v->vcpu_id, d->poll_mask);
- clear_bit(_VPF_blocked, &v->pause_flags);
- return rc;
-}
-
-/* Voluntarily yield the processor for this allocation. */
-long vcpu_yield(void)
-{
- struct vcpu *v = current;
- spinlock_t *lock;
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = unit_schedule_lock_irq(v->sched_unit);
- sched_yield(vcpu_scheduler(v), v->sched_unit);
- unit_schedule_unlock_irq(lock, v->sched_unit);
-
- rcu_read_unlock(&sched_res_rculock);
-
- SCHED_STAT_CRANK(vcpu_yield);
-
- TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
- raise_softirq(SCHEDULE_SOFTIRQ);
- return 0;
-}
-
-static void domain_watchdog_timeout(void *data)
-{
- struct domain *d = data;
-
- if ( d->is_shutting_down || d->is_dying )
- return;
-
- printk("Watchdog timer fired for domain %u\n", d->domain_id);
- domain_shutdown(d, SHUTDOWN_watchdog);
-}
-
-static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
-{
- if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
- return -EINVAL;
-
- spin_lock(&d->watchdog_lock);
-
- if ( id == 0 )
- {
- for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
- {
- if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
- continue;
- set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
- break;
- }
- spin_unlock(&d->watchdog_lock);
- return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
- }
-
- id -= 1;
- if ( !test_bit(id, &d->watchdog_inuse_map) )
- {
- spin_unlock(&d->watchdog_lock);
- return -EINVAL;
- }
-
- if ( timeout == 0 )
- {
- stop_timer(&d->watchdog_timer[id]);
- clear_bit(id, &d->watchdog_inuse_map);
- }
- else
- {
- set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
- }
-
- spin_unlock(&d->watchdog_lock);
- return 0;
-}
-
-void watchdog_domain_init(struct domain *d)
-{
- unsigned int i;
-
- spin_lock_init(&d->watchdog_lock);
-
- d->watchdog_inuse_map = 0;
-
- for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
- init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
-}
-
-void watchdog_domain_destroy(struct domain *d)
-{
- unsigned int i;
-
- for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
- kill_timer(&d->watchdog_timer[i]);
-}
-
-/*
- * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
- * cpu is NR_CPUS).
- * Temporary pinning can be done for two reasons, which may be nested:
- * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
- * of a conflict (e.g. the cpupool doesn't include the requested CPU, or
- * another conflicting temporary pinning is already in effect).
- * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the
- * CPU it is just running on. Can't fail if used properly.
- */
-int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
-{
- struct sched_unit *unit = v->sched_unit;
- spinlock_t *lock;
- int ret = -EINVAL;
- bool migrate;
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = unit_schedule_lock_irq(unit);
-
- if ( cpu == NR_CPUS )
- {
- if ( v->affinity_broken & reason )
- {
- ret = 0;
- v->affinity_broken &= ~reason;
- }
- if ( !ret && !sched_check_affinity_broken(unit) )
- sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
- }
- else if ( cpu < nr_cpu_ids )
- {
- if ( (v->affinity_broken & reason) ||
- (sched_check_affinity_broken(unit) && v->processor != cpu) )
- ret = -EBUSY;
- else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
- {
- if ( !sched_check_affinity_broken(unit) )
- {
- cpumask_copy(unit->cpu_hard_affinity_saved,
- unit->cpu_hard_affinity);
- sched_set_affinity(unit, cpumask_of(cpu), NULL);
- }
- v->affinity_broken |= reason;
- ret = 0;
- }
- }
-
- migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity);
- if ( migrate )
- sched_unit_migrate_start(unit);
-
- unit_schedule_unlock_irq(lock, unit);
-
- if ( migrate )
- sched_unit_migrate_finish(unit);
-
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
-typedef long ret_t;
-
-#endif /* !COMPAT */
-
-ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
-{
- ret_t ret = 0;
-
- switch ( cmd )
- {
- case SCHEDOP_yield:
- {
- ret = vcpu_yield();
- break;
- }
-
- case SCHEDOP_block:
- {
- vcpu_block_enable_events();
- break;
- }
-
- case SCHEDOP_shutdown:
- {
- struct sched_shutdown sched_shutdown;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_shutdown, arg, 1) )
- break;
-
- TRACE_3D(TRC_SCHED_SHUTDOWN,
- current->domain->domain_id, current->vcpu_id,
- sched_shutdown.reason);
- ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
-
- break;
- }
-
- case SCHEDOP_shutdown_code:
- {
- struct sched_shutdown sched_shutdown;
- struct domain *d = current->domain;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_shutdown, arg, 1) )
- break;
-
- TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
- d->domain_id, current->vcpu_id, sched_shutdown.reason);
-
- spin_lock(&d->shutdown_lock);
- if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
- d->shutdown_code = (u8)sched_shutdown.reason;
- spin_unlock(&d->shutdown_lock);
-
- ret = 0;
- break;
- }
-
- case SCHEDOP_poll:
- {
- struct sched_poll sched_poll;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_poll, arg, 1) )
- break;
-
- ret = do_poll(&sched_poll);
-
- break;
- }
-
- case SCHEDOP_remote_shutdown:
- {
- struct domain *d;
- struct sched_remote_shutdown sched_remote_shutdown;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
- break;
-
- ret = -ESRCH;
- d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
- if ( d == NULL )
- break;
-
- ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
- if ( likely(!ret) )
- domain_shutdown(d, sched_remote_shutdown.reason);
-
- rcu_unlock_domain(d);
-
- break;
- }
-
- case SCHEDOP_watchdog:
- {
- struct sched_watchdog sched_watchdog;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_watchdog, arg, 1) )
- break;
-
- ret = domain_watchdog(
- current->domain, sched_watchdog.id, sched_watchdog.timeout);
- break;
- }
-
- case SCHEDOP_pin_override:
- {
- struct sched_pin_override sched_pin_override;
- unsigned int cpu;
-
- ret = -EPERM;
- if ( !is_hardware_domain(current->domain) )
- break;
-
- ret = -EFAULT;
- if ( copy_from_guest(&sched_pin_override, arg, 1) )
- break;
-
- ret = -EINVAL;
- if ( sched_pin_override.pcpu >= NR_CPUS )
- break;
-
- cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu;
- ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE);
-
- break;
- }
-
- default:
- ret = -ENOSYS;
- }
-
- return ret;
-}
-
-#ifndef COMPAT
-
-/* Per-vcpu oneshot-timer hypercall. */
-long do_set_timer_op(s_time_t timeout)
-{
- struct vcpu *v = current;
- s_time_t offset = timeout - NOW();
-
- if ( timeout == 0 )
- {
- stop_timer(&v->singleshot_timer);
- }
- else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
- unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
- {
- /*
- * Linux workaround: occasionally we will see timeouts a long way in
- * the future due to wrapping in Linux's jiffy time handling. We check
- * for timeouts wrapped negative, and for positive timeouts more than
- * about 13 days in the future (2^50ns). The correct fix is to trigger
- * an interrupt immediately (since Linux in fact has pending work to
- * do in this situation). However, older guests also set a long timeout
- * when they have *no* pending timers at all: setting an immediate
- * timeout in this case can burn a lot of CPU. We therefore go for a
- * reasonable middle ground of triggering a timer event in 100ms.
- */
- gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
- timeout);
- set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
- }
- else
- {
- migrate_timer(&v->singleshot_timer, smp_processor_id());
- set_timer(&v->singleshot_timer, timeout);
- }
-
- return 0;
-}
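A quick sanity check of the "about 13 days" figure in the comment above (plain arithmetic, nothing Xen-specific): 2^50 ns = 1,125,899,906,842,624 ns ≈ 1.126 * 10^6 s, and 1.126 * 10^6 s / 86,400 s per day ≈ 13.0 days, so any positive offset with bit 50 or above set really is roughly two weeks or more in the future.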
-
-/* sched_id - fetch ID of current scheduler */
-int sched_id(void)
-{
- return ops.sched_id;
-}
-
-/* Adjust scheduling parameter for a given domain. */
-long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
-{
- long ret;
-
- ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
- if ( ret )
- return ret;
-
- if ( op->sched_id != dom_scheduler(d)->sched_id )
- return -EINVAL;
-
- switch ( op->cmd )
- {
- case XEN_DOMCTL_SCHEDOP_putinfo:
- case XEN_DOMCTL_SCHEDOP_getinfo:
- case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
- case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
- break;
- default:
- return -EINVAL;
- }
-
- /*
- * NB: the pluggable scheduler code needs to take care of locking by
- * itself.
- */
- rcu_read_lock(&sched_res_rculock);
-
- if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 )
- TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
-
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
-long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
-{
- struct cpupool *pool;
- int rc;
-
- rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
- if ( rc )
- return rc;
-
- if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
- (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
- return -EINVAL;
-
- pool = cpupool_get_by_id(op->cpupool_id);
- if ( pool == NULL )
- return -ESRCH;
-
- rcu_read_lock(&sched_res_rculock);
-
- rc = ((op->sched_id == pool->sched->sched_id)
- ? sched_adjust_cpupool(pool->sched, op) : -EINVAL);
-
- rcu_read_unlock(&sched_res_rculock);
-
- cpupool_put(pool);
-
- return rc;
-}
-
-static void vcpu_periodic_timer_work_locked(struct vcpu *v)
-{
- s_time_t now;
- s_time_t periodic_next_event;
-
- now = NOW();
- periodic_next_event = v->periodic_last_event + v->periodic_period;
-
- if ( now >= periodic_next_event )
- {
- send_timer_event(v);
- v->periodic_last_event = now;
- periodic_next_event = now + v->periodic_period;
- }
-
- migrate_timer(&v->periodic_timer, v->processor);
- set_timer(&v->periodic_timer, periodic_next_event);
-}
-
-static void vcpu_periodic_timer_work(struct vcpu *v)
-{
- if ( v->periodic_period == 0 )
- return;
-
- spin_lock(&v->periodic_timer_lock);
- if ( v->periodic_period )
- vcpu_periodic_timer_work_locked(v);
- spin_unlock(&v->periodic_timer_lock);
-}
-
-/*
- * Set the periodic timer of a vcpu.
- */
-void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value)
-{
- spin_lock(&v->periodic_timer_lock);
-
- stop_timer(&v->periodic_timer);
-
- v->periodic_period = value;
- if ( value )
- vcpu_periodic_timer_work_locked(v);
-
- spin_unlock(&v->periodic_timer_lock);
-}
-
-static void sched_switch_units(struct sched_resource *sr,
- struct sched_unit *next, struct sched_unit *prev,
- s_time_t now)
-{
- unsigned int cpu;
-
- ASSERT(unit_running(prev));
-
- if ( prev != next )
- {
- sr->curr = next;
- sr->prev = prev;
-
- TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
- prev->unit_id, now - prev->state_entry_time);
- TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
- next->unit_id,
- (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
- (now - next->state_entry_time) : 0, prev->next_time);
- TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
- next->domain->domain_id, next->unit_id);
-
- ASSERT(!unit_running(next));
-
- /*
- * NB. Don't add any trace records from here until the actual context
- * switch, else lost_records resume will not work properly.
- */
-
- ASSERT(!next->is_running);
- next->is_running = true;
- next->state_entry_time = now;
-
- if ( is_idle_unit(prev) )
- {
- prev->runstate_cnt[RUNSTATE_running] = 0;
- prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity;
- }
- if ( is_idle_unit(next) )
- {
- next->runstate_cnt[RUNSTATE_running] = sr->granularity;
- next->runstate_cnt[RUNSTATE_runnable] = 0;
- }
- }
-
- for_each_cpu ( cpu, sr->cpus )
- {
- struct vcpu *vprev = get_cpu_current(cpu);
- struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
-
- if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
- {
- vcpu_runstate_change(vprev,
- ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
- (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
- now);
- vcpu_runstate_change(vnext, vnext->new_state, now);
- }
-
- vnext->is_running = 1;
-
- if ( is_idle_vcpu(vnext) )
- vnext->sched_unit = next;
- }
-}
-
-static bool sched_tasklet_check_cpu(unsigned int cpu)
-{
- unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
-
- switch ( *tasklet_work )
- {
- case TASKLET_enqueued:
- set_bit(_TASKLET_scheduled, tasklet_work);
- /* fallthrough */
- case TASKLET_enqueued|TASKLET_scheduled:
- return true;
- break;
- case TASKLET_scheduled:
- clear_bit(_TASKLET_scheduled, tasklet_work);
- /* fallthrough */
- case 0:
- /* return false; */
- break;
- default:
- BUG();
- }
-
- return false;
-}
-
-static bool sched_tasklet_check(unsigned int cpu)
-{
- bool tasklet_work_scheduled = false;
- const cpumask_t *mask = get_sched_res(cpu)->cpus;
- unsigned int cpu_iter;
-
- for_each_cpu ( cpu_iter, mask )
- if ( sched_tasklet_check_cpu(cpu_iter) )
- tasklet_work_scheduled = true;
-
- return tasklet_work_scheduled;
-}
-
-static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
- unsigned int cpu)
-{
- struct sched_resource *sr = get_sched_res(cpu);
- struct scheduler *sched = sr->scheduler;
- struct sched_unit *next;
-
- /* get policy-specific decision on scheduling... */
- sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
-
- next = prev->next_task;
-
- if ( prev->next_time >= 0 ) /* -ve means no limit */
- set_timer(&sr->s_timer, now + prev->next_time);
-
- sched_switch_units(sr, next, prev, now);
-
- return next;
-}
-
-static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
-{
- /* Clear running flag /after/ writing context to memory. */
- smp_wmb();
-
- if ( vprev != vnext )
- vprev->is_running = 0;
-}
-
-static void unit_context_saved(struct sched_resource *sr)
-{
- struct sched_unit *unit = sr->prev;
-
- if ( !unit )
- return;
-
- unit->is_running = false;
- unit->state_entry_time = NOW();
- sr->prev = NULL;
-
- /* Check for migration request /after/ clearing running flag. */
- smp_mb();
-
- sched_context_saved(unit_scheduler(unit), unit);
-
- /* Idle never migrates and idle vcpus might belong to other units. */
- if ( !is_idle_unit(unit) )
- sched_unit_migrate_finish(unit);
-}
-
-/*
- * Rendezvous on end of context switch.
- * As no lock is protecting this rendezvous function we need to use atomic
- * access functions on the counter.
- * The counter will be 0 in case no rendezvous is needed. For the rendezvous
- * case it is initialised to the number of cpus to rendezvous plus 1. Each
- * member entering decrements the counter. The last one will decrement it to
- * 1 and perform the final needed action in that case (call of
- * unit_context_saved()), and then set the counter to zero. The other members
- * will wait until the counter becomes zero before they proceed.
- */
-void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
-{
- struct sched_unit *next = vnext->sched_unit;
- struct sched_resource *sr;
-
- rcu_read_lock(&sched_res_rculock);
-
- sr = get_sched_res(smp_processor_id());
-
- if ( atomic_read(&next->rendezvous_out_cnt) )
- {
- int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
-
- vcpu_context_saved(vprev, vnext);
-
- /* Call unit_context_saved() before releasing other waiters. */
- if ( cnt == 1 )
- {
- unit_context_saved(sr);
- atomic_set(&next->rendezvous_out_cnt, 0);
- }
- else
- while ( atomic_read(&next->rendezvous_out_cnt) )
- cpu_relax();
- }
- else
- {
- vcpu_context_saved(vprev, vnext);
- if ( sr->granularity == 1 )
- unit_context_saved(sr);
- }
-
- if ( is_idle_vcpu(vprev) && vprev != vnext )
- vprev->sched_unit = sr->sched_unit_idle;
-
- rcu_read_unlock(&sched_res_rculock);
-}
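The rendezvous-out handling above is essentially a countdown barrier with a single designated finisher. A minimal stand-alone sketch of that pattern follows; the function names are hypothetical, while atomic_t, atomic_dec_return(), atomic_set(), atomic_read() and cpu_relax() are the primitives used elsewhere in this file.

/*
 * Illustrative sketch only: the counter starts at number-of-participants
 * plus one; each participant decrements it. Whoever sees it drop to 1 runs
 * the final action exactly once and then releases the others by zeroing it.
 */
static void rendezvous_out(atomic_t *cnt, void (*final_action)(void))
{
    if ( atomic_dec_return(cnt) == 1 )
    {
        final_action();
        atomic_set(cnt, 0);     /* Let the waiting participants continue. */
    }
    else
    {
        while ( atomic_read(cnt) )
            cpu_relax();        /* Spin until the finisher has zeroed it. */
    }
}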
-
-static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
- bool reset_idle_unit, s_time_t now)
-{
- if ( unlikely(vprev == vnext) )
- {
- TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
- vnext->domain->domain_id, vnext->sched_unit->unit_id,
- now - vprev->runstate.state_entry_time,
- vprev->sched_unit->next_time);
- sched_context_switched(vprev, vnext);
-
- /*
- * We are switching from a non-idle to an idle unit.
- * A vcpu of the idle unit might have been running before due to
- * the guest vcpu being blocked. We must adjust the unit of the idle
- * vcpu which might have been set to the guest's one.
- */
- if ( reset_idle_unit )
- vnext->sched_unit =
- get_sched_res(smp_processor_id())->sched_unit_idle;
-
- rcu_read_unlock(&sched_res_rculock);
-
- trace_continue_running(vnext);
- return continue_running(vprev);
- }
-
- SCHED_STAT_CRANK(sched_ctx);
-
- stop_timer(&vprev->periodic_timer);
-
- if ( vnext->sched_unit->migrated )
- vcpu_move_irqs(vnext);
-
- vcpu_periodic_timer_work(vnext);
-
- rcu_read_unlock(&sched_res_rculock);
-
- context_switch(vprev, vnext);
-}
-
-/*
- * Force a context switch of a single vcpu of a unit.
- * Might be called either if a vcpu of an already running unit is woken up,
- * or if a vcpu of a running unit is put to sleep with other vcpus of the
- * same unit still running.
- * Returns either NULL if v is already in the correct state or the vcpu to
- * run next.
- */
-static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
- struct vcpu *v,
- unsigned int cpu, s_time_t now)
-{
- v->force_context_switch = false;
-
- if ( vcpu_runnable(v) == v->is_running )
- return NULL;
-
- if ( vcpu_runnable(v) )
- {
- if ( is_idle_vcpu(vprev) )
- {
- vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
- vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
- }
- vcpu_runstate_change(v, RUNSTATE_running, now);
- }
- else
- {
- /* Make sure not to switch the last vcpu of a unit away. */
- if ( unit_running(v->sched_unit) == 1 )
- return NULL;
-
- v->new_state = vcpu_runstate_blocked(v);
- vcpu_runstate_change(v, v->new_state, now);
- v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
- if ( v != vprev )
- {
- if ( is_idle_vcpu(vprev) )
- {
- vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
- vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
- }
- else
- {
- v->sched_unit = vprev->sched_unit;
- vcpu_runstate_change(v, RUNSTATE_running, now);
- }
- }
- }
-
- /* This vcpu will be switched to. */
- v->is_running = true;
-
- /* Make sure not to lose another slave call. */
- raise_softirq(SCHED_SLAVE_SOFTIRQ);
-
- return v;
-}
-
-/*
- * Rendezvous before taking a scheduling decision.
- * Called with schedule lock held, so all accesses to the rendezvous counter
- * can be normal ones (no atomic accesses needed).
- * The counter is initialized to the number of cpus which need to rendezvous.
- * Each cpu entering will decrement the counter. When the counter reaches
- * zero, do_schedule() is called and the rendezvous counter for leaving
- * context_switch() is set. All other members will wait until the counter
- * reaches zero, dropping the schedule lock in between.
- */
-static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
- spinlock_t **lock, int cpu,
- s_time_t now)
-{
- struct sched_unit *next;
- struct vcpu *v;
- unsigned int gran = get_sched_res(cpu)->granularity;
-
- if ( !--prev->rendezvous_in_cnt )
- {
- next = do_schedule(prev, now, cpu);
- atomic_set(&next->rendezvous_out_cnt, gran + 1);
- return next;
- }
-
- v = unit2vcpu_cpu(prev, cpu);
- while ( prev->rendezvous_in_cnt )
- {
- if ( v && v->force_context_switch )
- {
- struct vcpu *vprev = current;
-
- v = sched_force_context_switch(vprev, v, cpu, now);
-
- if ( v )
- {
- /* We'll come back another time, so adjust rendezvous_in_cnt. */
- prev->rendezvous_in_cnt++;
- atomic_set(&prev->rendezvous_out_cnt, 0);
-
- pcpu_schedule_unlock_irq(*lock, cpu);
-
- sched_context_switch(vprev, v, false, now);
-
- return NULL; /* ARM only. */
- }
-
- v = unit2vcpu_cpu(prev, cpu);
- }
- /*
- * Coming from idle might need to do tasklet work.
- * In order to avoid deadlocks we can't do that here, but have to
- * continue the idle loop.
- * Undo the rendezvous_in_cnt decrement and schedule another call of
- * sched_slave().
- */
- if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) )
- {
- struct vcpu *vprev = current;
-
- prev->rendezvous_in_cnt++;
- atomic_set(&prev->rendezvous_out_cnt, 0);
-
- pcpu_schedule_unlock_irq(*lock, cpu);
-
- raise_softirq(SCHED_SLAVE_SOFTIRQ);
- sched_context_switch(vprev, vprev, false, now);
-
- return NULL; /* ARM only. */
- }
-
- pcpu_schedule_unlock_irq(*lock, cpu);
-
- cpu_relax();
-
- *lock = pcpu_schedule_lock_irq(cpu);
-
- if ( unlikely(!scheduler_active) )
- {
- ASSERT(is_idle_unit(prev));
- atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
- prev->rendezvous_in_cnt = 0;
- }
- }
-
- return prev->next_task;
-}
-
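/*
 * Illustrative sketch only, not part of the moved code: the rendezvous-in
 * idea above, modelled with a pthread mutex. The counter needs no atomics
 * because it is only touched with the lock held; waiters drop the lock while
 * spinning so the last member can make progress. All names (rdv_lock,
 * rendezvous_in, take_decision, rendezvous_in_enter) are hypothetical.
 */
#include <pthread.h>
#include <sched.h>

static pthread_mutex_t rdv_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int rendezvous_in;           /* set to the member count first */

static void take_decision(void)
{
    /* Stands in for do_schedule() on the last cpu to arrive. */
}

static void rendezvous_in_enter(void)
{
    pthread_mutex_lock(&rdv_lock);

    if ( !--rendezvous_in )
    {
        take_decision();                     /* the last member decides */
        pthread_mutex_unlock(&rdv_lock);
        return;
    }

    while ( rendezvous_in )
    {
        pthread_mutex_unlock(&rdv_lock);     /* don't block the last member */
        sched_yield();
        pthread_mutex_lock(&rdv_lock);
    }

    pthread_mutex_unlock(&rdv_lock);
}
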
-static void sched_slave(void)
-{
- struct vcpu *v, *vprev = current;
- struct sched_unit *prev = vprev->sched_unit, *next;
- s_time_t now;
- spinlock_t *lock;
- bool do_softirq = false;
- unsigned int cpu = smp_processor_id();
-
- ASSERT_NOT_IN_ATOMIC();
-
- rcu_read_lock(&sched_res_rculock);
-
- lock = pcpu_schedule_lock_irq(cpu);
-
- now = NOW();
-
- v = unit2vcpu_cpu(prev, cpu);
- if ( v && v->force_context_switch )
- {
- v = sched_force_context_switch(vprev, v, cpu, now);
-
- if ( v )
- {
- pcpu_schedule_unlock_irq(lock, cpu);
-
- sched_context_switch(vprev, v, false, now);
-
- return;
- }
-
- do_softirq = true;
- }
-
- if ( !prev->rendezvous_in_cnt )
- {
- pcpu_schedule_unlock_irq(lock, cpu);
-
- rcu_read_unlock(&sched_res_rculock);
-
- /* Check for failed forced context switch. */
- if ( do_softirq )
- raise_softirq(SCHEDULE_SOFTIRQ);
-
- return;
- }
-
- stop_timer(&get_sched_res(cpu)->s_timer);
-
- next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
- if ( !next )
- return;
-
- pcpu_schedule_unlock_irq(lock, cpu);
-
- sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
- is_idle_unit(next) && !is_idle_unit(prev), now);
-}
-
-/*
- * The main function
- * - deschedule the current domain (scheduler independent).
- * - pick a new domain (scheduler dependent).
- */
-static void schedule(void)
-{
- struct vcpu *vnext, *vprev = current;
- struct sched_unit *prev = vprev->sched_unit, *next = NULL;
- s_time_t now;
- struct sched_resource *sr;
- spinlock_t *lock;
- int cpu = smp_processor_id();
- unsigned int gran;
-
- ASSERT_NOT_IN_ATOMIC();
-
- SCHED_STAT_CRANK(sched_run);
-
- rcu_read_lock(&sched_res_rculock);
-
- sr = get_sched_res(cpu);
- gran = sr->granularity;
-
- lock = pcpu_schedule_lock_irq(cpu);
-
- if ( prev->rendezvous_in_cnt )
- {
- /*
- * We have a race: sched_slave() should be called, so raise a softirq
- * in order to re-enter schedule() later and call sched_slave() now.
- */
- pcpu_schedule_unlock_irq(lock, cpu);
-
- rcu_read_unlock(&sched_res_rculock);
-
- raise_softirq(SCHEDULE_SOFTIRQ);
- return sched_slave();
- }
-
- stop_timer(&sr->s_timer);
-
- now = NOW();
-
- if ( gran > 1 )
- {
- cpumask_t mask;
-
- prev->rendezvous_in_cnt = gran;
- cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu));
- cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ);
- next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
- if ( !next )
- return;
- }
- else
- {
- prev->rendezvous_in_cnt = 0;
- next = do_schedule(prev, now, cpu);
- atomic_set(&next->rendezvous_out_cnt, 0);
- }
-
- pcpu_schedule_unlock_irq(lock, cpu);
-
- vnext = sched_unit2vcpu_cpu(next, cpu);
- sched_context_switch(vprev, vnext,
- !is_idle_unit(prev) && is_idle_unit(next), now);
-}
-
-/* The scheduler timer: force a run through the scheduler */
-static void s_timer_fn(void *unused)
-{
- raise_softirq(SCHEDULE_SOFTIRQ);
- SCHED_STAT_CRANK(sched_irq);
-}
-
-/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
-static void vcpu_periodic_timer_fn(void *data)
-{
- struct vcpu *v = data;
- vcpu_periodic_timer_work(v);
-}
-
-/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
-static void vcpu_singleshot_timer_fn(void *data)
-{
- struct vcpu *v = data;
- send_timer_event(v);
-}
-
-/* SCHEDOP_poll timeout callback. */
-static void poll_timer_fn(void *data)
-{
- struct vcpu *v = data;
-
- if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
- vcpu_unblock(v);
-}
-
-static struct sched_resource *sched_alloc_res(void)
-{
- struct sched_resource *sr;
-
- sr = xzalloc(struct sched_resource);
- if ( sr == NULL )
- return NULL;
- if ( !zalloc_cpumask_var(&sr->cpus) )
- {
- xfree(sr);
- return NULL;
- }
- return sr;
-}
-
-static int cpu_schedule_up(unsigned int cpu)
-{
- struct sched_resource *sr;
-
- sr = sched_alloc_res();
- if ( sr == NULL )
- return -ENOMEM;
-
- sr->master_cpu = cpu;
- cpumask_copy(sr->cpus, cpumask_of(cpu));
- set_sched_res(cpu, sr);
-
- sr->scheduler = &sched_idle_ops;
- spin_lock_init(&sr->_lock);
- sr->schedule_lock = &sched_free_cpu_lock;
- init_timer(&sr->s_timer, s_timer_fn, NULL, cpu);
- atomic_set(&per_cpu(sched_urgent_count, cpu), 0);
-
- /* We start with cpu granularity. */
- sr->granularity = 1;
-
- cpumask_set_cpu(cpu, &sched_res_mask);
-
- /* Boot CPU is dealt with later in scheduler_init(). */
- if ( cpu == 0 )
- return 0;
-
- if ( idle_vcpu[cpu] == NULL )
- vcpu_create(idle_vcpu[0]->domain, cpu);
- else
- idle_vcpu[cpu]->sched_unit->res = sr;
-
- if ( idle_vcpu[cpu] == NULL )
- return -ENOMEM;
-
- idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
-
- /*
- * No need to allocate any scheduler data, as cpus coming online are
- * free initially and the idle scheduler doesn't need any data areas
- * allocated.
- */
-
- sr->curr = idle_vcpu[cpu]->sched_unit;
- sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
-
- sr->sched_priv = NULL;
-
- return 0;
-}
-
-static void sched_res_free(struct rcu_head *head)
-{
- struct sched_resource *sr = container_of(head, struct sched_resource, rcu);
-
- free_cpumask_var(sr->cpus);
- if ( sr->sched_unit_idle )
- sched_free_unit_mem(sr->sched_unit_idle);
- xfree(sr);
-}
-
-static void cpu_schedule_down(unsigned int cpu)
-{
- struct sched_resource *sr;
-
- rcu_read_lock(&sched_res_rculock);
-
- sr = get_sched_res(cpu);
-
- kill_timer(&sr->s_timer);
-
- cpumask_clear_cpu(cpu, &sched_res_mask);
- set_sched_res(cpu, NULL);
-
- /* Keep idle unit. */
- sr->sched_unit_idle = NULL;
- call_rcu(&sr->rcu, sched_res_free);
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-void sched_rm_cpu(unsigned int cpu)
-{
- int rc;
-
- rcu_read_lock(&domlist_read_lock);
- rc = cpu_disable_scheduler(cpu);
- BUG_ON(rc);
- rcu_read_unlock(&domlist_read_lock);
- cpu_schedule_down(cpu);
-}
-
-static int cpu_schedule_callback(
- struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int rc = 0;
-
- /*
- * All scheduler related suspend/resume handling needed is done in
- * cpupool.c.
- */
- if ( system_state > SYS_STATE_active )
- return NOTIFY_DONE;
-
- rcu_read_lock(&sched_res_rculock);
-
- /*
- * From the scheduler perspective, bringing up a pCPU requires
- * allocating and initializing the per-pCPU scheduler specific data,
- * as well as "registering" this pCPU to the scheduler (which may
- * involve modifying some scheduler wide data structures).
- * As new pCPUs always start as "free" cpus with the minimal idle
- * scheduler being in charge, we don't need any of that.
- *
- * On the other hand, at teardown, we need to reverse what has been done
- * during initialization, and then free the per-pCPU specific data. A
- * pCPU brought down is not forced through "free" cpus, so here we need to
- * use the appropriate hooks.
- *
- * This happens by calling the deinit_pdata and free_pdata hooks, in this
- * order. If no per-pCPU memory was allocated, there is no need to
- * provide an implementation of free_pdata. deinit_pdata may, however,
- * be necessary/useful in this case too (e.g., it can undo something done
- * on scheduler wide data structure during init_pdata). Both deinit_pdata
- * and free_pdata are called during CPU_DEAD.
- *
- * If something goes wrong during bringup, we go to CPU_UP_CANCELLED.
- */
- switch ( action )
- {
- case CPU_UP_PREPARE:
- rc = cpu_schedule_up(cpu);
- break;
- case CPU_DOWN_PREPARE:
- rcu_read_lock(&domlist_read_lock);
- rc = cpu_disable_scheduler_check(cpu);
- rcu_read_unlock(&domlist_read_lock);
- break;
- case CPU_DEAD:
- sched_rm_cpu(cpu);
- break;
- case CPU_UP_CANCELED:
- cpu_schedule_down(cpu);
- break;
- default:
- break;
- }
-
- rcu_read_unlock(&sched_res_rculock);
-
- return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
-}
-
-static struct notifier_block cpu_schedule_nfb = {
- .notifier_call = cpu_schedule_callback
-};
-
-const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu)
-{
- const cpumask_t *mask;
-
- switch ( opt )
- {
- case SCHED_GRAN_cpu:
- mask = cpumask_of(cpu);
- break;
- case SCHED_GRAN_core:
- mask = per_cpu(cpu_sibling_mask, cpu);
- break;
- case SCHED_GRAN_socket:
- mask = per_cpu(cpu_core_mask, cpu);
- break;
- default:
- ASSERT_UNREACHABLE();
- return NULL;
- }
-
- return mask;
-}
-
-static void schedule_dummy(void)
-{
- sched_tasklet_check_cpu(smp_processor_id());
-}
-
-void scheduler_disable(void)
-{
- scheduler_active = false;
- open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy);
- open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy);
-}
-
-void scheduler_enable(void)
-{
- open_softirq(SCHEDULE_SOFTIRQ, schedule);
- open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
- scheduler_active = true;
-}
-
-/* Initialise the data structures. */
-void __init scheduler_init(void)
-{
- struct domain *idle_domain;
- int i;
-
- scheduler_enable();
-
- for ( i = 0; i < NUM_SCHEDULERS; i++)
- {
-#define sched_test_func(f) \
- if ( !schedulers[i]->f ) \
- { \
- printk("scheduler %s misses .%s, dropped\n", \
- schedulers[i]->opt_name, #f); \
- schedulers[i] = NULL; \
- }
-
- sched_test_func(init);
- sched_test_func(deinit);
- sched_test_func(pick_resource);
- sched_test_func(alloc_udata);
- sched_test_func(free_udata);
- sched_test_func(switch_sched);
- sched_test_func(do_schedule);
-
-#undef sched_test_func
-
- if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
- {
- printk("scheduler %s failed initialization, dropped\n",
- schedulers[i]->opt_name);
- schedulers[i] = NULL;
- }
-
- if ( schedulers[i] && !ops.name &&
- !strcmp(schedulers[i]->opt_name, opt_sched) )
- ops = *schedulers[i];
- }
-
- if ( !ops.name )
- {
- printk("Could not find scheduler: %s\n", opt_sched);
- for ( i = 0; i < NUM_SCHEDULERS; i++ )
- if ( schedulers[i] &&
- !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
- {
- ops = *schedulers[i];
- break;
- }
- BUG_ON(!ops.name);
- printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
- }
-
- if ( cpu_schedule_up(0) )
- BUG();
- register_cpu_notifier(&cpu_schedule_nfb);
-
- printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
- if ( sched_init(&ops) )
- panic("scheduler returned error on init\n");
-
- if ( sched_ratelimit_us &&
- (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
- || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
- {
- printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
- " Resetting to default %u\n",
- XEN_SYSCTL_SCHED_RATELIMIT_MIN,
- XEN_SYSCTL_SCHED_RATELIMIT_MAX,
- SCHED_DEFAULT_RATELIMIT_US);
- sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
- }
-
- idle_domain = domain_create(DOMID_IDLE, NULL, false);
- BUG_ON(IS_ERR(idle_domain));
- BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu));
- idle_domain->vcpu = idle_vcpu;
- idle_domain->max_vcpus = nr_cpu_ids;
- if ( vcpu_create(idle_domain, 0) == NULL )
- BUG();
-
- rcu_read_lock(&sched_res_rculock);
-
- get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
- get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
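/*
 * Illustrative note only, not part of the moved code: opt_sched above is
 * normally set from the hypervisor command line; assuming the usual "sched="
 * parameter backing it, booting with e.g.
 *
 *     sched=rtds
 *
 * selects the RTDS scheduler, while the fallback loop above reverts to
 * CONFIG_SCHED_DEFAULT if the requested scheduler was not built in or was
 * dropped after a failed global initialisation.
 */
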
-/*
- * Move a pCPU from free cpus (running the idle scheduler) to a cpupool
- * using any "real" scheduler.
- * The cpu is still marked as "free" and not yet valid for its cpupool.
- */
-int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
-{
- struct vcpu *idle;
- void *ppriv, *vpriv;
- struct scheduler *new_ops = c->sched;
- struct sched_resource *sr;
- spinlock_t *old_lock, *new_lock;
- unsigned long flags;
- int ret = 0;
-
- rcu_read_lock(&sched_res_rculock);
-
- sr = get_sched_res(cpu);
-
- ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
- ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid));
- ASSERT(get_sched_res(cpu)->cpupool == NULL);
-
- /*
- * To set up the cpu for the new scheduler we need:
- * - a valid instance of per-CPU scheduler specific data, as it is
- * allocated by sched_alloc_pdata(). Note that we do not want to
- * initialize it yet (i.e., we are not calling sched_init_pdata()).
- * That will be done by the target scheduler, in sched_switch_sched(),
- * in proper ordering and with locking.
- * - a valid instance of per-vCPU scheduler specific data, for the idle
- * vCPU of cpu. That is what the target scheduler will use for the
- * sched_priv field of the per-vCPU info of the idle domain.
- */
- idle = idle_vcpu[cpu];
- ppriv = sched_alloc_pdata(new_ops, cpu);
- if ( IS_ERR(ppriv) )
- {
- ret = PTR_ERR(ppriv);
- goto out;
- }
-
- vpriv = sched_alloc_udata(new_ops, idle->sched_unit,
- idle->domain->sched_priv);
- if ( vpriv == NULL )
- {
- sched_free_pdata(new_ops, ppriv, cpu);
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * The actual switch, including the rerouting of the scheduler lock to
- * whatever new_ops prefers, needs to happen in one critical section,
- * protected by old_ops' lock, or races are possible.
- * It is, in fact, the lock of the idle scheduler that we are taking.
- * But that is ok, as anyone trying to schedule on this cpu will spin until
- * we release that lock (at the bottom of this function). When they finally
- * get the lock (thanks to the loop inside the *_schedule_lock() functions),
- * they will notice that the lock itself changed, and retry acquiring the
- * new one (which will be the correct, remapped one, at that point).
- */
- old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
-
- if ( cpupool_get_granularity(c) > 1 )
- {
- const cpumask_t *mask;
- unsigned int cpu_iter, idx = 0;
- struct sched_unit *old_unit, *master_unit;
- struct sched_resource *sr_old;
-
- /*
- * We need to merge multiple idle_vcpu units and sched_resource structs
- * into one. As the free cpus all share the same lock we are fine doing
- * that now. The worst that could happen would be someone waiting for
- * the lock, thus dereferencing sched_res->schedule_lock. This is the
- * reason we free struct sched_res via call_rcu(): it avoids the lock
- * pointer suddenly disappearing.
- */
- mask = sched_get_opt_cpumask(c->gran, cpu);
- master_unit = idle_vcpu[cpu]->sched_unit;
-
- for_each_cpu ( cpu_iter, mask )
- {
- if ( idx )
- cpumask_clear_cpu(cpu_iter, &sched_res_mask);
-
- per_cpu(sched_res_idx, cpu_iter) = idx++;
-
- if ( cpu == cpu_iter )
- continue;
-
- old_unit = idle_vcpu[cpu_iter]->sched_unit;
- sr_old = get_sched_res(cpu_iter);
- kill_timer(&sr_old->s_timer);
- idle_vcpu[cpu_iter]->sched_unit = master_unit;
- master_unit->runstate_cnt[RUNSTATE_running]++;
- set_sched_res(cpu_iter, sr);
- cpumask_set_cpu(cpu_iter, sr->cpus);
-
- call_rcu(&sr_old->rcu, sched_res_free);
- }
- }
-
- new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
-
- sr->scheduler = new_ops;
- sr->sched_priv = ppriv;
-
- /*
- * Reroute the lock to the per pCPU lock as the /last/ thing. In fact,
- * if it is free (and it can be) we want anyone who manages to take it
- * to find all the initializations we've done above in place.
- */
- smp_wmb();
- sr->schedule_lock = new_lock;
-
- /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
- spin_unlock_irqrestore(old_lock, flags);
-
- sr->granularity = cpupool_get_granularity(c);
- sr->cpupool = c;
- /* The cpu is added to a pool, trigger it to go pick up some work */
- cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
-
-out:
- rcu_read_unlock(&sched_res_rculock);
-
- return ret;
-}
-
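/*
 * Illustrative sketch only, not part of the moved code: the lock-repointing
 * pattern relied upon above, modelled with pthread mutexes. A per-resource
 * pointer selects the active lock; acquirers re-check the pointer after
 * locking and retry if it was rerouted in the meantime, which is what the
 * *_schedule_lock() helpers do for sr->schedule_lock. The names below
 * (struct resource, resource_lock) are hypothetical.
 */
#include <pthread.h>

struct resource {
    pthread_mutex_t *volatile lock;   /* may be re-pointed while others spin */
};

static pthread_mutex_t *resource_lock(struct resource *r)
{
    for ( ; ; )
    {
        pthread_mutex_t *l = r->lock;

        pthread_mutex_lock(l);
        if ( l == r->lock )           /* still the right lock: done */
            return l;
        pthread_mutex_unlock(l);      /* lock was rerouted: retry */
    }
}
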
-/*
- * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
- * (the idle scheduler).
- * The cpu is already marked as "free" and not valid any longer for its
- * cpupool.
- */
-int schedule_cpu_rm(unsigned int cpu)
-{
- void *ppriv_old, *vpriv_old;
- struct sched_resource *sr, **sr_new = NULL;
- struct sched_unit *unit;
- struct scheduler *old_ops;
- spinlock_t *old_lock;
- unsigned long flags;
- int idx, ret = -ENOMEM;
- unsigned int cpu_iter;
-
- rcu_read_lock(&sched_res_rculock);
-
- sr = get_sched_res(cpu);
- old_ops = sr->scheduler;
-
- if ( sr->granularity > 1 )
- {
- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
- if ( !sr_new )
- goto out;
- for ( idx = 0; idx < sr->granularity - 1; idx++ )
- {
- sr_new[idx] = sched_alloc_res();
- if ( sr_new[idx] )
- {
- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
- if ( !sr_new[idx]->sched_unit_idle )
- {
- sched_res_free(&sr_new[idx]->rcu);
- sr_new[idx] = NULL;
- }
- }
- if ( !sr_new[idx] )
- {
- for ( idx--; idx >= 0; idx-- )
- sched_res_free(&sr_new[idx]->rcu);
- goto out;
- }
- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
- sr_new[idx]->scheduler = &sched_idle_ops;
- sr_new[idx]->granularity = 1;
-
- /* We want the lock not to change when replacing the resource. */
- sr_new[idx]->schedule_lock = sr->schedule_lock;
- }
- }
-
- ret = 0;
- ASSERT(sr->cpupool != NULL);
- ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
- ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
-
- /* See comment in schedule_cpu_add() regarding lock switching. */
- old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
-
- vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
- ppriv_old = sr->sched_priv;
-
- idx = 0;
- for_each_cpu ( cpu_iter, sr->cpus )
- {
- per_cpu(sched_res_idx, cpu_iter) = 0;
- if ( cpu_iter == cpu )
- {
- idle_vcpu[cpu_iter]->sched_unit->priv = NULL;
- }
- else
- {
- /* Initialize unit. */
- unit = sr_new[idx]->sched_unit_idle;
- unit->res = sr_new[idx];
- unit->is_running = true;
- sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
- sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
-
- /* Adjust cpu masks of resources (old and new). */
- cpumask_clear_cpu(cpu_iter, sr->cpus);
- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
-
- /* Init timer. */
- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
-
- /* Last resource initializations and insert resource pointer. */
- sr_new[idx]->master_cpu = cpu_iter;
- set_sched_res(cpu_iter, sr_new[idx]);
-
- /* Last action: set the new lock pointer. */
- smp_mb();
- sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
-
- idx++;
- }
- }
- sr->scheduler = &sched_idle_ops;
- sr->sched_priv = NULL;
-
- smp_mb();
- sr->schedule_lock = &sched_free_cpu_lock;
-
- /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
- spin_unlock_irqrestore(old_lock, flags);
-
- sched_deinit_pdata(old_ops, ppriv_old, cpu);
-
- sched_free_udata(old_ops, vpriv_old);
- sched_free_pdata(old_ops, ppriv_old, cpu);
-
- sr->granularity = 1;
- sr->cpupool = NULL;
-
-out:
- rcu_read_unlock(&sched_res_rculock);
- xfree(sr_new);
-
- return ret;
-}
-
-struct scheduler *scheduler_get_default(void)
-{
- return &ops;
-}
-
-struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
-{
- int i;
- struct scheduler *sched;
-
- for ( i = 0; i < NUM_SCHEDULERS; i++ )
- if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
- goto found;
- *perr = -ENOENT;
- return NULL;
-
- found:
- *perr = -ENOMEM;
- if ( (sched = xmalloc(struct scheduler)) == NULL )
- return NULL;
- memcpy(sched, schedulers[i], sizeof(*sched));
- if ( (*perr = sched_init(sched)) != 0 )
- {
- xfree(sched);
- sched = NULL;
- }
-
- return sched;
-}
-
-void scheduler_free(struct scheduler *sched)
-{
- BUG_ON(sched == &ops);
- sched_deinit(sched);
- xfree(sched);
-}
-
-void schedule_dump(struct cpupool *c)
-{
- unsigned int i;
- struct scheduler *sched;
- cpumask_t *cpus;
-
- /* Locking, if necessary, must be handled within each scheduler */
-
- rcu_read_lock(&sched_res_rculock);
-
- if ( c != NULL )
- {
- sched = c->sched;
- cpus = c->cpu_valid;
- printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
- sched_dump_settings(sched);
- }
- else
- {
- sched = &ops;
- cpus = &cpupool_free_cpus;
- }
-
- if ( sched->dump_cpu_state != NULL )
- {
- printk("CPUs info:\n");
- for_each_cpu (i, cpus)
- sched_dump_cpu_state(sched, i);
- }
-
- rcu_read_unlock(&sched_res_rculock);
-}
-
-void sched_tick_suspend(void)
-{
- rcu_idle_enter(smp_processor_id());
- rcu_idle_timer_start();
-}
-
-void sched_tick_resume(void)
-{
- rcu_idle_timer_stop();
- rcu_idle_exit(smp_processor_id());
-}
-
-void wait(void)
-{
- schedule();
-}
-
-#ifdef CONFIG_X86
-void __init sched_setup_dom0_vcpus(struct domain *d)
-{
- unsigned int i;
- struct sched_unit *unit;
-
- for ( i = 1; i < d->max_vcpus; i++ )
- vcpu_create(d, i);
-
- /*
- * PV-shim: vcpus are pinned 1:1.
- * Initially only 1 cpu is online; the others will be dealt with when they
- * are onlined. This avoids pinning a vcpu to a not yet online cpu here.
- */
- if ( pv_shim )
- sched_set_affinity(d->vcpu[0]->sched_unit,
- cpumask_of(0), cpumask_of(0));
- else
- {
- for_each_sched_unit ( d, unit )
- {
- if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed )
- sched_set_affinity(unit, &dom0_cpus, NULL);
- sched_set_affinity(unit, NULL, &dom0_cpus);
- }
- }
-
- domain_update_node_affinity(d);
-}
-#endif
-
-#ifdef CONFIG_COMPAT
-#include "compat/schedule.c"
-#endif
-
-#endif /* !COMPAT */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */