static void vmx_invlpg_intercept(unsigned long vaddr);
static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
+struct vmx_pi_blocking_vcpu {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+/*
+ * We maintain a per-CPU linked list of blocked vCPUs, so that the PI
+ * wakeup handler can find which vCPUs need to be woken up.
+ */
+static DEFINE_PER_CPU(struct vmx_pi_blocking_vcpu, vmx_pi_blocking);
+
uint8_t __read_mostly posted_intr_vector;
+static uint8_t __read_mostly pi_wakeup_vector;
+
+void vmx_pi_per_cpu_init(unsigned int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(vmx_pi_blocking, cpu).list);
+ spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
+}
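The per-CPU list and lock must be initialised before any vCPU can block on that pCPU; the call site of vmx_pi_per_cpu_init() lives in the CPU bring-up code outside this hunk. A hypothetical sketch of the wiring, for orientation only (example_cpu_up_prepare() is a made-up name):

/* Illustrative only -- not part of this patch. */
static int example_cpu_up_prepare(unsigned int cpu)
{
    /* ... existing per-CPU VMX/VMCS setup ... */

    vmx_pi_per_cpu_init(cpu);

    return 0;
}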
+
+static void vmx_vcpu_block(struct vcpu *v)
+{
+ unsigned long flags;
+ unsigned int dest;
+ spinlock_t *old_lock;
+ spinlock_t *pi_blocking_list_lock =
+ &per_cpu(vmx_pi_blocking, v->processor).lock;
+ struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+ spin_lock_irqsave(pi_blocking_list_lock, flags);
+ old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
+ pi_blocking_list_lock);
+
+ /*
+ * 'v->arch.hvm_vmx.pi_blocking.lock' must still be NULL at this point:
+ * the vCPU was running until now, so it cannot already be on any
+ * blocking list.
+ */
+ ASSERT(old_lock == NULL);
+
+ list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
+ &per_cpu(vmx_pi_blocking, v->processor).list);
+ spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+
+ ASSERT(!pi_test_sn(pi_desc));
+
+ dest = cpu_physical_id(v->processor);
+
+ ASSERT(pi_desc->ndst ==
+ (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
+
+ write_atomic(&pi_desc->nv, pi_wakeup_vector);
+}
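vmx_vcpu_block() is installed as the per-domain vcpu_block hook by vmx_pi_hooks_assign() below. According to the design comment added to the header later in this patch, the common blocking path (vcpu_block()/vcpu_do_poll()) reaches it through an arch_vcpu_block() wrapper defined outside this hunk; a minimal sketch of what that wrapper is expected to look like:

/* Sketch only -- the real wrapper is defined outside this hunk. */
#define arch_vcpu_block(v) ({                                   \
    struct vcpu *v_ = (v);                                      \
    struct domain *d_ = v_->domain;                             \
    if ( has_hvm_container_domain(d_) &&                        \
         d_->arch.hvm_domain.vmx.vcpu_block )                   \
        d_->arch.hvm_domain.vmx.vcpu_block(v_);                 \
})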
+
+static void vmx_pi_switch_from(struct vcpu *v)
+{
+ struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+ if ( test_bit(_VPF_blocked, &v->pause_flags) )
+ return;
+
+ pi_set_sn(pi_desc);
+}
+
+static void vmx_pi_switch_to(struct vcpu *v)
+{
+ struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+ unsigned int dest = cpu_physical_id(v->processor);
+
+ write_atomic(&pi_desc->ndst,
+ x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+
+ pi_clear_sn(pi_desc);
+}
+
+static void vmx_pi_do_resume(struct vcpu *v)
+{
+ unsigned long flags;
+ spinlock_t *pi_blocking_list_lock;
+ struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+ ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
+
+ /*
+ * Set the 'NV' field back to posted_intr_vector, so that posted
+ * interrupts can be delivered to the vCPU while it is running in
+ * non-root mode.
+ */
+ write_atomic(&pi_desc->nv, posted_intr_vector);
+
+ pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
+
+ /* Prevent the compiler from eliminating the local variable. */
+ smp_rmb();
+
+ /* A NULL lock pointer means the vCPU is not on any blocking list. */
+ if ( pi_blocking_list_lock == NULL )
+ return;
+
+ spin_lock_irqsave(pi_blocking_list_lock, flags);
+
+ /*
+ * v->arch.hvm_vmx.pi_blocking.lock == NULL here means the vCPU was
+ * removed from the blocking list while we were acquiring the lock.
+ */
+ if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
+ {
+ ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
+ list_del(&v->arch.hvm_vmx.pi_blocking.list);
+ v->arch.hvm_vmx.pi_blocking.lock = NULL;
+ }
+
+ spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+}
+
+/* This function is called with pcidevs_lock held. */
+void vmx_pi_hooks_assign(struct domain *d)
+{
+ if ( !iommu_intpost || !has_hvm_container_domain(d) )
+ return;
+
+ ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
+
+ d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
+ d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
+ d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
+ d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
+}
+
+/* This function is called with pcidevs_lock held. */
+void vmx_pi_hooks_deassign(struct domain *d)
+{
+ if ( !iommu_intpost || !has_hvm_container_domain(d) )
+ return;
+
+ ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
+
+ d->arch.hvm_domain.vmx.vcpu_block = NULL;
+ d->arch.hvm_domain.vmx.pi_switch_from = NULL;
+ d->arch.hvm_domain.vmx.pi_switch_to = NULL;
+ d->arch.hvm_domain.vmx.pi_do_resume = NULL;
+}
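These helpers are meant to be called from the device passthrough code as devices are assigned to or removed from the domain, with pcidevs_lock held; the actual call sites are outside this hunk. A hypothetical caller, only to illustrate the intended ordering (example_assign_device() and do_iommu_assign() are made-up names):

/* Illustrative only -- not part of this patch. */
static int example_assign_device(struct domain *d)
{
    int rc;

    /* The caller holds pcidevs_lock, as required above. */
    vmx_pi_hooks_assign(d);        /* install the PI hooks first */

    rc = do_iommu_assign(d);       /* made-up helper doing the actual assignment */
    if ( rc )
        vmx_pi_hooks_deassign(d);  /* roll the hooks back on failure */

    return rc;
}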
static int vmx_domain_initialise(struct domain *d)
{
spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
+ INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocking.list);
+
v->arch.schedule_tail = vmx_do_resume;
v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
vmx_save_guest_msrs(v);
vmx_restore_host_msrs();
vmx_save_dr(v);
+
+ if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
+ v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
}
static void vmx_ctxt_switch_to(struct vcpu *v)
vmx_restore_guest_msrs(v);
vmx_restore_dr(v);
+
+ if ( v->domain->arch.hvm_domain.vmx.pi_switch_to )
+ v->domain->arch.hvm_domain.vmx.pi_switch_to(v);
}
},
};
+/* Handle VT-d posted-interrupt when VCPU is blocked. */
+static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
+{
+ struct arch_vmx_struct *vmx, *tmp;
+ spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
+ struct list_head *blocked_vcpus =
+ &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
+
+ ack_APIC_irq();
+ this_cpu(irq_count)++;
+
+ spin_lock(lock);
+
+ /*
+ * XXX: The length of the list depends on how many vCPUs are currently
+ * blocked on this specific pCPU. This may hurt the interrupt latency
+ * if the list grows too long.
+ */
+ list_for_each_entry_safe(vmx, tmp, blocked_vcpus, pi_blocking.list)
+ {
+ if ( pi_test_on(&vmx->pi_desc) )
+ {
+ list_del(&vmx->pi_blocking.list);
+ ASSERT(vmx->pi_blocking.lock == lock);
+ vmx->pi_blocking.lock = NULL;
+ vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
+ }
+ }
+
+ spin_unlock(lock);
+}
+
/* Handle VT-d posted-interrupt when VCPU is running. */
static void pi_notification_interrupt(struct cpu_user_regs *regs)
{
if ( cpu_has_vmx_posted_intr_processing )
{
if ( iommu_intpost )
+ {
alloc_direct_apic_vector(&posted_intr_vector, pi_notification_interrupt);
+ alloc_direct_apic_vector(&pi_wakeup_vector, pi_wakeup_interrupt);
+ }
else
alloc_direct_apic_vector(&posted_intr_vector, event_check_interrupt);
}
struct hvm_vcpu_asid *p_asid;
bool_t need_flush;
+ if ( curr->domain->arch.hvm_domain.vmx.pi_do_resume )
+ curr->domain->arch.hvm_domain.vmx.pi_do_resume(curr);
+
if ( !cpu_has_vmx_vpid )
goto out;
if ( nestedhvm_vcpu_in_guestmode(curr) )
unsigned long apic_access_mfn;
/* VMX_DOMAIN_* */
unsigned int status;
+
+ /*
+ * To handle posted interrupts correctly, we need to set the following
+ * state:
+ *
+ * * The PI notification vector (NV)
+ * * The PI notification destination processor (NDST)
+ * * The PI "suppress notification" bit (SN)
+ * * The vcpu pi "blocked" list
+ *
+ * If a VM is currently running, we want the PI delivered to the guest vcpu
+ * on the proper pcpu (NDST = v->processor, SN clear).
+ *
+ * If the VM is blocked, we want the PI delivered to Xen so that it can
+ * wake it up (SN clear, NV = pi_wakeup_vector, vCPU on the block list).
+ *
+ * If the VM is currently either preempted or offline (i.e., not running
+ * because of some reason other than blocking waiting for an interrupt),
+ * there's nothing Xen can do -- we want the interrupt pending bit set in
+ * the guest, but we don't want to bother Xen with an interrupt (SN set).
+ *
+ * There's a brief window of time between vmx_intr_assist() and checking
+ * softirqs where, if an interrupt comes in, it may be lost; so we need Xen
+ * to get an interrupt and raise a softirq so that it will go through the
+ * vmx_intr_assist() path again (SN clear, NV = posted_intr_vector).
+ *
+ * The way we implement this now is by looking at what needs to happen on
+ * the following runstate transitions:
+ *
+ * A: runnable -> running
+ * - SN = 0
+ * - NDST = v->processor
+ * B: running -> runnable
+ * - SN = 1
+ * C: running -> blocked
+ * - NV = pi_wakeup_vector
+ * - Add vcpu to blocked list
+ * D: blocked -> runnable
+ * - NV = posted_intr_vector
+ * - Take vcpu off blocked list
+ *
+ * For transitions A and B, we add hooks into vmx_ctxt_switch_{from,to}
+ * paths.
+ *
+ * For transition C, we add a new arch hook, arch_vcpu_block(), which is
+ * called from vcpu_block() and vcpu_do_poll().
+ *
+ * For transition D, rather than add an extra arch hook on vcpu_wake, we
+ * add a hook on the vmentry path which checks whether either of the two
+ * actions needs to be taken.
+ *
+ * These hooks only need to be called when the domain in question actually
+ * has a physical device assigned to it, so we set and clear the callbacks
+ * as appropriate when device assignment changes.
+ */
+ void (*vcpu_block) (struct vcpu *);
+ void (*pi_switch_from) (struct vcpu *v);
+ void (*pi_switch_to) (struct vcpu *v);
+ void (*pi_do_resume) (struct vcpu *v);
};
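The ON and SN bits referred to throughout the comment above live in the posted-interrupt descriptor defined just below (bits 256 and 257 of the descriptor in the VT-d spec, i.e. bits 0 and 1 of the control word that follows the PIR). The accessors this patch relies on (pi_test_on(), pi_set_sn(), pi_clear_sn()) already exist in this header; a rough sketch of their shape, assuming a 64-bit 'control' member overlaying those bits:

/* Sketch only -- the real accessors are already defined in this header. */
#define POSTED_INTR_ON 0    /* Outstanding Notification */
#define POSTED_INTR_SN 1    /* Suppress Notification */

static inline int pi_test_on(struct pi_desc *pi_desc)
{
    return test_bit(POSTED_INTR_ON, &pi_desc->control);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
    set_bit(POSTED_INTR_SN, &pi_desc->control);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
    clear_bit(POSTED_INTR_SN, &pi_desc->control);
}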
struct pi_desc {
#define NR_PML_ENTRIES 512
+struct pi_blocking_vcpu {
+ struct list_head list;
+ spinlock_t *lock;
+};
+
struct arch_vmx_struct {
/* Physical address of VMCS. */
paddr_t vmcs_pa;
struct page_info *vmwrite_bitmap;
struct page_info *pml_pg;
+
+ /*
+ * Before the vCPU is blocked, it is added to the per-pCPU blocking
+ * list, so that the VT-d engine can send a wakeup notification to
+ * the pCPU and the wakeup handler can wake up the related vCPU.
+ */
+ struct pi_blocking_vcpu pi_blocking;
};
int vmx_create_vmcs(struct vcpu *v);