vmx: VT-d posted-interrupt core logic handling
author      Feng Wu <feng.wu@intel.com>
            Tue, 1 Mar 2016 13:42:13 +0000 (14:42 +0100)
committer   Wei Liu <wei.liu2@citrix.com>
            Tue, 15 Mar 2016 16:32:32 +0000 (16:32 +0000)
This is the core logic for handling VT-d posted-interrupts. Essentially it
deals with how and when to update the posted-interrupt descriptor in the
following scenarios (the relevant descriptor fields are sketched just below):
- the vCPU is preempted
- the vCPU is put to sleep
- the vCPU is blocked
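
Throughout this description, 'ON', 'SN', 'NV' and 'NDST' refer to fields of
the hardware posted-interrupt descriptor. As a simplified sketch (cf. struct
pi_desc in vmcs.h; the exact layout is dictated by the VT-d specification):

    struct pi_desc {
        u32 pir[8];              /* posted-interrupt requests, one bit per vector */
        union {
            struct {
                u16 on     : 1,  /* ON: outstanding notification pending */
                    sn     : 1,  /* SN: suppress notification events */
                    rsvd_1 : 14;
                u8  nv;          /* NV: vector used for the notification event */
                u8  rsvd_2;
                u32 ndst;        /* NDST: destination (APIC ID) of the notification */
            };
            u64 control;
        };
        u32 rsvd[6];
    } __attribute__ ((aligned (64)));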

When a vCPU is preempted or put to sleep, we update the posted-interrupt
descriptor during scheduling by introducing two new architectural scheduler
hooks: vmx_pi_switch_from() and vmx_pi_switch_to(). When a vCPU blocks, we
introduce a new architectural hook, arch_vcpu_block(), to update the
posted-interrupt descriptor; a condensed sketch of all three hooks follows.
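
Condensed, and only for orientation, this is roughly what each hook does to
the descriptor (a sketch using hypothetical *_sketch names; locking,
assertions and the xAPIC NDST encoding are elided, and the full
implementations are in the vmx.c hunk below):

    static void pi_switch_from_sketch(struct vcpu *v)   /* preempted / slept */
    {
        /* Stop notification events while the vCPU is off the pCPU. */
        pi_set_sn(&v->arch.hvm_vmx.pi_desc);
    }

    static void pi_switch_to_sketch(struct vcpu *v)     /* about to run */
    {
        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;

        /* Point notifications at the pCPU the vCPU is going to run on. */
        write_atomic(&pi_desc->ndst, cpu_physical_id(v->processor));
        pi_clear_sn(pi_desc);
    }

    static void pi_block_sketch(struct vcpu *v)         /* blocking */
    {
        /* Queue on the per-pCPU blocking list and redirect notification
         * events to the wakeup handler instead of the guest vector. */
        list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
                      &per_cpu(vmx_pi_blocking, v->processor).list);
        write_atomic(&v->arch.hvm_vmx.pi_desc.nv, pi_wakeup_vector);
    }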

Besides that, before VM entry we make sure the 'NV' field is set to
'posted_intr_vector' and the vCPU is not on any blocking list, which is
needed while the vCPU runs in non-root mode. This check is necessary
because we change the posted-interrupt descriptor in vcpu_block(), but we
don't change it back in vcpu_unblock() or when vcpu_block() returns early
because of pending event delivery (in fact, we don't need to do it in those
two places, which is why we do it before VM entry instead).
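
In condensed form, the VM-entry fixup amounts to the following (a sketch of
vmx_pi_do_resume() from the vmx.c hunk below, with the blocking-list locking
elided and a hypothetical *_sketch name):

    static void pi_do_resume_sketch(struct vcpu *v)
    {
        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;

        /* Notifications arriving while in non-root mode must use the guest
         * notification vector again, not the wakeup vector. */
        write_atomic(&pi_desc->nv, posted_intr_vector);

        /* If vcpu_block() queued this vCPU and nothing has dequeued it
         * since, do it here; this covers both the vcpu_unblock() case and
         * the early return from vcpu_block(). */
        if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
            list_del(&v->arch.hvm_vmx.pi_blocking.list); /* under that lock */
    }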

When we handle the lazy context switch in the following two scenarios:
- preemption by a tasklet, which runs in an idle context;
- the previous vCPU being offline with no runnable vCPU left in the run queue;
we don't change the 'SN' bit in the posted-interrupt descriptor. This may
incur spurious PI notification events, but since a PI notification event is
only sent when 'ON' is clear, and hardware sets 'ON' once the notification
is sent, no further notification events are generated until 'ON' is cleared
again. Besides that, spurious PI notification events happen from time to
time in the Xen hypervisor anyway; for instance, when a guest traps to Xen
and a PI notification event arrives, there is nothing Xen actually needs to
do about it, and the interrupt will be delivered to the guest the next time
we do a VM entry. The delivery-side sketch below shows why 'ON' bounds the
number of spurious events.
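
As a rough sketch of the software delivery side (cf. the existing
posted-interrupt delivery path in vmx.c; the *_sketch name is hypothetical),
the notification IPI is only issued when 'ON' transitions from clear to set,
so at most one spurious notification can occur per ON cycle:

    static void pi_notify_sketch(struct vcpu *v, uint8_t vector)
    {
        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;

        if ( pi_test_and_set_pir(vector, pi_desc) )
            return;                                  /* already pending */

        if ( !pi_test_and_set_on(pi_desc) )          /* first to set ON ... */
            send_IPI_mask(cpumask_of(v->processor),  /* ... raises the event */
                          pi_desc->nv);
    }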

Suggested-by: Yang Zhang <yang.z.zhang@intel.com>
Suggested-by: Dario Faggioli <dario.faggioli@citrix.com>
Suggested-by: George Dunlap <george.dunlap@citrix.com>
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Feng Wu <feng.wu@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
xen/arch/x86/hvm/vmx/vmcs.c
xen/arch/x86/hvm/vmx/vmx.c
xen/common/schedule.c
xen/drivers/passthrough/vtd/iommu.c
xen/include/asm-arm/domain.h
xen/include/asm-x86/hvm/hvm.h
xen/include/asm-x86/hvm/vmx/vmcs.h
xen/include/asm-x86/hvm/vmx/vmx.h

xen/arch/x86/hvm/vmx/vmcs.c
index ed80350e726c137c0263f681a63697192f9f4a80..fd4d876fbeb61ba1aae1dd76fb795b77fe09770d 100644
@@ -679,6 +679,8 @@ int vmx_cpu_up(void)
     if ( cpu_has_vmx_vpid )
         vpid_sync_all();
 
+    vmx_pi_per_cpu_init(cpu);
+
     return 0;
 }
 
xen/arch/x86/hvm/vmx/vmx.c
index 6dc45b8f9775232c68c2c22380a7700454a4a7f7..9c5a3882eaa006451c85f9fe0c578f7f5d68827a 100644
@@ -84,7 +84,148 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
 static void vmx_invlpg_intercept(unsigned long vaddr);
 static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
 
+struct vmx_pi_blocking_vcpu {
+    struct list_head     list;
+    spinlock_t           lock;
+};
+
+/*
+ * We maintain a per-CPU linked list of vCPUs, so in the PI wakeup
+ * handler we can find which vCPUs should be woken up.
+ */
+static DEFINE_PER_CPU(struct vmx_pi_blocking_vcpu, vmx_pi_blocking);
+
 uint8_t __read_mostly posted_intr_vector;
+static uint8_t __read_mostly pi_wakeup_vector;
+
+void vmx_pi_per_cpu_init(unsigned int cpu)
+{
+    INIT_LIST_HEAD(&per_cpu(vmx_pi_blocking, cpu).list);
+    spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
+}
+
+static void vmx_vcpu_block(struct vcpu *v)
+{
+    unsigned long flags;
+    unsigned int dest;
+    spinlock_t *old_lock;
+    spinlock_t *pi_blocking_list_lock =
+               &per_cpu(vmx_pi_blocking, v->processor).lock;
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    spin_lock_irqsave(pi_blocking_list_lock, flags);
+    old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
+                       pi_blocking_list_lock);
+
+    /*
+     * 'v->arch.hvm_vmx.pi_blocking.lock' should be NULL before
+     * being assigned to a new value, since the vCPU is currently
+     * running and it cannot be on any blocking list.
+     */
+    ASSERT(old_lock == NULL);
+
+    list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
+                  &per_cpu(vmx_pi_blocking, v->processor).list);
+    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+
+    ASSERT(!pi_test_sn(pi_desc));
+
+    dest = cpu_physical_id(v->processor);
+
+    ASSERT(pi_desc->ndst ==
+           (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
+
+    write_atomic(&pi_desc->nv, pi_wakeup_vector);
+}
+
+static void vmx_pi_switch_from(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    if ( test_bit(_VPF_blocked, &v->pause_flags) )
+        return;
+
+    pi_set_sn(pi_desc);
+}
+
+static void vmx_pi_switch_to(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    unsigned int dest = cpu_physical_id(v->processor);
+
+    write_atomic(&pi_desc->ndst,
+                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+
+    pi_clear_sn(pi_desc);
+}
+
+static void vmx_pi_do_resume(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *pi_blocking_list_lock;
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
+
+    /*
+     * Set 'NV' field back to posted_intr_vector, so the
+     * Posted-Interrupts can be delivered to the vCPU when
+     * it is running in non-root mode.
+     */
+    write_atomic(&pi_desc->nv, posted_intr_vector);
+
+    /* The vCPU is not on any blocking list. */
+    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
+
+    /* Prevent the compiler from eliminating the local variable. */
+    smp_rmb();
+
+    if ( pi_blocking_list_lock == NULL )
+        return;
+
+    spin_lock_irqsave(pi_blocking_list_lock, flags);
+
+    /*
+     * v->arch.hvm_vmx.pi_blocking.lock == NULL here means the vCPU
+     * was removed from the blocking list while we are acquiring the lock.
+     */
+    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
+    {
+        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
+        list_del(&v->arch.hvm_vmx.pi_blocking.list);
+        v->arch.hvm_vmx.pi_blocking.lock = NULL;
+    }
+
+    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+}
+
+/* This function is called when pcidevs_lock is held */
+void vmx_pi_hooks_assign(struct domain *d)
+{
+    if ( !iommu_intpost || !has_hvm_container_domain(d) )
+        return;
+
+    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
+
+    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
+    d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
+    d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
+    d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
+}
+
+/* This function is called when pcidevs_lock is held */
+void vmx_pi_hooks_deassign(struct domain *d)
+{
+    if ( !iommu_intpost || !has_hvm_container_domain(d) )
+        return;
+
+    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
+
+    d->arch.hvm_domain.vmx.vcpu_block = NULL;
+    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
+    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
+    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
+}
 
 static int vmx_domain_initialise(struct domain *d)
 {
@@ -113,6 +254,8 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 
     spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
 
+    INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocking.list);
+
     v->arch.schedule_tail    = vmx_do_resume;
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
@@ -752,6 +895,9 @@ static void vmx_ctxt_switch_from(struct vcpu *v)
     vmx_save_guest_msrs(v);
     vmx_restore_host_msrs();
     vmx_save_dr(v);
+
+    if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
+        v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
 }
 
 static void vmx_ctxt_switch_to(struct vcpu *v)
@@ -764,6 +910,9 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
 
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
+
+    if ( v->domain->arch.hvm_domain.vmx.pi_switch_to )
+        v->domain->arch.hvm_domain.vmx.pi_switch_to(v);
 }
 
 
@@ -2030,6 +2179,38 @@ static struct hvm_function_table __initdata vmx_function_table = {
     },
 };
 
+/* Handle VT-d posted-interrupt when VCPU is blocked. */
+static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
+{
+    struct arch_vmx_struct *vmx, *tmp;
+    spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
+    struct list_head *blocked_vcpus =
+               &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
+
+    ack_APIC_irq();
+    this_cpu(irq_count)++;
+
+    spin_lock(lock);
+
+    /*
+     * XXX: The length of the list depends on how many vCPUs are currently
+     * blocked on this specific pCPU. This may hurt the interrupt latency
+     * if the list grows to too many entries.
+     */
+    list_for_each_entry_safe(vmx, tmp, blocked_vcpus, pi_blocking.list)
+    {
+        if ( pi_test_on(&vmx->pi_desc) )
+        {
+            list_del(&vmx->pi_blocking.list);
+            ASSERT(vmx->pi_blocking.lock == lock);
+            vmx->pi_blocking.lock = NULL;
+            vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
+        }
+    }
+
+    spin_unlock(lock);
+}
+
 /* Handle VT-d posted-interrupt when VCPU is running. */
 static void pi_notification_interrupt(struct cpu_user_regs *regs)
 {
@@ -2116,7 +2297,10 @@ const struct hvm_function_table * __init start_vmx(void)
     if ( cpu_has_vmx_posted_intr_processing )
     {
         if ( iommu_intpost )
+        {
             alloc_direct_apic_vector(&posted_intr_vector, pi_notification_interrupt);
+            alloc_direct_apic_vector(&pi_wakeup_vector, pi_wakeup_interrupt);
+        }
         else
             alloc_direct_apic_vector(&posted_intr_vector, event_check_interrupt);
     }
@@ -3631,6 +3815,9 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     struct hvm_vcpu_asid *p_asid;
     bool_t need_flush;
 
+    if ( curr->domain->arch.hvm_domain.vmx.pi_do_resume )
+        curr->domain->arch.hvm_domain.vmx.pi_do_resume(curr);
+
     if ( !cpu_has_vmx_vpid )
         goto out;
     if ( nestedhvm_vcpu_in_guestmode(curr) )
xen/common/schedule.c
index 434dcfc37463fe16b296275cbd327c30e9b966b5..7523968224aeb04e921aa7a4f189826d48d103a2 100644
@@ -803,6 +803,8 @@ void vcpu_block(void)
 
     set_bit(_VPF_blocked, &v->pause_flags);
 
+    arch_vcpu_block(v);
+
     /* Check for events /after/ blocking: avoids wakeup waiting race. */
     if ( local_events_need_delivery() )
     {
@@ -840,6 +842,8 @@ static long do_poll(struct sched_poll *sched_poll)
     v->poll_evtchn = -1;
     set_bit(v->vcpu_id, d->poll_mask);
 
+    arch_vcpu_block(v);
+
 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
     smp_mb();
xen/drivers/passthrough/vtd/iommu.c
index ec31c6bf01612574e4a75c2a34376546b7d777a4..80227029537c741c9497060f1f800d7755626e58 100644
@@ -2283,9 +2283,17 @@ static int reassign_device_ownership(
     if ( ret )
         return ret;
 
+    if ( !has_arch_pdevs(target) )
+        vmx_pi_hooks_assign(target);
+
     ret = domain_context_mapping(target, devfn, pdev);
     if ( ret )
+    {
+        if ( !has_arch_pdevs(target) )
+            vmx_pi_hooks_deassign(target);
+
         return ret;
+    }
 
     if ( devfn == pdev->devfn )
     {
@@ -2293,6 +2301,9 @@ static int reassign_device_ownership(
         pdev->domain = target;
     }
 
+    if ( !has_arch_pdevs(source) )
+        vmx_pi_hooks_deassign(source);
+
     return ret;
 }
 
xen/include/asm-arm/domain.h
index 8e1161f2368fbafd302c278322db830ce9ace825..c35ed40b2ae8323c5ca90dc3788d8a8232302f64 100644
@@ -315,6 +315,8 @@ static inline void free_vcpu_guest_context(struct vcpu_guest_context *vgc)
     xfree(vgc);
 }
 
+static inline void arch_vcpu_block(struct vcpu *v) {}
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
xen/include/asm-x86/hvm/hvm.h
index 12209d520534375ee50ea761f4338eb0cbcf0103..b8ab5b1a9524c4f0bf975496cb5cae007b3f6d38 100644
@@ -606,6 +606,19 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
                            signed int cr0_pg);
 unsigned long hvm_cr4_guest_reserved_bits(const struct vcpu *v, bool_t restore);
 
+/*
+ * This must be defined as a macro instead of an inline function,
+ * because it uses 'struct vcpu' and 'struct domain' which have
+ * not been defined yet.
+ */
+#define arch_vcpu_block(v) ({                                   \
+    struct vcpu *v_ = (v);                                      \
+    struct domain *d_ = v_->domain;                             \
+    if ( has_hvm_container_domain(d_) &&                        \
+         d_->arch.hvm_domain.vmx.vcpu_block )                   \
+        d_->arch.hvm_domain.vmx.vcpu_block(v_);                 \
+})
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
 
 /*
xen/include/asm-x86/hvm/vmx/vmcs.h
index 86a9f1ea1b6af073fed85ca2ba0d38695dae3a62..b54f52fd8775eea8be33823233ecd4a5825b4a3c 100644
@@ -77,6 +77,65 @@ struct vmx_domain {
     unsigned long apic_access_mfn;
     /* VMX_DOMAIN_* */
     unsigned int status;
+
+    /*
+     * To handle posted interrupts correctly, we need to set the following
+     * state:
+     *
+     * * The PI notification vector (NV)
+     * * The PI notification destination processor (NDST)
+     * * The PI "suppress notification" bit (SN)
+     * * The vcpu pi "blocked" list
+     *
+     * If a VM is currently running, we want the PI delivered to the guest vcpu
+     * on the proper pcpu (NDST = v->processor, SN clear).
+     *
+     * If the vm is blocked, we want the PI delivered to Xen so that it can
+     * wake it up  (SN clear, NV = pi_wakeup_vector, vcpu on block list).
+     *
+     * If the VM is currently either preempted or offline (i.e., not running
+     * because of some reason other than blocking waiting for an interrupt),
+     * there's nothing Xen can do -- we want the interrupt pending bit set in
+     * the guest, but we don't want to bother Xen with an interrupt (SN clear).
+     *
+     * There's a brief window of time between vmx_intr_assist() and checking
+     * softirqs where if an interrupt comes in it may be lost; so we need Xen
+     * to get an interrupt and raise a softirq so that it will go through the
+     * vmx_intr_assist() path again (SN clear, NV = posted_interrupt).
+     *
+     * The way we implement this now is by looking at what needs to happen on
+     * the following runstate transitions:
+     *
+     * A: runnable -> running
+     *  - SN = 0
+     *  - NDST = v->processor
+     * B: running -> runnable
+     *  - SN = 1
+     * C: running -> blocked
+     *  - NV = pi_wakeup_vector
+     *  - Add vcpu to blocked list
+     * D: blocked -> runnable
+     *  - NV = posted_intr_vector
+     *  - Take vcpu off blocked list
+     *
+     * For transitions A and B, we add hooks into vmx_ctxt_switch_{from,to}
+     * paths.
+     *
+     * For transition C, we add a new arch hook, arch_vcpu_block(), which is
+     * called from vcpu_block() and vcpu_do_poll().
+     *
+     * For transition D, rather than add an extra arch hook on vcpu_wake, we
+     * add a hook on the vmentry path which checks to see if either of the two
+     * actions need to be taken.
+     *
+     * These hooks only need to be called when the domain in question actually
+     * has a physical device assigned to it, so we set and clear the callbacks
+     * as appropriate when device assignment changes.
+     */
+    void (*vcpu_block) (struct vcpu *);
+    void (*pi_switch_from) (struct vcpu *v);
+    void (*pi_switch_to) (struct vcpu *v);
+    void (*pi_do_resume) (struct vcpu *v);
 };
 
 struct pi_desc {
@@ -101,6 +160,11 @@ struct pi_desc {
 
 #define NR_PML_ENTRIES   512
 
+struct pi_blocking_vcpu {
+    struct list_head     list;
+    spinlock_t           *lock;
+};
+
 struct arch_vmx_struct {
     /* Physical address of VMCS. */
     paddr_t              vmcs_pa;
@@ -160,6 +224,13 @@ struct arch_vmx_struct {
     struct page_info     *vmwrite_bitmap;
 
     struct page_info     *pml_pg;
+
+    /*
+     * Before a vCPU blocks, it is added to the per-CPU blocking list,
+     * so that the VT-d engine can send a wakeup notification event to
+     * that pCPU to wake up the related vCPU.
+     */
+    struct pi_blocking_vcpu pi_blocking;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
xen/include/asm-x86/hvm/vmx/vmx.h
index 14f3d32ca15a51bb23bcf93e5fbc5264f8fbc16e..a85d4884e37d15892545842bfe0e18bf483bca58 100644
@@ -564,6 +564,11 @@ int alloc_p2m_hap_data(struct p2m_domain *p2m);
 void free_p2m_hap_data(struct p2m_domain *p2m);
 void p2m_init_hap_data(struct p2m_domain *p2m);
 
+void vmx_pi_per_cpu_init(unsigned int cpu);
+
+void vmx_pi_hooks_assign(struct domain *d);
+void vmx_pi_hooks_deassign(struct domain *d);
+
 /* EPT violation qualifications definitions */
 #define _EPT_READ_VIOLATION         0
 #define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)