#include <xen/lib.h>
#include <xen/mm.h>
+#include <xen/sched.h>
#include <xen/sizes.h>
#include <xen/warning.h>
+#include <asm/atomic.h>
+#include <asm/domain.h>
#include <asm/gic.h>
#include <asm/gic_v3_defs.h>
#include <asm/gic_v3_its.h>
#include <asm/io.h>
#include <asm/page.h>
+/*
+ * There could be a lot of LPIs on the host side, and they always go to
+ * a guest. So having a struct irq_desc for each of them would be wasteful
+ * and useless.
+ * Instead just store enough information to find the right VCPU, along
+ * with the virtual LPI number to inject into it.
+ * To avoid a global lock on this data structure, this is using a lockless
+ * approach relying on the architectural atomicity of native data types:
+ * We read or write the "data" view of this union atomically, then can
+ * access the broken-down fields in our local copy.
+ */
+union host_lpi {
+ uint64_t data;
+ struct {
+ uint32_t virt_lpi;
+ uint16_t dom_id;
+ uint16_t vcpu_id;
+ };
+};
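+
+/*
+ * A sketch of the intended access pattern (illustrative only): readers copy
+ * the whole union atomically and then work on the fields of their local
+ * copy, writers compose a local union and store its "data" member
+ * atomically:
+ *
+ *     union host_lpi hlpi;
+ *
+ *     hlpi.data = read_u64_atomic(&entry->data);
+ *     if ( hlpi.dom_id != DOMID_INVALID )
+ *         ... use hlpi.virt_lpi and hlpi.vcpu_id ...
+ *
+ * where "entry" points to some union host_lpi in the table below and
+ * read_u64_atomic() is the counterpart of the write_u64_atomic() used by
+ * the allocation code.
+ */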
+
#define LPI_PROPTABLE_NEEDS_FLUSHING (1U << 0)
/* Global state */
static struct {
/* The global LPI property table, shared by all redistributors. */
uint8_t *lpi_property;
+ /*
+ * A two-level table, indexed by host LPI number, to look up the target
+ * VCPU and the virtual LPI number to inject.
+ */
+ union host_lpi **host_lpis;
/*
* Number of physical LPIs the host supports. This is a property of
* the GIC hardware. We call them "host LPIs" (rather than just "LPIs")
* throughout this code to differentiate them from "virtual LPIs".
*/
unsigned long int max_host_lpi_ids;
+ /*
+ * Protects allocation and deallocation of host LPIs and next_free_lpi,
+ * but not the actual data stored in the host_lpi entry.
+ */
+ spinlock_t host_lpis_lock;
+ uint32_t next_free_lpi;
unsigned int flags;
} lpi_data;
static DEFINE_PER_CPU(struct lpi_redist_data, lpi_redist);
#define MAX_NR_HOST_LPIS (lpi_data.max_host_lpi_ids - LPI_OFFSET)
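+/* With 4KB pages and the 8-byte union host_lpi this is 512 entries per page. */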
+#define HOST_LPIS_PER_PAGE (PAGE_SIZE / sizeof(union host_lpi))
+
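+/*
+ * Look up the host_lpi entry for a given host LPI number (which includes
+ * the LPI_OFFSET). Returns NULL if the number is out of range or if the
+ * page holding that entry has not been allocated yet.
+ */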
+static union host_lpi *gic_get_host_lpi(uint32_t plpi)
+{
+ union host_lpi *block;
+
+ if ( !is_lpi(plpi) || plpi >= MAX_NR_HOST_LPIS + LPI_OFFSET )
+ return NULL;
+
+ ASSERT(plpi >= LPI_OFFSET);
+
+ plpi -= LPI_OFFSET;
+
+ block = lpi_data.host_lpis[plpi / HOST_LPIS_PER_PAGE];
+ if ( !block )
+ return NULL;
+
+ /* Matches the write barrier in allocation code. */
+ smp_rmb();
+
+ return &block[plpi % HOST_LPIS_PER_PAGE];
+}
/*
* An ITS can refer to redistributors in two ways: either by an ID (possibly
static unsigned int max_lpi_bits = 20;
integer_param("max_lpi_bits", max_lpi_bits);
+/*
+ * Allocate the top-level array for host LPIs: it holds pointers to the
+ * pages with the actual "union host_lpi" entries, which get allocated on
+ * demand later. Our LPI limit avoids excessive memory usage.
+ */
int gicv3_lpi_init_host_lpis(unsigned int host_lpi_bits)
{
+ unsigned int nr_lpi_ptrs;
+
+ /* We rely on the data structure being atomically accessible. */
+ BUILD_BUG_ON(sizeof(union host_lpi) > sizeof(unsigned long));
+
/*
* An implementation needs to support at least 14 bits of LPI IDs.
* Tell the user about it, the actual number is reported below.
if ( lpi_data.max_host_lpi_ids > BIT(24) )
warning_add("Using high number of LPIs, limit memory usage with max_lpi_bits\n");
+ spin_lock_init(&lpi_data.host_lpis_lock);
+ lpi_data.next_free_lpi = 0;
+
+ nr_lpi_ptrs = MAX_NR_HOST_LPIS / HOST_LPIS_PER_PAGE;
+ lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
+ if ( !lpi_data.host_lpis )
+ return -ENOMEM;
+
printk("GICv3: using at most %lu LPIs on the host.\n", MAX_NR_HOST_LPIS);
return 0;
}
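+/*
+ * Scan the host LPI table for an unused block of LPI_BLOCK entries, starting
+ * at chunk "start", entry "*index". On success the chunk number is returned
+ * and *index is updated to point at the first entry of the free block;
+ * -1 is returned if no free block was found up to the end of the table.
+ */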
+static int find_unused_host_lpi(uint32_t start, uint32_t *index)
+{
+ unsigned int chunk;
+ uint32_t i = *index;
+
+ ASSERT(spin_is_locked(&lpi_data.host_lpis_lock));
+
+ for ( chunk = start;
+ chunk < MAX_NR_HOST_LPIS / HOST_LPIS_PER_PAGE;
+ chunk++ )
+ {
+ /* If we hit an unallocated chunk, use entry 0 in that one. */
+ if ( !lpi_data.host_lpis[chunk] )
+ {
+ *index = 0;
+ return chunk;
+ }
+
+ /* Find an unallocated entry in this chunk. */
+ for ( ; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
+ {
+ if ( lpi_data.host_lpis[chunk][i].dom_id == DOMID_INVALID )
+ {
+ *index = i;
+ return chunk;
+ }
+ }
+ i = 0;
+ }
+
+ return -1;
+}
+
+/*
+ * Allocate a block of LPI_BLOCK (32) host LPIs, mark them as belonging to
+ * the given domain and enable them in the LPI property table.
+ * The number of the first LPI in the block is returned in "first_lpi".
+ */
+int gicv3_allocate_host_lpi_block(struct domain *d, uint32_t *first_lpi)
+{
+ uint32_t lpi, lpi_idx;
+ int chunk;
+ int i;
+
+ spin_lock(&lpi_data.host_lpis_lock);
+ lpi_idx = lpi_data.next_free_lpi % HOST_LPIS_PER_PAGE;
+ chunk = find_unused_host_lpi(lpi_data.next_free_lpi / HOST_LPIS_PER_PAGE,
+ &lpi_idx);
+
+ if ( chunk == -1 ) /* rescan for a hole from the beginning */
+ {
+ lpi_idx = 0;
+ chunk = find_unused_host_lpi(0, &lpi_idx);
+ if ( chunk == -1 )
+ {
+ spin_unlock(&lpi_data.host_lpis_lock);
+ return -ENOSPC;
+ }
+ }
+
+ /* If we hit an unallocated chunk, we initialize it and use entry 0. */
+ if ( !lpi_data.host_lpis[chunk] )
+ {
+ union host_lpi *new_chunk;
+
+ /* TODO: NUMA locality for quicker IRQ path? */
+ new_chunk = alloc_xenheap_page();
+ if ( !new_chunk )
+ {
+ spin_unlock(&lpi_data.host_lpis_lock);
+ return -ENOMEM;
+ }
+
+ for ( i = 0; i < HOST_LPIS_PER_PAGE; i += LPI_BLOCK )
+ new_chunk[i].dom_id = DOMID_INVALID;
+
+ /*
+ * Make sure all slots are really marked empty before publishing the
+ * new chunk.
+ */
+ smp_wmb();
+
+ lpi_data.host_lpis[chunk] = new_chunk;
+ lpi_idx = 0;
+ }
+
+ lpi = chunk * HOST_LPIS_PER_PAGE + lpi_idx;
+
+ for ( i = 0; i < LPI_BLOCK; i++ )
+ {
+ union host_lpi hlpi;
+
+ /*
+ * Mark this host LPI as belonging to the domain, but don't assign
+ * any virtual LPI or a VCPU yet.
+ */
+ hlpi.virt_lpi = INVALID_LPI;
+ hlpi.dom_id = d->domain_id;
+ hlpi.vcpu_id = INVALID_VCPU_ID;
+ write_u64_atomic(&lpi_data.host_lpis[chunk][lpi_idx + i].data,
+ hlpi.data);
+
+ /*
+ * Enable this host LPI, so we don't have to do this during the
+ * guest's runtime.
+ */
+ lpi_data.lpi_property[lpi + i] |= LPI_PROP_ENABLED;
+ }
+
+ lpi_data.next_free_lpi = lpi + LPI_BLOCK;
+
+ /*
+ * We have allocated and initialized the host LPI entries, so it's safe
+ * to drop the lock now. Access to the structures can be done concurrently
+ * as it involves only an atomic uint64_t access.
+ */
+ spin_unlock(&lpi_data.host_lpis_lock);
+
+ if ( lpi_data.flags & LPI_PROPTABLE_NEEDS_FLUSHING )
+ clean_and_invalidate_dcache_va_range(&lpi_data.lpi_property[lpi],
+ LPI_BLOCK);
+
+ *first_lpi = lpi + LPI_OFFSET;
+
+ return 0;
+}
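+
+/*
+ * A sketch of the expected calling pattern (the caller code is hypothetical,
+ * only the allocation and free functions are provided here):
+ *
+ *     uint32_t first_lpi;
+ *
+ *     if ( !gicv3_allocate_host_lpi_block(d, &first_lpi) )
+ *     {
+ *         ... hand out LPIs first_lpi .. first_lpi + LPI_BLOCK - 1 ...
+ *         gicv3_free_host_lpi_block(first_lpi);
+ *     }
+ */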
+
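+/*
+ * Return a block of LPI_BLOCK host LPIs, previously handed out by
+ * gicv3_allocate_host_lpi_block(), by marking all of its entries as unowned
+ * again. "first_lpi" must be the first LPI number of that block.
+ */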
+void gicv3_free_host_lpi_block(uint32_t first_lpi)
+{
+ union host_lpi *hlpi, empty_lpi = { .dom_id = DOMID_INVALID };
+ int i;
+
+ /* This should only be called with the beginning of a block. */
+ ASSERT((first_lpi % LPI_BLOCK) == 0);
+
+ hlpi = gic_get_host_lpi(first_lpi);
+ if ( !hlpi )
+ return; /* Nothing to free here. */
+
+ spin_lock(&lpi_data.host_lpis_lock);
+
+ for ( i = 0; i < LPI_BLOCK; i++ )
+ write_u64_atomic(&hlpi[i].data, empty_lpi.data);
+
+ /*
+ * Make sure the next allocation can reuse this block, as we do only
+ * forward scanning when finding an unused block.
+ */
+ if ( lpi_data.next_free_lpi > first_lpi )
+ lpi_data.next_free_lpi = first_lpi;
+
+ spin_unlock(&lpi_data.host_lpis_lock);
+}
+
/*
* Local variables:
* mode: C