#define IDR1_SSIDSIZE GENMASK(10, 6)
#define IDR1_SIDSIZE GENMASK(5, 0)
-#define ARM_SMMU_IDR3 0xc
-#define IDR3_RIL (1 << 10)
-
#define ARM_SMMU_IDR5 0x14
#define IDR5_STALL_MAX GENMASK(31, 16)
#define IDR5_GRAN64K (1 << 6)
#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
#define Q_OVERFLOW_FLAG (1U << 31)
#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
#define Q_ENT(q, p) ((q)->base + \
Q_IDX(&((q)->llq), p) * \
#define CMDQ_ERR_CERROR_ABT_IDX 2
#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
-#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG
-
-/*
- * This is used to size the command queue and therefore must be at least
- * BITS_PER_LONG so that the valid_map works correctly (it relies on the
- * total number of queue entries being a multiple of BITS_PER_LONG).
- */
-#define CMDQ_BATCH_ENTRIES BITS_PER_LONG
-
#define CMDQ_0_OP GENMASK_ULL(7, 0)
#define CMDQ_0_SSV (1UL << 11)
#define CMDQ_CFGI_1_LEAF (1UL << 0)
#define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0)
-#define CMDQ_TLBI_0_NUM GENMASK_ULL(16, 12)
-#define CMDQ_TLBI_RANGE_NUM_MAX 31
-#define CMDQ_TLBI_0_SCALE GENMASK_ULL(24, 20)
#define CMDQ_TLBI_0_VMID GENMASK_ULL(47, 32)
#define CMDQ_TLBI_0_ASID GENMASK_ULL(63, 48)
#define CMDQ_TLBI_1_LEAF (1UL << 0)
-#define CMDQ_TLBI_1_TTL GENMASK_ULL(9, 8)
-#define CMDQ_TLBI_1_TG GENMASK_ULL(11, 10)
#define CMDQ_TLBI_1_VA_MASK GENMASK_ULL(63, 12)
#define CMDQ_TLBI_1_IPA_MASK GENMASK_ULL(51, 12)
#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)
/* High-level queue structures */
-#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
-#define ARM_SMMU_POLL_SPIN_COUNT 10
+#define ARM_SMMU_POLL_TIMEOUT_US 100
+#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
+#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000
#define CMDQ_OP_TLBI_S2_IPA 0x2a
#define CMDQ_OP_TLBI_NSNH_ALL 0x30
struct {
- u8 num;
- u8 scale;
u16 asid;
u16 vmid;
bool leaf;
- u8 ttl;
- u8 tg;
u64 addr;
} tlbi;
#define CMDQ_OP_CMD_SYNC 0x46
struct {
+ u32 msidata;
u64 msiaddr;
} sync;
};
};
struct arm_smmu_ll_queue {
- union {
- u64 val;
- struct {
- u32 prod;
- u32 cons;
- };
- struct {
- atomic_t prod;
- atomic_t cons;
- } atomic;
- u8 __pad[SMP_CACHE_BYTES];
- } ____cacheline_aligned_in_smp;
+ u32 prod;
+ u32 cons;
u32 max_n_shift;
};
u32 __iomem *cons_reg;
};
-struct arm_smmu_queue_poll {
- ktime_t timeout;
- unsigned int delay;
- unsigned int spin_cnt;
- bool wfe;
-};
-
struct arm_smmu_cmdq {
struct arm_smmu_queue q;
- atomic_long_t *valid_map;
- atomic_t owner_prod;
- atomic_t lock;
-};
-
-struct arm_smmu_cmdq_batch {
- u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
- int num;
+ spinlock_t lock;
};
struct arm_smmu_evtq {
#define ARM_SMMU_FEAT_HYP (1 << 12)
#define ARM_SMMU_FEAT_STALL_FORCE (1 << 13)
#define ARM_SMMU_FEAT_VAX (1 << 14)
-#define ARM_SMMU_FEAT_RANGE_INV (1 << 15)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
int gerr_irq;
int combined_irq;
+ u32 sync_nr;
+ u8 prev_cmd_opcode;
unsigned long ias; /* IPA */
unsigned long oas; /* PA */
struct arm_smmu_strtab_cfg strtab_cfg;
+ /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
+ union {
+ u32 sync_count;
+ u64 padding;
+ };
+
/* IOMMU core code handle */
struct iommu_device iommu;
};
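/*
 * Illustrative sketch, not driver code and not part of this patch: why
 * sync_count above shares a union with a u64. Hi16xx-class MSI doorbells
 * write 64 bits, so the 32-bit counter must own a full 8-byte slot or the
 * upper half of that write would land on the next struct member. The
 * struct and payload below are made-up stand-ins.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct fake_smmu {
	union {
		uint32_t sync_count;
		uint64_t padding;	/* absorbs the extra 32 bits */
	};
	uint32_t next_member;		/* must survive the MSI write */
};

int main(void)
{
	struct fake_smmu s = { .next_member = 0xabcd };
	uint64_t msi_payload = 42;	/* 64-bit write from the doorbell */

	memcpy(&s.padding, &msi_payload, sizeof(msi_payload));
	printf("next_member=0x%x\n", (unsigned)s.next_member);	/* 0xabcd */
	return 0;
}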
}
/* Low-level queue manipulation functions */
-static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
-{
- u32 space, prod, cons;
-
- prod = Q_IDX(q, q->prod);
- cons = Q_IDX(q, q->cons);
-
- if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
- space = (1 << q->max_n_shift) - (prod - cons);
- else
- space = cons - prod;
-
- return space >= n;
-}
-
static bool queue_full(struct arm_smmu_ll_queue *q)
{
return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
}
-static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
+static void queue_sync_cons_in(struct arm_smmu_queue *q)
{
- return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
- (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
- ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
- (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
+ q->llq.cons = readl_relaxed(q->cons_reg);
}
static void queue_sync_cons_out(struct arm_smmu_queue *q)
return ret;
}
-static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
+static void queue_sync_prod_out(struct arm_smmu_queue *q)
{
- u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
- return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+ writel(q->llq.prod, q->prod_reg);
}
-static void queue_poll_init(struct arm_smmu_device *smmu,
- struct arm_smmu_queue_poll *qp)
+static void queue_inc_prod(struct arm_smmu_ll_queue *q)
{
- qp->delay = 1;
- qp->spin_cnt = 0;
- qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
- qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
+ u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
+ q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
}
-static int queue_poll(struct arm_smmu_queue_poll *qp)
+/*
+ * Wait for the SMMU to consume items. If sync is true, wait until the queue
+ * is empty. Otherwise, wait until there is at least one free slot.
+ */
+static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
{
- if (ktime_compare(ktime_get(), qp->timeout) > 0)
- return -ETIMEDOUT;
+ ktime_t timeout;
+ unsigned int delay = 1, spin_cnt = 0;
- if (qp->wfe) {
- wfe();
- } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
- cpu_relax();
- } else {
- udelay(qp->delay);
- qp->delay *= 2;
- qp->spin_cnt = 0;
+ /* Wait longer if it's a CMD_SYNC */
+ timeout = ktime_add_us(ktime_get(), sync ?
+ ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
+ ARM_SMMU_POLL_TIMEOUT_US);
+
+ while (queue_sync_cons_in(q),
+ (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
+ if (ktime_compare(ktime_get(), timeout) > 0)
+ return -ETIMEDOUT;
+
+ if (wfe) {
+ wfe();
+ } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
+ cpu_relax();
+ continue;
+ } else {
+ udelay(delay);
+ delay *= 2;
+ spin_cnt = 0;
+ }
}
return 0;
*dst++ = cpu_to_le64(*src++);
}
+static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+ if (queue_full(&q->llq))
+ return -ENOSPC;
+
+ queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
+ queue_inc_prod(&q->llq);
+ queue_sync_prod_out(q);
+ return 0;
+}
+
static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
{
int i;
cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
break;
case CMDQ_OP_TLBI_NH_VA:
- cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
- cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
- cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
- cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
break;
case CMDQ_OP_TLBI_S2_IPA:
- cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
- cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
- cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
- cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
break;
case CMDQ_OP_TLBI_NH_ASID:
cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
break;
case CMDQ_OP_CMD_SYNC:
- if (ent->sync.msiaddr) {
+ if (ent->sync.msiaddr)
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
- cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
- } else {
+ else
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
- }
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
+ /*
+ * Commands are written little-endian, but we want the SMMU to
+ * receive MSIData, and thus write it back to memory, in CPU
+ * byte order, so big-endian needs an extra byteswap here.
+ */
+ cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
+ cpu_to_le32(ent->sync.msidata));
+ cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
break;
default:
return -ENOENT;
return 0;
}
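/*
 * Illustrative sketch, not driver code: the MSIData byteswap in the
 * CMD_SYNC case above, viewed from a big-endian CPU. swap32() stands in
 * for the cpu_to_le32()/load byteswaps; the cpu_to_le64() performed later
 * by queue_write() preserves the logical value for the SMMU's
 * little-endian read, so it is modelled as a no-op here.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t swap32(uint32_t x)
{
	return ((x >> 24) & 0x000000ff) | ((x >> 8) & 0x0000ff00) |
	       ((x << 8) & 0x00ff0000) | ((x << 24) & 0xff000000);
}

int main(void)
{
	uint32_t msidata = 0x12345678;		/* value the CPU wants back */

	uint32_t in_field = swap32(msidata);	/* cpu_to_le32() on big-endian */
	uint32_t smmu_sees = in_field;		/* value-preserving store + LE read */
	uint32_t cpu_loads = swap32(smmu_sees);	/* BE load of the SMMU's LE write */

	printf("round trip ok: %d\n", cpu_loads == msidata);	/* 1 */
	return 0;
}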
-static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
- u32 prod)
-{
- struct arm_smmu_queue *q = &smmu->cmdq.q;
- struct arm_smmu_cmdq_ent ent = {
- .opcode = CMDQ_OP_CMD_SYNC,
- };
-
- /*
- * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
- * payload, so the write will zero the entire command on that platform.
- */
- if (smmu->features & ARM_SMMU_FEAT_MSI &&
- smmu->features & ARM_SMMU_FEAT_COHERENCY) {
- ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
- q->ent_dwords * 8;
- }
-
- arm_smmu_cmdq_build_cmd(cmd, &ent);
-}
-
static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
{
static const char *cerror_str[] = {
queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}
-/*
- * Command queue locking.
- * This is a form of bastardised rwlock with the following major changes:
- *
- * - The only LOCK routines are exclusive_trylock() and shared_lock().
- * Neither have barrier semantics, and instead provide only a control
- * dependency.
- *
- * - The UNLOCK routines are supplemented with shared_tryunlock(), which
- * fails if the caller appears to be the last lock holder (yes, this is
- * racy). All successful UNLOCK routines have RELEASE semantics.
- */
-static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
-{
- int val;
-
- /*
- * We can try to avoid the cmpxchg() loop by simply incrementing the
- * lock counter. When held in exclusive state, the lock counter is set
- * to INT_MIN so these increments won't hurt as the value will remain
- * negative.
- */
- if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
- return;
-
- do {
- val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
- } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
-}
-
-static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
-{
- (void)atomic_dec_return_release(&cmdq->lock);
-}
-
-static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
{
- if (atomic_read(&cmdq->lock) == 1)
- return false;
-
- arm_smmu_cmdq_shared_unlock(cmdq);
- return true;
-}
-
-#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
-({ \
- bool __ret; \
- local_irq_save(flags); \
- __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
- if (!__ret) \
- local_irq_restore(flags); \
- __ret; \
-})
-
-#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
-({ \
- atomic_set_release(&cmdq->lock, 0); \
- local_irq_restore(flags); \
-})
-
-
-/*
- * Command queue insertion.
- * This is made fiddly by our attempts to achieve some sort of scalability
- * since there is one queue shared amongst all of the CPUs in the system. If
- * you like mixed-size concurrency, dependency ordering and relaxed atomics,
- * then you'll *love* this monstrosity.
- *
- * The basic idea is to split the queue up into ranges of commands that are
- * owned by a given CPU; the owner may not have written all of the commands
- * itself, but is responsible for advancing the hardware prod pointer when
- * the time comes. The algorithm is roughly:
- *
- * 1. Allocate some space in the queue. At this point we also discover
- * whether the head of the queue is currently owned by another CPU,
- * or whether we are the owner.
- *
- * 2. Write our commands into our allocated slots in the queue.
- *
- * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
- *
- * 4. If we are an owner:
- * a. Wait for the previous owner to finish.
- * b. Mark the queue head as unowned, which tells us the range
- * that we are responsible for publishing.
- * c. Wait for all commands in our owned range to become valid.
- * d. Advance the hardware prod pointer.
- * e. Tell the next owner we've finished.
- *
- * 5. If we are inserting a CMD_SYNC (we may or may not have been an
- * owner), then we need to stick around until it has completed:
- * a. If we have MSIs, the SMMU can write back into the CMD_SYNC
- * to clear the first 4 bytes.
- * b. Otherwise, we spin waiting for the hardware cons pointer to
- * advance past our command.
- *
- * The devil is in the details, particularly the use of locking for handling
- * SYNC completion and freeing up space in the queue before we think that it is
- * full.
- */
-static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
- u32 sprod, u32 eprod, bool set)
-{
- u32 swidx, sbidx, ewidx, ebidx;
- struct arm_smmu_ll_queue llq = {
- .max_n_shift = cmdq->q.llq.max_n_shift,
- .prod = sprod,
- };
-
- ewidx = BIT_WORD(Q_IDX(&llq, eprod));
- ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
-
- while (llq.prod != eprod) {
- unsigned long mask;
- atomic_long_t *ptr;
- u32 limit = BITS_PER_LONG;
-
- swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
- sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
-
- ptr = &cmdq->valid_map[swidx];
-
- if ((swidx == ewidx) && (sbidx < ebidx))
- limit = ebidx;
-
- mask = GENMASK(limit - 1, sbidx);
-
- /*
- * The valid bit is the inverse of the wrap bit. This means
- * that a zero-initialised queue is invalid and, after marking
- * all entries as valid, they become invalid again when we
- * wrap.
- */
- if (set) {
- atomic_long_xor(mask, ptr);
- } else { /* Poll */
- unsigned long valid;
+ struct arm_smmu_queue *q = &smmu->cmdq.q;
+ bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
- valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
- atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
- }
+ smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
- llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
+ while (queue_insert_raw(q, cmd) == -ENOSPC) {
+ if (queue_poll_cons(q, false, wfe))
+ dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
}
}
-/* Mark all entries in the range [sprod, eprod) as valid */
-static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
- u32 sprod, u32 eprod)
-{
- __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
-}
-
-/* Wait for all entries in the range [sprod, eprod) to become valid */
-static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
- u32 sprod, u32 eprod)
-{
- __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
-}
-
-/* Wait for the command queue to become non-full */
-static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
- struct arm_smmu_ll_queue *llq)
+static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
{
+ u64 cmd[CMDQ_ENT_DWORDS];
unsigned long flags;
- struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- int ret = 0;
- /*
- * Try to update our copy of cons by grabbing exclusive cmdq access. If
- * that fails, spin until somebody else updates it for us.
- */
- if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
- WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
- arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
- llq->val = READ_ONCE(cmdq->q.llq.val);
- return 0;
+ if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+ dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+ ent->opcode);
+ return;
}
- queue_poll_init(smmu, &qp);
- do {
- llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
- if (!queue_full(llq))
- break;
-
- ret = queue_poll(&qp);
- } while (!ret);
-
- return ret;
+ spin_lock_irqsave(&smmu->cmdq.lock, flags);
+ arm_smmu_cmdq_insert_cmd(smmu, cmd);
+ spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
}
/*
- * Wait until the SMMU signals a CMD_SYNC completion MSI.
- * Must be called with the cmdq lock held in some capacity.
+ * The difference between val and sync_idx is bounded by the maximum size of
+ * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
*/
-static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
- struct arm_smmu_ll_queue *llq)
-{
- int ret = 0;
- struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
-
- queue_poll_init(smmu, &qp);
-
- /*
- * The MSI won't generate an event, since it's being written back
- * into the command queue.
- */
- qp.wfe = false;
- smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
- llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
- return ret;
-}
-
-/*
- * Wait until the SMMU cons index passes llq->prod.
- * Must be called with the cmdq lock held in some capacity.
- */
-static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
- struct arm_smmu_ll_queue *llq)
-{
- struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- u32 prod = llq->prod;
- int ret = 0;
-
- queue_poll_init(smmu, &qp);
- llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
- do {
- if (queue_consumed(llq, prod))
- break;
-
- ret = queue_poll(&qp);
-
- /*
- * This needs to be a readl() so that our subsequent call
- * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
- *
- * Specifically, we need to ensure that we observe all
- * shared_lock()s by other CMD_SYNCs that share our owner,
- * so that a failing call to tryunlock() means that we're
- * the last one out and therefore we can safely advance
- * cmdq->q.llq.cons. Roughly speaking:
- *
- * CPU 0 CPU1 CPU2 (us)
- *
- * if (sync)
- * shared_lock();
- *
- * dma_wmb();
- * set_valid_map();
- *
- * if (owner) {
- * poll_valid_map();
- * <control dependency>
- * writel(prod_reg);
- *
- * readl(cons_reg);
- * tryunlock();
- *
- * Requires us to see CPU 0's shared_lock() acquisition.
- */
- llq->cons = readl(cmdq->q.cons_reg);
- } while (!ret);
-
- return ret;
-}
-
-static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
- struct arm_smmu_ll_queue *llq)
+static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
{
- if (smmu->features & ARM_SMMU_FEAT_MSI &&
- smmu->features & ARM_SMMU_FEAT_COHERENCY)
- return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
-
- return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
-}
-
-static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
- u32 prod, int n)
-{
- int i;
- struct arm_smmu_ll_queue llq = {
- .max_n_shift = cmdq->q.llq.max_n_shift,
- .prod = prod,
- };
+ ktime_t timeout;
+ u32 val;
- for (i = 0; i < n; ++i) {
- u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+ timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
+ val = smp_cond_load_acquire(&smmu->sync_count,
+ (int)(VAL - sync_idx) >= 0 ||
+ !ktime_before(ktime_get(), timeout));
- prod = queue_inc_prod_n(&llq, i);
- queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
- }
+ return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
}
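/*
 * Illustrative sketch, not driver code: the signed 32-bit difference used
 * above stays correct across a sync_nr wrap, because the number of
 * outstanding syncs can never approach 2^31. Values are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t sync_idx = 0xfffffffe;	/* MSIData of the sync we issued */
	uint32_t val = 0x00000002;	/* sync_count after the counter wrapped */

	/* Unsigned subtraction wraps to 4; the signed compare says "consumed". */
	printf("consumed: %d\n", (int32_t)(val - sync_idx) >= 0);	/* 1 */
	return 0;
}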
-/*
- * This is the actual insertion function, and provides the following
- * ordering guarantees to callers:
- *
- * - There is a dma_wmb() before publishing any commands to the queue.
- * This can be relied upon to order prior writes to data structures
- * in memory (such as a CD or an STE) before the command.
- *
- * - On completion of a CMD_SYNC, there is a control dependency.
- * This can be relied upon to order subsequent writes to memory (e.g.
- * freeing an IOVA) after completion of the CMD_SYNC.
- *
- * - Command insertion is totally ordered, so if two CPUs each race to
- * insert their own list of commands then all of the commands from one
- * CPU will appear before any of the commands from the other CPU.
- */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
- u64 *cmds, int n, bool sync)
+static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
{
- u64 cmd_sync[CMDQ_ENT_DWORDS];
- u32 prod;
+ u64 cmd[CMDQ_ENT_DWORDS];
unsigned long flags;
- bool owner;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- struct arm_smmu_ll_queue llq = {
- .max_n_shift = cmdq->q.llq.max_n_shift,
- }, head = llq;
- int ret = 0;
-
- /* 1. Allocate some space in the queue */
- local_irq_save(flags);
- llq.val = READ_ONCE(cmdq->q.llq.val);
- do {
- u64 old;
-
- while (!queue_has_space(&llq, n + sync)) {
- local_irq_restore(flags);
- if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
- dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
- local_irq_save(flags);
- }
-
- head.cons = llq.cons;
- head.prod = queue_inc_prod_n(&llq, n + sync) |
- CMDQ_PROD_OWNED_FLAG;
-
- old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
- if (old == llq.val)
- break;
-
- llq.val = old;
- } while (1);
- owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
- head.prod &= ~CMDQ_PROD_OWNED_FLAG;
- llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
-
- /*
- * 2. Write our commands into the queue
- * Dependency ordering from the cmpxchg() loop above.
- */
- arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
- if (sync) {
- prod = queue_inc_prod_n(&llq, n);
- arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
- queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
-
- /*
- * In order to determine completion of our CMD_SYNC, we must
- * ensure that the queue can't wrap twice without us noticing.
- * We achieve that by taking the cmdq lock as shared before
- * marking our slot as valid.
- */
- arm_smmu_cmdq_shared_lock(cmdq);
- }
-
- /* 3. Mark our slots as valid, ensuring commands are visible first */
- dma_wmb();
- arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
-
- /* 4. If we are the owner, take control of the SMMU hardware */
- if (owner) {
- /* a. Wait for previous owner to finish */
- atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
-
- /* b. Stop gathering work by clearing the owned flag */
- prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
- &cmdq->q.llq.atomic.prod);
- prod &= ~CMDQ_PROD_OWNED_FLAG;
-
- /*
- * c. Wait for any gathered work to be written to the queue.
- * Note that we read our own entries so that we have the control
- * dependency required by (d).
- */
- arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+ struct arm_smmu_cmdq_ent ent = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ .sync = {
+ .msiaddr = virt_to_phys(&smmu->sync_count),
+ },
+ };
- /*
- * d. Advance the hardware prod pointer
- * Control dependency ordering from the entries becoming valid.
- */
- writel_relaxed(prod, cmdq->q.prod_reg);
+ spin_lock_irqsave(&smmu->cmdq.lock, flags);
- /*
- * e. Tell the next owner we're done
- * Make sure we've updated the hardware first, so that we don't
- * race to update prod and potentially move it backwards.
- */
- atomic_set_release(&cmdq->owner_prod, prod);
+ /* Piggy-back on the previous command if it's a SYNC */
+ if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
+ ent.sync.msidata = smmu->sync_nr;
+ } else {
+ ent.sync.msidata = ++smmu->sync_nr;
+ arm_smmu_cmdq_build_cmd(cmd, &ent);
+ arm_smmu_cmdq_insert_cmd(smmu, cmd);
}
- /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
- if (sync) {
- llq.prod = queue_inc_prod_n(&llq, n);
- ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
- if (ret) {
- dev_err_ratelimited(smmu->dev,
- "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
- llq.prod,
- readl_relaxed(cmdq->q.prod_reg),
- readl_relaxed(cmdq->q.cons_reg));
- }
+ spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
- /*
- * Try to unlock the cmq lock. This will fail if we're the last
- * reader, in which case we can safely update cmdq->q.llq.cons
- */
- if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
- WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
- arm_smmu_cmdq_shared_unlock(cmdq);
- }
- }
-
- local_irq_restore(flags);
- return ret;
+ return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
}
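/*
 * Illustrative sketch, not driver code: the piggy-back above. When the
 * most recent command in the queue is already a CMD_SYNC, a caller can
 * wait on that command's MSIData instead of queueing its own sync.
 * "prev" and "nr" are made-up stand-ins for prev_cmd_opcode and sync_nr,
 * and locking is omitted.
 */
#include <stdint.h>
#include <stdio.h>

enum { OP_TLBI = 0x12, OP_SYNC = 0x46 };

static uint32_t nr;		/* models smmu->sync_nr */
static int prev = OP_TLBI;	/* models smmu->prev_cmd_opcode */

/* Returns the MSIData value the caller should poll for. */
static uint32_t issue_sync(void)
{
	if (prev == OP_SYNC)
		return nr;	/* reuse the CMD_SYNC already queued */
	prev = OP_SYNC;		/* pretend we built and inserted a new sync */
	return ++nr;
}

int main(void)
{
	uint32_t a = issue_sync();	/* inserts a sync, waits for 1 */
	uint32_t b = issue_sync();	/* piggy-backs on it, also waits for 1 */

	printf("%u %u\n", (unsigned)a, (unsigned)b);	/* 1 1 */
	return 0;
}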
-static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_ent *ent)
+static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
{
u64 cmd[CMDQ_ENT_DWORDS];
+ unsigned long flags;
+ bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
+ int ret;
- if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
- dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
- ent->opcode);
- return -EINVAL;
- }
+ arm_smmu_cmdq_build_cmd(cmd, &ent);
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
-}
+ spin_lock_irqsave(&smmu->cmdq.lock, flags);
+ arm_smmu_cmdq_insert_cmd(smmu, cmd);
+ ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+ spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
-{
- return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+ return ret;
}
-static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_batch *cmds,
- struct arm_smmu_cmdq_ent *cmd)
+static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
{
- if (cmds->num == CMDQ_BATCH_ENTRIES) {
- arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
- cmds->num = 0;
- }
- arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
- cmds->num++;
-}
+ int ret;
+ bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
+ (smmu->features & ARM_SMMU_FEAT_COHERENCY);
-static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_batch *cmds)
-{
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+ ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
+ : __arm_smmu_cmdq_issue_sync(smmu);
+ if (ret)
+ dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
+ return ret;
}
/* Context descriptor manipulation functions */
size_t i;
unsigned long flags;
struct arm_smmu_master *master;
- struct arm_smmu_cmdq_batch cmds = {};
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd = {
.opcode = CMDQ_OP_CFGI_CD,
list_for_each_entry(master, &smmu_domain->devices, domain_head) {
for (i = 0; i < master->num_sids; i++) {
cmd.cfgi.sid = master->sids[i];
- arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
}
}
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- arm_smmu_cmdq_batch_submit(smmu, &cmds);
+ arm_smmu_cmdq_issue_sync(smmu);
}
static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
cmd->atc.size = log2_span;
}
-static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
+ struct arm_smmu_cmdq_ent *cmd)
{
int i;
- struct arm_smmu_cmdq_ent cmd;
- arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
+ if (!master->ats_enabled)
+ return 0;
for (i = 0; i < master->num_sids; i++) {
- cmd.atc.sid = master->sids[i];
- arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
+ cmd->atc.sid = master->sids[i];
+ arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
}
return arm_smmu_cmdq_issue_sync(master->smmu);
static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
int ssid, unsigned long iova, size_t size)
{
- int i;
+ int ret = 0;
unsigned long flags;
struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_master *master;
- struct arm_smmu_cmdq_batch cmds = {};
if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
return 0;
arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_for_each_entry(master, &smmu_domain->devices, domain_head) {
- if (!master->ats_enabled)
- continue;
-
- for (i = 0; i < master->num_sids; i++) {
- cmd.atc.sid = master->sids[i];
- arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
- }
- }
+ list_for_each_entry(master, &smmu_domain->devices, domain_head)
+ ret |= arm_smmu_atc_inv_master(master, &cmd);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
+ return ret ? -ETIMEDOUT : 0;
}
/* IO_PGTABLE API */
/*
* NOTE: when io-pgtable is in non-strict mode, we may get here with
* PTEs previously cleared by unmaps on the current CPU not yet visible
- * to the SMMU. We are relying on the dma_wmb() implicit during cmd
- * insertion to guarantee those are observed before the TLBI. Do be
- * careful, 007.
+ * to the SMMU. We are relying on the DSB implicit in
+ * queue_sync_prod_out() to guarantee those are observed before the
+ * TLBI. Do be careful, 007.
*/
arm_smmu_cmdq_issue_cmd(smmu, &cmd);
arm_smmu_cmdq_issue_sync(smmu);
- arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
}
-static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
- size_t granule, bool leaf,
- struct arm_smmu_domain *smmu_domain)
+static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+ size_t granule, bool leaf, void *cookie)
{
+ struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
- unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
- size_t inv_range = granule;
- struct arm_smmu_cmdq_batch cmds = {};
struct arm_smmu_cmdq_ent cmd = {
.tlbi = {
.leaf = leaf,
+ .addr = iova,
},
};
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
}
- if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
- /* Get the leaf page size */
- tg = __ffs(smmu_domain->domain.pgsize_bitmap);
-
- /* Convert page size of 12,14,16 (log2) to 1,2,3 */
- cmd.tlbi.tg = (tg - 10) / 2;
-
- /* Determine what level the granule is at */
- cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
-
- num_pages = size >> tg;
- }
-
- while (iova < end) {
- if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
- /*
- * On each iteration of the loop, the range is 5 bits
- * worth of the aligned size remaining.
- * The range in pages is:
- *
- * range = (num_pages & (0x1f << __ffs(num_pages)))
- */
- unsigned long scale, num;
-
- /* Determine the power of 2 multiple number of pages */
- scale = __ffs(num_pages);
- cmd.tlbi.scale = scale;
-
- /* Determine how many chunks of 2^scale size we have */
- num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
- cmd.tlbi.num = num - 1;
-
- /* range is num * 2^scale * pgsize */
- inv_range = num << (scale + tg);
-
- /* Clear out the lower order bits for the next iteration */
- num_pages -= num << scale;
- }
-
- cmd.tlbi.addr = iova;
- arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
- iova += inv_range;
- }
- arm_smmu_cmdq_batch_submit(smmu, &cmds);
-
- /*
- * Unfortunately, this can't be leaf-only since we may have
- * zapped an entire table.
- */
- arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
+ do {
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ cmd.tlbi.addr += granule;
+ } while (size -= granule);
}
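/*
 * Illustrative sketch, not driver code: cost of the restored loop above,
 * which emits one TLBI per granule rather than a range command.
 * Invalidating 2MiB with a 4KiB leaf size queues 512 TLBIs (the caller
 * then issues a single CMD_SYNC).
 */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t size = 2 * 1024 * 1024;	/* bytes to invalidate */
	size_t granule = 4096;		/* leaf page size */
	size_t addr = 0, cmds = 0;

	do {
		cmds++;			/* one CMDQ_OP_TLBI_* per iteration */
		addr += granule;
	} while (size -= granule);

	printf("%zu commands for %zu bytes\n", cmds, addr);	/* 512, 2097152 */
	return 0;
}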
static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
unsigned long iova, size_t granule,
void *cookie)
{
- struct arm_smmu_domain *smmu_domain = cookie;
- struct iommu_domain *domain = &smmu_domain->domain;
-
- iommu_iotlb_gather_add_page(domain, gather, iova, granule);
+ arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie);
}
static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
size_t granule, void *cookie)
{
- arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ arm_smmu_tlb_inv_range_nosync(iova, size, granule, false, cookie);
+ arm_smmu_cmdq_issue_sync(smmu);
}
static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
size_t granule, void *cookie)
{
- arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ arm_smmu_tlb_inv_range_nosync(iova, size, granule, true, cookie);
+ arm_smmu_cmdq_issue_sync(smmu);
}
static const struct iommu_flush_ops arm_smmu_flush_ops = {
static void arm_smmu_disable_ats(struct arm_smmu_master *master)
{
+ struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_domain *smmu_domain = master->domain;
if (!master->ats_enabled)
* ATC invalidation via the SMMU.
*/
wmb();
- arm_smmu_atc_inv_master(master);
+ arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
+ arm_smmu_atc_inv_master(master, &cmd);
atomic_dec(&smmu_domain->nr_ats_masters);
}
static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
size_t size, struct iommu_iotlb_gather *gather)
{
+ int ret;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
if (!ops)
return 0;
- return ops->unmap(ops, iova, size, gather);
+ ret = ops->unmap(ops, iova, size, gather);
+ if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size))
+ return 0;
+
+ return ret;
}
static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
- arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
- gather->pgsize, true, smmu_domain);
+ if (smmu)
+ arm_smmu_cmdq_issue_sync(smmu);
}
static phys_addr_t
return 0;
}
-static void arm_smmu_cmdq_free_bitmap(void *data)
-{
- unsigned long *bitmap = data;
- bitmap_free(bitmap);
-}
-
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
-{
- int ret = 0;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
- atomic_long_t *bitmap;
-
- atomic_set(&cmdq->owner_prod, 0);
- atomic_set(&cmdq->lock, 0);
-
- bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
- if (!bitmap) {
- dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
- ret = -ENOMEM;
- } else {
- cmdq->valid_map = bitmap;
- devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
- }
-
- return ret;
-}
-
static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
{
int ret;
/* cmdq */
+ spin_lock_init(&smmu->cmdq.lock);
ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
"cmdq");
if (ret)
return ret;
- ret = arm_smmu_cmdq_init(smmu);
- if (ret)
- return ret;
-
/* evtq */
ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
/* Queue sizes, capped to ensure natural alignment */
smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
FIELD_GET(IDR1_CMDQS, reg));
- if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
- /*
- * We don't support splitting up batches, so one batch of
- * commands plus an extra sync needs to fit inside the command
- * queue. There's also no way we can handle the weird alignment
- * restrictions on the base pointer for a unit-length queue.
- */
- dev_err(smmu->dev, "command queue size <= %d entries not supported\n",
- CMDQ_BATCH_ENTRIES);
+ if (!smmu->cmdq.q.llq.max_n_shift) {
+ /* Odd alignment restrictions on the base, so ignore for now */
+ dev_err(smmu->dev, "unit-length command queue not supported\n");
return -ENXIO;
}
if (smmu->sid_bits <= STRTAB_SPLIT)
smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
- /* IDR3 */
- reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
- if (FIELD_GET(IDR3_RIL, reg))
- smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
-
/* IDR5 */
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);