* If the firmware has indicated an error, print out the reason for
* the firmware error.
*/
-static void t4_report_fw_error(struct adapter *adap)
+void t4_report_fw_error(struct adapter *adap)
{
static const char *const reason[] = {
"Crash", /* PCIE_FW_EVAL_CRASH */
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
if (pcie_fw & F_PCIE_FW_ERR) {
- adap->flags &= ~FW_OK;
CH_ERR(adap, "firmware reports adapter error: %s (0x%08x)\n",
reason[G_PCIE_FW_EVAL(pcie_fw)], pcie_fw);
- if (pcie_fw != 0xffffffff)
- t4_os_dump_devlog(adap);
}
}
/*
* Attempt to gain access to the mailbox.
*/
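+	/*
+	 * Bail out early if the firmware has already reported an error;
+	 * there is no point contending for the mailbox.  PCIE_FW is not
+	 * read on a VF.
+	 */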
+ pcie_fw = 0;
+ if (!(adap->flags & IS_VF)) {
+ pcie_fw = t4_read_reg(adap, A_PCIE_FW);
+ if (pcie_fw & F_PCIE_FW_ERR)
+ goto failed;
+ }
for (i = 0; i < 4; i++) {
ctl = t4_read_reg(adap, ctl_reg);
v = G_MBOWNER(ctl);
* If we were unable to gain access, report the error to our caller.
*/
if (v != X_MBOWNER_PL) {
- t4_report_fw_error(adap);
+ if (!(adap->flags & IS_VF)) {
+ pcie_fw = t4_read_reg(adap, A_PCIE_FW);
+ if (pcie_fw & F_PCIE_FW_ERR)
+ goto failed;
+ }
ret = (v == X_MBOWNER_FW) ? -EBUSY : -ETIMEDOUT;
return ret;
}
* Loop waiting for the reply; bail out if we time out or the firmware
* reports an error.
*/
- pcie_fw = 0;
for (i = 0; i < timeout; i += ms) {
if (!(adap->flags & IS_VF)) {
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
*(const u8 *)cmd, mbox, pcie_fw);
CH_DUMP_MBOX(adap, mbox, 0, "cmdsent", cmd_rpl, true);
CH_DUMP_MBOX(adap, mbox, data_reg, "current", NULL, true);
-
- if (pcie_fw & F_PCIE_FW_ERR) {
- ret = -ENXIO;
- t4_report_fw_error(adap);
- } else {
- ret = -ETIMEDOUT;
- t4_os_dump_devlog(adap);
- }
-
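+/*
+ * Errors here are fatal: either the firmware is dead (-ENXIO) or a
+ * mailbox command timed out (-ETIMEDOUT).  In both cases clear FW_OK
+ * and shut the adapter down.
+ */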
+failed:
+ adap->flags &= ~FW_OK;
+	ret = (pcie_fw & F_PCIE_FW_ERR) ? -ENXIO : -ETIMEDOUT;
t4_fatal_err(adap, true);
return ret;
}
*/
static bool cim_intr_handler(struct adapter *adap, int arg, bool verbose)
{
- static const struct intr_action cim_host_intr_actions[] = {
- { F_TIMER0INT, 0, t4_os_dump_cimla },
- { 0 },
- };
static const struct intr_details cim_host_intr_details[] = {
/* T6+ */
{ F_PCIE2CIMINTFPARERR, "CIM IBQ PCIe interface parity error" },
.fatal = 0x007fffe6,
.flags = NONFATAL_IF_DISABLED,
.details = cim_host_intr_details,
- .actions = cim_host_intr_actions,
+ .actions = NULL,
};
static const struct intr_details cim_host_upacc_intr_details[] = {
{ F_EEPROMWRINT, "CIM EEPROM came out of busy state" },
u32 val, fw_err;
bool fatal;
- fw_err = t4_read_reg(adap, A_PCIE_FW);
- if (fw_err & F_PCIE_FW_ERR)
- t4_report_fw_error(adap);
-
/*
* When the Firmware detects an internal error which normally wouldn't
* raise a Host Interrupt, it forces a CIM Timer0 interrupt in order
* to let the Host know of the error.  So if we see a
* Timer0 interrupt and don't see a Firmware Crash, ignore the Timer0
* interrupt.
*/
+ fw_err = t4_read_reg(adap, A_PCIE_FW);
val = t4_read_reg(adap, A_CIM_HOST_INT_CAUSE);
if (val & F_TIMER0INT && (!(fw_err & F_PCIE_FW_ERR) ||
G_PCIE_FW_EVAL(fw_err) != PCIE_FW_EVAL_CRASH)) {
t4_write_reg(adap, A_CIM_HOST_INT_CAUSE, F_TIMER0INT);
}
- fatal = false;
+ fatal = (fw_err & F_PCIE_FW_ERR) != 0;
fatal |= t4_handle_intr(adap, &cim_host_intr_info, 0, verbose);
fatal |= t4_handle_intr(adap, &cim_host_upacc_intr_info, 0, verbose);
fatal |= t4_handle_intr(adap, &cim_pf_host_intr_info, 0, verbose);
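+	/*
+	 * Flag the CIM error for the OS; the expensive dumps (CIM
+	 * registers, CIM LA, devlog) are collected later from the fatal
+	 * error task.
+	 */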
+ if (fatal)
+ t4_os_cim_err(adap);
return (fatal);
}
* The designation 'slow' is because it involves register reads, while
* data interrupts typically don't involve any MMIOs.
*/
-int t4_slow_intr_handler(struct adapter *adap, bool verbose)
+bool t4_slow_intr_handler(struct adapter *adap, bool verbose)
{
static const struct intr_details pl_intr_details[] = {
{ F_MC1, "MC1" },
.details = pl_intr_details,
.actions = pl_intr_action,
};
- bool fatal;
u32 perr;
perr = t4_read_reg(adap, pl_perr_cause.cause_reg);
if (verbose)
perr |= t4_read_reg(adap, pl_intr_info.enable_reg);
}
- fatal = t4_handle_intr(adap, &pl_intr_info, perr, verbose);
- if (fatal)
- t4_fatal_err(adap, false);
- return (0);
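+	/*
+	 * Report whether a fatal interrupt was found; acting on it is the
+	 * caller's responsibility (see t4_fatal_err).
+	 */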
+ return (t4_handle_intr(adap, &pl_intr_info, perr, verbose));
}
#define PF_INTR_MASK (F_PFSW | F_PFCIM)
if (ret != FW_SUCCESS) {
if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0)
goto retry;
- if (t4_read_reg(adap, A_PCIE_FW) & F_PCIE_FW_ERR)
- t4_report_fw_error(adap);
return ret;
}
static int release_clip_addr(struct adapter *, struct t4_clip_addr *);
#ifdef TCP_OFFLOAD
static int toe_capability(struct vi_info *, bool);
-static void t4_async_event(void *, int);
+static void t4_async_event(struct adapter *);
#endif
#ifdef KERN_TLS
static int ktls_capability(struct adapter *, bool);
static uint64_t vi_get_counter(struct ifnet *, ift_counter);
static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter);
static void enable_vxlan_rx(struct adapter *);
-static void reset_adapter(void *, int);
+static void reset_adapter_task(void *, int);
+static void fatal_error_task(void *, int);
+static void dump_devlog(struct adapter *);
+static void dump_cim_regs(struct adapter *);
+static void dump_cimla(struct adapter *);
struct {
uint16_t device;
callout_init(&sc->ktls_tick, 1);
-#ifdef TCP_OFFLOAD
- TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
-#endif
-
refcount_init(&sc->vxlan_refcount, 0);
- TASK_INIT(&sc->reset_task, 0, reset_adapter, sc);
+ TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc);
+ TASK_INIT(&sc->fatal_error_task, 0, fatal_error_task, sc);
sc->ctrlq_oid = SYSCTL_ADD_NODE(&sc->ctx,
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq",
}
}
-#ifdef TCP_OFFLOAD
- taskqueue_drain(taskqueue_thread, &sc->async_event_task);
-#endif
-
for (i = 0; i < sc->intr_count; i++)
t4_free_irq(sc, &sc->irq[i]);
return (true);
}
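+
+/*
+ * Stop the adapter (no more DMA or interrupts).  Idempotent: only the
+ * first caller performs the actual shutdown and subsequent callers get
+ * a return value of 1.  Note that atomic_testandset_int takes a bit
+ * number, hence the ilog2(ADAP_STOPPED).
+ */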
+static inline int
+stop_adapter(struct adapter *sc)
+{
+ if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_STOPPED)))
+ return (1); /* Already stopped. */
+ return (t4_shutdown_adapter(sc));
+}
+
static int
t4_suspend(device_t dev)
{
}
/* No more DMA or interrupts. */
- t4_shutdown_adapter(sc);
+ stop_adapter(sc);
/* Quiesce all activity. */
for_each_port(sc, i) {
/* Mark the adapter totally off limits. */
mtx_lock(&sc->reg_lock);
- sc->flags |= HW_OFF_LIMITS;
+ atomic_set_int(&sc->error_flags, HW_OFF_LIMITS);
sc->flags &= ~(FW_OK | MASTER_PF);
sc->reset_thread = NULL;
mtx_unlock(&sc->reg_lock);
- sc->num_resets++;
CH_ALERT(sc, "suspend completed.\n");
done:
end_synchronized_op(sc, 0);
goto done;
}
+ /* Note that HW_OFF_LIMITS is cleared a bit later. */
+ atomic_clear_int(&sc->error_flags, ADAP_FATAL_ERR | ADAP_STOPPED);
+
/* Restore memory window. */
setup_memwin(sc);
CH_ALERT(sc, "recovery mode on resume.\n");
rc = 0;
mtx_lock(&sc->reg_lock);
- sc->flags &= ~HW_OFF_LIMITS;
+ atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS);
mtx_unlock(&sc->reg_lock);
goto done;
}
* this thread is still in the middle of a synchronized_op.
*/
mtx_lock(&sc->reg_lock);
- sc->flags &= ~HW_OFF_LIMITS;
+ atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS);
mtx_unlock(&sc->reg_lock);
if (sc->flags & FULL_INIT_DONE) {
return (0);
}
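+
+/*
+ * With HW_OFF_LIMITS kept in the atomically-updated error_flags, the
+ * hw_off_limits() check used below presumably reads along these lines
+ * (a sketch only; the real inline lives in the adapter header and is
+ * not part of this excerpt):
+ *
+ *	static inline bool
+ *	hw_off_limits(struct adapter *sc)
+ *	{
+ *		return ((atomic_load_int(&sc->error_flags) &
+ *		    HW_OFF_LIMITS) != 0);
+ *	}
+ */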
-static void
-reset_adapter(void *arg, int pending)
+static int
+reset_adapter(struct adapter *sc)
{
- struct adapter *sc = arg;
- int rc;
+ int rc, oldinc, error_flags;
CH_ALERT(sc, "reset requested.\n");
rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1");
if (rc != 0)
- return;
+ return (EBUSY);
if (hw_off_limits(sc)) {
CH_ERR(sc, "adapter is suspended, use resume (not reset).\n");
}
done:
+ oldinc = sc->incarnation;
end_synchronized_op(sc, 0);
if (rc != 0)
- return; /* Error logged already. */
+ return (rc); /* Error logged already. */
+ atomic_add_int(&sc->num_resets, 1);
mtx_lock(&Giant);
rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0);
mtx_unlock(&Giant);
if (rc != 0)
CH_ERR(sc, "bus_reset_child failed: %d.\n", rc);
- else
- CH_ALERT(sc, "bus_reset_child succeeded.\n");
+ else {
+ rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst2");
+ if (rc != 0)
+ return (EBUSY);
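+		/*
+		 * A successful reset leaves the adapter with a higher
+		 * incarnation and no error flags set.
+		 */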
+ error_flags = atomic_load_int(&sc->error_flags);
+ if (sc->incarnation > oldinc && error_flags == 0) {
+ CH_ALERT(sc, "bus_reset_child succeeded.\n");
+ } else {
+ CH_ERR(sc, "adapter did not reset properly, flags "
+ "0x%08x, error_flags 0x%08x.\n", sc->flags,
+ error_flags);
+ rc = ENXIO;
+ }
+ end_synchronized_op(sc, 0);
+ }
+
+ return (rc);
+}
+
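+/*
+ * Taskqueue wrapper for reset_adapter(); a task has no way to return a
+ * status, so the return value is ignored.
+ */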
+static void
+reset_adapter_task(void *arg, int pending)
+{
+ /* XXX: t4_async_event here? */
+ reset_adapter(arg);
}
static int
panic("%s: panic on fatal error", device_get_nameunit(sc->dev));
}
-void
-t4_fatal_err(struct adapter *sc, bool fw_error)
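+/*
+ * Runs from a taskqueue after a fatal error: collects the CIM dumps if
+ * a CIM error was flagged and then, depending on the
+ * t4_reset_on_fatal_err and t4_panic_on_fatal_err tunables, resets the
+ * adapter and/or arms the delayed panic.
+ */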
+static void
+fatal_error_task(void *arg, int pending)
{
+ struct adapter *sc = arg;
+	int rc;
+
- t4_shutdown_adapter(sc);
- log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n",
- device_get_nameunit(sc->dev));
- if (fw_error) {
- if (sc->flags & CHK_MBOX_ACCESS)
- ASSERT_SYNCHRONIZED_OP(sc);
- sc->flags |= ADAP_ERR;
- } else {
- ADAPTER_LOCK(sc);
- sc->flags |= ADAP_ERR;
- ADAPTER_UNLOCK(sc);
- }
#ifdef TCP_OFFLOAD
- taskqueue_enqueue(taskqueue_thread, &sc->async_event_task);
+ t4_async_event(sc);
#endif
+ if (atomic_testandclear_int(&sc->error_flags, ilog2(ADAP_CIM_ERR))) {
+ dump_cim_regs(sc);
+ dump_cimla(sc);
+ dump_devlog(sc);
+ }
+
+ if (t4_reset_on_fatal_err) {
+ CH_ALERT(sc, "resetting on fatal error.\n");
+ rc = reset_adapter(sc);
+ if (rc == 0 && t4_panic_on_fatal_err) {
+ CH_ALERT(sc, "reset was successful, "
+ "system will NOT panic.\n");
+ return;
+ }
+ }
if (t4_panic_on_fatal_err) {
CH_ALERT(sc, "panicking on fatal error (after 30s).\n");
callout_reset(&fatal_callout, hz * 30, delayed_panic, sc);
- } else if (t4_reset_on_fatal_err) {
- CH_ALERT(sc, "resetting on fatal error.\n");
- taskqueue_enqueue(reset_tq, &sc->reset_task);
}
}
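+
+/*
+ * Entry point for all fatal errors.  Stops the adapter immediately,
+ * latches the first fatal error (repeat calls return early), scans the
+ * main INT_CAUSE registers on a firmware error, and defers the rest of
+ * the work to fatal_error_task.
+ */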
+void
+t4_fatal_err(struct adapter *sc, bool fw_error)
+{
+ const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
+
+ stop_adapter(sc);
+ if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_FATAL_ERR)))
+ return;
+ if (fw_error) {
+ /*
+ * We are here because of a firmware error/timeout and not
+ * because of a hardware interrupt. It is possible (although
+ * not very likely) that an error interrupt was also raised but
+ * this thread ran first and inhibited t4_intr_err. We walk the
+ * main INT_CAUSE registers here to make sure we haven't missed
+ * anything interesting.
+ */
+ t4_slow_intr_handler(sc, verbose);
+ atomic_set_int(&sc->error_flags, ADAP_CIM_ERR);
+ }
+ t4_report_fw_error(sc);
+ log(LOG_ALERT, "%s: encountered fatal error, adapter stopped (%d).\n",
+ device_get_nameunit(sc->dev), fw_error);
+ taskqueue_enqueue(reset_tq, &sc->fatal_error_task);
+}
+
void
t4_add_adapter(struct adapter *sc)
{
return (rc);
}
-bool
-t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose)
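+/*
+ * Log a snapshot of the CIM debug registers for post-mortem analysis.
+ */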
+static void
+dump_cim_regs(struct adapter *sc)
+{
+ log(LOG_DEBUG, "%s: CIM debug regs %08x %08x %08x %08x %08x\n",
+ device_get_nameunit(sc->dev),
+ t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA0),
+ t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA1),
+ t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA2),
+ t4_read_reg(sc, A_EDC_H_BIST_DATA_PATTERN),
+ t4_read_reg(sc, A_EDC_H_BIST_STATUS_RDATA));
+}
+
+static void
+dump_cimla(struct adapter *sc)
{
struct sbuf sb;
int rc;
- if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
- return (false);
+ if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) {
+ log(LOG_DEBUG, "%s: failed to generate CIM LA dump.\n",
+ device_get_nameunit(sc->dev));
+ return;
+ }
rc = sbuf_cim_la(sc, &sb, M_NOWAIT);
if (rc == 0) {
rc = sbuf_finish(&sb);
if (rc == 0) {
- log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s",
+ log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s\n",
device_get_nameunit(sc->dev), sbuf_data(&sb));
}
}
sbuf_delete(&sb);
- return (false);
+}
+
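+/*
+ * Called from the shared interrupt code when a fatal CIM error is
+ * detected.  Just sets a flag; the dumps happen later from
+ * fatal_error_task, outside interrupt context.
+ */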
+void
+t4_os_cim_err(struct adapter *sc)
+{
+ atomic_set_int(&sc->error_flags, ADAP_CIM_ERR);
}
static int
return (rc);
}
-void
-t4_os_dump_devlog(struct adapter *sc)
+static void
+dump_devlog(struct adapter *sc)
{
int rc;
struct sbuf sb;
u_int val;
int rc;
- val = sc->num_resets;
+ val = atomic_load_int(&sc->num_resets);
rc = sysctl_handle_int(oidp, &val, 0, req);
if (rc != 0 || req->newptr == NULL)
return (rc);
if (val == 0) {
/* Zero out the counter that tracks resets. */
- sc->num_resets = 0;
+ atomic_store_int(&sc->num_resets, 0);
return (0);
}
}
static void
-t4_async_event(void *arg, int n)
+t4_async_event(struct adapter *sc)
{
struct uld_info *ui;
- struct adapter *sc = (struct adapter *)arg;
if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4async") != 0)
return;