Branch: SLE12-SP3-AZURE
author     Hannes Reinecke <hare@suse.de>    2019-05-08 13:59:00 +0200
committer  Hannes Reinecke <hare@suse.de>    2019-05-14 12:45:10 +0200
commit     ce9ba2dfebfc91642b49c9bd217b71aa8791af60
tree       5a3e5d9826522405e75dad2ebfd9d15729862424
parent     61c9eff70acd20dfeb509a5da2876e2d051c07a9

nvme-fc: resolve io failures during connect (bsc#1116803).

 patches.fixes/nvme-fc-resolve-io-failures-during-connect.patch | 192
 series.conf                                                    |   1
 2 files changed, 193 insertions(+), 0 deletions(-)
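
Before the full diff, a minimal userspace sketch of the core mechanism this backport adds: when an io fails while the association is still being created, an error-recovery work item is scheduled exactly once, gated by an atomic flag, so a burst of failing ios triggers only one recovery pass. The names err_work_active and start_error_recovery mirror the patch below, but the C11 atomics and the stub scheduler are stand-ins for the kernel's atomic_t and schedule_work(); this is an illustration of the pattern, not the kernel code itself.

/*
 * "Schedule the error handler exactly once" gate, as added to
 * nvme_fc_error_recovery() in the patch: the first io error that flips
 * err_work_active from 0 to 1 dispatches the recovery work; concurrent
 * errors see the flag already set and return immediately.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int err_work_active;

/* stand-in for schedule_work(&ctrl->err_work); always succeeds here */
static bool schedule_err_work(void)
{
	puts("error recovery work scheduled");
	return true;
}

/* called from every io completion that fails while still connecting */
static void start_error_recovery(void)
{
	/* claim the flag atomically; only the 0 -> 1 transition schedules */
	int active = atomic_exchange(&err_work_active, 1);

	if (!active && !schedule_err_work()) {
		/* scheduling failed: drop the claim so a later error retries */
		atomic_store(&err_work_active, 0);
	}
}

int main(void)
{
	/* three back-to-back io errors; only the first schedules recovery */
	start_error_recovery();
	start_error_recovery();
	start_error_recovery();
	return 0;
}

In the patch itself the flag is cleared again at the end of the err_work handler, so a later connection attempt that hits another error can schedule recovery anew.
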
diff --git a/patches.fixes/nvme-fc-resolve-io-failures-during-connect.patch b/patches.fixes/nvme-fc-resolve-io-failures-during-connect.patch
new file mode 100644
index 0000000000..f9da1df80e
--- /dev/null
+++ b/patches.fixes/nvme-fc-resolve-io-failures-during-connect.patch
@@ -0,0 +1,192 @@
+From: James Smart <jsmart2021@gmail.com>
+Date: Wed, 14 Nov 2018 16:35:10 -0800
+Subject: [PATCH] nvme-fc: resolve io failures during connect
+Git-commit: 4cff280a5fccf6513ed9e895bb3a4e7ad8b0cedc
+Patch-mainline: v4.20-rc4
+References: bsc#1116803
+
+If an io error occurs on an io issued while connecting, recovery
+of the io falls flat as the state checking ends up nooping the error
+handler.
+
+Create an err_work work item that is scheduled upon an io error while
+connecting. The work thread terminates all io on all queues and marks
+the queues as not connected. The termination of the io will return
+back to the callee, which will then back out of the connection attempt
+and will reschedule, if possible, the connection attempt.
+
+The changes:
+- in case there are several commands hitting the error handler, a
+ state flag is kept so that the error work is only scheduled once,
+ on the first error. The subsequent errors can be ignored.
+- The calling sequence to stop keep alive and terminate the queues
+ and their io is lifted from the reset routine. Made a small
+ service routine used by both reset and err_work.
+- During debugging, found that the teardown path can reference
+ an uninitialized pointer, resulting in a NULL pointer oops.
+ The aen_ops weren't initialized yet. Add validation on their
+ initialization before calling the teardown routine.
+
+[hare: ported to SLE12 SP3]
+
+Signed-off-by: James Smart <jsmart2021@gmail.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Acked-by: Hannes Reinecke <hare@suse.com>
+---
+ drivers/nvme/host/fc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 61 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
+index e43a270122a4..8e4e71b2026f 100644
+--- a/drivers/nvme/host/fc.c
++++ b/drivers/nvme/host/fc.c
+@@ -157,6 +157,7 @@ struct nvme_fc_ctrl {
+
+ bool ioq_live;
+ bool assoc_active;
++ atomic_t err_work_active;
+ u64 association_id;
+
+ u64 cap;
+@@ -169,6 +170,7 @@ struct nvme_fc_ctrl {
+ struct work_struct delete_work;
+ struct work_struct reset_work;
+ struct delayed_work connect_work;
++ struct work_struct err_work;
+
+ struct kref ref;
+ u32 flags;
+@@ -1547,6 +1549,10 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
+ struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
+ int i;
+
++ /* ensure we've initialized the ops once */
++ if (!(aen_op->flags & FCOP_FLAGS_AEN))
++ return;
++
+ for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++)
+ __nvme_fc_abort_op(ctrl, aen_op);
+ }
+@@ -2070,7 +2076,25 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
+ static void
+ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
+ {
+- /* only proceed if in LIVE state - e.g. on first error */
++ int active;
++
++ /*
++ * if an error (io timeout, etc) while (re)connecting,
++ * it's an error on creating the new association.
++ * Start the error recovery thread if it hasn't already
++ * been started. It is expected there could be multiple
++ * ios hitting this path before things are cleaned up.
++ */
++ if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
++ active = atomic_xchg(&ctrl->err_work_active, 1);
++ if (!active && !schedule_work(&ctrl->err_work)) {
++ atomic_set(&ctrl->err_work_active, 0);
++ WARN_ON(1);
++ }
++ return;
++ }
++
++ /* Otherwise, only proceed if in LIVE state - e.g. on first error */
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE)
+ return;
+
+@@ -2864,6 +2888,7 @@ nvme_fc_delete_ctrl_work(struct work_struct *work)
+ struct nvme_fc_ctrl *ctrl =
+ container_of(work, struct nvme_fc_ctrl, delete_work);
+
++ cancel_work_sync(&ctrl->err_work);
+ cancel_work_sync(&ctrl->reset_work);
+ cancel_delayed_work_sync(&ctrl->connect_work);
+
+@@ -2970,21 +2995,29 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
+ }
+
+ static void
+-nvme_fc_reset_ctrl_work(struct work_struct *work)
++__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
+ {
+- struct nvme_fc_ctrl *ctrl =
+- container_of(work, struct nvme_fc_ctrl, reset_work);
+- int ret;
++ nvme_stop_keep_alive(&ctrl->ctrl);
+
+ /* will block will waiting for io to terminate */
+ nvme_fc_delete_association(ctrl);
+
+- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
++ if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING &&
++ !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: error_recovery: Couldn't change state "
+ "to RECONNECTING\n", ctrl->cnum);
+- return;
+ }
++}
++
++static void
++nvme_fc_reset_ctrl_work(struct work_struct *work)
++{
++ struct nvme_fc_ctrl *ctrl =
++ container_of(work, struct nvme_fc_ctrl, reset_work);
++ int ret;
++
++ __nvme_fc_terminate_io(ctrl);
+
+ if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
+ ret = nvme_fc_create_association(ctrl);
+@@ -3022,6 +3055,24 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
+ return 0;
+ }
+
++static void
++nvme_fc_connect_err_work(struct work_struct *work)
++{
++ struct nvme_fc_ctrl *ctrl =
++ container_of(work, struct nvme_fc_ctrl, err_work);
++
++ __nvme_fc_terminate_io(ctrl);
++
++ atomic_set(&ctrl->err_work_active, 0);
++
++ /*
++ * Rescheduling the connection after recovering
++ * from the io error is left to the reconnect work
++ * item, which is what should have stalled waiting on
++ * the io that had the error that scheduled this work.
++ */
++}
++
+ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
+ .name = "fc",
+ .module = THIS_MODULE,
+@@ -3135,6 +3186,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+ ctrl->cnum = idx;
+ ctrl->ioq_live = false;
+ ctrl->assoc_active = false;
++ atomic_set(&ctrl->err_work_active, 0);
+ init_waitqueue_head(&ctrl->ioabort_wait);
+
+ get_device(ctrl->dev);
+@@ -3143,6 +3195,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+ INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
+ INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
++ INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work);
+ spin_lock_init(&ctrl->lock);
+
+ /* io queue count */
+@@ -3231,6 +3284,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+ fail_ctrl:
+ nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
+ cancel_work_sync(&ctrl->reset_work);
++ cancel_work_sync(&ctrl->err_work);
+ cancel_delayed_work_sync(&ctrl->connect_work);
+
+ ctrl->ctrl.opts = NULL;
+--
+2.16.4
+
diff --git a/series.conf b/series.conf
index 263d4d56cc..0eb17149b3 100644
--- a/series.conf
+++ b/series.conf
@@ -25080,6 +25080,7 @@
patches.fixes/acpi-nfit-fix-ars-overflow-continuation.patch
patches.drivers/xhci-add-quirk-to-workaround-the-errata-seen-on-cavium-thunder-x2-soc.patch
patches.drivers/0003-amd-iommu-fix-guest-virtual-apic-log-tail-address-register
+ patches.fixes/nvme-fc-resolve-io-failures-during-connect.patch
patches.fixes/libceph-fall-back-to-sendmsg-for-slab-pages.patch
patches.drivers/net-ena-fix-crash-during-failed-resume-from-hibernat.patch
patches.drivers/net-ena-fix-crash-during-ena_remove.patch
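
For reference, a second small sketch (plain C11, hypothetical stand-in types and helpers) of the other piece of the patch: the teardown sequence that used to live inside the reset handler is factored into a service routine, modelled on __nvme_fc_terminate_io(), shared by the reset work and the new connect-error work, and it tolerates the controller already being in RECONNECTING, which is exactly the mid-connect error case. This is not the kernel code; it only illustrates the structure.

/*
 * Shared teardown helper: stop the keep-alive, tear down the association
 * (which blocks until outstanding io has terminated), then move the
 * controller to RECONNECTING unless it is already there.
 */
#include <stdbool.h>
#include <stdio.h>

enum ctrl_state { CTRL_LIVE, CTRL_RECONNECTING };

struct ctrl {
	enum ctrl_state state;
};

/* stand-ins for nvme_stop_keep_alive() and nvme_fc_delete_association() */
static void stop_keep_alive(struct ctrl *c)    { (void)c; puts("keep-alive stopped"); }
static void delete_association(struct ctrl *c) { (void)c; puts("association torn down"); }

/* the kernel version can refuse a transition; simplified to always succeed */
static bool change_state(struct ctrl *c, enum ctrl_state new_state)
{
	c->state = new_state;
	return true;
}

/* service routine used by both the reset work and the connect-error work */
static void terminate_io(struct ctrl *c)
{
	stop_keep_alive(c);
	delete_association(c);

	/* errors during connect arrive already in RECONNECTING; don't treat
	 * the skipped transition as a failure in that case */
	if (c->state != CTRL_RECONNECTING &&
	    !change_state(c, CTRL_RECONNECTING))
		fprintf(stderr, "couldn't change state to RECONNECTING\n");
}

int main(void)
{
	struct ctrl live = { .state = CTRL_LIVE };
	struct ctrl connecting = { .state = CTRL_RECONNECTING };

	terminate_io(&live);        /* reset path: LIVE -> RECONNECTING */
	terminate_io(&connecting);  /* connect-error path: already RECONNECTING */
	return 0;
}

In the patch, the reset work goes on to rebuild the association after this teardown, while the connect-error work only clears err_work_active and leaves rescheduling to the connect attempt that was stalled on the failed io.
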