diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 0b55bc146b29..9260ad47350f 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -763,6 +763,12 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
 	free_cpumask_var(available_mask);
 }
 
+#define UNLOAD_DELAY_UNIT_MS	10		/* 10 milliseconds */
+#define UNLOAD_WAIT_MS		(100*1000)	/* 100 seconds */
+#define UNLOAD_WAIT_LOOPS	(UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
+#define UNLOAD_MSG_MS		(5*1000)	/* Every 5 seconds */
+#define UNLOAD_MSG_LOOPS	(UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)
+
 static void vmbus_wait_for_unload(void)
 {
 	int cpu;
@@ -780,12 +786,17 @@ static void vmbus_wait_for_unload(void)
 	 * vmbus_connection.unload_event. If not, the last thing we can do is
 	 * read message pages for all CPUs directly.
 	 *
-	 * Wait no more than 10 seconds so that the panic path can't get
-	 * hung forever in case the response message isn't seen.
+	 * Wait up to 100 seconds since an Azure host must write back any dirty
+	 * data in its disk cache before the VMbus UNLOAD request will
+	 * complete. This flushing has been empirically observed to take up
+	 * to 50 seconds in cases with a lot of dirty data, so allow additional
+	 * leeway and for inaccuracies in mdelay(). But eventually time out so
+	 * that the panic path can't get hung forever in case the response
+	 * message isn't seen.
 	 */
-	for (i = 0; i < 1000; i++) {
+	for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
 		if (completion_done(&vmbus_connection.unload_event))
-			break;
+			goto completed;
 
 		for_each_online_cpu(cpu) {
 			struct hv_per_cpu_context *hv_cpu
@@ -808,9 +819,18 @@ static void vmbus_wait_for_unload(void)
 			vmbus_signal_eom(msg, message_type);
 		}
 
-		mdelay(10);
-	}
+		/*
+		 * Give a notice periodically so someone watching the
+		 * serial output won't think it is completely hung.
+		 */
+		if (!(i % UNLOAD_MSG_LOOPS))
+			pr_notice("Waiting for VMBus UNLOAD to complete\n");
 
+		mdelay(UNLOAD_DELAY_UNIT_MS);
+	}
+	pr_err("Continuing even though VMBus UNLOAD did not complete\n");
+
+completed:
 	/*
 	 * We're crashing and already got the UNLOAD_RESPONSE, cleanup all
 	 * maybe-pending messages on all CPUs to be able to receive new
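
As a sanity check on the arithmetic, here is a minimal user-space sketch of the bounded-wait pattern these hunks introduce. It is not the kernel code path: unload_response_seen() is a hypothetical stand-in for completion_done(&vmbus_connection.unload_event), usleep() stands in for the kernel's mdelay() busy-wait, and printf() stands in for pr_notice()/pr_err(). The UNLOAD_* macros mirror the ones added above: 10 ms per iteration, 10,000 iterations for the 100 second cap, and a notice every 500 iterations (5 seconds).

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define UNLOAD_DELAY_UNIT_MS	10		/* 10 milliseconds */
#define UNLOAD_WAIT_MS		(100*1000)	/* 100 seconds */
#define UNLOAD_WAIT_LOOPS	(UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
#define UNLOAD_MSG_MS		(5*1000)	/* Every 5 seconds */
#define UNLOAD_MSG_LOOPS	(UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)

/* Hypothetical stand-in for completion_done(&vmbus_connection.unload_event). */
static bool unload_response_seen(void)
{
	return false;	/* pretend the host never answers, to exercise the timeout */
}

int main(void)
{
	int i;

	for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
		if (unload_response_seen())
			goto completed;

		/* Periodic progress message, every UNLOAD_MSG_LOOPS iterations. */
		if (!(i % UNLOAD_MSG_LOOPS))
			printf("Waiting for VMBus UNLOAD to complete\n");

		/* Sleep 10 ms; the kernel uses mdelay() since it may be on the panic path. */
		usleep(UNLOAD_DELAY_UNIT_MS * 1000);
	}
	printf("Continuing even though VMBus UNLOAD did not complete\n");

completed:
	return 0;
}

Note that starting the counter at 1 rather than 0 keeps the (i % UNLOAD_MSG_LOOPS) test from firing on the very first pass, so the first notice appears only after a full 5 seconds of waiting.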