1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2012-2014 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/bus.h>
32 #include <sys/conf.h>
33 #include <sys/domainset.h>
34 #include <sys/proc.h>
35
36 #include <dev/pci/pcivar.h>
37
38 #include "nvme_private.h"
39
/*
 * Selects which failed completions are logged when a tracker is completed:
 * none, only those that will not be resubmitted, or all of them.
 */
typedef enum error_print {
	ERROR_PRINT_NONE,
	ERROR_PRINT_NO_RETRY,
	ERROR_PRINT_ALL
} error_print_t;

#define	DO_NOT_RETRY	1

/* Prototypes needed ahead of their first use in this file. */
static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
		    struct nvme_request *req);
static void	nvme_qpair_destroy(struct nvme_qpair *qpair);

/*
 * Helpers for the opcode name tables.  A table has DEFAULT_INDEX + 1 slots:
 * slots below DEFAULT_INDEX are indexed by opcode, and the slot at
 * DEFAULT_INDEX holds the fallback name.
 */
#define	DEFAULT_INDEX		256
#define	DEFAULT_ENTRY(x)	[DEFAULT_INDEX] = x
#define	OPC_ENTRY(x)		[NVME_OPC_ ## x] = #x
50
/*
 * Printable names for admin-queue opcodes, indexed by opcode value.  Each
 * OPC_ENTRY() stores the stringified opcode name at index NVME_OPC_<name>;
 * slots that are not listed stay NULL.  The final slot (DEFAULT_INDEX) holds
 * the generic name that get_opcode_string() falls back to.
 */
static const char *admin_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(DELETE_IO_SQ),
	OPC_ENTRY(CREATE_IO_SQ),
	OPC_ENTRY(GET_LOG_PAGE),
	OPC_ENTRY(DELETE_IO_CQ),
	OPC_ENTRY(CREATE_IO_CQ),
	OPC_ENTRY(IDENTIFY),
	OPC_ENTRY(ABORT),
	OPC_ENTRY(SET_FEATURES),
	OPC_ENTRY(GET_FEATURES),
	OPC_ENTRY(ASYNC_EVENT_REQUEST),
	OPC_ENTRY(NAMESPACE_MANAGEMENT),
	OPC_ENTRY(FIRMWARE_ACTIVATE),
	OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
	OPC_ENTRY(DEVICE_SELF_TEST),
	OPC_ENTRY(NAMESPACE_ATTACHMENT),
	OPC_ENTRY(KEEP_ALIVE),
	OPC_ENTRY(DIRECTIVE_SEND),
	OPC_ENTRY(DIRECTIVE_RECEIVE),
	OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
	OPC_ENTRY(NVME_MI_SEND),
	OPC_ENTRY(NVME_MI_RECEIVE),
	OPC_ENTRY(CAPACITY_MANAGEMENT),
	OPC_ENTRY(LOCKDOWN),
	OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
	OPC_ENTRY(FABRICS_COMMANDS),
	OPC_ENTRY(FORMAT_NVM),
	OPC_ENTRY(SECURITY_SEND),
	OPC_ENTRY(SECURITY_RECEIVE),
	OPC_ENTRY(SANITIZE),
	OPC_ENTRY(GET_LBA_STATUS),
	DEFAULT_ENTRY("ADMIN COMMAND"),
};
84
/*
 * Printable names for I/O-queue opcodes, indexed by opcode value.  Slots
 * that are not listed stay NULL; the final slot (DEFAULT_INDEX) holds the
 * generic name that get_opcode_string() falls back to.
 */
static const char *io_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(FLUSH),
	OPC_ENTRY(WRITE),
	OPC_ENTRY(READ),
	OPC_ENTRY(WRITE_UNCORRECTABLE),
	OPC_ENTRY(COMPARE),
	OPC_ENTRY(WRITE_ZEROES),
	OPC_ENTRY(DATASET_MANAGEMENT),
	OPC_ENTRY(VERIFY),
	OPC_ENTRY(RESERVATION_REGISTER),
	OPC_ENTRY(RESERVATION_REPORT),
	OPC_ENTRY(RESERVATION_ACQUIRE),
	OPC_ENTRY(RESERVATION_RELEASE),
	OPC_ENTRY(COPY),
	DEFAULT_ENTRY("IO COMMAND"),
};
101
102 static const char *
get_opcode_string(const char * op[DEFAULT_INDEX+1],uint16_t opc)103 get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc)
104 {
105 const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX];
106
107 return (nm != NULL ? nm : op[DEFAULT_INDEX]);
108 }
109
110 static const char *
get_admin_opcode_string(uint16_t opc)111 get_admin_opcode_string(uint16_t opc)
112 {
113 return (get_opcode_string(admin_opcode, opc));
114 }
115
116 static const char *
get_io_opcode_string(uint16_t opc)117 get_io_opcode_string(uint16_t opc)
118 {
119 return (get_opcode_string(io_opcode, opc));
120 }
121
122 static void
nvme_admin_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)123 nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
124 struct nvme_command *cmd)
125 {
126
127 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
128 "cdw10:%08x cdw11:%08x\n",
129 get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
130 le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
131 }
132
133 static void
nvme_io_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)134 nvme_io_qpair_print_command(struct nvme_qpair *qpair,
135 struct nvme_command *cmd)
136 {
137
138 switch (cmd->opc) {
139 case NVME_OPC_WRITE:
140 case NVME_OPC_READ:
141 case NVME_OPC_WRITE_UNCORRECTABLE:
142 case NVME_OPC_COMPARE:
143 case NVME_OPC_WRITE_ZEROES:
144 case NVME_OPC_VERIFY:
145 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
146 "lba:%llu len:%d\n",
147 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
148 ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
149 (le32toh(cmd->cdw12) & 0xFFFF) + 1);
150 break;
151 case NVME_OPC_FLUSH:
152 case NVME_OPC_DATASET_MANAGEMENT:
153 case NVME_OPC_RESERVATION_REGISTER:
154 case NVME_OPC_RESERVATION_REPORT:
155 case NVME_OPC_RESERVATION_ACQUIRE:
156 case NVME_OPC_RESERVATION_RELEASE:
157 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
158 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
159 break;
160 default:
161 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
162 get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
163 cmd->cid, le32toh(cmd->nsid));
164 break;
165 }
166 }
167
168 void
nvme_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)169 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
170 {
171 if (qpair->id == 0)
172 nvme_admin_qpair_print_command(qpair, cmd);
173 else
174 nvme_io_qpair_print_command(qpair, cmd);
175 if (nvme_verbose_cmd_dump) {
176 nvme_printf(qpair->ctrlr,
177 "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
178 cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
179 (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
180 nvme_printf(qpair->ctrlr,
181 "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
182 cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
183 cmd->cdw15);
184 }
185 }
186
/*
 * One row of a status-code name table: a status code and its printable
 * name.  Tables are terminated by a row whose sc is 0xFFFF.
 */
struct nvme_status_string {
	uint16_t	sc;	/* status code value */
	const char	*str;	/* printable name */
};
191
192 static struct nvme_status_string generic_status[] = {
193 { NVME_SC_SUCCESS, "SUCCESS" },
194 { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
195 { NVME_SC_INVALID_FIELD, "INVALID_FIELD" },
196 { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
197 { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
198 { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
199 { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
200 { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
201 { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
202 { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
203 { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
204 { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
205 { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
206 { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
207 { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
208 { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
209 { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
210 { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
211 { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
212 { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" },
213 { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
214 { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
215 { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
216 { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
217 { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
218 { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
219 { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
220 { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
221 { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
222 { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" },
223 { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
224 { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
225 { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
226 { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },
227
228 { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
229 { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
230 { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
231 { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
232 { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
233 { 0xFFFF, "GENERIC" }
234 };
235
236 static struct nvme_status_string command_specific_status[] = {
237 { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
238 { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
239 { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
240 { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
241 { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
242 { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
243 { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
244 { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
245 { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
246 { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
247 { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
248 { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
249 { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
250 { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
251 { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
252 { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
253 { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
254 { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
255 { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
256 { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
257 { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
258 { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
259 { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
260 { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
261 { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
262 { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
263 { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
264 { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
265 { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
266 { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
267 { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
268 { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
269 { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
270 { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
271 { NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIED INVALID" },
272 { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },
273
274 { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
275 { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
276 { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
277 { 0xFFFF, "COMMAND SPECIFIC" }
278 };
279
/*
 * Names for status codes of the media-error status code type.  The final
 * 0xFFFF row terminates the table and supplies the name that
 * get_status_string() returns for unlisted codes.
 */
static struct nvme_status_string media_error_status[] = {
	{ NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
	{ NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
	{ NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
	{ NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
	{ NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
	{ 0xFFFF, "MEDIA ERROR" }
};
291
/*
 * Names for status codes of the path-related status code type.  The final
 * 0xFFFF row terminates the table and supplies the name that
 * get_status_string() returns for unlisted codes.
 */
static struct nvme_status_string path_related_status[] = {
	{ NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
	{ NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
	{ NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
	{ NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
	{ NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
	{ NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
	{ NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
	{ 0xFFFF, "PATH RELATED" },
};
302
303 static const char *
get_status_string(uint16_t sct,uint16_t sc)304 get_status_string(uint16_t sct, uint16_t sc)
305 {
306 struct nvme_status_string *entry;
307
308 switch (sct) {
309 case NVME_SCT_GENERIC:
310 entry = generic_status;
311 break;
312 case NVME_SCT_COMMAND_SPECIFIC:
313 entry = command_specific_status;
314 break;
315 case NVME_SCT_MEDIA_ERROR:
316 entry = media_error_status;
317 break;
318 case NVME_SCT_PATH_RELATED:
319 entry = path_related_status;
320 break;
321 case NVME_SCT_VENDOR_SPECIFIC:
322 return ("VENDOR SPECIFIC");
323 default:
324 return ("RESERVED");
325 }
326
327 while (entry->sc != 0xFFFF) {
328 if (entry->sc == sc)
329 return (entry->str);
330 entry++;
331 }
332 return (entry->str);
333 }
334
335 void
nvme_qpair_print_completion(struct nvme_qpair * qpair,struct nvme_completion * cpl)336 nvme_qpair_print_completion(struct nvme_qpair *qpair,
337 struct nvme_completion *cpl)
338 {
339 uint8_t sct, sc, crd, m, dnr, p;
340
341 sct = NVME_STATUS_GET_SCT(cpl->status);
342 sc = NVME_STATUS_GET_SC(cpl->status);
343 crd = NVME_STATUS_GET_CRD(cpl->status);
344 m = NVME_STATUS_GET_M(cpl->status);
345 dnr = NVME_STATUS_GET_DNR(cpl->status);
346 p = NVME_STATUS_GET_P(cpl->status);
347
348 nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d "
349 "sqid:%d cid:%d cdw0:%x\n",
350 get_status_string(sct, sc), sct, sc, crd, m, dnr, p,
351 cpl->sqid, cpl->cid, cpl->cdw0);
352 }
353
354 static bool
nvme_completion_is_retry(const struct nvme_completion * cpl)355 nvme_completion_is_retry(const struct nvme_completion *cpl)
356 {
357 uint8_t sct, sc, dnr;
358
359 sct = NVME_STATUS_GET_SCT(cpl->status);
360 sc = NVME_STATUS_GET_SC(cpl->status);
361 dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */
362
363 /*
364 * TODO: spec is not clear how commands that are aborted due
365 * to TLER will be marked. So for now, it seems
366 * NAMESPACE_NOT_READY is the only case where we should
367 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
368 * set the DNR bit correctly since the driver controls that.
369 */
370 switch (sct) {
371 case NVME_SCT_GENERIC:
372 switch (sc) {
373 case NVME_SC_ABORTED_BY_REQUEST:
374 case NVME_SC_NAMESPACE_NOT_READY:
375 if (dnr)
376 return (0);
377 else
378 return (1);
379 case NVME_SC_INVALID_OPCODE:
380 case NVME_SC_INVALID_FIELD:
381 case NVME_SC_COMMAND_ID_CONFLICT:
382 case NVME_SC_DATA_TRANSFER_ERROR:
383 case NVME_SC_ABORTED_POWER_LOSS:
384 case NVME_SC_INTERNAL_DEVICE_ERROR:
385 case NVME_SC_ABORTED_SQ_DELETION:
386 case NVME_SC_ABORTED_FAILED_FUSED:
387 case NVME_SC_ABORTED_MISSING_FUSED:
388 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
389 case NVME_SC_COMMAND_SEQUENCE_ERROR:
390 case NVME_SC_LBA_OUT_OF_RANGE:
391 case NVME_SC_CAPACITY_EXCEEDED:
392 default:
393 return (0);
394 }
395 case NVME_SCT_COMMAND_SPECIFIC:
396 case NVME_SCT_MEDIA_ERROR:
397 return (0);
398 case NVME_SCT_PATH_RELATED:
399 switch (sc) {
400 case NVME_SC_INTERNAL_PATH_ERROR:
401 if (dnr)
402 return (0);
403 else
404 return (1);
405 default:
406 return (0);
407 }
408 case NVME_SCT_VENDOR_SPECIFIC:
409 default:
410 return (0);
411 }
412 }
413
414 static void
nvme_qpair_complete_tracker(struct nvme_tracker * tr,struct nvme_completion * cpl,error_print_t print_on_error)415 nvme_qpair_complete_tracker(struct nvme_tracker *tr,
416 struct nvme_completion *cpl, error_print_t print_on_error)
417 {
418 struct nvme_qpair * qpair = tr->qpair;
419 struct nvme_request *req;
420 bool retry, error, retriable;
421
422 req = tr->req;
423 error = nvme_completion_is_error(cpl);
424 retriable = nvme_completion_is_retry(cpl);
425 retry = error && retriable && req->retries < nvme_retry_count;
426 if (retry)
427 qpair->num_retries++;
428 if (error && req->retries >= nvme_retry_count && retriable)
429 qpair->num_failures++;
430
431 if (error && (print_on_error == ERROR_PRINT_ALL ||
432 (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
433 nvme_qpair_print_command(qpair, &req->cmd);
434 nvme_qpair_print_completion(qpair, cpl);
435 }
436
437 qpair->act_tr[cpl->cid] = NULL;
438
439 KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));
440
441 if (!retry) {
442 if (req->payload_valid) {
443 bus_dmamap_sync(qpair->dma_tag_payload,
444 tr->payload_dma_map,
445 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
446 }
447 if (req->cb_fn)
448 req->cb_fn(req->cb_arg, cpl);
449 }
450
451 mtx_lock(&qpair->lock);
452
453 if (retry) {
454 req->retries++;
455 nvme_qpair_submit_tracker(qpair, tr);
456 } else {
457 if (req->payload_valid) {
458 bus_dmamap_unload(qpair->dma_tag_payload,
459 tr->payload_dma_map);
460 }
461
462 nvme_free_request(req);
463 tr->req = NULL;
464
465 TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
466 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
467
468 /*
469 * If the controller is in the middle of resetting, don't
470 * try to submit queued requests here - let the reset logic
471 * handle that instead.
472 */
473 if (!STAILQ_EMPTY(&qpair->queued_req) &&
474 !qpair->ctrlr->is_resetting) {
475 req = STAILQ_FIRST(&qpair->queued_req);
476 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
477 _nvme_qpair_submit_request(qpair, req);
478 }
479 }
480
481 mtx_unlock(&qpair->lock);
482 }
483
484 static void
nvme_qpair_manual_complete_tracker(struct nvme_tracker * tr,uint32_t sct,uint32_t sc,uint32_t dnr,error_print_t print_on_error)485 nvme_qpair_manual_complete_tracker(
486 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
487 error_print_t print_on_error)
488 {
489 struct nvme_completion cpl;
490
491 memset(&cpl, 0, sizeof(cpl));
492
493 struct nvme_qpair * qpair = tr->qpair;
494
495 cpl.sqid = qpair->id;
496 cpl.cid = tr->cid;
497 cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
498 cpl.status |= NVMEF(NVME_STATUS_SC, sc);
499 cpl.status |= NVMEF(NVME_STATUS_DNR, dnr);
500 /* M=0 : this is artificial so no data in error log page */
501 /* CRD=0 : this is artificial and no delayed retry support anyway */
502 /* P=0 : phase not checked */
503 nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
504 }
505
506 void
nvme_qpair_manual_complete_request(struct nvme_qpair * qpair,struct nvme_request * req,uint32_t sct,uint32_t sc)507 nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
508 struct nvme_request *req, uint32_t sct, uint32_t sc)
509 {
510 struct nvme_completion cpl;
511 bool error;
512
513 memset(&cpl, 0, sizeof(cpl));
514 cpl.sqid = qpair->id;
515 cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
516 cpl.status |= NVMEF(NVME_STATUS_SC, sc);
517
518 error = nvme_completion_is_error(&cpl);
519
520 if (error) {
521 nvme_qpair_print_command(qpair, &req->cmd);
522 nvme_qpair_print_completion(qpair, &cpl);
523 }
524
525 if (req->cb_fn)
526 req->cb_fn(req->cb_arg, &cpl);
527
528 nvme_free_request(req);
529 }
530
531 /* Locked version of completion processor */
532 static bool
_nvme_qpair_process_completions(struct nvme_qpair * qpair)533 _nvme_qpair_process_completions(struct nvme_qpair *qpair)
534 {
535 struct nvme_tracker *tr;
536 struct nvme_completion cpl;
537 bool done = false;
538 bool in_panic = dumping || SCHEDULER_STOPPED();
539
540 mtx_assert(&qpair->recovery, MA_OWNED);
541
542 /*
543 * qpair is not enabled, likely because a controller reset is in
544 * progress. Ignore the interrupt - any I/O that was associated with
545 * this interrupt will get retried when the reset is complete. Any
546 * pending completions for when we're in startup will be completed
547 * as soon as initialization is complete and we start sending commands
548 * to the device.
549 */
550 if (qpair->recovery_state != RECOVERY_NONE) {
551 qpair->num_ignored++;
552 return (false);
553 }
554
555 /*
556 * Sanity check initialization. After we reset the hardware, the phase
557 * is defined to be 1. So if we get here with zero prior calls and the
558 * phase is 0, it means that we've lost a race between the
559 * initialization and the ISR running. With the phase wrong, we'll
560 * process a bunch of completions that aren't really completions leading
561 * to a KASSERT below.
562 */
563 KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
564 ("%s: Phase wrong for first interrupt call.",
565 device_get_nameunit(qpair->ctrlr->dev)));
566
567 qpair->num_intr_handler_calls++;
568
569 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
570 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
571 /*
572 * A panic can stop the CPU this routine is running on at any point. If
573 * we're called during a panic, complete the sq_head wrap protocol for
574 * the case where we are interrupted just after the increment at 1
575 * below, but before we can reset cq_head to zero at 2. Also cope with
576 * the case where we do the zero at 2, but may or may not have done the
577 * phase adjustment at step 3. The panic machinery flushes all pending
578 * memory writes, so we can make these strong ordering assumptions
579 * that would otherwise be unwise if we were racing in real time.
580 */
581 if (__predict_false(in_panic)) {
582 if (qpair->cq_head == qpair->num_entries) {
583 /*
584 * Here we know that we need to zero cq_head and then negate
585 * the phase, which hasn't been assigned if cq_head isn't
586 * zero due to the atomic_store_rel.
587 */
588 qpair->cq_head = 0;
589 qpair->phase = !qpair->phase;
590 } else if (qpair->cq_head == 0) {
591 /*
592 * In this case, we know that the assignment at 2
593 * happened below, but we don't know if it 3 happened or
594 * not. To do this, we look at the last completion
595 * entry and set the phase to the opposite phase
596 * that it has. This gets us back in sync
597 */
598 cpl = qpair->cpl[qpair->num_entries - 1];
599 nvme_completion_swapbytes(&cpl);
600 qpair->phase = !NVME_STATUS_GET_P(cpl.status);
601 }
602 }
603
604 while (1) {
605 uint16_t status;
606
607 /*
608 * We need to do this dance to avoid a race between the host and
609 * the device where the device overtakes the host while the host
610 * is reading this record, leaving the status field 'new' and
611 * the sqhd and cid fields potentially stale. If the phase
612 * doesn't match, that means status hasn't yet been updated and
613 * we'll get any pending changes next time. It also means that
614 * the phase must be the same the second time. We have to sync
615 * before reading to ensure any bouncing completes.
616 */
617 status = le16toh(qpair->cpl[qpair->cq_head].status);
618 if (NVME_STATUS_GET_P(status) != qpair->phase)
619 break;
620
621 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
622 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
623 cpl = qpair->cpl[qpair->cq_head];
624 nvme_completion_swapbytes(&cpl);
625
626 KASSERT(
627 NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
628 ("Phase unexpectedly inconsistent"));
629
630 if (cpl.cid < qpair->num_trackers)
631 tr = qpair->act_tr[cpl.cid];
632 else
633 tr = NULL;
634
635 done = true;
636 if (tr != NULL) {
637 nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
638 qpair->sq_head = cpl.sqhd;
639 } else if (!in_panic) {
640 /*
641 * A missing tracker is normally an error. However, a
642 * panic can stop the CPU this routine is running on
643 * after completing an I/O but before updating
644 * qpair->cq_head at 1 below. Later, we re-enter this
645 * routine to poll I/O associated with the kernel
646 * dump. We find that the tr has been set to null before
647 * calling the completion routine. If it hasn't
648 * completed (or it triggers a panic), then '1' below
649 * won't have updated cq_head. Rather than panic again,
650 * ignore this condition because it's not unexpected.
651 */
652 nvme_printf(qpair->ctrlr,
653 "cpl (cid = %u) does not map to outstanding cmd\n",
654 cpl.cid);
655 nvme_qpair_print_completion(qpair,
656 &qpair->cpl[qpair->cq_head]);
657 KASSERT(0, ("received completion for unknown cmd"));
658 }
659
660 /*
661 * There's a number of races with the following (see above) when
662 * the system panics. We compensate for each one of them by
663 * using the atomic store to force strong ordering (at least when
664 * viewed in the aftermath of a panic).
665 */
666 if (++qpair->cq_head == qpair->num_entries) { /* 1 */
667 atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */
668 qpair->phase = !qpair->phase; /* 3 */
669 }
670 }
671
672 if (done) {
673 bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
674 qpair->cq_hdbl_off, qpair->cq_head);
675 }
676
677 return (done);
678 }
679
680 bool
nvme_qpair_process_completions(struct nvme_qpair * qpair)681 nvme_qpair_process_completions(struct nvme_qpair *qpair)
682 {
683 bool done;
684
685 /*
686 * Interlock with reset / recovery code. This is an usually uncontended
687 * to make sure that we drain out of the ISRs before we reset the card
688 * and to prevent races with the recovery process called from a timeout
689 * context.
690 */
691 if (!mtx_trylock(&qpair->recovery)) {
692 qpair->num_recovery_nolock++;
693 return (false);
694 }
695
696 done = _nvme_qpair_process_completions(qpair);
697
698 mtx_unlock(&qpair->recovery);
699
700 return (done);
701 }
702
/*
 * Interrupt handler registered per queue pair; 'arg' is the struct
 * nvme_qpair that was passed to bus_setup_intr().  The result of the
 * completion pass is not needed here.
 */
static void
nvme_qpair_msi_handler(void *arg)
{

	(void)nvme_qpair_process_completions((struct nvme_qpair *)arg);
}
710
711 int
nvme_qpair_construct(struct nvme_qpair * qpair,uint32_t num_entries,uint32_t num_trackers,struct nvme_controller * ctrlr)712 nvme_qpair_construct(struct nvme_qpair *qpair,
713 uint32_t num_entries, uint32_t num_trackers,
714 struct nvme_controller *ctrlr)
715 {
716 struct nvme_tracker *tr;
717 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
718 uint64_t queuemem_phys, prpmem_phys, list_phys;
719 uint8_t *queuemem, *prpmem, *prp_list;
720 int i, err;
721
722 qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
723 qpair->num_entries = num_entries;
724 qpair->num_trackers = num_trackers;
725 qpair->ctrlr = ctrlr;
726
727 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
728 mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
729
730 /* Note: NVMe PRP format is restricted to 4-byte alignment. */
731 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
732 4, ctrlr->page_size, BUS_SPACE_MAXADDR,
733 BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
734 howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
735 ctrlr->page_size, 0,
736 NULL, NULL, &qpair->dma_tag_payload);
737 if (err != 0) {
738 nvme_printf(ctrlr, "payload tag create failed %d\n", err);
739 goto out;
740 }
741
742 /*
743 * Each component must be page aligned, and individual PRP lists
744 * cannot cross a page boundary.
745 */
746 cmdsz = qpair->num_entries * sizeof(struct nvme_command);
747 cmdsz = roundup2(cmdsz, ctrlr->page_size);
748 cplsz = qpair->num_entries * sizeof(struct nvme_completion);
749 cplsz = roundup2(cplsz, ctrlr->page_size);
750 /*
751 * For commands requiring more than 2 PRP entries, one PRP will be
752 * embedded in the command (prp1), and the rest of the PRP entries
753 * will be in a list pointed to by the command (prp2).
754 */
755 prpsz = sizeof(uint64_t) *
756 howmany(ctrlr->max_xfer_size, ctrlr->page_size);
757 prpmemsz = qpair->num_trackers * prpsz;
758 allocsz = cmdsz + cplsz + prpmemsz;
759
760 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
761 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
762 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
763 if (err != 0) {
764 nvme_printf(ctrlr, "tag create failed %d\n", err);
765 goto out;
766 }
767 bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
768
769 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
770 BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
771 nvme_printf(ctrlr, "failed to alloc qpair memory\n");
772 goto out;
773 }
774
775 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
776 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
777 nvme_printf(ctrlr, "failed to load qpair memory\n");
778 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
779 qpair->queuemem_map);
780 goto out;
781 }
782
783 qpair->num_cmds = 0;
784 qpair->num_intr_handler_calls = 0;
785 qpair->num_retries = 0;
786 qpair->num_failures = 0;
787 qpair->num_ignored = 0;
788 qpair->cmd = (struct nvme_command *)queuemem;
789 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
790 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
791 qpair->cmd_bus_addr = queuemem_phys;
792 qpair->cpl_bus_addr = queuemem_phys + cmdsz;
793 prpmem_phys = queuemem_phys + cmdsz + cplsz;
794
795 callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
796 qpair->timer_armed = false;
797 qpair->recovery_state = RECOVERY_WAITING;
798
799 /*
800 * Calcuate the stride of the doorbell register. Many emulators set this
801 * value to correspond to a cache line. However, some hardware has set
802 * it to various small values.
803 */
804 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
805 (qpair->id << (ctrlr->dstrd + 1));
806 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
807 (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
808
809 TAILQ_INIT(&qpair->free_tr);
810 TAILQ_INIT(&qpair->outstanding_tr);
811 STAILQ_INIT(&qpair->queued_req);
812
813 list_phys = prpmem_phys;
814 prp_list = prpmem;
815 for (i = 0; i < qpair->num_trackers; i++) {
816 if (list_phys + prpsz > prpmem_phys + prpmemsz) {
817 qpair->num_trackers = i;
818 break;
819 }
820
821 /*
822 * Make sure that the PRP list for this tracker doesn't
823 * overflow to another nvme page.
824 */
825 if (trunc_page(list_phys) !=
826 trunc_page(list_phys + prpsz - 1)) {
827 list_phys = roundup2(list_phys, ctrlr->page_size);
828 prp_list =
829 (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
830 }
831
832 tr = malloc_domainset(sizeof(*tr), M_NVME,
833 DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
834 bus_dmamap_create(qpair->dma_tag_payload, 0,
835 &tr->payload_dma_map);
836 tr->cid = i;
837 tr->qpair = qpair;
838 tr->prp = (uint64_t *)prp_list;
839 tr->prp_bus_addr = list_phys;
840 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
841 list_phys += prpsz;
842 prp_list += prpsz;
843 }
844
845 if (qpair->num_trackers == 0) {
846 nvme_printf(ctrlr, "failed to allocate enough trackers\n");
847 goto out;
848 }
849
850 qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
851 qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
852 M_ZERO | M_WAITOK);
853
854 if (ctrlr->msi_count > 1) {
855 /*
856 * MSI-X vector resource IDs start at 1, so we add one to
857 * the queue's vector to get the corresponding rid to use.
858 */
859 qpair->rid = qpair->vector + 1;
860
861 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
862 &qpair->rid, RF_ACTIVE);
863 if (qpair->res == NULL) {
864 nvme_printf(ctrlr, "unable to allocate MSI\n");
865 goto out;
866 }
867 if (bus_setup_intr(ctrlr->dev, qpair->res,
868 INTR_TYPE_MISC | INTR_MPSAFE, NULL,
869 nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
870 nvme_printf(ctrlr, "unable to setup MSI\n");
871 goto out;
872 }
873 if (qpair->id == 0) {
874 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
875 "admin");
876 } else {
877 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
878 "io%d", qpair->id - 1);
879 }
880 }
881
882 return (0);
883
884 out:
885 nvme_qpair_destroy(qpair);
886 return (ENOMEM);
887 }
888
889 static void
nvme_qpair_destroy(struct nvme_qpair * qpair)890 nvme_qpair_destroy(struct nvme_qpair *qpair)
891 {
892 struct nvme_tracker *tr;
893
894 callout_drain(&qpair->timer);
895
896 if (qpair->tag) {
897 bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
898 qpair->tag = NULL;
899 }
900
901 if (qpair->act_tr) {
902 free(qpair->act_tr, M_NVME);
903 qpair->act_tr = NULL;
904 }
905
906 while (!TAILQ_EMPTY(&qpair->free_tr)) {
907 tr = TAILQ_FIRST(&qpair->free_tr);
908 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
909 bus_dmamap_destroy(qpair->dma_tag_payload,
910 tr->payload_dma_map);
911 free(tr, M_NVME);
912 }
913
914 if (qpair->cmd != NULL) {
915 bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
916 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
917 qpair->queuemem_map);
918 qpair->cmd = NULL;
919 }
920
921 if (qpair->dma_tag) {
922 bus_dma_tag_destroy(qpair->dma_tag);
923 qpair->dma_tag = NULL;
924 }
925
926 if (qpair->dma_tag_payload) {
927 bus_dma_tag_destroy(qpair->dma_tag_payload);
928 qpair->dma_tag_payload = NULL;
929 }
930
931 if (mtx_initialized(&qpair->lock))
932 mtx_destroy(&qpair->lock);
933 if (mtx_initialized(&qpair->recovery))
934 mtx_destroy(&qpair->recovery);
935
936 if (qpair->res) {
937 bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
938 rman_get_rid(qpair->res), qpair->res);
939 qpair->res = NULL;
940 }
941 }
942
943 static void
nvme_admin_qpair_abort_aers(struct nvme_qpair * qpair)944 nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
945 {
946 struct nvme_tracker *tr;
947
948 /*
949 * nvme_complete_tracker must be called without the qpair lock held. It
950 * takes the lock to adjust outstanding_tr list, so make sure we don't
951 * have it yet (since this is a general purpose routine). We take the
952 * lock to make the list traverse safe, but have to drop the lock to
953 * complete any AER. We restart the list scan when we do this to make
954 * this safe. There's interlock with the ISR so we know this tracker
955 * won't be completed twice.
956 */
957 mtx_assert(&qpair->lock, MA_NOTOWNED);
958
959 mtx_lock(&qpair->lock);
960 tr = TAILQ_FIRST(&qpair->outstanding_tr);
961 while (tr != NULL) {
962 if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
963 mtx_unlock(&qpair->lock);
964 nvme_qpair_manual_complete_tracker(tr,
965 NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
966 ERROR_PRINT_NONE);
967 mtx_lock(&qpair->lock);
968 tr = TAILQ_FIRST(&qpair->outstanding_tr);
969 } else {
970 tr = TAILQ_NEXT(tr, tailq);
971 }
972 }
973 mtx_unlock(&qpair->lock);
974 }
975
976 void
nvme_admin_qpair_destroy(struct nvme_qpair * qpair)977 nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
978 {
979 mtx_assert(&qpair->lock, MA_NOTOWNED);
980
981 nvme_admin_qpair_abort_aers(qpair);
982 nvme_qpair_destroy(qpair);
983 }
984
/*
 * Destroy an I/O queue pair.  This is a thin public wrapper around the
 * common nvme_qpair_destroy() teardown; no I/O-specific work is done here.
 */
void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{
	nvme_qpair_destroy(qpair);
}
991
992 static void
nvme_abort_complete(void * arg,const struct nvme_completion * status)993 nvme_abort_complete(void *arg, const struct nvme_completion *status)
994 {
995 struct nvme_tracker *tr = arg;
996
997 /*
998 * If cdw0 == 1, the controller was not able to abort the command
999 * we requested. We still need to check the active tracker array,
1000 * to cover race where I/O timed out at same time controller was
1001 * completing the I/O.
1002 */
1003 if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
1004 /*
1005 * An I/O has timed out, and the controller was unable to
1006 * abort it for some reason. Construct a fake completion
1007 * status, and then complete the I/O's tracker manually.
1008 */
1009 nvme_printf(tr->qpair->ctrlr,
1010 "abort command failed, aborting command manually\n");
1011 nvme_qpair_manual_complete_tracker(tr,
1012 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
1013 }
1014 }
1015
1016 static void
nvme_qpair_timeout(void * arg)1017 nvme_qpair_timeout(void *arg)
1018 {
1019 struct nvme_qpair *qpair = arg;
1020 struct nvme_controller *ctrlr = qpair->ctrlr;
1021 struct nvme_tracker *tr;
1022 sbintime_t now;
1023 bool idle = false;
1024 bool needs_reset;
1025 uint32_t csts;
1026 uint8_t cfs;
1027
1028 mtx_assert(&qpair->recovery, MA_OWNED);
1029
1030 switch (qpair->recovery_state) {
1031 case RECOVERY_NONE:
1032 /*
1033 * Read csts to get value of cfs - controller fatal status. If
1034 * we are in the hot-plug or controller failed status proceed
1035 * directly to reset. We also bail early if the status reads all
1036 * 1's or the control fatal status bit is now 1. The latter is
1037 * always true when the former is true, but not vice versa. The
1038 * intent of the code is that if the card is gone (all 1's) or
1039 * we've failed, then try to do a reset (which someitmes
1040 * unwedges a card reading all 1's that's not gone away, but
1041 * usually doesn't).
1042 */
1043 csts = nvme_mmio_read_4(ctrlr, csts);
1044 cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
1045 if (csts == NVME_GONE || cfs == 1)
1046 goto do_reset;
1047
1048 /*
1049 * Process completions. We already have the recovery lock, so
1050 * call the locked version.
1051 */
1052 _nvme_qpair_process_completions(qpair);
1053
1054 /*
1055 * Check to see if we need to timeout any commands. If we do, then
1056 * we also enter a recovery phase.
1057 */
1058 now = getsbinuptime();
1059 needs_reset = false;
1060 idle = true;
1061 mtx_lock(&qpair->lock);
1062 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
1063 /*
1064 * Skip async commands, they are posted to the card for
1065 * an indefinite amount of time and have no deadline.
1066 */
1067 if (tr->deadline == SBT_MAX)
1068 continue;
1069 if (now > tr->deadline) {
1070 if (tr->req->cb_fn != nvme_abort_complete &&
1071 ctrlr->enable_aborts) {
1072 /*
1073 * This isn't an abort command, ask
1074 * for a hardware abort.
1075 */
1076 nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
1077 qpair->id, nvme_abort_complete, tr);
1078 } else {
1079 /*
1080 * Otherwise we have a live command in
1081 * the card (either one we couldn't
1082 * abort, or aborts weren't enabled).
1083 * The only safe way to proceed is to do
1084 * a reset.
1085 */
1086 needs_reset = true;
1087 }
1088 } else {
1089 idle = false;
1090 }
1091 }
1092 mtx_unlock(&qpair->lock);
1093 if (!needs_reset)
1094 break;
1095
1096 /*
1097 * We've had a command timeout that we weren't able to abort
1098 *
1099 * If we get here due to a possible surprise hot-unplug event,
1100 * then we let nvme_ctrlr_reset confirm and fail the
1101 * controller.
1102 */
1103 do_reset:
1104 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
1105 (csts == 0xffffffff) ? " and possible hot unplug" :
1106 (cfs ? " and fatal error status" : ""));
1107 qpair->recovery_state = RECOVERY_WAITING;
1108 nvme_ctrlr_reset(ctrlr);
1109 idle = false; /* We want to keep polling */
1110 break;
1111 case RECOVERY_WAITING:
1112 /*
1113 * These messages aren't interesting while we're suspended. We
1114 * put the queues into waiting state while
1115 * suspending. Suspending takes a while, so we'll see these
1116 * during that time and they aren't diagnostic. At other times,
1117 * they indicate a problem that's worth complaining about.
1118 */
1119 if (!device_is_suspended(ctrlr->dev))
1120 nvme_printf(ctrlr, "Waiting for reset to complete\n");
1121 idle = false; /* We want to keep polling */
1122 break;
1123 case RECOVERY_FAILED:
1124 KASSERT(qpair->ctrlr->is_failed,
1125 ("Recovery state failed w/o failed controller\n"));
1126 idle = true; /* nothing to monitor */
1127 break;
1128 }
1129
1130 /*
1131 * Rearm the timeout.
1132 */
1133 if (!idle) {
1134 callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1135 } else {
1136 qpair->timer_armed = false;
1137 }
1138 }
1139
1140 /*
1141 * Submit the tracker to the hardware. Must already be in the
1142 * outstanding queue when called.
1143 */
1144 void
nvme_qpair_submit_tracker(struct nvme_qpair * qpair,struct nvme_tracker * tr)1145 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
1146 {
1147 struct nvme_request *req;
1148 struct nvme_controller *ctrlr;
1149 int timeout;
1150
1151 mtx_assert(&qpair->lock, MA_OWNED);
1152
1153 req = tr->req;
1154 req->cmd.cid = tr->cid;
1155 qpair->act_tr[tr->cid] = tr;
1156 ctrlr = qpair->ctrlr;
1157
1158 if (req->timeout) {
1159 if (req->cb_fn == nvme_completion_poll_cb)
1160 timeout = 1;
1161 else if (qpair->id == 0)
1162 timeout = ctrlr->admin_timeout_period;
1163 else
1164 timeout = ctrlr->timeout_period;
1165 tr->deadline = getsbinuptime() + timeout * SBT_1S;
1166 if (!qpair->timer_armed) {
1167 qpair->timer_armed = true;
1168 callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
1169 nvme_qpair_timeout, qpair, qpair->cpu, 0);
1170 }
1171 } else
1172 tr->deadline = SBT_MAX;
1173
1174 /* Copy the command from the tracker to the submission queue. */
1175 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
1176
1177 if (++qpair->sq_tail == qpair->num_entries)
1178 qpair->sq_tail = 0;
1179
1180 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
1181 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1182 bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
1183 qpair->sq_tdbl_off, qpair->sq_tail);
1184 qpair->num_cmds++;
1185 }
1186
1187 static void
nvme_payload_map(void * arg,bus_dma_segment_t * seg,int nseg,int error)1188 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1189 {
1190 struct nvme_tracker *tr = arg;
1191 uint32_t cur_nseg;
1192
1193 /*
1194 * If the mapping operation failed, return immediately. The caller
1195 * is responsible for detecting the error status and failing the
1196 * tracker manually.
1197 */
1198 if (error != 0) {
1199 nvme_printf(tr->qpair->ctrlr,
1200 "nvme_payload_map err %d\n", error);
1201 return;
1202 }
1203
1204 /*
1205 * Note that we specified ctrlr->page_size for alignment and max
1206 * segment size when creating the bus dma tags. So here we can safely
1207 * just transfer each segment to its associated PRP entry.
1208 */
1209 tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1210
1211 if (nseg == 2) {
1212 tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1213 } else if (nseg > 2) {
1214 cur_nseg = 1;
1215 tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1216 while (cur_nseg < nseg) {
1217 tr->prp[cur_nseg-1] =
1218 htole64((uint64_t)seg[cur_nseg].ds_addr);
1219 cur_nseg++;
1220 }
1221 } else {
1222 /*
1223 * prp2 should not be used by the controller
1224 * since there is only one segment, but set
1225 * to 0 just to be safe.
1226 */
1227 tr->req->cmd.prp2 = 0;
1228 }
1229
1230 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1231 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1232 nvme_qpair_submit_tracker(tr->qpair, tr);
1233 }
1234
1235 static void
_nvme_qpair_submit_request(struct nvme_qpair * qpair,struct nvme_request * req)1236 _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1237 {
1238 struct nvme_tracker *tr;
1239 int err = 0;
1240
1241 mtx_assert(&qpair->lock, MA_OWNED);
1242
1243 tr = TAILQ_FIRST(&qpair->free_tr);
1244 req->qpair = qpair;
1245
1246 if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
1247 /*
1248 * No tracker is available, or the qpair is disabled due to an
1249 * in-progress controller-level reset or controller failure. If
1250 * we lose the race with recovery_state, then we may add an
1251 * extra request to the queue which will be resubmitted later.
1252 * We only set recovery_state to NONE with qpair->lock also
1253 * held.
1254 */
1255
1256 if (qpair->ctrlr->is_failed) {
1257 /*
1258 * The controller has failed, so fail the request.
1259 */
1260 nvme_qpair_manual_complete_request(qpair, req,
1261 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
1262 } else {
1263 /*
1264 * Put the request on the qpair's request queue to be
1265 * processed when a tracker frees up via a command
1266 * completion or when the controller reset is
1267 * completed.
1268 */
1269 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1270 }
1271 return;
1272 }
1273
1274 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
1275 TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
1276 tr->deadline = SBT_MAX;
1277 tr->req = req;
1278
1279 if (!req->payload_valid) {
1280 nvme_qpair_submit_tracker(tr->qpair, tr);
1281 return;
1282 }
1283
1284 err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
1285 tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
1286 if (err != 0) {
1287 /*
1288 * The dmamap operation failed, so we manually fail the
1289 * tracker here with DATA_TRANSFER_ERROR status.
1290 *
1291 * nvme_qpair_manual_complete_tracker must not be called
1292 * with the qpair lock held.
1293 */
1294 nvme_printf(qpair->ctrlr,
1295 "bus_dmamap_load_mem returned 0x%x!\n", err);
1296 mtx_unlock(&qpair->lock);
1297 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1298 NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
1299 mtx_lock(&qpair->lock);
1300 }
1301 }
1302
1303 void
nvme_qpair_submit_request(struct nvme_qpair * qpair,struct nvme_request * req)1304 nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1305 {
1306
1307 mtx_lock(&qpair->lock);
1308 _nvme_qpair_submit_request(qpair, req);
1309 mtx_unlock(&qpair->lock);
1310 }
1311
1312 static void
nvme_qpair_enable(struct nvme_qpair * qpair)1313 nvme_qpair_enable(struct nvme_qpair *qpair)
1314 {
1315 if (mtx_initialized(&qpair->recovery))
1316 mtx_assert(&qpair->recovery, MA_OWNED);
1317 if (mtx_initialized(&qpair->lock))
1318 mtx_assert(&qpair->lock, MA_OWNED);
1319 KASSERT(qpair->recovery_state != RECOVERY_FAILED,
1320 ("Enabling a failed qpair\n"));
1321
1322 qpair->recovery_state = RECOVERY_NONE;
1323 }
1324
1325 void
nvme_qpair_reset(struct nvme_qpair * qpair)1326 nvme_qpair_reset(struct nvme_qpair *qpair)
1327 {
1328
1329 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1330
1331 /*
1332 * First time through the completion queue, HW will set phase
1333 * bit on completions to 1. So set this to 1 here, indicating
1334 * we're looking for a 1 to know which entries have completed.
1335 * we'll toggle the bit each time when the completion queue
1336 * rolls over.
1337 */
1338 qpair->phase = 1;
1339
1340 memset(qpair->cmd, 0,
1341 qpair->num_entries * sizeof(struct nvme_command));
1342 memset(qpair->cpl, 0,
1343 qpair->num_entries * sizeof(struct nvme_completion));
1344 }
1345
1346 void
nvme_admin_qpair_enable(struct nvme_qpair * qpair)1347 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1348 {
1349 struct nvme_tracker *tr;
1350 struct nvme_tracker *tr_temp;
1351 bool rpt;
1352
1353 /*
1354 * Manually abort each outstanding admin command. Do not retry
1355 * admin commands found here, since they will be left over from
1356 * a controller reset and its likely the context in which the
1357 * command was issued no longer applies.
1358 */
1359 rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
1360 if (rpt)
1361 nvme_printf(qpair->ctrlr,
1362 "aborting outstanding admin command\n");
1363 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1364 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1365 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1366 }
1367 if (rpt)
1368 nvme_printf(qpair->ctrlr,
1369 "done aborting outstanding admin\n");
1370
1371 mtx_lock(&qpair->recovery);
1372 mtx_lock(&qpair->lock);
1373 nvme_qpair_enable(qpair);
1374 mtx_unlock(&qpair->lock);
1375 mtx_unlock(&qpair->recovery);
1376 }
1377
1378 void
nvme_io_qpair_enable(struct nvme_qpair * qpair)1379 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1380 {
1381 STAILQ_HEAD(, nvme_request) temp;
1382 struct nvme_tracker *tr;
1383 struct nvme_tracker *tr_temp;
1384 struct nvme_request *req;
1385 bool report;
1386
1387 /*
1388 * Manually abort each outstanding I/O. This normally results in a
1389 * retry, unless the retry count on the associated request has
1390 * reached its limit.
1391 */
1392 report = !TAILQ_EMPTY(&qpair->outstanding_tr);
1393 if (report)
1394 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1395 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1396 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1397 NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1398 }
1399 if (report)
1400 nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
1401
1402 mtx_lock(&qpair->recovery);
1403 mtx_lock(&qpair->lock);
1404 nvme_qpair_enable(qpair);
1405
1406 STAILQ_INIT(&temp);
1407 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1408
1409 report = !STAILQ_EMPTY(&temp);
1410 if (report)
1411 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1412 while (!STAILQ_EMPTY(&temp)) {
1413 req = STAILQ_FIRST(&temp);
1414 STAILQ_REMOVE_HEAD(&temp, stailq);
1415 nvme_qpair_print_command(qpair, &req->cmd);
1416 _nvme_qpair_submit_request(qpair, req);
1417 }
1418 if (report)
1419 nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
1420
1421 mtx_unlock(&qpair->lock);
1422 mtx_unlock(&qpair->recovery);
1423 }
1424
1425 static void
nvme_qpair_disable(struct nvme_qpair * qpair)1426 nvme_qpair_disable(struct nvme_qpair *qpair)
1427 {
1428 struct nvme_tracker *tr, *tr_temp;
1429
1430 if (mtx_initialized(&qpair->recovery))
1431 mtx_assert(&qpair->recovery, MA_OWNED);
1432 if (mtx_initialized(&qpair->lock))
1433 mtx_assert(&qpair->lock, MA_OWNED);
1434
1435 qpair->recovery_state = RECOVERY_WAITING;
1436 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1437 tr->deadline = SBT_MAX;
1438 }
1439 }
1440
1441 void
nvme_admin_qpair_disable(struct nvme_qpair * qpair)1442 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
1443 {
1444 mtx_lock(&qpair->recovery);
1445
1446 mtx_lock(&qpair->lock);
1447 nvme_qpair_disable(qpair);
1448 mtx_unlock(&qpair->lock);
1449
1450 nvme_admin_qpair_abort_aers(qpair);
1451
1452 mtx_unlock(&qpair->recovery);
1453 }
1454
1455 void
nvme_io_qpair_disable(struct nvme_qpair * qpair)1456 nvme_io_qpair_disable(struct nvme_qpair *qpair)
1457 {
1458 mtx_lock(&qpair->recovery);
1459 mtx_lock(&qpair->lock);
1460
1461 nvme_qpair_disable(qpair);
1462
1463 mtx_unlock(&qpair->lock);
1464 mtx_unlock(&qpair->recovery);
1465 }
1466
1467 void
nvme_qpair_fail(struct nvme_qpair * qpair)1468 nvme_qpair_fail(struct nvme_qpair *qpair)
1469 {
1470 struct nvme_tracker *tr;
1471 struct nvme_request *req;
1472
1473 if (!mtx_initialized(&qpair->lock))
1474 return;
1475
1476 mtx_lock(&qpair->recovery);
1477 qpair->recovery_state = RECOVERY_FAILED;
1478 mtx_unlock(&qpair->recovery);
1479
1480 mtx_lock(&qpair->lock);
1481
1482 if (!STAILQ_EMPTY(&qpair->queued_req)) {
1483 nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1484 }
1485 while (!STAILQ_EMPTY(&qpair->queued_req)) {
1486 req = STAILQ_FIRST(&qpair->queued_req);
1487 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1488 mtx_unlock(&qpair->lock);
1489 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1490 NVME_SC_ABORTED_BY_REQUEST);
1491 mtx_lock(&qpair->lock);
1492 }
1493
1494 if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1495 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1496 }
1497 /* Manually abort each outstanding I/O. */
1498 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1499 tr = TAILQ_FIRST(&qpair->outstanding_tr);
1500 /*
1501 * Do not remove the tracker. The abort_tracker path will
1502 * do that for us.
1503 */
1504 mtx_unlock(&qpair->lock);
1505 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1506 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1507 mtx_lock(&qpair->lock);
1508 }
1509
1510 mtx_unlock(&qpair->lock);
1511 }
1512