1 /*
2 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32 #define _GNU_SOURCE
33 #include <config.h>
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48
/* PCI vendor ID used by hca_table; defined here only if no header supplied it. */
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallback for platforms whose cpuset headers do not provide CPU_OR.
 * It is a deliberate no-op.  The stub is variadic so that it accepts the
 * two-argument form used in this file as well as a three-argument form;
 * the previous three-parameter stub could not be expanded by the
 * two-argument call sites.
 */
#ifndef CPU_OR
#define CPU_OR(...) do {} while (0)
#endif

/*
 * Fallback for platforms without CPU_EQUAL: every comparison reports
 * "equal".  Parenthesized so it is safe inside larger expressions.
 */
#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) (1)
#endif
60
61
62 #define HCA(v, d) \
63 { .vendor = PCI_VENDOR_ID_##v, \
64 .device = d }
65
66 static struct {
67 unsigned vendor;
68 unsigned device;
69 } hca_table[] = {
70 HCA(MELLANOX, 4113), /* MT4113 Connect-IB */
71 HCA(MELLANOX, 4114), /* Connect-IB Virtual Function */
72 HCA(MELLANOX, 4115), /* ConnectX-4 */
73 HCA(MELLANOX, 4116), /* ConnectX-4 Virtual Function */
74 HCA(MELLANOX, 4117), /* ConnectX-4LX */
75 HCA(MELLANOX, 4118), /* ConnectX-4LX Virtual Function */
76 HCA(MELLANOX, 4119), /* ConnectX-5, PCIe 3.0 */
77 HCA(MELLANOX, 4120), /* ConnectX-5 Virtual Function */
78 HCA(MELLANOX, 4121), /* ConnectX-5 Ex */
79 HCA(MELLANOX, 4122), /* ConnectX-5 Ex VF */
80 HCA(MELLANOX, 4123), /* ConnectX-6 */
81 HCA(MELLANOX, 4124), /* ConnectX-6 VF */
82 HCA(MELLANOX, 4125), /* ConnectX-6 DX */
83 HCA(MELLANOX, 4126), /* ConnectX family mlx5Gen Virtual Function */
84 HCA(MELLANOX, 41682), /* BlueField integrated ConnectX-5 network controller */
85 HCA(MELLANOX, 41683), /* BlueField integrated ConnectX-5 network controller VF */
86 };
87
/* Debug bitmask; set_debug_mask() loads it from MLX5_DEBUG_MASK. */
uint32_t mlx5_debug_mask = 0;
/* Set by set_freeze_on_error() from MLX5_FREEZE_ON_ERROR_CQE. */
int mlx5_freeze_on_error_cqe;
90
91 static struct ibv_context_ops mlx5_ctx_ops = {
92 .query_device = mlx5_query_device,
93 .query_port = mlx5_query_port,
94 .alloc_pd = mlx5_alloc_pd,
95 .dealloc_pd = mlx5_free_pd,
96 .reg_mr = mlx5_reg_mr,
97 .rereg_mr = mlx5_rereg_mr,
98 .dereg_mr = mlx5_dereg_mr,
99 .alloc_mw = mlx5_alloc_mw,
100 .dealloc_mw = mlx5_dealloc_mw,
101 .bind_mw = mlx5_bind_mw,
102 .create_cq = mlx5_create_cq,
103 .poll_cq = mlx5_poll_cq,
104 .req_notify_cq = mlx5_arm_cq,
105 .cq_event = mlx5_cq_event,
106 .resize_cq = mlx5_resize_cq,
107 .destroy_cq = mlx5_destroy_cq,
108 .create_srq = mlx5_create_srq,
109 .modify_srq = mlx5_modify_srq,
110 .query_srq = mlx5_query_srq,
111 .destroy_srq = mlx5_destroy_srq,
112 .post_srq_recv = mlx5_post_srq_recv,
113 .create_qp = mlx5_create_qp,
114 .query_qp = mlx5_query_qp,
115 .modify_qp = mlx5_modify_qp,
116 .destroy_qp = mlx5_destroy_qp,
117 .post_send = mlx5_post_send,
118 .post_recv = mlx5_post_recv,
119 .create_ah = mlx5_create_ah,
120 .destroy_ah = mlx5_destroy_ah,
121 .attach_mcast = mlx5_attach_mcast,
122 .detach_mcast = mlx5_detach_mcast
123 };
124
/*
 * Extract the integer that follows the first ':' in a "key : value" line.
 *
 * On success stores atoi() of the text after the colon in *value and
 * returns 0.  Returns 1, leaving *value untouched, when the line contains
 * no colon.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *colon = strchr(line, ':');

	if (colon == NULL)
		return 1;

	*value = atoi(colon + 1);
	return 0;
}
138 /**
139 * The function looks for the first free user-index in all the
140 * user-index tables. If all are used, returns -1, otherwise
141 * a valid user-index.
142 * In case the reference count of the table is zero, it means the
143 * table is not in use and wasn't allocated yet, therefore the
144 * mlx5_store_uidx allocates the table, and increment the reference
145 * count on the table.
146 */
get_free_uidx(struct mlx5_context * ctx)147 static int32_t get_free_uidx(struct mlx5_context *ctx)
148 {
149 int32_t tind;
150 int32_t i;
151
152 for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
153 if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
154 break;
155 }
156
157 if (tind == MLX5_UIDX_TABLE_SIZE)
158 return -1;
159
160 if (!ctx->uidx_table[tind].refcnt)
161 return tind << MLX5_UIDX_TABLE_SHIFT;
162
163 for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
164 if (!ctx->uidx_table[tind].table[i])
165 break;
166 }
167
168 return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
169 }
170
mlx5_store_uidx(struct mlx5_context * ctx,void * rsc)171 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
172 {
173 int32_t tind;
174 int32_t ret = -1;
175 int32_t uidx;
176
177 pthread_mutex_lock(&ctx->uidx_table_mutex);
178 uidx = get_free_uidx(ctx);
179 if (uidx < 0)
180 goto out;
181
182 tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
183
184 if (!ctx->uidx_table[tind].refcnt) {
185 ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
186 sizeof(struct mlx5_resource *));
187 if (!ctx->uidx_table[tind].table)
188 goto out;
189 }
190
191 ++ctx->uidx_table[tind].refcnt;
192 ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
193 ret = uidx;
194
195 out:
196 pthread_mutex_unlock(&ctx->uidx_table_mutex);
197 return ret;
198 }
199
mlx5_clear_uidx(struct mlx5_context * ctx,uint32_t uidx)200 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
201 {
202 int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
203
204 pthread_mutex_lock(&ctx->uidx_table_mutex);
205
206 if (!--ctx->uidx_table[tind].refcnt)
207 free(ctx->uidx_table[tind].table);
208 else
209 ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
210
211 pthread_mutex_unlock(&ctx->uidx_table_mutex);
212 }
213
/*
 * Scan /proc/cpuinfo for a Sandy Bridge CPU (family 6, model 0x2A or 0x2D).
 *
 * num_cores: out parameter, receives the number of "processor" entries
 *            seen.  It is set to 0 before the file is opened, so it is
 *            well defined even when /proc/cpuinfo cannot be read.
 *
 * Returns 1 when a matching family/model pair was seen, 0 otherwise
 * (including when the file cannot be opened).
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	/* Never hand an indeterminate count back to the caller. */
	*num_cores = 0;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	while (fgets(line, sizeof(line), fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model  = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			/* only the first family line per processor counts */
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			/*
			 * The prefix also matches "model name"; only the
			 * first matching line per processor is recorded.
			 */
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
			rc = 1;
	}

	fclose(fd);
	return rc;
}
254
255 /*
256 man cpuset
257
258 This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
259 are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
260 words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
261 within a word are also in big-endian order.
262
263 The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
264 the size of the bitmask.
265
266 Examples of the Mask Format:
267
268 00000001 # just bit 0 set
269 40000000,00000000,00000000 # just bit 94 set
270 000000ff,00000000 # bits 32-39 set
271 00000000,000E3862 # 1,5,6,11-13,17-19 set
272
273 A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
274
275 00000001,00000001,00010117
276
277 The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
278 bit 4, and the "7" is for bits 2, 1, and 0.
279 */
/*
 * Fill cpu_set with the CPUs that are local to ibdev.
 *
 * The mask text comes from the MLX5_LOCAL_CPUS environment variable when
 * set, otherwise from the device's "device/local_cpus" sysfs attribute.
 * It is a comma separated list of 32-bit hexadecimal words with the least
 * significant word last, so the words are consumed from the end of the
 * string towards the beginning.
 *
 * cpu_set is only added to; the caller is expected to have cleared it.
 * On a read failure a warning is printed and cpu_set is left untouched.
 */
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	/* Zero-filled so the text is terminated whatever the reader stores. */
	char *p, buf[1024] = "";
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		/* Unlike strncpy(), snprintf() terminates on truncation. */
		snprintf(buf, sizeof(buf), "%s", env_value);
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		/*
		 * Only a negative result is treated as failure, matching the
		 * check used in mlx5_driver_init(); the former "non-zero"
		 * test also rejected successful reads.
		 * NOTE(review): assumes ibv_read_sysfs_file() returns a
		 * non-negative value on success - confirm against libibverbs.
		 * The last byte of buf is kept out of reach so the string
		 * always stays terminated.
		 */
		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf,
					sizeof(buf) - 1) < 0) {
			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	/* Start at the last (least significant) word. */
	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		/* Cut the string at the separator and step onto the word. */
		if (*p == ',') {
			*p = 0;
			p ++;
		}

		word = strtoul(p, NULL, 16);

		/* Bit k of this word is CPU number i + k. */
		for (k = 0; word; ++k, word >>= 1)
			if ((word & 1) && k + i < CPU_SETSIZE)
				CPU_SET(k+i, cpu_set);

		/* The first word of the buffer was just handled: done. */
		if (p == buf)
			break;

		/* buf was truncated above, so this finds the previous word. */
		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}
329
mlx5_enable_sandy_bridge_fix(struct ibv_device * ibdev)330 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
331 {
332 cpuset_t my_cpus, dev_local_cpus, result_set;
333 int stall_enable;
334 int ret;
335 int num_cores;
336
337 if (!mlx5_is_sandy_bridge(&num_cores))
338 return 0;
339
340 /* by default enable stall on sandy bridge arch */
341 stall_enable = 1;
342
343 /*
344 * check if app is bound to cpu set that is inside
345 * of device local cpu set. Disable stalling if true
346 */
347
348 /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
349 CPU_ZERO(&my_cpus);
350 CPU_ZERO(&dev_local_cpus);
351 CPU_ZERO(&result_set);
352 ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
353 sizeof(my_cpus), &my_cpus);
354 if (ret == -1) {
355 if (errno == EINVAL)
356 fprintf(stderr, PFX "Warning: my cpu set is too small\n");
357 else
358 fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
359 goto out;
360 }
361
362 /* get device local cpu set */
363 mlx5_local_cpu_set(ibdev, &dev_local_cpus);
364
365 /* check if my cpu set is in dev cpu */
366 CPU_OR(&result_set, &my_cpus);
367 CPU_OR(&result_set, &dev_local_cpus);
368 stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
369
370 out:
371 return stall_enable;
372 }
373
mlx5_read_env(struct ibv_device * ibdev,struct mlx5_context * ctx)374 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
375 {
376 char *env_value;
377
378 env_value = getenv("MLX5_STALL_CQ_POLL");
379 if (env_value)
380 /* check if cq stall is enforced by user */
381 ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
382 else
383 /* autodetect if we need to do cq polling */
384 ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
385
386 env_value = getenv("MLX5_STALL_NUM_LOOP");
387 if (env_value)
388 mlx5_stall_num_loop = atoi(env_value);
389
390 env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
391 if (env_value)
392 mlx5_stall_cq_poll_min = atoi(env_value);
393
394 env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
395 if (env_value)
396 mlx5_stall_cq_poll_max = atoi(env_value);
397
398 env_value = getenv("MLX5_STALL_CQ_INC_STEP");
399 if (env_value)
400 mlx5_stall_cq_inc_step = atoi(env_value);
401
402 env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
403 if (env_value)
404 mlx5_stall_cq_dec_step = atoi(env_value);
405
406 ctx->stall_adaptive_enable = 0;
407 ctx->stall_cycles = 0;
408
409 if (mlx5_stall_num_loop < 0) {
410 ctx->stall_adaptive_enable = 1;
411 ctx->stall_cycles = mlx5_stall_cq_poll_min;
412 }
413
414 }
415
get_total_uuars(int page_size)416 static int get_total_uuars(int page_size)
417 {
418 int size = MLX5_DEF_TOT_UUARS;
419 int uuars_in_page;
420 char *env;
421
422 env = getenv("MLX5_TOTAL_UUARS");
423 if (env)
424 size = atoi(env);
425
426 if (size < 1)
427 return -EINVAL;
428
429 uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
430 size = max(uuars_in_page, size);
431 size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
432 if (size > MLX5_MAX_BFREGS)
433 return -ENOMEM;
434
435 return size;
436 }
437
open_debug_file(struct mlx5_context * ctx)438 static void open_debug_file(struct mlx5_context *ctx)
439 {
440 char *env;
441
442 env = getenv("MLX5_DEBUG_FILE");
443 if (!env) {
444 ctx->dbg_fp = stderr;
445 return;
446 }
447
448 ctx->dbg_fp = fopen(env, "aw+");
449 if (!ctx->dbg_fp) {
450 fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
451 ctx->dbg_fp = stderr;
452 return;
453 }
454 }
455
close_debug_file(struct mlx5_context * ctx)456 static void close_debug_file(struct mlx5_context *ctx)
457 {
458 if (ctx->dbg_fp && ctx->dbg_fp != stderr)
459 fclose(ctx->dbg_fp);
460 }
461
set_debug_mask(void)462 static void set_debug_mask(void)
463 {
464 char *env;
465
466 env = getenv("MLX5_DEBUG_MASK");
467 if (env)
468 mlx5_debug_mask = strtol(env, NULL, 0);
469 }
470
set_freeze_on_error(void)471 static void set_freeze_on_error(void)
472 {
473 char *env;
474
475 env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
476 if (env)
477 mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
478 }
479
/*
 * Read MLX5_POST_SEND_PREFER_BF.
 *
 * Returns 0 only when the variable is set to exactly "0"; returns 1 when
 * it is unset or holds any other value.
 */
static int get_always_bf(void)
{
	const char *setting = getenv("MLX5_POST_SEND_PREFER_BF");

	if (setting != NULL && strcmp(setting, "0") == 0)
		return 0;

	return 1;
}
490
/*
 * Read MLX5_SHUT_UP_BF.
 *
 * Returns 1 when the variable is set to anything other than "0";
 * returns 0 when it is unset or set to exactly "0".
 */
static int get_shut_up_bf(void)
{
	const char *setting = getenv("MLX5_SHUT_UP_BF");

	return setting != NULL && strcmp(setting, "0") != 0;
}
501
get_num_low_lat_uuars(int tot_uuars)502 static int get_num_low_lat_uuars(int tot_uuars)
503 {
504 char *env;
505 int num = 4;
506
507 env = getenv("MLX5_NUM_LOW_LAT_UUARS");
508 if (env)
509 num = atoi(env);
510
511 if (num < 0)
512 return -EINVAL;
513
514 num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
515 return num;
516 }
517
518 /* The library allocates an array of uuar contexts. The one in index zero does
519 * not to execersize odd/even policy so it can avoid a lock but it may not use
520 * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
521 * since they are assigned to one QP only. The rest can use blue flame but since
522 * they are shared they need a lock
523 */
need_uuar_lock(struct mlx5_context * ctx,int uuarn)524 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
525 {
526 if (uuarn == 0 || mlx5_single_threaded)
527 return 0;
528
529 if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
530 return 0;
531
532 return 1;
533 }
534
/*
 * Read MLX5_SINGLE_THREADED.
 *
 * Returns 1 only when the variable is set to exactly "1"; returns 0 when
 * it is unset or holds any other value.
 */
static int single_threaded_app(void)
{
	const char *setting = getenv("MLX5_SINGLE_THREADED");

	return setting != NULL && strcmp(setting, "1") == 0;
}
546
mlx5_cmd_get_context(struct mlx5_context * context,struct mlx5_alloc_ucontext * req,size_t req_len,struct mlx5_alloc_ucontext_resp * resp,size_t resp_len)547 static int mlx5_cmd_get_context(struct mlx5_context *context,
548 struct mlx5_alloc_ucontext *req,
549 size_t req_len,
550 struct mlx5_alloc_ucontext_resp *resp,
551 size_t resp_len)
552 {
553 if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
554 req_len, &resp->ibv_resp, resp_len))
555 return 0;
556
557 /* The ibv_cmd_get_context fails in older kernels when passing
558 * a request length that the kernel doesn't know.
559 * To avoid breaking compatibility of new libmlx5 and older
560 * kernels, when ibv_cmd_get_context fails with the full
561 * request length, we try once again with the legacy length.
562 * We repeat this process while reducing requested size based
563 * on the feature input size. To avoid this in the future, we
564 * will remove the check in kernel that requires fields unknown
565 * to the kernel to be cleared. This will require that any new
566 * feature that involves extending struct mlx5_alloc_ucontext
567 * will be accompanied by an indication in the form of one or
568 * more fields in struct mlx5_alloc_ucontext_resp. If the
569 * response value can be interpreted as feature not supported
570 * when the returned value is zero, this will suffice to
571 * indicate to the library that the request was ignored by the
572 * kernel, either because it is unaware or because it decided
573 * to do so. If zero is a valid response, we will add a new
574 * field that indicates whether the request was handled.
575 */
576 if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
577 offsetof(struct mlx5_alloc_ucontext, lib_caps),
578 &resp->ibv_resp, resp_len))
579 return 0;
580
581 return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
582 offsetof(struct mlx5_alloc_ucontext,
583 cqe_version),
584 &resp->ibv_resp, resp_len);
585 }
586
mlx5_map_internal_clock(struct mlx5_device * mdev,struct ibv_context * ibv_ctx)587 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
588 struct ibv_context *ibv_ctx)
589 {
590 struct mlx5_context *context = to_mctx(ibv_ctx);
591 void *hca_clock_page;
592 off_t offset = 0;
593
594 set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
595 hca_clock_page = mmap(NULL, mdev->page_size,
596 PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
597 mdev->page_size * offset);
598
599 if (hca_clock_page == MAP_FAILED) {
600 fprintf(stderr, PFX
601 "Warning: Timestamp available,\n"
602 "but failed to mmap() hca core clock page.\n");
603 return -1;
604 }
605
606 context->hca_core_clock = hca_clock_page +
607 (context->core_clock.offset & (mdev->page_size - 1));
608 return 0;
609 }
610
mlx5dv_query_device(struct ibv_context * ctx_in,struct mlx5dv_context * attrs_out)611 int mlx5dv_query_device(struct ibv_context *ctx_in,
612 struct mlx5dv_context *attrs_out)
613 {
614 struct mlx5_context *mctx = to_mctx(ctx_in);
615 uint64_t comp_mask_out = 0;
616
617 attrs_out->version = 0;
618 attrs_out->flags = 0;
619
620 if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
621 attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
622
623 if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
624 attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
625
626 if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
627 attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
628 comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
629 }
630
631 attrs_out->comp_mask = comp_mask_out;
632
633 return 0;
634 }
635
mlx5dv_get_qp(struct ibv_qp * qp_in,struct mlx5dv_qp * qp_out)636 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
637 struct mlx5dv_qp *qp_out)
638 {
639 struct mlx5_qp *mqp = to_mqp(qp_in);
640
641 qp_out->comp_mask = 0;
642 qp_out->dbrec = mqp->db;
643
644 if (mqp->sq_buf_size)
645 /* IBV_QPT_RAW_PACKET */
646 qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
647 else
648 qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
649 qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
650 qp_out->sq.stride = 1 << mqp->sq.wqe_shift;
651
652 qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
653 qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
654 qp_out->rq.stride = 1 << mqp->rq.wqe_shift;
655
656 qp_out->bf.reg = mqp->bf->reg;
657
658 if (mqp->bf->uuarn > 0)
659 qp_out->bf.size = mqp->bf->buf_size;
660 else
661 qp_out->bf.size = 0;
662
663 return 0;
664 }
665
mlx5dv_get_cq(struct ibv_cq * cq_in,struct mlx5dv_cq * cq_out)666 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
667 struct mlx5dv_cq *cq_out)
668 {
669 struct mlx5_cq *mcq = to_mcq(cq_in);
670 struct mlx5_context *mctx = to_mctx(cq_in->context);
671
672 cq_out->comp_mask = 0;
673 cq_out->cqn = mcq->cqn;
674 cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
675 cq_out->cqe_size = mcq->cqe_sz;
676 cq_out->buf = mcq->active_buf->buf;
677 cq_out->dbrec = mcq->dbrec;
678 cq_out->uar = mctx->uar;
679
680 mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;
681
682 return 0;
683 }
684
mlx5dv_get_rwq(struct ibv_wq * wq_in,struct mlx5dv_rwq * rwq_out)685 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
686 struct mlx5dv_rwq *rwq_out)
687 {
688 struct mlx5_rwq *mrwq = to_mrwq(wq_in);
689
690 rwq_out->comp_mask = 0;
691 rwq_out->buf = mrwq->pbuff;
692 rwq_out->dbrec = mrwq->recv_db;
693 rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
694 rwq_out->stride = 1 << mrwq->rq.wqe_shift;
695
696 return 0;
697 }
698
mlx5dv_get_srq(struct ibv_srq * srq_in,struct mlx5dv_srq * srq_out)699 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
700 struct mlx5dv_srq *srq_out)
701 {
702 struct mlx5_srq *msrq;
703
704 msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
705
706 srq_out->comp_mask = 0;
707 srq_out->buf = msrq->buf.buf;
708 srq_out->dbrec = msrq->db;
709 srq_out->stride = 1 << msrq->wqe_shift;
710 srq_out->head = msrq->head;
711 srq_out->tail = msrq->tail;
712
713 return 0;
714 }
715
mlx5dv_init_obj(struct mlx5dv_obj * obj,uint64_t obj_type)716 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
717 {
718 int ret = 0;
719
720 if (obj_type & MLX5DV_OBJ_QP)
721 ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
722 if (!ret && (obj_type & MLX5DV_OBJ_CQ))
723 ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
724 if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
725 ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
726 if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
727 ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
728
729 return ret;
730 }
731
adjust_uar_info(struct mlx5_device * mdev,struct mlx5_context * context,struct mlx5_alloc_ucontext_resp resp)732 static void adjust_uar_info(struct mlx5_device *mdev,
733 struct mlx5_context *context,
734 struct mlx5_alloc_ucontext_resp resp)
735 {
736 if (!resp.log_uar_size && !resp.num_uars_per_page) {
737 /* old kernel */
738 context->uar_size = mdev->page_size;
739 context->num_uars_per_page = 1;
740 return;
741 }
742
743 context->uar_size = 1 << resp.log_uar_size;
744 context->num_uars_per_page = resp.num_uars_per_page;
745 }
746
mlx5_init_context(struct verbs_device * vdev,struct ibv_context * ctx,int cmd_fd)747 static int mlx5_init_context(struct verbs_device *vdev,
748 struct ibv_context *ctx, int cmd_fd)
749 {
750 struct mlx5_context *context;
751 struct mlx5_alloc_ucontext req;
752 struct mlx5_alloc_ucontext_resp resp;
753 int i;
754 int page_size;
755 int tot_uuars;
756 int low_lat_uuars;
757 int gross_uuars;
758 int j;
759 off_t offset;
760 struct mlx5_device *mdev;
761 struct verbs_context *v_ctx;
762 struct ibv_port_attr port_attr;
763 struct ibv_device_attr_ex device_attr;
764 int k;
765 int bfi;
766 int num_sys_page_map;
767
768 mdev = to_mdev(&vdev->device);
769 v_ctx = verbs_get_ctx(ctx);
770 page_size = mdev->page_size;
771 mlx5_single_threaded = single_threaded_app();
772
773 context = to_mctx(ctx);
774 context->ibv_ctx.cmd_fd = cmd_fd;
775
776 open_debug_file(context);
777 set_debug_mask();
778 set_freeze_on_error();
779 if (gethostname(context->hostname, sizeof(context->hostname)))
780 strcpy(context->hostname, "host_unknown");
781
782 tot_uuars = get_total_uuars(page_size);
783 if (tot_uuars < 0) {
784 errno = -tot_uuars;
785 goto err_free;
786 }
787
788 low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
789 if (low_lat_uuars < 0) {
790 errno = -low_lat_uuars;
791 goto err_free;
792 }
793
794 if (low_lat_uuars > tot_uuars - 1) {
795 errno = ENOMEM;
796 goto err_free;
797 }
798
799 memset(&req, 0, sizeof(req));
800 memset(&resp, 0, sizeof(resp));
801
802 req.total_num_uuars = tot_uuars;
803 req.num_low_latency_uuars = low_lat_uuars;
804 req.cqe_version = MLX5_CQE_VERSION_V1;
805 req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
806
807 if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
808 sizeof(resp)))
809 goto err_free;
810
811 context->max_num_qps = resp.qp_tab_size;
812 context->bf_reg_size = resp.bf_reg_size;
813 context->tot_uuars = resp.tot_uuars;
814 context->low_lat_uuars = low_lat_uuars;
815 context->cache_line_size = resp.cache_line_size;
816 context->max_sq_desc_sz = resp.max_sq_desc_sz;
817 context->max_rq_desc_sz = resp.max_rq_desc_sz;
818 context->max_send_wqebb = resp.max_send_wqebb;
819 context->num_ports = resp.num_ports;
820 context->max_recv_wr = resp.max_recv_wr;
821 context->max_srq_recv_wr = resp.max_srq_recv_wr;
822
823 context->cqe_version = resp.cqe_version;
824 if (context->cqe_version) {
825 if (context->cqe_version == MLX5_CQE_VERSION_V1)
826 mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
827 else
828 goto err_free;
829 }
830
831 adjust_uar_info(mdev, context, resp);
832
833 gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
834 context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
835 if (!context->bfs) {
836 errno = ENOMEM;
837 goto err_free;
838 }
839
840 context->cmds_supp_uhw = resp.cmds_supp_uhw;
841 context->vendor_cap_flags = 0;
842
843 pthread_mutex_init(&context->qp_table_mutex, NULL);
844 pthread_mutex_init(&context->srq_table_mutex, NULL);
845 pthread_mutex_init(&context->uidx_table_mutex, NULL);
846 for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
847 context->qp_table[i].refcnt = 0;
848
849 for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
850 context->uidx_table[i].refcnt = 0;
851
852 context->db_list = NULL;
853
854 pthread_mutex_init(&context->db_list_mutex, NULL);
855
856 num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
857 for (i = 0; i < num_sys_page_map; ++i) {
858 offset = 0;
859 set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
860 set_index(i, &offset);
861 context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
862 cmd_fd, page_size * offset);
863 if (context->uar[i] == MAP_FAILED) {
864 context->uar[i] = NULL;
865 goto err_free_bf;
866 }
867 }
868
869 for (i = 0; i < num_sys_page_map; i++) {
870 for (j = 0; j < context->num_uars_per_page; j++) {
871 for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
872 bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
873 context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
874 MLX5_BF_OFFSET + k * context->bf_reg_size;
875 context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
876 mlx5_spinlock_init(&context->bfs[bfi].lock);
877 context->bfs[bfi].offset = 0;
878 if (bfi)
879 context->bfs[bfi].buf_size = context->bf_reg_size / 2;
880 context->bfs[bfi].uuarn = bfi;
881 }
882 }
883 }
884 context->hca_core_clock = NULL;
885 if (resp.response_length + sizeof(resp.ibv_resp) >=
886 offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
887 sizeof(resp.hca_core_clock_offset) &&
888 resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
889 context->core_clock.offset = resp.hca_core_clock_offset;
890 mlx5_map_internal_clock(mdev, ctx);
891 }
892
893 mlx5_spinlock_init(&context->lock32);
894
895 context->prefer_bf = get_always_bf();
896 context->shut_up_bf = get_shut_up_bf();
897 mlx5_read_env(&vdev->device, context);
898
899 mlx5_spinlock_init(&context->hugetlb_lock);
900 TAILQ_INIT(&context->hugetlb_list);
901
902 context->ibv_ctx.ops = mlx5_ctx_ops;
903
904 verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
905 verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
906 verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
907 verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
908 verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
909 verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
910 verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
911 verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
912 verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
913 verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
914 verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
915 verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
916 verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
917 verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
918 verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
919
920 memset(&device_attr, 0, sizeof(device_attr));
921 if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
922 sizeof(struct ibv_device_attr_ex))) {
923 context->cached_device_cap_flags =
924 device_attr.orig_attr.device_cap_flags;
925 context->atomic_cap = device_attr.orig_attr.atomic_cap;
926 context->cached_tso_caps = device_attr.tso_caps;
927 }
928
929 for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
930 memset(&port_attr, 0, sizeof(port_attr));
931 if (!mlx5_query_port(ctx, j + 1, &port_attr))
932 context->cached_link_layer[j] = port_attr.link_layer;
933 }
934
935 return 0;
936
937 err_free_bf:
938 free(context->bfs);
939
940 err_free:
941 for (i = 0; i < MLX5_MAX_UARS; ++i) {
942 if (context->uar[i])
943 munmap(context->uar[i], page_size);
944 }
945 close_debug_file(context);
946 return errno;
947 }
948
mlx5_cleanup_context(struct verbs_device * device,struct ibv_context * ibctx)949 static void mlx5_cleanup_context(struct verbs_device *device,
950 struct ibv_context *ibctx)
951 {
952 struct mlx5_context *context = to_mctx(ibctx);
953 int page_size = to_mdev(ibctx->device)->page_size;
954 int i;
955
956 free(context->bfs);
957 for (i = 0; i < MLX5_MAX_UARS; ++i) {
958 if (context->uar[i])
959 munmap(context->uar[i], page_size);
960 }
961 if (context->hca_core_clock)
962 munmap(context->hca_core_clock - context->core_clock.offset,
963 page_size);
964 close_debug_file(context);
965 }
966
967 static struct verbs_device_ops mlx5_dev_ops = {
968 .init_context = mlx5_init_context,
969 .uninit_context = mlx5_cleanup_context,
970 };
971
mlx5_driver_init(const char * uverbs_sys_path,int abi_version)972 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
973 int abi_version)
974 {
975 char value[8];
976 struct mlx5_device *dev;
977 unsigned vendor, device;
978 int i;
979
980 if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
981 value, sizeof value) < 0)
982 return NULL;
983 sscanf(value, "%i", &vendor);
984
985 if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
986 value, sizeof value) < 0)
987 return NULL;
988 sscanf(value, "%i", &device);
989
990 for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
991 if (vendor == hca_table[i].vendor &&
992 device == hca_table[i].device)
993 goto found;
994
995 return NULL;
996
997 found:
998 if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
999 abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
1000 fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
1001 "(min supported %d, max supported %d)\n",
1002 abi_version, uverbs_sys_path,
1003 MLX5_UVERBS_MIN_ABI_VERSION,
1004 MLX5_UVERBS_MAX_ABI_VERSION);
1005 return NULL;
1006 }
1007
1008 dev = calloc(1, sizeof *dev);
1009 if (!dev) {
1010 fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1011 uverbs_sys_path);
1012 return NULL;
1013 }
1014
1015 dev->page_size = sysconf(_SC_PAGESIZE);
1016 dev->driver_abi_ver = abi_version;
1017
1018 dev->verbs_dev.ops = &mlx5_dev_ops;
1019 dev->verbs_dev.sz = sizeof(*dev);
1020 dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1021 sizeof(struct ibv_context);
1022
1023 return &dev->verbs_dev;
1024 }
1025
mlx5_register_driver(void)1026 static __attribute__((constructor)) void mlx5_register_driver(void)
1027 {
1028 verbs_register_driver("mlx5", mlx5_driver_init);
1029 }
1030