xref: /trueos/contrib/ofed/libmthca/src/qp.c (revision 8fe640108653f13042f1b15213769e338aa524f6)
1 /*
2  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #if HAVE_CONFIG_H
35 #  include <config.h>
36 #endif /* HAVE_CONFIG_H */
37 
38 #include <stdlib.h>
39 #include <netinet/in.h>
40 #include <pthread.h>
41 #include <string.h>
42 
43 #include "mthca.h"
44 #include "doorbell.h"
45 #include "wqe.h"
46 
/*
 * Fence flag for the first word of a send doorbell.  The post-send
 * paths OR it into the doorbell when the first work request of a
 * batch has IBV_SEND_FENCE set.
 */
enum {
	MTHCA_SEND_DOORBELL_FENCE = (1 << 5)
};
50 
51 static const uint8_t mthca_opcode[] = {
52 	[IBV_WR_SEND]                 = MTHCA_OPCODE_SEND,
53 	[IBV_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
54 	[IBV_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
55 	[IBV_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
56 	[IBV_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
57 	[IBV_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
58 	[IBV_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
59 };
60 
get_recv_wqe(struct mthca_qp * qp,int n)61 static void *get_recv_wqe(struct mthca_qp *qp, int n)
62 {
63 	return qp->buf.buf + (n << qp->rq.wqe_shift);
64 }
65 
get_send_wqe(struct mthca_qp * qp,int n)66 static void *get_send_wqe(struct mthca_qp *qp, int n)
67 {
68 	return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
69 }
70 
mthca_init_qp_indices(struct mthca_qp * qp)71 void mthca_init_qp_indices(struct mthca_qp *qp)
72 {
73 	qp->sq.next_ind  = 0;
74 	qp->sq.last_comp = qp->sq.max - 1;
75 	qp->sq.head    	 = 0;
76 	qp->sq.tail    	 = 0;
77 	qp->sq.last      = get_send_wqe(qp, qp->sq.max - 1);
78 
79 	qp->rq.next_ind	 = 0;
80 	qp->rq.last_comp = qp->rq.max - 1;
81 	qp->rq.head    	 = 0;
82 	qp->rq.tail    	 = 0;
83 	qp->rq.last      = get_recv_wqe(qp, qp->rq.max - 1);
84 }
85 
wq_overflow(struct mthca_wq * wq,int nreq,struct mthca_cq * cq)86 static inline int wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)
87 {
88 	unsigned cur;
89 
90 	cur = wq->head - wq->tail;
91 	if (cur + nreq < wq->max)
92 		return 0;
93 
94 	pthread_spin_lock(&cq->lock);
95 	cur = wq->head - wq->tail;
96 	pthread_spin_unlock(&cq->lock);
97 
98 	return cur + nreq >= wq->max;
99 }
100 
mthca_tavor_post_send(struct ibv_qp * ibqp,struct ibv_send_wr * wr,struct ibv_send_wr ** bad_wr)101 int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
102 			  struct ibv_send_wr **bad_wr)
103 {
104 	struct mthca_qp *qp = to_mqp(ibqp);
105 	void *wqe, *prev_wqe;
106 	int ind;
107 	int nreq;
108 	int ret = 0;
109 	int size;
110 	int size0 = 0;
111 	int i;
112 	/*
113 	 * f0 and op0 cannot be used unless nreq > 0, which means this
114 	 * function makes it through the loop at least once.  So the
115 	 * code inside the if (!size0) will be executed, and f0 and
116 	 * op0 will be initialized.  So any gcc warning about "may be
117 	 * used unitialized" is bogus.
118 	 */
119 	uint32_t f0;
120 	uint32_t op0;
121 
122 	pthread_spin_lock(&qp->sq.lock);
123 
124 	ind = qp->sq.next_ind;
125 
126 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
127 		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
128 			ret = -1;
129 			*bad_wr = wr;
130 			goto out;
131 		}
132 
133 		wqe = get_send_wqe(qp, ind);
134 		prev_wqe = qp->sq.last;
135 		qp->sq.last = wqe;
136 
137 		((struct mthca_next_seg *) wqe)->nda_op = 0;
138 		((struct mthca_next_seg *) wqe)->ee_nds = 0;
139 		((struct mthca_next_seg *) wqe)->flags =
140 			((wr->send_flags & IBV_SEND_SIGNALED) ?
141 			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
142 			((wr->send_flags & IBV_SEND_SOLICITED) ?
143 			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
144 			htonl(1);
145 		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
146 		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
147 			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;
148 
149 		wqe += sizeof (struct mthca_next_seg);
150 		size = sizeof (struct mthca_next_seg) / 16;
151 
152 		switch (ibqp->qp_type) {
153 		case IBV_QPT_RC:
154 			switch (wr->opcode) {
155 			case IBV_WR_ATOMIC_CMP_AND_SWP:
156 			case IBV_WR_ATOMIC_FETCH_AND_ADD:
157 				((struct mthca_raddr_seg *) wqe)->raddr =
158 					htonll(wr->wr.atomic.remote_addr);
159 				((struct mthca_raddr_seg *) wqe)->rkey =
160 					htonl(wr->wr.atomic.rkey);
161 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
162 
163 				wqe += sizeof (struct mthca_raddr_seg);
164 
165 				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
166 					((struct mthca_atomic_seg *) wqe)->swap_add =
167 						htonll(wr->wr.atomic.swap);
168 					((struct mthca_atomic_seg *) wqe)->compare =
169 						htonll(wr->wr.atomic.compare_add);
170 				} else {
171 					((struct mthca_atomic_seg *) wqe)->swap_add =
172 						htonll(wr->wr.atomic.compare_add);
173 					((struct mthca_atomic_seg *) wqe)->compare = 0;
174 				}
175 
176 				wqe += sizeof (struct mthca_atomic_seg);
177 				size += (sizeof (struct mthca_raddr_seg) +
178 					 sizeof (struct mthca_atomic_seg)) / 16;
179 				break;
180 
181 			case IBV_WR_RDMA_WRITE:
182 			case IBV_WR_RDMA_WRITE_WITH_IMM:
183 			case IBV_WR_RDMA_READ:
184 				((struct mthca_raddr_seg *) wqe)->raddr =
185 					htonll(wr->wr.rdma.remote_addr);
186 				((struct mthca_raddr_seg *) wqe)->rkey =
187 					htonl(wr->wr.rdma.rkey);
188 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
189 				wqe += sizeof (struct mthca_raddr_seg);
190 				size += sizeof (struct mthca_raddr_seg) / 16;
191 				break;
192 
193 			default:
194 				/* No extra segments required for sends */
195 				break;
196 			}
197 
198 			break;
199 
200 		case IBV_QPT_UC:
201 			switch (wr->opcode) {
202 			case IBV_WR_RDMA_WRITE:
203 			case IBV_WR_RDMA_WRITE_WITH_IMM:
204 				((struct mthca_raddr_seg *) wqe)->raddr =
205 					htonll(wr->wr.rdma.remote_addr);
206 				((struct mthca_raddr_seg *) wqe)->rkey =
207 					htonl(wr->wr.rdma.rkey);
208 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
209 				wqe += sizeof (struct mthca_raddr_seg);
210 				size += sizeof (struct mthca_raddr_seg) / 16;
211 				break;
212 
213 			default:
214 				/* No extra segments required for sends */
215 				break;
216 			}
217 
218 			break;
219 
220 		case IBV_QPT_UD:
221 			((struct mthca_tavor_ud_seg *) wqe)->lkey =
222 				htonl(to_mah(wr->wr.ud.ah)->key);
223 			((struct mthca_tavor_ud_seg *) wqe)->av_addr =
224 				htonll((uintptr_t) to_mah(wr->wr.ud.ah)->av);
225 			((struct mthca_tavor_ud_seg *) wqe)->dqpn =
226 				htonl(wr->wr.ud.remote_qpn);
227 			((struct mthca_tavor_ud_seg *) wqe)->qkey =
228 				htonl(wr->wr.ud.remote_qkey);
229 
230 			wqe += sizeof (struct mthca_tavor_ud_seg);
231 			size += sizeof (struct mthca_tavor_ud_seg) / 16;
232 			break;
233 
234 		default:
235 			break;
236 		}
237 
238 		if (wr->num_sge > qp->sq.max_gs) {
239 			ret = -1;
240 			*bad_wr = wr;
241 			goto out;
242 		}
243 
244 		if (wr->send_flags & IBV_SEND_INLINE) {
245 			if (wr->num_sge) {
246 				struct mthca_inline_seg *seg = wqe;
247 				int s = 0;
248 
249 				wqe += sizeof *seg;
250 				for (i = 0; i < wr->num_sge; ++i) {
251 					struct ibv_sge *sge = &wr->sg_list[i];
252 
253 					s += sge->length;
254 
255 					if (s > qp->max_inline_data) {
256 						ret = -1;
257 						*bad_wr = wr;
258 						goto out;
259 					}
260 
261 					memcpy(wqe, (void *) (intptr_t) sge->addr,
262 					       sge->length);
263 					wqe += sge->length;
264 				}
265 
266 				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
267 				size += align(s + sizeof *seg, 16) / 16;
268 			}
269 		} else {
270 			struct mthca_data_seg *seg;
271 
272 			for (i = 0; i < wr->num_sge; ++i) {
273 				seg = wqe;
274 				seg->byte_count = htonl(wr->sg_list[i].length);
275 				seg->lkey = htonl(wr->sg_list[i].lkey);
276 				seg->addr = htonll(wr->sg_list[i].addr);
277 				wqe += sizeof *seg;
278 			}
279 
280 			size += wr->num_sge * (sizeof *seg / 16);
281 		}
282 
283 		qp->wrid[ind + qp->rq.max] = wr->wr_id;
284 
285 		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
286 			ret = -1;
287 			*bad_wr = wr;
288 			goto out;
289 		}
290 
291 		((struct mthca_next_seg *) prev_wqe)->nda_op =
292 			htonl(((ind << qp->sq.wqe_shift) +
293 			       qp->send_wqe_offset) |
294 			      mthca_opcode[wr->opcode]);
295 		/*
296 		 * Make sure that nda_op is written before setting ee_nds.
297 		 */
298 		wmb();
299 		((struct mthca_next_seg *) prev_wqe)->ee_nds =
300 			htonl((size0 ? 0 : MTHCA_NEXT_DBD) | size |
301 			((wr->send_flags & IBV_SEND_FENCE) ?
302 			 MTHCA_NEXT_FENCE : 0));
303 
304 		if (!size0) {
305 			size0 = size;
306 			op0   = mthca_opcode[wr->opcode];
307 			f0    = wr->send_flags & IBV_SEND_FENCE ?
308 				MTHCA_SEND_DOORBELL_FENCE : 0;
309 		}
310 
311 		++ind;
312 		if (ind >= qp->sq.max)
313 			ind -= qp->sq.max;
314 	}
315 
316 out:
317 	if (nreq) {
318 		uint32_t doorbell[2];
319 
320 		doorbell[0] = htonl(((qp->sq.next_ind << qp->sq.wqe_shift) +
321 				     qp->send_wqe_offset) | f0 | op0);
322 		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
323 
324 		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
325 	}
326 
327 	qp->sq.next_ind = ind;
328 	qp->sq.head    += nreq;
329 
330 	pthread_spin_unlock(&qp->sq.lock);
331 	return ret;
332 }
333 
mthca_tavor_post_recv(struct ibv_qp * ibqp,struct ibv_recv_wr * wr,struct ibv_recv_wr ** bad_wr)334 int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
335 			  struct ibv_recv_wr **bad_wr)
336 {
337 	struct mthca_qp *qp = to_mqp(ibqp);
338 	uint32_t doorbell[2];
339 	int ret = 0;
340 	int nreq;
341 	int i;
342 	int size;
343 	int size0 = 0;
344 	int ind;
345 	void *wqe;
346 	void *prev_wqe;
347 
348 	pthread_spin_lock(&qp->rq.lock);
349 
350 	ind = qp->rq.next_ind;
351 
352 	for (nreq = 0; wr; wr = wr->next) {
353 		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
354 			ret = -1;
355 			*bad_wr = wr;
356 			goto out;
357 		}
358 
359 		wqe = get_recv_wqe(qp, ind);
360 		prev_wqe = qp->rq.last;
361 		qp->rq.last = wqe;
362 
363 		((struct mthca_next_seg *) wqe)->ee_nds =
364 			htonl(MTHCA_NEXT_DBD);
365 		((struct mthca_next_seg *) wqe)->flags =
366 			htonl(MTHCA_NEXT_CQ_UPDATE);
367 
368 		wqe += sizeof (struct mthca_next_seg);
369 		size = sizeof (struct mthca_next_seg) / 16;
370 
371 		if (wr->num_sge > qp->rq.max_gs) {
372 			ret = -1;
373 			*bad_wr = wr;
374 			goto out;
375 		}
376 
377 		for (i = 0; i < wr->num_sge; ++i) {
378 			((struct mthca_data_seg *) wqe)->byte_count =
379 				htonl(wr->sg_list[i].length);
380 			((struct mthca_data_seg *) wqe)->lkey =
381 				htonl(wr->sg_list[i].lkey);
382 			((struct mthca_data_seg *) wqe)->addr =
383 				htonll(wr->sg_list[i].addr);
384 			wqe += sizeof (struct mthca_data_seg);
385 			size += sizeof (struct mthca_data_seg) / 16;
386 		}
387 
388 		qp->wrid[ind] = wr->wr_id;
389 
390 		((struct mthca_next_seg *) prev_wqe)->ee_nds =
391 			htonl(MTHCA_NEXT_DBD | size);
392 
393 		if (!size0)
394 			size0 = size;
395 
396 		++ind;
397 		if (ind >= qp->rq.max)
398 			ind -= qp->rq.max;
399 
400 		++nreq;
401 		if (nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) {
402 			nreq = 0;
403 
404 			doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
405 			doorbell[1] = htonl(ibqp->qp_num << 8);
406 
407 			/*
408 			 * Make sure that descriptors are written
409 			 * before doorbell is rung.
410 			 */
411 			wmb();
412 
413 			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);
414 
415 			qp->rq.next_ind = ind;
416 			qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
417 			size0 = 0;
418 		}
419 	}
420 
421 out:
422 	if (nreq) {
423 		doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
424 		doorbell[1] = htonl((ibqp->qp_num << 8) | nreq);
425 
426 		/*
427 		 * Make sure that descriptors are written before
428 		 * doorbell is rung.
429 		 */
430 		wmb();
431 
432 		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);
433 	}
434 
435 	qp->rq.next_ind = ind;
436 	qp->rq.head    += nreq;
437 
438 	pthread_spin_unlock(&qp->rq.lock);
439 	return ret;
440 }
441 
mthca_arbel_post_send(struct ibv_qp * ibqp,struct ibv_send_wr * wr,struct ibv_send_wr ** bad_wr)442 int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
443 			  struct ibv_send_wr **bad_wr)
444 {
445 	struct mthca_qp *qp = to_mqp(ibqp);
446 	uint32_t doorbell[2];
447 	void *wqe, *prev_wqe;
448 	int ind;
449 	int nreq;
450 	int ret = 0;
451 	int size;
452 	int size0 = 0;
453 	int i;
454 	/*
455 	 * f0 and op0 cannot be used unless nreq > 0, which means this
456 	 * function makes it through the loop at least once.  So the
457 	 * code inside the if (!size0) will be executed, and f0 and
458 	 * op0 will be initialized.  So any gcc warning about "may be
459 	 * used unitialized" is bogus.
460 	 */
461 	uint32_t f0;
462 	uint32_t op0;
463 
464 	pthread_spin_lock(&qp->sq.lock);
465 
466 	/* XXX check that state is OK to post send */
467 
468 	ind = qp->sq.head & (qp->sq.max - 1);
469 
470 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
471 		if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) {
472 			nreq = 0;
473 
474 			doorbell[0] = htonl((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
475 					    ((qp->sq.head & 0xffff) << 8) | f0 | op0);
476 			doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
477 
478 			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
479 
480 			/*
481 			 * Make sure that descriptors are written before
482 			 * doorbell record.
483 			 */
484 			wmb();
485 			*qp->sq.db = htonl(qp->sq.head & 0xffff);
486 
487 			/*
488 			 * Make sure doorbell record is written before we
489 			 * write MMIO send doorbell.
490 			 */
491 			wmb();
492 			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
493 
494 			size0 = 0;
495 		}
496 
497 		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
498 			ret = -1;
499 			*bad_wr = wr;
500 			goto out;
501 		}
502 
503 		wqe = get_send_wqe(qp, ind);
504 		prev_wqe = qp->sq.last;
505 		qp->sq.last = wqe;
506 
507 		((struct mthca_next_seg *) wqe)->flags =
508 			((wr->send_flags & IBV_SEND_SIGNALED) ?
509 			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
510 			((wr->send_flags & IBV_SEND_SOLICITED) ?
511 			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
512 			htonl(1);
513 		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
514 		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
515 			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;
516 
517 		wqe += sizeof (struct mthca_next_seg);
518 		size = sizeof (struct mthca_next_seg) / 16;
519 
520 		switch (ibqp->qp_type) {
521 		case IBV_QPT_RC:
522 			switch (wr->opcode) {
523 			case IBV_WR_ATOMIC_CMP_AND_SWP:
524 			case IBV_WR_ATOMIC_FETCH_AND_ADD:
525 				((struct mthca_raddr_seg *) wqe)->raddr =
526 					htonll(wr->wr.atomic.remote_addr);
527 				((struct mthca_raddr_seg *) wqe)->rkey =
528 					htonl(wr->wr.atomic.rkey);
529 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
530 
531 				wqe += sizeof (struct mthca_raddr_seg);
532 
533 				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
534 					((struct mthca_atomic_seg *) wqe)->swap_add =
535 						htonll(wr->wr.atomic.swap);
536 					((struct mthca_atomic_seg *) wqe)->compare =
537 						htonll(wr->wr.atomic.compare_add);
538 				} else {
539 					((struct mthca_atomic_seg *) wqe)->swap_add =
540 						htonll(wr->wr.atomic.compare_add);
541 					((struct mthca_atomic_seg *) wqe)->compare = 0;
542 				}
543 
544 				wqe += sizeof (struct mthca_atomic_seg);
545 				size += (sizeof (struct mthca_raddr_seg) +
546 					 sizeof (struct mthca_atomic_seg)) / 16;
547 				break;
548 
549 			case IBV_WR_RDMA_WRITE:
550 			case IBV_WR_RDMA_WRITE_WITH_IMM:
551 			case IBV_WR_RDMA_READ:
552 				((struct mthca_raddr_seg *) wqe)->raddr =
553 					htonll(wr->wr.rdma.remote_addr);
554 				((struct mthca_raddr_seg *) wqe)->rkey =
555 					htonl(wr->wr.rdma.rkey);
556 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
557 				wqe += sizeof (struct mthca_raddr_seg);
558 				size += sizeof (struct mthca_raddr_seg) / 16;
559 				break;
560 
561 			default:
562 				/* No extra segments required for sends */
563 				break;
564 			}
565 
566 			break;
567 
568 		case IBV_QPT_UC:
569 			switch (wr->opcode) {
570 			case IBV_WR_RDMA_WRITE:
571 			case IBV_WR_RDMA_WRITE_WITH_IMM:
572 				((struct mthca_raddr_seg *) wqe)->raddr =
573 					htonll(wr->wr.rdma.remote_addr);
574 				((struct mthca_raddr_seg *) wqe)->rkey =
575 					htonl(wr->wr.rdma.rkey);
576 				((struct mthca_raddr_seg *) wqe)->reserved = 0;
577 				wqe += sizeof (struct mthca_raddr_seg);
578 				size += sizeof (struct mthca_raddr_seg) / 16;
579 				break;
580 
581 			default:
582 				/* No extra segments required for sends */
583 				break;
584 			}
585 
586 			break;
587 
588 		case IBV_QPT_UD:
589 			memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
590 			       to_mah(wr->wr.ud.ah)->av, sizeof (struct mthca_av));
591 			((struct mthca_arbel_ud_seg *) wqe)->dqpn =
592 				htonl(wr->wr.ud.remote_qpn);
593 			((struct mthca_arbel_ud_seg *) wqe)->qkey =
594 				htonl(wr->wr.ud.remote_qkey);
595 
596 			wqe += sizeof (struct mthca_arbel_ud_seg);
597 			size += sizeof (struct mthca_arbel_ud_seg) / 16;
598 			break;
599 
600 		default:
601 			break;
602 		}
603 
604 		if (wr->num_sge > qp->sq.max_gs) {
605 			ret = -1;
606 			*bad_wr = wr;
607 			goto out;
608 		}
609 
610 		if (wr->send_flags & IBV_SEND_INLINE) {
611 			if (wr->num_sge) {
612 				struct mthca_inline_seg *seg = wqe;
613 				int s = 0;
614 
615 				wqe += sizeof *seg;
616 				for (i = 0; i < wr->num_sge; ++i) {
617 					struct ibv_sge *sge = &wr->sg_list[i];
618 
619 					s += sge->length;
620 
621 					if (s > qp->max_inline_data) {
622 						ret = -1;
623 						*bad_wr = wr;
624 						goto out;
625 					}
626 
627 					memcpy(wqe, (void *) (uintptr_t) sge->addr,
628 					       sge->length);
629 					wqe += sge->length;
630 				}
631 
632 				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
633 				size += align(s + sizeof *seg, 16) / 16;
634 			}
635 		} else {
636 			struct mthca_data_seg *seg;
637 
638 			for (i = 0; i < wr->num_sge; ++i) {
639 				seg = wqe;
640 				seg->byte_count = htonl(wr->sg_list[i].length);
641 				seg->lkey = htonl(wr->sg_list[i].lkey);
642 				seg->addr = htonll(wr->sg_list[i].addr);
643 				wqe += sizeof *seg;
644 			}
645 
646 			size += wr->num_sge * (sizeof *seg / 16);
647 		}
648 
649 		qp->wrid[ind + qp->rq.max] = wr->wr_id;
650 
651 		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
652 			ret = -1;
653 			*bad_wr = wr;
654 			goto out;
655 		}
656 
657 		((struct mthca_next_seg *) prev_wqe)->nda_op =
658 			htonl(((ind << qp->sq.wqe_shift) +
659 			       qp->send_wqe_offset) |
660 			      mthca_opcode[wr->opcode]);
661 		wmb();
662 		((struct mthca_next_seg *) prev_wqe)->ee_nds =
663 			htonl(MTHCA_NEXT_DBD | size |
664 			      ((wr->send_flags & IBV_SEND_FENCE) ?
665 			       MTHCA_NEXT_FENCE : 0));
666 
667 		if (!size0) {
668 			size0 = size;
669 			op0   = mthca_opcode[wr->opcode];
670 			f0    = wr->send_flags & IBV_SEND_FENCE ?
671 				MTHCA_SEND_DOORBELL_FENCE : 0;
672 		}
673 
674 		++ind;
675 		if (ind >= qp->sq.max)
676 			ind -= qp->sq.max;
677 	}
678 
679 out:
680 	if (nreq) {
681 		doorbell[0] = htonl((nreq << 24)                  |
682 				    ((qp->sq.head & 0xffff) << 8) |
683 				    f0 | op0);
684 		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
685 
686 		qp->sq.head += nreq;
687 
688 		/*
689 		 * Make sure that descriptors are written before
690 		 * doorbell record.
691 		 */
692 		wmb();
693 		*qp->sq.db = htonl(qp->sq.head & 0xffff);
694 
695 		/*
696 		 * Make sure doorbell record is written before we
697 		 * write MMIO send doorbell.
698 		 */
699 		wmb();
700 		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
701 	}
702 
703 	pthread_spin_unlock(&qp->sq.lock);
704 	return ret;
705 }
706 
mthca_arbel_post_recv(struct ibv_qp * ibqp,struct ibv_recv_wr * wr,struct ibv_recv_wr ** bad_wr)707 int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
708 			  struct ibv_recv_wr **bad_wr)
709 {
710 	struct mthca_qp *qp = to_mqp(ibqp);
711 	int ret = 0;
712 	int nreq;
713 	int ind;
714 	int i;
715 	void *wqe;
716 
717 	pthread_spin_lock(&qp->rq.lock);
718 
719 	/* XXX check that state is OK to post receive */
720 
721 	ind = qp->rq.head & (qp->rq.max - 1);
722 
723 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
724 		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
725 			ret = -1;
726 			*bad_wr = wr;
727 			goto out;
728 		}
729 
730 		wqe = get_recv_wqe(qp, ind);
731 
732 		((struct mthca_next_seg *) wqe)->flags = 0;
733 
734 		wqe += sizeof (struct mthca_next_seg);
735 
736 		if (wr->num_sge > qp->rq.max_gs) {
737 			ret = -1;
738 			*bad_wr = wr;
739 			goto out;
740 		}
741 
742 		for (i = 0; i < wr->num_sge; ++i) {
743 			((struct mthca_data_seg *) wqe)->byte_count =
744 				htonl(wr->sg_list[i].length);
745 			((struct mthca_data_seg *) wqe)->lkey =
746 				htonl(wr->sg_list[i].lkey);
747 			((struct mthca_data_seg *) wqe)->addr =
748 				htonll(wr->sg_list[i].addr);
749 			wqe += sizeof (struct mthca_data_seg);
750 		}
751 
752 		if (i < qp->rq.max_gs) {
753 			((struct mthca_data_seg *) wqe)->byte_count = 0;
754 			((struct mthca_data_seg *) wqe)->lkey = htonl(MTHCA_INVAL_LKEY);
755 			((struct mthca_data_seg *) wqe)->addr = 0;
756 		}
757 
758 		qp->wrid[ind] = wr->wr_id;
759 
760 		++ind;
761 		if (ind >= qp->rq.max)
762 			ind -= qp->rq.max;
763 	}
764 out:
765 	if (nreq) {
766 		qp->rq.head += nreq;
767 
768 		/*
769 		 * Make sure that descriptors are written before
770 		 * doorbell record.
771 		 */
772 		wmb();
773 		*qp->rq.db = htonl(qp->rq.head & 0xffff);
774 	}
775 
776 	pthread_spin_unlock(&qp->rq.lock);
777 	return ret;
778 }
779 
mthca_alloc_qp_buf(struct ibv_pd * pd,struct ibv_qp_cap * cap,enum ibv_qp_type type,struct mthca_qp * qp)780 int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
781 		       enum ibv_qp_type type, struct mthca_qp *qp)
782 {
783 	int size;
784 	int max_sq_sge;
785 	struct mthca_next_seg *next;
786 	int i;
787 
788 	qp->rq.max_gs 	 = cap->max_recv_sge;
789 	qp->sq.max_gs 	 = cap->max_send_sge;
790 	max_sq_sge 	 = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),
791 				 sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);
792 	if (max_sq_sge < cap->max_send_sge)
793 		max_sq_sge = cap->max_send_sge;
794 
795 	qp->wrid = malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));
796 	if (!qp->wrid)
797 		return -1;
798 
799 	size = sizeof (struct mthca_next_seg) +
800 		qp->rq.max_gs * sizeof (struct mthca_data_seg);
801 
802 	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
803 	     qp->rq.wqe_shift++)
804 		; /* nothing */
805 
806 	size = max_sq_sge * sizeof (struct mthca_data_seg);
807 	switch (type) {
808 	case IBV_QPT_UD:
809 		size += mthca_is_memfree(pd->context) ?
810 			sizeof (struct mthca_arbel_ud_seg) :
811 			sizeof (struct mthca_tavor_ud_seg);
812 		break;
813 
814 	case IBV_QPT_UC:
815 		size += sizeof (struct mthca_raddr_seg);
816 		break;
817 
818 	case IBV_QPT_RC:
819 		size += sizeof (struct mthca_raddr_seg);
820 		/*
821 		 * An atomic op will require an atomic segment, a
822 		 * remote address segment and one scatter entry.
823 		 */
824 		if (size < (sizeof (struct mthca_atomic_seg) +
825 			    sizeof (struct mthca_raddr_seg) +
826 			    sizeof (struct mthca_data_seg)))
827 			size = (sizeof (struct mthca_atomic_seg) +
828 				sizeof (struct mthca_raddr_seg) +
829 				sizeof (struct mthca_data_seg));
830 		break;
831 
832 	default:
833 		break;
834 	}
835 
836 	/* Make sure that we have enough space for a bind request */
837 	if (size < sizeof (struct mthca_bind_seg))
838 		size = sizeof (struct mthca_bind_seg);
839 
840 	size += sizeof (struct mthca_next_seg);
841 
842 	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
843 	     qp->sq.wqe_shift++)
844 		; /* nothing */
845 
846 	qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,
847 				    1 << qp->sq.wqe_shift);
848 
849 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
850 
851 	if (mthca_alloc_buf(&qp->buf,
852 			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
853 			    to_mdev(pd->context->device)->page_size)) {
854 		free(qp->wrid);
855 		return -1;
856 	}
857 
858 	memset(qp->buf.buf, 0, qp->buf_size);
859 
860 	if (mthca_is_memfree(pd->context)) {
861 		struct mthca_data_seg *scatter;
862 		uint32_t sz;
863 
864 		sz = htonl((sizeof (struct mthca_next_seg) +
865 			    qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);
866 
867 		for (i = 0; i < qp->rq.max; ++i) {
868 			next = get_recv_wqe(qp, i);
869 			next->nda_op = htonl(((i + 1) & (qp->rq.max - 1)) <<
870 					     qp->rq.wqe_shift);
871 			next->ee_nds = sz;
872 
873 			for (scatter = (void *) (next + 1);
874 			     (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
875 			     ++scatter)
876 				scatter->lkey = htonl(MTHCA_INVAL_LKEY);
877 		}
878 
879 		for (i = 0; i < qp->sq.max; ++i) {
880 			next = get_send_wqe(qp, i);
881 			next->nda_op = htonl((((i + 1) & (qp->sq.max - 1)) <<
882 					      qp->sq.wqe_shift) +
883 					     qp->send_wqe_offset);
884 		}
885 	} else {
886 		for (i = 0; i < qp->rq.max; ++i) {
887 			next = get_recv_wqe(qp, i);
888 			next->nda_op = htonl((((i + 1) % qp->rq.max) <<
889 					     qp->rq.wqe_shift) | 1);
890 		}
891 	}
892 
893 	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
894 	qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
895 
896 	return 0;
897 }
898 
mthca_find_qp(struct mthca_context * ctx,uint32_t qpn)899 struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)
900 {
901 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
902 
903 	if (ctx->qp_table[tind].refcnt)
904 		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
905 	else
906 		return NULL;
907 }
908 
mthca_store_qp(struct mthca_context * ctx,uint32_t qpn,struct mthca_qp * qp)909 int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)
910 {
911 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
912 
913 	if (!ctx->qp_table[tind].refcnt) {
914 		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
915 						   sizeof (struct mthca_qp *));
916 		if (!ctx->qp_table[tind].table)
917 			return -1;
918 	}
919 
920 	++ctx->qp_table[tind].refcnt;
921 	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
922 	return 0;
923 }
924 
mthca_clear_qp(struct mthca_context * ctx,uint32_t qpn)925 void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)
926 {
927 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
928 
929 	if (!--ctx->qp_table[tind].refcnt)
930 		free(ctx->qp_table[tind].table);
931 	else
932 		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
933 }
934 
mthca_free_err_wqe(struct mthca_qp * qp,int is_send,int index,int * dbd,uint32_t * new_wqe)935 int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
936 		       int index, int *dbd, uint32_t *new_wqe)
937 {
938 	struct mthca_next_seg *next;
939 
940 	/*
941 	 * For SRQs, all receive WQEs generate a CQE, so we're always
942 	 * at the end of the doorbell chain.
943 	 */
944 	if (qp->ibv_qp.srq && !is_send) {
945 		*new_wqe = 0;
946 		return 0;
947 	}
948 
949 	if (is_send)
950 		next = get_send_wqe(qp, index);
951 	else
952 		next = get_recv_wqe(qp, index);
953 
954 	*dbd = !!(next->ee_nds & htonl(MTHCA_NEXT_DBD));
955 	if (next->ee_nds & htonl(0x3f))
956 		*new_wqe = (next->nda_op & htonl(~0x3f)) |
957 			(next->ee_nds & htonl(0x3f));
958 	else
959 		*new_wqe = 0;
960 
961 	return 0;
962 }
963 
964