1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_ddp.c 312337 2017-01-17 07:43:37Z np $");
30 
31 #include "opt_inet.h"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/module.h>
39 #include <sys/protosw.h>
40 #include <sys/proc.h>
41 #include <sys/domain.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/uio.h>
45 #include <netinet/in.h>
46 #include <netinet/in_pcb.h>
47 #include <netinet/ip.h>
48 #include <netinet/tcp_var.h>
49 #define TCPSTATES
50 #include <netinet/tcp_fsm.h>
51 #include <netinet/toecore.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_param.h>
56 #include <vm/pmap.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_object.h>
60 
61 #ifdef TCP_OFFLOAD
62 #include "common/common.h"
63 #include "common/t4_msg.h"
64 #include "common/t4_regs.h"
65 #include "common/t4_tcb.h"
66 #include "tom/t4_tom.h"
67 
68 VNET_DECLARE(int, tcp_do_autorcvbuf);
69 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
70 VNET_DECLARE(int, tcp_autorcvbuf_inc);
71 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
72 VNET_DECLARE(int, tcp_autorcvbuf_max);
73 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
74 
/* Forward declaration; used by the receive-path helpers below. */
static struct mbuf *get_ddp_mbuf(int len);

/* Largest user buffer span that hold_uio() will accept for DDP. */
#define MAX_DDP_BUFFER_SIZE	(M_TCB_RX_DDP_BUF0_LEN)
78 
79 static struct ddp_buffer *
alloc_ddp_buffer(vm_page_t * pages,int npages,int offset,int len)80 alloc_ddp_buffer(vm_page_t *pages, int npages, int offset, int len)
81 {
82 	struct ddp_buffer *db;
83 
84 	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT | M_ZERO);
85 	if (db == NULL) {
86 		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
87 		return (NULL);
88 	}
89 
90 	db->npages = npages;
91 	db->pages = pages;
92 	db->offset = offset;
93 	db->len = len;
94 
95 	return (db);
96 }
97 
98 static void
free_ddp_buffer(struct ddp_buffer * db)99 free_ddp_buffer(struct ddp_buffer *db)
100 {
101 
102 	if (db == NULL)
103 		return;
104 
105 	if (db->pages)
106 		free(db->pages, M_CXGBE);
107 
108 	if (db->prsv.prsv_nppods > 0)
109 		t4_free_page_pods(&db->prsv);
110 
111 	free(db, M_CXGBE);
112 }
113 
114 void
release_ddp_resources(struct toepcb * toep)115 release_ddp_resources(struct toepcb *toep)
116 {
117 	int i;
118 
119 	for (i = 0; i < nitems(toep->db); i++) {
120 		if (toep->db[i] != NULL) {
121 			free_ddp_buffer(toep->db[i]);
122 			toep->db[i] = NULL;
123 		}
124 	}
125 }
126 
127 /* XXX: handle_ddp_data code duplication */
128 void
insert_ddp_data(struct toepcb * toep,uint32_t n)129 insert_ddp_data(struct toepcb *toep, uint32_t n)
130 {
131 	struct inpcb *inp = toep->inp;
132 	struct tcpcb *tp = intotcpcb(inp);
133 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
134 	struct mbuf *m;
135 
136 	INP_WLOCK_ASSERT(inp);
137 	SOCKBUF_LOCK_ASSERT(sb);
138 
139 	m = get_ddp_mbuf(n);
140 	tp->rcv_nxt += n;
141 #ifndef USE_DDP_RX_FLOW_CONTROL
142 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
143 	tp->rcv_wnd -= n;
144 #endif
145 
146 	KASSERT(toep->sb_cc >= sb->sb_cc,
147 	    ("%s: sb %p has more data (%d) than last time (%d).",
148 	    __func__, sb, sb->sb_cc, toep->sb_cc));
149 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
150 #ifdef USE_DDP_RX_FLOW_CONTROL
151 	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
152 #endif
153 	sbappendstream_locked(sb, m);
154 	toep->sb_cc = sb->sb_cc;
155 }
156 
/*
 * Size of a SET_TCB_FIELD sent as a ULP command: the ULP_TXPKT header, the
 * immediate-data sub-command header, and the CPL core.
 */
#define LEN__SET_TCB_FIELD_ULP \
    (sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + \
    sizeof(struct cpl_set_tcb_field_core))

/*
 * Size of an RX_DATA_ACK sent as a ULP command; same layout as above with a
 * different CPL core.
 */
#define LEN__RX_DATA_ACK_ULP \
    (sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + \
    sizeof(struct cpl_rx_data_ack_core))
164 
165 static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt * ulpmc,struct toepcb * toep,uint64_t word,uint64_t mask,uint64_t val)166 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
167     uint64_t word, uint64_t mask, uint64_t val)
168 {
169 	struct ulptx_idata *ulpsc;
170 	struct cpl_set_tcb_field_core *req;
171 
172 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
173 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
174 
175 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
176 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
177 	ulpsc->len = htobe32(sizeof(*req));
178 
179 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
180 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
181 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
182 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
183 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
184         req->mask = htobe64(mask);
185         req->val = htobe64(val);
186 
187 	ulpsc = (struct ulptx_idata *)(req + 1);
188 	if (LEN__SET_TCB_FIELD_ULP % 16) {
189 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
190 		ulpsc->len = htobe32(0);
191 		return (ulpsc + 1);
192 	}
193 	return (ulpsc);
194 }
195 
196 static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt * ulpmc,struct toepcb * toep)197 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
198 {
199 	struct ulptx_idata *ulpsc;
200 	struct cpl_rx_data_ack_core *req;
201 
202 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
203 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
204 
205 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
206 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
207 	ulpsc->len = htobe32(sizeof(*req));
208 
209 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
210 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
211 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
212 
213 	ulpsc = (struct ulptx_idata *)(req + 1);
214 	if (LEN__RX_DATA_ACK_ULP % 16) {
215 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
216 		ulpsc->len = htobe32(0);
217 		return (ulpsc + 1);
218 	}
219 	return (ulpsc);
220 }
221 
222 static inline uint64_t
select_ddp_flags(struct socket * so,int flags,int db_idx)223 select_ddp_flags(struct socket *so, int flags, int db_idx)
224 {
225 	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
226 	int waitall = flags & MSG_WAITALL;
227 	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);
228 
229 	KASSERT(db_idx == 0 || db_idx == 1,
230 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
231 
232 	if (db_idx == 0) {
233 		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
234 		if (waitall)
235 			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
236 		else if (nb)
237 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
238 		else
239 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
240 	} else {
241 		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
242 		if (waitall)
243 			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
244 		else if (nb)
245 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
246 		else
247 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
248 	}
249 
250 	return (ddp_flags);
251 }
252 
253 static struct wrqe *
mk_update_tcb_for_ddp(struct adapter * sc,struct toepcb * toep,int db_idx,int offset,uint64_t ddp_flags)254 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
255     int offset, uint64_t ddp_flags)
256 {
257 	struct ddp_buffer *db = toep->db[db_idx];
258 	struct wrqe *wr;
259 	struct work_request_hdr *wrh;
260 	struct ulp_txpkt *ulpmc;
261 	int len;
262 
263 	KASSERT(db_idx == 0 || db_idx == 1,
264 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
265 
266 	/*
267 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
268 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
269 	 *
270 	 * The work request header is 16B and always ends at a 16B boundary.
271 	 * The ULPTX master commands that follow must all end at 16B boundaries
272 	 * too so we round up the size to 16.
273 	 */
274 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
275 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
276 
277 	wr = alloc_wrqe(len, toep->ctrlq);
278 	if (wr == NULL)
279 		return (NULL);
280 	wrh = wrtod(wr);
281 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
282 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
283 
284 	/* Write the buffer's tag */
285 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
286 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
287 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
288 	    V_TCB_RX_DDP_BUF0_TAG(db->prsv.prsv_tag));
289 
290 	/* Update the current offset in the DDP buffer and its total length */
291 	if (db_idx == 0)
292 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
293 		    W_TCB_RX_DDP_BUF0_OFFSET,
294 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
295 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
296 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
297 		    V_TCB_RX_DDP_BUF0_LEN(db->len));
298 	else
299 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
300 		    W_TCB_RX_DDP_BUF1_OFFSET,
301 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
302 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
303 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
304 		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));
305 
306 	/* Update DDP flags */
307 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
308 	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
309 	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
310 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
311 	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);
312 
313 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
314 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
315 
316 	return (wr);
317 }
318 
319 static void
discourage_ddp(struct toepcb * toep)320 discourage_ddp(struct toepcb *toep)
321 {
322 
323 	if (toep->ddp_score && --toep->ddp_score == 0) {
324 		toep->ddp_flags &= ~DDP_OK;
325 		toep->ddp_disabled = time_uptime;
326 		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
327 		    __func__, toep->tid, time_uptime);
328 	}
329 }
330 
331 static int
handle_ddp_data(struct toepcb * toep,__be32 ddp_report,__be32 rcv_nxt,int len)332 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
333 {
334 	uint32_t report = be32toh(ddp_report);
335 	unsigned int db_flag;
336 	struct inpcb *inp = toep->inp;
337 	struct tcpcb *tp;
338 	struct socket *so;
339 	struct sockbuf *sb;
340 	struct mbuf *m;
341 
342 	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
343 
344 	if (__predict_false(!(report & F_DDP_INV)))
345 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
346 
347 	INP_WLOCK(inp);
348 	so = inp_inpcbtosocket(inp);
349 	sb = &so->so_rcv;
350 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
351 
352 		/*
353 		 * XXX: think a bit more.
354 		 * tcpcb probably gone, but socket should still be around
355 		 * because we always wait for DDP completion in soreceive no
356 		 * matter what.  Just wake it up and let it clean up.
357 		 */
358 
359 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
360 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
361 		SOCKBUF_LOCK(sb);
362 		goto wakeup;
363 	}
364 
365 	tp = intotcpcb(inp);
366 
367 	/*
368 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
369 	 * sequence number of the next byte to receive.  The length of
370 	 * the data received for this message must be computed by
371 	 * comparing the new and old values of rcv_nxt.
372 	 *
373 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
374 	 * length of the most recent DMA.  It does not include the
375 	 * total length of the data received since the previous update
376 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
377 	 * first received byte from the most recent DMA.
378 	 */
379 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
380 	tp->rcv_nxt += len;
381 	tp->t_rcvtime = ticks;
382 #ifndef USE_DDP_RX_FLOW_CONTROL
383 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
384 	tp->rcv_wnd -= len;
385 #endif
386 	m = get_ddp_mbuf(len);
387 
388 	SOCKBUF_LOCK(sb);
389 	if (report & F_DDP_BUF_COMPLETE)
390 		toep->ddp_score = DDP_HIGH_SCORE;
391 	else
392 		discourage_ddp(toep);
393 
394 	/* receive buffer autosize */
395 	MPASS(toep->vnet == so->so_vnet);
396 	CURVNET_SET(toep->vnet);
397 	if (sb->sb_flags & SB_AUTOSIZE &&
398 	    V_tcp_do_autorcvbuf &&
399 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
400 	    len > (sbspace(sb) / 8 * 7)) {
401 		unsigned int hiwat = sb->sb_hiwat;
402 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
403 		    V_tcp_autorcvbuf_max);
404 
405 		if (!sbreserve_locked(sb, newsize, so, NULL))
406 			sb->sb_flags &= ~SB_AUTOSIZE;
407 		else
408 			toep->rx_credits += newsize - hiwat;
409 	}
410 	CURVNET_RESTORE();
411 
412 	KASSERT(toep->sb_cc >= sb->sb_cc,
413 	    ("%s: sb %p has more data (%d) than last time (%d).",
414 	    __func__, sb, sb->sb_cc, toep->sb_cc));
415 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
416 #ifdef USE_DDP_RX_FLOW_CONTROL
417 	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
418 #endif
419 	sbappendstream_locked(sb, m);
420 	toep->sb_cc = sb->sb_cc;
421 wakeup:
422 	KASSERT(toep->ddp_flags & db_flag,
423 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
424 	    __func__, toep, toep->ddp_flags, report));
425 	toep->ddp_flags &= ~db_flag;
426 	sorwakeup_locked(so);
427 	SOCKBUF_UNLOCK_ASSERT(sb);
428 
429 	INP_WUNLOCK(inp);
430 	return (0);
431 }
432 
433 void
handle_ddp_close(struct toepcb * toep,struct tcpcb * tp,struct sockbuf * sb,__be32 rcv_nxt)434 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, struct sockbuf *sb,
435     __be32 rcv_nxt)
436 {
437 	struct mbuf *m;
438 	int len;
439 
440 	SOCKBUF_LOCK_ASSERT(sb);
441 	INP_WLOCK_ASSERT(toep->inp);
442 	len = be32toh(rcv_nxt) - tp->rcv_nxt;
443 
444 	/* Signal handle_ddp() to break out of its sleep loop. */
445 	toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
446 	if (len == 0)
447 		return;
448 
449 	tp->rcv_nxt += len;
450 	KASSERT(toep->sb_cc >= sb->sb_cc,
451 	    ("%s: sb %p has more data (%d) than last time (%d).",
452 	    __func__, sb, sb->sb_cc, toep->sb_cc));
453 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
454 #ifdef USE_DDP_RX_FLOW_CONTROL
455 	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
456 #endif
457 
458 	m = get_ddp_mbuf(len);
459 
460 	sbappendstream_locked(sb, m);
461 	toep->sb_cc = sb->sb_cc;
462 }
463 
464 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
465 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
466 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
467 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
468 
469 extern cpl_handler_t t4_cpl_handler[];
470 
471 static int
do_rx_data_ddp(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)472 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
473 {
474 	struct adapter *sc = iq->adapter;
475 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
476 	unsigned int tid = GET_TID(cpl);
477 	uint32_t vld;
478 	struct toepcb *toep = lookup_tid(sc, tid);
479 
480 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
481 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
482 	KASSERT(!(toep->flags & TPF_SYNQE),
483 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
484 
485 	vld = be32toh(cpl->ddpvld);
486 	if (__predict_false(vld & DDP_ERR)) {
487 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
488 		    __func__, vld, tid, toep);
489 	}
490 
491 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
492 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
493 		return (0);
494 	}
495 
496 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
497 
498 	return (0);
499 }
500 
501 static int
do_rx_ddp_complete(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)502 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
503     struct mbuf *m)
504 {
505 	struct adapter *sc = iq->adapter;
506 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
507 	unsigned int tid = GET_TID(cpl);
508 	struct toepcb *toep = lookup_tid(sc, tid);
509 
510 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
511 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
512 	KASSERT(!(toep->flags & TPF_SYNQE),
513 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
514 
515 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
516 
517 	return (0);
518 }
519 
520 void
enable_ddp(struct adapter * sc,struct toepcb * toep)521 enable_ddp(struct adapter *sc, struct toepcb *toep)
522 {
523 
524 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
525 	    ("%s: toep %p has bad ddp_flags 0x%x",
526 	    __func__, toep, toep->ddp_flags));
527 
528 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
529 	    __func__, toep->tid, time_uptime);
530 
531 	toep->ddp_flags |= DDP_SC_REQ;
532 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
533 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
534 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
535 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
536 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
537 	    toep->ofld_rxq->iq.abs_id);
538 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
539 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
540 }
541 
542 static inline void
disable_ddp(struct adapter * sc,struct toepcb * toep)543 disable_ddp(struct adapter *sc, struct toepcb *toep)
544 {
545 
546 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
547 	    ("%s: toep %p has bad ddp_flags 0x%x",
548 	    __func__, toep, toep->ddp_flags));
549 
550 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
551 	    __func__, toep->tid, time_uptime);
552 
553 	toep->ddp_flags |= DDP_SC_REQ;
554 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
555 	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1), 0, 0,
556 	    toep->ofld_rxq->iq.abs_id);
557 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
558 	    V_TF_DDP_OFF(1), V_TF_DDP_OFF(1), 0, 0, toep->ofld_rxq->iq.abs_id);
559 }
560 
561 static int
hold_uio(struct uio * uio,vm_page_t ** ppages,int * pnpages)562 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
563 {
564 	struct vm_map *map;
565 	struct iovec *iov;
566 	vm_offset_t start, end;
567 	vm_page_t *pp;
568 	int n;
569 
570 	KASSERT(uio->uio_iovcnt == 1,
571 	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
572 	KASSERT(uio->uio_td->td_proc == curproc,
573 	    ("%s: uio proc (%p) is not curproc (%p)",
574 	    __func__, uio->uio_td->td_proc, curproc));
575 
576 	map = &curproc->p_vmspace->vm_map;
577 	iov = &uio->uio_iov[0];
578 	start = trunc_page((uintptr_t)iov->iov_base);
579 	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
580 	n = howmany(end - start, PAGE_SIZE);
581 
582 	if (end - start > MAX_DDP_BUFFER_SIZE)
583 		return (E2BIG);
584 
585 	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
586 	if (pp == NULL)
587 		return (ENOMEM);
588 
589 	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
590 	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
591 		free(pp, M_CXGBE);
592 		return (EFAULT);
593 	}
594 
595 	*ppages = pp;
596 	*pnpages = n;
597 
598 	return (0);
599 }
600 
601 static int
bufcmp(struct ddp_buffer * db,vm_page_t * pages,int npages,int offset,int len)602 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
603 {
604 	int i;
605 
606 	if (db == NULL || db->npages != npages || db->offset != offset ||
607 	    db->len != len)
608 		return (1);
609 
610 	for (i = 0; i < npages; i++) {
611 		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
612 			return (1);
613 	}
614 
615 	return (0);
616 }
617 
/*
 * Highest common factor of n1 and n2 (Euclid's algorithm).  If one argument
 * is zero the other is returned, so 0 works as the starting value when
 * folding over a list.
 */
static int
calculate_hcf(int n1, int n2)
{
	int lo, hi, rem;

	lo = (n1 <= n2) ? n1 : n2;
	hi = (n1 <= n2) ? n2 : n1;

	while (lo != 0) {
		rem = hi % lo;
		hi = lo;
		lo = rem;
	}

	return (hi);
}
639 
640 static inline int
pages_to_nppods(int npages,int ddp_page_shift)641 pages_to_nppods(int npages, int ddp_page_shift)
642 {
643 
644 	MPASS(ddp_page_shift >= PAGE_SHIFT);
645 
646 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
647 }
648 
649 static int
alloc_page_pods(struct ppod_region * pr,u_int nppods,u_int pgsz_idx,struct ppod_reservation * prsv)650 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
651     struct ppod_reservation *prsv)
652 {
653 	vmem_addr_t addr;       /* relative to start of region */
654 
655 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
656 	    &addr) != 0)
657 		return (ENOMEM);
658 
659 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
660 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
661 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
662 
663 	/*
664 	 * The hardware tagmask includes an extra invalid bit but the arena was
665 	 * seeded with valid values only.  An allocation out of this arena will
666 	 * fit inside the tagmask but won't have the invalid bit set.
667 	 */
668 	MPASS((addr & pr->pr_tag_mask) == addr);
669 	MPASS((addr & pr->pr_invalid_bit) == 0);
670 
671 	prsv->prsv_pr = pr;
672 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
673 	prsv->prsv_nppods = nppods;
674 
675 	return (0);
676 }
677 
678 int
t4_alloc_page_pods_for_db(struct ppod_region * pr,struct ddp_buffer * db)679 t4_alloc_page_pods_for_db(struct ppod_region *pr, struct ddp_buffer *db)
680 {
681 	int i, hcf, seglen, idx, nppods;
682 	struct ppod_reservation *prsv = &db->prsv;
683 
684 	KASSERT(prsv->prsv_nppods == 0,
685 	    ("%s: page pods already allocated", __func__));
686 
687 	/*
688 	 * The DDP page size is unrelated to the VM page size.  We combine
689 	 * contiguous physical pages into larger segments to get the best DDP
690 	 * page size possible.  This is the largest of the four sizes in
691 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
692 	 * the page list.
693 	 */
694 	hcf = 0;
695 	for (i = 0; i < db->npages; i++) {
696 		seglen = PAGE_SIZE;
697 		while (i < db->npages - 1 &&
698 		    db->pages[i]->phys_addr + PAGE_SIZE ==
699 		    db->pages[i + 1]->phys_addr) {
700 			seglen += PAGE_SIZE;
701 			i++;
702 		}
703 
704 		hcf = calculate_hcf(hcf, seglen);
705 		if (hcf < (1 << pr->pr_page_shift[1])) {
706 			idx = 0;
707 			goto have_pgsz;	/* give up, short circuit */
708 		}
709 	}
710 
711 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
712 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
713 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
714 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
715 			break;
716 	}
717 #undef PR_PAGE_MASK
718 
719 have_pgsz:
720 	MPASS(idx <= M_PPOD_PGSZ);
721 
722 	nppods = pages_to_nppods(db->npages, pr->pr_page_shift[idx]);
723 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
724 		return (0);
725 	MPASS(prsv->prsv_nppods > 0);
726 
727 	return (1);
728 }
729 
730 int
t4_alloc_page_pods_for_buf(struct ppod_region * pr,vm_offset_t buf,int len,struct ppod_reservation * prsv)731 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
732     struct ppod_reservation *prsv)
733 {
734 	int hcf, seglen, idx, npages, nppods;
735 	uintptr_t start_pva, end_pva, pva, p1;
736 
737 	MPASS(buf > 0);
738 	MPASS(len > 0);
739 
740 	/*
741 	 * The DDP page size is unrelated to the VM page size.  We combine
742 	 * contiguous physical pages into larger segments to get the best DDP
743 	 * page size possible.  This is the largest of the four sizes in
744 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
745 	 * in the page list.
746 	 */
747 	hcf = 0;
748 	start_pva = trunc_page(buf);
749 	end_pva = trunc_page(buf + len - 1);
750 	pva = start_pva;
751 	while (pva <= end_pva) {
752 		seglen = PAGE_SIZE;
753 		p1 = pmap_kextract(pva);
754 		pva += PAGE_SIZE;
755 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
756 			seglen += PAGE_SIZE;
757 			pva += PAGE_SIZE;
758 		}
759 
760 		hcf = calculate_hcf(hcf, seglen);
761 		if (hcf < (1 << pr->pr_page_shift[1])) {
762 			idx = 0;
763 			goto have_pgsz;	/* give up, short circuit */
764 		}
765 	}
766 
767 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
768 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
769 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
770 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
771 			break;
772 	}
773 #undef PR_PAGE_MASK
774 
775 have_pgsz:
776 	MPASS(idx <= M_PPOD_PGSZ);
777 
778 	npages = 1;
779 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
780 	nppods = howmany(npages, PPOD_PAGES);
781 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
782 		return (ENOMEM);
783 	MPASS(prsv->prsv_nppods > 0);
784 
785 	return (0);
786 }
787 
788 void
t4_free_page_pods(struct ppod_reservation * prsv)789 t4_free_page_pods(struct ppod_reservation *prsv)
790 {
791 	struct ppod_region *pr = prsv->prsv_pr;
792 	vmem_addr_t addr;
793 
794 	MPASS(prsv != NULL);
795 	MPASS(prsv->prsv_nppods != 0);
796 
797 	addr = prsv->prsv_tag & pr->pr_tag_mask;
798 	MPASS((addr & pr->pr_invalid_bit) == 0);
799 
800 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
801 	    pr->pr_arena, addr, prsv->prsv_nppods);
802 
803 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
804 	prsv->prsv_nppods = 0;
805 }
806 
/* Cap on the number of page pods written by a single work request. */
#define NUM_ULP_TX_SC_IMM_PPODS	(256 / PPOD_SIZE)
808 
809 int
t4_write_page_pods_for_db(struct adapter * sc,struct sge_wrq * wrq,int tid,struct ddp_buffer * db)810 t4_write_page_pods_for_db(struct adapter *sc, struct sge_wrq *wrq, int tid,
811     struct ddp_buffer *db)
812 {
813 	struct wrqe *wr;
814 	struct ulp_mem_io *ulpmc;
815 	struct ulptx_idata *ulpsc;
816 	struct pagepod *ppod;
817 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
818 	u_int ppod_addr;
819 	uint32_t cmd;
820 	struct ppod_reservation *prsv = &db->prsv;
821 	struct ppod_region *pr = prsv->prsv_pr;
822 
823 	MPASS(prsv->prsv_nppods > 0);
824 
825 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
826 	if (is_t4(sc))
827 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
828 	else
829 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
830 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
831 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
832 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
833 
834 		/* How many page pods are we writing in this cycle */
835 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
836 		chunk = PPOD_SZ(n);
837 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
838 
839 		wr = alloc_wrqe(len, wrq);
840 		if (wr == NULL)
841 			return (ENOMEM);	/* ok to just bail out */
842 		ulpmc = wrtod(wr);
843 
844 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
845 		ulpmc->cmd = cmd;
846 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
847 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
848 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
849 
850 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
851 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
852 		ulpsc->len = htobe32(chunk);
853 
854 		ppod = (struct pagepod *)(ulpsc + 1);
855 		for (j = 0; j < n; i++, j++, ppod++) {
856 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
857 			    V_PPOD_TID(tid) | prsv->prsv_tag);
858 			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
859 			    V_PPOD_OFST(db->offset));
860 			ppod->rsvd = 0;
861 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
862 			for (k = 0; k < nitems(ppod->addr); k++) {
863 				if (idx < db->npages) {
864 					ppod->addr[k] =
865 					    htobe64(db->pages[idx]->phys_addr);
866 					idx += ddp_pgsz / PAGE_SIZE;
867 				} else
868 					ppod->addr[k] = 0;
869 #if 0
870 				CTR5(KTR_CXGBE,
871 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
872 				    __func__, toep->tid, i, k,
873 				    htobe64(ppod->addr[k]));
874 #endif
875 			}
876 
877 		}
878 
879 		t4_wrq_tx(sc, wr);
880 	}
881 
882 	return (0);
883 }
884 
885 int
t4_write_page_pods_for_buf(struct adapter * sc,struct sge_wrq * wrq,int tid,struct ppod_reservation * prsv,vm_offset_t buf,int buflen)886 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
887     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
888 {
889 	struct wrqe *wr;
890 	struct ulp_mem_io *ulpmc;
891 	struct ulptx_idata *ulpsc;
892 	struct pagepod *ppod;
893 	int i, j, k, n, chunk, len, ddp_pgsz;
894 	u_int ppod_addr, offset;
895 	uint32_t cmd;
896 	struct ppod_region *pr = prsv->prsv_pr;
897 	uintptr_t end_pva, pva, pa;
898 
899 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
900 	if (is_t4(sc))
901 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
902 	else
903 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
904 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
905 	offset = buf & PAGE_MASK;
906 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
907 	pva = trunc_page(buf);
908 	end_pva = trunc_page(buf + buflen - 1);
909 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
910 
911 		/* How many page pods are we writing in this cycle */
912 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
913 		MPASS(n > 0);
914 		chunk = PPOD_SZ(n);
915 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
916 
917 		wr = alloc_wrqe(len, wrq);
918 		if (wr == NULL)
919 			return (ENOMEM);	/* ok to just bail out */
920 		ulpmc = wrtod(wr);
921 
922 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
923 		ulpmc->cmd = cmd;
924 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
925 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
926 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
927 
928 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
929 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
930 		ulpsc->len = htobe32(chunk);
931 
932 		ppod = (struct pagepod *)(ulpsc + 1);
933 		for (j = 0; j < n; i++, j++, ppod++) {
934 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
935 			    V_PPOD_TID(tid) |
936 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
937 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
938 			    V_PPOD_OFST(offset));
939 			ppod->rsvd = 0;
940 
941 			for (k = 0; k < nitems(ppod->addr); k++) {
942 				if (pva > end_pva)
943 					ppod->addr[k] = 0;
944 				else {
945 					pa = pmap_kextract(pva);
946 					ppod->addr[k] = htobe64(pa);
947 					pva += ddp_pgsz;
948 				}
949 #if 0
950 				CTR5(KTR_CXGBE,
951 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
952 				    __func__, tid, i, k,
953 				    htobe64(ppod->addr[k]));
954 #endif
955 			}
956 
957 			/*
958 			 * Walk back 1 segment so that the first address in the
959 			 * next pod is the same as the last one in the current
960 			 * pod.
961 			 */
962 			pva -= ddp_pgsz;
963 		}
964 
965 		t4_wrq_tx(sc, wr);
966 	}
967 
968 	MPASS(pva <= end_pva);
969 
970 	return (0);
971 }
972 
973 /*
974  * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
975  * "pages" array is handed over to this function and should not be used in any
976  * way by the caller after that.
977  */
978 static int
select_ddp_buffer(struct adapter * sc,struct toepcb * toep,vm_page_t * pages,int npages,int db_off,int db_len)979 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
980     int npages, int db_off, int db_len)
981 {
982 	struct ddp_buffer *db;
983 	struct tom_data *td = sc->tom_softc;
984 	int i, empty_slot = -1;
985 
986 	/* Try to reuse */
987 	for (i = 0; i < nitems(toep->db); i++) {
988 		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
989 			free(pages, M_CXGBE);
990 			return (i);	/* pages still held */
991 		} else if (toep->db[i] == NULL && empty_slot < 0)
992 			empty_slot = i;
993 	}
994 
995 	/* Allocate new buffer, write its page pods. */
996 	db = alloc_ddp_buffer(pages, npages, db_off, db_len);
997 	if (db == NULL) {
998 		vm_page_unhold_pages(pages, npages);
999 		free(pages, M_CXGBE);
1000 		return (-1);
1001 	}
1002 	if (t4_alloc_page_pods_for_db(&td->pr, db)) {
1003 		vm_page_unhold_pages(pages, npages);
1004 		free_ddp_buffer(db);
1005 		return (-1);
1006 	}
1007 	if (t4_write_page_pods_for_db(sc, toep->ctrlq, toep->tid, db) != 0) {
1008 		vm_page_unhold_pages(pages, npages);
1009 		free_ddp_buffer(db);
1010 		return (-1);
1011 	}
1012 
1013 	i = empty_slot;
1014 	if (i < 0) {
1015 		i = arc4random() % nitems(toep->db);
1016 		free_ddp_buffer(toep->db[i]);
1017 	}
1018 	toep->db[i] = db;
1019 
1020 	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
1021 	    __func__, toep->tid, i, db, db->prsv.prsv_tag);
1022 
1023 	return (i);
1024 }
1025 
1026 static void
wire_ddp_buffer(struct ddp_buffer * db)1027 wire_ddp_buffer(struct ddp_buffer *db)
1028 {
1029 	int i;
1030 	vm_page_t p;
1031 
1032 	for (i = 0; i < db->npages; i++) {
1033 		p = db->pages[i];
1034 		vm_page_lock(p);
1035 		vm_page_wire(p);
1036 		vm_page_unhold(p);
1037 		vm_page_unlock(p);
1038 	}
1039 }
1040 
1041 static void
unwire_ddp_buffer(struct ddp_buffer * db)1042 unwire_ddp_buffer(struct ddp_buffer *db)
1043 {
1044 	int i;
1045 	vm_page_t p;
1046 
1047 	for (i = 0; i < db->npages; i++) {
1048 		p = db->pages[i];
1049 		vm_page_lock(p);
1050 		vm_page_unwire(p, 0);
1051 		vm_page_unlock(p);
1052 	}
1053 }
1054 
1055 static int
handle_ddp(struct socket * so,struct uio * uio,int flags,int error)1056 handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
1057 {
1058 	struct sockbuf *sb = &so->so_rcv;
1059 	struct tcpcb *tp = so_sototcpcb(so);
1060 	struct toepcb *toep = tp->t_toe;
1061 	struct adapter *sc = td_adapter(toep->td);
1062 	vm_page_t *pages;
1063 	int npages, db_idx, rc, buf_flag;
1064 	struct ddp_buffer *db;
1065 	struct wrqe *wr;
1066 	uint64_t ddp_flags;
1067 
1068 	SOCKBUF_LOCK_ASSERT(sb);
1069 
1070 #if 0
1071 	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
1072 		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
1073 		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
1074 	}
1075 #endif
1076 
1077 	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
1078 	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
1079 	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
1080 	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
1081 	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
1082 		goto no_ddp;
1083 
1084 	/*
1085 	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
1086 	 * a bit later if everything else works out.
1087 	 */
1088 	SOCKBUF_UNLOCK(sb);
1089 	if (hold_uio(uio, &pages, &npages) != 0) {
1090 		SOCKBUF_LOCK(sb);
1091 		goto no_ddp;
1092 	}
1093 	SOCKBUF_LOCK(sb);
1094 	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
1095 		vm_page_unhold_pages(pages, npages);
1096 		free(pages, M_CXGBE);
1097 		goto no_ddp;
1098 	}
1099 
1100 	/*
1101 	 * Figure out which one of the two DDP buffers to use this time.
1102 	 */
1103 	db_idx = select_ddp_buffer(sc, toep, pages, npages,
1104 	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
1105 	pages = NULL;	/* handed off to select_ddp_buffer */
1106 	if (db_idx < 0)
1107 		goto no_ddp;
1108 	db = toep->db[db_idx];
1109 	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;
1110 
1111 	/*
1112 	 * Build the compound work request that tells the chip where to DMA the
1113 	 * payload.
1114 	 */
1115 	ddp_flags = select_ddp_flags(so, flags, db_idx);
1116 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
1117 	if (wr == NULL) {
1118 		/*
1119 		 * Just unhold the pages.  The DDP buffer's software state is
1120 		 * left as-is in the toep.  The page pods were written
1121 		 * successfully and we may have an opportunity to use it in the
1122 		 * future.
1123 		 */
1124 		vm_page_unhold_pages(db->pages, db->npages);
1125 		goto no_ddp;
1126 	}
1127 
1128 	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
1129 	wire_ddp_buffer(db);
1130 	t4_wrq_tx(sc, wr);
1131 	sb->sb_flags &= ~SB_DDP_INDICATE;
1132 	toep->ddp_flags |= buf_flag;
1133 
1134 	/*
1135 	 * Wait for the DDP operation to complete and then unwire the pages.
1136 	 * The return code from the sbwait will be the final return code of this
1137 	 * function.  But we do need to wait for DDP no matter what.
1138 	 */
1139 	rc = sbwait(sb);
1140 	while (toep->ddp_flags & buf_flag) {
1141 		sb->sb_flags |= SB_WAIT;
1142 		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0);
1143 	}
1144 	unwire_ddp_buffer(db);
1145 	return (rc);
1146 no_ddp:
1147 	disable_ddp(sc, toep);
1148 	discourage_ddp(toep);
1149 	sb->sb_flags &= ~SB_DDP_INDICATE;
1150 	return (0);
1151 }
1152 
1153 int
t4_init_ppod_region(struct ppod_region * pr,struct t4_range * r,u_int psz,const char * name)1154 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
1155     const char *name)
1156 {
1157 	int i;
1158 
1159 	MPASS(pr != NULL);
1160 	MPASS(r->size > 0);
1161 
1162 	pr->pr_start = r->start;
1163 	pr->pr_len = r->size;
1164 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
1165 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
1166 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
1167 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
1168 
1169 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
1170 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
1171 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
1172 			return (ENXIO);
1173 	}
1174 
1175 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
1176 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
1177 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
1178 		return (ENXIO);
1179 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
1180 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
1181 
1182 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
1183 	    M_FIRSTFIT | M_NOWAIT);
1184 	if (pr->pr_arena == NULL)
1185 		return (ENOMEM);
1186 
1187 	return (0);
1188 }
1189 
1190 void
t4_free_ppod_region(struct ppod_region * pr)1191 t4_free_ppod_region(struct ppod_region *pr)
1192 {
1193 
1194 	MPASS(pr != NULL);
1195 
1196 	if (pr->pr_arena)
1197 		vmem_destroy(pr->pr_arena);
1198 	bzero(pr, sizeof(*pr));
1199 }
1200 
/*
 * Assert that a vnet context (curvnet) is set while working on socket 'so'.
 * The expansion deliberately does not end in a semicolon: the caller supplies
 * it, so the macro behaves as a single statement even in an unbraced if/else.
 */
#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)))
/*
 * Translate receive flags into the wait argument for sblock(): 0 when
 * MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Out-of-band receive for the DDP receive path.  t4_soreceive_ddp() calls
 * this when MSG_OOB is requested; it is not implemented and simply invokes
 * CXGBE_UNIMPLEMENTED with the function's name.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	/*
	 * There is no return statement after this; presumably
	 * CXGBE_UNIMPLEMENTED does not return -- confirm against its
	 * definition.
	 */
	CXGBE_UNIMPLEMENTED(__func__);
}
1211 
/*
 * Marker storage for DDP mbufs.  get_ddp_mbuf() points m_data at this string
 * and is_ddp_mbuf() identifies such mbufs by comparing m_data with its
 * address; m_mbuftouio_ddp() moves these mbufs with UIO_NOCOPY.
 */
static char ddp_magic_str[] = "nothing to see here";
1213 
1214 static struct mbuf *
get_ddp_mbuf(int len)1215 get_ddp_mbuf(int len)
1216 {
1217 	struct mbuf *m;
1218 
1219 	m = m_get(M_NOWAIT, MT_DATA);
1220 	if (m == NULL)
1221 		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
1222 	m->m_len = len;
1223 	m->m_data = &ddp_magic_str[0];
1224 
1225 	return (m);
1226 }
1227 
1228 static inline int
is_ddp_mbuf(struct mbuf * m)1229 is_ddp_mbuf(struct mbuf *m)
1230 {
1231 
1232 	return (m->m_data == &ddp_magic_str[0]);
1233 }
1234 
1235 /*
1236  * Copy an mbuf chain into a uio limited by len if set.
1237  */
1238 static int
m_mbuftouio_ddp(struct uio * uio,struct mbuf * m,int len)1239 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
1240 {
1241 	int error, length, total;
1242 	int progress = 0;
1243 
1244 	if (len > 0)
1245 		total = min(uio->uio_resid, len);
1246 	else
1247 		total = uio->uio_resid;
1248 
1249 	/* Fill the uio with data from the mbufs. */
1250 	for (; m != NULL; m = m->m_next) {
1251 		length = min(m->m_len, total - progress);
1252 
1253 		if (is_ddp_mbuf(m)) {
1254 			enum uio_seg segflag = uio->uio_segflg;
1255 
1256 			uio->uio_segflg	= UIO_NOCOPY;
1257 			error = uiomove(mtod(m, void *), length, uio);
1258 			uio->uio_segflg	= segflag;
1259 		} else
1260 			error = uiomove(mtod(m, void *), length, uio);
1261 		if (error)
1262 			return (error);
1263 
1264 		progress += length;
1265 	}
1266 
1267 	return (0);
1268 }
1269 
1270 /*
1271  * Based on soreceive_stream() in uipc_socket.c
1272  */
1273 int
t4_soreceive_ddp(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1274 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
1275     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1276 {
1277 	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
1278 	struct sockbuf *sb;
1279 	struct mbuf *m, *n = NULL;
1280 
1281 	/* We only do stream sockets. */
1282 	if (so->so_type != SOCK_STREAM)
1283 		return (EINVAL);
1284 	if (psa != NULL)
1285 		*psa = NULL;
1286 	if (controlp != NULL)
1287 		return (EINVAL);
1288 	if (flagsp != NULL)
1289 		flags = *flagsp &~ MSG_EOR;
1290 	else
1291 		flags = 0;
1292 	if (flags & MSG_OOB)
1293 		return (soreceive_rcvoob(so, uio, flags));
1294 	if (mp0 != NULL)
1295 		*mp0 = NULL;
1296 
1297 	sb = &so->so_rcv;
1298 
1299 	/* Prevent other readers from entering the socket. */
1300 	error = sblock(sb, SBLOCKWAIT(flags));
1301 	SOCKBUF_LOCK(sb);
1302 	if (error)
1303 		goto out;
1304 
1305 	/* Easy one, no space to copyout anything. */
1306 	if (uio->uio_resid == 0) {
1307 		error = EINVAL;
1308 		goto out;
1309 	}
1310 	oresid = uio->uio_resid;
1311 
1312 	/* We will never ever get anything unless we are or were connected. */
1313 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1314 		error = ENOTCONN;
1315 		goto out;
1316 	}
1317 
1318 restart:
1319 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1320 
1321 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1322 
1323 		/* uio should be just as it was at entry */
1324 		KASSERT(oresid == uio->uio_resid,
1325 		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
1326 		    __func__, oresid, uio->uio_resid, sb->sb_cc));
1327 
1328 		error = handle_ddp(so, uio, flags, 0);
1329 		ddp_handled = 1;
1330 		if (error)
1331 			goto out;
1332 	}
1333 
1334 	/* Abort if socket has reported problems. */
1335 	if (so->so_error) {
1336 		if (sb->sb_cc > 0)
1337 			goto deliver;
1338 		if (oresid > uio->uio_resid)
1339 			goto out;
1340 		error = so->so_error;
1341 		if (!(flags & MSG_PEEK))
1342 			so->so_error = 0;
1343 		goto out;
1344 	}
1345 
1346 	/* Door is closed.  Deliver what is left, if any. */
1347 	if (sb->sb_state & SBS_CANTRCVMORE) {
1348 		if (sb->sb_cc > 0)
1349 			goto deliver;
1350 		else
1351 			goto out;
1352 	}
1353 
1354 	/* Socket buffer is empty and we shall not block. */
1355 	if (sb->sb_cc == 0 &&
1356 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1357 		error = EAGAIN;
1358 		goto out;
1359 	}
1360 
1361 	/* Socket buffer got some data that we shall deliver now. */
1362 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1363 	    ((so->so_state & SS_NBIO) ||
1364 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1365 	     sb->sb_cc >= sb->sb_lowat ||
1366 	     sb->sb_cc >= uio->uio_resid ||
1367 	     sb->sb_cc >= sb->sb_hiwat) ) {
1368 		goto deliver;
1369 	}
1370 
1371 	/* On MSG_WAITALL we must wait until all data or error arrives. */
1372 	if ((flags & MSG_WAITALL) &&
1373 	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1374 		goto deliver;
1375 
1376 	/*
1377 	 * Wait and block until (more) data comes in.
1378 	 * NB: Drops the sockbuf lock during wait.
1379 	 */
1380 	error = sbwait(sb);
1381 	if (error) {
1382 		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1383 			(void) handle_ddp(so, uio, flags, 1);
1384 			ddp_handled = 1;
1385 		}
1386 		goto out;
1387 	}
1388 	goto restart;
1389 
1390 deliver:
1391 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1392 	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1393 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1394 
1395 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
1396 		goto restart;
1397 
1398 	/* Statistics. */
1399 	if (uio->uio_td)
1400 		uio->uio_td->td_ru.ru_msgrcv++;
1401 
1402 	/* Fill uio until full or current end of socket buffer is reached. */
1403 	len = min(uio->uio_resid, sb->sb_cc);
1404 	if (mp0 != NULL) {
1405 		/* Dequeue as many mbufs as possible. */
1406 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1407 			for (*mp0 = m = sb->sb_mb;
1408 			     m != NULL && m->m_len <= len;
1409 			     m = m->m_next) {
1410 				len -= m->m_len;
1411 				uio->uio_resid -= m->m_len;
1412 				sbfree(sb, m);
1413 				n = m;
1414 			}
1415 			sb->sb_mb = m;
1416 			if (sb->sb_mb == NULL)
1417 				SB_EMPTY_FIXUP(sb);
1418 			n->m_next = NULL;
1419 		}
1420 		/* Copy the remainder. */
1421 		if (len > 0) {
1422 			KASSERT(sb->sb_mb != NULL,
1423 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1424 
1425 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1426 			if (m == NULL)
1427 				len = 0;	/* Don't flush data from sockbuf. */
1428 			else
1429 				uio->uio_resid -= m->m_len;
1430 			if (*mp0 != NULL)
1431 				n->m_next = m;
1432 			else
1433 				*mp0 = m;
1434 			if (*mp0 == NULL) {
1435 				error = ENOBUFS;
1436 				goto out;
1437 			}
1438 		}
1439 	} else {
1440 		/* NB: Must unlock socket buffer as uiomove may sleep. */
1441 		SOCKBUF_UNLOCK(sb);
1442 		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
1443 		SOCKBUF_LOCK(sb);
1444 		if (error)
1445 			goto out;
1446 	}
1447 	SBLASTRECORDCHK(sb);
1448 	SBLASTMBUFCHK(sb);
1449 
1450 	/*
1451 	 * Remove the delivered data from the socket buffer unless we
1452 	 * were only peeking.
1453 	 */
1454 	if (!(flags & MSG_PEEK)) {
1455 		if (len > 0)
1456 			sbdrop_locked(sb, len);
1457 
1458 		/* Notify protocol that we drained some data. */
1459 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
1460 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
1461 		     !(flags & MSG_SOCALLBCK))) {
1462 			SOCKBUF_UNLOCK(sb);
1463 			VNET_SO_ASSERT(so);
1464 			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1465 			SOCKBUF_LOCK(sb);
1466 		}
1467 	}
1468 
1469 	/*
1470 	 * For MSG_WAITALL we may have to loop again and wait for
1471 	 * more data to come in.
1472 	 */
1473 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1474 		goto restart;
1475 out:
1476 	SOCKBUF_LOCK_ASSERT(sb);
1477 	SBLASTRECORDCHK(sb);
1478 	SBLASTMBUFCHK(sb);
1479 	SOCKBUF_UNLOCK(sb);
1480 	sbunlock(sb);
1481 	return (error);
1482 }
1483 
/*
 * Module load hook for DDP: registers do_rx_data_ddp and do_rx_ddp_complete
 * as the handlers for CPL_RX_DATA_DDP and CPL_RX_DDP_COMPLETE respectively.
 * Always returns 0.
 */
int
t4_ddp_mod_load(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	return (0);
}
1492 
/*
 * Module unload hook for DDP: registers NULL for the two CPL opcodes that
 * t4_ddp_mod_load() set handlers for.
 */
void
t4_ddp_mod_unload(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
}
1500 #endif
1501