1 /*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_ddp.c 312337 2017-01-17 07:43:37Z np $");
30
31 #include "opt_inet.h"
32
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/module.h>
39 #include <sys/protosw.h>
40 #include <sys/proc.h>
41 #include <sys/domain.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/uio.h>
45 #include <netinet/in.h>
46 #include <netinet/in_pcb.h>
47 #include <netinet/ip.h>
48 #include <netinet/tcp_var.h>
49 #define TCPSTATES
50 #include <netinet/tcp_fsm.h>
51 #include <netinet/toecore.h>
52
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_param.h>
56 #include <vm/pmap.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_object.h>
60
61 #ifdef TCP_OFFLOAD
62 #include "common/common.h"
63 #include "common/t4_msg.h"
64 #include "common/t4_regs.h"
65 #include "common/t4_tcb.h"
66 #include "tom/t4_tom.h"
67
68 VNET_DECLARE(int, tcp_do_autorcvbuf);
69 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
70 VNET_DECLARE(int, tcp_autorcvbuf_inc);
71 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
72 VNET_DECLARE(int, tcp_autorcvbuf_max);
73 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
74
75 static struct mbuf *get_ddp_mbuf(int len);
76
77 #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN)
78
79 static struct ddp_buffer *
alloc_ddp_buffer(vm_page_t * pages,int npages,int offset,int len)80 alloc_ddp_buffer(vm_page_t *pages, int npages, int offset, int len)
81 {
82 struct ddp_buffer *db;
83
84 db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT | M_ZERO);
85 if (db == NULL) {
86 CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
87 return (NULL);
88 }
89
90 db->npages = npages;
91 db->pages = pages;
92 db->offset = offset;
93 db->len = len;
94
95 return (db);
96 }
97
98 static void
free_ddp_buffer(struct ddp_buffer * db)99 free_ddp_buffer(struct ddp_buffer *db)
100 {
101
102 if (db == NULL)
103 return;
104
105 if (db->pages)
106 free(db->pages, M_CXGBE);
107
108 if (db->prsv.prsv_nppods > 0)
109 t4_free_page_pods(&db->prsv);
110
111 free(db, M_CXGBE);
112 }
113
114 void
release_ddp_resources(struct toepcb * toep)115 release_ddp_resources(struct toepcb *toep)
116 {
117 int i;
118
119 for (i = 0; i < nitems(toep->db); i++) {
120 if (toep->db[i] != NULL) {
121 free_ddp_buffer(toep->db[i]);
122 toep->db[i] = NULL;
123 }
124 }
125 }
126
127 /* XXX: handle_ddp_data code duplication */
128 void
insert_ddp_data(struct toepcb * toep,uint32_t n)129 insert_ddp_data(struct toepcb *toep, uint32_t n)
130 {
131 struct inpcb *inp = toep->inp;
132 struct tcpcb *tp = intotcpcb(inp);
133 struct sockbuf *sb = &inp->inp_socket->so_rcv;
134 struct mbuf *m;
135
136 INP_WLOCK_ASSERT(inp);
137 SOCKBUF_LOCK_ASSERT(sb);
138
139 m = get_ddp_mbuf(n);
140 tp->rcv_nxt += n;
141 #ifndef USE_DDP_RX_FLOW_CONTROL
142 KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
143 tp->rcv_wnd -= n;
144 #endif
145
146 KASSERT(toep->sb_cc >= sb->sb_cc,
147 ("%s: sb %p has more data (%d) than last time (%d).",
148 __func__, sb, sb->sb_cc, toep->sb_cc));
149 toep->rx_credits += toep->sb_cc - sb->sb_cc;
150 #ifdef USE_DDP_RX_FLOW_CONTROL
151 toep->rx_credits -= n; /* adjust for F_RX_FC_DDP */
152 #endif
153 sbappendstream_locked(sb, m);
154 toep->sb_cc = sb->sb_cc;
155 }
156
/*
 * Size of a SET_TCB_FIELD sent as a ULP command: the ULP_TXPKT header, the
 * immediate-data sub-command header, and the CPL core.
 */
#define LEN__SET_TCB_FIELD_ULP \
    (sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + \
    sizeof(struct cpl_set_tcb_field_core))

/*
 * Size of an RX_DATA_ACK sent as a ULP command; same layout as above with a
 * different CPL core.
 */
#define LEN__RX_DATA_ACK_ULP \
    (sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + \
    sizeof(struct cpl_rx_data_ack_core))
164
165 static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt * ulpmc,struct toepcb * toep,uint64_t word,uint64_t mask,uint64_t val)166 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
167 uint64_t word, uint64_t mask, uint64_t val)
168 {
169 struct ulptx_idata *ulpsc;
170 struct cpl_set_tcb_field_core *req;
171
172 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
173 ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
174
175 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
176 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
177 ulpsc->len = htobe32(sizeof(*req));
178
179 req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
180 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
181 req->reply_ctrl = htobe16(V_NO_REPLY(1) |
182 V_QUEUENO(toep->ofld_rxq->iq.abs_id));
183 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
184 req->mask = htobe64(mask);
185 req->val = htobe64(val);
186
187 ulpsc = (struct ulptx_idata *)(req + 1);
188 if (LEN__SET_TCB_FIELD_ULP % 16) {
189 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
190 ulpsc->len = htobe32(0);
191 return (ulpsc + 1);
192 }
193 return (ulpsc);
194 }
195
196 static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt * ulpmc,struct toepcb * toep)197 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
198 {
199 struct ulptx_idata *ulpsc;
200 struct cpl_rx_data_ack_core *req;
201
202 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
203 ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
204
205 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
206 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
207 ulpsc->len = htobe32(sizeof(*req));
208
209 req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
210 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
211 req->credit_dack = htobe32(F_RX_MODULATE_RX);
212
213 ulpsc = (struct ulptx_idata *)(req + 1);
214 if (LEN__RX_DATA_ACK_ULP % 16) {
215 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
216 ulpsc->len = htobe32(0);
217 return (ulpsc + 1);
218 }
219 return (ulpsc);
220 }
221
222 static inline uint64_t
select_ddp_flags(struct socket * so,int flags,int db_idx)223 select_ddp_flags(struct socket *so, int flags, int db_idx)
224 {
225 uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
226 int waitall = flags & MSG_WAITALL;
227 int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);
228
229 KASSERT(db_idx == 0 || db_idx == 1,
230 ("%s: bad DDP buffer index %d", __func__, db_idx));
231
232 if (db_idx == 0) {
233 ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
234 if (waitall)
235 ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
236 else if (nb)
237 ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
238 else
239 ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
240 } else {
241 ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
242 if (waitall)
243 ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
244 else if (nb)
245 ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
246 else
247 ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
248 }
249
250 return (ddp_flags);
251 }
252
253 static struct wrqe *
mk_update_tcb_for_ddp(struct adapter * sc,struct toepcb * toep,int db_idx,int offset,uint64_t ddp_flags)254 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
255 int offset, uint64_t ddp_flags)
256 {
257 struct ddp_buffer *db = toep->db[db_idx];
258 struct wrqe *wr;
259 struct work_request_hdr *wrh;
260 struct ulp_txpkt *ulpmc;
261 int len;
262
263 KASSERT(db_idx == 0 || db_idx == 1,
264 ("%s: bad DDP buffer index %d", __func__, db_idx));
265
266 /*
267 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
268 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
269 *
270 * The work request header is 16B and always ends at a 16B boundary.
271 * The ULPTX master commands that follow must all end at 16B boundaries
272 * too so we round up the size to 16.
273 */
274 len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
275 roundup2(LEN__RX_DATA_ACK_ULP, 16);
276
277 wr = alloc_wrqe(len, toep->ctrlq);
278 if (wr == NULL)
279 return (NULL);
280 wrh = wrtod(wr);
281 INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
282 ulpmc = (struct ulp_txpkt *)(wrh + 1);
283
284 /* Write the buffer's tag */
285 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
286 W_TCB_RX_DDP_BUF0_TAG + db_idx,
287 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
288 V_TCB_RX_DDP_BUF0_TAG(db->prsv.prsv_tag));
289
290 /* Update the current offset in the DDP buffer and its total length */
291 if (db_idx == 0)
292 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
293 W_TCB_RX_DDP_BUF0_OFFSET,
294 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
295 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
296 V_TCB_RX_DDP_BUF0_OFFSET(offset) |
297 V_TCB_RX_DDP_BUF0_LEN(db->len));
298 else
299 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
300 W_TCB_RX_DDP_BUF1_OFFSET,
301 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
302 V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
303 V_TCB_RX_DDP_BUF1_OFFSET(offset) |
304 V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));
305
306 /* Update DDP flags */
307 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
308 V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
309 V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
310 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
311 V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);
312
313 /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
314 ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
315
316 return (wr);
317 }
318
319 static void
discourage_ddp(struct toepcb * toep)320 discourage_ddp(struct toepcb *toep)
321 {
322
323 if (toep->ddp_score && --toep->ddp_score == 0) {
324 toep->ddp_flags &= ~DDP_OK;
325 toep->ddp_disabled = time_uptime;
326 CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
327 __func__, toep->tid, time_uptime);
328 }
329 }
330
331 static int
handle_ddp_data(struct toepcb * toep,__be32 ddp_report,__be32 rcv_nxt,int len)332 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
333 {
334 uint32_t report = be32toh(ddp_report);
335 unsigned int db_flag;
336 struct inpcb *inp = toep->inp;
337 struct tcpcb *tp;
338 struct socket *so;
339 struct sockbuf *sb;
340 struct mbuf *m;
341
342 db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
343
344 if (__predict_false(!(report & F_DDP_INV)))
345 CXGBE_UNIMPLEMENTED("DDP buffer still valid");
346
347 INP_WLOCK(inp);
348 so = inp_inpcbtosocket(inp);
349 sb = &so->so_rcv;
350 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
351
352 /*
353 * XXX: think a bit more.
354 * tcpcb probably gone, but socket should still be around
355 * because we always wait for DDP completion in soreceive no
356 * matter what. Just wake it up and let it clean up.
357 */
358
359 CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
360 __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
361 SOCKBUF_LOCK(sb);
362 goto wakeup;
363 }
364
365 tp = intotcpcb(inp);
366
367 /*
368 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
369 * sequence number of the next byte to receive. The length of
370 * the data received for this message must be computed by
371 * comparing the new and old values of rcv_nxt.
372 *
373 * For RX_DATA_DDP, len might be non-zero, but it is only the
374 * length of the most recent DMA. It does not include the
375 * total length of the data received since the previous update
376 * for this DDP buffer. rcv_nxt is the sequence number of the
377 * first received byte from the most recent DMA.
378 */
379 len += be32toh(rcv_nxt) - tp->rcv_nxt;
380 tp->rcv_nxt += len;
381 tp->t_rcvtime = ticks;
382 #ifndef USE_DDP_RX_FLOW_CONTROL
383 KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
384 tp->rcv_wnd -= len;
385 #endif
386 m = get_ddp_mbuf(len);
387
388 SOCKBUF_LOCK(sb);
389 if (report & F_DDP_BUF_COMPLETE)
390 toep->ddp_score = DDP_HIGH_SCORE;
391 else
392 discourage_ddp(toep);
393
394 /* receive buffer autosize */
395 MPASS(toep->vnet == so->so_vnet);
396 CURVNET_SET(toep->vnet);
397 if (sb->sb_flags & SB_AUTOSIZE &&
398 V_tcp_do_autorcvbuf &&
399 sb->sb_hiwat < V_tcp_autorcvbuf_max &&
400 len > (sbspace(sb) / 8 * 7)) {
401 unsigned int hiwat = sb->sb_hiwat;
402 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
403 V_tcp_autorcvbuf_max);
404
405 if (!sbreserve_locked(sb, newsize, so, NULL))
406 sb->sb_flags &= ~SB_AUTOSIZE;
407 else
408 toep->rx_credits += newsize - hiwat;
409 }
410 CURVNET_RESTORE();
411
412 KASSERT(toep->sb_cc >= sb->sb_cc,
413 ("%s: sb %p has more data (%d) than last time (%d).",
414 __func__, sb, sb->sb_cc, toep->sb_cc));
415 toep->rx_credits += toep->sb_cc - sb->sb_cc;
416 #ifdef USE_DDP_RX_FLOW_CONTROL
417 toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */
418 #endif
419 sbappendstream_locked(sb, m);
420 toep->sb_cc = sb->sb_cc;
421 wakeup:
422 KASSERT(toep->ddp_flags & db_flag,
423 ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
424 __func__, toep, toep->ddp_flags, report));
425 toep->ddp_flags &= ~db_flag;
426 sorwakeup_locked(so);
427 SOCKBUF_UNLOCK_ASSERT(sb);
428
429 INP_WUNLOCK(inp);
430 return (0);
431 }
432
433 void
handle_ddp_close(struct toepcb * toep,struct tcpcb * tp,struct sockbuf * sb,__be32 rcv_nxt)434 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, struct sockbuf *sb,
435 __be32 rcv_nxt)
436 {
437 struct mbuf *m;
438 int len;
439
440 SOCKBUF_LOCK_ASSERT(sb);
441 INP_WLOCK_ASSERT(toep->inp);
442 len = be32toh(rcv_nxt) - tp->rcv_nxt;
443
444 /* Signal handle_ddp() to break out of its sleep loop. */
445 toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
446 if (len == 0)
447 return;
448
449 tp->rcv_nxt += len;
450 KASSERT(toep->sb_cc >= sb->sb_cc,
451 ("%s: sb %p has more data (%d) than last time (%d).",
452 __func__, sb, sb->sb_cc, toep->sb_cc));
453 toep->rx_credits += toep->sb_cc - sb->sb_cc;
454 #ifdef USE_DDP_RX_FLOW_CONTROL
455 toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */
456 #endif
457
458 m = get_ddp_mbuf(len);
459
460 sbappendstream_locked(sb, m);
461 toep->sb_cc = sb->sb_cc;
462 }
463
464 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
465 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
466 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
467 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
468
469 extern cpl_handler_t t4_cpl_handler[];
470
471 static int
do_rx_data_ddp(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)472 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
473 {
474 struct adapter *sc = iq->adapter;
475 const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
476 unsigned int tid = GET_TID(cpl);
477 uint32_t vld;
478 struct toepcb *toep = lookup_tid(sc, tid);
479
480 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
481 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
482 KASSERT(!(toep->flags & TPF_SYNQE),
483 ("%s: toep %p claims to be a synq entry", __func__, toep));
484
485 vld = be32toh(cpl->ddpvld);
486 if (__predict_false(vld & DDP_ERR)) {
487 panic("%s: DDP error 0x%x (tid %d, toep %p)",
488 __func__, vld, tid, toep);
489 }
490
491 if (toep->ulp_mode == ULP_MODE_ISCSI) {
492 t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
493 return (0);
494 }
495
496 handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
497
498 return (0);
499 }
500
501 static int
do_rx_ddp_complete(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)502 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
503 struct mbuf *m)
504 {
505 struct adapter *sc = iq->adapter;
506 const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
507 unsigned int tid = GET_TID(cpl);
508 struct toepcb *toep = lookup_tid(sc, tid);
509
510 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
511 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
512 KASSERT(!(toep->flags & TPF_SYNQE),
513 ("%s: toep %p claims to be a synq entry", __func__, toep));
514
515 handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
516
517 return (0);
518 }
519
520 void
enable_ddp(struct adapter * sc,struct toepcb * toep)521 enable_ddp(struct adapter *sc, struct toepcb *toep)
522 {
523
524 KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
525 ("%s: toep %p has bad ddp_flags 0x%x",
526 __func__, toep, toep->ddp_flags));
527
528 CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
529 __func__, toep->tid, time_uptime);
530
531 toep->ddp_flags |= DDP_SC_REQ;
532 t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
533 V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
534 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
535 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
536 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
537 toep->ofld_rxq->iq.abs_id);
538 t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
539 V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
540 }
541
542 static inline void
disable_ddp(struct adapter * sc,struct toepcb * toep)543 disable_ddp(struct adapter *sc, struct toepcb *toep)
544 {
545
546 KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
547 ("%s: toep %p has bad ddp_flags 0x%x",
548 __func__, toep, toep->ddp_flags));
549
550 CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
551 __func__, toep->tid, time_uptime);
552
553 toep->ddp_flags |= DDP_SC_REQ;
554 t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
555 V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1), 0, 0,
556 toep->ofld_rxq->iq.abs_id);
557 t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
558 V_TF_DDP_OFF(1), V_TF_DDP_OFF(1), 0, 0, toep->ofld_rxq->iq.abs_id);
559 }
560
561 static int
hold_uio(struct uio * uio,vm_page_t ** ppages,int * pnpages)562 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
563 {
564 struct vm_map *map;
565 struct iovec *iov;
566 vm_offset_t start, end;
567 vm_page_t *pp;
568 int n;
569
570 KASSERT(uio->uio_iovcnt == 1,
571 ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
572 KASSERT(uio->uio_td->td_proc == curproc,
573 ("%s: uio proc (%p) is not curproc (%p)",
574 __func__, uio->uio_td->td_proc, curproc));
575
576 map = &curproc->p_vmspace->vm_map;
577 iov = &uio->uio_iov[0];
578 start = trunc_page((uintptr_t)iov->iov_base);
579 end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
580 n = howmany(end - start, PAGE_SIZE);
581
582 if (end - start > MAX_DDP_BUFFER_SIZE)
583 return (E2BIG);
584
585 pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
586 if (pp == NULL)
587 return (ENOMEM);
588
589 if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
590 iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
591 free(pp, M_CXGBE);
592 return (EFAULT);
593 }
594
595 *ppages = pp;
596 *pnpages = n;
597
598 return (0);
599 }
600
601 static int
bufcmp(struct ddp_buffer * db,vm_page_t * pages,int npages,int offset,int len)602 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
603 {
604 int i;
605
606 if (db == NULL || db->npages != npages || db->offset != offset ||
607 db->len != len)
608 return (1);
609
610 for (i = 0; i < npages; i++) {
611 if (pages[i]->phys_addr != db->pages[i]->phys_addr)
612 return (1);
613 }
614
615 return (0);
616 }
617
/*
 * Highest common factor of n1 and n2, computed with Euclid's algorithm.
 * If one argument is 0 the other is returned; (0, 0) yields 0.
 */
static int
calculate_hcf(int n1, int n2)
{
	int small, large, rem;

	/* Start with the smaller value as the divisor. */
	small = (n1 <= n2) ? n1 : n2;
	large = (n1 <= n2) ? n2 : n1;

	for (; small != 0; small = rem) {
		rem = large % small;
		large = small;
	}

	return (large);
}
639
640 static inline int
pages_to_nppods(int npages,int ddp_page_shift)641 pages_to_nppods(int npages, int ddp_page_shift)
642 {
643
644 MPASS(ddp_page_shift >= PAGE_SHIFT);
645
646 return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
647 }
648
649 static int
alloc_page_pods(struct ppod_region * pr,u_int nppods,u_int pgsz_idx,struct ppod_reservation * prsv)650 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
651 struct ppod_reservation *prsv)
652 {
653 vmem_addr_t addr; /* relative to start of region */
654
655 if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
656 &addr) != 0)
657 return (ENOMEM);
658
659 CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
660 __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
661 nppods, 1 << pr->pr_page_shift[pgsz_idx]);
662
663 /*
664 * The hardware tagmask includes an extra invalid bit but the arena was
665 * seeded with valid values only. An allocation out of this arena will
666 * fit inside the tagmask but won't have the invalid bit set.
667 */
668 MPASS((addr & pr->pr_tag_mask) == addr);
669 MPASS((addr & pr->pr_invalid_bit) == 0);
670
671 prsv->prsv_pr = pr;
672 prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
673 prsv->prsv_nppods = nppods;
674
675 return (0);
676 }
677
678 int
t4_alloc_page_pods_for_db(struct ppod_region * pr,struct ddp_buffer * db)679 t4_alloc_page_pods_for_db(struct ppod_region *pr, struct ddp_buffer *db)
680 {
681 int i, hcf, seglen, idx, nppods;
682 struct ppod_reservation *prsv = &db->prsv;
683
684 KASSERT(prsv->prsv_nppods == 0,
685 ("%s: page pods already allocated", __func__));
686
687 /*
688 * The DDP page size is unrelated to the VM page size. We combine
689 * contiguous physical pages into larger segments to get the best DDP
690 * page size possible. This is the largest of the four sizes in
691 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
692 * the page list.
693 */
694 hcf = 0;
695 for (i = 0; i < db->npages; i++) {
696 seglen = PAGE_SIZE;
697 while (i < db->npages - 1 &&
698 db->pages[i]->phys_addr + PAGE_SIZE ==
699 db->pages[i + 1]->phys_addr) {
700 seglen += PAGE_SIZE;
701 i++;
702 }
703
704 hcf = calculate_hcf(hcf, seglen);
705 if (hcf < (1 << pr->pr_page_shift[1])) {
706 idx = 0;
707 goto have_pgsz; /* give up, short circuit */
708 }
709 }
710
711 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
712 MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
713 for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
714 if ((hcf & PR_PAGE_MASK(idx)) == 0)
715 break;
716 }
717 #undef PR_PAGE_MASK
718
719 have_pgsz:
720 MPASS(idx <= M_PPOD_PGSZ);
721
722 nppods = pages_to_nppods(db->npages, pr->pr_page_shift[idx]);
723 if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
724 return (0);
725 MPASS(prsv->prsv_nppods > 0);
726
727 return (1);
728 }
729
730 int
t4_alloc_page_pods_for_buf(struct ppod_region * pr,vm_offset_t buf,int len,struct ppod_reservation * prsv)731 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
732 struct ppod_reservation *prsv)
733 {
734 int hcf, seglen, idx, npages, nppods;
735 uintptr_t start_pva, end_pva, pva, p1;
736
737 MPASS(buf > 0);
738 MPASS(len > 0);
739
740 /*
741 * The DDP page size is unrelated to the VM page size. We combine
742 * contiguous physical pages into larger segments to get the best DDP
743 * page size possible. This is the largest of the four sizes in
744 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
745 * in the page list.
746 */
747 hcf = 0;
748 start_pva = trunc_page(buf);
749 end_pva = trunc_page(buf + len - 1);
750 pva = start_pva;
751 while (pva <= end_pva) {
752 seglen = PAGE_SIZE;
753 p1 = pmap_kextract(pva);
754 pva += PAGE_SIZE;
755 while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
756 seglen += PAGE_SIZE;
757 pva += PAGE_SIZE;
758 }
759
760 hcf = calculate_hcf(hcf, seglen);
761 if (hcf < (1 << pr->pr_page_shift[1])) {
762 idx = 0;
763 goto have_pgsz; /* give up, short circuit */
764 }
765 }
766
767 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
768 MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
769 for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
770 if ((hcf & PR_PAGE_MASK(idx)) == 0)
771 break;
772 }
773 #undef PR_PAGE_MASK
774
775 have_pgsz:
776 MPASS(idx <= M_PPOD_PGSZ);
777
778 npages = 1;
779 npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
780 nppods = howmany(npages, PPOD_PAGES);
781 if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
782 return (ENOMEM);
783 MPASS(prsv->prsv_nppods > 0);
784
785 return (0);
786 }
787
788 void
t4_free_page_pods(struct ppod_reservation * prsv)789 t4_free_page_pods(struct ppod_reservation *prsv)
790 {
791 struct ppod_region *pr = prsv->prsv_pr;
792 vmem_addr_t addr;
793
794 MPASS(prsv != NULL);
795 MPASS(prsv->prsv_nppods != 0);
796
797 addr = prsv->prsv_tag & pr->pr_tag_mask;
798 MPASS((addr & pr->pr_invalid_bit) == 0);
799
800 CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
801 pr->pr_arena, addr, prsv->prsv_nppods);
802
803 vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
804 prsv->prsv_nppods = 0;
805 }
806
807 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
808
809 int
t4_write_page_pods_for_db(struct adapter * sc,struct sge_wrq * wrq,int tid,struct ddp_buffer * db)810 t4_write_page_pods_for_db(struct adapter *sc, struct sge_wrq *wrq, int tid,
811 struct ddp_buffer *db)
812 {
813 struct wrqe *wr;
814 struct ulp_mem_io *ulpmc;
815 struct ulptx_idata *ulpsc;
816 struct pagepod *ppod;
817 int i, j, k, n, chunk, len, ddp_pgsz, idx;
818 u_int ppod_addr;
819 uint32_t cmd;
820 struct ppod_reservation *prsv = &db->prsv;
821 struct ppod_region *pr = prsv->prsv_pr;
822
823 MPASS(prsv->prsv_nppods > 0);
824
825 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
826 if (is_t4(sc))
827 cmd |= htobe32(F_ULP_MEMIO_ORDER);
828 else
829 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
830 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
831 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
832 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
833
834 /* How many page pods are we writing in this cycle */
835 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
836 chunk = PPOD_SZ(n);
837 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
838
839 wr = alloc_wrqe(len, wrq);
840 if (wr == NULL)
841 return (ENOMEM); /* ok to just bail out */
842 ulpmc = wrtod(wr);
843
844 INIT_ULPTX_WR(ulpmc, len, 0, 0);
845 ulpmc->cmd = cmd;
846 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
847 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
848 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
849
850 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
851 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
852 ulpsc->len = htobe32(chunk);
853
854 ppod = (struct pagepod *)(ulpsc + 1);
855 for (j = 0; j < n; i++, j++, ppod++) {
856 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
857 V_PPOD_TID(tid) | prsv->prsv_tag);
858 ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
859 V_PPOD_OFST(db->offset));
860 ppod->rsvd = 0;
861 idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
862 for (k = 0; k < nitems(ppod->addr); k++) {
863 if (idx < db->npages) {
864 ppod->addr[k] =
865 htobe64(db->pages[idx]->phys_addr);
866 idx += ddp_pgsz / PAGE_SIZE;
867 } else
868 ppod->addr[k] = 0;
869 #if 0
870 CTR5(KTR_CXGBE,
871 "%s: tid %d ppod[%d]->addr[%d] = %p",
872 __func__, toep->tid, i, k,
873 htobe64(ppod->addr[k]));
874 #endif
875 }
876
877 }
878
879 t4_wrq_tx(sc, wr);
880 }
881
882 return (0);
883 }
884
885 int
t4_write_page_pods_for_buf(struct adapter * sc,struct sge_wrq * wrq,int tid,struct ppod_reservation * prsv,vm_offset_t buf,int buflen)886 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
887 struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
888 {
889 struct wrqe *wr;
890 struct ulp_mem_io *ulpmc;
891 struct ulptx_idata *ulpsc;
892 struct pagepod *ppod;
893 int i, j, k, n, chunk, len, ddp_pgsz;
894 u_int ppod_addr, offset;
895 uint32_t cmd;
896 struct ppod_region *pr = prsv->prsv_pr;
897 uintptr_t end_pva, pva, pa;
898
899 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
900 if (is_t4(sc))
901 cmd |= htobe32(F_ULP_MEMIO_ORDER);
902 else
903 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
904 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
905 offset = buf & PAGE_MASK;
906 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
907 pva = trunc_page(buf);
908 end_pva = trunc_page(buf + buflen - 1);
909 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
910
911 /* How many page pods are we writing in this cycle */
912 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
913 MPASS(n > 0);
914 chunk = PPOD_SZ(n);
915 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
916
917 wr = alloc_wrqe(len, wrq);
918 if (wr == NULL)
919 return (ENOMEM); /* ok to just bail out */
920 ulpmc = wrtod(wr);
921
922 INIT_ULPTX_WR(ulpmc, len, 0, 0);
923 ulpmc->cmd = cmd;
924 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
925 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
926 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
927
928 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
929 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
930 ulpsc->len = htobe32(chunk);
931
932 ppod = (struct pagepod *)(ulpsc + 1);
933 for (j = 0; j < n; i++, j++, ppod++) {
934 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
935 V_PPOD_TID(tid) |
936 (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
937 ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
938 V_PPOD_OFST(offset));
939 ppod->rsvd = 0;
940
941 for (k = 0; k < nitems(ppod->addr); k++) {
942 if (pva > end_pva)
943 ppod->addr[k] = 0;
944 else {
945 pa = pmap_kextract(pva);
946 ppod->addr[k] = htobe64(pa);
947 pva += ddp_pgsz;
948 }
949 #if 0
950 CTR5(KTR_CXGBE,
951 "%s: tid %d ppod[%d]->addr[%d] = %p",
952 __func__, tid, i, k,
953 htobe64(ppod->addr[k]));
954 #endif
955 }
956
957 /*
958 * Walk back 1 segment so that the first address in the
959 * next pod is the same as the last one in the current
960 * pod.
961 */
962 pva -= ddp_pgsz;
963 }
964
965 t4_wrq_tx(sc, wr);
966 }
967
968 MPASS(pva <= end_pva);
969
970 return (0);
971 }
972
973 /*
974 * Reuse, or allocate (and program the page pods for) a new DDP buffer. The
975 * "pages" array is handed over to this function and should not be used in any
976 * way by the caller after that.
977 */
978 static int
select_ddp_buffer(struct adapter * sc,struct toepcb * toep,vm_page_t * pages,int npages,int db_off,int db_len)979 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
980 int npages, int db_off, int db_len)
981 {
982 struct ddp_buffer *db;
983 struct tom_data *td = sc->tom_softc;
984 int i, empty_slot = -1;
985
986 /* Try to reuse */
987 for (i = 0; i < nitems(toep->db); i++) {
988 if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
989 free(pages, M_CXGBE);
990 return (i); /* pages still held */
991 } else if (toep->db[i] == NULL && empty_slot < 0)
992 empty_slot = i;
993 }
994
995 /* Allocate new buffer, write its page pods. */
996 db = alloc_ddp_buffer(pages, npages, db_off, db_len);
997 if (db == NULL) {
998 vm_page_unhold_pages(pages, npages);
999 free(pages, M_CXGBE);
1000 return (-1);
1001 }
1002 if (t4_alloc_page_pods_for_db(&td->pr, db)) {
1003 vm_page_unhold_pages(pages, npages);
1004 free_ddp_buffer(db);
1005 return (-1);
1006 }
1007 if (t4_write_page_pods_for_db(sc, toep->ctrlq, toep->tid, db) != 0) {
1008 vm_page_unhold_pages(pages, npages);
1009 free_ddp_buffer(db);
1010 return (-1);
1011 }
1012
1013 i = empty_slot;
1014 if (i < 0) {
1015 i = arc4random() % nitems(toep->db);
1016 free_ddp_buffer(toep->db[i]);
1017 }
1018 toep->db[i] = db;
1019
1020 CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
1021 __func__, toep->tid, i, db, db->prsv.prsv_tag);
1022
1023 return (i);
1024 }
1025
1026 static void
wire_ddp_buffer(struct ddp_buffer * db)1027 wire_ddp_buffer(struct ddp_buffer *db)
1028 {
1029 int i;
1030 vm_page_t p;
1031
1032 for (i = 0; i < db->npages; i++) {
1033 p = db->pages[i];
1034 vm_page_lock(p);
1035 vm_page_wire(p);
1036 vm_page_unhold(p);
1037 vm_page_unlock(p);
1038 }
1039 }
1040
1041 static void
unwire_ddp_buffer(struct ddp_buffer * db)1042 unwire_ddp_buffer(struct ddp_buffer *db)
1043 {
1044 int i;
1045 vm_page_t p;
1046
1047 for (i = 0; i < db->npages; i++) {
1048 p = db->pages[i];
1049 vm_page_lock(p);
1050 vm_page_unwire(p, 0);
1051 vm_page_unlock(p);
1052 }
1053 }
1054
1055 static int
handle_ddp(struct socket * so,struct uio * uio,int flags,int error)1056 handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
1057 {
1058 struct sockbuf *sb = &so->so_rcv;
1059 struct tcpcb *tp = so_sototcpcb(so);
1060 struct toepcb *toep = tp->t_toe;
1061 struct adapter *sc = td_adapter(toep->td);
1062 vm_page_t *pages;
1063 int npages, db_idx, rc, buf_flag;
1064 struct ddp_buffer *db;
1065 struct wrqe *wr;
1066 uint64_t ddp_flags;
1067
1068 SOCKBUF_LOCK_ASSERT(sb);
1069
1070 #if 0
1071 if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
1072 CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
1073 __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
1074 }
1075 #endif
1076
1077 /* XXX: too eager to disable DDP, could handle NBIO better than this. */
1078 if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
1079 uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
1080 so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
1081 error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
1082 goto no_ddp;
1083
1084 /*
1085 * Fault in and then hold the pages of the uio buffers. We'll wire them
1086 * a bit later if everything else works out.
1087 */
1088 SOCKBUF_UNLOCK(sb);
1089 if (hold_uio(uio, &pages, &npages) != 0) {
1090 SOCKBUF_LOCK(sb);
1091 goto no_ddp;
1092 }
1093 SOCKBUF_LOCK(sb);
1094 if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
1095 vm_page_unhold_pages(pages, npages);
1096 free(pages, M_CXGBE);
1097 goto no_ddp;
1098 }
1099
1100 /*
1101 * Figure out which one of the two DDP buffers to use this time.
1102 */
1103 db_idx = select_ddp_buffer(sc, toep, pages, npages,
1104 (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
1105 pages = NULL; /* handed off to select_ddp_buffer */
1106 if (db_idx < 0)
1107 goto no_ddp;
1108 db = toep->db[db_idx];
1109 buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;
1110
1111 /*
1112 * Build the compound work request that tells the chip where to DMA the
1113 * payload.
1114 */
1115 ddp_flags = select_ddp_flags(so, flags, db_idx);
1116 wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
1117 if (wr == NULL) {
1118 /*
1119 * Just unhold the pages. The DDP buffer's software state is
1120 * left as-is in the toep. The page pods were written
1121 * successfully and we may have an opportunity to use it in the
1122 * future.
1123 */
1124 vm_page_unhold_pages(db->pages, db->npages);
1125 goto no_ddp;
1126 }
1127
1128 /* Wire (and then unhold) the pages, and give the chip the go-ahead. */
1129 wire_ddp_buffer(db);
1130 t4_wrq_tx(sc, wr);
1131 sb->sb_flags &= ~SB_DDP_INDICATE;
1132 toep->ddp_flags |= buf_flag;
1133
1134 /*
1135 * Wait for the DDP operation to complete and then unwire the pages.
1136 * The return code from the sbwait will be the final return code of this
1137 * function. But we do need to wait for DDP no matter what.
1138 */
1139 rc = sbwait(sb);
1140 while (toep->ddp_flags & buf_flag) {
1141 sb->sb_flags |= SB_WAIT;
1142 msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0);
1143 }
1144 unwire_ddp_buffer(db);
1145 return (rc);
1146 no_ddp:
1147 disable_ddp(sc, toep);
1148 discourage_ddp(toep);
1149 sb->sb_flags &= ~SB_DDP_INDICATE;
1150 return (0);
1151 }
1152
1153 int
t4_init_ppod_region(struct ppod_region * pr,struct t4_range * r,u_int psz,const char * name)1154 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
1155 const char *name)
1156 {
1157 int i;
1158
1159 MPASS(pr != NULL);
1160 MPASS(r->size > 0);
1161
1162 pr->pr_start = r->start;
1163 pr->pr_len = r->size;
1164 pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
1165 pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
1166 pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
1167 pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
1168
1169 /* The SGL -> page pod algorithm requires the sizes to be in order. */
1170 for (i = 1; i < nitems(pr->pr_page_shift); i++) {
1171 if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
1172 return (ENXIO);
1173 }
1174
1175 pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
1176 pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
1177 if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
1178 return (ENXIO);
1179 pr->pr_alias_shift = fls(pr->pr_tag_mask);
1180 pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
1181
1182 pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
1183 M_FIRSTFIT | M_NOWAIT);
1184 if (pr->pr_arena == NULL)
1185 return (ENOMEM);
1186
1187 return (0);
1188 }
1189
1190 void
t4_free_ppod_region(struct ppod_region * pr)1191 t4_free_ppod_region(struct ppod_region *pr)
1192 {
1193
1194 MPASS(pr != NULL);
1195
1196 if (pr->pr_arena)
1197 vmem_destroy(pr->pr_arena);
1198 bzero(pr, sizeof(*pr));
1199 }
1200
/*
 * Assert that curvnet is set while working on socket "so".  The expansion
 * already ends in a semicolon.
 */
#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/* Flags for sblock(): SBL_WAIT unless the receive flags have MSG_DONTWAIT. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Out-of-band receive.  Not implemented for this receive path; the body only
 * invokes CXGBE_UNIMPLEMENTED with the function name.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	CXGBE_UNIMPLEMENTED(__func__);
}
1211
/*
 * Marker storage for DDP mbufs: get_ddp_mbuf points m_data at this string and
 * is_ddp_mbuf recognizes such mbufs by comparing m_data with its address.
 */
static char ddp_magic_str[] = "nothing to see here";
1213
1214 static struct mbuf *
get_ddp_mbuf(int len)1215 get_ddp_mbuf(int len)
1216 {
1217 struct mbuf *m;
1218
1219 m = m_get(M_NOWAIT, MT_DATA);
1220 if (m == NULL)
1221 CXGBE_UNIMPLEMENTED("mbuf alloc failure");
1222 m->m_len = len;
1223 m->m_data = &ddp_magic_str[0];
1224
1225 return (m);
1226 }
1227
1228 static inline int
is_ddp_mbuf(struct mbuf * m)1229 is_ddp_mbuf(struct mbuf *m)
1230 {
1231
1232 return (m->m_data == &ddp_magic_str[0]);
1233 }
1234
1235 /*
1236 * Copy an mbuf chain into a uio limited by len if set.
1237 */
1238 static int
m_mbuftouio_ddp(struct uio * uio,struct mbuf * m,int len)1239 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
1240 {
1241 int error, length, total;
1242 int progress = 0;
1243
1244 if (len > 0)
1245 total = min(uio->uio_resid, len);
1246 else
1247 total = uio->uio_resid;
1248
1249 /* Fill the uio with data from the mbufs. */
1250 for (; m != NULL; m = m->m_next) {
1251 length = min(m->m_len, total - progress);
1252
1253 if (is_ddp_mbuf(m)) {
1254 enum uio_seg segflag = uio->uio_segflg;
1255
1256 uio->uio_segflg = UIO_NOCOPY;
1257 error = uiomove(mtod(m, void *), length, uio);
1258 uio->uio_segflg = segflag;
1259 } else
1260 error = uiomove(mtod(m, void *), length, uio);
1261 if (error)
1262 return (error);
1263
1264 progress += length;
1265 }
1266
1267 return (0);
1268 }
1269
1270 /*
1271 * Based on soreceive_stream() in uipc_socket.c
1272 */
1273 int
t4_soreceive_ddp(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1274 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
1275 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1276 {
1277 int len = 0, error = 0, flags, oresid, ddp_handled = 0;
1278 struct sockbuf *sb;
1279 struct mbuf *m, *n = NULL;
1280
1281 /* We only do stream sockets. */
1282 if (so->so_type != SOCK_STREAM)
1283 return (EINVAL);
1284 if (psa != NULL)
1285 *psa = NULL;
1286 if (controlp != NULL)
1287 return (EINVAL);
1288 if (flagsp != NULL)
1289 flags = *flagsp &~ MSG_EOR;
1290 else
1291 flags = 0;
1292 if (flags & MSG_OOB)
1293 return (soreceive_rcvoob(so, uio, flags));
1294 if (mp0 != NULL)
1295 *mp0 = NULL;
1296
1297 sb = &so->so_rcv;
1298
1299 /* Prevent other readers from entering the socket. */
1300 error = sblock(sb, SBLOCKWAIT(flags));
1301 SOCKBUF_LOCK(sb);
1302 if (error)
1303 goto out;
1304
1305 /* Easy one, no space to copyout anything. */
1306 if (uio->uio_resid == 0) {
1307 error = EINVAL;
1308 goto out;
1309 }
1310 oresid = uio->uio_resid;
1311
1312 /* We will never ever get anything unless we are or were connected. */
1313 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1314 error = ENOTCONN;
1315 goto out;
1316 }
1317
1318 restart:
1319 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1320
1321 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1322
1323 /* uio should be just as it was at entry */
1324 KASSERT(oresid == uio->uio_resid,
1325 ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
1326 __func__, oresid, uio->uio_resid, sb->sb_cc));
1327
1328 error = handle_ddp(so, uio, flags, 0);
1329 ddp_handled = 1;
1330 if (error)
1331 goto out;
1332 }
1333
1334 /* Abort if socket has reported problems. */
1335 if (so->so_error) {
1336 if (sb->sb_cc > 0)
1337 goto deliver;
1338 if (oresid > uio->uio_resid)
1339 goto out;
1340 error = so->so_error;
1341 if (!(flags & MSG_PEEK))
1342 so->so_error = 0;
1343 goto out;
1344 }
1345
1346 /* Door is closed. Deliver what is left, if any. */
1347 if (sb->sb_state & SBS_CANTRCVMORE) {
1348 if (sb->sb_cc > 0)
1349 goto deliver;
1350 else
1351 goto out;
1352 }
1353
1354 /* Socket buffer is empty and we shall not block. */
1355 if (sb->sb_cc == 0 &&
1356 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1357 error = EAGAIN;
1358 goto out;
1359 }
1360
1361 /* Socket buffer got some data that we shall deliver now. */
1362 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1363 ((so->so_state & SS_NBIO) ||
1364 (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1365 sb->sb_cc >= sb->sb_lowat ||
1366 sb->sb_cc >= uio->uio_resid ||
1367 sb->sb_cc >= sb->sb_hiwat) ) {
1368 goto deliver;
1369 }
1370
1371 /* On MSG_WAITALL we must wait until all data or error arrives. */
1372 if ((flags & MSG_WAITALL) &&
1373 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1374 goto deliver;
1375
1376 /*
1377 * Wait and block until (more) data comes in.
1378 * NB: Drops the sockbuf lock during wait.
1379 */
1380 error = sbwait(sb);
1381 if (error) {
1382 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1383 (void) handle_ddp(so, uio, flags, 1);
1384 ddp_handled = 1;
1385 }
1386 goto out;
1387 }
1388 goto restart;
1389
1390 deliver:
1391 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1392 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1393 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1394
1395 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
1396 goto restart;
1397
1398 /* Statistics. */
1399 if (uio->uio_td)
1400 uio->uio_td->td_ru.ru_msgrcv++;
1401
1402 /* Fill uio until full or current end of socket buffer is reached. */
1403 len = min(uio->uio_resid, sb->sb_cc);
1404 if (mp0 != NULL) {
1405 /* Dequeue as many mbufs as possible. */
1406 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1407 for (*mp0 = m = sb->sb_mb;
1408 m != NULL && m->m_len <= len;
1409 m = m->m_next) {
1410 len -= m->m_len;
1411 uio->uio_resid -= m->m_len;
1412 sbfree(sb, m);
1413 n = m;
1414 }
1415 sb->sb_mb = m;
1416 if (sb->sb_mb == NULL)
1417 SB_EMPTY_FIXUP(sb);
1418 n->m_next = NULL;
1419 }
1420 /* Copy the remainder. */
1421 if (len > 0) {
1422 KASSERT(sb->sb_mb != NULL,
1423 ("%s: len > 0 && sb->sb_mb empty", __func__));
1424
1425 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1426 if (m == NULL)
1427 len = 0; /* Don't flush data from sockbuf. */
1428 else
1429 uio->uio_resid -= m->m_len;
1430 if (*mp0 != NULL)
1431 n->m_next = m;
1432 else
1433 *mp0 = m;
1434 if (*mp0 == NULL) {
1435 error = ENOBUFS;
1436 goto out;
1437 }
1438 }
1439 } else {
1440 /* NB: Must unlock socket buffer as uiomove may sleep. */
1441 SOCKBUF_UNLOCK(sb);
1442 error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
1443 SOCKBUF_LOCK(sb);
1444 if (error)
1445 goto out;
1446 }
1447 SBLASTRECORDCHK(sb);
1448 SBLASTMBUFCHK(sb);
1449
1450 /*
1451 * Remove the delivered data from the socket buffer unless we
1452 * were only peeking.
1453 */
1454 if (!(flags & MSG_PEEK)) {
1455 if (len > 0)
1456 sbdrop_locked(sb, len);
1457
1458 /* Notify protocol that we drained some data. */
1459 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
1460 (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
1461 !(flags & MSG_SOCALLBCK))) {
1462 SOCKBUF_UNLOCK(sb);
1463 VNET_SO_ASSERT(so);
1464 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1465 SOCKBUF_LOCK(sb);
1466 }
1467 }
1468
1469 /*
1470 * For MSG_WAITALL we may have to loop again and wait for
1471 * more data to come in.
1472 */
1473 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1474 goto restart;
1475 out:
1476 SOCKBUF_LOCK_ASSERT(sb);
1477 SBLASTRECORDCHK(sb);
1478 SBLASTMBUFCHK(sb);
1479 SOCKBUF_UNLOCK(sb);
1480 sbunlock(sb);
1481 return (error);
1482 }
1483
1484 int
t4_ddp_mod_load(void)1485 t4_ddp_mod_load(void)
1486 {
1487
1488 t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
1489 t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
1490 return (0);
1491 }
1492
1493 void
t4_ddp_mod_unload(void)1494 t4_ddp_mod_unload(void)
1495 {
1496
1497 t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
1498 t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
1499 }
1500 #endif
1501