1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
5 * All rights reserved.
6 * Written by: Navdeep Parhar <np@FreeBSD.org>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_kern_tls.h"
34 #include "opt_ratelimit.h"
35
36 #ifdef TCP_OFFLOAD
37 #include <sys/param.h>
38 #include <sys/aio.h>
39 #include <sys/file.h>
40 #include <sys/kernel.h>
41 #include <sys/ktr.h>
42 #include <sys/module.h>
43 #include <sys/proc.h>
44 #include <sys/protosw.h>
45 #include <sys/domain.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sglist.h>
49 #include <sys/taskqueue.h>
50 #include <netinet/in.h>
51 #include <netinet/in_pcb.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip6.h>
54 #define TCPSTATES
55 #include <netinet/tcp_fsm.h>
56 #include <netinet/tcp_seq.h>
57 #include <netinet/tcp_var.h>
58 #include <netinet/toecore.h>
59
60 #include <security/mac/mac_framework.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_extern.h>
64 #include <vm/pmap.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_page.h>
67
68 #include <dev/iscsi/iscsi_proto.h>
69
70 #include "common/common.h"
71 #include "common/t4_msg.h"
72 #include "common/t4_regs.h"
73 #include "common/t4_tcb.h"
74 #include "tom/t4_tom_l2t.h"
75 #include "tom/t4_tom.h"
76
77 static void t4_aiotx_cancel(struct kaiocb *job);
78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
79
80 void
send_flowc_wr(struct toepcb * toep,struct tcpcb * tp)81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
82 {
83 struct wrqe *wr;
84 struct fw_flowc_wr *flowc;
85 unsigned int nparams, flowclen, paramidx;
86 struct vi_info *vi = toep->vi;
87 struct port_info *pi = vi->pi;
88 struct adapter *sc = pi->adapter;
89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
91
92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
93 ("%s: flowc for tid %u sent already", __func__, toep->tid));
94
95 if (tp != NULL)
96 nparams = 8;
97 else
98 nparams = 6;
99 if (ulp_mode(toep) == ULP_MODE_TLS)
100 nparams++;
101 if (toep->tls.fcplenmax != 0)
102 nparams++;
103 if (toep->params.tc_idx != -1) {
104 MPASS(toep->params.tc_idx >= 0 &&
105 toep->params.tc_idx < sc->params.nsched_cls);
106 nparams++;
107 }
108
109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
110
111 wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
112 if (wr == NULL) {
113 /* XXX */
114 panic("%s: allocation failure.", __func__);
115 }
116 flowc = wrtod(wr);
117 memset(flowc, 0, wr->wr_len);
118
119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
120 V_FW_FLOWC_WR_NPARAMS(nparams));
121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
122 V_FW_WR_FLOWID(toep->tid));
123
124 #define FLOWC_PARAM(__m, __v) \
125 do { \
126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
127 flowc->mnemval[paramidx].val = htobe32(__v); \
128 paramidx++; \
129 } while (0)
130
131 paramidx = 0;
132
133 FLOWC_PARAM(PFNVFN, pfvf);
134 FLOWC_PARAM(CH, pi->tx_chan);
135 FLOWC_PARAM(PORT, pi->tx_chan);
136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
138 if (tp) {
139 FLOWC_PARAM(MSS, toep->params.emss);
140 FLOWC_PARAM(SNDNXT, tp->snd_nxt);
141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
142 } else
143 FLOWC_PARAM(MSS, 512);
144 CTR6(KTR_CXGBE,
145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
148
149 if (ulp_mode(toep) == ULP_MODE_TLS)
150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
151 if (toep->tls.fcplenmax != 0)
152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
153 if (toep->params.tc_idx != -1)
154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
155 #undef FLOWC_PARAM
156
157 KASSERT(paramidx == nparams, ("nparams mismatch"));
158
159 txsd->tx_credits = howmany(flowclen, 16);
160 txsd->plen = 0;
161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
162 ("%s: not enough credits (%d)", __func__, toep->tx_credits));
163 toep->tx_credits -= txsd->tx_credits;
164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
165 toep->txsd_pidx = 0;
166 toep->txsd_avail--;
167
168 toep->flags |= TPF_FLOWC_WR_SENT;
169 t4_wrq_tx(sc, wr);
170 }
171
172 #ifdef RATELIMIT
173 /*
174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
175 */
176 static int
update_tx_rate_limit(struct adapter * sc,struct toepcb * toep,u_int Bps)177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
178 {
179 int tc_idx, rc;
180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
181 const int port_id = toep->vi->pi->port_id;
182
183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
184
185 if (kbps == 0) {
186 /* unbind */
187 tc_idx = -1;
188 } else {
189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
190 if (rc != 0)
191 return (rc);
192 MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
193 }
194
195 if (toep->params.tc_idx != tc_idx) {
196 struct wrqe *wr;
197 struct fw_flowc_wr *flowc;
198 int nparams = 1, flowclen, flowclen16;
199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
200
201 flowclen = sizeof(*flowc) + nparams * sizeof(struct
202 fw_flowc_mnemval);
203 flowclen16 = howmany(flowclen, 16);
204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
205 (wr = alloc_wrqe(roundup2(flowclen, 16),
206 &toep->ofld_txq->wrq)) == NULL) {
207 if (tc_idx >= 0)
208 t4_release_cl_rl(sc, port_id, tc_idx);
209 return (ENOMEM);
210 }
211
212 flowc = wrtod(wr);
213 memset(flowc, 0, wr->wr_len);
214
215 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
216 V_FW_FLOWC_WR_NPARAMS(nparams));
217 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
218 V_FW_WR_FLOWID(toep->tid));
219
220 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
221 if (tc_idx == -1)
222 flowc->mnemval[0].val = htobe32(0xff);
223 else
224 flowc->mnemval[0].val = htobe32(tc_idx);
225
226 txsd->tx_credits = flowclen16;
227 txsd->plen = 0;
228 toep->tx_credits -= txsd->tx_credits;
229 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
230 toep->txsd_pidx = 0;
231 toep->txsd_avail--;
232 t4_wrq_tx(sc, wr);
233 }
234
235 if (toep->params.tc_idx >= 0)
236 t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
237 toep->params.tc_idx = tc_idx;
238
239 return (0);
240 }
241 #endif
242
243 void
send_reset(struct adapter * sc,struct toepcb * toep,uint32_t snd_nxt)244 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
245 {
246 struct wrqe *wr;
247 struct cpl_abort_req *req;
248 int tid = toep->tid;
249 struct inpcb *inp = toep->inp;
250 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */
251
252 INP_WLOCK_ASSERT(inp);
253
254 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
255 __func__, toep->tid,
256 inp->inp_flags & INP_DROPPED ? "inp dropped" :
257 tcpstates[tp->t_state],
258 toep->flags, inp->inp_flags,
259 toep->flags & TPF_ABORT_SHUTDOWN ?
260 " (abort already in progress)" : "");
261
262 if (toep->flags & TPF_ABORT_SHUTDOWN)
263 return; /* abort already in progress */
264
265 toep->flags |= TPF_ABORT_SHUTDOWN;
266
267 KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
268 ("%s: flowc_wr not sent for tid %d.", __func__, tid));
269
270 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
271 if (wr == NULL) {
272 /* XXX */
273 panic("%s: allocation failure.", __func__);
274 }
275 req = wrtod(wr);
276
277 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
278 if (inp->inp_flags & INP_DROPPED)
279 req->rsvd0 = htobe32(snd_nxt);
280 else
281 req->rsvd0 = htobe32(tp->snd_nxt);
282 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
283 req->cmd = CPL_ABORT_SEND_RST;
284
285 /*
286 * XXX: What's the correct way to tell that the inp hasn't been detached
287 * from its socket? Should I even be flushing the snd buffer here?
288 */
289 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
290 struct socket *so = inp->inp_socket;
291
292 if (so != NULL) /* because I'm not sure. See comment above */
293 sbflush(&so->so_snd);
294 }
295
296 t4_l2t_send(sc, wr, toep->l2te);
297 }
298
299 /*
300 * Called when a connection is established to translate the TCP options
301 * reported by HW to FreeBSD's native format.
302 */
303 static void
assign_rxopt(struct tcpcb * tp,uint16_t opt)304 assign_rxopt(struct tcpcb *tp, uint16_t opt)
305 {
306 struct toepcb *toep = tp->t_toe;
307 struct inpcb *inp = tp->t_inpcb;
308 struct adapter *sc = td_adapter(toep->td);
309
310 INP_LOCK_ASSERT(inp);
311
312 toep->params.mtu_idx = G_TCPOPT_MSS(opt);
313 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
314 if (inp->inp_inc.inc_flags & INC_ISIPV6)
315 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
316 else
317 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
318
319 toep->params.emss = tp->t_maxseg;
320 if (G_TCPOPT_TSTAMP(opt)) {
321 toep->params.tstamp = 1;
322 toep->params.emss -= TCPOLEN_TSTAMP_APPA;
323 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */
324 tp->ts_recent = 0; /* hmmm */
325 tp->ts_recent_age = tcp_ts_getticks();
326 } else
327 toep->params.tstamp = 0;
328
329 if (G_TCPOPT_SACK(opt)) {
330 toep->params.sack = 1;
331 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */
332 } else {
333 toep->params.sack = 0;
334 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */
335 }
336
337 if (G_TCPOPT_WSCALE_OK(opt))
338 tp->t_flags |= TF_RCVD_SCALE;
339
340 /* Doing window scaling? */
341 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
342 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
343 tp->rcv_scale = tp->request_r_scale;
344 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
345 } else
346 toep->params.wscale = 0;
347
348 CTR6(KTR_CXGBE,
349 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
350 toep->tid, toep->params.mtu_idx, toep->params.emss,
351 toep->params.tstamp, toep->params.sack, toep->params.wscale);
352 }
353
354 /*
355 * Completes some final bits of initialization for just established connections
356 * and changes their state to TCPS_ESTABLISHED.
357 *
358 * The ISNs are from the exchange of SYNs.
359 */
360 void
make_established(struct toepcb * toep,uint32_t iss,uint32_t irs,uint16_t opt)361 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
362 {
363 struct inpcb *inp = toep->inp;
364 struct socket *so = inp->inp_socket;
365 struct tcpcb *tp = intotcpcb(inp);
366 uint16_t tcpopt = be16toh(opt);
367
368 INP_WLOCK_ASSERT(inp);
369 KASSERT(tp->t_state == TCPS_SYN_SENT ||
370 tp->t_state == TCPS_SYN_RECEIVED,
371 ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
372
373 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
374 __func__, toep->tid, so, inp, tp, toep);
375
376 tcp_state_change(tp, TCPS_ESTABLISHED);
377 tp->t_starttime = ticks;
378 TCPSTAT_INC(tcps_connects);
379
380 tp->irs = irs;
381 tcp_rcvseqinit(tp);
382 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
383 tp->rcv_adv += tp->rcv_wnd;
384 tp->last_ack_sent = tp->rcv_nxt;
385
386 tp->iss = iss;
387 tcp_sendseqinit(tp);
388 tp->snd_una = iss + 1;
389 tp->snd_nxt = iss + 1;
390 tp->snd_max = iss + 1;
391
392 assign_rxopt(tp, tcpopt);
393 send_flowc_wr(toep, tp);
394
395 soisconnected(so);
396
397 if (ulp_mode(toep) == ULP_MODE_TLS)
398 tls_establish(toep);
399 }
400
401 int
send_rx_credits(struct adapter * sc,struct toepcb * toep,int credits)402 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
403 {
404 struct wrqe *wr;
405 struct cpl_rx_data_ack *req;
406 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
407
408 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
409
410 wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
411 if (wr == NULL)
412 return (0);
413 req = wrtod(wr);
414
415 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
416 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
417
418 t4_wrq_tx(sc, wr);
419 return (credits);
420 }
421
422 void
send_rx_modulate(struct adapter * sc,struct toepcb * toep)423 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
424 {
425 struct wrqe *wr;
426 struct cpl_rx_data_ack *req;
427
428 wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
429 if (wr == NULL)
430 return;
431 req = wrtod(wr);
432
433 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
434 req->credit_dack = htobe32(F_RX_MODULATE_RX);
435
436 t4_wrq_tx(sc, wr);
437 }
438
439 void
t4_rcvd_locked(struct toedev * tod,struct tcpcb * tp)440 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
441 {
442 struct adapter *sc = tod->tod_softc;
443 struct inpcb *inp = tp->t_inpcb;
444 struct socket *so = inp->inp_socket;
445 struct sockbuf *sb = &so->so_rcv;
446 struct toepcb *toep = tp->t_toe;
447 int rx_credits;
448
449 INP_WLOCK_ASSERT(inp);
450 SOCKBUF_LOCK_ASSERT(sb);
451
452 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
453 if (rx_credits > 0 &&
454 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
455 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
456 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
457 rx_credits = send_rx_credits(sc, toep, rx_credits);
458 tp->rcv_wnd += rx_credits;
459 tp->rcv_adv += rx_credits;
460 } else if (toep->flags & TPF_FORCE_CREDITS)
461 send_rx_modulate(sc, toep);
462 }
463
464 void
t4_rcvd(struct toedev * tod,struct tcpcb * tp)465 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
466 {
467 struct inpcb *inp = tp->t_inpcb;
468 struct socket *so = inp->inp_socket;
469 struct sockbuf *sb = &so->so_rcv;
470
471 SOCKBUF_LOCK(sb);
472 t4_rcvd_locked(tod, tp);
473 SOCKBUF_UNLOCK(sb);
474 }
475
476 /*
477 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
478 */
479 int
t4_close_conn(struct adapter * sc,struct toepcb * toep)480 t4_close_conn(struct adapter *sc, struct toepcb *toep)
481 {
482 struct wrqe *wr;
483 struct cpl_close_con_req *req;
484 unsigned int tid = toep->tid;
485
486 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
487 toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
488
489 if (toep->flags & TPF_FIN_SENT)
490 return (0);
491
492 KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
493 ("%s: flowc_wr not sent for tid %u.", __func__, tid));
494
495 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
496 if (wr == NULL) {
497 /* XXX */
498 panic("%s: allocation failure.", __func__);
499 }
500 req = wrtod(wr);
501
502 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
503 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
504 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
505 V_FW_WR_FLOWID(tid));
506 req->wr.wr_lo = cpu_to_be64(0);
507 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
508 req->rsvd = 0;
509
510 toep->flags |= TPF_FIN_SENT;
511 toep->flags &= ~TPF_SEND_FIN;
512 t4_l2t_send(sc, wr, toep->l2te);
513
514 return (0);
515 }
516
/*
 * Tx credit limits.  Credits are counted in 16-byte units: a work request of
 * N bytes costs howmany(N, 16) credits.
 */
/* SGE_MAX_WR_LEN expressed in credits. */
#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
/* Credits for a fw_ofld_tx_data_wr header plus one more byte. */
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
/* Credits for a cpl_tx_data_iso. */
#define MIN_ISO_TX_CREDITS (howmany(sizeof(struct cpl_tx_data_iso), 16))
/* Fewest credits a tx data WR needs; a non-zero 'iso' adds the ISO CPL. */
#define MIN_TX_CREDITS(iso) \
	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
522
523 /* Maximum amount of immediate data we could stuff in a WR */
524 static inline int
max_imm_payload(int tx_credits,int iso)525 max_imm_payload(int tx_credits, int iso)
526 {
527 const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
528 const int n = 1; /* Use no more than one desc for imm. data WR */
529
530 KASSERT(tx_credits >= 0 &&
531 tx_credits <= MAX_OFLD_TX_CREDITS,
532 ("%s: %d credits", __func__, tx_credits));
533
534 if (tx_credits < MIN_TX_CREDITS(iso))
535 return (0);
536
537 if (tx_credits >= (n * EQ_ESIZE) / 16)
538 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
539 iso_cpl_size);
540 else
541 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
542 iso_cpl_size);
543 }
544
545 /* Maximum number of SGL entries we could stuff in a WR */
546 static inline int
max_dsgl_nsegs(int tx_credits,int iso)547 max_dsgl_nsegs(int tx_credits, int iso)
548 {
549 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
550 int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);
551
552 KASSERT(tx_credits >= 0 &&
553 tx_credits <= MAX_OFLD_TX_CREDITS,
554 ("%s: %d credits", __func__, tx_credits));
555
556 if (tx_credits < MIN_TX_CREDITS(iso))
557 return (0);
558
559 nseg += 2 * (sge_pair_credits * 16 / 24);
560 if ((sge_pair_credits * 16) % 24 == 16)
561 nseg++;
562
563 return (nseg);
564 }
565
566 static inline void
write_tx_wr(void * dst,struct toepcb * toep,int fw_wr_opcode,unsigned int immdlen,unsigned int plen,uint8_t credits,int shove,int ulp_submode)567 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
568 unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
569 int ulp_submode)
570 {
571 struct fw_ofld_tx_data_wr *txwr = dst;
572
573 txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
574 V_FW_WR_IMMDLEN(immdlen));
575 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
576 V_FW_WR_LEN16(credits));
577 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
578 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
579 txwr->plen = htobe32(plen);
580
581 if (toep->params.tx_align > 0) {
582 if (plen < 2 * toep->params.emss)
583 txwr->lsodisable_to_flags |=
584 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
585 else
586 txwr->lsodisable_to_flags |=
587 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
588 (toep->params.nagle == 0 ? 0 :
589 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
590 }
591 }
592
593 /*
594 * Generate a DSGL from a starting mbuf. The total number of segments and the
595 * maximum segments in any one mbuf are provided.
596 */
597 static void
write_tx_sgl(void * dst,struct mbuf * start,struct mbuf * stop,int nsegs,int n)598 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
599 {
600 struct mbuf *m;
601 struct ulptx_sgl *usgl = dst;
602 int i, j, rc;
603 struct sglist sg;
604 struct sglist_seg segs[n];
605
606 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
607
608 sglist_init(&sg, n, segs);
609 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
610 V_ULPTX_NSGE(nsegs));
611
612 i = -1;
613 for (m = start; m != stop; m = m->m_next) {
614 if (m->m_flags & M_EXTPG)
615 rc = sglist_append_mbuf_epg(&sg, m,
616 mtod(m, vm_offset_t), m->m_len);
617 else
618 rc = sglist_append(&sg, mtod(m, void *), m->m_len);
619 if (__predict_false(rc != 0))
620 panic("%s: sglist_append %d", __func__, rc);
621
622 for (j = 0; j < sg.sg_nseg; i++, j++) {
623 if (i < 0) {
624 usgl->len0 = htobe32(segs[j].ss_len);
625 usgl->addr0 = htobe64(segs[j].ss_paddr);
626 } else {
627 usgl->sge[i / 2].len[i & 1] =
628 htobe32(segs[j].ss_len);
629 usgl->sge[i / 2].addr[i & 1] =
630 htobe64(segs[j].ss_paddr);
631 }
632 #ifdef INVARIANTS
633 nsegs--;
634 #endif
635 }
636 sglist_reset(&sg);
637 }
638 if (i & 1)
639 usgl->sge[i / 2].len[1] = htobe32(0);
640 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
641 __func__, nsegs, start, stop));
642 }
643
644 /*
645 * Max number of SGL entries an offload tx work request can have. This is 41
646 * (1 + 40) for a full 512B work request.
647 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
648 */
649 #define OFLD_SGL_LEN (41)
650
651 /*
652 * Send data and/or a FIN to the peer.
653 *
654 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
655 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that
656 * was transmitted.
657 *
658 * drop indicates the number of bytes that should be dropped from the head of
659 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating
660 * contention on the send buffer lock (before this change it used to do
661 * sowwakeup and then t4_push_frames right after that when recovering from tx
662 * stalls). When drop is set this function MUST drop the bytes and wake up any
663 * writers.
664 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	/* Nothing is transmitted once an abort has been started. */
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	/*
	 * Apply a pending pacing rate change.  The flag is cleared only if
	 * the update succeeded, so a failed update is retried on a later call.
	 */
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	/* Each iteration of this loop builds and sends at most one WR. */
	do {
		/* Limits for this WR, given the credits available right now. */
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits, 0);
		max_nsegs = max_dsgl_nsegs(tx_credits, 0);

		SOCKBUF_LOCK(sb);
		/* Writers are woken up below if any bytes are dropped here. */
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		/* Start with the first mbuf that has not been transmitted. */
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		/* Gather as many ready mbufs as fit in one WR. */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if ((m->m_flags & M_NOTAVAIL) != 0)
				break;
			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				/*
				 * An mbuf with a TLS session switches the
				 * toep to KTLS.  If nothing has been gathered
				 * yet, t4_push_ktls takes over; otherwise
				 * what was gathered so far is sent first.
				 */
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					/*
					 * Either sowwakeup_locked or the
					 * explicit unlock releases the
					 * sockbuf lock; the assertion below
					 * checks that it is no longer held.
					 */
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		/*
		 * Ask for a completion when the send buffer is more than 5/8
		 * full and at least a quarter of sb_hiwat will have been sent
		 * without one.
		 */
		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		/*
		 * Grow an auto-sizing send buffer once it is at least 7/8
		 * full.  Auto-sizing is turned off if the reservation fails.
		 */
		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		/* Both branches leave the sockbuf unlocked (asserted below). */
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
			    ("%s: nothing to send, but m != NULL is ready",
			    __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		/* Shove only if this WR empties the queue and no more is coming. */
		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
			    credits, shove, 0);
			/* The payload is copied in right after the WR header. */
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
			    credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			/*
			 * Zero the 8 bytes that follow the SGL when wr_len is
			 * not a multiple of 16.
			 */
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
			("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		/*
		 * Also ask for a completion when credits are running low and
		 * at least a quarter of the total has gone out without one.
		 */
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		/* RDMA mode always gets a completion. */
		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		/* Record the last mbuf covered by this WR. */
		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		/* Charge the WR to a tx descriptor and advance, wrapping at the end. */
		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}
918
919 static inline void
rqdrop_locked(struct mbufq * q,int plen)920 rqdrop_locked(struct mbufq *q, int plen)
921 {
922 struct mbuf *m;
923
924 while (plen > 0) {
925 m = mbufq_dequeue(q);
926
927 /* Too many credits. */
928 MPASS(m != NULL);
929 M_ASSERTPKTHDR(m);
930
931 /* Partial credits. */
932 MPASS(plen >= m->m_pkthdr.len);
933
934 plen -= m->m_pkthdr.len;
935 m_freem(m);
936 }
937 }
938
939 /*
940 * Not a bit in the TCB, but is a bit in the ulp_submode field of the
941 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
942 */
943 #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
944
945 static void
write_tx_data_iso(void * dst,u_int ulp_submode,uint8_t flags,uint16_t mss,int len,int npdu)946 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
947 int len, int npdu)
948 {
949 struct cpl_tx_data_iso *cpl;
950 unsigned int burst_size;
951 unsigned int last;
952
953 /*
954 * The firmware will set the 'F' bit on the last PDU when
955 * either condition is true:
956 *
957 * - this large PDU is marked as the "last" slice
958 *
959 * - the amount of data payload bytes equals the burst_size
960 *
961 * The strategy used here is to always set the burst_size
962 * artificially high (len includes the size of the template
963 * BHS) and only set the "last" flag if the original PDU had
964 * 'F' set.
965 */
966 burst_size = len;
967 last = !!(flags & CXGBE_ISO_F);
968
969 cpl = (struct cpl_tx_data_iso *)dst;
970 cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
971 V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
972 V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
973 V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
974 V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
975 V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
976 V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));
977
978 cpl->ahs_len = 0;
979 cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
980 cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
981 cpl->len = htonl(len);
982 cpl->reserved2_seglen_offset = htonl(0);
983 cpl->datasn_offset = htonl(0);
984 cpl->buffer_offset = htonl(0);
985 cpl->reserved3 = 0;
986 }
987
988 static struct wrqe *
write_iscsi_mbuf_wr(struct toepcb * toep,struct mbuf * sndptr)989 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
990 {
991 struct mbuf *m;
992 struct fw_ofld_tx_data_wr *txwr;
993 struct cpl_tx_data_iso *cpl_iso;
994 void *p;
995 struct wrqe *wr;
996 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
997 u_int adjusted_plen, imm_data, ulp_submode;
998 struct inpcb *inp = toep->inp;
999 struct tcpcb *tp = intotcpcb(inp);
1000 int tx_credits, shove, npdu, wr_len;
1001 uint16_t iso_mss;
1002 static const u_int ulp_extra_len[] = {0, 4, 4, 8};
1003 bool iso, nomap_mbuf_seen;
1004
1005 M_ASSERTPKTHDR(sndptr);
1006
1007 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
1008 if (mbuf_raw_wr(sndptr)) {
1009 plen = sndptr->m_pkthdr.len;
1010 KASSERT(plen <= SGE_MAX_WR_LEN,
1011 ("raw WR len %u is greater than max WR len", plen));
1012 if (plen > tx_credits * 16)
1013 return (NULL);
1014
1015 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
1016 if (__predict_false(wr == NULL))
1017 return (NULL);
1018
1019 m_copydata(sndptr, 0, plen, wrtod(wr));
1020 return (wr);
1021 }
1022
1023 iso = mbuf_iscsi_iso(sndptr);
1024 max_imm = max_imm_payload(tx_credits, iso);
1025 max_nsegs = max_dsgl_nsegs(tx_credits, iso);
1026 iso_mss = mbuf_iscsi_iso_mss(sndptr);
1027
1028 plen = 0;
1029 nsegs = 0;
1030 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
1031 nomap_mbuf_seen = false;
1032 for (m = sndptr; m != NULL; m = m->m_next) {
1033 int n;
1034
1035 if (m->m_flags & M_EXTPG)
1036 n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
1037 m->m_len);
1038 else
1039 n = sglist_count(mtod(m, void *), m->m_len);
1040
1041 nsegs += n;
1042 plen += m->m_len;
1043
1044 /*
1045 * This mbuf would send us _over_ the nsegs limit.
1046 * Suspend tx because the PDU can't be sent out.
1047 */
1048 if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
1049 return (NULL);
1050
1051 if (m->m_flags & M_EXTPG)
1052 nomap_mbuf_seen = true;
1053 if (max_nsegs_1mbuf < n)
1054 max_nsegs_1mbuf = n;
1055 }
1056
1057 if (__predict_false(toep->flags & TPF_FIN_SENT))
1058 panic("%s: excess tx.", __func__);
1059
1060 /*
1061 * We have a PDU to send. All of it goes out in one WR so 'm'
1062 * is NULL. A PDU's length is always a multiple of 4.
1063 */
1064 MPASS(m == NULL);
1065 MPASS((plen & 3) == 0);
1066 MPASS(sndptr->m_pkthdr.len == plen);
1067
1068 shove = !(tp->t_flags & TF_MORETOCOME);
1069
1070 /*
1071 * plen doesn't include header and data digests, which are
1072 * generated and inserted in the right places by the TOE, but
1073 * they do occupy TCP sequence space and need to be accounted
1074 * for.
1075 */
1076 ulp_submode = mbuf_ulp_submode(sndptr);
1077 MPASS(ulp_submode < nitems(ulp_extra_len));
1078 npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
1079 adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
1080 if (iso)
1081 adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
1082 wr_len = sizeof(*txwr);
1083 if (iso)
1084 wr_len += sizeof(struct cpl_tx_data_iso);
1085 if (plen <= max_imm && !nomap_mbuf_seen) {
1086 /* Immediate data tx */
1087 imm_data = plen;
1088 wr_len += plen;
1089 nsegs = 0;
1090 } else {
1091 /* DSGL tx */
1092 imm_data = 0;
1093 wr_len += sizeof(struct ulptx_sgl) +
1094 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
1095 }
1096
1097 wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
1098 if (wr == NULL) {
1099 /* XXX: how will we recover from this? */
1100 return (NULL);
1101 }
1102 txwr = wrtod(wr);
1103 credits = howmany(wr->wr_len, 16);
1104
1105 if (iso) {
1106 write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
1107 imm_data + sizeof(struct cpl_tx_data_iso),
1108 adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
1109 cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
1110 MPASS(plen == sndptr->m_pkthdr.len);
1111 write_tx_data_iso(cpl_iso, ulp_submode,
1112 mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
1113 p = cpl_iso + 1;
1114 } else {
1115 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
1116 adjusted_plen, credits, shove, ulp_submode);
1117 p = txwr + 1;
1118 }
1119
1120 if (imm_data != 0) {
1121 m_copydata(sndptr, 0, plen, p);
1122 } else {
1123 write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
1124 if (wr_len & 0xf) {
1125 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
1126 *pad = 0;
1127 }
1128 }
1129
1130 KASSERT(toep->tx_credits >= credits,
1131 ("%s: not enough credits: credits %u "
1132 "toep->tx_credits %u tx_credits %u nsegs %u "
1133 "max_nsegs %u iso %d", __func__, credits,
1134 toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
1135
1136 tp->snd_nxt += adjusted_plen;
1137 tp->snd_max += adjusted_plen;
1138
1139 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
1140 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
1141 if (iso)
1142 counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);
1143
1144 return (wr);
1145 }
1146
1147 void
t4_push_pdus(struct adapter * sc,struct toepcb * toep,int drop)1148 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
1149 {
1150 struct mbuf *sndptr, *m;
1151 struct fw_wr_hdr *wrhdr;
1152 struct wrqe *wr;
1153 u_int plen, credits;
1154 struct inpcb *inp = toep->inp;
1155 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
1156 struct mbufq *pduq = &toep->ulp_pduq;
1157
1158 INP_WLOCK_ASSERT(inp);
1159 KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
1160 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
1161 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
1162 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
1163
1164 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
1165 return;
1166
1167 /*
1168 * This function doesn't resume by itself. Someone else must clear the
1169 * flag and call this function.
1170 */
1171 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
1172 KASSERT(drop == 0,
1173 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
1174 return;
1175 }
1176
1177 if (drop) {
1178 struct socket *so = inp->inp_socket;
1179 struct sockbuf *sb = &so->so_snd;
1180 int sbu;
1181
1182 /*
1183 * An unlocked read is ok here as the data should only
1184 * transition from a non-zero value to either another
1185 * non-zero value or zero. Once it is zero it should
1186 * stay zero.
1187 */
1188 if (__predict_false(sbused(sb)) > 0) {
1189 SOCKBUF_LOCK(sb);
1190 sbu = sbused(sb);
1191 if (sbu > 0) {
1192 /*
1193 * The data transmitted before the
1194 * tid's ULP mode changed to ISCSI is
1195 * still in so_snd. Incoming credits
1196 * should account for so_snd first.
1197 */
1198 sbdrop_locked(sb, min(sbu, drop));
1199 drop -= min(sbu, drop);
1200 }
1201 sowwakeup_locked(so); /* unlocks so_snd */
1202 }
1203 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
1204 }
1205
1206 while ((sndptr = mbufq_first(pduq)) != NULL) {
1207 wr = write_iscsi_mbuf_wr(toep, sndptr);
1208 if (wr == NULL) {
1209 toep->flags |= TPF_TX_SUSPENDED;
1210 return;
1211 }
1212
1213 plen = sndptr->m_pkthdr.len;
1214 credits = howmany(wr->wr_len, 16);
1215 KASSERT(toep->tx_credits >= credits,
1216 ("%s: not enough credits", __func__));
1217
1218 m = mbufq_dequeue(pduq);
1219 MPASS(m == sndptr);
1220 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
1221
1222 toep->tx_credits -= credits;
1223 toep->tx_nocompl += credits;
1224 toep->plen_nocompl += plen;
1225
1226 /*
1227 * Ensure there are enough credits for a full-sized WR
1228 * as page pod WRs can be full-sized.
1229 */
1230 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
1231 toep->tx_nocompl >= toep->tx_total / 4) {
1232 wrhdr = wrtod(wr);
1233 wrhdr->hi |= htobe32(F_FW_WR_COMPL);
1234 toep->tx_nocompl = 0;
1235 toep->plen_nocompl = 0;
1236 }
1237
1238 toep->flags |= TPF_TX_DATA_SENT;
1239 if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
1240 toep->flags |= TPF_TX_SUSPENDED;
1241
1242 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
1243 txsd->plen = plen;
1244 txsd->tx_credits = credits;
1245 txsd++;
1246 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
1247 toep->txsd_pidx = 0;
1248 txsd = &toep->txsd[0];
1249 }
1250 toep->txsd_avail--;
1251
1252 t4_l2t_send(sc, wr, toep->l2te);
1253 }
1254
1255 /* Send a FIN if requested, but only if there are no more PDUs to send */
1256 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
1257 t4_close_conn(sc, toep);
1258 }
1259
1260 static inline void
t4_push_data(struct adapter * sc,struct toepcb * toep,int drop)1261 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
1262 {
1263
1264 if (ulp_mode(toep) == ULP_MODE_ISCSI)
1265 t4_push_pdus(sc, toep, drop);
1266 else if (toep->flags & TPF_KTLS)
1267 t4_push_ktls(sc, toep, drop);
1268 else
1269 t4_push_frames(sc, toep, drop);
1270 }
1271
1272 int
t4_tod_output(struct toedev * tod,struct tcpcb * tp)1273 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
1274 {
1275 struct adapter *sc = tod->tod_softc;
1276 #ifdef INVARIANTS
1277 struct inpcb *inp = tp->t_inpcb;
1278 #endif
1279 struct toepcb *toep = tp->t_toe;
1280
1281 INP_WLOCK_ASSERT(inp);
1282 KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1283 ("%s: inp %p dropped.", __func__, inp));
1284 KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1285
1286 t4_push_data(sc, toep, 0);
1287
1288 return (0);
1289 }
1290
1291 int
t4_send_fin(struct toedev * tod,struct tcpcb * tp)1292 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
1293 {
1294 struct adapter *sc = tod->tod_softc;
1295 #ifdef INVARIANTS
1296 struct inpcb *inp = tp->t_inpcb;
1297 #endif
1298 struct toepcb *toep = tp->t_toe;
1299
1300 INP_WLOCK_ASSERT(inp);
1301 KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1302 ("%s: inp %p dropped.", __func__, inp));
1303 KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1304
1305 toep->flags |= TPF_SEND_FIN;
1306 if (tp->t_state >= TCPS_ESTABLISHED)
1307 t4_push_data(sc, toep, 0);
1308
1309 return (0);
1310 }
1311
1312 int
t4_send_rst(struct toedev * tod,struct tcpcb * tp)1313 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
1314 {
1315 struct adapter *sc = tod->tod_softc;
1316 #if defined(INVARIANTS)
1317 struct inpcb *inp = tp->t_inpcb;
1318 #endif
1319 struct toepcb *toep = tp->t_toe;
1320
1321 INP_WLOCK_ASSERT(inp);
1322 KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1323 ("%s: inp %p dropped.", __func__, inp));
1324 KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1325
1326 /* hmmmm */
1327 KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
1328 ("%s: flowc for tid %u [%s] not sent already",
1329 __func__, toep->tid, tcpstates[tp->t_state]));
1330
1331 send_reset(sc, toep, 0);
1332 return (0);
1333 }
1334
1335 /*
1336 * Peer has sent us a FIN.
1337 */
1338 static int
do_peer_close(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1339 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1340 {
1341 struct adapter *sc = iq->adapter;
1342 const struct cpl_peer_close *cpl = (const void *)(rss + 1);
1343 unsigned int tid = GET_TID(cpl);
1344 struct toepcb *toep = lookup_tid(sc, tid);
1345 struct inpcb *inp = toep->inp;
1346 struct tcpcb *tp = NULL;
1347 struct socket *so;
1348 struct epoch_tracker et;
1349 #ifdef INVARIANTS
1350 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1351 #endif
1352
1353 KASSERT(opcode == CPL_PEER_CLOSE,
1354 ("%s: unexpected opcode 0x%x", __func__, opcode));
1355 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1356
1357 if (__predict_false(toep->flags & TPF_SYNQE)) {
1358 /*
1359 * do_pass_establish must have run before do_peer_close and if
1360 * this is still a synqe instead of a toepcb then the connection
1361 * must be getting aborted.
1362 */
1363 MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1364 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1365 toep, toep->flags);
1366 return (0);
1367 }
1368
1369 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1370
1371 CURVNET_SET(toep->vnet);
1372 NET_EPOCH_ENTER(et);
1373 INP_WLOCK(inp);
1374 tp = intotcpcb(inp);
1375
1376 CTR6(KTR_CXGBE,
1377 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
1378 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1379 toep->ddp.flags, inp);
1380
1381 if (toep->flags & TPF_ABORT_SHUTDOWN)
1382 goto done;
1383
1384 so = inp->inp_socket;
1385 socantrcvmore(so);
1386 if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
1387 DDP_LOCK(toep);
1388 if (__predict_false(toep->ddp.flags &
1389 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
1390 handle_ddp_close(toep, tp, cpl->rcv_nxt);
1391 DDP_UNLOCK(toep);
1392 }
1393
1394 if (ulp_mode(toep) == ULP_MODE_RDMA ||
1395 (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
1396 /*
1397 * There might be data received via DDP before the FIN
1398 * not reported to the driver. Just assume the
1399 * sequence number in the CPL is correct as the
1400 * sequence number of the FIN.
1401 */
1402 } else {
1403 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
1404 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
1405 be32toh(cpl->rcv_nxt)));
1406 }
1407
1408 tp->rcv_nxt = be32toh(cpl->rcv_nxt);
1409
1410 switch (tp->t_state) {
1411 case TCPS_SYN_RECEIVED:
1412 tp->t_starttime = ticks;
1413 /* FALLTHROUGH */
1414
1415 case TCPS_ESTABLISHED:
1416 tcp_state_change(tp, TCPS_CLOSE_WAIT);
1417 break;
1418
1419 case TCPS_FIN_WAIT_1:
1420 tcp_state_change(tp, TCPS_CLOSING);
1421 break;
1422
1423 case TCPS_FIN_WAIT_2:
1424 restore_so_proto(so, inp->inp_vflag & INP_IPV6);
1425 tcp_twstart(tp);
1426 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
1427 NET_EPOCH_EXIT(et);
1428 CURVNET_RESTORE();
1429
1430 INP_WLOCK(inp);
1431 final_cpl_received(toep);
1432 return (0);
1433
1434 default:
1435 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
1436 __func__, tid, tp->t_state);
1437 }
1438 done:
1439 INP_WUNLOCK(inp);
1440 NET_EPOCH_EXIT(et);
1441 CURVNET_RESTORE();
1442 return (0);
1443 }
1444
1445 /*
1446 * Peer has ACK'd our FIN.
1447 */
1448 static int
do_close_con_rpl(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1449 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
1450 struct mbuf *m)
1451 {
1452 struct adapter *sc = iq->adapter;
1453 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
1454 unsigned int tid = GET_TID(cpl);
1455 struct toepcb *toep = lookup_tid(sc, tid);
1456 struct inpcb *inp = toep->inp;
1457 struct tcpcb *tp = NULL;
1458 struct socket *so = NULL;
1459 struct epoch_tracker et;
1460 #ifdef INVARIANTS
1461 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1462 #endif
1463
1464 KASSERT(opcode == CPL_CLOSE_CON_RPL,
1465 ("%s: unexpected opcode 0x%x", __func__, opcode));
1466 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1467 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1468
1469 CURVNET_SET(toep->vnet);
1470 NET_EPOCH_ENTER(et);
1471 INP_WLOCK(inp);
1472 tp = intotcpcb(inp);
1473
1474 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
1475 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
1476
1477 if (toep->flags & TPF_ABORT_SHUTDOWN)
1478 goto done;
1479
1480 so = inp->inp_socket;
1481 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */
1482
1483 switch (tp->t_state) {
1484 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */
1485 restore_so_proto(so, inp->inp_vflag & INP_IPV6);
1486 tcp_twstart(tp);
1487 release:
1488 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
1489 NET_EPOCH_EXIT(et);
1490 CURVNET_RESTORE();
1491
1492 INP_WLOCK(inp);
1493 final_cpl_received(toep); /* no more CPLs expected */
1494
1495 return (0);
1496 case TCPS_LAST_ACK:
1497 if (tcp_close(tp))
1498 INP_WUNLOCK(inp);
1499 goto release;
1500
1501 case TCPS_FIN_WAIT_1:
1502 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1503 soisdisconnected(so);
1504 tcp_state_change(tp, TCPS_FIN_WAIT_2);
1505 break;
1506
1507 default:
1508 log(LOG_ERR,
1509 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
1510 __func__, tid, tcpstates[tp->t_state]);
1511 }
1512 done:
1513 INP_WUNLOCK(inp);
1514 NET_EPOCH_EXIT(et);
1515 CURVNET_RESTORE();
1516 return (0);
1517 }
1518
1519 void
send_abort_rpl(struct adapter * sc,struct sge_ofld_txq * ofld_txq,int tid,int rst_status)1520 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
1521 int rst_status)
1522 {
1523 struct wrqe *wr;
1524 struct cpl_abort_rpl *cpl;
1525
1526 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
1527 if (wr == NULL) {
1528 /* XXX */
1529 panic("%s: allocation failure.", __func__);
1530 }
1531 cpl = wrtod(wr);
1532
1533 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
1534 cpl->cmd = rst_status;
1535
1536 t4_wrq_tx(sc, wr);
1537 }
1538
1539 static int
abort_status_to_errno(struct tcpcb * tp,unsigned int abort_reason)1540 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
1541 {
1542 switch (abort_reason) {
1543 case CPL_ERR_BAD_SYN:
1544 case CPL_ERR_CONN_RESET:
1545 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1546 case CPL_ERR_XMIT_TIMEDOUT:
1547 case CPL_ERR_PERSIST_TIMEDOUT:
1548 case CPL_ERR_FINWAIT2_TIMEDOUT:
1549 case CPL_ERR_KEEPALIVE_TIMEDOUT:
1550 return (ETIMEDOUT);
1551 default:
1552 return (EIO);
1553 }
1554 }
1555
1556 /*
1557 * TCP RST from the peer, timeout, or some other such critical error.
1558 */
1559 static int
do_abort_req(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1560 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1561 {
1562 struct adapter *sc = iq->adapter;
1563 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
1564 unsigned int tid = GET_TID(cpl);
1565 struct toepcb *toep = lookup_tid(sc, tid);
1566 struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
1567 struct inpcb *inp;
1568 struct tcpcb *tp;
1569 struct epoch_tracker et;
1570 #ifdef INVARIANTS
1571 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1572 #endif
1573
1574 KASSERT(opcode == CPL_ABORT_REQ_RSS,
1575 ("%s: unexpected opcode 0x%x", __func__, opcode));
1576 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1577
1578 if (toep->flags & TPF_SYNQE)
1579 return (do_abort_req_synqe(iq, rss, m));
1580
1581 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1582
1583 if (negative_advice(cpl->status)) {
1584 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
1585 __func__, cpl->status, tid, toep->flags);
1586 return (0); /* Ignore negative advice */
1587 }
1588
1589 inp = toep->inp;
1590 CURVNET_SET(toep->vnet);
1591 NET_EPOCH_ENTER(et); /* for tcp_close */
1592 INP_WLOCK(inp);
1593
1594 tp = intotcpcb(inp);
1595
1596 CTR6(KTR_CXGBE,
1597 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
1598 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1599 inp->inp_flags, cpl->status);
1600
1601 /*
1602 * If we'd initiated an abort earlier the reply to it is responsible for
1603 * cleaning up resources. Otherwise we tear everything down right here
1604 * right now. We owe the T4 a CPL_ABORT_RPL no matter what.
1605 */
1606 if (toep->flags & TPF_ABORT_SHUTDOWN) {
1607 INP_WUNLOCK(inp);
1608 goto done;
1609 }
1610 toep->flags |= TPF_ABORT_SHUTDOWN;
1611
1612 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
1613 struct socket *so = inp->inp_socket;
1614
1615 if (so != NULL)
1616 so_error_set(so, abort_status_to_errno(tp,
1617 cpl->status));
1618 tp = tcp_close(tp);
1619 if (tp == NULL)
1620 INP_WLOCK(inp); /* re-acquire */
1621 }
1622
1623 final_cpl_received(toep);
1624 done:
1625 NET_EPOCH_EXIT(et);
1626 CURVNET_RESTORE();
1627 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
1628 return (0);
1629 }
1630
1631 /*
1632 * Reply to the CPL_ABORT_REQ (send_reset)
1633 */
1634 static int
do_abort_rpl(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1635 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1636 {
1637 struct adapter *sc = iq->adapter;
1638 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
1639 unsigned int tid = GET_TID(cpl);
1640 struct toepcb *toep = lookup_tid(sc, tid);
1641 struct inpcb *inp = toep->inp;
1642 #ifdef INVARIANTS
1643 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1644 #endif
1645
1646 KASSERT(opcode == CPL_ABORT_RPL_RSS,
1647 ("%s: unexpected opcode 0x%x", __func__, opcode));
1648 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1649
1650 if (toep->flags & TPF_SYNQE)
1651 return (do_abort_rpl_synqe(iq, rss, m));
1652
1653 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1654
1655 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
1656 __func__, tid, toep, inp, cpl->status);
1657
1658 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1659 ("%s: wasn't expecting abort reply", __func__));
1660
1661 INP_WLOCK(inp);
1662 final_cpl_received(toep);
1663
1664 return (0);
1665 }
1666
1667 static int
do_rx_data(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1668 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1669 {
1670 struct adapter *sc = iq->adapter;
1671 const struct cpl_rx_data *cpl = mtod(m, const void *);
1672 unsigned int tid = GET_TID(cpl);
1673 struct toepcb *toep = lookup_tid(sc, tid);
1674 struct inpcb *inp = toep->inp;
1675 struct tcpcb *tp;
1676 struct socket *so;
1677 struct sockbuf *sb;
1678 struct epoch_tracker et;
1679 int len;
1680 uint32_t ddp_placed = 0;
1681
1682 if (__predict_false(toep->flags & TPF_SYNQE)) {
1683 /*
1684 * do_pass_establish must have run before do_rx_data and if this
1685 * is still a synqe instead of a toepcb then the connection must
1686 * be getting aborted.
1687 */
1688 MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1689 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1690 toep, toep->flags);
1691 m_freem(m);
1692 return (0);
1693 }
1694
1695 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1696
1697 /* strip off CPL header */
1698 m_adj(m, sizeof(*cpl));
1699 len = m->m_pkthdr.len;
1700
1701 INP_WLOCK(inp);
1702 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1703 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1704 __func__, tid, len, inp->inp_flags);
1705 INP_WUNLOCK(inp);
1706 m_freem(m);
1707 return (0);
1708 }
1709
1710 tp = intotcpcb(inp);
1711
1712 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
1713 toep->flags & TPF_TLS_RECEIVE)) {
1714 /* Received "raw" data on a TLS socket. */
1715 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
1716 __func__, tid, len);
1717 do_rx_data_tls(cpl, toep, m);
1718 return (0);
1719 }
1720
1721 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
1722 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
1723
1724 tp->rcv_nxt += len;
1725 if (tp->rcv_wnd < len) {
1726 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
1727 ("%s: negative window size", __func__));
1728 }
1729
1730 tp->rcv_wnd -= len;
1731 tp->t_rcvtime = ticks;
1732
1733 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
1734 DDP_LOCK(toep);
1735 so = inp_inpcbtosocket(inp);
1736 sb = &so->so_rcv;
1737 SOCKBUF_LOCK(sb);
1738
1739 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
1740 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
1741 __func__, tid, len);
1742 m_freem(m);
1743 SOCKBUF_UNLOCK(sb);
1744 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
1745 DDP_UNLOCK(toep);
1746 INP_WUNLOCK(inp);
1747
1748 CURVNET_SET(toep->vnet);
1749 NET_EPOCH_ENTER(et);
1750 INP_WLOCK(inp);
1751 tp = tcp_drop(tp, ECONNRESET);
1752 if (tp)
1753 INP_WUNLOCK(inp);
1754 NET_EPOCH_EXIT(et);
1755 CURVNET_RESTORE();
1756
1757 return (0);
1758 }
1759
1760 /* receive buffer autosize */
1761 MPASS(toep->vnet == so->so_vnet);
1762 CURVNET_SET(toep->vnet);
1763 if (sb->sb_flags & SB_AUTOSIZE &&
1764 V_tcp_do_autorcvbuf &&
1765 sb->sb_hiwat < V_tcp_autorcvbuf_max &&
1766 len > (sbspace(sb) / 8 * 7)) {
1767 unsigned int hiwat = sb->sb_hiwat;
1768 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
1769 V_tcp_autorcvbuf_max);
1770
1771 if (!sbreserve_locked(sb, newsize, so, NULL))
1772 sb->sb_flags &= ~SB_AUTOSIZE;
1773 }
1774
1775 if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
1776 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
1777
1778 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
1779 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
1780 __func__, tid, len);
1781
1782 if (changed) {
1783 if (toep->ddp.flags & DDP_SC_REQ)
1784 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
1785 else {
1786 KASSERT(cpl->ddp_off == 1,
1787 ("%s: DDP switched on by itself.",
1788 __func__));
1789
1790 /* Fell out of DDP mode */
1791 toep->ddp.flags &= ~DDP_ON;
1792 CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
1793 __func__);
1794
1795 insert_ddp_data(toep, ddp_placed);
1796 }
1797 }
1798
1799 if (toep->ddp.flags & DDP_ON) {
1800 /*
1801 * CPL_RX_DATA with DDP on can only be an indicate.
1802 * Start posting queued AIO requests via DDP. The
1803 * payload that arrived in this indicate is appended
1804 * to the socket buffer as usual.
1805 */
1806 handle_ddp_indicate(toep);
1807 }
1808 }
1809
1810 sbappendstream_locked(sb, m, 0);
1811 t4_rcvd_locked(&toep->td->tod, tp);
1812
1813 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
1814 sbavail(sb) != 0) {
1815 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
1816 tid);
1817 ddp_queue_toep(toep);
1818 }
1819 sorwakeup_locked(so);
1820 SOCKBUF_UNLOCK_ASSERT(sb);
1821 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
1822 DDP_UNLOCK(toep);
1823
1824 INP_WUNLOCK(inp);
1825 CURVNET_RESTORE();
1826 return (0);
1827 }
1828
1829 static int
do_fw4_ack(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1830 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1831 {
1832 struct adapter *sc = iq->adapter;
1833 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
1834 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
1835 struct toepcb *toep = lookup_tid(sc, tid);
1836 struct inpcb *inp;
1837 struct tcpcb *tp;
1838 struct socket *so;
1839 uint8_t credits = cpl->credits;
1840 struct ofld_tx_sdesc *txsd;
1841 int plen;
1842 #ifdef INVARIANTS
1843 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
1844 #endif
1845
1846 /*
1847 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
1848 * now this comes back carrying the credits for the flowc.
1849 */
1850 if (__predict_false(toep->flags & TPF_SYNQE)) {
1851 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1852 ("%s: credits for a synq entry %p", __func__, toep));
1853 return (0);
1854 }
1855
1856 inp = toep->inp;
1857
1858 KASSERT(opcode == CPL_FW4_ACK,
1859 ("%s: unexpected opcode 0x%x", __func__, opcode));
1860 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1861 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1862
1863 INP_WLOCK(inp);
1864
1865 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
1866 INP_WUNLOCK(inp);
1867 return (0);
1868 }
1869
1870 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
1871 ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
1872
1873 tp = intotcpcb(inp);
1874
1875 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
1876 tcp_seq snd_una = be32toh(cpl->snd_una);
1877
1878 #ifdef INVARIANTS
1879 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
1880 log(LOG_ERR,
1881 "%s: unexpected seq# %x for TID %u, snd_una %x\n",
1882 __func__, snd_una, toep->tid, tp->snd_una);
1883 }
1884 #endif
1885
1886 if (tp->snd_una != snd_una) {
1887 tp->snd_una = snd_una;
1888 tp->ts_recent_age = tcp_ts_getticks();
1889 }
1890 }
1891
1892 #ifdef VERBOSE_TRACES
1893 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
1894 #endif
1895 so = inp->inp_socket;
1896 txsd = &toep->txsd[toep->txsd_cidx];
1897 plen = 0;
1898 while (credits) {
1899 KASSERT(credits >= txsd->tx_credits,
1900 ("%s: too many (or partial) credits", __func__));
1901 credits -= txsd->tx_credits;
1902 toep->tx_credits += txsd->tx_credits;
1903 plen += txsd->plen;
1904 txsd++;
1905 toep->txsd_avail++;
1906 KASSERT(toep->txsd_avail <= toep->txsd_total,
1907 ("%s: txsd avail > total", __func__));
1908 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
1909 txsd = &toep->txsd[0];
1910 toep->txsd_cidx = 0;
1911 }
1912 }
1913
1914 if (toep->tx_credits == toep->tx_total) {
1915 toep->tx_nocompl = 0;
1916 toep->plen_nocompl = 0;
1917 }
1918
1919 if (toep->flags & TPF_TX_SUSPENDED &&
1920 toep->tx_credits >= toep->tx_total / 4) {
1921 #ifdef VERBOSE_TRACES
1922 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
1923 tid);
1924 #endif
1925 toep->flags &= ~TPF_TX_SUSPENDED;
1926 CURVNET_SET(toep->vnet);
1927 t4_push_data(sc, toep, plen);
1928 CURVNET_RESTORE();
1929 } else if (plen > 0) {
1930 struct sockbuf *sb = &so->so_snd;
1931 int sbu;
1932
1933 SOCKBUF_LOCK(sb);
1934 sbu = sbused(sb);
1935 if (ulp_mode(toep) == ULP_MODE_ISCSI) {
1936 if (__predict_false(sbu > 0)) {
1937 /*
1938 * The data transmitted before the
1939 * tid's ULP mode changed to ISCSI is
1940 * still in so_snd. Incoming credits
1941 * should account for so_snd first.
1942 */
1943 sbdrop_locked(sb, min(sbu, plen));
1944 plen -= min(sbu, plen);
1945 }
1946 sowwakeup_locked(so); /* unlocks so_snd */
1947 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
1948 } else {
1949 #ifdef VERBOSE_TRACES
1950 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
1951 tid, plen);
1952 #endif
1953 sbdrop_locked(sb, plen);
1954 if (!TAILQ_EMPTY(&toep->aiotx_jobq))
1955 t4_aiotx_queue_toep(so, toep);
1956 sowwakeup_locked(so); /* unlocks so_snd */
1957 }
1958 SOCKBUF_UNLOCK_ASSERT(sb);
1959 }
1960
1961 INP_WUNLOCK(inp);
1962
1963 return (0);
1964 }
1965
1966 void
t4_set_tcb_field(struct adapter * sc,struct sge_wrq * wrq,struct toepcb * toep,uint16_t word,uint64_t mask,uint64_t val,int reply,int cookie)1967 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
1968 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
1969 {
1970 struct wrqe *wr;
1971 struct cpl_set_tcb_field *req;
1972 struct ofld_tx_sdesc *txsd;
1973
1974 MPASS((cookie & ~M_COOKIE) == 0);
1975 if (reply) {
1976 MPASS(cookie != CPL_COOKIE_RESERVED);
1977 }
1978
1979 wr = alloc_wrqe(sizeof(*req), wrq);
1980 if (wr == NULL) {
1981 /* XXX */
1982 panic("%s: allocation failure.", __func__);
1983 }
1984 req = wrtod(wr);
1985
1986 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
1987 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
1988 if (reply == 0)
1989 req->reply_ctrl |= htobe16(F_NO_REPLY);
1990 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
1991 req->mask = htobe64(mask);
1992 req->val = htobe64(val);
1993 if (wrq->eq.type == EQ_OFLD) {
1994 txsd = &toep->txsd[toep->txsd_pidx];
1995 txsd->tx_credits = howmany(sizeof(*req), 16);
1996 txsd->plen = 0;
1997 KASSERT(toep->tx_credits >= txsd->tx_credits &&
1998 toep->txsd_avail > 0,
1999 ("%s: not enough credits (%d)", __func__,
2000 toep->tx_credits));
2001 toep->tx_credits -= txsd->tx_credits;
2002 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
2003 toep->txsd_pidx = 0;
2004 toep->txsd_avail--;
2005 }
2006
2007 t4_wrq_tx(sc, wr);
2008 }
2009
2010 void
t4_init_cpl_io_handlers(void)2011 t4_init_cpl_io_handlers(void)
2012 {
2013
2014 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
2015 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
2016 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
2017 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
2018 CPL_COOKIE_TOM);
2019 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
2020 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
2021 }
2022
2023 void
t4_uninit_cpl_io_handlers(void)2024 t4_uninit_cpl_io_handlers(void)
2025 {
2026
2027 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
2028 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
2029 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
2030 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
2031 t4_register_cpl_handler(CPL_RX_DATA, NULL);
2032 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
2033 }
2034
/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define aio_error backend1
#define aio_sent backend3
#define aio_refs backend4

/*
 * Find the tid for an AIO job by walking job -> fd_file->f_data ->
 * so_sototcpcb() -> t_toe -> tid.  In the code visible here it is only
 * used by VERBOSE_TRACES tracepoints.
 */
#define jobtotid(job) \
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
2050
2051 static void
aiotx_free_job(struct kaiocb * job)2052 aiotx_free_job(struct kaiocb *job)
2053 {
2054 long status;
2055 int error;
2056
2057 if (refcount_release(&job->aio_refs) == 0)
2058 return;
2059
2060 error = (intptr_t)job->aio_error;
2061 status = job->aio_sent;
2062 #ifdef VERBOSE_TRACES
2063 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
2064 jobtotid(job), job, status, error);
2065 #endif
2066 if (error != 0 && status != 0)
2067 error = 0;
2068 if (error == ECANCELED)
2069 aio_cancel(job);
2070 else if (error)
2071 aio_complete(job, -1, error);
2072 else {
2073 job->msgsnd = 1;
2074 aio_complete(job, status, 0);
2075 }
2076 }
2077
2078 static void
aiotx_free_pgs(struct mbuf * m)2079 aiotx_free_pgs(struct mbuf *m)
2080 {
2081 struct kaiocb *job;
2082 vm_page_t pg;
2083
2084 M_ASSERTEXTPG(m);
2085 job = m->m_ext.ext_arg1;
2086 #ifdef VERBOSE_TRACES
2087 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
2088 m->m_len, jobtotid(job));
2089 #endif
2090
2091 for (int i = 0; i < m->m_epg_npgs; i++) {
2092 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
2093 vm_page_unwire(pg, PQ_ACTIVE);
2094 }
2095
2096 aiotx_free_job(job);
2097 }
2098
2099 /*
2100 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
2101 * of an AIO job.
2102 */
2103 static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb * job,int len)2104 alloc_aiotx_mbuf(struct kaiocb *job, int len)
2105 {
2106 struct vmspace *vm;
2107 vm_page_t pgs[MBUF_PEXT_MAX_PGS];
2108 struct mbuf *m, *top, *last;
2109 vm_map_t map;
2110 vm_offset_t start;
2111 int i, mlen, npages, pgoff;
2112
2113 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
2114 ("%s(%p, %d): request to send beyond end of buffer", __func__,
2115 job, len));
2116
2117 /*
2118 * The AIO subsystem will cancel and drain all requests before
2119 * permitting a process to exit or exec, so p_vmspace should
2120 * be stable here.
2121 */
2122 vm = job->userproc->p_vmspace;
2123 map = &vm->vm_map;
2124 start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
2125 pgoff = start & PAGE_MASK;
2126
2127 top = NULL;
2128 last = NULL;
2129 while (len > 0) {
2130 mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
2131 KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
2132 ("%s: next start (%#jx + %#x) is not page aligned",
2133 __func__, (uintmax_t)start, mlen));
2134
2135 npages = vm_fault_quick_hold_pages(map, start, mlen,
2136 VM_PROT_WRITE, pgs, nitems(pgs));
2137 if (npages < 0)
2138 break;
2139
2140 m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
2141 m->m_epg_1st_off = pgoff;
2142 m->m_epg_npgs = npages;
2143 if (npages == 1) {
2144 KASSERT(mlen + pgoff <= PAGE_SIZE,
2145 ("%s: single page is too large (off %d len %d)",
2146 __func__, pgoff, mlen));
2147 m->m_epg_last_len = mlen;
2148 } else {
2149 m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
2150 (npages - 2) * PAGE_SIZE;
2151 }
2152 for (i = 0; i < npages; i++)
2153 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
2154
2155 m->m_len = mlen;
2156 m->m_ext.ext_size = npages * PAGE_SIZE;
2157 m->m_ext.ext_arg1 = job;
2158 refcount_acquire(&job->aio_refs);
2159
2160 #ifdef VERBOSE_TRACES
2161 CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
2162 __func__, jobtotid(job), m, job, npages);
2163 #endif
2164
2165 if (top == NULL)
2166 top = m;
2167 else
2168 last->m_next = m;
2169 last = m;
2170
2171 len -= mlen;
2172 start += mlen;
2173 pgoff = 0;
2174 }
2175
2176 return (top);
2177 }
2178
2179 static void
t4_aiotx_process_job(struct toepcb * toep,struct socket * so,struct kaiocb * job)2180 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
2181 {
2182 struct sockbuf *sb;
2183 struct file *fp;
2184 struct inpcb *inp;
2185 struct tcpcb *tp;
2186 struct mbuf *m;
2187 u_int sent;
2188 int error, len;
2189 bool moretocome, sendmore;
2190
2191 sb = &so->so_snd;
2192 SOCKBUF_UNLOCK(sb);
2193 fp = job->fd_file;
2194 m = NULL;
2195
2196 #ifdef MAC
2197 error = mac_socket_check_send(fp->f_cred, so);
2198 if (error != 0)
2199 goto out;
2200 #endif
2201
2202 /* Inline sosend_generic(). */
2203
2204 error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
2205 MPASS(error == 0);
2206
2207 sendanother:
2208 SOCKBUF_LOCK(sb);
2209 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2210 SOCKBUF_UNLOCK(sb);
2211 SOCK_IO_SEND_UNLOCK(so);
2212 if ((so->so_options & SO_NOSIGPIPE) == 0) {
2213 PROC_LOCK(job->userproc);
2214 kern_psignal(job->userproc, SIGPIPE);
2215 PROC_UNLOCK(job->userproc);
2216 }
2217 error = EPIPE;
2218 goto out;
2219 }
2220 if (so->so_error) {
2221 error = so->so_error;
2222 so->so_error = 0;
2223 SOCKBUF_UNLOCK(sb);
2224 SOCK_IO_SEND_UNLOCK(so);
2225 goto out;
2226 }
2227 if ((so->so_state & SS_ISCONNECTED) == 0) {
2228 SOCKBUF_UNLOCK(sb);
2229 SOCK_IO_SEND_UNLOCK(so);
2230 error = ENOTCONN;
2231 goto out;
2232 }
2233 if (sbspace(sb) < sb->sb_lowat) {
2234 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
2235
2236 /*
2237 * Don't block if there is too little room in the socket
2238 * buffer. Instead, requeue the request.
2239 */
2240 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2241 SOCKBUF_UNLOCK(sb);
2242 SOCK_IO_SEND_UNLOCK(so);
2243 error = ECANCELED;
2244 goto out;
2245 }
2246 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2247 SOCKBUF_UNLOCK(sb);
2248 SOCK_IO_SEND_UNLOCK(so);
2249 goto out;
2250 }
2251
2252 /*
2253 * Write as much data as the socket permits, but no more than a
2254 * a single sndbuf at a time.
2255 */
2256 len = sbspace(sb);
2257 if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
2258 len = job->uaiocb.aio_nbytes - job->aio_sent;
2259 moretocome = false;
2260 } else
2261 moretocome = true;
2262 if (len > toep->params.sndbuf) {
2263 len = toep->params.sndbuf;
2264 sendmore = true;
2265 } else
2266 sendmore = false;
2267
2268 if (!TAILQ_EMPTY(&toep->aiotx_jobq))
2269 moretocome = true;
2270 SOCKBUF_UNLOCK(sb);
2271 MPASS(len != 0);
2272
2273 m = alloc_aiotx_mbuf(job, len);
2274 if (m == NULL) {
2275 SOCK_IO_SEND_UNLOCK(so);
2276 error = EFAULT;
2277 goto out;
2278 }
2279
2280 /* Inlined tcp_usr_send(). */
2281
2282 inp = toep->inp;
2283 INP_WLOCK(inp);
2284 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
2285 INP_WUNLOCK(inp);
2286 SOCK_IO_SEND_UNLOCK(so);
2287 error = ECONNRESET;
2288 goto out;
2289 }
2290
2291 sent = m_length(m, NULL);
2292 job->aio_sent += sent;
2293 counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);
2294
2295 sbappendstream(sb, m, 0);
2296 m = NULL;
2297
2298 if (!(inp->inp_flags & INP_DROPPED)) {
2299 tp = intotcpcb(inp);
2300 if (moretocome)
2301 tp->t_flags |= TF_MORETOCOME;
2302 error = tp->t_fb->tfb_tcp_output(tp);
2303 if (moretocome)
2304 tp->t_flags &= ~TF_MORETOCOME;
2305 }
2306
2307 INP_WUNLOCK(inp);
2308 if (sendmore)
2309 goto sendanother;
2310 SOCK_IO_SEND_UNLOCK(so);
2311
2312 if (error)
2313 goto out;
2314
2315 /*
2316 * If this is a blocking socket and the request has not been
2317 * fully completed, requeue it until the socket is ready
2318 * again.
2319 */
2320 if (job->aio_sent < job->uaiocb.aio_nbytes &&
2321 !(so->so_state & SS_NBIO)) {
2322 SOCKBUF_LOCK(sb);
2323 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2324 SOCKBUF_UNLOCK(sb);
2325 error = ECANCELED;
2326 goto out;
2327 }
2328 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2329 return;
2330 }
2331
2332 /*
2333 * If the request will not be requeued, drop the queue's
2334 * reference to the job. Any mbufs in flight should still
2335 * hold a reference, but this drops the reference that the
2336 * queue owns while it is waiting to queue mbufs to the
2337 * socket.
2338 */
2339 aiotx_free_job(job);
2340 counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);
2341
2342 out:
2343 if (error) {
2344 job->aio_error = (void *)(intptr_t)error;
2345 aiotx_free_job(job);
2346 }
2347 m_freem(m);
2348 SOCKBUF_LOCK(sb);
2349 }
2350
2351 static void
t4_aiotx_task(void * context,int pending)2352 t4_aiotx_task(void *context, int pending)
2353 {
2354 struct toepcb *toep = context;
2355 struct socket *so;
2356 struct kaiocb *job;
2357
2358 so = toep->aiotx_so;
2359 CURVNET_SET(toep->vnet);
2360 SOCKBUF_LOCK(&so->so_snd);
2361 while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
2362 job = TAILQ_FIRST(&toep->aiotx_jobq);
2363 TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2364 if (!aio_clear_cancel_function(job))
2365 continue;
2366
2367 t4_aiotx_process_job(toep, so, job);
2368 }
2369 toep->aiotx_so = NULL;
2370 SOCKBUF_UNLOCK(&so->so_snd);
2371 CURVNET_RESTORE();
2372
2373 free_toepcb(toep);
2374 SOCK_LOCK(so);
2375 sorele(so);
2376 }
2377
2378 static void
t4_aiotx_queue_toep(struct socket * so,struct toepcb * toep)2379 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
2380 {
2381
2382 SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
2383 #ifdef VERBOSE_TRACES
2384 CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
2385 __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
2386 #endif
2387 if (toep->aiotx_so != NULL)
2388 return;
2389 soref(so);
2390 toep->aiotx_so = so;
2391 hold_toepcb(toep);
2392 soaio_enqueue(&toep->aiotx_task);
2393 }
2394
2395 static void
t4_aiotx_cancel(struct kaiocb * job)2396 t4_aiotx_cancel(struct kaiocb *job)
2397 {
2398 struct socket *so;
2399 struct sockbuf *sb;
2400 struct tcpcb *tp;
2401 struct toepcb *toep;
2402
2403 so = job->fd_file->f_data;
2404 tp = so_sototcpcb(so);
2405 toep = tp->t_toe;
2406 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
2407 sb = &so->so_snd;
2408
2409 SOCKBUF_LOCK(sb);
2410 if (!aio_cancel_cleared(job))
2411 TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2412 SOCKBUF_UNLOCK(sb);
2413
2414 job->aio_error = (void *)(intptr_t)ECANCELED;
2415 aiotx_free_job(job);
2416 }
2417
2418 int
t4_aio_queue_aiotx(struct socket * so,struct kaiocb * job)2419 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
2420 {
2421 struct tcpcb *tp = so_sototcpcb(so);
2422 struct toepcb *toep = tp->t_toe;
2423 struct adapter *sc = td_adapter(toep->td);
2424
2425 /* This only handles writes. */
2426 if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
2427 return (EOPNOTSUPP);
2428
2429 if (!sc->tt.tx_zcopy)
2430 return (EOPNOTSUPP);
2431
2432 if (tls_tx_key(toep))
2433 return (EOPNOTSUPP);
2434
2435 SOCKBUF_LOCK(&so->so_snd);
2436 #ifdef VERBOSE_TRACES
2437 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
2438 #endif
2439 if (!aio_set_cancel_function(job, t4_aiotx_cancel))
2440 panic("new job was cancelled");
2441 refcount_init(&job->aio_refs, 1);
2442 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
2443 if (sowriteable(so))
2444 t4_aiotx_queue_toep(so, toep);
2445 SOCKBUF_UNLOCK(&so->so_snd);
2446 return (0);
2447 }
2448
2449 void
aiotx_init_toep(struct toepcb * toep)2450 aiotx_init_toep(struct toepcb *toep)
2451 {
2452
2453 TAILQ_INIT(&toep->aiotx_jobq);
2454 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
2455 }
2456 #endif
2457