xref: /trueos/sys/dev/iscsi/icl.c (revision 17d83a70d11062ccf00ec19e142b61af05794ef2)
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 
31 /*
32  * iSCSI Common Layer.  It's used by both the initiator and target to send
33  * and receive iSCSI PDUs.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <sys/param.h>
40 #include <sys/capsicum.h>
41 #include <sys/condvar.h>
42 #include <sys/conf.h>
43 #include <sys/file.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/lock.h>
47 #include <sys/mbuf.h>
48 #include <sys/mutex.h>
49 #include <sys/module.h>
50 #include <sys/protosw.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/sysctl.h>
54 #include <sys/systm.h>
55 #include <sys/sx.h>
56 #include <sys/uio.h>
57 #include <vm/uma.h>
58 #include <netinet/in.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/iscsi/icl.h>
62 #include <dev/iscsi/iscsi_proto.h>
63 
64 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
65 static int debug = 1;
66 TUNABLE_INT("kern.icl.debug", &debug);
67 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN,
68     &debug, 0, "Enable debug messages");
69 static int coalesce = 1;
70 TUNABLE_INT("kern.icl.coalesce", &coalesce);
71 SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
72     &coalesce, 0, "Try to coalesce PDUs before sending");
73 static int partial_receive_len = 128 * 1024;
74 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
75 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
76     &partial_receive_len, 0, "Minimum read size for partially received "
77     "data segment");
78 static int sendspace = 1048576;
79 TUNABLE_INT("kern.icl.sendspace", &sendspace);
80 SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
81     &sendspace, 0, "Default send socket buffer size");
82 static int recvspace = 1048576;
83 TUNABLE_INT("kern.icl.recvspace", &recvspace);
84 SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
85     &recvspace, 0, "Default receive socket buffer size");
86 
87 static uma_zone_t icl_conn_zone;
88 static uma_zone_t icl_pdu_zone;
89 
90 static volatile u_int	icl_ncons;
91 
/*
 * Debug message, prefixed with the calling function's name.  Printed only
 * when the "debug" knob is greater than 1.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1)						\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
	} while (0)

/*
 * Warning message, prefixed with "WARNING: " and the calling function's
 * name.  Printed whenever the "debug" knob is greater than 0.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Connection lock helpers.  X is a pointer to struct icl_conn; the mutex
 * itself is whatever ic_lock points at.  The argument is parenthesized so
 * that expressions such as "&conn" or "cond ? a : b" expand correctly.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
110 
/* Head type for tail queues of PDUs; elements are linked via ip_next. */
STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
112 
113 static void
icl_conn_fail(struct icl_conn * ic)114 icl_conn_fail(struct icl_conn *ic)
115 {
116 	if (ic->ic_socket == NULL)
117 		return;
118 
119 	/*
120 	 * XXX
121 	 */
122 	ic->ic_socket->so_error = EDOOFUS;
123 	(ic->ic_error)(ic);
124 }
125 
126 static struct mbuf *
icl_conn_receive(struct icl_conn * ic,size_t len)127 icl_conn_receive(struct icl_conn *ic, size_t len)
128 {
129 	struct uio uio;
130 	struct socket *so;
131 	struct mbuf *m;
132 	int error, flags;
133 
134 	so = ic->ic_socket;
135 
136 	memset(&uio, 0, sizeof(uio));
137 	uio.uio_resid = len;
138 
139 	flags = MSG_DONTWAIT;
140 	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
141 	if (error != 0) {
142 		ICL_DEBUG("soreceive error %d", error);
143 		return (NULL);
144 	}
145 	if (uio.uio_resid != 0) {
146 		m_freem(m);
147 		ICL_DEBUG("short read");
148 		return (NULL);
149 	}
150 
151 	return (m);
152 }
153 
154 static struct icl_pdu *
icl_pdu_new_empty(struct icl_conn * ic,int flags)155 icl_pdu_new_empty(struct icl_conn *ic, int flags)
156 {
157 	struct icl_pdu *ip;
158 
159 #ifdef DIAGNOSTIC
160 	refcount_acquire(&ic->ic_outstanding_pdus);
161 #endif
162 	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
163 	if (ip == NULL) {
164 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
165 #ifdef DIAGNOSTIC
166 		refcount_release(&ic->ic_outstanding_pdus);
167 #endif
168 		return (NULL);
169 	}
170 
171 	ip->ip_conn = ic;
172 
173 	return (ip);
174 }
175 
176 void
icl_pdu_free(struct icl_pdu * ip)177 icl_pdu_free(struct icl_pdu *ip)
178 {
179 	struct icl_conn *ic;
180 
181 	ic = ip->ip_conn;
182 
183 	m_freem(ip->ip_bhs_mbuf);
184 	m_freem(ip->ip_ahs_mbuf);
185 	m_freem(ip->ip_data_mbuf);
186 	uma_zfree(icl_pdu_zone, ip);
187 #ifdef DIAGNOSTIC
188 	refcount_release(&ic->ic_outstanding_pdus);
189 #endif
190 }
191 
192 /*
193  * Allocate icl_pdu with empty BHS to fill up by the caller.
194  */
195 struct icl_pdu *
icl_pdu_new(struct icl_conn * ic,int flags)196 icl_pdu_new(struct icl_conn *ic, int flags)
197 {
198 	struct icl_pdu *ip;
199 
200 	ip = icl_pdu_new_empty(ic, flags);
201 	if (ip == NULL)
202 		return (NULL);
203 
204 	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
205 	    flags, MT_DATA, M_PKTHDR);
206 	if (ip->ip_bhs_mbuf == NULL) {
207 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
208 		icl_pdu_free(ip);
209 		return (NULL);
210 	}
211 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
212 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
213 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
214 
215 	return (ip);
216 }
217 
218 static int
icl_pdu_ahs_length(const struct icl_pdu * request)219 icl_pdu_ahs_length(const struct icl_pdu *request)
220 {
221 
222 	return (request->ip_bhs->bhs_total_ahs_len * 4);
223 }
224 
225 size_t
icl_pdu_data_segment_length(const struct icl_pdu * request)226 icl_pdu_data_segment_length(const struct icl_pdu *request)
227 {
228 	uint32_t len = 0;
229 
230 	len += request->ip_bhs->bhs_data_segment_len[0];
231 	len <<= 8;
232 	len += request->ip_bhs->bhs_data_segment_len[1];
233 	len <<= 8;
234 	len += request->ip_bhs->bhs_data_segment_len[2];
235 
236 	return (len);
237 }
238 
239 static void
icl_pdu_set_data_segment_length(struct icl_pdu * response,uint32_t len)240 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
241 {
242 
243 	response->ip_bhs->bhs_data_segment_len[2] = len;
244 	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
245 	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
246 }
247 
248 static size_t
icl_pdu_padding(const struct icl_pdu * ip)249 icl_pdu_padding(const struct icl_pdu *ip)
250 {
251 
252 	if ((ip->ip_data_len % 4) != 0)
253 		return (4 - (ip->ip_data_len % 4));
254 
255 	return (0);
256 }
257 
258 static size_t
icl_pdu_size(const struct icl_pdu * response)259 icl_pdu_size(const struct icl_pdu *response)
260 {
261 	size_t len;
262 
263 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
264 
265 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
266 	    icl_pdu_padding(response);
267 	if (response->ip_conn->ic_header_crc32c)
268 		len += ISCSI_HEADER_DIGEST_SIZE;
269 	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
270 		len += ISCSI_DATA_DIGEST_SIZE;
271 
272 	return (len);
273 }
274 
275 static int
icl_pdu_receive_bhs(struct icl_pdu * request,size_t * availablep)276 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
277 {
278 	struct mbuf *m;
279 
280 	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
281 	if (m == NULL) {
282 		ICL_DEBUG("failed to receive BHS");
283 		return (-1);
284 	}
285 
286 	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
287 	if (request->ip_bhs_mbuf == NULL) {
288 		ICL_WARN("m_pullup failed");
289 		return (-1);
290 	}
291 	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
292 
293 	/*
294 	 * XXX: For architectures with strict alignment requirements
295 	 * 	we may need to allocate ip_bhs and copy the data into it.
296 	 * 	For some reason, though, not doing this doesn't seem
297 	 * 	to cause problems; tested on sparc64.
298 	 */
299 
300 	*availablep -= sizeof(struct iscsi_bhs);
301 	return (0);
302 }
303 
304 static int
icl_pdu_receive_ahs(struct icl_pdu * request,size_t * availablep)305 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
306 {
307 
308 	request->ip_ahs_len = icl_pdu_ahs_length(request);
309 	if (request->ip_ahs_len == 0)
310 		return (0);
311 
312 	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
313 	    request->ip_ahs_len);
314 	if (request->ip_ahs_mbuf == NULL) {
315 		ICL_DEBUG("failed to receive AHS");
316 		return (-1);
317 	}
318 
319 	*availablep -= request->ip_ahs_len;
320 	return (0);
321 }
322 
323 static uint32_t
icl_mbuf_to_crc32c(const struct mbuf * m0)324 icl_mbuf_to_crc32c(const struct mbuf *m0)
325 {
326 	uint32_t digest = 0xffffffff;
327 	const struct mbuf *m;
328 
329 	for (m = m0; m != NULL; m = m->m_next)
330 		digest = calculate_crc32c(digest,
331 		    mtod(m, const void *), m->m_len);
332 
333 	digest = digest ^ 0xffffffff;
334 
335 	return (digest);
336 }
337 
338 static int
icl_pdu_check_header_digest(struct icl_pdu * request,size_t * availablep)339 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
340 {
341 	struct mbuf *m;
342 	uint32_t received_digest, valid_digest;
343 
344 	if (request->ip_conn->ic_header_crc32c == false)
345 		return (0);
346 
347 	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
348 	if (m == NULL) {
349 		ICL_DEBUG("failed to receive header digest");
350 		return (-1);
351 	}
352 
353 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
354 	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
355 	m_freem(m);
356 
357 	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
358 
359 	/*
360 	 * XXX: Handle AHS.
361 	 */
362 	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
363 	if (received_digest != valid_digest) {
364 		ICL_WARN("header digest check failed; got 0x%x, "
365 		    "should be 0x%x", received_digest, valid_digest);
366 		return (-1);
367 	}
368 
369 	return (0);
370 }
371 
372 /*
373  * Return the number of bytes that should be waiting in the receive socket
374  * before icl_pdu_receive_data_segment() gets called.
375  */
376 static size_t
icl_pdu_data_segment_receive_len(const struct icl_pdu * request)377 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
378 {
379 	size_t len;
380 
381 	len = icl_pdu_data_segment_length(request);
382 	if (len == 0)
383 		return (0);
384 
385 	/*
386 	 * Account for the parts of data segment already read from
387 	 * the socket buffer.
388 	 */
389 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
390 	len -= request->ip_data_len;
391 
392 	/*
393 	 * Don't always wait for the full data segment to be delivered
394 	 * to the socket; this might badly affect performance due to
395 	 * TCP window scaling.
396 	 */
397 	if (len > partial_receive_len) {
398 #if 0
399 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
400 		    len, partial_receive_len));
401 #endif
402 		len = partial_receive_len;
403 
404 		return (len);
405 	}
406 
407 	/*
408 	 * Account for padding.  Note that due to the way code is written,
409 	 * the icl_pdu_receive_data_segment() must always receive padding
410 	 * along with the last part of data segment, because it would be
411 	 * impossible to tell whether we've already received the full data
412 	 * segment including padding, or without it.
413 	 */
414 	if ((len % 4) != 0)
415 		len += 4 - (len % 4);
416 
417 #if 0
418 	ICL_DEBUG("need %zd bytes of data", len));
419 #endif
420 
421 	return (len);
422 }
423 
424 static int
icl_pdu_receive_data_segment(struct icl_pdu * request,size_t * availablep,bool * more_neededp)425 icl_pdu_receive_data_segment(struct icl_pdu *request,
426     size_t *availablep, bool *more_neededp)
427 {
428 	struct icl_conn *ic;
429 	size_t len, padding = 0;
430 	struct mbuf *m;
431 
432 	ic = request->ip_conn;
433 
434 	*more_neededp = false;
435 	ic->ic_receive_len = 0;
436 
437 	len = icl_pdu_data_segment_length(request);
438 	if (len == 0)
439 		return (0);
440 
441 	if ((len % 4) != 0)
442 		padding = 4 - (len % 4);
443 
444 	/*
445 	 * Account for already received parts of data segment.
446 	 */
447 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
448 	len -= request->ip_data_len;
449 
450 	if (len + padding > *availablep) {
451 		/*
452 		 * Not enough data in the socket buffer.  Receive as much
453 		 * as we can.  Don't receive padding, since, obviously, it's
454 		 * not the end of data segment yet.
455 		 */
456 #if 0
457 		ICL_DEBUG("limited from %zd to %zd",
458 		    len + padding, *availablep - padding));
459 #endif
460 		len = *availablep - padding;
461 		*more_neededp = true;
462 		padding = 0;
463 	}
464 
465 	/*
466 	 * Must not try to receive padding without at least one byte
467 	 * of actual data segment.
468 	 */
469 	if (len > 0) {
470 		m = icl_conn_receive(request->ip_conn, len + padding);
471 		if (m == NULL) {
472 			ICL_DEBUG("failed to receive data segment");
473 			return (-1);
474 		}
475 
476 		if (request->ip_data_mbuf == NULL)
477 			request->ip_data_mbuf = m;
478 		else
479 			m_cat(request->ip_data_mbuf, m);
480 
481 		request->ip_data_len += len;
482 		*availablep -= len + padding;
483 	} else
484 		ICL_DEBUG("len 0");
485 
486 	if (*more_neededp)
487 		ic->ic_receive_len =
488 		    icl_pdu_data_segment_receive_len(request);
489 
490 	return (0);
491 }
492 
493 static int
icl_pdu_check_data_digest(struct icl_pdu * request,size_t * availablep)494 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
495 {
496 	struct mbuf *m;
497 	uint32_t received_digest, valid_digest;
498 
499 	if (request->ip_conn->ic_data_crc32c == false)
500 		return (0);
501 
502 	if (request->ip_data_len == 0)
503 		return (0);
504 
505 	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
506 	if (m == NULL) {
507 		ICL_DEBUG("failed to receive data digest");
508 		return (-1);
509 	}
510 
511 	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
512 	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
513 	m_freem(m);
514 
515 	*availablep -= ISCSI_DATA_DIGEST_SIZE;
516 
517 	/*
518 	 * Note that ip_data_mbuf also contains padding; since digest
519 	 * calculation is supposed to include that, we iterate over
520 	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
521 	 */
522 	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
523 	if (received_digest != valid_digest) {
524 		ICL_WARN("data digest check failed; got 0x%x, "
525 		    "should be 0x%x", received_digest, valid_digest);
526 		return (-1);
527 	}
528 
529 	return (0);
530 }
531 
532 /*
533  * Somewhat contrary to the name, this attempts to receive only one
534  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
535  */
536 static struct icl_pdu *
icl_conn_receive_pdu(struct icl_conn * ic,size_t * availablep)537 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
538 {
539 	struct icl_pdu *request;
540 	struct socket *so;
541 	size_t len;
542 	int error;
543 	bool more_needed;
544 
545 	so = ic->ic_socket;
546 
547 	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
548 		KASSERT(ic->ic_receive_pdu == NULL,
549 		    ("ic->ic_receive_pdu != NULL"));
550 		request = icl_pdu_new_empty(ic, M_NOWAIT);
551 		if (request == NULL) {
552 			ICL_DEBUG("failed to allocate PDU; "
553 			    "dropping connection");
554 			icl_conn_fail(ic);
555 			return (NULL);
556 		}
557 		ic->ic_receive_pdu = request;
558 	} else {
559 		KASSERT(ic->ic_receive_pdu != NULL,
560 		    ("ic->ic_receive_pdu == NULL"));
561 		request = ic->ic_receive_pdu;
562 	}
563 
564 	if (*availablep < ic->ic_receive_len) {
565 #if 0
566 		ICL_DEBUG("not enough data; need %zd, "
567 		    "have %zd", ic->ic_receive_len, *availablep);
568 #endif
569 		return (NULL);
570 	}
571 
572 	switch (ic->ic_receive_state) {
573 	case ICL_CONN_STATE_BHS:
574 		//ICL_DEBUG("receiving BHS");
575 		error = icl_pdu_receive_bhs(request, availablep);
576 		if (error != 0) {
577 			ICL_DEBUG("failed to receive BHS; "
578 			    "dropping connection");
579 			break;
580 		}
581 
582 		/*
583 		 * We don't enforce any limit for AHS length;
584 		 * its length is stored in 8 bit field.
585 		 */
586 
587 		len = icl_pdu_data_segment_length(request);
588 		if (len > ic->ic_max_data_segment_length) {
589 			ICL_WARN("received data segment "
590 			    "length %zd is larger than negotiated "
591 			    "MaxDataSegmentLength %zd; "
592 			    "dropping connection",
593 			    len, ic->ic_max_data_segment_length);
594 			error = EINVAL;
595 			break;
596 		}
597 
598 		ic->ic_receive_state = ICL_CONN_STATE_AHS;
599 		ic->ic_receive_len = icl_pdu_ahs_length(request);
600 		break;
601 
602 	case ICL_CONN_STATE_AHS:
603 		//ICL_DEBUG("receiving AHS");
604 		error = icl_pdu_receive_ahs(request, availablep);
605 		if (error != 0) {
606 			ICL_DEBUG("failed to receive AHS; "
607 			    "dropping connection");
608 			break;
609 		}
610 		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
611 		if (ic->ic_header_crc32c == false)
612 			ic->ic_receive_len = 0;
613 		else
614 			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
615 		break;
616 
617 	case ICL_CONN_STATE_HEADER_DIGEST:
618 		//ICL_DEBUG("receiving header digest");
619 		error = icl_pdu_check_header_digest(request, availablep);
620 		if (error != 0) {
621 			ICL_DEBUG("header digest failed; "
622 			    "dropping connection");
623 			break;
624 		}
625 
626 		ic->ic_receive_state = ICL_CONN_STATE_DATA;
627 		ic->ic_receive_len =
628 		    icl_pdu_data_segment_receive_len(request);
629 		break;
630 
631 	case ICL_CONN_STATE_DATA:
632 		//ICL_DEBUG("receiving data segment");
633 		error = icl_pdu_receive_data_segment(request, availablep,
634 		    &more_needed);
635 		if (error != 0) {
636 			ICL_DEBUG("failed to receive data segment;"
637 			    "dropping connection");
638 			break;
639 		}
640 
641 		if (more_needed)
642 			break;
643 
644 		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
645 		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
646 			ic->ic_receive_len = 0;
647 		else
648 			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
649 		break;
650 
651 	case ICL_CONN_STATE_DATA_DIGEST:
652 		//ICL_DEBUG("receiving data digest");
653 		error = icl_pdu_check_data_digest(request, availablep);
654 		if (error != 0) {
655 			ICL_DEBUG("data digest failed; "
656 			    "dropping connection");
657 			break;
658 		}
659 
660 		/*
661 		 * We've received complete PDU; reset the receive state machine
662 		 * and return the PDU.
663 		 */
664 		ic->ic_receive_state = ICL_CONN_STATE_BHS;
665 		ic->ic_receive_len = sizeof(struct iscsi_bhs);
666 		ic->ic_receive_pdu = NULL;
667 		return (request);
668 
669 	default:
670 		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
671 	}
672 
673 	if (error != 0) {
674 		/*
675 		 * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
676 		 * and will get freed in icl_conn_close().
677 		 */
678 		icl_conn_fail(ic);
679 	}
680 
681 	return (NULL);
682 }
683 
684 static void
icl_conn_receive_pdus(struct icl_conn * ic,size_t available)685 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
686 {
687 	struct icl_pdu *response;
688 	struct socket *so;
689 
690 	so = ic->ic_socket;
691 
692 	/*
693 	 * This can never happen; we're careful to only mess with ic->ic_socket
694 	 * pointer when the send/receive threads are not running.
695 	 */
696 	KASSERT(so != NULL, ("NULL socket"));
697 
698 	for (;;) {
699 		if (ic->ic_disconnecting)
700 			return;
701 
702 		if (so->so_error != 0) {
703 			ICL_DEBUG("connection error %d; "
704 			    "dropping connection", so->so_error);
705 			icl_conn_fail(ic);
706 			return;
707 		}
708 
709 		/*
710 		 * Loop until we have a complete PDU or there is not enough
711 		 * data in the socket buffer.
712 		 */
713 		if (available < ic->ic_receive_len) {
714 #if 0
715 			ICL_DEBUG("not enough data; have %zd, "
716 			    "need %zd", available,
717 			    ic->ic_receive_len);
718 #endif
719 			return;
720 		}
721 
722 		response = icl_conn_receive_pdu(ic, &available);
723 		if (response == NULL)
724 			continue;
725 
726 		if (response->ip_ahs_len > 0) {
727 			ICL_WARN("received PDU with unsupported "
728 			    "AHS; opcode 0x%x; dropping connection",
729 			    response->ip_bhs->bhs_opcode);
730 			icl_pdu_free(response);
731 			icl_conn_fail(ic);
732 			return;
733 		}
734 
735 		(ic->ic_receive)(response);
736 	}
737 }
738 
739 static void
icl_receive_thread(void * arg)740 icl_receive_thread(void *arg)
741 {
742 	struct icl_conn *ic;
743 	size_t available;
744 	struct socket *so;
745 
746 	ic = arg;
747 	so = ic->ic_socket;
748 
749 	ICL_CONN_LOCK(ic);
750 	ic->ic_receive_running = true;
751 	ICL_CONN_UNLOCK(ic);
752 
753 	for (;;) {
754 		if (ic->ic_disconnecting) {
755 			//ICL_DEBUG("terminating");
756 			break;
757 		}
758 
759 		/*
760 		 * Set the low watermark, to be checked by
761 		 * soreadable() in icl_soupcall_receive()
762 		 * to avoid unneccessary wakeups until there
763 		 * is enough data received to read the PDU.
764 		 */
765 		SOCKBUF_LOCK(&so->so_rcv);
766 		available = so->so_rcv.sb_cc;
767 		if (available < ic->ic_receive_len) {
768 			so->so_rcv.sb_lowat = ic->ic_receive_len;
769 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
770 		} else
771 			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
772 		SOCKBUF_UNLOCK(&so->so_rcv);
773 
774 		icl_conn_receive_pdus(ic, available);
775 	}
776 
777 	ICL_CONN_LOCK(ic);
778 	ic->ic_receive_running = false;
779 	cv_signal(&ic->ic_send_cv);
780 	ICL_CONN_UNLOCK(ic);
781 	kthread_exit();
782 }
783 
784 static int
icl_soupcall_receive(struct socket * so,void * arg,int waitflag)785 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
786 {
787 	struct icl_conn *ic;
788 
789 	if (!soreadable(so))
790 		return (SU_OK);
791 
792 	ic = arg;
793 	cv_signal(&ic->ic_receive_cv);
794 	return (SU_OK);
795 }
796 
797 static int
icl_pdu_finalize(struct icl_pdu * request)798 icl_pdu_finalize(struct icl_pdu *request)
799 {
800 	size_t padding, pdu_len;
801 	uint32_t digest, zero = 0;
802 	int ok;
803 	struct icl_conn *ic;
804 
805 	ic = request->ip_conn;
806 
807 	icl_pdu_set_data_segment_length(request, request->ip_data_len);
808 
809 	pdu_len = icl_pdu_size(request);
810 
811 	if (ic->ic_header_crc32c) {
812 		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
813 		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
814 		    (void *)&digest);
815 		if (ok != 1) {
816 			ICL_WARN("failed to append header digest");
817 			return (1);
818 		}
819 	}
820 
821 	if (request->ip_data_len != 0) {
822 		padding = icl_pdu_padding(request);
823 		if (padding > 0) {
824 			ok = m_append(request->ip_data_mbuf, padding,
825 			    (void *)&zero);
826 			if (ok != 1) {
827 				ICL_WARN("failed to append padding");
828 				return (1);
829 			}
830 		}
831 
832 		if (ic->ic_data_crc32c) {
833 			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
834 
835 			ok = m_append(request->ip_data_mbuf, sizeof(digest),
836 			    (void *)&digest);
837 			if (ok != 1) {
838 				ICL_WARN("failed to append data digest");
839 				return (1);
840 			}
841 		}
842 
843 		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
844 		request->ip_data_mbuf = NULL;
845 	}
846 
847 	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
848 
849 	return (0);
850 }
851 
852 static void
icl_conn_send_pdus(struct icl_conn * ic,struct icl_pdu_stailq * queue)853 icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
854 {
855 	struct icl_pdu *request, *request2;
856 	struct socket *so;
857 	size_t available, size, size2;
858 	int coalesced, error;
859 
860 	ICL_CONN_LOCK_ASSERT_NOT(ic);
861 
862 	so = ic->ic_socket;
863 
864 	SOCKBUF_LOCK(&so->so_snd);
865 	/*
866 	 * Check how much space do we have for transmit.  We can't just
867 	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
868 	 * as it always frees the mbuf chain passed to it, even in case
869 	 * of error.
870 	 */
871 	available = sbspace(&so->so_snd);
872 
873 	/*
874 	 * Notify the socket upcall that we don't need wakeups
875 	 * for the time being.
876 	 */
877 	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
878 	SOCKBUF_UNLOCK(&so->so_snd);
879 
880 	while (!STAILQ_EMPTY(queue)) {
881 		request = STAILQ_FIRST(queue);
882 		size = icl_pdu_size(request);
883 		if (available < size) {
884 
885 			/*
886 			 * Set the low watermark, to be checked by
887 			 * sowriteable() in icl_soupcall_send()
888 			 * to avoid unneccessary wakeups until there
889 			 * is enough space for the PDU to fit.
890 			 */
891 			SOCKBUF_LOCK(&so->so_snd);
892 			available = sbspace(&so->so_snd);
893 			if (available < size) {
894 #if 1
895 				ICL_DEBUG("no space to send; "
896 				    "have %zd, need %zd",
897 				    available, size);
898 #endif
899 				so->so_snd.sb_lowat = size;
900 				SOCKBUF_UNLOCK(&so->so_snd);
901 				return;
902 			}
903 			SOCKBUF_UNLOCK(&so->so_snd);
904 		}
905 		STAILQ_REMOVE_HEAD(queue, ip_next);
906 		error = icl_pdu_finalize(request);
907 		if (error != 0) {
908 			ICL_DEBUG("failed to finalize PDU; "
909 			    "dropping connection");
910 			icl_conn_fail(ic);
911 			icl_pdu_free(request);
912 			return;
913 		}
914 		if (coalesce) {
915 			coalesced = 1;
916 			for (;;) {
917 				request2 = STAILQ_FIRST(queue);
918 				if (request2 == NULL)
919 					break;
920 				size2 = icl_pdu_size(request2);
921 				if (available < size + size2)
922 					break;
923 				STAILQ_REMOVE_HEAD(queue, ip_next);
924 				error = icl_pdu_finalize(request2);
925 				if (error != 0) {
926 					ICL_DEBUG("failed to finalize PDU; "
927 					    "dropping connection");
928 					icl_conn_fail(ic);
929 					icl_pdu_free(request);
930 					icl_pdu_free(request2);
931 					return;
932 				}
933 				m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
934 				request2->ip_bhs_mbuf = NULL;
935 				request->ip_bhs_mbuf->m_pkthdr.len += size2;
936 				size += size2;
937 				STAILQ_REMOVE_AFTER(queue, request, ip_next);
938 				icl_pdu_free(request2);
939 				coalesced++;
940 			}
941 #if 0
942 			if (coalesced > 1) {
943 				ICL_DEBUG("coalesced %d PDUs into %zd bytes",
944 				    coalesced, size);
945 			}
946 #endif
947 		}
948 		available -= size;
949 		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
950 		    NULL, MSG_DONTWAIT, curthread);
951 		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
952 		if (error != 0) {
953 			ICL_DEBUG("failed to send PDU, error %d; "
954 			    "dropping connection", error);
955 			icl_conn_fail(ic);
956 			icl_pdu_free(request);
957 			return;
958 		}
959 		icl_pdu_free(request);
960 	}
961 }
962 
963 static void
icl_send_thread(void * arg)964 icl_send_thread(void *arg)
965 {
966 	struct icl_conn *ic;
967 	struct icl_pdu_stailq queue;
968 
969 	ic = arg;
970 
971 	STAILQ_INIT(&queue);
972 
973 	ICL_CONN_LOCK(ic);
974 	ic->ic_send_running = true;
975 
976 	for (;;) {
977 		for (;;) {
978 			/*
979 			 * If the local queue is empty, populate it from
980 			 * the main one.  This way the icl_conn_send_pdus()
981 			 * can go through all the queued PDUs without holding
982 			 * any locks.
983 			 */
984 			if (STAILQ_EMPTY(&queue))
985 				STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);
986 
987 			ic->ic_check_send_space = false;
988 			ICL_CONN_UNLOCK(ic);
989 			icl_conn_send_pdus(ic, &queue);
990 			ICL_CONN_LOCK(ic);
991 
992 			/*
993 			 * The icl_soupcall_send() was called since the last
994 			 * call to sbspace(); go around;
995 			 */
996 			if (ic->ic_check_send_space)
997 				continue;
998 
999 			/*
1000 			 * Local queue is empty, but we still have PDUs
1001 			 * in the main one; go around.
1002 			 */
1003 			if (STAILQ_EMPTY(&queue) &&
1004 			    !STAILQ_EMPTY(&ic->ic_to_send))
1005 				continue;
1006 
1007 			/*
1008 			 * There might be some stuff in the local queue,
1009 			 * which didn't get sent due to not having enough send
1010 			 * space.  Wait for socket upcall.
1011 			 */
1012 			break;
1013 		}
1014 
1015 		if (ic->ic_disconnecting) {
1016 			//ICL_DEBUG("terminating");
1017 			break;
1018 		}
1019 
1020 		cv_wait(&ic->ic_send_cv, ic->ic_lock);
1021 	}
1022 
1023 	/*
1024 	 * We're exiting; move PDUs back to the main queue, so they can
1025 	 * get freed properly.  At this point ordering doesn't matter.
1026 	 */
1027 	STAILQ_CONCAT(&ic->ic_to_send, &queue);
1028 
1029 	ic->ic_send_running = false;
1030 	cv_signal(&ic->ic_send_cv);
1031 	ICL_CONN_UNLOCK(ic);
1032 	kthread_exit();
1033 }
1034 
1035 static int
icl_soupcall_send(struct socket * so,void * arg,int waitflag)1036 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
1037 {
1038 	struct icl_conn *ic;
1039 
1040 	if (!sowriteable(so))
1041 		return (SU_OK);
1042 
1043 	ic = arg;
1044 
1045 	ICL_CONN_LOCK(ic);
1046 	ic->ic_check_send_space = true;
1047 	ICL_CONN_UNLOCK(ic);
1048 
1049 	cv_signal(&ic->ic_send_cv);
1050 
1051 	return (SU_OK);
1052 }
1053 
1054 int
icl_pdu_append_data(struct icl_pdu * request,const void * addr,size_t len,int flags)1055 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len,
1056     int flags)
1057 {
1058 	struct mbuf *mb, *newmb;
1059 	size_t copylen, off = 0;
1060 
1061 	KASSERT(len > 0, ("len == 0"));
1062 
1063 	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
1064 	if (newmb == NULL) {
1065 		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
1066 		return (ENOMEM);
1067 	}
1068 
1069 	for (mb = newmb; mb != NULL; mb = mb->m_next) {
1070 		copylen = min(M_TRAILINGSPACE(mb), len - off);
1071 		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
1072 		mb->m_len = copylen;
1073 		off += copylen;
1074 	}
1075 	KASSERT(off == len, ("%s: off != len", __func__));
1076 
1077 	if (request->ip_data_mbuf == NULL) {
1078 		request->ip_data_mbuf = newmb;
1079 		request->ip_data_len = len;
1080 	} else {
1081 		m_cat(request->ip_data_mbuf, newmb);
1082 		request->ip_data_len += len;
1083 	}
1084 
1085 	return (0);
1086 }
1087 
1088 void
icl_pdu_get_data(struct icl_pdu * ip,size_t off,void * addr,size_t len)1089 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
1090 {
1091 
1092 	m_copydata(ip->ip_data_mbuf, off, len, addr);
1093 }
1094 
1095 void
icl_pdu_queue(struct icl_pdu * ip)1096 icl_pdu_queue(struct icl_pdu *ip)
1097 {
1098 	struct icl_conn *ic;
1099 
1100 	ic = ip->ip_conn;
1101 
1102 	ICL_CONN_LOCK_ASSERT(ic);
1103 
1104 	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
1105 		ICL_DEBUG("icl_pdu_queue on closed connection");
1106 		icl_pdu_free(ip);
1107 		return;
1108 	}
1109 
1110 	if (!STAILQ_EMPTY(&ic->ic_to_send)) {
1111 		STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1112 		/*
1113 		 * If the queue is not empty, someone else had already
1114 		 * signaled the send thread; no need to do that again,
1115 		 * just return.
1116 		 */
1117 		return;
1118 	}
1119 
1120 	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1121 	cv_signal(&ic->ic_send_cv);
1122 }
1123 
1124 struct icl_conn *
icl_conn_new(const char * name,struct mtx * lock)1125 icl_conn_new(const char *name, struct mtx *lock)
1126 {
1127 	struct icl_conn *ic;
1128 
1129 	refcount_acquire(&icl_ncons);
1130 
1131 	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
1132 
1133 	STAILQ_INIT(&ic->ic_to_send);
1134 	ic->ic_lock = lock;
1135 	cv_init(&ic->ic_send_cv, "icl_tx");
1136 	cv_init(&ic->ic_receive_cv, "icl_rx");
1137 #ifdef DIAGNOSTIC
1138 	refcount_init(&ic->ic_outstanding_pdus, 0);
1139 #endif
1140 	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
1141 	ic->ic_name = name;
1142 
1143 	return (ic);
1144 }
1145 
1146 void
icl_conn_free(struct icl_conn * ic)1147 icl_conn_free(struct icl_conn *ic)
1148 {
1149 
1150 	cv_destroy(&ic->ic_send_cv);
1151 	cv_destroy(&ic->ic_receive_cv);
1152 	uma_zfree(icl_conn_zone, ic);
1153 	refcount_release(&icl_ncons);
1154 }
1155 
1156 static int
icl_conn_start(struct icl_conn * ic)1157 icl_conn_start(struct icl_conn *ic)
1158 {
1159 	size_t minspace;
1160 	struct sockopt opt;
1161 	int error, one = 1;
1162 
1163 	ICL_CONN_LOCK(ic);
1164 
1165 	/*
1166 	 * XXX: Ugly hack.
1167 	 */
1168 	if (ic->ic_socket == NULL) {
1169 		ICL_CONN_UNLOCK(ic);
1170 		return (EINVAL);
1171 	}
1172 
1173 	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1174 	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1175 	ic->ic_disconnecting = false;
1176 
1177 	ICL_CONN_UNLOCK(ic);
1178 
1179 	/*
1180 	 * For sendspace, this is required because the current code cannot
1181 	 * send a PDU in pieces; thus, the minimum buffer size is equal
1182 	 * to the maximum PDU size.  "+4" is to account for possible padding.
1183 	 *
1184 	 * What we should actually do here is to use autoscaling, but set
1185 	 * some minimal buffer size to "minspace".  I don't know a way to do
1186 	 * that, though.
1187 	 */
1188 	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1189 	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1190 	if (sendspace < minspace) {
1191 		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1192 		    minspace);
1193 		sendspace = minspace;
1194 	}
1195 	if (recvspace < minspace) {
1196 		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1197 		    minspace);
1198 		recvspace = minspace;
1199 	}
1200 
1201 	error = soreserve(ic->ic_socket, sendspace, recvspace);
1202 	if (error != 0) {
1203 		ICL_WARN("soreserve failed with error %d", error);
1204 		icl_conn_close(ic);
1205 		return (error);
1206 	}
1207 	ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
1208 	ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;
1209 
1210 	/*
1211 	 * Disable Nagle.
1212 	 */
1213 	bzero(&opt, sizeof(opt));
1214 	opt.sopt_dir = SOPT_SET;
1215 	opt.sopt_level = IPPROTO_TCP;
1216 	opt.sopt_name = TCP_NODELAY;
1217 	opt.sopt_val = &one;
1218 	opt.sopt_valsize = sizeof(one);
1219 	error = sosetopt(ic->ic_socket, &opt);
1220 	if (error != 0) {
1221 		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1222 		icl_conn_close(ic);
1223 		return (error);
1224 	}
1225 
1226 	/*
1227 	 * Start threads.
1228 	 */
1229 	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1230 	    ic->ic_name);
1231 	if (error != 0) {
1232 		ICL_WARN("kthread_add(9) failed with error %d", error);
1233 		icl_conn_close(ic);
1234 		return (error);
1235 	}
1236 
1237 	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1238 	    ic->ic_name);
1239 	if (error != 0) {
1240 		ICL_WARN("kthread_add(9) failed with error %d", error);
1241 		icl_conn_close(ic);
1242 		return (error);
1243 	}
1244 
1245 	/*
1246 	 * Register socket upcall, to get notified about incoming PDUs
1247 	 * and free space to send outgoing ones.
1248 	 */
1249 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1250 	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1251 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1252 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1253 	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1254 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1255 
1256 	return (0);
1257 }
1258 
1259 int
icl_conn_handoff(struct icl_conn * ic,int fd)1260 icl_conn_handoff(struct icl_conn *ic, int fd)
1261 {
1262 	struct file *fp;
1263 	struct socket *so;
1264 	cap_rights_t rights;
1265 	int error;
1266 
1267 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1268 
1269 	/*
1270 	 * Steal the socket from userland.
1271 	 */
1272 	error = fget(curthread, fd,
1273 	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1274 	if (error != 0)
1275 		return (error);
1276 	if (fp->f_type != DTYPE_SOCKET) {
1277 		fdrop(fp, curthread);
1278 		return (EINVAL);
1279 	}
1280 	so = fp->f_data;
1281 	if (so->so_type != SOCK_STREAM) {
1282 		fdrop(fp, curthread);
1283 		return (EINVAL);
1284 	}
1285 
1286 	ICL_CONN_LOCK(ic);
1287 
1288 	if (ic->ic_socket != NULL) {
1289 		ICL_CONN_UNLOCK(ic);
1290 		fdrop(fp, curthread);
1291 		return (EBUSY);
1292 	}
1293 
1294 	ic->ic_socket = fp->f_data;
1295 	fp->f_ops = &badfileops;
1296 	fp->f_data = NULL;
1297 	fdrop(fp, curthread);
1298 	ICL_CONN_UNLOCK(ic);
1299 
1300 	error = icl_conn_start(ic);
1301 
1302 	return (error);
1303 }
1304 
1305 void
icl_conn_close(struct icl_conn * ic)1306 icl_conn_close(struct icl_conn *ic)
1307 {
1308 	struct icl_pdu *pdu;
1309 
1310 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1311 
1312 	ICL_CONN_LOCK(ic);
1313 	if (ic->ic_socket == NULL) {
1314 		ICL_CONN_UNLOCK(ic);
1315 		return;
1316 	}
1317 
1318 	/*
1319 	 * Deregister socket upcalls.
1320 	 */
1321 	ICL_CONN_UNLOCK(ic);
1322 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1323 	if (ic->ic_socket->so_snd.sb_upcall != NULL)
1324 		soupcall_clear(ic->ic_socket, SO_SND);
1325 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1326 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1327 	if (ic->ic_socket->so_rcv.sb_upcall != NULL)
1328 		soupcall_clear(ic->ic_socket, SO_RCV);
1329 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1330 	ICL_CONN_LOCK(ic);
1331 
1332 	ic->ic_disconnecting = true;
1333 
1334 	/*
1335 	 * Wake up the threads, so they can properly terminate.
1336 	 */
1337 	while (ic->ic_receive_running || ic->ic_send_running) {
1338 		//ICL_DEBUG("waiting for send/receive threads to terminate");
1339 		cv_signal(&ic->ic_receive_cv);
1340 		cv_signal(&ic->ic_send_cv);
1341 		cv_wait(&ic->ic_send_cv, ic->ic_lock);
1342 	}
1343 	//ICL_DEBUG("send/receive threads terminated");
1344 
1345 	ICL_CONN_UNLOCK(ic);
1346 	soclose(ic->ic_socket);
1347 	ICL_CONN_LOCK(ic);
1348 	ic->ic_socket = NULL;
1349 
1350 	if (ic->ic_receive_pdu != NULL) {
1351 		//ICL_DEBUG("freeing partially received PDU");
1352 		icl_pdu_free(ic->ic_receive_pdu);
1353 		ic->ic_receive_pdu = NULL;
1354 	}
1355 
1356 	/*
1357 	 * Remove any outstanding PDUs from the send queue.
1358 	 */
1359 	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
1360 		pdu = STAILQ_FIRST(&ic->ic_to_send);
1361 		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
1362 		icl_pdu_free(pdu);
1363 	}
1364 
1365 	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
1366 	    ("destroying session with non-empty send queue"));
1367 #ifdef DIAGNOSTIC
1368 	KASSERT(ic->ic_outstanding_pdus == 0,
1369 	    ("destroying session with %d outstanding PDUs",
1370 	     ic->ic_outstanding_pdus));
1371 #endif
1372 	ICL_CONN_UNLOCK(ic);
1373 }
1374 
1375 bool
icl_conn_connected(struct icl_conn * ic)1376 icl_conn_connected(struct icl_conn *ic)
1377 {
1378 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1379 
1380 	ICL_CONN_LOCK(ic);
1381 	if (ic->ic_socket == NULL) {
1382 		ICL_CONN_UNLOCK(ic);
1383 		return (false);
1384 	}
1385 	if (ic->ic_socket->so_error != 0) {
1386 		ICL_CONN_UNLOCK(ic);
1387 		return (false);
1388 	}
1389 	ICL_CONN_UNLOCK(ic);
1390 	return (true);
1391 }
1392 
#ifdef ICL_KERNEL_PROXY
/*
 * Attach socket "so" to the connection and start it.  Must be called
 * without the connection lock held.
 *
 * Returns EINVAL if the socket is not SOCK_STREAM, EBUSY if the
 * connection already has a socket, and otherwise the result of
 * icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	/* Claim the socket slot under the lock, unless already taken. */
	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1417 
1418 static int
icl_unload(void)1419 icl_unload(void)
1420 {
1421 
1422 	if (icl_ncons != 0)
1423 		return (EBUSY);
1424 
1425 	uma_zdestroy(icl_conn_zone);
1426 	uma_zdestroy(icl_pdu_zone);
1427 
1428 	return (0);
1429 }
1430 
1431 static void
icl_load(void)1432 icl_load(void)
1433 {
1434 
1435 	icl_conn_zone = uma_zcreate("icl_conn",
1436 	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1437 	    UMA_ALIGN_PTR, 0);
1438 	icl_pdu_zone = uma_zcreate("icl_pdu",
1439 	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1440 	    UMA_ALIGN_PTR, 0);
1441 
1442 	refcount_init(&icl_ncons, 0);
1443 }
1444 
1445 static int
icl_modevent(module_t mod,int what,void * arg)1446 icl_modevent(module_t mod, int what, void *arg)
1447 {
1448 
1449 	switch (what) {
1450 	case MOD_LOAD:
1451 		icl_load();
1452 		return (0);
1453 	case MOD_UNLOAD:
1454 		return (icl_unload());
1455 	default:
1456 		return (EINVAL);
1457 	}
1458 }
1459 
1460 moduledata_t icl_data = {
1461 	"icl",
1462 	icl_modevent,
1463 	0
1464 };
1465 
1466 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1467 MODULE_VERSION(icl, 1);
1468