1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1991, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Rick Macklem at The University of Guelph.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35
36 #include <sys/cdefs.h>
37 /*
38 * Socket operations for use by nfs
39 */
40
41 #include "opt_kgssapi.h"
42 #include "opt_nfs.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/limits.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #include <sys/mount.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 #include <sys/vnode.h>
59
60 #include <rpc/rpc.h>
61 #include <rpc/krpc.h>
62
63 #include <kgssapi/krb5/kcrypto.h>
64
65 #include <fs/nfs/nfsport.h>
66
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

/*
 * Hooks called around client RPCs when DTrace support is compiled in.
 * newnfs_request() only invokes the start hook when the pointer is
 * non-NULL and the probe id looked up for the RPC is non-zero.
 */
dtrace_nfsclient_nfs23_start_probe_func_t
		dtrace_nfscl_nfs234_start_probe;

dtrace_nfsclient_nfs23_done_probe_func_t
		dtrace_nfscl_nfs234_done_probe;

/*
 * Registered probes by RPC type.
 * One start and one done table per protocol version, each holding
 * NFSV41_NPROCS + 1 probe ids.  newnfs_request() indexes the NFSv2 and
 * NFSv4 start tables by nd_procnum and the NFSv3 start table by the
 * (possibly remapped) procnum; an id of 0 means no probe is fired.
 */
uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
#endif
88
/*
 * Lock declaration macros; their expansions are defined outside this file.
 * NOTE(review): presumably these back NFSLOCKSTATE(), NFSLOCKREQ() and
 * NFSD_LOCK() used below -- confirm against nfsport.h.
 */
NFSSTATESPINLOCK;
NFSREQSPINLOCK;
NFSDLOCKMUTEX;
NFSCLSTATEMUTEX;
extern struct nfsstatsv1 nfsstatsv1;	/* RPC counters bumped in newnfs_request() */
extern struct nfsreqhead nfsd_reqq;	/* queue of outstanding NFSv4 client requests */
extern int nfscl_ticks;
extern void (*ncl_call_invalcaches)(struct vnode *);
extern int nfs_numnfscbd;		/* checked/held under NFSD_LOCK() while using nfscbd_pool */
extern int nfscl_debuglevel;
extern int nfsrv_lease;			/* lease / 4 (min 10s) is used as the DS timeout */

SVCPOOL		*nfscbd_pool;		/* pool handed to svc_vc_create_backchannel() */
/*
 * Multiplier for socket buffer reservations; newnfs_connect() clamps the
 * value it uses to the inclusive range 2..64.
 */
int		nfs_bufpackets = 4;
/* Gates RPCSEC_GSS on callbacks; never set non-zero in the visible code. */
static int	nfsrv_gsscallbackson = 0;
static int	nfs_reconnects;		/* exported read-only as vfs.nfs.reconnects */
static int	nfs3_jukebox_delay = 10;	/* seconds, see vfs.nfs.nfs3_jukebox_delay */
static int	nfs_skip_wcc_data_onerr = 1;
static int	nfs_dsretries = 2;	/* CLSET_RETRIES for DS connects (cred == NULL) */
/* Initialised to NFS_TRYLATERDEL seconds; its use is outside the visible code. */
static struct timespec	nfs_trylater_max = {
	.tv_sec		= NFS_TRYLATERDEL,
	.tv_nsec	= 0,
};
112
/*
 * Tunables published under the vfs.nfs sysctl node.  All are read/write
 * except "reconnects", which is read-only.
 */
SYSCTL_DECL(_vfs_nfs);

/* Note: newnfs_connect() accepts 2 and 64 themselves (values are clamped). */
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
    "Buffer reservation size 2 < x < 64");
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
    "Number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
    "Number of seconds to delay a retry after receiving EJUKEBOX");
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
    "Disable weak cache consistency checking when server returns an error");
/* Consumed by newnfs_connect() as the retry limit for DS connections. */
SYSCTL_INT(_vfs_nfs, OID_AUTO, dsretries, CTLFLAG_RW, &nfs_dsretries, 0,
    "Number of retries for a DS RPC before failure");
125
126 static void nfs_down(struct nfsmount *, struct thread *, const char *,
127 int, int);
128 static void nfs_up(struct nfsmount *, struct thread *, const char *,
129 int, int);
130 static int nfs_msg(struct thread *, const char *, const char *, int);
131
/*
 * A reference-counted RPC auth handle associated with one uid.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure -- confirm whether it is still needed.
 */
struct nfs_cached_auth {
	int		ca_refs; /* refcount, including 1 from the cache */
	uid_t		ca_uid;	 /* uid that corresponds to this auth */
	AUTH		*ca_auth; /* RPC auth handle */
};
137
/*
 * Translation table from the procedure number kept in nd_procnum to the
 * NFSv2 procedure number placed in the RPC call.  newnfs_request()
 * consults it for ND_NFSV2 requests whose nd_procnum is below
 * NFS_V3NPROCS; anything larger is sent as NFSV2PROC_NOOP.
 * NFSV2PROC_NOOP entries are presumably procedures without an NFSv2
 * counterpart, and the index presumably follows NFSv3 procedure
 * numbering (RFC 1813) -- confirm against the NFSPROC_* definitions.
 */
static int nfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,		/*  0 */
	NFSV2PROC_GETATTR,	/*  1 */
	NFSV2PROC_SETATTR,	/*  2 */
	NFSV2PROC_LOOKUP,	/*  3 */
	NFSV2PROC_NOOP,		/*  4 */
	NFSV2PROC_READLINK,	/*  5 */
	NFSV2PROC_READ,		/*  6 */
	NFSV2PROC_WRITE,	/*  7 */
	NFSV2PROC_CREATE,	/*  8 */
	NFSV2PROC_MKDIR,	/*  9 */
	NFSV2PROC_SYMLINK,	/* 10 */
	NFSV2PROC_CREATE,	/* 11: shares CREATE with index 8 */
	NFSV2PROC_REMOVE,	/* 12 */
	NFSV2PROC_RMDIR,	/* 13 */
	NFSV2PROC_RENAME,	/* 14 */
	NFSV2PROC_LINK,		/* 15 */
	NFSV2PROC_READDIR,	/* 16 */
	NFSV2PROC_NOOP,		/* 17 */
	NFSV2PROC_STATFS,	/* 18 */
	NFSV2PROC_NOOP,		/* 19 */
	NFSV2PROC_NOOP,		/* 20 */
	NFSV2PROC_NOOP,		/* 21 */
};
162
/*
 * This static array indicates whether an NFSv4 RPC should use
 * RPCSEC_GSS when the mount requests it via sec=krb5[ip].
 * System RPCs that do not use file handles are false
 * in this array so that they will use AUTH_SYS when the
 * "syskrb5" mount option is specified, along with
 * "sec=krb5[ip]".
 * The array is indexed by nd_procnum; newnfs_request() only consults
 * it when NFSHASSYSKRB5() is true for the mount.
 */
static bool nfscl_use_gss[NFSV42_NPROCS] = {
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* SetClientID */
	false,		/* SetClientIDConfirm */
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* Renew */
	true,
	false,		/* ReleaseLockOwn */
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* ExchangeID */
	false,		/* CreateSession */
	false,		/* DestroySession */
	false,		/* DestroyClientID */
	false,		/* FreeStateID */
	true,
	true,
	true,
	true,
	false,		/* ReclaimComplete */
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* BindConnectionToSession */
	true,
	true,
	true,
	true,
};
243
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 * Which arguments are set to NULL indicate what kind of call it is.
 * cred == NULL --> a call to connect to a pNFS DS
 * nmp == NULL --> indicates an upcall to userland or a NFSv4.0 callback
 *
 * Arguments:
 *   nmp    - the mount, or NULL (see above).
 *   nrp    - describes the peer (nr_nam, nr_sotype, nr_soproto, nr_prog,
 *            nr_vers); nr_cred, when non-NULL, is the credential used while
 *            connecting and nr_mtx protects the store through clipp.
 *   cred   - credential used while connecting when nrp->nr_cred is NULL.
 *   p      - not referenced by this function.
 *   callback_retry_mult - only consulted when nmp == NULL: 0 selects
 *            NFSV4_UPCALLRETRY retries and a privileged port, any other
 *            value multiplies NFSV4_CALLBACKRETRY.
 *   dotls  - only consulted when nmp == NULL: request TLS on the client.
 *   clipp  - where the new client handle is stored.  If *clipp is already
 *            non-NULL once nr_mtx is held, the new handle is released and
 *            the existing one is left in place.
 *
 * Returns 0 on success, otherwise the error from socreate() or soreserve().
 * The calling thread's td_ucred is swapped for the duration of the call and
 * restored on every exit path.
 */
int
newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
    struct ucred *cred, NFSPROC_T *p, int callback_retry_mult, bool dotls,
    struct __rpc_client **clipp)
{
	int rcvreserve, sndreserve;
	int pktscale, pktscalesav;
	struct sockaddr *saddr;
	struct ucred *origcred;
	CLIENT *client;
	struct netconfig *nconf;
	struct socket *so;
	int one = 1, retries, error = 0;
	struct thread *td = curthread;
	SVCXPRT *xprt;
	struct timeval timo;
	uint64_t tval;

	/*
	 * We need to establish the socket using the credentials of
	 * the mountpoint.  Some parts of this process (such as
	 * sobind() and soconnect()) will use the current thread's
	 * credential instead of the socket credential.  To work
	 * around this, temporarily change the current thread's
	 * credential to that of the mountpoint.
	 *
	 * XXX: It would be better to explicitly pass the correct
	 * credential to sobind() and soconnect().
	 */
	origcred = td->td_ucred;

	/*
	 * Use the credential in nr_cred, if not NULL.
	 */
	if (nrp->nr_cred != NULL)
		td->td_ucred = nrp->nr_cred;
	else
		td->td_ucred = cred;
	saddr = nrp->nr_nam;

	/*
	 * Pick the netconfig entry from address family and socket type.
	 * Any family other than AF_INET is treated as IPv6 here.
	 */
	if (saddr->sa_family == AF_INET)
		if (nrp->nr_sotype == SOCK_DGRAM)
			nconf = getnetconfigent("udp");
		else
			nconf = getnetconfigent("tcp");
	else
		if (nrp->nr_sotype == SOCK_DGRAM)
			nconf = getnetconfigent("udp6");
		else
			nconf = getnetconfigent("tcp6");

	/* Clamp the tunable to 2..64 and remember the starting value. */
	pktscale = nfs_bufpackets;
	if (pktscale < 2)
		pktscale = 2;
	if (pktscale > 64)
		pktscale = 64;
	pktscalesav = pktscale;
	/*
	 * soreserve() can fail if sb_max is too small, so shrink pktscale
	 * and try again if there is an error.
	 * Print a log message suggesting increasing sb_max.
	 * Creating a socket and doing this is necessary since, if the
	 * reservation sizes are too large and will make soreserve() fail,
	 * the connection will work until a large send is attempted and
	 * then it will loop in the krpc code.
	 */
	so = NULL;
	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
	    nrp->nr_soproto, td->td_ucred, td);
	if (error != 0)
		goto out;
	/*
	 * "so" is a scratch socket used only to probe which reservation
	 * sizes soreserve() accepts; it is closed right after the loop.
	 * On the first pass error is 0, so nothing is shrunk.
	 */
	do {
		if (error != 0 && pktscale > 2) {
			/* Only log the suggestion on the first failure. */
			if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
			    pktscale == pktscalesav) {
				/*
				 * Suggest vfs.nfs.bufpackets * maximum RPC message,
				 * adjusted for the sb_max->sb_max_adj conversion of
				 * MCLBYTES / (MSIZE + MCLBYTES) as the minimum setting
				 * for kern.ipc.maxsockbuf.
				 */
				tval = (NFS_MAXBSIZE + NFS_MAXXDR) * nfs_bufpackets;
				tval *= MSIZE + MCLBYTES;
				tval += MCLBYTES - 1; /* Round up divide by MCLBYTES. */
				tval /= MCLBYTES;
				printf("Consider increasing kern.ipc.maxsockbuf to a "
				    "minimum of %ju to support %ubyte NFS I/O\n",
				    (uintmax_t)tval, NFS_MAXBSIZE);
			}
			pktscale--;
		}
		/*
		 * Mounts reserve room for pktscale full-sized messages;
		 * upcalls/callbacks (nmp == NULL) use 1024 bytes per unit.
		 */
		if (nrp->nr_sotype == SOCK_DGRAM) {
			if (nmp != NULL) {
				sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
				    pktscale;
				rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
				    pktscale;
			} else {
				sndreserve = rcvreserve = 1024 * pktscale;
			}
		} else {
			if (nrp->nr_sotype != SOCK_STREAM)
				panic("nfscon sotype");
			if (nmp != NULL) {
				sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
				    pktscale;
				rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
				    pktscale;
			} else {
				sndreserve = rcvreserve = 1024 * pktscale;
			}
		}
		error = soreserve(so, sndreserve, rcvreserve);
		/* Even the minimum scale failed: nothing left to shrink. */
		if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
		    pktscale <= 2)
			printf("Must increase kern.ipc.maxsockbuf or reduce"
			    " rsize, wsize\n");
	} while (error != 0 && pktscale > 2);
	soclose(so);
	if (error != 0)
		goto out;

	/*
	 * NOTE(review): the handle returned by clnt_reconnect_create() is
	 * used without a NULL check -- presumably it cannot fail here; confirm.
	 */
	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
	    nrp->nr_vers, sndreserve, rcvreserve);
	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
	if (nmp != NULL) {
		/* Client-side mount: apply the mount's options. */
		if ((nmp->nm_flag & NFSMNT_INT))
			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
		if ((nmp->nm_flag & NFSMNT_RESVPORT))
			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
		if (NFSHASTLS(nmp)) {
			CLNT_CONTROL(client, CLSET_TLS, &one);
			if (nmp->nm_tlscertname != NULL)
				CLNT_CONTROL(client, CLSET_TLSCERTNAME,
				    nmp->nm_tlscertname);
		}
		if (NFSHASSOFT(nmp)) {
			if (nmp->nm_sotype == SOCK_DGRAM)
				/*
				 * For UDP, the large timeout for a reconnect
				 * will be set to "nm_retry * nm_timeo / 2", so
				 * we only want to do 2 reconnect timeout
				 * retries.
				 */
				retries = 2;
			else
				retries = nmp->nm_retry;
		} else
			retries = INT_MAX;
		if (NFSHASNFSV4N(nmp)) {
			if (cred != NULL) {
				if (NFSHASSOFT(nmp)) {
					/*
					 * This should be a DS mount.
					 * Use CLSET_TIMEOUT to set the timeout
					 * for connections to DSs instead of
					 * specifying a timeout on each RPC.
					 * This is done so that SO_SNDTIMEO
					 * is set on the TCP socket as well
					 * as specifying a time limit when
					 * waiting for an RPC reply.  Useful
					 * if the send queue for the TCP
					 * connection has become constipated,
					 * due to a failed DS.
					 * The choice of lease_duration / 4 is
					 * fairly arbitrary, but seems to work
					 * ok, with a lower bound of 10sec.
					 */
					timo.tv_sec = nfsrv_lease / 4;
					if (timo.tv_sec < 10)
						timo.tv_sec = 10;
					timo.tv_usec = 0;
					CLNT_CONTROL(client, CLSET_TIMEOUT,
					    &timo);
				}
				/*
				 * Make sure the nfscbd_pool doesn't get
				 * destroyed while doing this.
				 * The count is bumped under NFSD_LOCK(), the
				 * lock is dropped around the backchannel
				 * setup, and a waiter on &nfs_numnfscbd is
				 * woken if the count returns to zero.
				 */
				NFSD_LOCK();
				if (nfs_numnfscbd > 0) {
					nfs_numnfscbd++;
					NFSD_UNLOCK();
					xprt = svc_vc_create_backchannel(
					    nfscbd_pool);
					CLNT_CONTROL(client, CLSET_BACKCHANNEL,
					    xprt);
					NFSD_LOCK();
					nfs_numnfscbd--;
					if (nfs_numnfscbd == 0)
						wakeup(&nfs_numnfscbd);
				}
				NFSD_UNLOCK();
			} else {
				/*
				 * cred == NULL for a DS connect.
				 * For connects to a DS, set a retry limit
				 * so that failed DSs will be detected.
				 * This is ok for NFSv4.1, since a DS does
				 * not maintain open/lock state and is the
				 * only case where using a "soft" mount is
				 * recommended for NFSv4.
				 * For mounts from the MDS to DS, this is done
				 * via mount options, but that is not the case
				 * here.  The retry limit here can be adjusted
				 * via the sysctl vfs.nfs.dsretries.
				 * See the comment above w.r.t. timeout.
				 */
				timo.tv_sec = nfsrv_lease / 4;
				if (timo.tv_sec < 10)
					timo.tv_sec = 10;
				timo.tv_usec = 0;
				CLNT_CONTROL(client, CLSET_TIMEOUT, &timo);
				retries = nfs_dsretries;
			}
		}
	} else {
		/*
		 * Three cases:
		 * - Null RPC callback to client
		 * - Non-Null RPC callback to client, wait a little longer
		 * - upcalls to nfsuserd and gssd (clp == NULL)
		 */
		if (callback_retry_mult == 0) {
			retries = NFSV4_UPCALLRETRY;
			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
		} else {
			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
		}
		if (dotls)
			CLNT_CONTROL(client, CLSET_TLS, &one);
	}
	CLNT_CONTROL(client, CLSET_RETRIES, &retries);

	if (nmp != NULL) {
		/*
		 * For UDP, there are 2 timeouts:
		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
		 *   that does a retransmit of an RPC request using the same
		 *   socket and xid.  This is what you normally want to do,
		 *   since NFS servers depend on "same xid" for their
		 *   Duplicate Request Cache.
		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
		 *   retransmits on the same socket should fail and a fresh
		 *   socket created.  Each of these timeouts counts as one
		 *   CLSET_RETRIES as set above.
		 * Set the initial retransmit timeout for UDP.  This timeout
		 * doesn't exist for TCP and the following call just fails,
		 * which is ok.
		 * nm_timeo is in units of 1/NFS_HZ seconds; split it into
		 * whole seconds and microseconds.
		 */
		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
	}

	/*
	 * *clipp is &nrp->nr_client or &nm_aconn[nmp->nm_nextaconn].
	 * The latter case is for additional connections specified by the
	 * "nconnect" mount option.  nr_mtx etc is used for these additional
	 * connections, as well as nr_client in the nfssockreq
	 * structure for the mount.
	 */
	mtx_lock(&nrp->nr_mtx);
	if (*clipp != NULL) {
		mtx_unlock(&nrp->nr_mtx);
		/*
		 * Someone else already connected.
		 */
		CLNT_RELEASE(client);
	} else {
		*clipp = client;
		/*
		 * Protocols that do not require connections may be optionally
		 * left unconnected for servers that reply from a port other
		 * than NFS_PORT.
		 * In both branches nr_mtx is dropped before anything else is
		 * done with the client.
		 */
		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
			mtx_unlock(&nrp->nr_mtx);
			CLNT_CONTROL(client, CLSET_CONNECT, &one);
		} else
			mtx_unlock(&nrp->nr_mtx);
	}

out:
	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	NFSEXITCODE(error);
	return (error);
}
542
543 /*
544 * NFS disconnect. Clean up and unlink.
545 */
546 void
newnfs_disconnect(struct nfsmount * nmp,struct nfssockreq * nrp)547 newnfs_disconnect(struct nfsmount *nmp, struct nfssockreq *nrp)
548 {
549 CLIENT *client, *aconn[NFS_MAXNCONN - 1];
550 int i;
551
552 mtx_lock(&nrp->nr_mtx);
553 if (nrp->nr_client != NULL) {
554 client = nrp->nr_client;
555 nrp->nr_client = NULL;
556 if (nmp != NULL && nmp->nm_aconnect > 0) {
557 for (i = 0; i < nmp->nm_aconnect; i++) {
558 aconn[i] = nmp->nm_aconn[i];
559 nmp->nm_aconn[i] = NULL;
560 }
561 }
562 mtx_unlock(&nrp->nr_mtx);
563 rpc_gss_secpurge_call(client);
564 CLNT_CLOSE(client);
565 CLNT_RELEASE(client);
566 if (nmp != NULL && nmp->nm_aconnect > 0) {
567 for (i = 0; i < nmp->nm_aconnect; i++) {
568 if (aconn[i] != NULL) {
569 rpc_gss_secpurge_call(aconn[i]);
570 CLNT_CLOSE(aconn[i]);
571 CLNT_RELEASE(aconn[i]);
572 }
573 }
574 }
575 } else {
576 mtx_unlock(&nrp->nr_mtx);
577 }
578 }
579
580 static AUTH *
nfs_getauth(struct nfssockreq * nrp,int secflavour,char * clnt_principal,char * srv_principal,gss_OID mech_oid,struct ucred * cred)581 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
582 char *srv_principal, gss_OID mech_oid, struct ucred *cred)
583 {
584 rpc_gss_service_t svc;
585 AUTH *auth;
586
587 switch (secflavour) {
588 case RPCSEC_GSS_KRB5:
589 case RPCSEC_GSS_KRB5I:
590 case RPCSEC_GSS_KRB5P:
591 if (!mech_oid) {
592 if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
593 return (NULL);
594 }
595 if (secflavour == RPCSEC_GSS_KRB5)
596 svc = rpc_gss_svc_none;
597 else if (secflavour == RPCSEC_GSS_KRB5I)
598 svc = rpc_gss_svc_integrity;
599 else
600 svc = rpc_gss_svc_privacy;
601
602 if (clnt_principal == NULL)
603 auth = rpc_gss_secfind_call(nrp->nr_client, cred,
604 srv_principal, mech_oid, svc);
605 else {
606 auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
607 clnt_principal, srv_principal, "kerberosv5",
608 svc, NULL, NULL, NULL);
609 return (auth);
610 }
611 if (auth != NULL)
612 return (auth);
613 /* fallthrough */
614 case AUTH_SYS:
615 default:
616 return (authunix_create(cred));
617 }
618 }
619
/*
 * Callback from the RPC code to generate up/down notifications.
 *
 * State handed to nfs_feedback() through rc_feedback_arg.
 * newnfs_request() zeroes it and fills it in once per request.
 */

struct nfs_feedback_arg {
	struct nfsmount *nf_mount;	/* mount the RPC belongs to */
	int		nf_lastmsg;	/* last tprintf */
	int		nf_tprintfmsg;	/* set to TRUE once a "not responding" report was made */
	struct thread	*nf_td;		/* thread passed on to nfs_down()/nfs_up() */
};
630
631 static void
nfs_feedback(int type,int proc,void * arg)632 nfs_feedback(int type, int proc, void *arg)
633 {
634 struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
635 struct nfsmount *nmp = nf->nf_mount;
636 time_t now;
637
638 switch (type) {
639 case FEEDBACK_REXMIT2:
640 case FEEDBACK_RECONNECT:
641 now = NFSD_MONOSEC;
642 if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
643 nfs_down(nmp, nf->nf_td,
644 "not responding", 0, NFSSTA_TIMEO);
645 nf->nf_tprintfmsg = TRUE;
646 nf->nf_lastmsg = now;
647 }
648 break;
649
650 case FEEDBACK_OK:
651 nfs_up(nf->nf_mount, nf->nf_td,
652 "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
653 break;
654 }
655 }
656
657 /*
658 * newnfs_request - goes something like this
659 * - does the rpc by calling the krpc layer
660 * - break down rpc header and return with nfs reply
661 * nb: always frees up nd_mreq mbuf list
662 */
663 int
newnfs_request(struct nfsrv_descript * nd,struct nfsmount * nmp,struct nfsclient * clp,struct nfssockreq * nrp,vnode_t vp,struct thread * td,struct ucred * cred,u_int32_t prog,u_int32_t vers,u_char * retsum,int toplevel,u_int64_t * xidp,struct nfsclsession * dssep)664 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
665 struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
666 struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
667 u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
668 {
669 uint32_t retseq, retval, slotseq, *tl;
670 int i = 0, j = 0, opcnt, set_sigset = 0, slot;
671 int error = 0, usegssname = 0, secflavour = AUTH_SYS;
672 int freeslot, maxslot, reterr, slotpos, timeo;
673 u_int16_t procnum;
674 u_int nextconn;
675 struct nfs_feedback_arg nf;
676 struct timeval timo;
677 AUTH *auth;
678 struct rpc_callextra ext;
679 enum clnt_stat stat;
680 struct nfsreq *rep = NULL;
681 char *srv_principal = NULL, *clnt_principal = NULL;
682 sigset_t oldset;
683 struct ucred *authcred;
684 struct nfsclsession *sep;
685 uint8_t sessionid[NFSX_V4SESSIONID];
686 bool nextconn_set;
687 struct timespec trylater_delay, ts, waituntil;
688
689 /* Initially 1msec. */
690 trylater_delay.tv_sec = 0;
691 trylater_delay.tv_nsec = 1000000;
692 sep = dssep;
693 if (xidp != NULL)
694 *xidp = 0;
695 /* Reject requests while attempting a forced unmount. */
696 if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
697 m_freem(nd->nd_mreq);
698 return (ESTALE);
699 }
700
701 /*
702 * Set authcred, which is used to acquire RPC credentials to
703 * the cred argument, by default. The crhold() should not be
704 * necessary, but will ensure that some future code change
705 * doesn't result in the credential being free'd prematurely.
706 */
707 authcred = crhold(cred);
708
709 /* For client side interruptible mounts, mask off the signals. */
710 if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
711 newnfs_set_sigmask(td, &oldset);
712 set_sigset = 1;
713 }
714
715 /*
716 * If not already connected call newnfs_connect now.
717 */
718 if (nrp->nr_client == NULL)
719 newnfs_connect(nmp, nrp, cred, td, 0, false, &nrp->nr_client);
720
721 /*
722 * If the "nconnect" mount option was specified and this RPC is
723 * one that can have a large RPC message and is being done through
724 * the NFS/MDS server, use an additional connection. (When the RPC is
725 * being done through the server/MDS, nrp == &nmp->nm_sockreq.)
726 * The "nconnect" mount option normally has minimal effect when the
727 * "pnfs" mount option is specified, since only Readdir RPCs are
728 * normally done through the NFS/MDS server.
729 */
730 nextconn_set = false;
731 if (nmp != NULL && nmp->nm_aconnect > 0 && nrp == &nmp->nm_sockreq &&
732 (nd->nd_procnum == NFSPROC_READ ||
733 nd->nd_procnum == NFSPROC_READDIR ||
734 nd->nd_procnum == NFSPROC_READDIRPLUS ||
735 nd->nd_procnum == NFSPROC_WRITE)) {
736 nextconn = atomic_fetchadd_int(&nmp->nm_nextaconn, 1);
737 nextconn %= nmp->nm_aconnect;
738 nextconn_set = true;
739 if (nmp->nm_aconn[nextconn] == NULL)
740 newnfs_connect(nmp, nrp, cred, td, 0, false,
741 &nmp->nm_aconn[nextconn]);
742 }
743
744 /*
745 * For a client side mount, nmp is != NULL and clp == NULL. For
746 * server calls (callbacks or upcalls), nmp == NULL.
747 */
748 if (clp != NULL) {
749 NFSLOCKSTATE();
750 if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
751 secflavour = RPCSEC_GSS_KRB5;
752 if (nd->nd_procnum != NFSPROC_NULL) {
753 if (clp->lc_flags & LCL_GSSINTEGRITY)
754 secflavour = RPCSEC_GSS_KRB5I;
755 else if (clp->lc_flags & LCL_GSSPRIVACY)
756 secflavour = RPCSEC_GSS_KRB5P;
757 }
758 }
759 NFSUNLOCKSTATE();
760 } else if (nmp != NULL && NFSHASKERB(nmp) &&
761 nd->nd_procnum != NFSPROC_NULL && (!NFSHASSYSKRB5(nmp) ||
762 nfscl_use_gss[nd->nd_procnum])) {
763 if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
764 nd->nd_flag |= ND_USEGSSNAME;
765 if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
766 /*
767 * If there is a client side host based credential,
768 * use that, otherwise use the system uid, if set.
769 * The system uid is in the nmp->nm_sockreq.nr_cred
770 * credentials.
771 */
772 if (nmp->nm_krbnamelen > 0) {
773 usegssname = 1;
774 clnt_principal = nmp->nm_krbname;
775 } else if (nmp->nm_uid != (uid_t)-1) {
776 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
777 ("newnfs_request: NULL nr_cred"));
778 crfree(authcred);
779 authcred = crhold(nmp->nm_sockreq.nr_cred);
780 }
781 } else if (nmp->nm_krbnamelen == 0 &&
782 nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
783 /*
784 * If there is no host based principal name and
785 * the system uid is set and this is root, use the
786 * system uid, since root won't have user
787 * credentials in a credentials cache file.
788 * The system uid is in the nmp->nm_sockreq.nr_cred
789 * credentials.
790 */
791 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
792 ("newnfs_request: NULL nr_cred"));
793 crfree(authcred);
794 authcred = crhold(nmp->nm_sockreq.nr_cred);
795 }
796 if (NFSHASINTEGRITY(nmp))
797 secflavour = RPCSEC_GSS_KRB5I;
798 else if (NFSHASPRIVACY(nmp))
799 secflavour = RPCSEC_GSS_KRB5P;
800 else
801 secflavour = RPCSEC_GSS_KRB5;
802 srv_principal = NFSMNT_SRVKRBNAME(nmp);
803 } else if (nmp != NULL && (!NFSHASKERB(nmp) || NFSHASSYSKRB5(nmp)) &&
804 nd->nd_procnum != NFSPROC_NULL &&
805 (nd->nd_flag & ND_USEGSSNAME) != 0) {
806 /*
807 * Use the uid that did the mount when the RPC is doing
808 * NFSv4 system operations, as indicated by the
809 * ND_USEGSSNAME flag, for the AUTH_SYS case.
810 * The credentials in nm_sockreq.nr_cred were used for the
811 * mount.
812 */
813 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
814 ("newnfs_request: NULL nr_cred"));
815 crfree(authcred);
816 authcred = crhold(nmp->nm_sockreq.nr_cred);
817 }
818
819 if (nmp != NULL) {
820 bzero(&nf, sizeof(struct nfs_feedback_arg));
821 nf.nf_mount = nmp;
822 nf.nf_td = td;
823 nf.nf_lastmsg = NFSD_MONOSEC -
824 ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
825 }
826
827 if (nd->nd_procnum == NFSPROC_NULL)
828 auth = authnone_create();
829 else if (usegssname) {
830 /*
831 * For this case, the authenticator is held in the
832 * nfssockreq structure, so don't release the reference count
833 * held on it. --> Don't AUTH_DESTROY() it in this function.
834 */
835 if (nrp->nr_auth == NULL)
836 nrp->nr_auth = nfs_getauth(nrp, secflavour,
837 clnt_principal, srv_principal, NULL, authcred);
838 else
839 rpc_gss_refresh_auth_call(nrp->nr_auth);
840 auth = nrp->nr_auth;
841 } else
842 auth = nfs_getauth(nrp, secflavour, NULL,
843 srv_principal, NULL, authcred);
844 crfree(authcred);
845 if (auth == NULL) {
846 m_freem(nd->nd_mreq);
847 if (set_sigset)
848 newnfs_restore_sigmask(td, &oldset);
849 return (EACCES);
850 }
851 bzero(&ext, sizeof(ext));
852 ext.rc_auth = auth;
853 if (nmp != NULL) {
854 ext.rc_feedback = nfs_feedback;
855 ext.rc_feedback_arg = &nf;
856 }
857
858 procnum = nd->nd_procnum;
859 if ((nd->nd_flag & ND_NFSV4) &&
860 nd->nd_procnum != NFSPROC_NULL &&
861 nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
862 procnum = NFSV4PROC_COMPOUND;
863
864 if (nmp != NULL) {
865 NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
866
867 /* Map the procnum to the old NFSv2 one, as required. */
868 if ((nd->nd_flag & ND_NFSV2) != 0) {
869 if (nd->nd_procnum < NFS_V3NPROCS)
870 procnum = nfsv2_procid[nd->nd_procnum];
871 else
872 procnum = NFSV2PROC_NOOP;
873 }
874
875 /*
876 * Now only used for the R_DONTRECOVER case, but until that is
877 * supported within the krpc code, I need to keep a queue of
878 * outstanding RPCs for nfsv4 client requests.
879 */
880 if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
881 rep = malloc(sizeof(struct nfsreq),
882 M_NFSDREQ, M_WAITOK);
883 #ifdef KDTRACE_HOOKS
884 if (dtrace_nfscl_nfs234_start_probe != NULL) {
885 uint32_t probe_id;
886 int probe_procnum;
887
888 if (nd->nd_flag & ND_NFSV4) {
889 probe_id =
890 nfscl_nfs4_start_probes[nd->nd_procnum];
891 probe_procnum = nd->nd_procnum;
892 } else if (nd->nd_flag & ND_NFSV3) {
893 probe_id = nfscl_nfs3_start_probes[procnum];
894 probe_procnum = procnum;
895 } else {
896 probe_id =
897 nfscl_nfs2_start_probes[nd->nd_procnum];
898 probe_procnum = procnum;
899 }
900 if (probe_id != 0)
901 (dtrace_nfscl_nfs234_start_probe)
902 (probe_id, vp, nd->nd_mreq, cred,
903 probe_procnum);
904 }
905 #endif
906 }
907 freeslot = -1; /* Set to slot that needs to be free'd */
908 tryagain:
909 slot = -1; /* Slot that needs a sequence# increment. */
910 /*
911 * This timeout specifies when a new socket should be created,
912 * along with new xid values. For UDP, this should be done
913 * infrequently, since retransmits of RPC requests should normally
914 * use the same xid.
915 */
916 if (nmp == NULL) {
917 if (clp == NULL) {
918 timo.tv_sec = NFSV4_UPCALLTIMEO;
919 timo.tv_usec = 0;
920 } else {
921 timo.tv_sec = NFSV4_CALLBACKTIMEO / 1000;
922 timo.tv_usec = NFSV4_CALLBACKTIMEO * 1000;
923 }
924 } else {
925 if (nrp->nr_sotype != SOCK_DGRAM) {
926 timo.tv_usec = 0;
927 if ((nmp->nm_flag & NFSMNT_NFSV4))
928 timo.tv_sec = INT_MAX;
929 else
930 timo.tv_sec = NFS_TCPTIMEO;
931 } else {
932 if (NFSHASSOFT(nmp)) {
933 /*
934 * CLSET_RETRIES is set to 2, so this should be
935 * half of the total timeout required.
936 */
937 timeo = nmp->nm_retry * nmp->nm_timeo / 2;
938 if (timeo < 1)
939 timeo = 1;
940 timo.tv_sec = timeo / NFS_HZ;
941 timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
942 NFS_HZ;
943 } else {
944 /* For UDP hard mounts, use a large value. */
945 timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
946 timo.tv_usec = 0;
947 }
948 }
949
950 if (rep != NULL) {
951 rep->r_flags = 0;
952 rep->r_nmp = nmp;
953 /*
954 * Chain request into list of outstanding requests.
955 */
956 NFSLOCKREQ();
957 TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
958 NFSUNLOCKREQ();
959 }
960 }
961
962 nd->nd_mrep = NULL;
963 if (clp != NULL && sep != NULL)
964 stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
965 nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
966 else if (nextconn_set)
967 /*
968 * When there are multiple TCP connections, send the
969 * RPCs with large messages on the alternate TCP
970 * connection(s) in a round robin fashion.
971 * The small RPC messages are sent on the default
972 * TCP connection because they do not require much
973 * network bandwidth and separating them from the
974 * large RPC messages avoids them getting "log jammed"
975 * behind several large RPC messages.
976 */
977 stat = CLNT_CALL_MBUF(nmp->nm_aconn[nextconn],
978 &ext, procnum, nd->nd_mreq, &nd->nd_mrep, timo);
979 else
980 stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
981 nd->nd_mreq, &nd->nd_mrep, timo);
982 NFSCL_DEBUG(2, "clnt call=%d\n", stat);
983
984 if (rep != NULL) {
985 /*
986 * RPC done, unlink the request.
987 */
988 NFSLOCKREQ();
989 TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
990 NFSUNLOCKREQ();
991 }
992
993 /*
994 * If there was a successful reply and a tprintf msg.
995 * tprintf a response.
996 */
997 if (stat == RPC_SUCCESS) {
998 error = 0;
999 } else if (stat == RPC_TIMEDOUT) {
1000 NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
1001 error = ETIMEDOUT;
1002 } else if (stat == RPC_VERSMISMATCH) {
1003 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1004 error = EOPNOTSUPP;
1005 } else if (stat == RPC_PROGVERSMISMATCH) {
1006 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1007 error = EPROTONOSUPPORT;
1008 } else if (stat == RPC_CANTSEND || stat == RPC_CANTRECV ||
1009 stat == RPC_SYSTEMERROR || stat == RPC_INTR) {
1010 /* Check for a session slot that needs to be free'd. */
1011 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1012 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1013 nd->nd_procnum != NFSPROC_NULL) {
1014 /*
1015 * This should only occur when either the MDS or
1016 * a client has an RPC against a DS fail.
1017 * This happens because these cases use "soft"
1018 * connections that can time out and fail.
1019 * The slot used for this RPC is now in a
1020 * non-deterministic state, but if the slot isn't
1021 * free'd, threads can get stuck waiting for a slot.
1022 */
1023 if (sep == NULL)
1024 sep = nfsmnt_mdssession(nmp);
1025 /*
1026 * Bump the sequence# out of range, so that reuse of
1027 * this slot will result in an NFSERR_SEQMISORDERED
1028 * error and not a bogus cached RPC reply.
1029 */
1030 mtx_lock(&sep->nfsess_mtx);
1031 sep->nfsess_slotseq[nd->nd_slotid] += 10;
1032 sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid);
1033 mtx_unlock(&sep->nfsess_mtx);
1034 /* And free the slot. */
1035 nfsv4_freeslot(sep, nd->nd_slotid, false);
1036 }
1037 if (stat == RPC_INTR)
1038 error = EINTR;
1039 else {
1040 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1041 error = ENXIO;
1042 }
1043 } else if (stat == RPC_AUTHERROR) {
1044 /* Check for a session slot that needs to be free'd. */
1045 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1046 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1047 nd->nd_procnum != NFSPROC_NULL) {
1048 /*
1049 * This can occur when a Kerberos/RPCSEC_GSS session
1050 * expires, due to TGT expiration.
1051 * Free the slot, resetting the slot's sequence#.
1052 */
1053 if (sep == NULL)
1054 sep = nfsmnt_mdssession(nmp);
1055 nfsv4_freeslot(sep, nd->nd_slotid, true);
1056 }
1057 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1058 error = EACCES;
1059 } else {
1060 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1061 error = EACCES;
1062 }
1063 if (error) {
1064 m_freem(nd->nd_mreq);
1065 if (usegssname == 0)
1066 AUTH_DESTROY(auth);
1067 if (rep != NULL)
1068 free(rep, M_NFSDREQ);
1069 if (set_sigset)
1070 newnfs_restore_sigmask(td, &oldset);
1071 return (error);
1072 }
1073
1074 KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
1075
1076 /*
1077 * Search for any mbufs that are not a multiple of 4 bytes long
1078 * or with m_data not longword aligned.
1079 * These could cause pointer alignment problems, so copy them to
1080 * well aligned mbufs.
1081 */
1082 newnfs_realign(&nd->nd_mrep, M_WAITOK);
1083 nd->nd_md = nd->nd_mrep;
1084 nd->nd_dpos = mtod(nd->nd_md, caddr_t);
1085 nd->nd_repstat = 0;
1086 if (nd->nd_procnum != NFSPROC_NULL &&
1087 nd->nd_procnum != NFSV4PROC_CBNULL) {
1088 /* If sep == NULL, set it to the default in nmp. */
1089 if (sep == NULL && nmp != NULL)
1090 sep = nfsmnt_mdssession(nmp);
1091 /*
1092 * and now the actual NFS xdr.
1093 */
1094 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1095 nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
1096 if (nd->nd_repstat >= 10000)
1097 NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
1098 (int)nd->nd_repstat);
1099
1100 /*
1101 * Get rid of the tag, return count and SEQUENCE result for
1102 * NFSv4.
1103 */
1104 if ((nd->nd_flag & ND_NFSV4) != 0 && nd->nd_repstat !=
1105 NFSERR_MINORVERMISMATCH) {
1106 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1107 i = fxdr_unsigned(int, *tl);
1108 error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
1109 if (error)
1110 goto nfsmout;
1111 NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1112 opcnt = fxdr_unsigned(int, *tl++);
1113 i = fxdr_unsigned(int, *tl++);
1114 j = fxdr_unsigned(int, *tl);
1115 if (j >= 10000)
1116 NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
1117 /*
1118 * If the first op is Sequence, free up the slot.
1119 */
1120 if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
1121 (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0)) {
1122 NFSCL_DEBUG(1, "failed seq=%d\n", j);
1123 if (sep != NULL && i == NFSV4OP_SEQUENCE &&
1124 j == NFSERR_SEQMISORDERED) {
1125 mtx_lock(&sep->nfsess_mtx);
1126 sep->nfsess_badslots |=
1127 (0x1ULL << nd->nd_slotid);
1128 mtx_unlock(&sep->nfsess_mtx);
1129 }
1130 }
1131 if (((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
1132 (clp != NULL && i == NFSV4OP_CBSEQUENCE &&
1133 j == 0)) && sep != NULL) {
1134 if (i == NFSV4OP_SEQUENCE)
1135 NFSM_DISSECT(tl, uint32_t *,
1136 NFSX_V4SESSIONID +
1137 5 * NFSX_UNSIGNED);
1138 else
1139 NFSM_DISSECT(tl, uint32_t *,
1140 NFSX_V4SESSIONID +
1141 4 * NFSX_UNSIGNED);
1142 mtx_lock(&sep->nfsess_mtx);
1143 if (bcmp(tl, sep->nfsess_sessionid,
1144 NFSX_V4SESSIONID) == 0) {
1145 tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
1146 retseq = fxdr_unsigned(uint32_t, *tl++);
1147 slot = fxdr_unsigned(int, *tl++);
1148 if ((nd->nd_flag & ND_HASSLOTID) != 0) {
1149 if (slot >= NFSV4_SLOTS ||
1150 (i == NFSV4OP_CBSEQUENCE &&
1151 slot >= NFSV4_CBSLOTS)) {
1152 printf("newnfs_request:"
1153 " Bogus slot\n");
1154 slot = nd->nd_slotid;
1155 } else if (slot !=
1156 nd->nd_slotid) {
1157 printf("newnfs_request:"
1158 " Wrong session "
1159 "srvslot=%d "
1160 "slot=%d\n", slot,
1161 nd->nd_slotid);
1162 if (i == NFSV4OP_SEQUENCE) {
1163 /*
1164 * Mark both slots as
1165 * bad, because we do
1166 * not know if the
1167 * server has advanced
1168 * the sequence# for
1169 * either of them.
1170 */
1171 sep->nfsess_badslots |=
1172 (0x1ULL << slot);
1173 sep->nfsess_badslots |=
1174 (0x1ULL <<
1175 nd->nd_slotid);
1176 }
1177 slot = nd->nd_slotid;
1178 }
1179 freeslot = slot;
1180 } else if (slot != 0) {
1181 printf("newnfs_request: Bad "
1182 "session slot=%d\n", slot);
1183 slot = 0;
1184 }
1185 if (retseq != sep->nfsess_slotseq[slot])
1186 printf("retseq diff 0x%x\n",
1187 retseq);
1188 retval = fxdr_unsigned(uint32_t, *++tl);
1189 if ((retval + 1) < sep->nfsess_foreslots
1190 )
1191 sep->nfsess_foreslots = (retval
1192 + 1);
1193 else if ((retval + 1) >
1194 sep->nfsess_foreslots)
1195 sep->nfsess_foreslots = (retval
1196 < 64) ? (retval + 1) : 64;
1197 }
1198 mtx_unlock(&sep->nfsess_mtx);
1199
1200 /* Grab the op and status for the next one. */
1201 if (opcnt > 1) {
1202 NFSM_DISSECT(tl, uint32_t *,
1203 2 * NFSX_UNSIGNED);
1204 i = fxdr_unsigned(int, *tl++);
1205 j = fxdr_unsigned(int, *tl);
1206 }
1207 }
1208 }
1209 if (nd->nd_repstat != 0) {
1210 if (nd->nd_repstat == NFSERR_BADSESSION &&
1211 nmp != NULL && dssep == NULL &&
1212 (nd->nd_flag & ND_NFSV41) != 0) {
1213 /*
1214 * If this is a client side MDS RPC, mark
1215 * the MDS session defunct and initiate
1216 * recovery, as required.
1217 * The nfsess_defunct field is protected by
1218 * the NFSLOCKMNT()/nm_mtx lock and not the
1219 * nfsess_mtx lock to simplify its handling,
1220 * for the MDS session. This lock is also
1221 * sufficient for nfsess_sessionid, since it
1222 * never changes in the structure.
1223 */
1224 NFSCL_DEBUG(1, "Got badsession\n");
1225 NFSLOCKCLSTATE();
1226 NFSLOCKMNT(nmp);
1227 if (TAILQ_EMPTY(&nmp->nm_sess)) {
1228 NFSUNLOCKMNT(nmp);
1229 NFSUNLOCKCLSTATE();
1230 printf("If server has not rebooted, "
1231 "check NFS clients for unique "
1232 "/etc/hostid's\n");
1233 goto out;
1234 }
1235 sep = NFSMNT_MDSSESSION(nmp);
1236 if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
1237 NFSX_V4SESSIONID) == 0) {
1238 printf("Initiate recovery. If server "
1239 "has not rebooted, "
1240 "check NFS clients for unique "
1241 "/etc/hostid's\n");
1242 /* Initiate recovery. */
1243 sep->nfsess_defunct = 1;
1244 NFSCL_DEBUG(1, "Marked defunct\n");
1245 if (nmp->nm_clp != NULL) {
1246 nmp->nm_clp->nfsc_flags |=
1247 NFSCLFLAGS_RECOVER;
1248 wakeup(nmp->nm_clp);
1249 }
1250 }
1251 NFSUNLOCKCLSTATE();
1252 /*
1253 * Sleep for up to 1sec waiting for a new
1254 * session.
1255 */
1256 mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
1257 "nfsbadsess", hz);
1258 /*
1259 * Get the session again, in case a new one
1260 * has been created during the sleep.
1261 */
1262 sep = NFSMNT_MDSSESSION(nmp);
1263 NFSUNLOCKMNT(nmp);
1264 if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
1265 reterr = nfsv4_sequencelookup(nmp, sep,
1266 &slotpos, &maxslot, &slotseq,
1267 sessionid, true);
1268 if (reterr == 0) {
1269 /* Fill in new session info. */
1270 NFSCL_DEBUG(1,
1271 "Filling in new sequence\n");
1272 tl = nd->nd_sequence;
1273 bcopy(sessionid, tl,
1274 NFSX_V4SESSIONID);
1275 tl += NFSX_V4SESSIONID /
1276 NFSX_UNSIGNED;
1277 *tl++ = txdr_unsigned(slotseq);
1278 *tl++ = txdr_unsigned(slotpos);
1279 *tl = txdr_unsigned(maxslot);
1280 nd->nd_slotid = slotpos;
1281 nd->nd_flag |= ND_HASSLOTID;
1282 }
1283 if (reterr == NFSERR_BADSESSION ||
1284 reterr == 0) {
1285 NFSCL_DEBUG(1,
1286 "Badsession looping\n");
1287 m_freem(nd->nd_mrep);
1288 nd->nd_mrep = NULL;
1289 goto tryagain;
1290 }
1291 nd->nd_repstat = reterr;
1292 NFSCL_DEBUG(1, "Got err=%d\n", reterr);
1293 }
1294 }
1295 /*
1296 * When clp != NULL, it is a callback and all
1297 * callback operations can be retried for NFSERR_DELAY.
1298 */
1299 if (((nd->nd_repstat == NFSERR_DELAY ||
1300 nd->nd_repstat == NFSERR_GRACE) &&
1301 (nd->nd_flag & ND_NFSV4) && (clp != NULL ||
1302 (nd->nd_procnum != NFSPROC_DELEGRETURN &&
1303 nd->nd_procnum != NFSPROC_SETATTR &&
1304 nd->nd_procnum != NFSPROC_READ &&
1305 nd->nd_procnum != NFSPROC_READDS &&
1306 nd->nd_procnum != NFSPROC_WRITE &&
1307 nd->nd_procnum != NFSPROC_WRITEDS &&
1308 nd->nd_procnum != NFSPROC_OPEN &&
1309 nd->nd_procnum != NFSPROC_OPENLAYGET &&
1310 nd->nd_procnum != NFSPROC_CREATE &&
1311 nd->nd_procnum != NFSPROC_CREATELAYGET &&
1312 nd->nd_procnum != NFSPROC_OPENCONFIRM &&
1313 nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
1314 nd->nd_procnum != NFSPROC_CLOSE &&
1315 nd->nd_procnum != NFSPROC_LOCK &&
1316 nd->nd_procnum != NFSPROC_LOCKU))) ||
1317 (nd->nd_repstat == NFSERR_DELAY &&
1318 (nd->nd_flag & ND_NFSV4) == 0) ||
1319 nd->nd_repstat == NFSERR_RESOURCE ||
1320 nd->nd_repstat == NFSERR_RETRYUNCACHEDREP) {
1321 /* Clip at NFS_TRYLATERDEL. */
1322 if (timespeccmp(&trylater_delay,
1323 &nfs_trylater_max, >))
1324 trylater_delay = nfs_trylater_max;
1325 getnanouptime(&waituntil);
1326 timespecadd(&waituntil, &trylater_delay,
1327 &waituntil);
1328 do {
1329 nfs_catnap(PZERO, 0, "nfstry");
1330 getnanouptime(&ts);
1331 } while (timespeccmp(&ts, &waituntil, <));
1332 timespecadd(&trylater_delay, &trylater_delay,
1333 &trylater_delay); /* Double each time. */
1334 if (slot != -1) {
1335 mtx_lock(&sep->nfsess_mtx);
1336 sep->nfsess_slotseq[slot]++;
1337 *nd->nd_slotseq = txdr_unsigned(
1338 sep->nfsess_slotseq[slot]);
1339 mtx_unlock(&sep->nfsess_mtx);
1340 }
1341 m_freem(nd->nd_mrep);
1342 nd->nd_mrep = NULL;
1343 goto tryagain;
1344 }
1345
1346 /*
1347 * If the File Handle was stale, invalidate the
1348 * lookup cache, just in case.
1349 * (vp != NULL implies a client side call)
1350 */
1351 if (nd->nd_repstat == ESTALE && vp != NULL) {
1352 cache_purge(vp);
1353 if (ncl_call_invalcaches != NULL)
1354 (*ncl_call_invalcaches)(vp);
1355 }
1356 }
1357 if ((nd->nd_flag & ND_NFSV4) != 0) {
1358 /* Free the slot, as required. */
1359 if (freeslot != -1)
1360 nfsv4_freeslot(sep, freeslot, false);
1361 /*
1362 * If this op is Putfh, throw its results away.
1363 */
1364 if (j >= 10000)
1365 NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
1366 if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
1367 NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
1368 i = fxdr_unsigned(int, *tl++);
1369 j = fxdr_unsigned(int, *tl);
1370 if (j >= 10000)
1371 NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
1372 j);
1373 /*
1374 * All Compounds that do an Op that must
1375 * be in sequence consist of NFSV4OP_PUTFH
1376 * followed by one of these. As such, we
1377 * can determine if the seqid# should be
1378 * incremented, here.
1379 */
1380 if ((i == NFSV4OP_OPEN ||
1381 i == NFSV4OP_OPENCONFIRM ||
1382 i == NFSV4OP_OPENDOWNGRADE ||
1383 i == NFSV4OP_CLOSE ||
1384 i == NFSV4OP_LOCK ||
1385 i == NFSV4OP_LOCKU) &&
1386 (j == 0 ||
1387 (j != NFSERR_STALECLIENTID &&
1388 j != NFSERR_STALESTATEID &&
1389 j != NFSERR_BADSTATEID &&
1390 j != NFSERR_BADSEQID &&
1391 j != NFSERR_BADXDR &&
1392 j != NFSERR_RESOURCE &&
1393 j != NFSERR_NOFILEHANDLE)))
1394 nd->nd_flag |= ND_INCRSEQID;
1395 }
1396 /*
1397 * If this op's status is non-zero, mark
1398 * that there is no more data to process.
1399 * The exception is Setattr, which always has xdr
1400 * when it has failed.
1401 */
1402 if (j != 0 && i != NFSV4OP_SETATTR)
1403 nd->nd_flag |= ND_NOMOREDATA;
1404
1405 /*
1406 * If R_DONTRECOVER is set, replace the stale error
1407 * reply, so that recovery isn't initiated.
1408 */
1409 if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
1410 nd->nd_repstat == NFSERR_BADSESSION ||
1411 nd->nd_repstat == NFSERR_STALESTATEID) &&
1412 rep != NULL && (rep->r_flags & R_DONTRECOVER))
1413 nd->nd_repstat = NFSERR_STALEDONTRECOVER;
1414 }
1415 }
1416 out:
1417
1418 #ifdef KDTRACE_HOOKS
1419 if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
1420 uint32_t probe_id;
1421 int probe_procnum;
1422
1423 if (nd->nd_flag & ND_NFSV4) {
1424 probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
1425 probe_procnum = nd->nd_procnum;
1426 } else if (nd->nd_flag & ND_NFSV3) {
1427 probe_id = nfscl_nfs3_done_probes[procnum];
1428 probe_procnum = procnum;
1429 } else {
1430 probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
1431 probe_procnum = procnum;
1432 }
1433 if (probe_id != 0)
1434 (dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
1435 nd->nd_mreq, cred, probe_procnum, 0);
1436 }
1437 #endif
1438
1439 m_freem(nd->nd_mreq);
1440 if (usegssname == 0)
1441 AUTH_DESTROY(auth);
1442 if (rep != NULL)
1443 free(rep, M_NFSDREQ);
1444 if (set_sigset)
1445 newnfs_restore_sigmask(td, &oldset);
1446 return (0);
1447 nfsmout:
1448 m_freem(nd->nd_mrep);
1449 m_freem(nd->nd_mreq);
1450 if (usegssname == 0)
1451 AUTH_DESTROY(auth);
1452 if (rep != NULL)
1453 free(rep, M_NFSDREQ);
1454 if (set_sigset)
1455 newnfs_restore_sigmask(td, &oldset);
1456 return (error);
1457 }
1458
1459 /*
1460 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1461 * wait for all requests to complete. This is used by forced unmounts
1462 * to terminate any outstanding RPCs.
1463 */
1464 int
newnfs_nmcancelreqs(struct nfsmount * nmp)1465 newnfs_nmcancelreqs(struct nfsmount *nmp)
1466 {
1467 struct nfsclds *dsp;
1468 struct __rpc_client *cl;
1469 int i;
1470
1471 if (nmp->nm_sockreq.nr_client != NULL)
1472 CLNT_CLOSE(nmp->nm_sockreq.nr_client);
1473 for (i = 0; i < nmp->nm_aconnect; i++)
1474 if (nmp->nm_aconn[i] != NULL)
1475 CLNT_CLOSE(nmp->nm_aconn[i]);
1476 lookformore:
1477 NFSLOCKMNT(nmp);
1478 TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
1479 NFSLOCKDS(dsp);
1480 if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
1481 (dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
1482 dsp->nfsclds_sockp != NULL &&
1483 dsp->nfsclds_sockp->nr_client != NULL) {
1484 dsp->nfsclds_flags |= NFSCLDS_CLOSED;
1485 cl = dsp->nfsclds_sockp->nr_client;
1486 NFSUNLOCKDS(dsp);
1487 NFSUNLOCKMNT(nmp);
1488 CLNT_CLOSE(cl);
1489 goto lookformore;
1490 }
1491 NFSUNLOCKDS(dsp);
1492 }
1493 NFSUNLOCKMNT(nmp);
1494 return (0);
1495 }
1496
/*
 * The set of signals that are allowed to interrupt an NFS operation on
 * an intr mount.  Any signal that should be able to do so must be added
 * to this table.
 *
 * The table is consulted by nfs_sig_pending() to decide whether an
 * interrupting signal is pending and by newnfs_set_sigmask() to decide
 * which signals are left unblocked during an RPC.
 *
 * SIGSTOP and SIGKILL cannot be masked; SIGKILL is nevertheless listed
 * here so that a pending SIGKILL is recognized by nfs_sig_pending().
 */
int newnfs_sig_set[] = {
	SIGINT,
	SIGTERM,
	SIGHUP,
	SIGKILL,
	SIGQUIT
};
1508
1509 /*
1510 * Check to see if one of the signals in our subset is pending on
1511 * the process (in an intr mount).
1512 */
1513 static int
nfs_sig_pending(sigset_t set)1514 nfs_sig_pending(sigset_t set)
1515 {
1516 int i;
1517
1518 for (i = 0 ; i < nitems(newnfs_sig_set); i++)
1519 if (SIGISMEMBER(set, newnfs_sig_set[i]))
1520 return (1);
1521 return (0);
1522 }
1523
1524 /*
1525 * The set/restore sigmask functions are used to (temporarily) overwrite
1526 * the thread td_sigmask during an RPC call (for example). These are also
1527 * used in other places in the NFS client that might tsleep().
1528 */
1529 void
newnfs_set_sigmask(struct thread * td,sigset_t * oldset)1530 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
1531 {
1532 sigset_t newset;
1533 int i;
1534 struct proc *p;
1535
1536 SIGFILLSET(newset);
1537 if (td == NULL)
1538 td = curthread; /* XXX */
1539 p = td->td_proc;
1540 /* Remove the NFS set of signals from newset */
1541 PROC_LOCK(p);
1542 mtx_lock(&p->p_sigacts->ps_mtx);
1543 for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
1544 /*
1545 * But make sure we leave the ones already masked
1546 * by the process, ie. remove the signal from the
1547 * temporary signalmask only if it wasn't already
1548 * in p_sigmask.
1549 */
1550 if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
1551 !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
1552 SIGDELSET(newset, newnfs_sig_set[i]);
1553 }
1554 mtx_unlock(&p->p_sigacts->ps_mtx);
1555 kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
1556 SIGPROCMASK_PROC_LOCKED);
1557 PROC_UNLOCK(p);
1558 }
1559
1560 void
newnfs_restore_sigmask(struct thread * td,sigset_t * set)1561 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
1562 {
1563 if (td == NULL)
1564 td = curthread; /* XXX */
1565 kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1566 }
1567
1568 /*
1569 * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1570 * old one after msleep() returns.
1571 */
1572 int
newnfs_msleep(struct thread * td,void * ident,struct mtx * mtx,int priority,char * wmesg,int timo)1573 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1574 {
1575 sigset_t oldset;
1576 int error;
1577
1578 if ((priority & PCATCH) == 0)
1579 return msleep(ident, mtx, priority, wmesg, timo);
1580 if (td == NULL)
1581 td = curthread; /* XXX */
1582 newnfs_set_sigmask(td, &oldset);
1583 error = msleep(ident, mtx, priority, wmesg, timo);
1584 newnfs_restore_sigmask(td, &oldset);
1585 return (error);
1586 }
1587
1588 /*
1589 * Test for a termination condition pending on the process.
1590 * This is used for NFSMNT_INT mounts.
1591 */
1592 int
newnfs_sigintr(struct nfsmount * nmp,struct thread * td)1593 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
1594 {
1595 struct proc *p;
1596 sigset_t tmpset;
1597
1598 /* Terminate all requests while attempting a forced unmount. */
1599 if (NFSCL_FORCEDISM(nmp->nm_mountp))
1600 return (EIO);
1601 if (!(nmp->nm_flag & NFSMNT_INT))
1602 return (0);
1603 if (td == NULL)
1604 return (0);
1605 p = td->td_proc;
1606 PROC_LOCK(p);
1607 tmpset = p->p_siglist;
1608 SIGSETOR(tmpset, td->td_siglist);
1609 SIGSETNAND(tmpset, td->td_sigmask);
1610 mtx_lock(&p->p_sigacts->ps_mtx);
1611 SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1612 mtx_unlock(&p->p_sigacts->ps_mtx);
1613 if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1614 && nfs_sig_pending(tmpset)) {
1615 PROC_UNLOCK(p);
1616 return (EINTR);
1617 }
1618 PROC_UNLOCK(p);
1619 return (0);
1620 }
1621
1622 static int
nfs_msg(struct thread * td,const char * server,const char * msg,int error)1623 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1624 {
1625 struct proc *p;
1626
1627 p = td ? td->td_proc : NULL;
1628 if (error) {
1629 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
1630 server, msg, error);
1631 } else {
1632 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1633 }
1634 return (0);
1635 }
1636
1637 static void
nfs_down(struct nfsmount * nmp,struct thread * td,const char * msg,int error,int flags)1638 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
1639 int error, int flags)
1640 {
1641 if (nmp == NULL)
1642 return;
1643 mtx_lock(&nmp->nm_mtx);
1644 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1645 nmp->nm_state |= NFSSTA_TIMEO;
1646 mtx_unlock(&nmp->nm_mtx);
1647 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1648 VQ_NOTRESP, 0);
1649 } else
1650 mtx_unlock(&nmp->nm_mtx);
1651 mtx_lock(&nmp->nm_mtx);
1652 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1653 nmp->nm_state |= NFSSTA_LOCKTIMEO;
1654 mtx_unlock(&nmp->nm_mtx);
1655 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1656 VQ_NOTRESPLOCK, 0);
1657 } else
1658 mtx_unlock(&nmp->nm_mtx);
1659 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1660 }
1661
1662 static void
nfs_up(struct nfsmount * nmp,struct thread * td,const char * msg,int flags,int tprintfmsg)1663 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
1664 int flags, int tprintfmsg)
1665 {
1666 if (nmp == NULL)
1667 return;
1668 if (tprintfmsg) {
1669 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1670 }
1671
1672 mtx_lock(&nmp->nm_mtx);
1673 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1674 nmp->nm_state &= ~NFSSTA_TIMEO;
1675 mtx_unlock(&nmp->nm_mtx);
1676 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1677 VQ_NOTRESP, 1);
1678 } else
1679 mtx_unlock(&nmp->nm_mtx);
1680
1681 mtx_lock(&nmp->nm_mtx);
1682 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1683 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1684 mtx_unlock(&nmp->nm_mtx);
1685 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1686 VQ_NOTRESPLOCK, 1);
1687 } else
1688 mtx_unlock(&nmp->nm_mtx);
1689 }
1690