1 /*	$OpenBSD: rf_driver.c,v 1.11 2002/12/16 07:01:03 tdeval Exp $	*/
2 /*	$NetBSD: rf_driver.c,v 1.37 2000/06/04 02:05:13 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1999 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Greg Oster
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1995 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Author:	Mark Holland, Khalil Amiri, Claudson Bornstein,
45  *		William V. Courtright II, Robby Findler, Daniel Stodolsky,
46  *		Rachad Youssef, Jim Zelenka
47  *
48  * Permission to use, copy, modify and distribute this software and
49  * its documentation is hereby granted, provided that both the copyright
50  * notice and this permission notice appear in all copies of the
51  * software, derivative works or modified versions, and any portions
52  * thereof, and that both notices appear in supporting documentation.
53  *
54  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
55  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
56  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
57  *
58  * Carnegie Mellon requests users of this software to return to
59  *
60  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
61  *  School of Computer Science
62  *  Carnegie Mellon University
63  *  Pittsburgh PA 15213-3890
64  *
65  * any improvements or extensions that they make and grant Carnegie the
66  * rights to redistribute these changes.
67  */
68 
69 /*****************************************************************************
70  *
71  * rf_driver.c -- Main setup, teardown, and access routines for the RAID
72  *		  driver
73  *
 * All routines are prefixed with rf_ (RAIDframe), to avoid conflicts.
75  *
76  *****************************************************************************/
77 
78 #include <sys/types.h>
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #ifdef	__NetBSD__
84 #include <sys/vnode.h>
85 #endif
86 
87 
88 #include "rf_archs.h"
89 #include "rf_threadstuff.h"
90 
91 
92 #include <sys/errno.h>
93 
94 #include "rf_raid.h"
95 #include "rf_dag.h"
96 #include "rf_aselect.h"
97 #include "rf_diskqueue.h"
98 #include "rf_parityscan.h"
99 #include "rf_alloclist.h"
100 #include "rf_dagutils.h"
101 #include "rf_utils.h"
102 #include "rf_etimer.h"
103 #include "rf_acctrace.h"
104 #include "rf_configure.h"
105 #include "rf_general.h"
106 #include "rf_desc.h"
107 #include "rf_states.h"
108 #include "rf_freelist.h"
109 #include "rf_decluster.h"
110 #include "rf_map.h"
111 #include "rf_revent.h"
112 #include "rf_callback.h"
113 #include "rf_engine.h"
114 #include "rf_memchunk.h"
115 #include "rf_mcpair.h"
116 #include "rf_nwayxor.h"
117 #include "rf_debugprint.h"
118 #include "rf_copyback.h"
119 #include "rf_driver.h"
120 #include "rf_options.h"
121 #include "rf_shutdown.h"
122 #include "rf_kintf.h"
123 
124 #include <sys/buf.h>
125 
126 /* rad == RF_RaidAccessDesc_t */
127 static RF_FreeList_t *rf_rad_freelist;
128 #define	RF_MAX_FREE_RAD		128
129 #define	RF_RAD_INC		 16
130 #define	RF_RAD_INITIAL		 32
131 
132 /* Debug variables. */
133 char	rf_panicbuf[2048];	/*
134 				 * A buffer to hold an error msg when we panic.
135 				 */
136 
137 /* Main configuration routines. */
138 static int raidframe_booted = 0;
139 
140 void rf_ConfigureDebug(RF_Config_t *);
141 void rf_set_debug_option(char *, long);
142 void rf_UnconfigureArray(void);
143 int  rf_init_rad(RF_RaidAccessDesc_t *);
144 void rf_clean_rad(RF_RaidAccessDesc_t *);
145 void rf_ShutdownRDFreeList(void *);
146 int  rf_ConfigureRDFreeList(RF_ShutdownList_t **);
147 
148 RF_DECLARE_MUTEX(rf_printf_mutex);	/*
149 					 * Debug only: Avoids interleaved
150 					 * printfs by different stripes.
151 					 */
152 
/*
 * Quiescence hand-shake. The address of the array's accesses_suspended
 * field is used as the sleep/wakeup channel: WAIT_FOR_QUIESCENCE sleeps
 * on it and SIGNAL_QUIESCENT_COND wakes the sleeper up.
 *
 * Neither macro ends in a semicolon, so "WAIT_FOR_QUIESCENCE(r);" expands
 * to exactly one statement and is safe as the body of an unbraced
 * if/else.
 */
#define	SIGNAL_QUIESCENT_COND(_raid_)	wakeup(&((_raid_)->accesses_suspended))
#define	WAIT_FOR_QUIESCENCE(_raid_)					\
	tsleep(&((_raid_)->accesses_suspended), PRIBIO, "RAIDframe quiesce", 0)

/*
 * Fail the buffer `bp' with error code `err' and complete it with
 * biodone(): B_ERROR is set, the residual is set to the full byte count
 * and b_error records the code.
 *
 * Both arguments are parenthesized so that arbitrary expressions may be
 * passed. Note that `bp' is evaluated several times, so it must not have
 * side effects.
 */
#define	IO_BUF_ERR(bp, err)						\
do {									\
	(bp)->b_flags |= B_ERROR;					\
	(bp)->b_resid = (bp)->b_bcount;					\
	(bp)->b_error = (err);						\
	biodone(bp);							\
} while (0)
164 
165 static int configureCount = 0;	/* Number of active configurations. */
166 static int isconfigged = 0;	/*
167 				 * Is basic RAIDframe (non per-array)
168 				 * stuff configured ?
169 				 */
170 RF_DECLARE_STATIC_MUTEX(configureMutex);	/*
171 						 * Used to lock the
172 						 * configuration stuff.
173 						 */
174 static RF_ShutdownList_t *globalShutdown;	/* Non array-specific stuff. */
175 int  rf_ConfigureRDFreeList(RF_ShutdownList_t **);
176 
177 
178 /* Called at system boot time. */
179 int
rf_BootRaidframe(void)180 rf_BootRaidframe(void)
181 {
182 	int rc;
183 
184 	if (raidframe_booted)
185 		return (EBUSY);
186 	raidframe_booted = 1;
187 
188 	rc = rf_mutex_init(&configureMutex);
189 	if (rc) {
190 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n",
191 		    __FILE__, __LINE__, rc);
192 		RF_PANIC();
193 	}
194 	configureCount = 0;
195 	isconfigged = 0;
196 	globalShutdown = NULL;
197 	return (0);
198 }
199 
200 
201 /*
202  * This function is really just for debugging user-level stuff: It
203  * frees up all memory, other RAIDframe resources that might otherwise
204  * be kept around. This is used with systems like "sentinel" to detect
205  * memory leaks.
206  */
207 int
rf_UnbootRaidframe(void)208 rf_UnbootRaidframe(void)
209 {
210 	int rc;
211 
212 	RF_LOCK_MUTEX(configureMutex);
213 	if (configureCount) {
214 		RF_UNLOCK_MUTEX(configureMutex);
215 		return (EBUSY);
216 	}
217 	raidframe_booted = 0;
218 	RF_UNLOCK_MUTEX(configureMutex);
219 	rc = rf_mutex_destroy(&configureMutex);
220 	if (rc) {
221 		RF_ERRORMSG3("Unable to destroy mutex file %s line %d"
222 		    " rc=%d.\n", __FILE__, __LINE__, rc);
223 		RF_PANIC();
224 	}
225 	return (0);
226 }
227 
228 
229 /*
230  * Called whenever an array is shutdown.
231  */
232 void
rf_UnconfigureArray(void)233 rf_UnconfigureArray(void)
234 {
235 	int rc;
236 
237 	RF_LOCK_MUTEX(configureMutex);
238 	if (--configureCount == 0) {	/*
239 					 * If no active configurations, shut
240 					 * everything down.
241 					 */
242 		isconfigged = 0;
243 
244 		rc = rf_ShutdownList(&globalShutdown);
245 		if (rc) {
246 			RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown,"
247 			    " rc=%d.\n", rc);
248 		}
249 
250 		/*
251 		 * We must wait until now, because the AllocList module
252 		 * uses the DebugMem module.
253 		 */
254 		if (rf_memDebug)
255 			rf_print_unfreed();
256 	}
257 	RF_UNLOCK_MUTEX(configureMutex);
258 }
259 
260 
261 /*
262  * Called to shut down an array.
263  */
264 int
rf_Shutdown(RF_Raid_t * raidPtr)265 rf_Shutdown(RF_Raid_t *raidPtr)
266 {
267 	if (!raidPtr->valid) {
268 		RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe"
269 		    " driver. Aborting shutdown.\n");
270 		return (EINVAL);
271 	}
272 	/*
273 	 * Wait for outstanding IOs to land.
274 	 * As described in rf_raid.h, we use the rad_freelist lock
275 	 * to protect the per-array info about outstanding descs,
276 	 * since we need to do freelist locking anyway, and this
277 	 * cuts down on the amount of serialization we've got going
278 	 * on.
279 	 */
280 	RF_FREELIST_DO_LOCK(rf_rad_freelist);
281 	if (raidPtr->waitShutdown) {
282 		RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
283 		return (EBUSY);
284 	}
285 	raidPtr->waitShutdown = 1;
286 	while (raidPtr->nAccOutstanding) {
287 		RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
288 	}
289 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
290 
291 	/* Wait for any parity re-writes to stop... */
292 	while (raidPtr->parity_rewrite_in_progress) {
293 		printf("Waiting for parity re-write to exit...\n");
294 		tsleep(&raidPtr->parity_rewrite_in_progress, PRIBIO,
295 		       "rfprwshutdown", 0);
296 	}
297 
298 	raidPtr->valid = 0;
299 
300 	rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE);
301 
302 	rf_UnconfigureVnodes(raidPtr);
303 
304 	rf_ShutdownList(&raidPtr->shutdownList);
305 
306 	rf_UnconfigureArray();
307 
308 	return (0);
309 }
310 
/*
 * Helper macros for rf_Configure(). All of them expect an int `rc' in
 * scope and return from the enclosing function when a step fails; the
 * DO_RAID_* variants additionally expect `raidPtr' (and, for
 * DO_RAID_INIT_CONFIGURE, `cfgPtr') to be in scope.
 */

/*
 * Run one global (non per-array) configuration routine `f' against the
 * global shutdown list. Must be used with configureMutex held: on
 * failure the global shutdown list is run, this configuration's count is
 * dropped, the mutex is released and rc is returned.
 */
#define	DO_INIT_CONFIGURE(f)						\
do {									\
	rc = f (&globalShutdown);					\
	if (rc != 0) {							\
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d.\n",		\
		    RF_STRING(f), rc);					\
		rf_ShutdownList(&globalShutdown);			\
		configureCount--;					\
		RF_UNLOCK_MUTEX(configureMutex);			\
		return (rc);						\
	}								\
} while (0)

/*
 * Back out a partially configured array: unconfigure its vnodes, run its
 * shutdown list and drop its reference on the global configuration.
 */
#define	DO_RAID_FAIL()							\
do {									\
	rf_UnconfigureVnodes(raidPtr);					\
	rf_ShutdownList(&raidPtr->shutdownList);			\
	rf_UnconfigureArray();						\
} while (0)

/*
 * Run one per-array configuration routine `f'; on failure back the array
 * out with DO_RAID_FAIL() and return rc.
 */
#define	DO_RAID_INIT_CONFIGURE(f)					\
do {									\
	rc = (f)(&raidPtr->shutdownList, raidPtr, cfgPtr);		\
	if (rc != 0) {							\
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d.\n",		\
		    RF_STRING(f), rc);					\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)

/*
 * Create the mutex `_m_' on the array's shutdown list; on failure back
 * the array out with DO_RAID_FAIL() and return rc.
 */
#define	DO_RAID_MUTEX(_m_)						\
do {									\
	rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_));	\
	if (rc != 0) {							\
		RF_ERRORMSG3("Unable to init mutex file %s line %d"	\
		    " rc=%d.\n", __FILE__, __LINE__, rc);		\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)

/*
 * Create the condition variable `_c_' on the array's shutdown list; on
 * failure back the array out with DO_RAID_FAIL() and return rc.
 */
#define	DO_RAID_COND(_c_)						\
do {									\
	rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_));	\
	if (rc != 0) {							\
		RF_ERRORMSG3("Unable to init cond file %s line %d"	\
		    " rc=%d.\n", __FILE__, __LINE__, rc);		\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)
363 
364 int
rf_Configure(RF_Raid_t * raidPtr,RF_Config_t * cfgPtr,RF_AutoConfig_t * ac)365 rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, RF_AutoConfig_t *ac)
366 {
367 	RF_RowCol_t row, col;
368 	int i, rc;
369 
370 	/*
371 	 * XXX This check can probably be removed now, since
372 	 * RAIDFRAME_CONFIGURE now checks to make sure that the
373 	 * RAID set is not already valid.
374 	 */
375 	if (raidPtr->valid) {
376 		RF_ERRORMSG("RAIDframe configuration not shut down."
377 		    " Aborting configure.\n");
378 		return (EINVAL);
379 	}
380 	RF_LOCK_MUTEX(configureMutex);
381 	configureCount++;
382 	if (isconfigged == 0) {
383 		rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
384 		if (rc) {
385 			RF_ERRORMSG3("Unable to init mutex file %s line %d"
386 			    " rc=%d.\n", __FILE__, __LINE__, rc);
387 			rf_ShutdownList(&globalShutdown);
388 			return (rc);
389 		}
390 		/* Initialize globals. */
391 #ifdef	RAIDDEBUG
392 		printf("RAIDFRAME: protectedSectors is %ld.\n",
393 		       rf_protectedSectors);
394 #endif	/* RAIDDEBUG */
395 
396 		rf_clear_debug_print_buffer();
397 
398 		DO_INIT_CONFIGURE(rf_ConfigureAllocList);
399 
400 		/*
401 		 * Yes, this does make debugging general to the whole
402 		 * system instead of being array specific. Bummer, drag.
403 		 */
404 		rf_ConfigureDebug(cfgPtr);
405 		DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
406 		DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
407 		DO_INIT_CONFIGURE(rf_ConfigureMapModule);
408 		DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
409 		DO_INIT_CONFIGURE(rf_ConfigureCallback);
410 		DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
411 		DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
412 		DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
413 		DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
414 		DO_INIT_CONFIGURE(rf_ConfigureMCPair);
415 		DO_INIT_CONFIGURE(rf_ConfigureDAGs);
416 		DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
417 		DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
418 		DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
419 		DO_INIT_CONFIGURE(rf_ConfigureCopyback);
420 		DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
421 		isconfigged = 1;
422 	}
423 	RF_UNLOCK_MUTEX(configureMutex);
424 
425 	DO_RAID_MUTEX(&raidPtr->mutex);
426 	/*
427 	 * Set up the cleanup list. Do this after ConfigureDebug so that
428 	 * value of memDebug will be set.
429 	 */
430 
431 	rf_MakeAllocList(raidPtr->cleanupList);
432 	if (raidPtr->cleanupList == NULL) {
433 		DO_RAID_FAIL();
434 		return (ENOMEM);
435 	}
436 	rc = rf_ShutdownCreate(&raidPtr->shutdownList,
437 	    (void (*) (void *)) rf_FreeAllocList, raidPtr->cleanupList);
438 	if (rc) {
439 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
440 		    " rc=%d.\n", __FILE__, __LINE__, rc);
441 		DO_RAID_FAIL();
442 		return (rc);
443 	}
444 	raidPtr->numRow = cfgPtr->numRow;
445 	raidPtr->numCol = cfgPtr->numCol;
446 	raidPtr->numSpare = cfgPtr->numSpare;
447 
448 	/*
449 	 * XXX We don't even pretend to support more than one row in the
450 	 * kernel...
451 	 */
452 	if (raidPtr->numRow != 1) {
453 		RF_ERRORMSG("Only one row supported in kernel.\n");
454 		DO_RAID_FAIL();
455 		return (EINVAL);
456 	}
457 	RF_CallocAndAdd(raidPtr->status, raidPtr->numRow,
458 	    sizeof(RF_RowStatus_t), (RF_RowStatus_t *), raidPtr->cleanupList);
459 	if (raidPtr->status == NULL) {
460 		DO_RAID_FAIL();
461 		return (ENOMEM);
462 	}
463 	RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
464 	    sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
465 	if (raidPtr->reconControl == NULL) {
466 		DO_RAID_FAIL();
467 		return (ENOMEM);
468 	}
469 	for (i = 0; i < raidPtr->numRow; i++) {
470 		raidPtr->status[i] = rf_rs_optimal;
471 		raidPtr->reconControl[i] = NULL;
472 	}
473 
474 	DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
475 	DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
476 
477 	DO_RAID_COND(&raidPtr->outstandingCond);
478 
479 	raidPtr->nAccOutstanding = 0;
480 	raidPtr->waitShutdown = 0;
481 
482 	DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
483 	DO_RAID_COND(&raidPtr->quiescent_cond);
484 
485 	DO_RAID_COND(&raidPtr->waitForReconCond);
486 
487 	DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
488 
489 	if (ac != NULL) {
490 		/*
491 		 * We have an AutoConfig structure... Don't do the
492 		 * normal disk configuration... call the auto config
493 		 * stuff.
494 		 */
495 		rf_AutoConfigureDisks(raidPtr, cfgPtr, ac);
496 	} else {
497 		DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
498 		DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
499 	}
500 	/*
501 	 * Do this after ConfigureDisks & ConfigureSpareDisks to be sure
502 	 * devno is set.
503 	 */
504 	DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
505 
506 	DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
507 
508 	DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
509 
510 	for (row = 0; row < raidPtr->numRow; row++) {
511 		for (col = 0; col < raidPtr->numCol; col++) {
512 			/*
513 			 * XXX Better distribution.
514 			 */
515 			raidPtr->hist_diskreq[row][col] = 0;
516 		}
517 	}
518 
519 	raidPtr->numNewFailures = 0;
520 	raidPtr->copyback_in_progress = 0;
521 	raidPtr->parity_rewrite_in_progress = 0;
522 	raidPtr->recon_in_progress = 0;
523 	raidPtr->maxOutstanding = cfgPtr->maxOutstandingDiskReqs;
524 
525 	/*
526 	 * Autoconfigure and root_partition will actually get filled in
527 	 * after the config is done.
528 	 */
529 	raidPtr->autoconfigure = 0;
530 	raidPtr->root_partition = 0;
531 	raidPtr->last_unit = raidPtr->raidid;
532 	raidPtr->config_order = 0;
533 
534 	if (rf_keepAccTotals) {
535 		raidPtr->keep_acc_totals = 1;
536 	}
537 	rf_StartUserStats(raidPtr);
538 
539 	raidPtr->valid = 1;
540 	return (0);
541 }
542 
543 int
rf_init_rad(RF_RaidAccessDesc_t * desc)544 rf_init_rad(RF_RaidAccessDesc_t *desc)
545 {
546 	int rc;
547 
548 	rc = rf_mutex_init(&desc->mutex);
549 	if (rc) {
550 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n", __FILE__,
551 		    __LINE__, rc);
552 		return (rc);
553 	}
554 	rc = rf_cond_init(&desc->cond);
555 	if (rc) {
556 		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d.\n", __FILE__,
557 		    __LINE__, rc);
558 		rf_mutex_destroy(&desc->mutex);
559 		return (rc);
560 	}
561 	return (0);
562 }
563 
564 void
rf_clean_rad(RF_RaidAccessDesc_t * desc)565 rf_clean_rad(RF_RaidAccessDesc_t *desc)
566 {
567 	rf_mutex_destroy(&desc->mutex);
568 	rf_cond_destroy(&desc->cond);
569 }
570 
571 void
rf_ShutdownRDFreeList(void * ignored)572 rf_ShutdownRDFreeList(void *ignored)
573 {
574 	RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist, next,
575 	    (RF_RaidAccessDesc_t *), rf_clean_rad);
576 }
577 
578 int
rf_ConfigureRDFreeList(RF_ShutdownList_t ** listp)579 rf_ConfigureRDFreeList(RF_ShutdownList_t **listp)
580 {
581 	int rc;
582 
583 	RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
584 	    RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
585 	if (rf_rad_freelist == NULL) {
586 		return (ENOMEM);
587 	}
588 	rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
589 	if (rc) {
590 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d.\n", __FILE__,
591 		    __LINE__, rc);
592 		rf_ShutdownRDFreeList(NULL);
593 		return (rc);
594 	}
595 	RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL, next,
596 	    (RF_RaidAccessDesc_t *), rf_init_rad);
597 	return (0);
598 }
599 
600 RF_RaidAccessDesc_t *
rf_AllocRaidAccDesc(RF_Raid_t * raidPtr,RF_IoType_t type,RF_RaidAddr_t raidAddress,RF_SectorCount_t numBlocks,caddr_t bufPtr,void * bp,RF_DagHeader_t ** paramDAG,RF_AccessStripeMapHeader_t ** paramASM,RF_RaidAccessFlags_t flags,void (* cbF)(struct buf *),void * cbA,RF_AccessState_t * states)601 rf_AllocRaidAccDesc(
602     RF_Raid_t			 *raidPtr,
603     RF_IoType_t			  type,
604     RF_RaidAddr_t		  raidAddress,
605     RF_SectorCount_t		  numBlocks,
606     caddr_t			  bufPtr,
607     void			 *bp,
608     RF_DagHeader_t		**paramDAG,
609     RF_AccessStripeMapHeader_t	**paramASM,
610     RF_RaidAccessFlags_t	  flags,
611     void			(*cbF) (struct buf *),
612     void			 *cbA,
613     RF_AccessState_t		 *states
614 )
615 {
616 	RF_RaidAccessDesc_t *desc;
617 
618 	RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist, desc, next,
619 	    (RF_RaidAccessDesc_t *), rf_init_rad);
620 	if (raidPtr->waitShutdown) {
621 		/*
622 		 * Actually, we're shutting the array down. Free the desc
623 		 * and return NULL.
624 		 */
625 		RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
626 		RF_FREELIST_FREE_CLEAN(rf_rad_freelist, desc, next,
627 		    rf_clean_rad);
628 		return (NULL);
629 	}
630 	raidPtr->nAccOutstanding++;
631 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
632 
633 	desc->raidPtr = (void *) raidPtr;
634 	desc->type = type;
635 	desc->raidAddress = raidAddress;
636 	desc->numBlocks = numBlocks;
637 	desc->bufPtr = bufPtr;
638 	desc->bp = bp;
639 	desc->paramDAG = paramDAG;
640 	desc->paramASM = paramASM;
641 	desc->flags = flags;
642 	desc->states = states;
643 	desc->state = 0;
644 
645 	desc->status = 0;
646 	bzero((char *) &desc->tracerec, sizeof(RF_AccTraceEntry_t));
647 	desc->callbackFunc = (void (*) (RF_CBParam_t)) cbF;	/* XXX */
648 	desc->callbackArg = cbA;
649 	desc->next = NULL;
650 	desc->head = desc;
651 	desc->numPending = 0;
652 	desc->cleanupList = NULL;
653 	rf_MakeAllocList(desc->cleanupList);
654 	return (desc);
655 }
656 
657 void
rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc)658 rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc)
659 {
660 	RF_Raid_t *raidPtr = desc->raidPtr;
661 
662 	RF_ASSERT(desc);
663 
664 	rf_FreeAllocList(desc->cleanupList);
665 	RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist, desc, next, rf_clean_rad);
666 	raidPtr->nAccOutstanding--;
667 	if (raidPtr->waitShutdown) {
668 		RF_SIGNAL_COND(raidPtr->outstandingCond);
669 	}
670 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
671 }
672 
673 
674 /********************************************************************
675  * Main routine for performing an access.
676  * Accesses are retried until a DAG can not be selected. This occurs
677  * when either the DAG library is incomplete or there are too many
678  * failures in a parity group.
679  ********************************************************************/
680 int
rf_DoAccess(RF_Raid_t * raidPtr,RF_IoType_t type,int async_flag,RF_RaidAddr_t raidAddress,RF_SectorCount_t numBlocks,caddr_t bufPtr,void * bp_in,RF_DagHeader_t ** paramDAG,RF_AccessStripeMapHeader_t ** paramASM,RF_RaidAccessFlags_t flags,RF_RaidAccessDesc_t ** paramDesc,void (* cbF)(struct buf *),void * cbA)681 rf_DoAccess(
682     RF_Raid_t			 *raidPtr,
683     RF_IoType_t			  type,		/* Should be read or write. */
684     int				  async_flag,	/*
685 						 * Should be RF_TRUE
686 						 * or RF_FALSE.
687 						 */
688     RF_RaidAddr_t		  raidAddress,
689     RF_SectorCount_t		  numBlocks,
690     caddr_t			  bufPtr,
691     void			 *bp_in,	/*
692 						 * It's a buf pointer.
693 						 * void * to facilitate
694 						 * ignoring it outside
695 						 * the kernel.
696 						 */
697     RF_DagHeader_t		**paramDAG,
698     RF_AccessStripeMapHeader_t	**paramASM,
699     RF_RaidAccessFlags_t	  flags,
700     RF_RaidAccessDesc_t		**paramDesc,
701     void			(*cbF) (struct buf *),
702     void			 *cbA
703 )
704 {
705 	RF_RaidAccessDesc_t *desc;
706 	caddr_t lbufPtr = bufPtr;
707 	struct buf *bp = (struct buf *) bp_in;
708 
709 	raidAddress += rf_raidSectorOffset;
710 
711 	if (!raidPtr->valid) {
712 		RF_ERRORMSG("RAIDframe driver not successfully configured."
713 		    " Rejecting access.\n");
714 		IO_BUF_ERR(bp, EINVAL);
715 		return (EINVAL);
716 	}
717 
718 	if (rf_accessDebug) {
719 
720 		printf("logBytes is: %d %d %d.\n", raidPtr->raidid,
721 		    raidPtr->logBytesPerSector,
722 		    (int) rf_RaidAddressToByte(raidPtr, numBlocks));
723 		printf("raid%d: %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx.\n", raidPtr->raidid,
724 		    (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
725 		    (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
726 		    (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
727 		    (int) numBlocks,
728 		    (int) rf_RaidAddressToByte(raidPtr, numBlocks),
729 		    (long) bufPtr);
730 	}
731 	if (raidAddress + numBlocks > raidPtr->totalSectors) {
732 
733 		printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu.\n",
734 		    (u_long) raidAddress, (u_long) numBlocks, (u_long) raidPtr->totalSectors);
735 
736 			IO_BUF_ERR(bp, ENOSPC);
737 			return (ENOSPC);
738 	}
739 	desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
740 	    numBlocks, lbufPtr, bp, paramDAG, paramASM,
741 	    flags, cbF, cbA, raidPtr->Layout.map->states);
742 
743 	if (desc == NULL) {
744 		return (ENOMEM);
745 	}
746 	RF_ETIMER_START(desc->tracerec.tot_timer);
747 
748 	desc->async_flag = async_flag;
749 
750 	rf_ContinueRaidAccess(desc);
751 
752 	return (0);
753 }
754 
755 
756 /* Force the array into reconfigured mode without doing reconstruction. */
757 int
rf_SetReconfiguredMode(RF_Raid_t * raidPtr,int row,int col)758 rf_SetReconfiguredMode(RF_Raid_t *raidPtr, int row, int col)
759 {
760 	if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
761 		printf("Can't set reconfigured mode in dedicated-spare"
762 		    " array.\n");
763 		RF_PANIC();
764 	}
765 	RF_LOCK_MUTEX(raidPtr->mutex);
766 	raidPtr->numFailures++;
767 	raidPtr->Disks[row][col].status = rf_ds_dist_spared;
768 	raidPtr->status[row] = rf_rs_reconfigured;
769 	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
770 	/*
771 	 * Install spare table only if declustering + distributed sparing
772 	 * architecture.
773 	 */
774 	if (raidPtr->Layout.map->flags & RF_BD_DECLUSTERED)
775 		rf_InstallSpareTable(raidPtr, row, col);
776 	RF_UNLOCK_MUTEX(raidPtr->mutex);
777 	return (0);
778 }
779 
780 extern int fail_row, fail_col, fail_time;
781 extern int delayed_recon;
782 
783 int
rf_FailDisk(RF_Raid_t * raidPtr,int frow,int fcol,int initRecon)784 rf_FailDisk(RF_Raid_t *raidPtr, int frow, int fcol, int initRecon)
785 {
786 	printf("raid%d: Failing disk r%d c%d.\n", raidPtr->raidid, frow, fcol);
787 	RF_LOCK_MUTEX(raidPtr->mutex);
788 	raidPtr->numFailures++;
789 	raidPtr->Disks[frow][fcol].status = rf_ds_failed;
790 	raidPtr->status[frow] = rf_rs_degraded;
791 	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
792 	RF_UNLOCK_MUTEX(raidPtr->mutex);
793 	if (initRecon)
794 		rf_ReconstructFailedDisk(raidPtr, frow, fcol);
795 	return (0);
796 }
797 
798 
799 /*
800  * Releases a thread that is waiting for the array to become quiesced.
801  * access_suspend_mutex should be locked upon calling this.
802  */
803 void
rf_SignalQuiescenceLock(RF_Raid_t * raidPtr,RF_RaidReconDesc_t * reconDesc)804 rf_SignalQuiescenceLock(RF_Raid_t *raidPtr, RF_RaidReconDesc_t *reconDesc)
805 {
806 	if (rf_quiesceDebug) {
807 		printf("raid%d: Signalling quiescence lock.\n",
808 		       raidPtr->raidid);
809 	}
810 	raidPtr->access_suspend_release = 1;
811 
812 	if (raidPtr->waiting_for_quiescence) {
813 		SIGNAL_QUIESCENT_COND(raidPtr);
814 	}
815 }
816 
817 
818 /*
819  * Suspends all new requests to the array. No effect on accesses that are
820  * in flight.
821  */
822 int
rf_SuspendNewRequestsAndWait(RF_Raid_t * raidPtr)823 rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr)
824 {
825 	if (rf_quiesceDebug)
826 		printf("Suspending new reqs.\n");
827 
828 	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
829 	raidPtr->accesses_suspended++;
830 	raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
831 
832 	if (raidPtr->waiting_for_quiescence) {
833 		raidPtr->access_suspend_release = 0;
834 		while (!raidPtr->access_suspend_release) {
835 			printf("Suspending: Waiting for Quiescence.\n");
836 			WAIT_FOR_QUIESCENCE(raidPtr);
837 			raidPtr->waiting_for_quiescence = 0;
838 		}
839 	}
840 	printf("Quiescence reached...\n");
841 
842 	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
843 	return (raidPtr->waiting_for_quiescence);
844 }
845 
846 
847 /* Wake up everyone waiting for quiescence to be released. */
848 void
rf_ResumeNewRequests(RF_Raid_t * raidPtr)849 rf_ResumeNewRequests(RF_Raid_t *raidPtr)
850 {
851 	RF_CallbackDesc_t *t, *cb;
852 
853 	if (rf_quiesceDebug)
854 		printf("Resuming new reqs.\n");
855 
856 	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
857 	raidPtr->accesses_suspended--;
858 	if (raidPtr->accesses_suspended == 0)
859 		cb = raidPtr->quiesce_wait_list;
860 	else
861 		cb = NULL;
862 	raidPtr->quiesce_wait_list = NULL;
863 	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
864 
865 	while (cb) {
866 		t = cb;
867 		cb = cb->next;
868 		(t->callbackFunc) (t->callbackArg);
869 		rf_FreeCallbackDesc(t);
870 	}
871 }
872 
873 
874 /*****************************************************************************
875  *
876  * Debug routines.
877  *
878  *****************************************************************************/
879 
880 void
rf_set_debug_option(char * name,long val)881 rf_set_debug_option(char *name, long val)
882 {
883 	RF_DebugName_t *p;
884 
885 	for (p = rf_debugNames; p->name; p++) {
886 		if (!strcmp(p->name, name)) {
887 			*(p->ptr) = val;
888 			printf("[Set debug variable %s to %ld]\n", name, val);
889 			return;
890 		}
891 	}
892 	RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
893 }
894 
895 
896 /* Would like to use sscanf here, but apparently not available in kernel. */
897 /*ARGSUSED*/
898 void
rf_ConfigureDebug(RF_Config_t * cfgPtr)899 rf_ConfigureDebug(RF_Config_t *cfgPtr)
900 {
901 	char *val_p, *name_p, *white_p;
902 	long val;
903 	int i;
904 
905 	rf_ResetDebugOptions();
906 	for (i = 0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
907 		name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
908 		white_p = rf_find_white(name_p);	/*
909 							 * Skip to start of 2nd
910 							 * word.
911 							 */
912 		val_p = rf_find_non_white(white_p);
913 		if (*val_p == '0' && *(val_p + 1) == 'x')
914 			val = rf_htoi(val_p + 2);
915 		else
916 			val = rf_atoi(val_p);
917 		*white_p = '\0';
918 		rf_set_debug_option(name_p, val);
919 	}
920 }
921 
922 
923 /* Performance monitoring stuff. */
924 
925 #if	!defined(_KERNEL) && !defined(SIMULATE)
926 
927 /*
928  * Throughput stats currently only used in user-level RAIDframe.
929  */
930 
931 int
rf_InitThroughputStats(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)932 rf_InitThroughputStats(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
933     RF_Config_t *cfgPtr)
934 {
935 	int rc;
936 
937 	/* These used by user-level RAIDframe only. */
938 	rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
939 	if (rc) {
940 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n",
941 		    __FILE__, __LINE__, rc);
942 		return (rc);
943 	}
944 	raidPtr->throughputstats.sum_io_us = 0;
945 	raidPtr->throughputstats.num_ios = 0;
946 	raidPtr->throughputstats.num_out_ios = 0;
947 	return (0);
948 }
949 
950 void
rf_StartThroughputStats(RF_Raid_t * raidPtr)951 rf_StartThroughputStats(RF_Raid_t *raidPtr)
952 {
953 	RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
954 	raidPtr->throughputstats.num_ios++;
955 	raidPtr->throughputstats.num_out_ios++;
956 	if (raidPtr->throughputstats.num_out_ios == 1)
957 		RF_GETTIME(raidPtr->throughputstats.start);
958 	RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
959 }
960 
961 void
rf_StopThroughputStats(RF_Raid_t * raidPtr)962 rf_StopThroughputStats(RF_Raid_t *raidPtr)
963 {
964 	struct timeval diff;
965 
966 	RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
967 	raidPtr->throughputstats.num_out_ios--;
968 	if (raidPtr->throughputstats.num_out_ios == 0) {
969 		RF_GETTIME(raidPtr->throughputstats.stop);
970 		RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start,
971 		    &raidPtr->throughputstats.stop, &diff);
972 		raidPtr->throughputstats.sum_io_us += RF_TIMEVAL_TO_US(diff);
973 	}
974 	RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
975 }
976 
977 void
rf_PrintThroughputStats(RF_Raid_t * raidPtr)978 rf_PrintThroughputStats(RF_Raid_t *raidPtr)
979 {
980 	RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
981 	if (raidPtr->throughputstats.sum_io_us != 0) {
982 		printf("[Througphut: %8.2f IOs/second]\n",
983 		    raidPtr->throughputstats.num_ios /
984 		    (raidPtr->throughputstats.sum_io_us / 1000000.0));
985 	}
986 }
987 
988 #endif	/* !_KERNEL && !SIMULATE */
989 
990 void
rf_StartUserStats(RF_Raid_t * raidPtr)991 rf_StartUserStats(RF_Raid_t *raidPtr)
992 {
993 	RF_GETTIME(raidPtr->userstats.start);
994 	raidPtr->userstats.sum_io_us = 0;
995 	raidPtr->userstats.num_ios = 0;
996 	raidPtr->userstats.num_sect_moved = 0;
997 }
998 
999 void
rf_StopUserStats(RF_Raid_t * raidPtr)1000 rf_StopUserStats(RF_Raid_t *raidPtr)
1001 {
1002 	RF_GETTIME(raidPtr->userstats.stop);
1003 }
1004 
1005 void
rf_UpdateUserStats(RF_Raid_t * raidPtr,int rt,int numsect)1006 rf_UpdateUserStats(
1007     RF_Raid_t	*raidPtr,
1008     int		 rt,		/* Response time in us. */
1009     int		 numsect	/* Number of sectors for this access. */
1010 )
1011 {
1012 	raidPtr->userstats.sum_io_us += rt;
1013 	raidPtr->userstats.num_ios++;
1014 	raidPtr->userstats.num_sect_moved += numsect;
1015 }
1016 
1017 void
rf_PrintUserStats(RF_Raid_t * raidPtr)1018 rf_PrintUserStats(RF_Raid_t *raidPtr)
1019 {
1020 	long    elapsed_us, mbs, mbs_frac;
1021 	struct timeval diff;
1022 
1023 	RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop,
1024 	    &diff);
1025 	elapsed_us = RF_TIMEVAL_TO_US(diff);
1026 
1027 	/* 2000 sectors per megabyte, 10000000 microseconds per second. */
1028 	if (elapsed_us)
1029 		mbs = (raidPtr->userstats.num_sect_moved / 2000) /
1030 		    (elapsed_us / 1000000);
1031 	else
1032 		mbs = 0;
1033 
1034 	/* This computes only the first digit of the fractional mb/s moved. */
1035 	if (elapsed_us) {
1036 		mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) /
1037 		    (elapsed_us / 1000000)) - (mbs * 10);
1038 	} else {
1039 		mbs_frac = 0;
1040 	}
1041 
1042 	printf("Number of I/Os:             %ld\n",
1043 	    raidPtr->userstats.num_ios);
1044 	printf("Elapsed time (us):          %ld\n",
1045 	    elapsed_us);
1046 	printf("User I/Os per second:       %ld\n",
1047 	    RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us / 1000000)));
1048 	printf("Average user response time: %ld us\n",
1049 	    RF_DB0_CHECK(raidPtr->userstats.sum_io_us,
1050 	     raidPtr->userstats.num_ios));
1051 	printf("Total sectors moved:        %ld\n",
1052 	    raidPtr->userstats.num_sect_moved);
1053 	printf("Average access size (sect): %ld\n",
1054 	    RF_DB0_CHECK(raidPtr->userstats.num_sect_moved,
1055 	    raidPtr->userstats.num_ios));
1056 	printf("Achieved data rate:         %ld.%ld MB/sec\n",
1057 	    mbs, mbs_frac);
1058 }
1059