1 /*	$OpenBSD: rf_decluster.c,v 1.5 2002/12/16 07:01:03 tdeval Exp $	*/
2 /*	$NetBSD: rf_decluster.c,v 1.5 2000/03/07 01:54:29 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Mark Holland
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*****************************************************************************
32  *
33  * rf_decluster.c -- Code related to the declustered layout.
34  *
35  * Created 10-21-92 (MCH)
36  *
37  * Nov 93:	Adding support for distributed sparing. This code is a little
38  *		complex; the basic layout used is as follows:
39  *		Let F = (v-1)/GCD(r,v-1). The spare space for each set of
40  *		F consecutive fulltables is grouped together and placed after
41  *		that set of tables.
42  *			+-------------------------------+
43  *			|	  F fulltables		|
44  *			|	  Spare Space		|
45  *			|	  F fulltables		|
46  *			|	  Spare Space		|
47  *			|	      ...		|
48  *			+-------------------------------+
49  *
50  *****************************************************************************/
51 
52 #include "rf_types.h"
53 #include "rf_raid.h"
54 #include "rf_raidframe.h"
55 #include "rf_configure.h"
56 #include "rf_decluster.h"
57 #include "rf_debugMem.h"
58 #include "rf_utils.h"
59 #include "rf_alloclist.h"
60 #include "rf_general.h"
61 #include "rf_shutdown.h"
62 
63 extern int rf_copyback_in_progress;	/* Debug only. */
64 
65 /* Found in rf_kintf.c */
66 int  rf_GetSpareTableFromDaemon(RF_SparetWait_t *);
67 
68 /* Configuration code. */
69 
70 int
rf_ConfigureDeclustered(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)71 rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
72     RF_Config_t *cfgPtr)
73 {
74 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
75 	int b, v, k, r, lambda;	/* block design params */
76 	int i, j;
77 	RF_RowCol_t *first_avail_slot;
78 	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
79 	RF_DeclusteredConfigInfo_t *info;
80 	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs,
81 	    numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
82 	RF_StripeCount_t totSparePUsPerDisk;
83 	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
84 	RF_SectorCount_t SpareSpaceInSUs;
85 	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
86 	RF_StripeNum_t l, SUID;
87 
88 	SUID = l = 0;
89 	numCompleteSpareRegionsPerDisk = 0;
90 
91 	/* 1. Create layout specific structure. */
92 	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t),
93 	    (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
94 	if (info == NULL)
95 		return (ENOMEM);
96 	layoutPtr->layoutSpecificInfo = (void *) info;
97 	info->SpareTable = NULL;
98 
99 	/* 2. Extract parameters from the config structure. */
100 	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
101 		bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
102 	}
103 	cfgBuf += RF_SPAREMAP_NAME_LEN;
104 
105 	b = *((int *) cfgBuf);
106 	cfgBuf += sizeof(int);
107 	v = *((int *) cfgBuf);
108 	cfgBuf += sizeof(int);
109 	k = *((int *) cfgBuf);
110 	cfgBuf += sizeof(int);
111 	r = *((int *) cfgBuf);
112 	cfgBuf += sizeof(int);
113 	lambda = *((int *) cfgBuf);
114 	cfgBuf += sizeof(int);
115 	raidPtr->noRotate = *((int *) cfgBuf);
116 	cfgBuf += sizeof(int);
117 
118 	/*
119 	 * The sparemaps are generated assuming that parity is rotated, so we
120 	 * issue a warning if both distributed sparing and no-rotate are on at
121 	 * the same time.
122 	 */
123 	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) &&
124 	    raidPtr->noRotate) {
125 		RF_ERRORMSG("Warning:  distributed sparing specified without"
126 		    " parity rotation.\n");
127 	}
128 	if (raidPtr->numCol != v) {
129 		RF_ERRORMSG2("RAID: config error: table element count (%d)"
130 		    " not equal to no. of cols (%d).\n", v, raidPtr->numCol);
131 		return (EINVAL);
132 	}
133 	/* 3. Set up the values used in the mapping code. */
134 	info->BlocksPerTable = b;
135 	info->Lambda = lambda;
136 	info->NumParityReps = info->groupSize = k;
137 	/* b blks, k-1 SUs each. */
138 	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;
139 	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
140 	info->PUsPerBlock = k - 1;
141 	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
142 	info->TableDepthInPUs = (b * k) / v;
143 	/* k repetitions. */
144 	info->FullTableDepthInPUs = info->TableDepthInPUs * k;
145 
146 	/* Used only in distributed sparing case. */
147 	/* (v-1)/gcd fulltables. */
148 	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);
149 	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
150 	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion /
151 	    (v - 1)) * layoutPtr->SUsPerPU;
152 
153 	/* Check to make sure the block design is sufficiently small. */
154 	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
155 		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU +
156 		    info->SpareSpaceDepthPerRegionInSUs >
157 		    layoutPtr->stripeUnitsPerDisk) {
158 			RF_ERRORMSG3("RAID: config error: Full Table depth"
159 			    " (%d) + Spare Space (%d) larger than disk size"
160 			    " (%d) (BD too big).\n",
161 			    (int) info->FullTableDepthInPUs,
162 			    (int) info->SpareSpaceDepthPerRegionInSUs,
163 			    (int) layoutPtr->stripeUnitsPerDisk);
164 			return (EINVAL);
165 		}
166 	} else {
167 		if (info->TableDepthInPUs * layoutPtr->SUsPerPU >
168 		    layoutPtr->stripeUnitsPerDisk) {
169 			RF_ERRORMSG2("RAID: config error: Table depth (%d)"
170 			    " larger than disk size (%d) (BD too big).\n",
171 			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU),
172 			    (int) layoutPtr->stripeUnitsPerDisk);
173 			return (EINVAL);
174 		}
175 	}
176 
177 
178 	/*
179 	 * Compute the size of each disk, and the number of tables in the last
180 	 * fulltable (which need not be complete).
181 	 */
182 	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
183 
184 		PUsPerDisk = layoutPtr->stripeUnitsPerDisk /
185 		    layoutPtr->SUsPerPU;
186 		spareRegionDepthInPUs =
187 		    (info->TablesPerSpareRegion * info->TableDepthInPUs +
188 		    (info->TablesPerSpareRegion * info->TableDepthInPUs) /
189 		    (v - 1));
190 		info->SpareRegionDepthInSUs =
191 		    spareRegionDepthInPUs * layoutPtr->SUsPerPU;
192 
193 		numCompleteSpareRegionsPerDisk =
194 		    PUsPerDisk / spareRegionDepthInPUs;
195 		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
196 		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
197 
198 		/*
199 		 * Assume conservatively that we need the full amount of spare
200 		 * space in one region in order to provide spares for the
201 		 * partial spare region at the end of the array. We set "i"
202 		 * to the number of tables in the partial spare region. This
203 		 * may actually include some fulltables.
204 		 */
205 		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs /
206 		    layoutPtr->SUsPerPU);
207 		if (extraPUsPerDisk <= 0)
208 			i = 0;
209 		else
210 			i = extraPUsPerDisk / info->TableDepthInPUs;
211 
212 		complete_FT_count = raidPtr->numRow *
213 		    (numCompleteSpareRegionsPerDisk *
214 		    (info->TablesPerSpareRegion / k) + i / k);
215 		info->FullTableLimitSUID =
216 		    complete_FT_count * info->SUsPerFullTable;
217 		info->ExtraTablesPerDisk = i % k;
218 
219 		/*
220 		 * Note that in the last spare region, the spare space is
221 		 * complete even though data/parity space is not.
222 		 */
223 		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) *
224 		    (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
225 		info->TotSparePUsPerDisk = totSparePUsPerDisk;
226 
227 		layoutPtr->stripeUnitsPerDisk =
228 		    ((complete_FT_count / raidPtr->numRow) *
229 		    info->FullTableDepthInPUs +	/* data & parity space */
230 		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
231 		    totSparePUsPerDisk		/* spare space */
232 		    ) * layoutPtr->SUsPerPU;
233 		layoutPtr->dataStripeUnitsPerDisk =
234 		    (complete_FT_count * info->FullTableDepthInPUs +
235 		    info->ExtraTablesPerDisk * info->TableDepthInPUs) *
236 		    layoutPtr->SUsPerPU * (k - 1) / k;
237 
238 	} else {
239 		/*
240 		 * Non-dist spare case:  force each disk to contain an
241 		 * integral number of tables.
242 		 */
243 		layoutPtr->stripeUnitsPerDisk /=
244 		    (info->TableDepthInPUs * layoutPtr->SUsPerPU);
245 		layoutPtr->stripeUnitsPerDisk *=
246 		    (info->TableDepthInPUs * layoutPtr->SUsPerPU);
247 
248 		/*
249 		 * Compute the number of tables in the last fulltable, which
250 		 * need not be complete.
251 		 */
252 		complete_FT_count =
253 		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
254 		    info->FullTableDepthInPUs) * raidPtr->numRow;
255 
256 		info->FullTableLimitSUID =
257 		    complete_FT_count * info->SUsPerFullTable;
258 		info->ExtraTablesPerDisk =
259 		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
260 		    info->TableDepthInPUs) % k;
261 	}
262 
263 	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
264 		    layoutPtr->sectorsPerStripeUnit;
265 
266 	/*
267 	 * Find the disk offset of the stripe unit where the last fulltable
268 	 * starts.
269 	 */
270 	numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
271 	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk *
272 	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
273 	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
274 		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk *
275 		    info->SpareSpaceDepthPerRegionInSUs;
276 		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
277 		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
278 		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
279 		    info->TableDepthInPUs * layoutPtr->SUsPerPU;
280 	}
281 	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
282 	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
283 
284 	/* 4. Create and initialize the lookup tables. */
285 	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
286 	if (info->LayoutTable == NULL)
287 		return (ENOMEM);
288 	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
289 	if (info->OffsetTable == NULL)
290 		return (ENOMEM);
291 	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs *
292 	    layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
293 	if (info->BlockTable == NULL)
294 		return (ENOMEM);
295 
296 	first_avail_slot = rf_make_1d_array(v, NULL);
297 	if (first_avail_slot == NULL)
298 		return (ENOMEM);
299 
300 	for (i = 0; i < b; i++)
301 		for (j = 0; j < k; j++)
302 			info->LayoutTable[i][j] = *cfgBuf++;
303 
304 	/* Initialize the offset table. */
305 	for (i = 0; i < b; i++)
306 		for (j = 0; j < k; j++) {
307 			info->OffsetTable[i][j] =
308 			    first_avail_slot[info->LayoutTable[i][j]];
309 			first_avail_slot[info->LayoutTable[i][j]]++;
310 		}
311 
312 	/* Initialize the block table. */
313 	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
314 		for (i = 0; i < b; i++) {
315 			for (j = 0; j < k; j++) {
316 				info->BlockTable[(info->OffsetTable[i][j] *
317 				    layoutPtr->SUsPerPU) + l]
318 				    [info->LayoutTable[i][j]] = SUID;
319 			}
320 			SUID++;
321 		}
322 	}
323 
324 	rf_free_1d_array(first_avail_slot, v);
325 
326 	/* 5. Set up the remaining redundant-but-useful parameters. */
327 
328 	raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow *
329 	    info->ExtraTablesPerDisk) * info->SUsPerTable *
330 	    layoutPtr->sectorsPerStripeUnit;
331 	layoutPtr->numStripe = (raidPtr->totalSectors /
332 	    layoutPtr->sectorsPerStripeUnit) / (k - 1);
333 
334 	/*
335 	 * Strange evaluation order below to try and minimize overflow
336 	 * problems.
337 	 */
338 
339 	layoutPtr->dataSectorsPerStripe =
340 	    (k - 1) * layoutPtr->sectorsPerStripeUnit;
341 	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
342 	    raidPtr->logBytesPerSector;
343 	layoutPtr->numDataCol = k - 1;
344 	layoutPtr->numParityCol = 1;
345 
346 	return (0);
347 }
348 
349 /* Declustering with distributed sparing. */
350 void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
351 void
rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg)352 rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg)
353 {
354 	RF_DeclusteredConfigInfo_t *info;
355 	RF_Raid_t *raidPtr;
356 
357 	raidPtr = (RF_Raid_t *) arg;
358 	info =
359 	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
360 	if (info->SpareTable)
361 		rf_FreeSpareTable(raidPtr);
362 }
363 
364 int
rf_ConfigureDeclusteredDS(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)365 rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
366     RF_Config_t *cfgPtr)
367 {
368 	int rc;
369 
370 	rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
371 	if (rc)
372 		return (rc);
373 
374 	rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
375 	if (rc) {
376 		RF_ERRORMSG1("Got %d adding shutdown event for"
377 		    " DeclusteredDS.\n", rc);
378 		rf_ShutdownDeclusteredDS(raidPtr);
379 		return (rc);
380 	}
381 
382 	return (0);
383 }
384 
385 void
rf_MapSectorDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)386 rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
387     RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
388 {
389 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
390 	RF_DeclusteredConfigInfo_t *info =
391 	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
392 	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
393 	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
394 	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
395 	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
396 	RF_StripeCount_t fulltable_depth =
397 	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
398 	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
399 
400 	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
401 	    &fulltable_depth, &base_suid);
402 
403 	/* Fulltable ID within array (across rows). */
404 	FullTableID = SUID / sus_per_fulltable;
405 	if (raidPtr->numRow == 1)
406 		*row = 0;	/* Avoid a mod and a div in the common case. */
407 	else {
408 		*row = FullTableID % raidPtr->numRow;
409 		/* Convert to fulltable ID on this disk. */
410 		FullTableID /= raidPtr->numRow;
411 	}
412 	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
413 		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
414 		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
415 	}
416 	FullTableOffset = SUID % sus_per_fulltable;
417 	TableID = FullTableOffset / info->SUsPerTable;
418 	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
419 	BlockID = TableOffset / info->PUsPerBlock;
420 	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
421 	BlockID %= info->BlocksPerTable;
422 	RepIndex = info->PUsPerBlock - TableID;
423 	if (!raidPtr->noRotate)
424 		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
425 	*col = info->LayoutTable[BlockID][BlockOffset];
426 
427 	/* Remap to distributed spare space if indicated. */
428 	if (remap) {
429 		RF_ASSERT(raidPtr->Disks[*row][*col].status ==
430 		    rf_ds_reconstructing ||
431 		    raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
432 		    (rf_copyback_in_progress &&
433 		    raidPtr->Disks[*row][*col].status == rf_ds_optimal));
434 		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
435 		    TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
436 		    &outSU);
437 	} else {
438 
439 		outSU = base_suid;
440 		outSU += FullTableID * fulltable_depth;
441 			/* Offset to start of FT. */
442 		outSU += SpareSpace;
443 			/* Skip rsvd spare space. */
444 		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
445 			/* Offset to start of table. */
446 		outSU += info->OffsetTable[BlockID][BlockOffset] *
447 		    layoutPtr->SUsPerPU;
448 			/* Offset to the PU. */
449 	}
450 	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
451 		/* offs to the SU within a PU */
452 
453 	/*
454 	 * Convert SUs to sectors, and, if not aligned to SU boundary, add in
455 	 * offset to sector.
456 	 */
457 	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
458 	    (raidSector % layoutPtr->sectorsPerStripeUnit);
459 
460 	RF_ASSERT(*col != -1);
461 }
462 
463 /*
464  * Prototyping this inexplicably causes the compile of the layout table
465  * (rf_layout.c) to fail.
466  */
467 void
rf_MapParityDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)468 rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
469     RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
470 {
471 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
472 	RF_DeclusteredConfigInfo_t *info =
473 	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
474 	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
475 	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
476 	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
477 	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
478 	RF_StripeCount_t fulltable_depth =
479 	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
480 	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
481 
482 	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
483 	    &fulltable_depth, &base_suid);
484 
485 	/* Compute row & (possibly) spare space exactly as before. */
486 	FullTableID = SUID / sus_per_fulltable;
487 	if (raidPtr->numRow == 1)
488 		*row = 0;	/* Avoid a mod and a div in the common case. */
489 	else {
490 		*row = FullTableID % raidPtr->numRow;
491 		/* Convert to fulltable ID on this disk. */
492 		FullTableID /= raidPtr->numRow;
493 	}
494 	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
495 		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
496 		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
497 	}
498 	/* Compute BlockID and RepIndex exactly as before. */
499 	FullTableOffset = SUID % sus_per_fulltable;
500 	TableID = FullTableOffset / info->SUsPerTable;
501 	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
502 	/*TableOffset	= FullTableOffset % info->SUsPerTable;*/
503 	/*BlockID	= (TableOffset / info->PUsPerBlock) %
504 	 *info->BlocksPerTable;*/
505 	BlockID = TableOffset / info->PUsPerBlock;
506 	/*BlockOffset	= TableOffset % info->PUsPerBlock;*/
507 	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
508 	BlockID %= info->BlocksPerTable;
509 
510 	/* The parity block is in the position indicated by RepIndex. */
511 	RepIndex = (raidPtr->noRotate) ?
512 	    info->PUsPerBlock : info->PUsPerBlock - TableID;
513 	*col = info->LayoutTable[BlockID][RepIndex];
514 
515 	if (remap) {
516 		RF_ASSERT(raidPtr->Disks[*row][*col].status ==
517 		    rf_ds_reconstructing ||
518 		    raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
519 		    (rf_copyback_in_progress &&
520 		    raidPtr->Disks[*row][*col].status == rf_ds_optimal));
521 		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
522 		    TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
523 		    &outSU);
524 	} else {
525 
526 		/*
527 		 * Compute sector as before, except use RepIndex instead of
528 		 * BlockOffset.
529 		 */
530 		outSU = base_suid;
531 		outSU += FullTableID * fulltable_depth;
532 		outSU += SpareSpace;	/* skip rsvd spare space */
533 		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
534 		outSU += info->OffsetTable[BlockID][RepIndex] *
535 		    layoutPtr->SUsPerPU;
536 	}
537 
538 	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
539 	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
540 	    (raidSector % layoutPtr->sectorsPerStripeUnit);
541 
542 	RF_ASSERT(*col != -1);
543 }
544 
545 /*
546  * Return an array of ints identifying the disks that comprise the stripe
547  * containing the indicated address.
548  * The caller must _never_ attempt to modify this array.
549  */
550 void
rf_IdentifyStripeDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t addr,RF_RowCol_t ** diskids,RF_RowCol_t * outRow)551 rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
552     RF_RowCol_t **diskids, RF_RowCol_t *outRow)
553 {
554 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
555 	RF_DeclusteredConfigInfo_t *info =
556 	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
557 	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
558 	RF_StripeCount_t fulltable_depth =
559 	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
560 	RF_StripeNum_t base_suid = 0;
561 	RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
562 	RF_StripeNum_t stripeID, FullTableID;
563 	int tableOffset;
564 
565 	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
566 	    &fulltable_depth, &base_suid);
567 	/* Fulltable ID within array (across rows). */
568 	FullTableID = SUID / sus_per_fulltable;
569 	*outRow = FullTableID % raidPtr->numRow;
570 	/* Find stripe offset into array. */
571 	stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);
572 	/* Find offset into block design table. */
573 	tableOffset = (stripeID % info->BlocksPerTable);
574 	*diskids = info->LayoutTable[tableOffset];
575 }
576 
577 /*
578  * This returns the default head-separation limit, measured in
579  * "required units for reconstruction". Each time a disk fetches
580  * a unit, it bumps a counter. The head-sep code prohibits any disk
581  * from getting more than headSepLimit counter values ahead of any
582  * other.
583  *
584  * We assume here that the number of floating recon buffers is already
585  * set. There are r stripes to be reconstructed in each table, and so
586  * if we have a total of B buffers, we can have at most B/r tables
587  * under recon at any one time. In each table, lambda units are required
588  * from each disk, so given B buffers, the head sep limit has to be
589  * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
590  *
591  * For example, suppose we are given 50 buffers, r=19, and lambda=4 as in
592  * the 20.5 design. There are 19 stripes/table to be reconstructed, so
593  * we can have 50/19 tables concurrently under reconstruction, which means
594  * we can allow the fastest disk to get 50/19 tables ahead of the slower
595  * disk. There are lambda "required units" for each disk, so the fastest
596  * disk can get 4*50/19 = 10 counter values ahead of the slowest.
597  *
598  * If numBufsToAccumulate is not 1, we need to limit the head sep further
599  * because multiple bufs will be required for each stripe under recon.
600  */
601 RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t * raidPtr)602 rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr)
603 {
604 	RF_DeclusteredConfigInfo_t *info =
605 	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
606 
607 	return (info->Lambda * raidPtr->numFloatingReconBufs /
608 	    info->TableDepthInPUs / rf_numBufsToAccumulate);
609 }
610 
611 /*
612  * Return the default number of recon buffers to use. The value
613  * is somewhat arbitrary...  It's intended to be large enough to
614  * allow for a reasonably large head-sep limit, but small enough
615  * that you don't use up all your system memory with buffers.
616  */
617 int
rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr)618 rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
619 {
620 	return (100 * rf_numBufsToAccumulate);
621 }
622 
623 /*
624  * Sectors in the last fulltable of the array need to be handled
625  * specially since this fulltable can be incomplete. This function
626  * changes the values of certain params to handle this.
627  *
628  * The idea here is that MapSector et. al. figure out which disk the
629  * addressed unit lives on by computing the modulos of the unit number
630  * with the number of units per fulltable, table, etc.  In the last
631  * fulltable, there are fewer units per fulltable, so we need to adjust
632  * the number of user data units per fulltable to reflect this.
633  *
634  * So, we (1) convert the fulltable size and depth parameters to
635  * the size of the partial fulltable at the end, (2) compute the
636  * disk sector offset where this fulltable starts, and (3) convert
637  * the users stripe unit number from an offset into the array to
638  * an offset into the last fulltable.
639  */
640 void
rf_decluster_adjust_params(RF_RaidLayout_t * layoutPtr,RF_StripeNum_t * SUID,RF_StripeCount_t * sus_per_fulltable,RF_StripeCount_t * fulltable_depth,RF_StripeNum_t * base_suid)641 rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t *SUID,
642     RF_StripeCount_t *sus_per_fulltable, RF_StripeCount_t *fulltable_depth,
643     RF_StripeNum_t *base_suid)
644 {
645 	RF_DeclusteredConfigInfo_t *info =
646 	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
647 
648 	if (*SUID >= info->FullTableLimitSUID) {
649 		/* New full table size is size of last full table on disk. */
650 		*sus_per_fulltable =
651 		    info->ExtraTablesPerDisk * info->SUsPerTable;
652 
653 		/* New full table depth is corresponding depth. */
654 		*fulltable_depth =
655 		    info->ExtraTablesPerDisk * info->TableDepthInPUs *
656 		    layoutPtr->SUsPerPU;
657 
658 		/* Set up the new base offset. */
659 		*base_suid = info->DiskOffsetOfLastFullTableInSUs;
660 
661 		/*
662 		 * Convert user's array address to an offset into the last
663 		 * fulltable.
664 		 */
665 		*SUID -= info->FullTableLimitSUID;
666 	}
667 }
668 
669 /*
670  * Map a stripe ID to a parity stripe ID.
671  * See comment above RaidAddressToParityStripeID in layout.c.
672  */
673 void
rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t * layoutPtr,RF_StripeNum_t stripeID,RF_StripeNum_t * psID,RF_ReconUnitNum_t * which_ru)674 rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
675     RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
676 {
677 	RF_DeclusteredConfigInfo_t *info;
678 
679 	info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
680 
681 	*psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) *
682 	    info->BlocksPerTable + (stripeID % info->BlocksPerTable);
683 	*which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) /
684 	    info->BlocksPerTable;
685 	RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU);
686 }
687 
688 /*
689  * Called from MapSector and MapParity to retarget an access at the spare unit.
690  * Modifies the "col" and "outSU" parameters only.
691  */
692 void
rf_remap_to_spare_space(RF_RaidLayout_t * layoutPtr,RF_DeclusteredConfigInfo_t * info,RF_RowCol_t row,RF_StripeNum_t FullTableID,RF_StripeNum_t TableID,RF_SectorNum_t BlockID,RF_StripeNum_t base_suid,RF_StripeNum_t SpareRegion,RF_RowCol_t * outCol,RF_StripeNum_t * outSU)693 rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr,
694     RF_DeclusteredConfigInfo_t *info, RF_RowCol_t row,
695     RF_StripeNum_t FullTableID, RF_StripeNum_t TableID, RF_SectorNum_t BlockID,
696     RF_StripeNum_t base_suid, RF_StripeNum_t SpareRegion, RF_RowCol_t *outCol,
697     RF_StripeNum_t *outSU)
698 {
699 	RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion,
700 	    lastSROffset, which_ft;
701 
702 	/*
703 	 * Note that FullTableID and hence SpareRegion may have gotten
704 	 * tweaked by rf_decluster_adjust_params. We detect this by
705 	 * noticing that base_suid is not 0.
706 	 */
707 	if (base_suid == 0) {
708 		ftID = FullTableID;
709 	} else {
710 		/*
711 		 * There may be > 1.0 full tables in the last (i.e. partial)
712 		 * spare region. Find out which of these we are in.
713 		 */
714 		lastSROffset = info->NumCompleteSRs *
715 		    info->SpareRegionDepthInSUs;
716 		which_ft =
717 		    (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) /
718 		    (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
719 
720 		/* Compute the actual full table ID. */
721 		ftID = info->DiskOffsetOfLastFullTableInSUs /
722 		    (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) +
723 		    which_ft;
724 		SpareRegion = info->NumCompleteSRs;
725 	}
726 	TableInSpareRegion = (ftID * info->NumParityReps + TableID) %
727 	    info->TablesPerSpareRegion;
728 
729 	*outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
730 	RF_ASSERT(*outCol != -1);
731 
732 	spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
733 	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
734 	    info->TableDepthInPUs * layoutPtr->SUsPerPU :
735 	    (SpareRegion + 1) * info->SpareRegionDepthInSUs -
736 	    info->SpareSpaceDepthPerRegionInSUs;
737 	*outSU = spareTableStartSU +
738 	    info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
739 	if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
740 		printf("rf_remap_to_spare_space: invalid remapped disk SU"
741 		    " offset %ld.\n", (long) *outSU);
742 	}
743 }
744 
745 int
rf_InstallSpareTable(RF_Raid_t * raidPtr,RF_RowCol_t frow,RF_RowCol_t fcol)746 rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol)
747 {
748 	RF_DeclusteredConfigInfo_t *info =
749 	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
750 	RF_SparetWait_t *req;
751 	int retcode;
752 
753 	RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
754 	req->C = raidPtr->numCol;
755 	req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
756 	req->fcol = fcol;
757 	req->SUsPerPU = raidPtr->Layout.SUsPerPU;
758 	req->TablesPerSpareRegion = info->TablesPerSpareRegion;
759 	req->BlocksPerTable = info->BlocksPerTable;
760 	req->TableDepthInPUs = info->TableDepthInPUs;
761 	req->SpareSpaceDepthPerRegionInSUs =
762 	    info->SpareSpaceDepthPerRegionInSUs;
763 
764 	retcode = rf_GetSpareTableFromDaemon(req);
765 	RF_ASSERT(!retcode);
766 	/* XXX -- Fix this to recover gracefully. -- XXX */
767 
768 	return (retcode);
769 }
770 
771 /*
772  * Invoked via ioctl to install a spare table in the kernel.
773  */
774 int
rf_SetSpareTable(RF_Raid_t * raidPtr,void * data)775 rf_SetSpareTable(RF_Raid_t *raidPtr, void *data)
776 {
777 	RF_DeclusteredConfigInfo_t *info =
778 	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
779 	RF_SpareTableEntry_t **ptrs;
780 	int i, retcode;
781 
782 	/*
783 	 * What we need to copyin is a 2-d array, so first copyin the user
784 	 * pointers to the rows in the table.
785 	 */
786 	RF_Malloc(ptrs, info->TablesPerSpareRegion *
787 	    sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
788 	retcode = copyin((caddr_t) data, (caddr_t) ptrs,
789 	    info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
790 
791 	if (retcode)
792 		return (retcode);
793 
794 	/* Now allocate kernel space for the row pointers. */
795 	RF_Malloc(info->SpareTable, info->TablesPerSpareRegion *
796 	    sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
797 
798 	/*
799 	 * Now allocate kernel space for each row in the table, and copy it in
800 	 * from user space. */
801 	for (i = 0; i < info->TablesPerSpareRegion; i++) {
802 		RF_Malloc(info->SpareTable[i], info->BlocksPerTable *
803 		    sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
804 		retcode = copyin(ptrs[i], info->SpareTable[i],
805 		    info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
806 		if (retcode) {
807 			/* Blow off the memory we have allocated. */
808 			info->SpareTable = NULL;
809 			return (retcode);
810 		}
811 	}
812 
813 	/* Free up the temporary array we used. */
814 	RF_Free(ptrs, info->TablesPerSpareRegion *
815 	    sizeof(RF_SpareTableEntry_t *));
816 
817 	return (0);
818 }
819 
820 RF_ReconUnitCount_t
rf_GetNumSpareRUsDeclustered(RF_Raid_t * raidPtr)821 rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr)
822 {
823 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
824 
825 	return (((RF_DeclusteredConfigInfo_t *)
826 	    layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk);
827 }
828 
829 
830 void
rf_FreeSpareTable(RF_Raid_t * raidPtr)831 rf_FreeSpareTable(RF_Raid_t *raidPtr)
832 {
833 	long i;
834 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
835 	RF_DeclusteredConfigInfo_t *info =
836 	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
837 	RF_SpareTableEntry_t **table = info->SpareTable;
838 
839 	for (i = 0; i < info->TablesPerSpareRegion; i++) {
840 		RF_Free(table[i], info->BlocksPerTable *
841 		    sizeof(RF_SpareTableEntry_t));
842 	}
843 	RF_Free(table, info->TablesPerSpareRegion *
844 	    sizeof(RF_SpareTableEntry_t *));
845 	info->SpareTable = (RF_SpareTableEntry_t **) NULL;
846 }
847