1 /* $OpenBSD: rf_decluster.c,v 1.5 2002/12/16 07:01:03 tdeval Exp $ */
2 /* $NetBSD: rf_decluster.c,v 1.5 2000/03/07 01:54:29 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Mark Holland
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*****************************************************************************
32 *
33 * rf_decluster.c -- Code related to the declustered layout.
34 *
35 * Created 10-21-92 (MCH)
36 *
37 * Nov 93: Adding support for distributed sparing. This code is a little
38 * complex; the basic layout used is as follows:
39 * Let F = (v-1)/GCD(r,v-1). The spare space for each set of
40 * F consecutive fulltables is grouped together and placed after
41 * that set of tables.
42 * +-------------------------------+
43 * | F fulltables |
44 * | Spare Space |
45 * | F fulltables |
46 * | Spare Space |
47 * | ... |
48 * +-------------------------------+
49 *
50 *****************************************************************************/
51
52 #include "rf_types.h"
53 #include "rf_raid.h"
54 #include "rf_raidframe.h"
55 #include "rf_configure.h"
56 #include "rf_decluster.h"
57 #include "rf_debugMem.h"
58 #include "rf_utils.h"
59 #include "rf_alloclist.h"
60 #include "rf_general.h"
61 #include "rf_shutdown.h"
62
63 extern int rf_copyback_in_progress; /* Debug only. */
64
65 /* Found in rf_kintf.c */
66 int rf_GetSpareTableFromDaemon(RF_SparetWait_t *);
67
68 /* Configuration code. */
69
70 int
rf_ConfigureDeclustered(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)71 rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
72 RF_Config_t *cfgPtr)
73 {
74 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
75 int b, v, k, r, lambda; /* block design params */
76 int i, j;
77 RF_RowCol_t *first_avail_slot;
78 RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
79 RF_DeclusteredConfigInfo_t *info;
80 RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs,
81 numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
82 RF_StripeCount_t totSparePUsPerDisk;
83 RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
84 RF_SectorCount_t SpareSpaceInSUs;
85 char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
86 RF_StripeNum_t l, SUID;
87
88 SUID = l = 0;
89 numCompleteSpareRegionsPerDisk = 0;
90
91 /* 1. Create layout specific structure. */
92 RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t),
93 (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
94 if (info == NULL)
95 return (ENOMEM);
96 layoutPtr->layoutSpecificInfo = (void *) info;
97 info->SpareTable = NULL;
98
99 /* 2. Extract parameters from the config structure. */
100 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
101 bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
102 }
103 cfgBuf += RF_SPAREMAP_NAME_LEN;
104
105 b = *((int *) cfgBuf);
106 cfgBuf += sizeof(int);
107 v = *((int *) cfgBuf);
108 cfgBuf += sizeof(int);
109 k = *((int *) cfgBuf);
110 cfgBuf += sizeof(int);
111 r = *((int *) cfgBuf);
112 cfgBuf += sizeof(int);
113 lambda = *((int *) cfgBuf);
114 cfgBuf += sizeof(int);
115 raidPtr->noRotate = *((int *) cfgBuf);
116 cfgBuf += sizeof(int);
117
118 /*
119 * The sparemaps are generated assuming that parity is rotated, so we
120 * issue a warning if both distributed sparing and no-rotate are on at
121 * the same time.
122 */
123 if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) &&
124 raidPtr->noRotate) {
125 RF_ERRORMSG("Warning: distributed sparing specified without"
126 " parity rotation.\n");
127 }
128 if (raidPtr->numCol != v) {
129 RF_ERRORMSG2("RAID: config error: table element count (%d)"
130 " not equal to no. of cols (%d).\n", v, raidPtr->numCol);
131 return (EINVAL);
132 }
133 /* 3. Set up the values used in the mapping code. */
134 info->BlocksPerTable = b;
135 info->Lambda = lambda;
136 info->NumParityReps = info->groupSize = k;
137 /* b blks, k-1 SUs each. */
138 info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;
139 info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */
140 info->PUsPerBlock = k - 1;
141 info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
142 info->TableDepthInPUs = (b * k) / v;
143 /* k repetitions. */
144 info->FullTableDepthInPUs = info->TableDepthInPUs * k;
145
146 /* Used only in distributed sparing case. */
147 /* (v-1)/gcd fulltables. */
148 info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);
149 info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
150 info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion /
151 (v - 1)) * layoutPtr->SUsPerPU;
152
153 /* Check to make sure the block design is sufficiently small. */
154 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
155 if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU +
156 info->SpareSpaceDepthPerRegionInSUs >
157 layoutPtr->stripeUnitsPerDisk) {
158 RF_ERRORMSG3("RAID: config error: Full Table depth"
159 " (%d) + Spare Space (%d) larger than disk size"
160 " (%d) (BD too big).\n",
161 (int) info->FullTableDepthInPUs,
162 (int) info->SpareSpaceDepthPerRegionInSUs,
163 (int) layoutPtr->stripeUnitsPerDisk);
164 return (EINVAL);
165 }
166 } else {
167 if (info->TableDepthInPUs * layoutPtr->SUsPerPU >
168 layoutPtr->stripeUnitsPerDisk) {
169 RF_ERRORMSG2("RAID: config error: Table depth (%d)"
170 " larger than disk size (%d) (BD too big).\n",
171 (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU),
172 (int) layoutPtr->stripeUnitsPerDisk);
173 return (EINVAL);
174 }
175 }
176
177
178 /*
179 * Compute the size of each disk, and the number of tables in the last
180 * fulltable (which need not be complete).
181 */
182 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
183
184 PUsPerDisk = layoutPtr->stripeUnitsPerDisk /
185 layoutPtr->SUsPerPU;
186 spareRegionDepthInPUs =
187 (info->TablesPerSpareRegion * info->TableDepthInPUs +
188 (info->TablesPerSpareRegion * info->TableDepthInPUs) /
189 (v - 1));
190 info->SpareRegionDepthInSUs =
191 spareRegionDepthInPUs * layoutPtr->SUsPerPU;
192
193 numCompleteSpareRegionsPerDisk =
194 PUsPerDisk / spareRegionDepthInPUs;
195 info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
196 extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
197
198 /*
199 * Assume conservatively that we need the full amount of spare
200 * space in one region in order to provide spares for the
201 * partial spare region at the end of the array. We set "i"
202 * to the number of tables in the partial spare region. This
203 * may actually include some fulltables.
204 */
205 extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs /
206 layoutPtr->SUsPerPU);
207 if (extraPUsPerDisk <= 0)
208 i = 0;
209 else
210 i = extraPUsPerDisk / info->TableDepthInPUs;
211
212 complete_FT_count = raidPtr->numRow *
213 (numCompleteSpareRegionsPerDisk *
214 (info->TablesPerSpareRegion / k) + i / k);
215 info->FullTableLimitSUID =
216 complete_FT_count * info->SUsPerFullTable;
217 info->ExtraTablesPerDisk = i % k;
218
219 /*
220 * Note that in the last spare region, the spare space is
221 * complete even though data/parity space is not.
222 */
223 totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) *
224 (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
225 info->TotSparePUsPerDisk = totSparePUsPerDisk;
226
227 layoutPtr->stripeUnitsPerDisk =
228 ((complete_FT_count / raidPtr->numRow) *
229 info->FullTableDepthInPUs + /* data & parity space */
230 info->ExtraTablesPerDisk * info->TableDepthInPUs +
231 totSparePUsPerDisk /* spare space */
232 ) * layoutPtr->SUsPerPU;
233 layoutPtr->dataStripeUnitsPerDisk =
234 (complete_FT_count * info->FullTableDepthInPUs +
235 info->ExtraTablesPerDisk * info->TableDepthInPUs) *
236 layoutPtr->SUsPerPU * (k - 1) / k;
237
238 } else {
239 /*
240 * Non-dist spare case: force each disk to contain an
241 * integral number of tables.
242 */
243 layoutPtr->stripeUnitsPerDisk /=
244 (info->TableDepthInPUs * layoutPtr->SUsPerPU);
245 layoutPtr->stripeUnitsPerDisk *=
246 (info->TableDepthInPUs * layoutPtr->SUsPerPU);
247
248 /*
249 * Compute the number of tables in the last fulltable, which
250 * need not be complete.
251 */
252 complete_FT_count =
253 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
254 info->FullTableDepthInPUs) * raidPtr->numRow;
255
256 info->FullTableLimitSUID =
257 complete_FT_count * info->SUsPerFullTable;
258 info->ExtraTablesPerDisk =
259 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
260 info->TableDepthInPUs) % k;
261 }
262
263 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
264 layoutPtr->sectorsPerStripeUnit;
265
266 /*
267 * Find the disk offset of the stripe unit where the last fulltable
268 * starts.
269 */
270 numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
271 diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk *
272 info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
273 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
274 SpareSpaceInSUs = numCompleteSpareRegionsPerDisk *
275 info->SpareSpaceDepthPerRegionInSUs;
276 diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
277 info->DiskOffsetOfLastSpareSpaceChunkInSUs =
278 diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
279 info->TableDepthInPUs * layoutPtr->SUsPerPU;
280 }
281 info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
282 info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
283
284 /* 4. Create and initialize the lookup tables. */
285 info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
286 if (info->LayoutTable == NULL)
287 return (ENOMEM);
288 info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
289 if (info->OffsetTable == NULL)
290 return (ENOMEM);
291 info->BlockTable = rf_make_2d_array(info->TableDepthInPUs *
292 layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
293 if (info->BlockTable == NULL)
294 return (ENOMEM);
295
296 first_avail_slot = rf_make_1d_array(v, NULL);
297 if (first_avail_slot == NULL)
298 return (ENOMEM);
299
300 for (i = 0; i < b; i++)
301 for (j = 0; j < k; j++)
302 info->LayoutTable[i][j] = *cfgBuf++;
303
304 /* Initialize the offset table. */
305 for (i = 0; i < b; i++)
306 for (j = 0; j < k; j++) {
307 info->OffsetTable[i][j] =
308 first_avail_slot[info->LayoutTable[i][j]];
309 first_avail_slot[info->LayoutTable[i][j]]++;
310 }
311
312 /* Initialize the block table. */
313 for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
314 for (i = 0; i < b; i++) {
315 for (j = 0; j < k; j++) {
316 info->BlockTable[(info->OffsetTable[i][j] *
317 layoutPtr->SUsPerPU) + l]
318 [info->LayoutTable[i][j]] = SUID;
319 }
320 SUID++;
321 }
322 }
323
324 rf_free_1d_array(first_avail_slot, v);
325
326 /* 5. Set up the remaining redundant-but-useful parameters. */
327
328 raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow *
329 info->ExtraTablesPerDisk) * info->SUsPerTable *
330 layoutPtr->sectorsPerStripeUnit;
331 layoutPtr->numStripe = (raidPtr->totalSectors /
332 layoutPtr->sectorsPerStripeUnit) / (k - 1);
333
334 /*
335 * Strange evaluation order below to try and minimize overflow
336 * problems.
337 */
338
339 layoutPtr->dataSectorsPerStripe =
340 (k - 1) * layoutPtr->sectorsPerStripeUnit;
341 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
342 raidPtr->logBytesPerSector;
343 layoutPtr->numDataCol = k - 1;
344 layoutPtr->numParityCol = 1;
345
346 return (0);
347 }
348
349 /* Declustering with distributed sparing. */
350 void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
351 void
rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg)352 rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg)
353 {
354 RF_DeclusteredConfigInfo_t *info;
355 RF_Raid_t *raidPtr;
356
357 raidPtr = (RF_Raid_t *) arg;
358 info =
359 (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
360 if (info->SpareTable)
361 rf_FreeSpareTable(raidPtr);
362 }
363
364 int
rf_ConfigureDeclusteredDS(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)365 rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
366 RF_Config_t *cfgPtr)
367 {
368 int rc;
369
370 rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
371 if (rc)
372 return (rc);
373
374 rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
375 if (rc) {
376 RF_ERRORMSG1("Got %d adding shutdown event for"
377 " DeclusteredDS.\n", rc);
378 rf_ShutdownDeclusteredDS(raidPtr);
379 return (rc);
380 }
381
382 return (0);
383 }
384
385 void
rf_MapSectorDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)386 rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
387 RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
388 {
389 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
390 RF_DeclusteredConfigInfo_t *info =
391 (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
392 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
393 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
394 RF_StripeNum_t BlockID, BlockOffset, RepIndex;
395 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
396 RF_StripeCount_t fulltable_depth =
397 info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
398 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
399
400 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
401 &fulltable_depth, &base_suid);
402
403 /* Fulltable ID within array (across rows). */
404 FullTableID = SUID / sus_per_fulltable;
405 if (raidPtr->numRow == 1)
406 *row = 0; /* Avoid a mod and a div in the common case. */
407 else {
408 *row = FullTableID % raidPtr->numRow;
409 /* Convert to fulltable ID on this disk. */
410 FullTableID /= raidPtr->numRow;
411 }
412 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
413 SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
414 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
415 }
416 FullTableOffset = SUID % sus_per_fulltable;
417 TableID = FullTableOffset / info->SUsPerTable;
418 TableOffset = FullTableOffset - TableID * info->SUsPerTable;
419 BlockID = TableOffset / info->PUsPerBlock;
420 BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
421 BlockID %= info->BlocksPerTable;
422 RepIndex = info->PUsPerBlock - TableID;
423 if (!raidPtr->noRotate)
424 BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
425 *col = info->LayoutTable[BlockID][BlockOffset];
426
427 /* Remap to distributed spare space if indicated. */
428 if (remap) {
429 RF_ASSERT(raidPtr->Disks[*row][*col].status ==
430 rf_ds_reconstructing ||
431 raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
432 (rf_copyback_in_progress &&
433 raidPtr->Disks[*row][*col].status == rf_ds_optimal));
434 rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
435 TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
436 &outSU);
437 } else {
438
439 outSU = base_suid;
440 outSU += FullTableID * fulltable_depth;
441 /* Offset to start of FT. */
442 outSU += SpareSpace;
443 /* Skip rsvd spare space. */
444 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
445 /* Offset to start of table. */
446 outSU += info->OffsetTable[BlockID][BlockOffset] *
447 layoutPtr->SUsPerPU;
448 /* Offset to the PU. */
449 }
450 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
451 /* offs to the SU within a PU */
452
453 /*
454 * Convert SUs to sectors, and, if not aligned to SU boundary, add in
455 * offset to sector.
456 */
457 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
458 (raidSector % layoutPtr->sectorsPerStripeUnit);
459
460 RF_ASSERT(*col != -1);
461 }
462
463 /*
464 * Prototyping this inexplicably causes the compile of the layout table
465 * (rf_layout.c) to fail.
466 */
467 void
rf_MapParityDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)468 rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
469 RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
470 {
471 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
472 RF_DeclusteredConfigInfo_t *info =
473 (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
474 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
475 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
476 RF_StripeNum_t BlockID, BlockOffset, RepIndex;
477 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
478 RF_StripeCount_t fulltable_depth =
479 info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
480 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
481
482 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
483 &fulltable_depth, &base_suid);
484
485 /* Compute row & (possibly) spare space exactly as before. */
486 FullTableID = SUID / sus_per_fulltable;
487 if (raidPtr->numRow == 1)
488 *row = 0; /* Avoid a mod and a div in the common case. */
489 else {
490 *row = FullTableID % raidPtr->numRow;
491 /* Convert to fulltable ID on this disk. */
492 FullTableID /= raidPtr->numRow;
493 }
494 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
495 SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
496 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
497 }
498 /* Compute BlockID and RepIndex exactly as before. */
499 FullTableOffset = SUID % sus_per_fulltable;
500 TableID = FullTableOffset / info->SUsPerTable;
501 TableOffset = FullTableOffset - TableID * info->SUsPerTable;
502 /*TableOffset = FullTableOffset % info->SUsPerTable;*/
503 /*BlockID = (TableOffset / info->PUsPerBlock) %
504 *info->BlocksPerTable;*/
505 BlockID = TableOffset / info->PUsPerBlock;
506 /*BlockOffset = TableOffset % info->PUsPerBlock;*/
507 BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
508 BlockID %= info->BlocksPerTable;
509
510 /* The parity block is in the position indicated by RepIndex. */
511 RepIndex = (raidPtr->noRotate) ?
512 info->PUsPerBlock : info->PUsPerBlock - TableID;
513 *col = info->LayoutTable[BlockID][RepIndex];
514
515 if (remap) {
516 RF_ASSERT(raidPtr->Disks[*row][*col].status ==
517 rf_ds_reconstructing ||
518 raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
519 (rf_copyback_in_progress &&
520 raidPtr->Disks[*row][*col].status == rf_ds_optimal));
521 rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
522 TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
523 &outSU);
524 } else {
525
526 /*
527 * Compute sector as before, except use RepIndex instead of
528 * BlockOffset.
529 */
530 outSU = base_suid;
531 outSU += FullTableID * fulltable_depth;
532 outSU += SpareSpace; /* skip rsvd spare space */
533 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
534 outSU += info->OffsetTable[BlockID][RepIndex] *
535 layoutPtr->SUsPerPU;
536 }
537
538 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
539 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
540 (raidSector % layoutPtr->sectorsPerStripeUnit);
541
542 RF_ASSERT(*col != -1);
543 }
544
545 /*
546 * Return an array of ints identifying the disks that comprise the stripe
547 * containing the indicated address.
548 * The caller must _never_ attempt to modify this array.
549 */
550 void
rf_IdentifyStripeDeclustered(RF_Raid_t * raidPtr,RF_RaidAddr_t addr,RF_RowCol_t ** diskids,RF_RowCol_t * outRow)551 rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
552 RF_RowCol_t **diskids, RF_RowCol_t *outRow)
553 {
554 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
555 RF_DeclusteredConfigInfo_t *info =
556 (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
557 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
558 RF_StripeCount_t fulltable_depth =
559 info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
560 RF_StripeNum_t base_suid = 0;
561 RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
562 RF_StripeNum_t stripeID, FullTableID;
563 int tableOffset;
564
565 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
566 &fulltable_depth, &base_suid);
567 /* Fulltable ID within array (across rows). */
568 FullTableID = SUID / sus_per_fulltable;
569 *outRow = FullTableID % raidPtr->numRow;
570 /* Find stripe offset into array. */
571 stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);
572 /* Find offset into block design table. */
573 tableOffset = (stripeID % info->BlocksPerTable);
574 *diskids = info->LayoutTable[tableOffset];
575 }
576
577 /*
578 * This returns the default head-separation limit, measured in
579 * "required units for reconstruction". Each time a disk fetches
580 * a unit, it bumps a counter. The head-sep code prohibits any disk
581 * from getting more than headSepLimit counter values ahead of any
582 * other.
583 *
584 * We assume here that the number of floating recon buffers is already
585 * set. There are r stripes to be reconstructed in each table, and so
586 * if we have a total of B buffers, we can have at most B/r tables
587 * under recon at any one time. In each table, lambda units are required
588 * from each disk, so given B buffers, the head sep limit has to be
589 * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
590 *
591 * For example, suppose we are given 50 buffers, r=19, and lambda=4 as in
592 * the 20.5 design. There are 19 stripes/table to be reconstructed, so
593 * we can have 50/19 tables concurrently under reconstruction, which means
594 * we can allow the fastest disk to get 50/19 tables ahead of the slower
595 * disk. There are lambda "required units" for each disk, so the fastest
596 * disk can get 4*50/19 = 10 counter values ahead of the slowest.
597 *
598 * If numBufsToAccumulate is not 1, we need to limit the head sep further
599 * because multiple bufs will be required for each stripe under recon.
600 */
601 RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t * raidPtr)602 rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr)
603 {
604 RF_DeclusteredConfigInfo_t *info =
605 (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
606
607 return (info->Lambda * raidPtr->numFloatingReconBufs /
608 info->TableDepthInPUs / rf_numBufsToAccumulate);
609 }
610
611 /*
612 * Return the default number of recon buffers to use. The value
613 * is somewhat arbitrary... It's intended to be large enough to
614 * allow for a reasonably large head-sep limit, but small enough
615 * that you don't use up all your system memory with buffers.
616 */
617 int
rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr)618 rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
619 {
620 return (100 * rf_numBufsToAccumulate);
621 }
622
623 /*
624 * Sectors in the last fulltable of the array need to be handled
625 * specially since this fulltable can be incomplete. This function
626 * changes the values of certain params to handle this.
627 *
628 * The idea here is that MapSector et. al. figure out which disk the
629 * addressed unit lives on by computing the modulos of the unit number
630 * with the number of units per fulltable, table, etc. In the last
631 * fulltable, there are fewer units per fulltable, so we need to adjust
632 * the number of user data units per fulltable to reflect this.
633 *
634 * So, we (1) convert the fulltable size and depth parameters to
635 * the size of the partial fulltable at the end, (2) compute the
636 * disk sector offset where this fulltable starts, and (3) convert
637 * the users stripe unit number from an offset into the array to
638 * an offset into the last fulltable.
639 */
640 void
rf_decluster_adjust_params(RF_RaidLayout_t * layoutPtr,RF_StripeNum_t * SUID,RF_StripeCount_t * sus_per_fulltable,RF_StripeCount_t * fulltable_depth,RF_StripeNum_t * base_suid)641 rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t *SUID,
642 RF_StripeCount_t *sus_per_fulltable, RF_StripeCount_t *fulltable_depth,
643 RF_StripeNum_t *base_suid)
644 {
645 RF_DeclusteredConfigInfo_t *info =
646 (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
647
648 if (*SUID >= info->FullTableLimitSUID) {
649 /* New full table size is size of last full table on disk. */
650 *sus_per_fulltable =
651 info->ExtraTablesPerDisk * info->SUsPerTable;
652
653 /* New full table depth is corresponding depth. */
654 *fulltable_depth =
655 info->ExtraTablesPerDisk * info->TableDepthInPUs *
656 layoutPtr->SUsPerPU;
657
658 /* Set up the new base offset. */
659 *base_suid = info->DiskOffsetOfLastFullTableInSUs;
660
661 /*
662 * Convert user's array address to an offset into the last
663 * fulltable.
664 */
665 *SUID -= info->FullTableLimitSUID;
666 }
667 }
668
669 /*
670 * Map a stripe ID to a parity stripe ID.
671 * See comment above RaidAddressToParityStripeID in layout.c.
672 */
673 void
rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t * layoutPtr,RF_StripeNum_t stripeID,RF_StripeNum_t * psID,RF_ReconUnitNum_t * which_ru)674 rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
675 RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
676 {
677 RF_DeclusteredConfigInfo_t *info;
678
679 info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
680
681 *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) *
682 info->BlocksPerTable + (stripeID % info->BlocksPerTable);
683 *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) /
684 info->BlocksPerTable;
685 RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU);
686 }
687
688 /*
689 * Called from MapSector and MapParity to retarget an access at the spare unit.
690 * Modifies the "col" and "outSU" parameters only.
691 */
692 void
rf_remap_to_spare_space(RF_RaidLayout_t * layoutPtr,RF_DeclusteredConfigInfo_t * info,RF_RowCol_t row,RF_StripeNum_t FullTableID,RF_StripeNum_t TableID,RF_SectorNum_t BlockID,RF_StripeNum_t base_suid,RF_StripeNum_t SpareRegion,RF_RowCol_t * outCol,RF_StripeNum_t * outSU)693 rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr,
694 RF_DeclusteredConfigInfo_t *info, RF_RowCol_t row,
695 RF_StripeNum_t FullTableID, RF_StripeNum_t TableID, RF_SectorNum_t BlockID,
696 RF_StripeNum_t base_suid, RF_StripeNum_t SpareRegion, RF_RowCol_t *outCol,
697 RF_StripeNum_t *outSU)
698 {
699 RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion,
700 lastSROffset, which_ft;
701
702 /*
703 * Note that FullTableID and hence SpareRegion may have gotten
704 * tweaked by rf_decluster_adjust_params. We detect this by
705 * noticing that base_suid is not 0.
706 */
707 if (base_suid == 0) {
708 ftID = FullTableID;
709 } else {
710 /*
711 * There may be > 1.0 full tables in the last (i.e. partial)
712 * spare region. Find out which of these we are in.
713 */
714 lastSROffset = info->NumCompleteSRs *
715 info->SpareRegionDepthInSUs;
716 which_ft =
717 (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) /
718 (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
719
720 /* Compute the actual full table ID. */
721 ftID = info->DiskOffsetOfLastFullTableInSUs /
722 (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) +
723 which_ft;
724 SpareRegion = info->NumCompleteSRs;
725 }
726 TableInSpareRegion = (ftID * info->NumParityReps + TableID) %
727 info->TablesPerSpareRegion;
728
729 *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
730 RF_ASSERT(*outCol != -1);
731
732 spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
733 info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
734 info->TableDepthInPUs * layoutPtr->SUsPerPU :
735 (SpareRegion + 1) * info->SpareRegionDepthInSUs -
736 info->SpareSpaceDepthPerRegionInSUs;
737 *outSU = spareTableStartSU +
738 info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
739 if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
740 printf("rf_remap_to_spare_space: invalid remapped disk SU"
741 " offset %ld.\n", (long) *outSU);
742 }
743 }
744
745 int
rf_InstallSpareTable(RF_Raid_t * raidPtr,RF_RowCol_t frow,RF_RowCol_t fcol)746 rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol)
747 {
748 RF_DeclusteredConfigInfo_t *info =
749 (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
750 RF_SparetWait_t *req;
751 int retcode;
752
753 RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
754 req->C = raidPtr->numCol;
755 req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
756 req->fcol = fcol;
757 req->SUsPerPU = raidPtr->Layout.SUsPerPU;
758 req->TablesPerSpareRegion = info->TablesPerSpareRegion;
759 req->BlocksPerTable = info->BlocksPerTable;
760 req->TableDepthInPUs = info->TableDepthInPUs;
761 req->SpareSpaceDepthPerRegionInSUs =
762 info->SpareSpaceDepthPerRegionInSUs;
763
764 retcode = rf_GetSpareTableFromDaemon(req);
765 RF_ASSERT(!retcode);
766 /* XXX -- Fix this to recover gracefully. -- XXX */
767
768 return (retcode);
769 }
770
771 /*
772 * Invoked via ioctl to install a spare table in the kernel.
773 */
774 int
rf_SetSpareTable(RF_Raid_t * raidPtr,void * data)775 rf_SetSpareTable(RF_Raid_t *raidPtr, void *data)
776 {
777 RF_DeclusteredConfigInfo_t *info =
778 (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
779 RF_SpareTableEntry_t **ptrs;
780 int i, retcode;
781
782 /*
783 * What we need to copyin is a 2-d array, so first copyin the user
784 * pointers to the rows in the table.
785 */
786 RF_Malloc(ptrs, info->TablesPerSpareRegion *
787 sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
788 retcode = copyin((caddr_t) data, (caddr_t) ptrs,
789 info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
790
791 if (retcode)
792 return (retcode);
793
794 /* Now allocate kernel space for the row pointers. */
795 RF_Malloc(info->SpareTable, info->TablesPerSpareRegion *
796 sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
797
798 /*
799 * Now allocate kernel space for each row in the table, and copy it in
800 * from user space. */
801 for (i = 0; i < info->TablesPerSpareRegion; i++) {
802 RF_Malloc(info->SpareTable[i], info->BlocksPerTable *
803 sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
804 retcode = copyin(ptrs[i], info->SpareTable[i],
805 info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
806 if (retcode) {
807 /* Blow off the memory we have allocated. */
808 info->SpareTable = NULL;
809 return (retcode);
810 }
811 }
812
813 /* Free up the temporary array we used. */
814 RF_Free(ptrs, info->TablesPerSpareRegion *
815 sizeof(RF_SpareTableEntry_t *));
816
817 return (0);
818 }
819
820 RF_ReconUnitCount_t
rf_GetNumSpareRUsDeclustered(RF_Raid_t * raidPtr)821 rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr)
822 {
823 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
824
825 return (((RF_DeclusteredConfigInfo_t *)
826 layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk);
827 }
828
829
830 void
rf_FreeSpareTable(RF_Raid_t * raidPtr)831 rf_FreeSpareTable(RF_Raid_t *raidPtr)
832 {
833 long i;
834 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
835 RF_DeclusteredConfigInfo_t *info =
836 (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
837 RF_SpareTableEntry_t **table = info->SpareTable;
838
839 for (i = 0; i < info->TablesPerSpareRegion; i++) {
840 RF_Free(table[i], info->BlocksPerTable *
841 sizeof(RF_SpareTableEntry_t));
842 }
843 RF_Free(table, info->TablesPerSpareRegion *
844 sizeof(RF_SpareTableEntry_t *));
845 info->SpareTable = (RF_SpareTableEntry_t **) NULL;
846 }
847