1 /*        $NetBSD: rf_paritylog.c,v 1.20 2019/10/10 03:43:59 christos Exp $     */
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /* Code for manipulating in-core parity logs
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_paritylog.c,v 1.20 2019/10/10 03:43:59 christos Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if RF_INCLUDE_PARITYLOGGING > 0
39 
40 /*
41  * Append-only log for recording parity "update" and "overwrite" records
42  */
43 
44 #include <dev/raidframe/raidframevar.h>
45 
46 #include "rf_threadstuff.h"
47 #include "rf_mcpair.h"
48 #include "rf_raid.h"
49 #include "rf_dag.h"
50 #include "rf_dagfuncs.h"
51 #include "rf_desc.h"
52 #include "rf_layout.h"
53 #include "rf_diskqueue.h"
54 #include "rf_etimer.h"
55 #include "rf_paritylog.h"
56 #include "rf_general.h"
57 #include "rf_map.h"
58 #include "rf_paritylogging.h"
59 #include "rf_paritylogDiskMgr.h"
60 
61 static RF_CommonLogData_t *
AllocParityLogCommonData(RF_Raid_t * raidPtr)62 AllocParityLogCommonData(RF_Raid_t * raidPtr)
63 {
64           RF_CommonLogData_t *common = NULL;
65 
66           /* Return a struct for holding common parity log information from the
67            * free list (rf_parityLogDiskQueue.freeCommonList).  If the free list
68            * is empty, call RF_Malloc to create a new structure. NON-BLOCKING */
69 
70           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
71           if (raidPtr->parityLogDiskQueue.freeCommonList) {
72                     common = raidPtr->parityLogDiskQueue.freeCommonList;
73                     raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
74                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
75           } else {
76                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
77                     common = RF_Malloc(sizeof(*common));
78                     /* destroy is in rf_paritylogging.c */
79                     rf_init_mutex2(common->mutex, IPL_VM);
80           }
81           common->next = NULL;
82           return (common);
83 }
84 
85 static void
FreeParityLogCommonData(RF_CommonLogData_t * common)86 FreeParityLogCommonData(RF_CommonLogData_t * common)
87 {
88           RF_Raid_t *raidPtr;
89 
90           /* Insert a single struct for holding parity log information (data)
91            * into the free list (rf_parityLogDiskQueue.freeCommonList).
92            * NON-BLOCKING */
93 
94           raidPtr = common->raidPtr;
95           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
96           common->next = raidPtr->parityLogDiskQueue.freeCommonList;
97           raidPtr->parityLogDiskQueue.freeCommonList = common;
98           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
99 }
100 
101 static RF_ParityLogData_t *
AllocParityLogData(RF_Raid_t * raidPtr)102 AllocParityLogData(RF_Raid_t * raidPtr)
103 {
104           RF_ParityLogData_t *data = NULL;
105 
106           /* Return a struct for holding parity log information from the free
107            * list (rf_parityLogDiskQueue.freeList).  If the free list is empty,
108            * call RF_Malloc to create a new structure. NON-BLOCKING */
109 
110           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
111           if (raidPtr->parityLogDiskQueue.freeDataList) {
112                     data = raidPtr->parityLogDiskQueue.freeDataList;
113                     raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
114                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
115           } else {
116                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
117                     data = RF_Malloc(sizeof(*data));
118           }
119           data->next = NULL;
120           data->prev = NULL;
121           return (data);
122 }
123 
124 
125 static void
FreeParityLogData(RF_ParityLogData_t * data)126 FreeParityLogData(RF_ParityLogData_t * data)
127 {
128           RF_ParityLogData_t *nextItem;
129           RF_Raid_t *raidPtr;
130 
131           /* Insert a linked list of structs for holding parity log information
132            * (data) into the free list (parityLogDiskQueue.freeList).
133            * NON-BLOCKING */
134 
135           raidPtr = data->common->raidPtr;
136           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
137           while (data) {
138                     nextItem = data->next;
139                     data->next = raidPtr->parityLogDiskQueue.freeDataList;
140                     raidPtr->parityLogDiskQueue.freeDataList = data;
141                     data = nextItem;
142           }
143           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
144 }
145 
146 
147 static void
EnqueueParityLogData(RF_ParityLogData_t * data,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)148 EnqueueParityLogData(
149     RF_ParityLogData_t * data,
150     RF_ParityLogData_t ** head,
151     RF_ParityLogData_t ** tail)
152 {
153           RF_Raid_t *raidPtr;
154 
155           /* Insert an in-core parity log (*data) into the head of a disk queue
156            * (*head, *tail). NON-BLOCKING */
157 
158           raidPtr = data->common->raidPtr;
159           if (rf_parityLogDebug)
160                     printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
161           RF_ASSERT(data->prev == NULL);
162           RF_ASSERT(data->next == NULL);
163           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
164           if (*head) {
165                     /* insert into head of queue */
166                     RF_ASSERT((*head)->prev == NULL);
167                     RF_ASSERT((*tail)->next == NULL);
168                     data->next = *head;
169                     (*head)->prev = data;
170                     *head = data;
171           } else {
172                     /* insert into empty list */
173                     RF_ASSERT(*head == NULL);
174                     RF_ASSERT(*tail == NULL);
175                     *head = data;
176                     *tail = data;
177           }
178           RF_ASSERT((*head)->prev == NULL);
179           RF_ASSERT((*tail)->next == NULL);
180           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
181 }
182 
183 static RF_ParityLogData_t *
DequeueParityLogData(RF_Raid_t * raidPtr,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail,int ignoreLocks)184 DequeueParityLogData(
185     RF_Raid_t * raidPtr,
186     RF_ParityLogData_t ** head,
187     RF_ParityLogData_t ** tail,
188     int ignoreLocks)
189 {
190           RF_ParityLogData_t *data;
191 
192           /* Remove and return an in-core parity log from the tail of a disk
193            * queue (*head, *tail). NON-BLOCKING */
194 
195           /* remove from tail, preserving FIFO order */
196           if (!ignoreLocks)
197                     rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
198           data = *tail;
199           if (data) {
200                     if (*head == *tail) {
201                               /* removing last item from queue */
202                               *head = NULL;
203                               *tail = NULL;
204                     } else {
205                               *tail = (*tail)->prev;
206                               (*tail)->next = NULL;
207                               RF_ASSERT((*head)->prev == NULL);
208                               RF_ASSERT((*tail)->next == NULL);
209                     }
210                     data->next = NULL;
211                     data->prev = NULL;
212                     if (rf_parityLogDebug)
213                               printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
214           }
215           if (*head) {
216                     RF_ASSERT((*head)->prev == NULL);
217                     RF_ASSERT((*tail)->next == NULL);
218           }
219           if (!ignoreLocks)
220                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
221           return (data);
222 }
223 
224 
225 static void
RequeueParityLogData(RF_ParityLogData_t * data,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)226 RequeueParityLogData(
227     RF_ParityLogData_t * data,
228     RF_ParityLogData_t ** head,
229     RF_ParityLogData_t ** tail)
230 {
231           RF_Raid_t *raidPtr;
232 
233           /* Insert an in-core parity log (*data) into the tail of a disk queue
234            * (*head, *tail). NON-BLOCKING */
235 
236           raidPtr = data->common->raidPtr;
237           RF_ASSERT(data);
238           if (rf_parityLogDebug)
239                     printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
240           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
241           if (*tail) {
242                     /* append to tail of list */
243                     data->prev = *tail;
244                     data->next = NULL;
245                     (*tail)->next = data;
246                     *tail = data;
247           } else {
248                     /* inserting into an empty list */
249                     *head = data;
250                     *tail = data;
251                     (*head)->prev = NULL;
252                     (*tail)->next = NULL;
253           }
254           RF_ASSERT((*head)->prev == NULL);
255           RF_ASSERT((*tail)->next == NULL);
256           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
257 }
258 
259 RF_ParityLogData_t *
rf_CreateParityLogData(RF_ParityRecordType_t operation,RF_PhysDiskAddr_t * pda,void * bufPtr,RF_Raid_t * raidPtr,void (* wakeFunc)(void *,int),void * wakeArg,RF_AccTraceEntry_t * tracerec,RF_Etimer_t startTime)260 rf_CreateParityLogData(
261     RF_ParityRecordType_t operation,
262     RF_PhysDiskAddr_t * pda,
263     void *bufPtr,
264     RF_Raid_t * raidPtr,
265     void (*wakeFunc)(void *, int),
266     void *wakeArg,
267     RF_AccTraceEntry_t * tracerec,
268     RF_Etimer_t startTime)
269 {
270           RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
271           RF_CommonLogData_t *common;
272           RF_PhysDiskAddr_t *diskAddress;
273           int     boundary, offset = 0;
274 
275           /* Return an initialized struct of info to be logged. Build one item
276            * per physical disk address, one item per region.
277            *
278            * NON-BLOCKING */
279 
280           diskAddress = pda;
281           common = AllocParityLogCommonData(raidPtr);
282           RF_ASSERT(common);
283 
284           common->operation = operation;
285           common->bufPtr = bufPtr;
286           common->raidPtr = raidPtr;
287           common->wakeFunc = wakeFunc;
288           common->wakeArg = wakeArg;
289           common->tracerec = tracerec;
290           common->startTime = startTime;
291           common->cnt = 0;
292 
293           if (rf_parityLogDebug)
294                     printf("[entering CreateParityLogData]\n");
295           while (diskAddress) {
296                     common->cnt++;
297                     data = AllocParityLogData(raidPtr);
298                     RF_ASSERT(data);
299                     data->common = common;
300                     data->next = NULL;
301                     data->prev = NULL;
302                     data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
303                     if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
304                               /* disk address does not cross a region boundary */
305                               data->diskAddress = *diskAddress;
306                               data->bufOffset = offset;
307                               offset = offset + diskAddress->numSector;
308                               EnqueueParityLogData(data, &resultHead, &resultTail);
309                               /* adjust disk address */
310                               diskAddress = diskAddress->next;
311                     } else {
312                               /* disk address crosses a region boundary */
313                               /* find address where region is crossed */
314                               boundary = 0;
315                               while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
316                                         boundary++;
317 
318                               /* enter data before the boundary */
319                               data->diskAddress = *diskAddress;
320                               data->diskAddress.numSector = boundary;
321                               data->bufOffset = offset;
322                               offset += boundary;
323                               EnqueueParityLogData(data, &resultHead, &resultTail);
324                               /* adjust disk address */
325                               diskAddress->startSector += boundary;
326                               diskAddress->numSector -= boundary;
327                     }
328           }
329           if (rf_parityLogDebug)
330                     printf("[leaving CreateParityLogData]\n");
331           return (resultHead);
332 }
333 
334 
335 RF_ParityLogData_t *
rf_SearchAndDequeueParityLogData(RF_Raid_t * raidPtr,int regionID,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail,int ignoreLocks)336 rf_SearchAndDequeueParityLogData(
337     RF_Raid_t * raidPtr,
338     int regionID,
339     RF_ParityLogData_t ** head,
340     RF_ParityLogData_t ** tail,
341     int ignoreLocks)
342 {
343           RF_ParityLogData_t *w;
344 
345           /* Remove and return an in-core parity log from a specified region
346            * (regionID). If a matching log is not found, return NULL.
347            *
348            * NON-BLOCKING. */
349 
350           /* walk backward through a list, looking for an entry with a matching
351            * region ID */
352           if (!ignoreLocks)
353                     rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
354           w = (*tail);
355           while (w) {
356                     if (w->regionID == regionID) {
357                               /* remove an element from the list */
358                               if (w == *tail) {
359                                         if (*head == *tail) {
360                                                   /* removing only element in the list */
361                                                   *head = NULL;
362                                                   *tail = NULL;
363                                         } else {
364                                                   /* removing last item in the list */
365                                                   *tail = (*tail)->prev;
366                                                   (*tail)->next = NULL;
367                                                   RF_ASSERT((*head)->prev == NULL);
368                                                   RF_ASSERT((*tail)->next == NULL);
369                                         }
370                               } else {
371                                         if (w == *head) {
372                                                   /* removing first item in the list */
373                                                   *head = (*head)->next;
374                                                   (*head)->prev = NULL;
375                                                   RF_ASSERT((*head)->prev == NULL);
376                                                   RF_ASSERT((*tail)->next == NULL);
377                                         } else {
378                                                   /* removing an item from the middle of
379                                                    * the list */
380                                                   w->prev->next = w->next;
381                                                   w->next->prev = w->prev;
382                                                   RF_ASSERT((*head)->prev == NULL);
383                                                   RF_ASSERT((*tail)->next == NULL);
384                                         }
385                               }
386                               w->prev = NULL;
387                               w->next = NULL;
388                               if (rf_parityLogDebug)
389                                         printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
390                               return (w);
391                     } else
392                               w = w->prev;
393           }
394           if (!ignoreLocks)
395                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
396           return (NULL);
397 }
398 
399 static RF_ParityLogData_t *
DequeueMatchingLogData(RF_Raid_t * raidPtr,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)400 DequeueMatchingLogData(
401     RF_Raid_t * raidPtr,
402     RF_ParityLogData_t ** head,
403     RF_ParityLogData_t ** tail)
404 {
405           RF_ParityLogData_t *logDataList, *logData;
406           int     regionID;
407 
408           /* Remove and return an in-core parity log from the tail of a disk
409            * queue (*head, *tail).  Then remove all matching (identical
410            * regionIDs) logData and return as a linked list.
411            *
412            * NON-BLOCKING */
413 
414           logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
415           if (logDataList) {
416                     regionID = logDataList->regionID;
417                     logData = logDataList;
418                     logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
419                     while (logData->next) {
420                               logData = logData->next;
421                               logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
422                     }
423           }
424           return (logDataList);
425 }
426 
427 
428 static RF_ParityLog_t *
AcquireParityLog(RF_ParityLogData_t * logData,int finish)429 AcquireParityLog(
430     RF_ParityLogData_t * logData,
431     int finish)
432 {
433           RF_ParityLog_t *log = NULL;
434           RF_Raid_t *raidPtr;
435 
436           /* Grab a log buffer from the pool and return it. If no buffers are
437            * available, return NULL. NON-BLOCKING */
438           raidPtr = logData->common->raidPtr;
439           rf_lock_mutex2(raidPtr->parityLogPool.mutex);
440           if (raidPtr->parityLogPool.parityLogs) {
441                     log = raidPtr->parityLogPool.parityLogs;
442                     raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
443                     log->regionID = logData->regionID;
444                     log->numRecords = 0;
445                     log->next = NULL;
446                     raidPtr->logsInUse++;
447                     RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
448           } else {
449                     /* no logs available, so place ourselves on the queue of work
450                      * waiting on log buffers this is done while
451                      * parityLogPool.mutex is held, to ensure synchronization with
452                      * ReleaseParityLogs. */
453                     if (rf_parityLogDebug)
454                               printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
455                     if (finish)
456                               RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
457                     else
458                               EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
459           }
460           rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
461           return (log);
462 }
463 
464 void
rf_ReleaseParityLogs(RF_Raid_t * raidPtr,RF_ParityLog_t * firstLog)465 rf_ReleaseParityLogs(
466     RF_Raid_t * raidPtr,
467     RF_ParityLog_t * firstLog)
468 {
469           RF_ParityLogData_t *logDataList;
470           RF_ParityLog_t *log, *lastLog;
471           int     cnt;
472 
473           /* Insert a linked list of parity logs (firstLog) to the free list
474            * (parityLogPool.parityLogPool)
475            *
476            * NON-BLOCKING. */
477 
478           RF_ASSERT(firstLog);
479 
480           /* Before returning logs to global free list, service all requests
481            * which are blocked on logs.  Holding mutexes for parityLogPool and
482            * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
483           rf_lock_mutex2(raidPtr->parityLogPool.mutex);
484           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
485           logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
486           log = firstLog;
487           if (firstLog)
488                     firstLog = firstLog->next;
489           log->numRecords = 0;
490           log->next = NULL;
491           while (logDataList && log) {
492                     rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
493                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
494                     rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
495                     if (rf_parityLogDebug)
496                               printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
497                     if (log == NULL) {
498                               log = firstLog;
499                               if (firstLog) {
500                                         firstLog = firstLog->next;
501                                         log->numRecords = 0;
502                                         log->next = NULL;
503                               }
504                     }
505                     rf_lock_mutex2(raidPtr->parityLogPool.mutex);
506                     rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
507                     if (log)
508                               logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
509           }
510           /* return remaining logs to pool */
511           if (log) {
512                     log->next = firstLog;
513                     firstLog = log;
514           }
515           if (firstLog) {
516                     lastLog = firstLog;
517                     raidPtr->logsInUse--;
518                     RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
519                     while (lastLog->next) {
520                               lastLog = lastLog->next;
521                               raidPtr->logsInUse--;
522                               RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
523                     }
524                     lastLog->next = raidPtr->parityLogPool.parityLogs;
525                     raidPtr->parityLogPool.parityLogs = firstLog;
526                     cnt = 0;
527                     log = raidPtr->parityLogPool.parityLogs;
528                     while (log) {
529                               cnt++;
530                               log = log->next;
531                     }
532                     RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
533           }
534           rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
535           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
536 }
537 
538 static void
ReintLog(RF_Raid_t * raidPtr,int regionID,RF_ParityLog_t * log)539 ReintLog(
540     RF_Raid_t * raidPtr,
541     int regionID,
542     RF_ParityLog_t * log)
543 {
544           RF_ASSERT(log);
545 
546           /* Insert an in-core parity log (log) into the disk queue of
547            * reintegration work.  Set the flag (reintInProgress) for the
548            * specified region (regionID) to indicate that reintegration is in
549            * progress for this region. NON-BLOCKING */
550 
551           rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
552           raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;    /* cleared when reint
553                                                                                            * complete */
554 
555           if (rf_parityLogDebug)
556                     printf("[requesting reintegration of region %d]\n", log->regionID);
557           /* move record to reintegration queue */
558           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
559           log->next = raidPtr->parityLogDiskQueue.reintQueue;
560           raidPtr->parityLogDiskQueue.reintQueue = log;
561           rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
562           rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
563           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
564 }
565 
566 static void
FlushLog(RF_Raid_t * raidPtr,RF_ParityLog_t * log)567 FlushLog(
568     RF_Raid_t * raidPtr,
569     RF_ParityLog_t * log)
570 {
571           /* insert a core log (log) into a list of logs
572            * (parityLogDiskQueue.flushQueue) waiting to be written to disk.
573            * NON-BLOCKING */
574 
575           RF_ASSERT(log);
576           RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
577           RF_ASSERT(log->next == NULL);
578           /* move log to flush queue */
579           rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
580           log->next = raidPtr->parityLogDiskQueue.flushQueue;
581           raidPtr->parityLogDiskQueue.flushQueue = log;
582           rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
583           rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
584 }
585 
586 static int
DumpParityLogToDisk(int finish,RF_ParityLogData_t * logData)587 DumpParityLogToDisk(
588     int finish,
589     RF_ParityLogData_t * logData)
590 {
591           int     i, diskCount, regionID = logData->regionID;
592           RF_ParityLog_t *log;
593           RF_Raid_t *raidPtr;
594 
595           raidPtr = logData->common->raidPtr;
596 
597           /* Move a core log to disk.  If the log disk is full, initiate
598            * reintegration.
599            *
600            * Return (0) if we can enqueue the dump immediately, otherwise return
601            * (1) to indicate we are blocked on reintegration and control of the
602            * thread should be relinquished.
603            *
604            * Caller must hold regionInfo[regionID].mutex
605            *
606            * NON-BLOCKING */
607 
608           RF_ASSERT(rf_owned_mutex2(raidPtr->regionInfo[regionID].mutex));
609 
610           if (rf_parityLogDebug)
611                     printf("[dumping parity log to disk, region %d]\n", regionID);
612           log = raidPtr->regionInfo[regionID].coreLog;
613           RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
614           RF_ASSERT(log->next == NULL);
615 
616           /* if reintegration is in progress, must queue work */
617           rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
618           if (raidPtr->regionInfo[regionID].reintInProgress) {
619                     /* Can not proceed since this region is currently being
620                      * reintegrated. We can not block, so queue remaining work and
621                      * return */
622                     if (rf_parityLogDebug)
623                               printf("[region %d waiting on reintegration]\n", regionID);
624                     /* XXX not sure about the use of finish - shouldn't this
625                      * always be "Enqueue"? */
626                     if (finish)
627                               RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
628                     else
629                               EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
630                     rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
631                     return (1);         /* relenquish control of this thread */
632           }
633           rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
634           raidPtr->regionInfo[regionID].coreLog = NULL;
635           if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
636                     /* IMPORTANT!! this loop bound assumes region disk holds an
637                      * integral number of core logs */
638           {
639                     /* update disk map for this region */
640                     diskCount = raidPtr->regionInfo[regionID].diskCount;
641                     for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
642                               raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
643                               raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
644                     }
645                     log->diskOffset = diskCount;
646                     raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
647                     FlushLog(raidPtr, log);
648           } else {
649                     /* no room for log on disk, send it to disk manager and
650                      * request reintegration */
651                     RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
652                     ReintLog(raidPtr, regionID, log);
653           }
654           if (rf_parityLogDebug)
655                     printf("[finished dumping parity log to disk, region %d]\n", regionID);
656           return (0);
657 }
658 
659 int
rf_ParityLogAppend(RF_ParityLogData_t * logData,int finish,RF_ParityLog_t ** incomingLog,int clearReintFlag)660 rf_ParityLogAppend(
661     RF_ParityLogData_t * logData,
662     int finish,
663     RF_ParityLog_t ** incomingLog,
664     int clearReintFlag)
665 {
666           int     regionID, logItem, itemDone;
667           RF_ParityLogData_t *item;
668           int     punt, done = RF_FALSE;
669           RF_ParityLog_t *log;
670           RF_Raid_t *raidPtr;
671           RF_Etimer_t timer;
672           void     (*wakeFunc) (void *, int);
673           void   *wakeArg;
674 
675           /* Add parity to the appropriate log, one sector at a time. This
676            * routine is called is called by dag functions ParityLogUpdateFunc
677            * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
678            *
679            * Parity to be logged is contained in a linked-list (logData).  When
680            * this routine returns, every sector in the list will be in one of
681            * three places: 1) entered into the parity log 2) queued, waiting on
682            * reintegration 3) queued, waiting on a core log
683            *
684            * Blocked work is passed to the ParityLoggingDiskManager for completion.
685            * Later, as conditions which required the block are removed, the work
686            * reenters this routine with the "finish" parameter set to "RF_TRUE."
687            *
688            * NON-BLOCKING */
689 
690           raidPtr = logData->common->raidPtr;
691           /* lock the region for the first item in logData */
692           RF_ASSERT(logData != NULL);
693           regionID = logData->regionID;
694           rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
695           RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
696 
697           if (clearReintFlag) {
698                     /* Enable flushing for this region.  Holding both locks
699                      * provides a synchronization barrier with DumpParityLogToDisk */
700                     rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
701                     /* XXXmrg need this? */
702                     rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
703                     RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
704                     raidPtr->regionInfo[regionID].diskCount = 0;
705                     raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
706                     rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now
707                                                                                                      * enabled */
708                     /* XXXmrg need this? */
709                     rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
710           }
711           /* process each item in logData */
712           while (logData) {
713                     /* remove an item from logData */
714                     item = logData;
715                     logData = logData->next;
716                     item->next = NULL;
717                     item->prev = NULL;
718 
719                     if (rf_parityLogDebug)
720                               printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);
721 
722                     /* see if we moved to a new region */
723                     if (regionID != item->regionID) {
724                               rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
725                               regionID = item->regionID;
726                               rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
727                               RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
728                     }
729                     punt = RF_FALSE;/* Set to RF_TRUE if work is blocked.  This
730                                          * can happen in one of two ways: 1) no core
731                                          * log (AcquireParityLog) 2) waiting on
732                                          * reintegration (DumpParityLogToDisk) If punt
733                                          * is RF_TRUE, the dataItem was queued, so
734                                          * skip to next item. */
735 
736                     /* process item, one sector at a time, until all sectors
737                      * processed or we punt */
738                     if (item->diskAddress.numSector > 0)
739                               done = RF_FALSE;
740                     else
741                               RF_ASSERT(0);
742                     while (!punt && !done) {
743                               /* verify that a core log exists for this region */
744                               if (!raidPtr->regionInfo[regionID].coreLog) {
745                                         /* Attempt to acquire a parity log. If
746                                          * acquisition fails, queue remaining work in
747                                          * data item and move to nextItem. */
748                                         if (incomingLog)
749                                                   if (*incomingLog) {
750                                                             RF_ASSERT((*incomingLog)->next == NULL);
751                                                             raidPtr->regionInfo[regionID].coreLog = *incomingLog;
752                                                             raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
753                                                             *incomingLog = NULL;
754                                                   } else
755                                                             raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
756                                         else
757                                                   raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
758                                         /* Note: AcquireParityLog either returns a log
759                                          * or enqueues currentItem */
760                               }
761                               if (!raidPtr->regionInfo[regionID].coreLog)
762                                         punt = RF_TRUE;     /* failed to find a core log */
763                               else {
764                                         RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
765                                         /* verify that the log has room for new
766                                          * entries */
767                                         /* if log is full, dump it to disk and grab a
768                                          * new log */
769                                         if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
770                                                   /* log is full, dump it to disk */
771                                                   if (DumpParityLogToDisk(finish, item))
772                                                             punt = RF_TRUE;     /* dump unsuccessful,
773                                                                                  * blocked on
774                                                                                  * reintegration */
775                                                   else {
776                                                             /* dump was successful */
777                                                             if (incomingLog)
778                                                                       if (*incomingLog) {
779                                                                                 RF_ASSERT((*incomingLog)->next == NULL);
780                                                                                 raidPtr->regionInfo[regionID].coreLog = *incomingLog;
781                                                                                 raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
782                                                                                 *incomingLog = NULL;
783                                                                       } else
784                                                                                 raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
785                                                             else
786                                                                       raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
787                                                             /* if a core log is not
788                                                              * available, must queue work
789                                                              * and return */
790                                                             if (!raidPtr->regionInfo[regionID].coreLog)
791                                                                       punt = RF_TRUE;     /* blocked on log
792                                                                                            * availability */
793                                                   }
794                                         }
795                               }
796                               /* if we didn't punt on this item, attempt to add a
797                                * sector to the core log */
798                               if (!punt) {
799                                         RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
800                                         /* at this point, we have a core log with
801                                          * enough room for a sector */
802                                         /* copy a sector into the log */
803                                         log = raidPtr->regionInfo[regionID].coreLog;
804                                         RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
805                                         logItem = log->numRecords++;
806                                         log->records[logItem].parityAddr = item->diskAddress;
807                                         RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
808                                         RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
809                                         log->records[logItem].parityAddr.numSector = 1;
810                                         log->records[logItem].operation = item->common->operation;
811                                         memcpy((char *)log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), ((char *)item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), (1 << item->common->raidPtr->logBytesPerSector));
812                                         item->diskAddress.numSector--;
813                                         item->diskAddress.startSector++;
814                                         if (item->diskAddress.numSector == 0)
815                                                   done = RF_TRUE;
816                               }
817                     }
818 
819                     if (!punt) {
820                               /* Processed this item completely, decrement count of
821                                * items to be processed. */
822                               RF_ASSERT(item->diskAddress.numSector == 0);
823                               rf_lock_mutex2(item->common->mutex);
824                               item->common->cnt--;
825                               if (item->common->cnt == 0)
826                                         itemDone = RF_TRUE;
827                               else
828                                         itemDone = RF_FALSE;
829                               rf_unlock_mutex2(item->common->mutex);
830                               if (itemDone) {
831                                         /* Finished processing all log data for this
832                                          * IO Return structs to free list and invoke
833                                          * wakeup function. */
834                                         timer = item->common->startTime;        /* grab initial value of
835                                                                                            * timer */
836                                         RF_ETIMER_STOP(timer);
837                                         RF_ETIMER_EVAL(timer);
838                                         item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
839                                         if (rf_parityLogDebug)
840                                                   printf("[waking process for region %d]\n", item->regionID);
841                                         wakeFunc = item->common->wakeFunc;
842                                         wakeArg = item->common->wakeArg;
843                                         FreeParityLogCommonData(item->common);
844                                         FreeParityLogData(item);
845                                         (wakeFunc) (wakeArg, 0);
846                               } else
847                                         FreeParityLogData(item);
848                     }
849           }
850           rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
851           if (rf_parityLogDebug)
852                     printf("[exiting ParityLogAppend]\n");
853           return (0);
854 }
855 
856 
857 void
rf_EnableParityLogging(RF_Raid_t * raidPtr)858 rf_EnableParityLogging(RF_Raid_t * raidPtr)
859 {
860           int     regionID;
861 
862           for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
863                     rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
864                     raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
865                     rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
866           }
867           if (rf_parityLogDebug)
868                     printf("[parity logging enabled]\n");
869 }
870 #endif                                  /* RF_INCLUDE_PARITYLOGGING > 0 */
871