1 /*        $NetBSD: rf_diskqueue.c,v 1.64 2023/09/17 20:07:39 oster Exp $        */
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /****************************************************************************
30  *
31  * rf_diskqueue.c -- higher-level disk queue code
32  *
33  * the routines here are a generic wrapper around the actual queueing
34  * routines.  The code here implements thread scheduling, synchronization,
35  * and locking ops (see below) on top of the lower-level queueing code.
36  *
37  * to support atomic RMW, we implement "locking operations".  When a
38  * locking op is dispatched to the lower levels of the driver, the
39  * queue is locked, and no further I/Os are dispatched until the queue
40  * receives & completes a corresponding "unlocking operation".  This
41  * code relies on the higher layers to guarantee that a locking op
42  * will always be eventually followed by an unlocking op.  The model
43  * is that the higher layers are structured so locking and unlocking
44  * ops occur in pairs, i.e.  an unlocking op cannot be generated until
45  * after a locking op reports completion.  There is no good way to
46  * check to see that an unlocking op "corresponds" to the op that
47  * currently has the queue locked, so we make no such attempt.  Since
48  * by definition there can be only one locking op outstanding on a
49  * disk, this should not be a problem.
50  *
51  * In the kernel, we allow multiple I/Os to be concurrently dispatched
52  * to the disk driver.  In order to support locking ops in this
53  * environment, when we decide to do a locking op, we stop dispatching
54  * new I/Os and wait until all dispatched I/Os have completed before
55  * dispatching the locking op.
56  *
57  * Unfortunately, the code is different in the 3 different operating
58  * states (user level, kernel, simulator).  In the kernel, I/O is
59  * non-blocking, and we have no disk threads to dispatch for us.
60  * Therefore, we have to dispatch new I/Os to the scsi driver at the
61  * time of enqueue, and also at the time of completion.  At user
62  * level, I/O is blocking, and so only the disk threads may dispatch
63  * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
64  * and wake up the disk thread to do the dispatch.
65  *
66  ****************************************************************************/
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.64 2023/09/17 20:07:39 oster Exp $");
70 
71 #include <dev/raidframe/raidframevar.h>
72 
73 #include "rf_threadstuff.h"
74 #include "rf_raid.h"
75 #include "rf_diskqueue.h"
76 #include "rf_alloclist.h"
77 #include "rf_acctrace.h"
78 #include "rf_etimer.h"
79 #include "rf_general.h"
80 #include "rf_debugprint.h"
81 #include "rf_shutdown.h"
82 #include "rf_cvscan.h"
83 #include "rf_sstf.h"
84 #include "rf_fifo.h"
85 #include "rf_kintf.h"
86 
87 #include <sys/buf.h>
88 
89 static void rf_ShutdownDiskQueueSystem(void *);
90 
91 #ifndef RF_DEBUG_DISKQUEUE
92 #define RF_DEBUG_DISKQUEUE 0
93 #endif
94 
95 #if RF_DEBUG_DISKQUEUE
96 #define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
97 #define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
98 #define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
99 #else
100 #define Dprintf1(s,a)
101 #define Dprintf2(s,a,b)
102 #define Dprintf3(s,a,b,c)
103 #endif
104 
105 /*****************************************************************************
106  *
107  * the disk queue switch defines all the functions used in the
108  * different queueing disciplines queue ID, init routine, enqueue
109  * routine, dequeue routine
110  *
111  ****************************************************************************/
112 
113 static const RF_DiskQueueSW_t diskqueuesw[] = {
114           {"fifo",            /* FIFO */
115                     rf_FifoCreate,
116                     rf_FifoEnqueue,
117                     rf_FifoDequeue,
118                     rf_FifoPromote},
119 
120           {"cvscan",                    /* cvscan */
121                     rf_CvscanCreate,
122                     rf_CvscanEnqueue,
123                     rf_CvscanDequeue,
124                     rf_CvscanPromote},
125 
126           {"sstf",            /* shortest seek time first */
127                     rf_SstfCreate,
128                     rf_SstfEnqueue,
129                     rf_SstfDequeue,
130                     rf_SstfPromote},
131 
132           {"scan",            /* SCAN (two-way elevator) */
133                     rf_ScanCreate,
134                     rf_SstfEnqueue,
135                     rf_ScanDequeue,
136                     rf_SstfPromote},
137 
138           {"cscan",           /* CSCAN (one-way elevator) */
139                     rf_CscanCreate,
140                     rf_SstfEnqueue,
141                     rf_CscanDequeue,
142                     rf_SstfPromote},
143 
144 };
145 #define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
146 
147 
148 #define RF_MAX_FREE_DQD 256
149 #define RF_MIN_FREE_DQD  64
150 
151 /* XXX: scale these... */
152 #define RF_MAX_FREE_BUFIO 256
153 #define RF_MIN_FREE_BUFIO  64
154 
155 
156 
157 /* configures a single disk queue */
158 
159 static void
rf_ShutdownDiskQueue(void * arg)160 rf_ShutdownDiskQueue(void *arg)
161 {
162           RF_DiskQueue_t *diskqueue = arg;
163 
164           rf_destroy_mutex2(diskqueue->mutex);
165 }
166 
167 int
rf_ConfigureDiskQueue(RF_Raid_t * raidPtr,RF_DiskQueue_t * diskqueue,RF_RowCol_t c,const RF_DiskQueueSW_t * p,RF_SectorCount_t sectPerDisk,dev_t dev,int maxOutstanding,RF_ShutdownList_t ** listp,RF_AllocListElem_t * clList)168 rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
169                           RF_RowCol_t c, const RF_DiskQueueSW_t *p,
170                           RF_SectorCount_t sectPerDisk, dev_t dev,
171                           int maxOutstanding, RF_ShutdownList_t **listp,
172                           RF_AllocListElem_t *clList)
173 {
174           diskqueue->col = c;
175           diskqueue->qPtr = p;
176           diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
177           diskqueue->dev = dev;
178           diskqueue->numOutstanding = 0;
179           diskqueue->queueLength = 0;
180           diskqueue->maxOutstanding = maxOutstanding;
181           diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
182           diskqueue->flags = 0;
183           diskqueue->raidPtr = raidPtr;
184           diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
185           rf_init_mutex2(diskqueue->mutex, IPL_VM);
186           rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
187           return (0);
188 }
189 
190 int
rf_UpdateDiskQueue(RF_DiskQueue_t * diskqueue,RF_RaidDisk_t * disk)191 rf_UpdateDiskQueue(RF_DiskQueue_t *diskqueue, RF_RaidDisk_t *disk)
192 {
193           diskqueue->dev = disk->dev;
194           return(0);
195 }
196 
197 static void
rf_ShutdownDiskQueueSystem(void * arg)198 rf_ShutdownDiskQueueSystem(void *arg)
199 {
200           RF_Raid_t *raidPtr;
201 
202           raidPtr = (RF_Raid_t *) arg;
203 
204           pool_destroy(&raidPtr->pools.dqd);
205           pool_destroy(&raidPtr->pools.bufio);
206 }
207 
208 int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)209 rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
210                                   RF_Config_t *cfgPtr)
211 
212 {
213 
214           rf_pool_init(raidPtr, raidPtr->poolNames.dqd, &raidPtr->pools.dqd, sizeof(RF_DiskQueueData_t),
215                          "dqd", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
216           rf_pool_init(raidPtr, raidPtr->poolNames.bufio, &raidPtr->pools.bufio, sizeof(buf_t),
217                          "bufio", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
218           rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, raidPtr);
219 
220           return (0);
221 }
222 
223 int
rf_ConfigureDiskQueues(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)224 rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
225                            RF_Config_t *cfgPtr)
226 {
227           RF_DiskQueue_t *diskQueues, *spareQueues;
228           const RF_DiskQueueSW_t *p;
229           RF_RowCol_t r,c;
230           int     rc, i;
231 
232           raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
233 
234           for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
235                     if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
236                               p = &diskqueuesw[i];
237                               break;
238                     }
239           }
240           if (p == NULL) {
241                     RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
242                     p = &diskqueuesw[0];
243           }
244           raidPtr->qType = p;
245 
246           diskQueues = RF_MallocAndAdd(
247               (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
248               raidPtr->cleanupList);
249           if (diskQueues == NULL)
250                     return (ENOMEM);
251           raidPtr->Queues = diskQueues;
252 
253           for (c = 0; c < raidPtr->numCol; c++) {
254                     rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
255                                                      c, p,
256                                                      raidPtr->sectorsPerDisk,
257                                                      raidPtr->Disks[c].dev,
258                                                      cfgPtr->maxOutstandingDiskReqs,
259                                                      listp, raidPtr->cleanupList);
260                     if (rc)
261                               return (rc);
262           }
263 
264           spareQueues = &raidPtr->Queues[raidPtr->numCol];
265           for (r = 0; r < raidPtr->maxQueue; r++) {
266                     rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
267                                                      raidPtr->numCol + r, p,
268                                                      raidPtr->sectorsPerDisk,
269                                                      raidPtr->Disks[raidPtr->numCol + r].dev,
270                                                      cfgPtr->maxOutstandingDiskReqs, listp,
271                                                      raidPtr->cleanupList);
272                     if (rc)
273                               return (rc);
274           }
275           return (0);
276 }
277 /* Enqueue a disk I/O
278  *
279  * In the kernel, I/O is non-blocking and so we'd like to have multiple
280  * I/Os outstanding on the physical disks when possible.
281  *
282  * when any request arrives at a queue, we have two choices:
283  *    dispatch it to the lower levels
284  *    queue it up
285  *
286  * kernel rules for when to do what:
287  *    unlocking req  :  always dispatch it
288  *    normal req     :  queue empty => dispatch it & set priority
289  *                      queue not full & priority is ok => dispatch it
290  *                      else queue it
291  */
292 void
rf_DiskIOEnqueue(RF_DiskQueue_t * queue,RF_DiskQueueData_t * req,int pri)293 rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
294 {
295           RF_ETIMER_START(req->qtime);
296           RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
297           req->priority = pri;
298 
299 #if RF_DEBUG_DISKQUEUE
300           if (rf_queueDebug && (req->numSector == 0)) {
301                     printf("Warning: Enqueueing zero-sector access\n");
302           }
303 #endif
304           RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
305           if (RF_OK_TO_DISPATCH(queue, req)) {
306                     Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
307                     rf_DispatchKernelIO(queue, req);
308           } else {
309                     queue->queueLength++;         /* increment count of number of requests waiting in this queue */
310                     Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
311                     req->queue = (void *) queue;
312                     (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
313           }
314           RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
315 }
316 
317 
318 /* get the next set of I/Os started */
319 void
rf_DiskIOComplete(RF_DiskQueue_t * queue,RF_DiskQueueData_t * req,int status)320 rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
321 {
322           int     done = 0;
323 
324           RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
325           queue->numOutstanding--;
326           RF_ASSERT(queue->numOutstanding >= 0);
327 
328           /* dispatch requests to the disk until we find one that we can't. */
329           /* no reason to continue once we've filled up the queue */
330           /* no reason to even start if the queue is locked */
331 
332           while (!done && !RF_QUEUE_FULL(queue)) {
333                     req = (queue->qPtr->Dequeue) (queue->qHdr);
334                     if (req) {
335                               Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
336                               queue->queueLength--;         /* decrement count of number of requests waiting in this queue */
337                               RF_ASSERT(queue->queueLength >= 0);
338                               if (RF_OK_TO_DISPATCH(queue, req)) {
339                                         Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
340                                         rf_DispatchKernelIO(queue, req);
341                               } else {
342                                         /* we can't dispatch it, so just re-enqueue it.
343                                            potential trouble here if disk queues batch reqs */
344                                         Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
345                                         queue->queueLength++;
346                                         (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
347                                         done = 1;
348                               }
349                     } else {
350                               Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
351                               done = 1;
352                     }
353           }
354 
355           RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
356 }
357 /* promotes accesses tagged with the given parityStripeID from low priority
358  * to normal priority.  This promotion is optional, meaning that a queue
359  * need not implement it.  If there is no promotion routine associated with
360  * a queue, this routine does nothing and returns -1.
361  */
362 int
rf_DiskIOPromote(RF_DiskQueue_t * queue,RF_StripeNum_t parityStripeID,RF_ReconUnitNum_t which_ru)363 rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
364                      RF_ReconUnitNum_t which_ru)
365 {
366           int     retval;
367 
368           if (!queue->qPtr->Promote)
369                     return (-1);
370           RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
371           retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
372           RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
373           return (retval);
374 }
375 
376 RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ,RF_SectorNum_t ssect,RF_SectorCount_t nsect,void * bf,RF_StripeNum_t parityStripeID,RF_ReconUnitNum_t which_ru,void (* wakeF)(void *,int),void * arg,RF_AccTraceEntry_t * tracerec,RF_Raid_t * raidPtr,RF_DiskQueueDataFlags_t flags,const struct buf * mbp)377 rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
378                            RF_SectorCount_t nsect, void *bf,
379                            RF_StripeNum_t parityStripeID,
380                            RF_ReconUnitNum_t which_ru,
381                            void (*wakeF) (void *, int), void *arg,
382                            RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
383                            RF_DiskQueueDataFlags_t flags, const struct buf *mbp)
384 {
385           RF_DiskQueueData_t *p;
386 
387           p = pool_get(&raidPtr->pools.dqd, PR_WAITOK | PR_ZERO);
388           KASSERT(p != NULL);
389 
390           /* Obtain a buffer from our own pool.  It is possible for the
391              regular getiobuf() to run out of memory and return NULL.
392              We need to guarantee that never happens, as RAIDframe
393              doesn't have a good way to recover if memory allocation
394              fails here.
395           */
396           p->bp = pool_get(&raidPtr->pools.bufio, PR_WAITOK | PR_ZERO);
397           KASSERT(p->bp != NULL);
398 
399           buf_init(p->bp);
400 
401           SET(p->bp->b_cflags, BC_BUSY);          /* mark buffer busy */
402           if (mbp) {
403                     SET(p->bp->b_flags, mbp->b_flags & rf_b_pass);
404                     p->bp->b_proc = mbp->b_proc;
405           }
406 
407           p->sectorOffset = ssect + rf_protectedSectors;
408           p->numSector = nsect;
409           p->type = typ;
410           p->buf = bf;
411           p->parityStripeID = parityStripeID;
412           p->which_ru = which_ru;
413           p->CompleteFunc = wakeF;
414           p->argument = arg;
415           p->next = NULL;
416           p->tracerec = tracerec;
417           p->priority = RF_IO_NORMAL_PRIORITY;
418           p->raidPtr = raidPtr;
419           p->flags = flags;
420           return (p);
421 }
422 
423 void
rf_FreeDiskQueueData(RF_DiskQueueData_t * p)424 rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
425 {
426 
427           buf_destroy(p->bp);
428 
429           pool_put(&p->raidPtr->pools.bufio, p->bp);
430           pool_put(&p->raidPtr->pools.dqd, p);
431 }
432