1 /*        $NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $ */
2 
3 /*-
4  * Copyright (c) 1990, 1993, 1994
5  *        The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Mike Olson.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #if HAVE_NBTOOL_CONFIG_H
36 #include "nbtool_config.h"
37 #endif
38 
39 #include <sys/cdefs.h>
40 __RCSID("$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $");
41 
42 #include "namespace.h"
43 #include <sys/types.h>
44 
45 #include <assert.h>
46 #include <errno.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 
51 #include <db.h>
52 #include "btree.h"
53 
54 static int __bt_first(BTREE *, const DBT *, EPG *, int *);
55 static int __bt_seqadv(BTREE *, EPG *, int);
56 static int __bt_seqset(BTREE *, EPG *, DBT *, int);
57 static int __bt_rseq_next(BTREE *, EPG *);
58 static int __bt_rseq_prev(BTREE *, EPG *);
59 
60 /*
61  * Sequential scan support.
62  *
63  * The tree can be scanned sequentially, starting from either end of the
64  * tree or from any specific key.  A scan request before any scanning is
65  * done is initialized as starting from the least node.
66  */
67 
68 /*
69  * __bt_seq --
70  *        Btree sequential scan interface.
71  *
72  * Parameters:
73  *        dbp:      pointer to access method
74  *        key:      key for positioning and return value
75  *        data:     data return value
76  *        flags:    R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
77  *
78  * Returns:
79  *        RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
80  */
81 int
__bt_seq(const DB * dbp,DBT * key,DBT * data,u_int flags)82 __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
83 {
84           BTREE *t;
85           EPG e;
86           int status;
87 
88           t = dbp->internal;
89 
90           /* Toss any page pinned across calls. */
91           if (t->bt_pinned != NULL) {
92                     mpool_put(t->bt_mp, t->bt_pinned, 0);
93                     t->bt_pinned = NULL;
94           }
95 
96           /*
97            * If scan uninitialized as yet, or starting at a specific record, set
98            * the scan to a specific key.  Both __bt_seqset and __bt_seqadv pin
99            * the page the cursor references if they're successful.
100            */
101           switch (flags) {
102           case R_NEXT:
103           case R_PREV:
104           case R_RNEXT:
105           case R_RPREV:
106                     if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
107                               status = __bt_seqadv(t, &e, (int)flags);
108                               break;
109                     }
110                     /* FALLTHROUGH */
111           case R_FIRST:
112           case R_LAST:
113           case R_CURSOR:
114                     status = __bt_seqset(t, &e, key, (int)flags);
115                     break;
116           default:
117                     errno = EINVAL;
118                     return (RET_ERROR);
119           }
120 
121           if (status == RET_SUCCESS) {
122                     __bt_setcur(t, e.page->pgno, (u_int)e.index);
123 
124                     status =
125                         __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
126 
127                     /*
128                      * If the user is doing concurrent access, we copied the
129                      * key/data, toss the page.
130                      */
131                     if (F_ISSET(t, B_DB_LOCK))
132                               mpool_put(t->bt_mp, e.page, 0);
133                     else
134                               t->bt_pinned = e.page;
135           }
136           return (status);
137 }
138 
139 /*
140  * __bt_seqset --
141  *        Set the sequential scan to a specific key.
142  *
143  * Parameters:
144  *        t:        tree
145  *        ep:       storage for returned key
146  *        key:      key for initial scan position
147  *        flags:    R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
148  *
149  * Side effects:
150  *        Pins the page the cursor references.
151  *
152  * Returns:
153  *        RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
154  */
155 static int
__bt_seqset(BTREE * t,EPG * ep,DBT * key,int flags)156 __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
157 {
158           PAGE *h;
159           pgno_t pg;
160           int exact;
161 
162           /*
163            * Find the first, last or specific key in the tree and point the
164            * cursor at it.  The cursor may not be moved until a new key has
165            * been found.
166            */
167           switch (flags) {
168           case R_CURSOR:                                    /* Keyed scan. */
169                     /*
170                      * Find the first instance of the key or the smallest key
171                      * which is greater than or equal to the specified key.
172                      */
173                     if (key->data == NULL || key->size == 0) {
174                               errno = EINVAL;
175                               return (RET_ERROR);
176                     }
177                     return (__bt_first(t, key, ep, &exact));
178           case R_FIRST:                                     /* First record. */
179           case R_NEXT:
180           case R_RNEXT:
181                     BT_CLR(t);
182                     /* Walk down the left-hand side of the tree. */
183                     for (pg = P_ROOT;;) {
184                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
185                                         return (RET_ERROR);
186 
187                               /* Check for an empty tree. */
188                               if (NEXTINDEX(h) == 0) {
189                                         mpool_put(t->bt_mp, h, 0);
190                                         return (RET_SPECIAL);
191                               }
192 
193                               if (h->flags & (P_BLEAF | P_RLEAF))
194                                         break;
195                               pg = GETBINTERNAL(h, 0)->pgno;
196                               BT_PUSH(t, h->pgno, 0);
197                               mpool_put(t->bt_mp, h, 0);
198                     }
199                     ep->page = h;
200                     ep->index = 0;
201                     break;
202           case R_LAST:                                      /* Last record. */
203           case R_PREV:
204           case R_RPREV:
205                     BT_CLR(t);
206                     /* Walk down the right-hand side of the tree. */
207                     for (pg = P_ROOT;;) {
208                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
209                                         return (RET_ERROR);
210 
211                               /* Check for an empty tree. */
212                               if (NEXTINDEX(h) == 0) {
213                                         mpool_put(t->bt_mp, h, 0);
214                                         return (RET_SPECIAL);
215                               }
216 
217                               if (h->flags & (P_BLEAF | P_RLEAF))
218                                         break;
219                               pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
220                               BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
221                               mpool_put(t->bt_mp, h, 0);
222                     }
223 
224                     ep->page = h;
225                     ep->index = NEXTINDEX(h) - 1;
226                     break;
227           }
228           return (RET_SUCCESS);
229 }
230 
231 /*
232  * __bt_seqadvance --
233  *        Advance the sequential scan.
234  *
235  * Parameters:
236  *        t:        tree
237  *        flags:    R_NEXT, R_PREV, R_RNEXT, R_RPREV
238  *
239  * Side effects:
240  *        Pins the page the new key/data record is on.
241  *
242  * Returns:
243  *        RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
244  */
245 static int
__bt_seqadv(BTREE * t,EPG * ep,int flags)246 __bt_seqadv(BTREE *t, EPG *ep, int flags)
247 {
248           CURSOR *c;
249           PAGE *h;
250           indx_t idx = 0;     /* pacify gcc */
251           pgno_t pg;
252           int exact, rval;
253 
254           /*
255            * There are a couple of states that we can be in.  The cursor has
256            * been initialized by the time we get here, but that's all we know.
257            */
258           c = &t->bt_cursor;
259 
260           /*
261            * The cursor was deleted and there weren't any duplicate records,
262            * so the cursor's key was saved.  Find out where that key would
263            * be in the current tree.  If the returned key is an exact match,
264            * it means that a key/data pair was inserted into the tree after
265            * the delete.  We could reasonably return the key, but the problem
266            * is that this is the access pattern we'll see if the user is
267            * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
268            * the cursor record and then replaces it, so the cursor was saved,
269            * and we'll simply return the same "new" record until the user
270            * notices and doesn't do a put() of it.  Since the key is an exact
271            * match, we could as easily put the new record before the cursor,
272            * and we've made no guarantee to return it.  So, move forward or
273            * back a record if it's an exact match.
274            *
275            * XXX
276            * In the current implementation, put's to the cursor are done with
277            * delete/add pairs.  This has two consequences.  First, it means
278            * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
279            * the same behavior as above.  Second, you can return the same key
280            * twice if you have duplicate records.  The scenario is that the
281            * cursor record is deleted, moving the cursor forward or backward
282            * to a duplicate.  The add then inserts the new record at a location
283            * ahead of the cursor because duplicates aren't sorted in any way,
284            * and the new record is later returned.  This has to be fixed at some
285            * point.
286            */
287           if (F_ISSET(c, CURS_ACQUIRE)) {
288                     if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
289                               return RET_ERROR;
290                     if (!exact)
291                               return rval;
292                     /*
293                      * XXX
294                      * Kluge -- get, release, get the page.
295                      */
296                     c->pg.pgno = ep->page->pgno;
297                     c->pg.index = ep->index;
298                     mpool_put(t->bt_mp, ep->page, 0);
299           }
300 
301           /* Get the page referenced by the cursor. */
302           if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
303                     return (RET_ERROR);
304 
305           /*
306            * Find the next/previous record in the tree and point the cursor at
307            * it.  The cursor may not be moved until a new key has been found.
308            */
309           switch (flags) {
310           case R_NEXT:                            /* Next record. */
311           case R_RNEXT:
312                     /*
313                      * The cursor was deleted in duplicate records, and moved
314                      * forward to a record that has yet to be returned.  Clear
315                      * that flag, and return the record.
316                      */
317                     if (F_ISSET(c, CURS_AFTER))
318                               goto usecurrent;
319                     idx = c->pg.index;
320                     if (++idx == NEXTINDEX(h)) {
321                               if (flags == R_RNEXT) {
322                                         ep->page = h;
323                                         ep->index = idx;
324                                         return __bt_rseq_next(t, ep);
325                               }
326                               pg = h->nextpg;
327                               mpool_put(t->bt_mp, h, 0);
328                               if (pg == P_INVALID)
329                                         return RET_SPECIAL;
330                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
331                                         return RET_ERROR;
332                               idx = 0;
333                     }
334                     break;
335           case R_PREV:                            /* Previous record. */
336           case R_RPREV:
337                     /*
338                      * The cursor was deleted in duplicate records, and moved
339                      * backward to a record that has yet to be returned.  Clear
340                      * that flag, and return the record.
341                      */
342                     if (F_ISSET(c, CURS_BEFORE)) {
343 usecurrent:                   F_CLR(c, CURS_AFTER | CURS_BEFORE);
344                               ep->page = h;
345                               ep->index = c->pg.index;
346                               return (RET_SUCCESS);
347                     }
348                     idx = c->pg.index;
349                     if (idx == 0) {
350                               if (flags == R_RPREV) {
351                                         ep->page = h;
352                                         ep->index = idx;
353                                         return __bt_rseq_prev(t, ep);
354                               }
355                               pg = h->prevpg;
356                               mpool_put(t->bt_mp, h, 0);
357                               if (pg == P_INVALID)
358                                         return RET_SPECIAL;
359                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
360                                         return RET_ERROR;
361                               idx = NEXTINDEX(h) - 1;
362                     } else
363                               --idx;
364                     break;
365           }
366 
367           ep->page = h;
368           ep->index = idx;
369           return (RET_SUCCESS);
370 }
371 /*
372  * Get the first item on the next page, but by going up and down the tree.
373  */
374 static int
__bt_rseq_next(BTREE * t,EPG * ep)375 __bt_rseq_next(BTREE *t, EPG *ep)
376 {
377           PAGE *h;
378           indx_t idx;
379           EPGNO *up;
380           pgno_t pg;
381 
382           h = ep->page;
383           idx = ep->index;
384           do {
385                     /* Move up the tree. */
386                     up = BT_POP(t);
387                     mpool_put(t->bt_mp, h, 0);
388                     /* Did we hit the right edge of the root? */
389                     if (up == NULL)
390                               return RET_SPECIAL;
391                     if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
392                               return RET_ERROR;
393                     idx = up->index;
394           } while (++idx == NEXTINDEX(h));
395 
396           while (!(h->flags & (P_BLEAF | P_RLEAF))) {
397                     /* Move back down the tree. */
398                     BT_PUSH(t, h->pgno, idx);
399                     pg = GETBINTERNAL(h, idx)->pgno;
400                     mpool_put(t->bt_mp, h, 0);
401                     if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
402                               return RET_ERROR;
403                     idx = 0;
404           }
405           ep->page = h;
406           ep->index = idx;
407           return RET_SUCCESS;
408 }
409 
410 /*
411  * Get the last item on the previous page, but by going up and down the tree.
412  */
413 static int
__bt_rseq_prev(BTREE * t,EPG * ep)414 __bt_rseq_prev(BTREE *t, EPG *ep)
415 {
416           PAGE *h;
417           indx_t idx;
418           EPGNO *up;
419           pgno_t pg;
420 
421           h = ep->page;
422           idx = ep->index;
423           do {
424                     /* Move up the tree. */
425                     up = BT_POP(t);
426                     mpool_put(t->bt_mp, h, 0);
427                     /* Did we hit the left edge of the root? */
428                     if (up == NULL)
429                               return RET_SPECIAL;
430                     if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
431                               return RET_ERROR;
432                     idx = up->index;
433           } while (idx == 0);
434           --idx;
435           while (!(h->flags & (P_BLEAF | P_RLEAF))) {
436                     /* Move back down the tree. */
437                     BT_PUSH(t, h->pgno, idx);
438                     pg = GETBINTERNAL(h, idx)->pgno;
439                     mpool_put(t->bt_mp, h, 0);
440                     if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
441                               return RET_ERROR;
442                     idx = NEXTINDEX(h) - 1;
443           }
444           ep->page = h;
445           ep->index = idx;
446           return RET_SUCCESS;
447 }
448 
449 /*
450  * __bt_first --
451  *        Find the first entry.
452  *
453  * Parameters:
454  *        t:        the tree
455  *    key:          the key
456  *  erval:          return EPG
457  * exactp:          pointer to exact match flag
458  *
459  * Returns:
460  *        The first entry in the tree greater than or equal to key,
461  *        or RET_SPECIAL if no such key exists.
462  */
463 static int
__bt_first(BTREE * t,const DBT * key,EPG * erval,int * exactp)464 __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
465 {
466           PAGE *h, *hprev;
467           EPG *ep, save;
468           pgno_t pg;
469 
470           /*
471            * Find any matching record; __bt_search pins the page.
472            *
473            * If it's an exact match and duplicates are possible, walk backwards
474            * in the tree until we find the first one.  Otherwise, make sure it's
475            * a valid key (__bt_search may return an index just past the end of a
476            * page) and return it.
477            */
478           if ((ep = __bt_search(t, key, exactp)) == NULL)
479                     return RET_SPECIAL;
480           if (*exactp) {
481                     if (F_ISSET(t, B_NODUPS)) {
482                               *erval = *ep;
483                               return (RET_SUCCESS);
484                     }
485 
486                     /*
487                      * Walk backwards, as long as the entry matches and there are
488                      * keys left in the tree.  Save a copy of each match in case
489                      * we go too far.
490                      */
491                     save = *ep;
492                     h = ep->page;
493                     do {
494                               if (save.page->pgno != ep->page->pgno) {
495                                         mpool_put(t->bt_mp, save.page, 0);
496                                         save = *ep;
497                               } else
498                                         save.index = ep->index;
499 
500                               /*
501                                * Don't unpin the page the last (or original) match
502                                * was on, but make sure it's unpinned if an error
503                                * occurs.
504                                */
505                               if (ep->index == 0) {
506                                         if (h->prevpg == P_INVALID)
507                                                   break;
508                                         if (h->pgno != save.page->pgno)
509                                                   mpool_put(t->bt_mp, h, 0);
510                                         if ((hprev = mpool_get(t->bt_mp,
511                                             h->prevpg, 0)) == NULL) {
512                                                   if (h->pgno == save.page->pgno)
513                                                             mpool_put(t->bt_mp,
514                                                                 save.page, 0);
515                                                   return RET_ERROR;
516                                         }
517                                         ep->page = h = hprev;
518                                         ep->index = NEXTINDEX(h);
519                               }
520                               --ep->index;
521                     } while (__bt_cmp(t, key, ep) == 0);
522 
523                     /*
524                      * Reach here with the last page that was looked at pinned,
525                      * which may or may not be the same as the last (or original)
526                      * match page.  If it's not useful, release it.
527                      */
528                     if (h->pgno != save.page->pgno)
529                               mpool_put(t->bt_mp, h, 0);
530 
531                     *erval = save;
532                     return (RET_SUCCESS);
533           }
534 
535           /* If at the end of a page, find the next entry. */
536           if (ep->index == NEXTINDEX(ep->page)) {
537                     h = ep->page;
538                     pg = h->nextpg;
539                     mpool_put(t->bt_mp, h, 0);
540                     if (pg == P_INVALID)
541                               return (RET_SPECIAL);
542                     if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
543                               return (RET_ERROR);
544                     ep->index = 0;
545                     ep->page = h;
546           }
547           *erval = *ep;
548           return (RET_SUCCESS);
549 }
550 
551 /*
552  * __bt_setcur --
553  *        Set the cursor to an entry in the tree.
554  *
555  * Parameters:
556  *        t:        the tree
557  *   pgno:          page number
558  *    idx:          page index
559  */
560 void
__bt_setcur(BTREE * t,pgno_t pgno,u_int idx)561 __bt_setcur(BTREE *t, pgno_t pgno, u_int idx)
562 {
563           /* Lose any already deleted key. */
564           if (t->bt_cursor.key.data != NULL) {
565                     free(t->bt_cursor.key.data);
566                     t->bt_cursor.key.size = 0;
567                     t->bt_cursor.key.data = NULL;
568           }
569           F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
570 
571           /* Update the cursor. */
572           t->bt_cursor.pg.pgno = pgno;
573           t->bt_cursor.pg.index = idx;
574           F_SET(&t->bt_cursor, CURS_INIT);
575 }
576