1 /*        $NetBSD: bt_split.c,v 1.22 2016/09/24 21:31:25 christos Exp $         */
2 
3 /*-
4  * Copyright (c) 1990, 1993, 1994
5  *        The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Mike Olson.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #if HAVE_NBTOOL_CONFIG_H
36 #include "nbtool_config.h"
37 #endif
38 
39 #include <sys/cdefs.h>
40 __RCSID("$NetBSD: bt_split.c,v 1.22 2016/09/24 21:31:25 christos Exp $");
41 
42 #include "namespace.h"
43 #include <sys/types.h>
44 
45 #include <assert.h>
46 #include <limits.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 
51 #include <db.h>
52 #include "btree.h"
53 
54 static int           bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
55 static PAGE         *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
56 static int           bt_preserve(BTREE *, pgno_t);
57 static PAGE         *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
58 static PAGE         *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
59 static int           bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
60 static recno_t       rec_total(PAGE *);
61 
62 #ifdef STATISTICS
63 unsigned long       bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
64 #endif
65 
66 /*
67  * __BT_SPLIT -- Split the tree.
68  *
69  * Parameters:
70  *        t:        tree
71  *        sp:       page to split
72  *        key:      key to insert
73  *        data:     data to insert
74  *        flags:    BIGKEY/BIGDATA flags
75  *        ilen:     insert length
76  *        skip:     index to leave open
77  *
78  * Returns:
79  *        RET_ERROR, RET_SUCCESS
80  */
81 int
__bt_split(BTREE * t,PAGE * sp,const DBT * key,const DBT * data,int flags,size_t ilen,uint32_t argskip)82 __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
83     size_t ilen, uint32_t argskip)
84 {
85           BINTERNAL *bi = NULL;         /* pacify gcc */
86           BLEAF *bl = NULL, *tbl;       /* pacify gcc */
87           DBT a, b;
88           EPGNO *parent;
89           PAGE *h, *l, *r, *lchild, *rchild;
90           indx_t nxtindex;
91           uint16_t skip;
92           uint32_t n, nbytes, nksize = 0; /* pacify gcc */
93           int parentsplit;
94           char *dest;
95 
96           /*
97            * Split the page into two pages, l and r.  The split routines return
98            * a pointer to the page into which the key should be inserted and with
99            * skip set to the offset which should be used.  Additionally, l and r
100            * are pinned.
101            */
102           skip = argskip;
103           h = sp->pgno == P_ROOT ?
104               bt_root(t, sp, &l, &r, &skip, ilen) :
105               bt_page(t, sp, &l, &r, &skip, ilen);
106           if (h == NULL)
107                     return (RET_ERROR);
108 
109           /*
110            * Insert the new key/data pair into the leaf page.  (Key inserts
111            * always cause a leaf page to split first.)
112            */
113           _DBFIT(ilen, indx_t);
114           h->upper -= (indx_t)ilen;
115           h->linp[skip] = h->upper;
116           dest = (char *)(void *)h + h->upper;
117           if (F_ISSET(t, R_RECNO))
118                     WR_RLEAF(dest, data, flags);
119           else
120                     WR_BLEAF(dest, key, data, flags);
121 
122           /* If the root page was split, make it look right. */
123           if (sp->pgno == P_ROOT &&
124               (F_ISSET(t, R_RECNO) ?
125               bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
126                     goto err2;
127 
128           /*
129            * Now we walk the parent page stack -- a LIFO stack of the pages that
130            * were traversed when we searched for the page that split.  Each stack
131            * entry is a page number and a page index offset.  The offset is for
132            * the page traversed on the search.  We've just split a page, so we
133            * have to insert a new key into the parent page.
134            *
135            * If the insert into the parent page causes it to split, may have to
136            * continue splitting all the way up the tree.  We stop if the root
137            * splits or the page inserted into didn't have to split to hold the
138            * new key.  Some algorithms replace the key for the old page as well
139            * as the new page.  We don't, as there's no reason to believe that the
140            * first key on the old page is any better than the key we have, and,
141            * in the case of a key being placed at index 0 causing the split, the
142            * key is unavailable.
143            *
144            * There are a maximum of 5 pages pinned at any time.  We keep the left
145            * and right pages pinned while working on the parent.   The 5 are the
146            * two children, left parent and right parent (when the parent splits)
147            * and the root page or the overflow key page when calling bt_preserve.
148            * This code must make sure that all pins are released other than the
149            * root page or overflow page which is unlocked elsewhere.
150            */
151           while ((parent = BT_POP(t)) != NULL) {
152                     lchild = l;
153                     rchild = r;
154 
155                     /* Get the parent page. */
156                     if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
157                               goto err2;
158 
159                     /*
160                      * The new key goes ONE AFTER the index, because the split
161                      * was to the right.
162                      */
163                     skip = parent->index + 1;
164 
165                     /*
166                      * Calculate the space needed on the parent page.
167                      *
168                      * Prefix trees: space hack when inserting into BINTERNAL
169                      * pages.  Retain only what's needed to distinguish between
170                      * the new entry and the LAST entry on the page to its left.
171                      * If the keys compare equal, retain the entire key.  Note,
172                      * we don't touch overflow keys, and the entire key must be
173                      * retained for the next-to-left most key on the leftmost
174                      * page of each level, or the search will fail.  Applicable
175                      * ONLY to internal pages that have leaf pages as children.
176                      * Further reduction of the key between pairs of internal
177                      * pages loses too much information.
178                      */
179                     switch (rchild->flags & P_TYPE) {
180                     case P_BINTERNAL:
181                               bi = GETBINTERNAL(rchild, 0);
182                               nbytes = NBINTERNAL(bi->ksize);
183                               break;
184                     case P_BLEAF:
185                               bl = GETBLEAF(rchild, 0);
186                               nbytes = NBINTERNAL(bl->ksize);
187                               if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
188                                   (h->prevpg != P_INVALID || skip > 1)) {
189                                         size_t temp;
190                                         tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
191                                         a.size = tbl->ksize;
192                                         a.data = tbl->bytes;
193                                         b.size = bl->ksize;
194                                         b.data = bl->bytes;
195                                         temp = t->bt_pfx(&a, &b);
196                                         _DBFIT(temp, uint32_t);
197                                         nksize = (uint32_t)temp;
198                                         n = NBINTERNAL(nksize);
199                                         if (n < nbytes) {
200 #ifdef STATISTICS
201                                                   bt_pfxsaved += nbytes - n;
202 #endif
203                                                   nbytes = n;
204                                         } else
205                                                   nksize = 0;
206                               } else
207                                         nksize = 0;
208                               break;
209                     case P_RINTERNAL:
210                     case P_RLEAF:
211                               nbytes = NRINTERNAL;
212                               break;
213                     default:
214                               abort();
215                     }
216 
217                     /* Split the parent page if necessary or shift the indices. */
218                     if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) {
219                               sp = h;
220                               h = h->pgno == P_ROOT ?
221                                   bt_root(t, h, &l, &r, &skip, nbytes) :
222                                   bt_page(t, h, &l, &r, &skip, nbytes);
223                               if (h == NULL)
224                                         goto err1;
225                               parentsplit = 1;
226                     } else {
227                               if (skip < (nxtindex = NEXTINDEX(h)))
228                                         memmove(h->linp + skip + 1, h->linp + skip,
229                                             (nxtindex - skip) * sizeof(indx_t));
230                               h->lower += sizeof(indx_t);
231                               parentsplit = 0;
232                     }
233 
234                     /* Insert the key into the parent page. */
235                     switch (rchild->flags & P_TYPE) {
236                     case P_BINTERNAL:
237                               h->linp[skip] = h->upper -= nbytes;
238                               dest = (char *)(void *)h + h->linp[skip];
239                               memmove(dest, bi, nbytes);
240                               ((BINTERNAL *)(void *)dest)->pgno = rchild->pgno;
241                               break;
242                     case P_BLEAF:
243                               h->linp[skip] = h->upper -= nbytes;
244                               dest = (char *)(void *)h + h->linp[skip];
245                               WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
246                                   rchild->pgno, bl->flags & P_BIGKEY);
247                               memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
248                               if (bl->flags & P_BIGKEY) {
249                                         pgno_t pgno;
250                                         memcpy(&pgno, bl->bytes, sizeof(pgno));
251                                         if (bt_preserve(t, pgno) == RET_ERROR)
252                                                   goto err1;
253                               }
254                               break;
255                     case P_RINTERNAL:
256                               /*
257                                * Update the left page count.  If split
258                                * added at index 0, fix the correct page.
259                                */
260                               if (skip > 0)
261                                         dest = (char *)(void *)h + h->linp[skip - 1];
262                               else
263                                         dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
264                               ((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild);
265                               ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;
266 
267                               /* Update the right page count. */
268                               h->linp[skip] = h->upper -= nbytes;
269                               dest = (char *)(void *)h + h->linp[skip];
270                               ((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild);
271                               ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
272                               break;
273                     case P_RLEAF:
274                               /*
275                                * Update the left page count.  If split
276                                * added at index 0, fix the correct page.
277                                */
278                               if (skip > 0)
279                                         dest = (char *)(void *)h + h->linp[skip - 1];
280                               else
281                                         dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
282                               ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild);
283                               ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;
284 
285                               /* Update the right page count. */
286                               h->linp[skip] = h->upper -= nbytes;
287                               dest = (char *)(void *)h + h->linp[skip];
288                               ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild);
289                               ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
290                               break;
291                     default:
292                               abort();
293                     }
294 
295                     /* Unpin the held pages. */
296                     if (!parentsplit) {
297                               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
298                               break;
299                     }
300 
301                     /* If the root page was split, make it look right. */
302                     if (sp->pgno == P_ROOT &&
303                         (F_ISSET(t, R_RECNO) ?
304                         bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
305                               goto err1;
306 
307                     mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
308                     mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
309           }
310 
311           /* Unpin the held pages. */
312           mpool_put(t->bt_mp, l, MPOOL_DIRTY);
313           mpool_put(t->bt_mp, r, MPOOL_DIRTY);
314 
315           /* Clear any pages left on the stack. */
316           return (RET_SUCCESS);
317 
318           /*
319            * If something fails in the above loop we were already walking back
320            * up the tree and the tree is now inconsistent.  Nothing much we can
321            * do about it but release any memory we're holding.
322            */
323 err1:     mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
324           mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
325 
326 err2:     mpool_put(t->bt_mp, l, 0);
327           mpool_put(t->bt_mp, r, 0);
328           __dbpanic(t->bt_dbp);
329           return (RET_ERROR);
330 }
331 
332 /*
333  * BT_PAGE -- Split a non-root page of a btree.
334  *
335  * Parameters:
336  *        t:        tree
337  *        h:        root page
338  *        lp:       pointer to left page pointer
339  *        rp:       pointer to right page pointer
340  *        skip:     pointer to index to leave open
341  *        ilen:     insert length
342  *
343  * Returns:
344  *        Pointer to page in which to insert or NULL on error.
345  */
346 static PAGE *
bt_page(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)347 bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
348 {
349           PAGE *l, *r, *tp;
350           pgno_t npg;
351 
352 #ifdef STATISTICS
353           ++bt_split;
354 #endif
355           /* Put the new right page for the split into place. */
356           if ((r = __bt_new(t, &npg)) == NULL)
357                     return (NULL);
358           r->pgno = npg;
359           r->lower = BTDATAOFF;
360           r->upper = t->bt_psize;
361           r->nextpg = h->nextpg;
362           r->prevpg = h->pgno;
363           r->flags = h->flags & P_TYPE;
364 
365           /*
366            * If we're splitting the last page on a level because we're appending
367            * a key to it (skip is NEXTINDEX()), it's likely that the data is
368            * sorted.  Adding an empty page on the side of the level is less work
369            * and can push the fill factor much higher than normal.  If we're
370            * wrong it's no big deal, we'll just do the split the right way next
371            * time.  It may look like it's equally easy to do a similar hack for
372            * reverse sorted data, that is, split the tree left, but it's not.
373            * Don't even try.
374            */
375           if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
376 #ifdef STATISTICS
377                     ++bt_sortsplit;
378 #endif
379                     h->nextpg = r->pgno;
380                     r->lower = BTDATAOFF + sizeof(indx_t);
381                     *skip = 0;
382                     *lp = h;
383                     *rp = r;
384                     return (r);
385           }
386 
387           /* Put the new left page for the split into place. */
388           if ((l = calloc(1, t->bt_psize)) == NULL) {
389                     mpool_put(t->bt_mp, r, 0);
390                     return (NULL);
391           }
392 #ifdef PURIFY
393           memset(l, 0xff, t->bt_psize);
394 #endif
395           l->pgno = h->pgno;
396           l->nextpg = r->pgno;
397           l->prevpg = h->prevpg;
398           l->lower = BTDATAOFF;
399           l->upper = t->bt_psize;
400           l->flags = h->flags & P_TYPE;
401 
402           /* Fix up the previous pointer of the page after the split page. */
403           if (h->nextpg != P_INVALID) {
404                     if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
405                               free(l);
406                               /* XXX mpool_free(t->bt_mp, r->pgno); */
407                               return (NULL);
408                     }
409                     tp->prevpg = r->pgno;
410                     mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
411           }
412 
413           /*
414            * Split right.  The key/data pairs aren't sorted in the btree page so
415            * it's simpler to copy the data from the split page onto two new pages
416            * instead of copying half the data to the right page and compacting
417            * the left page in place.  Since the left page can't change, we have
418            * to swap the original and the allocated left page after the split.
419            */
420           tp = bt_psplit(t, h, l, r, skip, ilen);
421 
422           /* Move the new left page onto the old left page. */
423           memmove(h, l, t->bt_psize);
424           if (tp == l)
425                     tp = h;
426           free(l);
427 
428           *lp = h;
429           *rp = r;
430           return (tp);
431 }
432 
433 /*
434  * BT_ROOT -- Split the root page of a btree.
435  *
436  * Parameters:
437  *        t:        tree
438  *        h:        root page
439  *        lp:       pointer to left page pointer
440  *        rp:       pointer to right page pointer
441  *        skip:     pointer to index to leave open
442  *        ilen:     insert length
443  *
444  * Returns:
445  *        Pointer to page in which to insert or NULL on error.
446  */
447 static PAGE *
bt_root(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)448 bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
449 {
450           PAGE *l, *r, *tp;
451           pgno_t lnpg, rnpg;
452 
453 #ifdef STATISTICS
454           ++bt_split;
455           ++bt_rootsplit;
456 #endif
457           /* Put the new left and right pages for the split into place. */
458           if ((l = __bt_new(t, &lnpg)) == NULL ||
459               (r = __bt_new(t, &rnpg)) == NULL)
460                     return (NULL);
461           l->pgno = lnpg;
462           r->pgno = rnpg;
463           l->nextpg = r->pgno;
464           r->prevpg = l->pgno;
465           l->prevpg = r->nextpg = P_INVALID;
466           l->lower = r->lower = BTDATAOFF;
467           l->upper = r->upper = t->bt_psize;
468           l->flags = r->flags = h->flags & P_TYPE;
469 
470           /* Split the root page. */
471           tp = bt_psplit(t, h, l, r, skip, ilen);
472 
473           *lp = l;
474           *rp = r;
475           return (tp);
476 }
477 
478 /*
479  * BT_RROOT -- Fix up the recno root page after it has been split.
480  *
481  * Parameters:
482  *        t:        tree
483  *        h:        root page
484  *        l:        left page
485  *        r:        right page
486  *
487  * Returns:
488  *        RET_ERROR, RET_SUCCESS
489  */
490 static int
bt_rroot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)491 bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
492 {
493           char *dest;
494           uint32_t sz;
495           size_t temp;
496 
497           temp = t->bt_psize - NRINTERNAL;
498           _DBFIT(temp, uint32_t);
499           sz = (uint32_t)temp;
500 
501           /* Insert the left and right keys, set the header information. */
502           _DBFIT(sz, indx_t);
503           h->linp[0] = h->upper = (indx_t)sz;
504           dest = (char *)(void *)h + h->upper;
505           WR_RINTERNAL(dest,
506               l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
507 
508           h->linp[1] = h->upper -= NRINTERNAL;
509           dest = (char *)(void *)h + h->upper;
510           WR_RINTERNAL(dest,
511               r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
512 
513           h->lower = BTDATAOFF + 2 * sizeof(indx_t);
514 
515           /* Unpin the root page, set to recno internal page. */
516           h->flags &= ~P_TYPE;
517           h->flags |= P_RINTERNAL;
518           mpool_put(t->bt_mp, h, MPOOL_DIRTY);
519 
520           return (RET_SUCCESS);
521 }
522 
523 /*
524  * BT_BROOT -- Fix up the btree root page after it has been split.
525  *
526  * Parameters:
527  *        t:        tree
528  *        h:        root page
529  *        l:        left page
530  *        r:        right page
531  *
532  * Returns:
533  *        RET_ERROR, RET_SUCCESS
534  */
535 static int
bt_broot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)536 bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
537 {
538           BINTERNAL *bi = NULL;         /* pacify gcc */
539           BLEAF *bl;
540           uint32_t nbytes;
541           char *dest;
542 
543           /*
544            * If the root page was a leaf page, change it into an internal page.
545            * We copy the key we split on (but not the key's data, in the case of
546            * a leaf page) to the new root page.
547            *
548            * The btree comparison code guarantees that the left-most key on any
549            * level of the tree is never used, so it doesn't need to be filled in.
550            */
551           nbytes = NBINTERNAL(0);
552           h->linp[0] = h->upper = t->bt_psize - nbytes;
553           dest = (char *)(void *)h + h->upper;
554           WR_BINTERNAL(dest, 0, l->pgno, 0);
555 
556           switch (h->flags & P_TYPE) {
557           case P_BLEAF:
558                     bl = GETBLEAF(r, 0);
559                     nbytes = NBINTERNAL(bl->ksize);
560                     h->linp[1] = h->upper -= nbytes;
561                     dest = (char *)(void *)h + h->upper;
562                     WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
563                     memmove(dest, bl->bytes, bl->ksize);
564 
565                     /*
566                      * If the key is on an overflow page, mark the overflow chain
567                      * so it isn't deleted when the leaf copy of the key is deleted.
568                      */
569                     if (bl->flags & P_BIGKEY) {
570                               pgno_t pgno;
571                               memcpy(&pgno, bl->bytes, sizeof(pgno));
572                               if (bt_preserve(t, pgno) == RET_ERROR)
573                                         return (RET_ERROR);
574                     }
575                     break;
576           case P_BINTERNAL:
577                     bi = GETBINTERNAL(r, 0);
578                     nbytes = NBINTERNAL(bi->ksize);
579                     h->linp[1] = h->upper -= nbytes;
580                     dest = (char *)(void *)h + h->upper;
581                     memmove(dest, bi, nbytes);
582                     ((BINTERNAL *)(void *)dest)->pgno = r->pgno;
583                     break;
584           default:
585                     abort();
586           }
587 
588           /* There are two keys on the page. */
589           h->lower = BTDATAOFF + 2 * sizeof(indx_t);
590 
591           /* Unpin the root page, set to btree internal page. */
592           h->flags &= ~P_TYPE;
593           h->flags |= P_BINTERNAL;
594           mpool_put(t->bt_mp, h, MPOOL_DIRTY);
595 
596           return (RET_SUCCESS);
597 }
598 
599 /*
600  * BT_PSPLIT -- Do the real work of splitting the page.
601  *
602  * Parameters:
603  *        t:        tree
604  *        h:        page to be split
605  *        l:        page to put lower half of data
606  *        r:        page to put upper half of data
607  *        pskip:    pointer to index to leave open
608  *        ilen:     insert length
609  *
610  * Returns:
611  *        Pointer to page in which to insert.
612  */
613 static PAGE *
bt_psplit(BTREE * t,PAGE * h,PAGE * l,PAGE * r,indx_t * pskip,size_t ilen)614 bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
615 {
616           BINTERNAL *bi;
617           BLEAF *bl;
618           CURSOR *c;
619           RLEAF *rl;
620           PAGE *rval;
621           void *src = NULL;   /* pacify gcc */
622           indx_t full, half, nxt, off, skip, top, used;
623           uint32_t nbytes;
624           size_t temp;
625           int bigkeycnt, isbigkey;
626 
627           /*
628            * Split the data to the left and right pages.  Leave the skip index
629            * open.  Additionally, make some effort not to split on an overflow
630            * key.  This makes internal page processing faster and can save
631            * space as overflow keys used by internal pages are never deleted.
632            */
633           bigkeycnt = 0;
634           skip = *pskip;
635           temp = t->bt_psize - BTDATAOFF;
636           _DBFIT(temp, indx_t);
637           full = (indx_t)temp;
638           half = full / 2;
639           used = 0;
640           for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
641                     if (skip == off) {
642                               _DBFIT(ilen, uint32_t);
643                               nbytes = (uint32_t)ilen;
644                               isbigkey = 0;                 /* XXX: not really known. */
645                     } else
646                               switch (h->flags & P_TYPE) {
647                               case P_BINTERNAL:
648                                         src = bi = GETBINTERNAL(h, nxt);
649                                         nbytes = NBINTERNAL(bi->ksize);
650                                         isbigkey = bi->flags & P_BIGKEY;
651                                         break;
652                               case P_BLEAF:
653                                         src = bl = GETBLEAF(h, nxt);
654                                         nbytes = NBLEAF(bl);
655                                         isbigkey = bl->flags & P_BIGKEY;
656                                         break;
657                               case P_RINTERNAL:
658                                         src = GETRINTERNAL(h, nxt);
659                                         nbytes = NRINTERNAL;
660                                         isbigkey = 0;
661                                         break;
662                               case P_RLEAF:
663                                         src = rl = GETRLEAF(h, nxt);
664                                         nbytes = NRLEAF(rl);
665                                         isbigkey = 0;
666                                         break;
667                               default:
668                                         abort();
669                               }
670 
671                     /*
672                      * If the key/data pairs are substantial fractions of the max
673                      * possible size for the page, it's possible to get situations
674                      * where we decide to try and copy too much onto the left page.
675                      * Make sure that doesn't happen.
676                      */
677                     if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
678                         nxt == top - 1) {
679                               --off;
680                               break;
681                     }
682 
683                     /* Copy the key/data pair, if not the skipped index. */
684                     if (skip != off) {
685                               ++nxt;
686 
687                               l->linp[off] = l->upper -= nbytes;
688                               memmove((char *)(void *)l + l->upper, src, nbytes);
689                     }
690 
691                     temp = nbytes + sizeof(indx_t);
692                     _DBFIT(temp, indx_t);
693                     used += (indx_t)temp;
694                     if (used >= half) {
695                               if (!isbigkey || bigkeycnt == 3)
696                                         break;
697                               else
698                                         ++bigkeycnt;
699                     }
700           }
701 
702           /*
703            * Off is the last offset that's valid for the left page.
704            * Nxt is the first offset to be placed on the right page.
705            */
706           temp = (off + 1) * sizeof(indx_t);
707           _DBFIT(temp, indx_t);
708           l->lower += (indx_t)temp;
709 
710           /*
711            * If splitting the page that the cursor was on, the cursor has to be
712            * adjusted to point to the same record as before the split.  If the
713            * cursor is at or past the skipped slot, the cursor is incremented by
714            * one.  If the cursor is on the right page, it is decremented by the
715            * number of records split to the left page.
716            */
717           c = &t->bt_cursor;
718           if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
719                     if (c->pg.index >= skip)
720                               ++c->pg.index;
721                     if (c->pg.index < nxt)                            /* Left page. */
722                               c->pg.pgno = l->pgno;
723                     else {                                            /* Right page. */
724                               c->pg.pgno = r->pgno;
725                               c->pg.index -= nxt;
726                     }
727           }
728 
729           /*
730            * If the skipped index was on the left page, just return that page.
731            * Otherwise, adjust the skip index to reflect the new position on
732            * the right page.
733            */
734           if (skip <= off) {
735                     skip = MAX_PAGE_OFFSET;
736                     rval = l;
737           } else {
738                     rval = r;
739                     *pskip -= nxt;
740           }
741 
742           for (off = 0; nxt < top; ++off) {
743                     if (skip == nxt) {
744                               ++off;
745                               skip = MAX_PAGE_OFFSET;
746                     }
747                     switch (h->flags & P_TYPE) {
748                     case P_BINTERNAL:
749                               src = bi = GETBINTERNAL(h, nxt);
750                               nbytes = NBINTERNAL(bi->ksize);
751                               break;
752                     case P_BLEAF:
753                               src = bl = GETBLEAF(h, nxt);
754                               nbytes = NBLEAF(bl);
755                               break;
756                     case P_RINTERNAL:
757                               src = GETRINTERNAL(h, nxt);
758                               nbytes = NRINTERNAL;
759                               break;
760                     case P_RLEAF:
761                               src = rl = GETRLEAF(h, nxt);
762                               nbytes = NRLEAF(rl);
763                               break;
764                     default:
765                               abort();
766                     }
767                     ++nxt;
768                     r->linp[off] = r->upper -= nbytes;
769                     memmove((char *)(void *)r + r->upper, src, nbytes);
770           }
771           temp = off * sizeof(indx_t);
772           _DBFIT(temp, indx_t);
773           r->lower += (indx_t)temp;
774 
775           /* If the key is being appended to the page, adjust the index. */
776           if (skip == top)
777                     r->lower += sizeof(indx_t);
778 
779           return (rval);
780 }
781 
782 /*
783  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
784  *
785  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
786  * record that references them gets deleted.  Chains pointed to by internal
787  * pages never get deleted.  This routine marks a chain as pointed to by an
788  * internal page.
789  *
790  * Parameters:
791  *        t:        tree
792  *        pg:       page number of first page in the chain.
793  *
794  * Returns:
795  *        RET_SUCCESS, RET_ERROR.
796  */
797 static int
bt_preserve(BTREE * t,pgno_t pg)798 bt_preserve(BTREE *t, pgno_t pg)
799 {
800           PAGE *h;
801 
802           if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
803                     return (RET_ERROR);
804           h->flags |= P_PRESERVE;
805           mpool_put(t->bt_mp, h, MPOOL_DIRTY);
806           return (RET_SUCCESS);
807 }
808 
809 /*
810  * REC_TOTAL -- Return the number of recno entries below a page.
811  *
812  * Parameters:
813  *        h:        page
814  *
815  * Returns:
816  *        The number of recno entries below a page.
817  *
818  * XXX
819  * These values could be set by the bt_psplit routine.  The problem is that the
820  * entry has to be popped off of the stack etc. or the values have to be passed
821  * all the way back to bt_split/bt_rroot and it's not very clean.
822  */
823 static recno_t
rec_total(PAGE * h)824 rec_total(PAGE *h)
825 {
826           recno_t recs;
827           indx_t nxt, top;
828 
829           for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
830                     recs += GETRINTERNAL(h, nxt)->nrecs;
831           return (recs);
832 }
833