xref: /sqlite-3.40.0/src/btree.c (revision 4dcbdbff)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.264 2005/08/02 17:13:10 drh Exp $
13 **
14 ** This file implements a external (disk-based) database using BTrees.
15 ** For a detailed discussion of BTrees, refer to
16 **
17 **     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18 **     "Sorting And Searching", pages 473-480. Addison-Wesley
19 **     Publishing Company, Reading, Massachusetts.
20 **
21 ** The basic idea is that each page of the file contains N database
22 ** entries and N+1 pointers to subpages.
23 **
24 **   ----------------------------------------------------------------
25 **   |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26 **   ----------------------------------------------------------------
27 **
28 ** All of the keys on the page that Ptr(0) points to have values less
29 ** than Key(0).  All of the keys on page Ptr(1) and its subpages have
30 ** values greater than Key(0) and less than Key(1).  All of the keys
31 ** on Ptr(N+1) and its subpages have values greater than Key(N).  And
32 ** so forth.
33 **
34 ** Finding a particular key requires reading O(log(M)) pages from the
35 ** disk where M is the number of entries in the tree.
36 **
37 ** In this implementation, a single file can hold one or more separate
38 ** BTrees.  Each BTree is identified by the index of its root page.  The
39 ** key and data for any entry are combined to form the "payload".  A
40 ** fixed amount of payload can be carried directly on the database
41 ** page.  If the payload is larger than the preset amount then surplus
42 ** bytes are stored on overflow pages.  The payload for an entry
43 ** and the preceding pointer are combined to form a "Cell".  Each
44 ** page has a small header which contains the Ptr(N+1) pointer and other
45 ** information such as the size of key and data.
46 **
47 ** FORMAT DETAILS
48 **
49 ** The file is divided into pages.  The first page is called page 1,
50 ** the second is page 2, and so forth.  A page number of zero indicates
51 ** "no such page".  The page size can be anything between 512 and 65536.
52 ** Each page can be either a btree page, a freelist page or an overflow
53 ** page.
54 **
55 ** The first page is always a btree page.  The first 100 bytes of the first
56 ** page contain a special header (the "file header") that describes the file.
57 ** The format of the file header is as follows:
58 **
59 **   OFFSET   SIZE    DESCRIPTION
60 **      0      16     Header string: "SQLite format 3\000"
61 **     16       2     Page size in bytes.
62 **     18       1     File format write version
63 **     19       1     File format read version
64 **     20       1     Bytes of unused space at the end of each page
65 **     21       1     Max embedded payload fraction
66 **     22       1     Min embedded payload fraction
67 **     23       1     Min leaf payload fraction
68 **     24       4     File change counter
69 **     28       4     Reserved for future use
70 **     32       4     First freelist page
71 **     36       4     Number of freelist pages in the file
72 **     40      60     15 4-byte meta values passed to higher layers
73 **
74 ** All of the integer values are big-endian (most significant byte first).
75 **
76 ** The file change counter is incremented when the database is changed more
77 ** than once within the same second.  This counter, together with the
78 ** modification time of the file, allows other processes to know
79 ** when the file has changed and thus when they need to flush their
80 ** cache.
81 **
82 ** The max embedded payload fraction is the amount of the total usable
83 ** space in a page that can be consumed by a single cell for standard
84 ** B-tree (non-LEAFDATA) tables.  A value of 255 means 100%.  The default
85 ** is to limit the maximum cell size so that at least 4 cells will fit
86 ** on one page.  Thus the default max embedded payload fraction is 64.
87 **
88 ** If the payload for a cell is larger than the max payload, then extra
89 ** payload is spilled to overflow pages.  Once an overflow page is allocated,
90 ** as many bytes as possible are moved into the overflow pages without letting
91 ** the cell size drop below the min embedded payload fraction.
92 **
93 ** The min leaf payload fraction is like the min embedded payload fraction
94 ** except that it applies to leaf nodes in a LEAFDATA tree.  The maximum
95 ** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
96 ** not specified in the header.
97 **
98 ** Each btree pages is divided into three sections:  The header, the
99 ** cell pointer array, and the cell area area.  Page 1 also has a 100-byte
100 ** file header that occurs before the page header.
101 **
102 **      |----------------|
103 **      | file header    |   100 bytes.  Page 1 only.
104 **      |----------------|
105 **      | page header    |   8 bytes for leaves.  12 bytes for interior nodes
106 **      |----------------|
107 **      | cell pointer   |   |  2 bytes per cell.  Sorted order.
108 **      | array          |   |  Grows downward
109 **      |                |   v
110 **      |----------------|
111 **      | unallocated    |
112 **      | space          |
113 **      |----------------|   ^  Grows upwards
114 **      | cell content   |   |  Arbitrary order interspersed with freeblocks.
115 **      | area           |   |  and free space fragments.
116 **      |----------------|
117 **
118 ** The page headers looks like this:
119 **
120 **   OFFSET   SIZE     DESCRIPTION
121 **      0       1      Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
122 **      1       2      byte offset to the first freeblock
123 **      3       2      number of cells on this page
124 **      5       2      first byte of the cell content area
125 **      7       1      number of fragmented free bytes
126 **      8       4      Right child (the Ptr(N+1) value).  Omitted on leaves.
127 **
128 ** The flags define the format of this btree page.  The leaf flag means that
129 ** this page has no children.  The zerodata flag means that this page carries
130 ** only keys and no data.  The intkey flag means that the key is a integer
131 ** which is stored in the key size entry of the cell header rather than in
132 ** the payload area.
133 **
134 ** The cell pointer array begins on the first byte after the page header.
135 ** The cell pointer array contains zero or more 2-byte numbers which are
136 ** offsets from the beginning of the page to the cell content in the cell
137 ** content area.  The cell pointers occur in sorted order.  The system strives
138 ** to keep free space after the last cell pointer so that new cells can
139 ** be easily added without having to defragment the page.
140 **
141 ** Cell content is stored at the very end of the page and grows toward the
142 ** beginning of the page.
143 **
144 ** Unused space within the cell content area is collected into a linked list of
145 ** freeblocks.  Each freeblock is at least 4 bytes in size.  The byte offset
146 ** to the first freeblock is given in the header.  Freeblocks occur in
147 ** increasing order.  Because a freeblock must be at least 4 bytes in size,
148 ** any group of 3 or fewer unused bytes in the cell content area cannot
149 ** exist on the freeblock chain.  A group of 3 or fewer free bytes is called
150 ** a fragment.  The total number of bytes in all fragments is recorded.
151 ** in the page header at offset 7.
152 **
153 **    SIZE    DESCRIPTION
154 **      2     Byte offset of the next freeblock
155 **      2     Bytes in this freeblock
156 **
157 ** Cells are of variable length.  Cells are stored in the cell content area at
158 ** the end of the page.  Pointers to the cells are in the cell pointer array
159 ** that immediately follows the page header.  Cells is not necessarily
160 ** contiguous or in order, but cell pointers are contiguous and in order.
161 **
162 ** Cell content makes use of variable length integers.  A variable
163 ** length integer is 1 to 9 bytes where the lower 7 bits of each
164 ** byte are used.  The integer consists of all bytes that have bit 8 set and
165 ** the first byte with bit 8 clear.  The most significant byte of the integer
166 ** appears first.  A variable-length integer may not be more than 9 bytes long.
167 ** As a special case, all 8 bytes of the 9th byte are used as data.  This
168 ** allows a 64-bit integer to be encoded in 9 bytes.
169 **
170 **    0x00                      becomes  0x00000000
171 **    0x7f                      becomes  0x0000007f
172 **    0x81 0x00                 becomes  0x00000080
173 **    0x82 0x00                 becomes  0x00000100
174 **    0x80 0x7f                 becomes  0x0000007f
175 **    0x8a 0x91 0xd1 0xac 0x78  becomes  0x12345678
176 **    0x81 0x81 0x81 0x81 0x01  becomes  0x10204081
177 **
178 ** Variable length integers are used for rowids and to hold the number of
179 ** bytes of key and data in a btree cell.
180 **
181 ** The content of a cell looks like this:
182 **
183 **    SIZE    DESCRIPTION
184 **      4     Page number of the left child. Omitted if leaf flag is set.
185 **     var    Number of bytes of data. Omitted if the zerodata flag is set.
186 **     var    Number of bytes of key. Or the key itself if intkey flag is set.
187 **      *     Payload
188 **      4     First page of the overflow chain.  Omitted if no overflow
189 **
190 ** Overflow pages form a linked list.  Each page except the last is completely
191 ** filled with data (pagesize - 4 bytes).  The last page can have as little
192 ** as 1 byte of data.
193 **
194 **    SIZE    DESCRIPTION
195 **      4     Page number of next overflow page
196 **      *     Data
197 **
198 ** Freelist pages come in two subtypes: trunk pages and leaf pages.  The
199 ** file header points to first in a linked list of trunk page.  Each trunk
200 ** page points to multiple leaf pages.  The content of a leaf page is
201 ** unspecified.  A trunk page looks like this:
202 **
203 **    SIZE    DESCRIPTION
204 **      4     Page number of next trunk page
205 **      4     Number of leaf pointers on this page
206 **      *     zero or more pages numbers of leaves
207 */
208 #include "sqliteInt.h"
209 #include "pager.h"
210 #include "btree.h"
211 #include "os.h"
212 #include <assert.h>
213 
214 /* Round up a number to the next larger multiple of 8.  This is used
215 ** to force 8-byte alignment on 64-bit architectures.
216 */
217 #define ROUND8(x)   ((x+7)&~7)
218 
219 
220 /* The following value is the maximum cell size assuming a maximum page
221 ** size give above.
222 */
223 #define MX_CELL_SIZE(pBt)  (pBt->pageSize-8)
224 
225 /* The maximum number of cells on a single page of the database.  This
226 ** assumes a minimum cell size of 3 bytes.  Such small cells will be
227 ** exceedingly rare, but they are possible.
228 */
229 #define MX_CELL(pBt) ((pBt->pageSize-8)/3)
230 
231 /* Forward declarations */
232 typedef struct MemPage MemPage;
233 
234 /*
235 ** This is a magic string that appears at the beginning of every
236 ** SQLite database in order to identify the file as a real database.
237 **
238 ** You can change this value at compile-time by specifying a
239 ** -DSQLITE_FILE_HEADER="..." on the compiler command-line.  The
240 ** header must be exactly 16 bytes including the zero-terminator so
241 ** the string itself should be 15 characters long.  If you change
242 ** the header, then your custom library will not be able to read
243 ** databases generated by the standard tools and the standard tools
244 ** will not be able to read databases created by your custom library.
245 */
246 #ifndef SQLITE_FILE_HEADER /* 123456789 123456 */
247 #  define SQLITE_FILE_HEADER "SQLite format 3"
248 #endif
249 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
250 
251 /*
252 ** Page type flags.  An ORed combination of these flags appear as the
253 ** first byte of every BTree page.
254 */
255 #define PTF_INTKEY    0x01
256 #define PTF_ZERODATA  0x02
257 #define PTF_LEAFDATA  0x04
258 #define PTF_LEAF      0x08
259 
260 /*
261 ** As each page of the file is loaded into memory, an instance of the following
262 ** structure is appended and initialized to zero.  This structure stores
263 ** information about the page that is decoded from the raw file page.
264 **
265 ** The pParent field points back to the parent page.  This allows us to
266 ** walk up the BTree from any leaf to the root.  Care must be taken to
267 ** unref() the parent page pointer when this page is no longer referenced.
268 ** The pageDestructor() routine handles that chore.
269 */
270 struct MemPage {
271   u8 isInit;           /* True if previously initialized. MUST BE FIRST! */
272   u8 idxShift;         /* True if Cell indices have changed */
273   u8 nOverflow;        /* Number of overflow cell bodies in aCell[] */
274   u8 intKey;           /* True if intkey flag is set */
275   u8 leaf;             /* True if leaf flag is set */
276   u8 zeroData;         /* True if table stores keys only */
277   u8 leafData;         /* True if tables stores data on leaves only */
278   u8 hasData;          /* True if this page stores data */
279   u8 hdrOffset;        /* 100 for page 1.  0 otherwise */
280   u8 childPtrSize;     /* 0 if leaf==1.  4 if leaf==0 */
281   u16 maxLocal;        /* Copy of Btree.maxLocal or Btree.maxLeaf */
282   u16 minLocal;        /* Copy of Btree.minLocal or Btree.minLeaf */
283   u16 cellOffset;      /* Index in aData of first cell pointer */
284   u16 idxParent;       /* Index in parent of this node */
285   u16 nFree;           /* Number of free bytes on the page */
286   u16 nCell;           /* Number of cells on this page, local and ovfl */
287   struct _OvflCell {   /* Cells that will not fit on aData[] */
288     u8 *pCell;           /* Pointers to the body of the overflow cell */
289     u16 idx;             /* Insert this cell before idx-th non-overflow cell */
290   } aOvfl[5];
291   struct Btree *pBt;   /* Pointer back to BTree structure */
292   u8 *aData;           /* Pointer back to the start of the page */
293   Pgno pgno;           /* Page number for this page */
294   MemPage *pParent;    /* The parent of this page.  NULL for root */
295 };
296 
297 /*
298 ** The in-memory image of a disk page has the auxiliary information appended
299 ** to the end.  EXTRA_SIZE is the number of bytes of space needed to hold
300 ** that extra information.
301 */
302 #define EXTRA_SIZE sizeof(MemPage)
303 
304 /*
305 ** Everything we need to know about an open database
306 */
307 struct Btree {
308   Pager *pPager;        /* The page cache */
309   BtCursor *pCursor;    /* A list of all open cursors */
310   MemPage *pPage1;      /* First page of the database */
311   u8 inTrans;           /* True if a transaction is in progress */
312   u8 inStmt;            /* True if we are in a statement subtransaction */
313   u8 readOnly;          /* True if the underlying file is readonly */
314   u8 maxEmbedFrac;      /* Maximum payload as % of total page size */
315   u8 minEmbedFrac;      /* Minimum payload as % of total page size */
316   u8 minLeafFrac;       /* Minimum leaf payload as % of total page size */
317   u8 pageSizeFixed;     /* True if the page size can no longer be changed */
318 #ifndef SQLITE_OMIT_AUTOVACUUM
319   u8 autoVacuum;        /* True if database supports auto-vacuum */
320 #endif
321   u16 pageSize;         /* Total number of bytes on a page */
322   u16 usableSize;       /* Number of usable bytes on each page */
323   int maxLocal;         /* Maximum local payload in non-LEAFDATA tables */
324   int minLocal;         /* Minimum local payload in non-LEAFDATA tables */
325   int maxLeaf;          /* Maximum local payload in a LEAFDATA table */
326   int minLeaf;          /* Minimum local payload in a LEAFDATA table */
327   BusyHandler *pBusyHandler;   /* Callback for when there is lock contention */
328 };
329 typedef Btree Bt;
330 
331 /*
332 ** Btree.inTrans may take one of the following values.
333 */
334 #define TRANS_NONE  0
335 #define TRANS_READ  1
336 #define TRANS_WRITE 2
337 
338 /*
339 ** An instance of the following structure is used to hold information
340 ** about a cell.  The parseCellPtr() function fills in this structure
341 ** based on information extract from the raw disk page.
342 */
343 typedef struct CellInfo CellInfo;
344 struct CellInfo {
345   u8 *pCell;     /* Pointer to the start of cell content */
346   i64 nKey;      /* The key for INTKEY tables, or number of bytes in key */
347   u32 nData;     /* Number of bytes of data */
348   u16 nHeader;   /* Size of the cell content header in bytes */
349   u16 nLocal;    /* Amount of payload held locally */
350   u16 iOverflow; /* Offset to overflow page number.  Zero if no overflow */
351   u16 nSize;     /* Size of the cell content on the main b-tree page */
352 };
353 
354 /*
355 ** A cursor is a pointer to a particular entry in the BTree.
356 ** The entry is identified by its MemPage and the index in
357 ** MemPage.aCell[] of the entry.
358 */
359 struct BtCursor {
360   Btree *pBt;               /* The Btree to which this cursor belongs */
361   BtCursor *pNext, *pPrev;  /* Forms a linked list of all cursors */
362   int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
363   void *pArg;               /* First arg to xCompare() */
364   Pgno pgnoRoot;            /* The root page of this tree */
365   MemPage *pPage;           /* Page that contains the entry */
366   int idx;                  /* Index of the entry in pPage->aCell[] */
367   CellInfo info;            /* A parse of the cell we are pointing at */
368   u8 wrFlag;                /* True if writable */
369   u8 isValid;               /* TRUE if points to a valid entry */
370 };
371 
372 /*
373 ** The TRACE macro will print high-level status information about the
374 ** btree operation when the global variable sqlite3_btree_trace is
375 ** enabled.
376 */
377 #if SQLITE_TEST
378 # define TRACE(X)   if( sqlite3_btree_trace )\
379                         { sqlite3DebugPrintf X; fflush(stdout); }
380 #else
381 # define TRACE(X)
382 #endif
383 int sqlite3_btree_trace=0;  /* True to enable tracing */
384 
385 /*
386 ** Forward declaration
387 */
388 static int checkReadLocks(Btree*,Pgno,BtCursor*);
389 
390 /*
391 ** Read or write a two- and four-byte big-endian integer values.
392 */
393 static u32 get2byte(unsigned char *p){
394   return (p[0]<<8) | p[1];
395 }
396 static u32 get4byte(unsigned char *p){
397   return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
398 }
399 static void put2byte(unsigned char *p, u32 v){
400   p[0] = v>>8;
401   p[1] = v;
402 }
403 static void put4byte(unsigned char *p, u32 v){
404   p[0] = v>>24;
405   p[1] = v>>16;
406   p[2] = v>>8;
407   p[3] = v;
408 }
409 
410 /*
411 ** Routines to read and write variable-length integers.  These used to
412 ** be defined locally, but now we use the varint routines in the util.c
413 ** file.
414 */
415 #define getVarint    sqlite3GetVarint
416 #define getVarint32  sqlite3GetVarint32
417 #define putVarint    sqlite3PutVarint
418 
419 /* The database page the PENDING_BYTE occupies. This page is never used.
420 ** TODO: This macro is very similary to PAGER_MJ_PGNO() in pager.c. They
421 ** should possibly be consolidated (presumably in pager.h).
422 */
423 #define PENDING_BYTE_PAGE(pBt) ((PENDING_BYTE/(pBt)->pageSize)+1)
424 
425 #ifndef SQLITE_OMIT_AUTOVACUUM
426 /*
427 ** These macros define the location of the pointer-map entry for a
428 ** database page. The first argument to each is the number of usable
429 ** bytes on each page of the database (often 1024). The second is the
430 ** page number to look up in the pointer map.
431 **
432 ** PTRMAP_PAGENO returns the database page number of the pointer-map
433 ** page that stores the required pointer. PTRMAP_PTROFFSET returns
434 ** the offset of the requested map entry.
435 **
436 ** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
437 ** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
438 ** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
439 ** this test.
440 */
441 #define PTRMAP_PAGENO(pgsz, pgno) (((pgno-2)/(pgsz/5+1))*(pgsz/5+1)+2)
442 #define PTRMAP_PTROFFSET(pgsz, pgno) (((pgno-2)%(pgsz/5+1)-1)*5)
443 #define PTRMAP_ISPAGE(pgsz, pgno) (PTRMAP_PAGENO(pgsz,pgno)==pgno)
444 
445 /*
446 ** The pointer map is a lookup table that identifies the parent page for
447 ** each child page in the database file.  The parent page is the page that
448 ** contains a pointer to the child.  Every page in the database contains
449 ** 0 or 1 parent pages.  (In this context 'database page' refers
450 ** to any page that is not part of the pointer map itself.)  Each pointer map
451 ** entry consists of a single byte 'type' and a 4 byte parent page number.
452 ** The PTRMAP_XXX identifiers below are the valid types.
453 **
454 ** The purpose of the pointer map is to facility moving pages from one
455 ** position in the file to another as part of autovacuum.  When a page
456 ** is moved, the pointer in its parent must be updated to point to the
457 ** new location.  The pointer map is used to locate the parent page quickly.
458 **
459 ** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not
460 **                  used in this case.
461 **
462 ** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number
463 **                  is not used in this case.
464 **
465 ** PTRMAP_OVERFLOW1: The database page is the first page in a list of
466 **                   overflow pages. The page number identifies the page that
467 **                   contains the cell with a pointer to this overflow page.
468 **
469 ** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of
470 **                   overflow pages. The page-number identifies the previous
471 **                   page in the overflow page list.
472 **
473 ** PTRMAP_BTREE: The database page is a non-root btree page. The page number
474 **               identifies the parent page in the btree.
475 */
476 #define PTRMAP_ROOTPAGE 1
477 #define PTRMAP_FREEPAGE 2
478 #define PTRMAP_OVERFLOW1 3
479 #define PTRMAP_OVERFLOW2 4
480 #define PTRMAP_BTREE 5
481 
482 /*
483 ** Write an entry into the pointer map.
484 **
485 ** This routine updates the pointer map entry for page number 'key'
486 ** so that it maps to type 'eType' and parent page number 'pgno'.
487 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
488 */
489 static int ptrmapPut(Btree *pBt, Pgno key, u8 eType, Pgno parent){
490   u8 *pPtrmap;    /* The pointer map page */
491   Pgno iPtrmap;   /* The pointer map page number */
492   int offset;     /* Offset in pointer map page */
493   int rc;
494 
495   assert( pBt->autoVacuum );
496   if( key==0 ){
497     return SQLITE_CORRUPT;
498   }
499   iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
500   rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
501   if( rc!=SQLITE_OK ){
502     return rc;
503   }
504   offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
505 
506   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
507     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
508     rc = sqlite3pager_write(pPtrmap);
509     if( rc==SQLITE_OK ){
510       pPtrmap[offset] = eType;
511       put4byte(&pPtrmap[offset+1], parent);
512     }
513   }
514 
515   sqlite3pager_unref(pPtrmap);
516   return rc;
517 }
518 
519 /*
520 ** Read an entry from the pointer map.
521 **
522 ** This routine retrieves the pointer map entry for page 'key', writing
523 ** the type and parent page number to *pEType and *pPgno respectively.
524 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
525 */
526 static int ptrmapGet(Btree *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
527   int iPtrmap;       /* Pointer map page index */
528   u8 *pPtrmap;       /* Pointer map page data */
529   int offset;        /* Offset of entry in pointer map */
530   int rc;
531 
532   iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
533   rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
534   if( rc!=0 ){
535     return rc;
536   }
537 
538   offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
539   if( pEType ) *pEType = pPtrmap[offset];
540   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
541 
542   sqlite3pager_unref(pPtrmap);
543   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT;
544   return SQLITE_OK;
545 }
546 
547 #endif /* SQLITE_OMIT_AUTOVACUUM */
548 
549 /*
550 ** Given a btree page and a cell index (0 means the first cell on
551 ** the page, 1 means the second cell, and so forth) return a pointer
552 ** to the cell content.
553 **
554 ** This routine works only for pages that do not contain overflow cells.
555 */
556 static u8 *findCell(MemPage *pPage, int iCell){
557   u8 *data = pPage->aData;
558   assert( iCell>=0 );
559   assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
560   return data + get2byte(&data[pPage->cellOffset+2*iCell]);
561 }
562 
563 /*
564 ** This a more complex version of findCell() that works for
565 ** pages that do contain overflow cells.  See insert
566 */
567 static u8 *findOverflowCell(MemPage *pPage, int iCell){
568   int i;
569   for(i=pPage->nOverflow-1; i>=0; i--){
570     int k;
571     struct _OvflCell *pOvfl;
572     pOvfl = &pPage->aOvfl[i];
573     k = pOvfl->idx;
574     if( k<=iCell ){
575       if( k==iCell ){
576         return pOvfl->pCell;
577       }
578       iCell--;
579     }
580   }
581   return findCell(pPage, iCell);
582 }
583 
584 /*
585 ** Parse a cell content block and fill in the CellInfo structure.  There
586 ** are two versions of this function.  parseCell() takes a cell index
587 ** as the second argument and parseCellPtr() takes a pointer to the
588 ** body of the cell as its second argument.
589 */
590 static void parseCellPtr(
591   MemPage *pPage,         /* Page containing the cell */
592   u8 *pCell,              /* Pointer to the cell text. */
593   CellInfo *pInfo         /* Fill in this structure */
594 ){
595   int n;                  /* Number bytes in cell content header */
596   u32 nPayload;           /* Number of bytes of cell payload */
597 
598   pInfo->pCell = pCell;
599   assert( pPage->leaf==0 || pPage->leaf==1 );
600   n = pPage->childPtrSize;
601   assert( n==4-4*pPage->leaf );
602   if( pPage->hasData ){
603     n += getVarint32(&pCell[n], &nPayload);
604   }else{
605     nPayload = 0;
606   }
607   n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
608   pInfo->nHeader = n;
609   pInfo->nData = nPayload;
610   if( !pPage->intKey ){
611     nPayload += pInfo->nKey;
612   }
613   if( nPayload<=pPage->maxLocal ){
614     /* This is the (easy) common case where the entire payload fits
615     ** on the local page.  No overflow is required.
616     */
617     int nSize;          /* Total size of cell content in bytes */
618     pInfo->nLocal = nPayload;
619     pInfo->iOverflow = 0;
620     nSize = nPayload + n;
621     if( nSize<4 ){
622       nSize = 4;        /* Minimum cell size is 4 */
623     }
624     pInfo->nSize = nSize;
625   }else{
626     /* If the payload will not fit completely on the local page, we have
627     ** to decide how much to store locally and how much to spill onto
628     ** overflow pages.  The strategy is to minimize the amount of unused
629     ** space on overflow pages while keeping the amount of local storage
630     ** in between minLocal and maxLocal.
631     **
632     ** Warning:  changing the way overflow payload is distributed in any
633     ** way will result in an incompatible file format.
634     */
635     int minLocal;  /* Minimum amount of payload held locally */
636     int maxLocal;  /* Maximum amount of payload held locally */
637     int surplus;   /* Overflow payload available for local storage */
638 
639     minLocal = pPage->minLocal;
640     maxLocal = pPage->maxLocal;
641     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
642     if( surplus <= maxLocal ){
643       pInfo->nLocal = surplus;
644     }else{
645       pInfo->nLocal = minLocal;
646     }
647     pInfo->iOverflow = pInfo->nLocal + n;
648     pInfo->nSize = pInfo->iOverflow + 4;
649   }
650 }
651 static void parseCell(
652   MemPage *pPage,         /* Page containing the cell */
653   int iCell,              /* The cell index.  First cell is 0 */
654   CellInfo *pInfo         /* Fill in this structure */
655 ){
656   parseCellPtr(pPage, findCell(pPage, iCell), pInfo);
657 }
658 
659 /*
660 ** Compute the total number of bytes that a Cell needs in the cell
661 ** data area of the btree-page.  The return number includes the cell
662 ** data header and the local payload, but not any overflow page or
663 ** the space used by the cell pointer.
664 */
665 #ifndef NDEBUG
666 static int cellSize(MemPage *pPage, int iCell){
667   CellInfo info;
668   parseCell(pPage, iCell, &info);
669   return info.nSize;
670 }
671 #endif
672 static int cellSizePtr(MemPage *pPage, u8 *pCell){
673   CellInfo info;
674   parseCellPtr(pPage, pCell, &info);
675   return info.nSize;
676 }
677 
678 #ifndef SQLITE_OMIT_AUTOVACUUM
679 /*
680 ** If the cell pCell, part of page pPage contains a pointer
681 ** to an overflow page, insert an entry into the pointer-map
682 ** for the overflow page.
683 */
684 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
685   if( pCell ){
686     CellInfo info;
687     parseCellPtr(pPage, pCell, &info);
688     if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
689       Pgno ovfl = get4byte(&pCell[info.iOverflow]);
690       return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
691     }
692   }
693   return SQLITE_OK;
694 }
695 /*
696 ** If the cell with index iCell on page pPage contains a pointer
697 ** to an overflow page, insert an entry into the pointer-map
698 ** for the overflow page.
699 */
700 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
701   u8 *pCell;
702   pCell = findOverflowCell(pPage, iCell);
703   return ptrmapPutOvflPtr(pPage, pCell);
704 }
705 #endif
706 
707 
708 /*
709 ** Do sanity checking on a page.  Throw an exception if anything is
710 ** not right.
711 **
712 ** This routine is used for internal error checking only.  It is omitted
713 ** from most builds.
714 */
715 #if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
716 static void _pageIntegrity(MemPage *pPage){
717   int usableSize;
718   u8 *data;
719   int i, j, idx, c, pc, hdr, nFree;
720   int cellOffset;
721   int nCell, cellLimit;
722   u8 *used;
723 
724   used = sqliteMallocRaw( pPage->pBt->pageSize );
725   if( used==0 ) return;
726   usableSize = pPage->pBt->usableSize;
727   assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] );
728   hdr = pPage->hdrOffset;
729   assert( hdr==(pPage->pgno==1 ? 100 : 0) );
730   assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
731   c = pPage->aData[hdr];
732   if( pPage->isInit ){
733     assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
734     assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
735     assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) );
736     assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) );
737     assert( pPage->hasData ==
738              !(pPage->zeroData || (!pPage->leaf && pPage->leafData)) );
739     assert( pPage->cellOffset==pPage->hdrOffset+12-4*pPage->leaf );
740     assert( pPage->nCell = get2byte(&pPage->aData[hdr+3]) );
741   }
742   data = pPage->aData;
743   memset(used, 0, usableSize);
744   for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
745   nFree = 0;
746   pc = get2byte(&data[hdr+1]);
747   while( pc ){
748     int size;
749     assert( pc>0 && pc<usableSize-4 );
750     size = get2byte(&data[pc+2]);
751     assert( pc+size<=usableSize );
752     nFree += size;
753     for(i=pc; i<pc+size; i++){
754       assert( used[i]==0 );
755       used[i] = 1;
756     }
757     pc = get2byte(&data[pc]);
758   }
759   idx = 0;
760   nCell = get2byte(&data[hdr+3]);
761   cellLimit = get2byte(&data[hdr+5]);
762   assert( pPage->isInit==0
763          || pPage->nFree==nFree+data[hdr+7]+cellLimit-(cellOffset+2*nCell) );
764   cellOffset = pPage->cellOffset;
765   for(i=0; i<nCell; i++){
766     int size;
767     pc = get2byte(&data[cellOffset+2*i]);
768     assert( pc>0 && pc<usableSize-4 );
769     size = cellSize(pPage, &data[pc]);
770     assert( pc+size<=usableSize );
771     for(j=pc; j<pc+size; j++){
772       assert( used[j]==0 );
773       used[j] = 1;
774     }
775   }
776   for(i=cellOffset+2*nCell; i<cellimit; i++){
777     assert( used[i]==0 );
778     used[i] = 1;
779   }
780   nFree = 0;
781   for(i=0; i<usableSize; i++){
782     assert( used[i]<=1 );
783     if( used[i]==0 ) nFree++;
784   }
785   assert( nFree==data[hdr+7] );
786   sqliteFree(used);
787 }
788 #define pageIntegrity(X) _pageIntegrity(X)
789 #else
790 # define pageIntegrity(X)
791 #endif
792 
793 /*
794 ** Defragment the page given.  All Cells are moved to the
795 ** beginning of the page and all free space is collected
796 ** into one big FreeBlk at the end of the page.
797 */
798 static int defragmentPage(MemPage *pPage){
799   int i;                     /* Loop counter */
800   int pc;                    /* Address of a i-th cell */
801   int addr;                  /* Offset of first byte after cell pointer array */
802   int hdr;                   /* Offset to the page header */
803   int size;                  /* Size of a cell */
804   int usableSize;            /* Number of usable bytes on a page */
805   int cellOffset;            /* Offset to the cell pointer array */
806   int brk;                   /* Offset to the cell content area */
807   int nCell;                 /* Number of cells on the page */
808   unsigned char *data;       /* The page data */
809   unsigned char *temp;       /* Temp area for cell content */
810 
811   assert( sqlite3pager_iswriteable(pPage->aData) );
812   assert( pPage->pBt!=0 );
813   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
814   assert( pPage->nOverflow==0 );
815   temp = sqliteMalloc( pPage->pBt->pageSize );
816   if( temp==0 ) return SQLITE_NOMEM;
817   data = pPage->aData;
818   hdr = pPage->hdrOffset;
819   cellOffset = pPage->cellOffset;
820   nCell = pPage->nCell;
821   assert( nCell==get2byte(&data[hdr+3]) );
822   usableSize = pPage->pBt->usableSize;
823   brk = get2byte(&data[hdr+5]);
824   memcpy(&temp[brk], &data[brk], usableSize - brk);
825   brk = usableSize;
826   for(i=0; i<nCell; i++){
827     u8 *pAddr;     /* The i-th cell pointer */
828     pAddr = &data[cellOffset + i*2];
829     pc = get2byte(pAddr);
830     assert( pc<pPage->pBt->usableSize );
831     size = cellSizePtr(pPage, &temp[pc]);
832     brk -= size;
833     memcpy(&data[brk], &temp[pc], size);
834     put2byte(pAddr, brk);
835   }
836   assert( brk>=cellOffset+2*nCell );
837   put2byte(&data[hdr+5], brk);
838   data[hdr+1] = 0;
839   data[hdr+2] = 0;
840   data[hdr+7] = 0;
841   addr = cellOffset+2*nCell;
842   memset(&data[addr], 0, brk-addr);
843   sqliteFree(temp);
844   return SQLITE_OK;
845 }
846 
847 /*
848 ** Allocate nByte bytes of space on a page.
849 **
850 ** Return the index into pPage->aData[] of the first byte of
851 ** the new allocation. Or return 0 if there is not enough free
852 ** space on the page to satisfy the allocation request.
853 **
854 ** If the page contains nBytes of free space but does not contain
855 ** nBytes of contiguous free space, then this routine automatically
856 ** calls defragementPage() to consolidate all free space before
857 ** allocating the new chunk.
858 */
859 static int allocateSpace(MemPage *pPage, int nByte){
860   int addr, pc, hdr;
861   int size;
862   int nFrag;
863   int top;
864   int nCell;
865   int cellOffset;
866   unsigned char *data;
867 
868   data = pPage->aData;
869   assert( sqlite3pager_iswriteable(data) );
870   assert( pPage->pBt );
871   if( nByte<4 ) nByte = 4;
872   if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
873   pPage->nFree -= nByte;
874   hdr = pPage->hdrOffset;
875 
876   nFrag = data[hdr+7];
877   if( nFrag<60 ){
878     /* Search the freelist looking for a slot big enough to satisfy the
879     ** space request. */
880     addr = hdr+1;
881     while( (pc = get2byte(&data[addr]))>0 ){
882       size = get2byte(&data[pc+2]);
883       if( size>=nByte ){
884         if( size<nByte+4 ){
885           memcpy(&data[addr], &data[pc], 2);
886           data[hdr+7] = nFrag + size - nByte;
887           return pc;
888         }else{
889           put2byte(&data[pc+2], size-nByte);
890           return pc + size - nByte;
891         }
892       }
893       addr = pc;
894     }
895   }
896 
897   /* Allocate memory from the gap in between the cell pointer array
898   ** and the cell content area.
899   */
900   top = get2byte(&data[hdr+5]);
901   nCell = get2byte(&data[hdr+3]);
902   cellOffset = pPage->cellOffset;
903   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
904     if( defragmentPage(pPage) ) return 0;
905     top = get2byte(&data[hdr+5]);
906   }
907   top -= nByte;
908   assert( cellOffset + 2*nCell <= top );
909   put2byte(&data[hdr+5], top);
910   return top;
911 }
912 
913 /*
914 ** Return a section of the pPage->aData to the freelist.
915 ** The first byte of the new free block is pPage->aDisk[start]
916 ** and the size of the block is "size" bytes.
917 **
918 ** Most of the effort here is involved in coalesing adjacent
919 ** free blocks into a single big free block.
920 */
921 static void freeSpace(MemPage *pPage, int start, int size){
922   int addr, pbegin, hdr;
923   unsigned char *data = pPage->aData;
924 
925   assert( pPage->pBt!=0 );
926   assert( sqlite3pager_iswriteable(data) );
927   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
928   assert( (start + size)<=pPage->pBt->usableSize );
929   if( size<4 ) size = 4;
930 
931   /* Add the space back into the linked list of freeblocks */
932   hdr = pPage->hdrOffset;
933   addr = hdr + 1;
934   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
935     assert( pbegin<=pPage->pBt->usableSize-4 );
936     assert( pbegin>addr );
937     addr = pbegin;
938   }
939   assert( pbegin<=pPage->pBt->usableSize-4 );
940   assert( pbegin>addr || pbegin==0 );
941   put2byte(&data[addr], start);
942   put2byte(&data[start], pbegin);
943   put2byte(&data[start+2], size);
944   pPage->nFree += size;
945 
946   /* Coalesce adjacent free blocks */
947   addr = pPage->hdrOffset + 1;
948   while( (pbegin = get2byte(&data[addr]))>0 ){
949     int pnext, psize;
950     assert( pbegin>addr );
951     assert( pbegin<=pPage->pBt->usableSize-4 );
952     pnext = get2byte(&data[pbegin]);
953     psize = get2byte(&data[pbegin+2]);
954     if( pbegin + psize + 3 >= pnext && pnext>0 ){
955       int frag = pnext - (pbegin+psize);
956       assert( frag<=data[pPage->hdrOffset+7] );
957       data[pPage->hdrOffset+7] -= frag;
958       put2byte(&data[pbegin], get2byte(&data[pnext]));
959       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
960     }else{
961       addr = pbegin;
962     }
963   }
964 
965   /* If the cell content area begins with a freeblock, remove it. */
966   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
967     int top;
968     pbegin = get2byte(&data[hdr+1]);
969     memcpy(&data[hdr+1], &data[pbegin], 2);
970     top = get2byte(&data[hdr+5]);
971     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
972   }
973 }
974 
975 /*
976 ** Decode the flags byte (the first byte of the header) for a page
977 ** and initialize fields of the MemPage structure accordingly.
978 */
979 static void decodeFlags(MemPage *pPage, int flagByte){
980   Btree *pBt;     /* A copy of pPage->pBt */
981 
982   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
983   pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
984   pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
985   pPage->leaf = (flagByte & PTF_LEAF)!=0;
986   pPage->childPtrSize = 4*(pPage->leaf==0);
987   pBt = pPage->pBt;
988   if( flagByte & PTF_LEAFDATA ){
989     pPage->leafData = 1;
990     pPage->maxLocal = pBt->maxLeaf;
991     pPage->minLocal = pBt->minLeaf;
992   }else{
993     pPage->leafData = 0;
994     pPage->maxLocal = pBt->maxLocal;
995     pPage->minLocal = pBt->minLocal;
996   }
997   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
998 }
999 
1000 /*
1001 ** Initialize the auxiliary information for a disk block.
1002 **
1003 ** The pParent parameter must be a pointer to the MemPage which
1004 ** is the parent of the page being initialized.  The root of a
1005 ** BTree has no parent and so for that page, pParent==NULL.
1006 **
1007 ** Return SQLITE_OK on success.  If we see that the page does
1008 ** not contain a well-formed database page, then return
1009 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1010 ** guarantee that the page is well-formed.  It only shows that
1011 ** we failed to detect any corruption.
1012 */
1013 static int initPage(
1014   MemPage *pPage,        /* The page to be initialized */
1015   MemPage *pParent       /* The parent.  Might be NULL */
1016 ){
1017   int pc;            /* Address of a freeblock within pPage->aData[] */
1018   int hdr;           /* Offset to beginning of page header */
1019   u8 *data;          /* Equal to pPage->aData */
1020   Btree *pBt;        /* The main btree structure */
1021   int usableSize;    /* Amount of usable space on each page */
1022   int cellOffset;    /* Offset from start of page to first cell pointer */
1023   int nFree;         /* Number of unused bytes on the page */
1024   int top;           /* First byte of the cell content area */
1025 
1026   pBt = pPage->pBt;
1027   assert( pBt!=0 );
1028   assert( pParent==0 || pParent->pBt==pBt );
1029   assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
1030   assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] );
1031   if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
1032     /* The parent page should never change unless the file is corrupt */
1033     return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1034   }
1035   if( pPage->isInit ) return SQLITE_OK;
1036   if( pPage->pParent==0 && pParent!=0 ){
1037     pPage->pParent = pParent;
1038     sqlite3pager_ref(pParent->aData);
1039   }
1040   hdr = pPage->hdrOffset;
1041   data = pPage->aData;
1042   decodeFlags(pPage, data[hdr]);
1043   pPage->nOverflow = 0;
1044   pPage->idxShift = 0;
1045   usableSize = pBt->usableSize;
1046   pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1047   top = get2byte(&data[hdr+5]);
1048   pPage->nCell = get2byte(&data[hdr+3]);
1049   if( pPage->nCell>MX_CELL(pBt) ){
1050     /* To many cells for a single page.  The page must be corrupt */
1051     return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1052   }
1053   if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
1054     /* All pages must have at least one cell, except for root pages */
1055     return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1056   }
1057 
1058   /* Compute the total free space on the page */
1059   pc = get2byte(&data[hdr+1]);
1060   nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
1061   while( pc>0 ){
1062     int next, size;
1063     if( pc>usableSize-4 ){
1064       /* Free block is off the page */
1065       return SQLITE_CORRUPT;  /* bkpt-CORRUPT */
1066     }
1067     next = get2byte(&data[pc]);
1068     size = get2byte(&data[pc+2]);
1069     if( next>0 && next<=pc+size+3 ){
1070       /* Free blocks must be in accending order */
1071       return SQLITE_CORRUPT;  /* bkpt-CORRUPT */
1072     }
1073     nFree += size;
1074     pc = next;
1075   }
1076   pPage->nFree = nFree;
1077   if( nFree>=usableSize ){
1078     /* Free space cannot exceed total page size */
1079     return SQLITE_CORRUPT;  /* bkpt-CORRUPT */
1080   }
1081 
1082   pPage->isInit = 1;
1083   pageIntegrity(pPage);
1084   return SQLITE_OK;
1085 }
1086 
1087 /*
1088 ** Set up a raw page so that it looks like a database page holding
1089 ** no entries.
1090 */
1091 static void zeroPage(MemPage *pPage, int flags){
1092   unsigned char *data = pPage->aData;
1093   Btree *pBt = pPage->pBt;
1094   int hdr = pPage->hdrOffset;
1095   int first;
1096 
1097   assert( sqlite3pager_pagenumber(data)==pPage->pgno );
1098   assert( &data[pBt->pageSize] == (unsigned char*)pPage );
1099   assert( sqlite3pager_iswriteable(data) );
1100   memset(&data[hdr], 0, pBt->usableSize - hdr);
1101   data[hdr] = flags;
1102   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1103   memset(&data[hdr+1], 0, 4);
1104   data[hdr+7] = 0;
1105   put2byte(&data[hdr+5], pBt->usableSize);
1106   pPage->nFree = pBt->usableSize - first;
1107   decodeFlags(pPage, flags);
1108   pPage->hdrOffset = hdr;
1109   pPage->cellOffset = first;
1110   pPage->nOverflow = 0;
1111   pPage->idxShift = 0;
1112   pPage->nCell = 0;
1113   pPage->isInit = 1;
1114   pageIntegrity(pPage);
1115 }
1116 
1117 /*
1118 ** Get a page from the pager.  Initialize the MemPage.pBt and
1119 ** MemPage.aData elements if needed.
1120 */
1121 static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
1122   int rc;
1123   unsigned char *aData;
1124   MemPage *pPage;
1125   rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
1126   if( rc ) return rc;
1127   pPage = (MemPage*)&aData[pBt->pageSize];
1128   pPage->aData = aData;
1129   pPage->pBt = pBt;
1130   pPage->pgno = pgno;
1131   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1132   *ppPage = pPage;
1133   return SQLITE_OK;
1134 }
1135 
1136 /*
1137 ** Get a page from the pager and initialize it.  This routine
1138 ** is just a convenience wrapper around separate calls to
1139 ** getPage() and initPage().
1140 */
1141 static int getAndInitPage(
1142   Btree *pBt,          /* The database file */
1143   Pgno pgno,           /* Number of the page to get */
1144   MemPage **ppPage,    /* Write the page pointer here */
1145   MemPage *pParent     /* Parent of the page */
1146 ){
1147   int rc;
1148   if( pgno==0 ){
1149     return SQLITE_CORRUPT;  /* bkpt-CORRUPT */
1150   }
1151   rc = getPage(pBt, pgno, ppPage);
1152   if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1153     rc = initPage(*ppPage, pParent);
1154   }
1155   return rc;
1156 }
1157 
1158 /*
1159 ** Release a MemPage.  This should be called once for each prior
1160 ** call to getPage.
1161 */
1162 static void releasePage(MemPage *pPage){
1163   if( pPage ){
1164     assert( pPage->aData );
1165     assert( pPage->pBt );
1166     assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
1167     sqlite3pager_unref(pPage->aData);
1168   }
1169 }
1170 
1171 /*
1172 ** This routine is called when the reference count for a page
1173 ** reaches zero.  We need to unref the pParent pointer when that
1174 ** happens.
1175 */
1176 static void pageDestructor(void *pData, int pageSize){
1177   MemPage *pPage;
1178   assert( (pageSize & 7)==0 );
1179   pPage = (MemPage*)&((char*)pData)[pageSize];
1180   if( pPage->pParent ){
1181     MemPage *pParent = pPage->pParent;
1182     pPage->pParent = 0;
1183     releasePage(pParent);
1184   }
1185   pPage->isInit = 0;
1186 }
1187 
1188 /*
1189 ** During a rollback, when the pager reloads information into the cache
1190 ** so that the cache is restored to its original state at the start of
1191 ** the transaction, for each page restored this routine is called.
1192 **
1193 ** This routine needs to reset the extra data section at the end of the
1194 ** page to agree with the restored data.
1195 */
1196 static void pageReinit(void *pData, int pageSize){
1197   MemPage *pPage;
1198   assert( (pageSize & 7)==0 );
1199   pPage = (MemPage*)&((char*)pData)[pageSize];
1200   if( pPage->isInit ){
1201     pPage->isInit = 0;
1202     initPage(pPage, pPage->pParent);
1203   }
1204 }
1205 
1206 /*
1207 ** Open a database file.
1208 **
1209 ** zFilename is the name of the database file.  If zFilename is NULL
1210 ** a new database with a random name is created.  This randomly named
1211 ** database file will be deleted when sqlite3BtreeClose() is called.
1212 */
1213 int sqlite3BtreeOpen(
1214   const char *zFilename,  /* Name of the file containing the BTree database */
1215   Btree **ppBtree,        /* Pointer to new Btree object written here */
1216   int flags               /* Options */
1217 ){
1218   Btree *pBt;
1219   int rc;
1220   int nReserve;
1221   unsigned char zDbHeader[100];
1222 
1223   /*
1224   ** The following asserts make sure that structures used by the btree are
1225   ** the right size.  This is to guard against size changes that result
1226   ** when compiling on a different architecture.
1227   */
1228   assert( sizeof(i64)==8 );
1229   assert( sizeof(u64)==8 );
1230   assert( sizeof(u32)==4 );
1231   assert( sizeof(u16)==2 );
1232   assert( sizeof(Pgno)==4 );
1233 
1234   pBt = sqliteMalloc( sizeof(*pBt) );
1235   if( pBt==0 ){
1236     *ppBtree = 0;
1237     return SQLITE_NOMEM;
1238   }
1239   rc = sqlite3pager_open(&pBt->pPager, zFilename, EXTRA_SIZE, flags);
1240   if( rc!=SQLITE_OK ){
1241     if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
1242     sqliteFree(pBt);
1243     *ppBtree = 0;
1244     return rc;
1245   }
1246   sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
1247   sqlite3pager_set_reiniter(pBt->pPager, pageReinit);
1248   pBt->pCursor = 0;
1249   pBt->pPage1 = 0;
1250   pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
1251   sqlite3pager_read_fileheader(pBt->pPager, sizeof(zDbHeader), zDbHeader);
1252   pBt->pageSize = get2byte(&zDbHeader[16]);
1253   if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1254        || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1255     pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1256     pBt->maxEmbedFrac = 64;   /* 25% */
1257     pBt->minEmbedFrac = 32;   /* 12.5% */
1258     pBt->minLeafFrac = 32;    /* 12.5% */
1259 #ifndef SQLITE_OMIT_AUTOVACUUM
1260     /* If the magic name ":memory:" will create an in-memory database, then
1261     ** do not set the auto-vacuum flag, even if SQLITE_DEFAULT_AUTOVACUUM
1262     ** is true. On the other hand, if SQLITE_OMIT_MEMORYDB has been defined,
1263     ** then ":memory:" is just a regular file-name. Respect the auto-vacuum
1264     ** default in this case.
1265     */
1266 #ifndef SQLITE_OMIT_MEMORYDB
1267     if( zFilename && strcmp(zFilename,":memory:") ){
1268 #else
1269     if( zFilename ){
1270 #endif
1271       pBt->autoVacuum = SQLITE_DEFAULT_AUTOVACUUM;
1272     }
1273 #endif
1274     nReserve = 0;
1275   }else{
1276     nReserve = zDbHeader[20];
1277     pBt->maxEmbedFrac = zDbHeader[21];
1278     pBt->minEmbedFrac = zDbHeader[22];
1279     pBt->minLeafFrac = zDbHeader[23];
1280     pBt->pageSizeFixed = 1;
1281 #ifndef SQLITE_OMIT_AUTOVACUUM
1282     pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1283 #endif
1284   }
1285   pBt->usableSize = pBt->pageSize - nReserve;
1286   assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1287   sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize);
1288   *ppBtree = pBt;
1289   return SQLITE_OK;
1290 }
1291 
1292 /*
1293 ** Close an open database and invalidate all cursors.
1294 */
1295 int sqlite3BtreeClose(Btree *pBt){
1296   while( pBt->pCursor ){
1297     sqlite3BtreeCloseCursor(pBt->pCursor);
1298   }
1299   sqlite3pager_close(pBt->pPager);
1300   sqliteFree(pBt);
1301   return SQLITE_OK;
1302 }
1303 
1304 /*
1305 ** Change the busy handler callback function.
1306 */
1307 int sqlite3BtreeSetBusyHandler(Btree *pBt, BusyHandler *pHandler){
1308   pBt->pBusyHandler = pHandler;
1309   sqlite3pager_set_busyhandler(pBt->pPager, pHandler);
1310   return SQLITE_OK;
1311 }
1312 
1313 /*
1314 ** Change the limit on the number of pages allowed in the cache.
1315 **
1316 ** The maximum number of cache pages is set to the absolute
1317 ** value of mxPage.  If mxPage is negative, the pager will
1318 ** operate asynchronously - it will not stop to do fsync()s
1319 ** to insure data is written to the disk surface before
1320 ** continuing.  Transactions still work if synchronous is off,
1321 ** and the database cannot be corrupted if this program
1322 ** crashes.  But if the operating system crashes or there is
1323 ** an abrupt power failure when synchronous is off, the database
1324 ** could be left in an inconsistent and unrecoverable state.
1325 ** Synchronous is on by default so database corruption is not
1326 ** normally a worry.
1327 */
1328 int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
1329   sqlite3pager_set_cachesize(pBt->pPager, mxPage);
1330   return SQLITE_OK;
1331 }
1332 
1333 /*
1334 ** Change the way data is synced to disk in order to increase or decrease
1335 ** how well the database resists damage due to OS crashes and power
1336 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
1337 ** there is a high probability of damage)  Level 2 is the default.  There
1338 ** is a very low but non-zero probability of damage.  Level 3 reduces the
1339 ** probability of damage to near zero but with a write performance reduction.
1340 */
1341 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1342 int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
1343   sqlite3pager_set_safety_level(pBt->pPager, level);
1344   return SQLITE_OK;
1345 }
1346 #endif
1347 
1348 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1349 /*
1350 ** Change the default pages size and the number of reserved bytes per page.
1351 **
1352 ** The page size must be a power of 2 between 512 and 65536.  If the page
1353 ** size supplied does not meet this constraint then the page size is not
1354 ** changed.
1355 **
1356 ** Page sizes are constrained to be a power of two so that the region
1357 ** of the database file used for locking (beginning at PENDING_BYTE,
1358 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1359 ** at the beginning of a page.
1360 **
1361 ** If parameter nReserve is less than zero, then the number of reserved
1362 ** bytes per page is left unchanged.
1363 */
1364 int sqlite3BtreeSetPageSize(Btree *pBt, int pageSize, int nReserve){
1365   if( pBt->pageSizeFixed ){
1366     return SQLITE_READONLY;
1367   }
1368   if( nReserve<0 ){
1369     nReserve = pBt->pageSize - pBt->usableSize;
1370   }
1371   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1372         ((pageSize-1)&pageSize)==0 ){
1373     assert( (pageSize & 7)==0 );
1374     pBt->pageSize = sqlite3pager_set_pagesize(pBt->pPager, pageSize);
1375   }
1376   pBt->usableSize = pBt->pageSize - nReserve;
1377   return SQLITE_OK;
1378 }
1379 
1380 /*
1381 ** Return the currently defined page size
1382 */
1383 int sqlite3BtreeGetPageSize(Btree *pBt){
1384   return pBt->pageSize;
1385 }
1386 int sqlite3BtreeGetReserve(Btree *pBt){
1387   return pBt->pageSize - pBt->usableSize;
1388 }
1389 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1390 
1391 /*
1392 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1393 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1394 ** is disabled. The default value for the auto-vacuum property is
1395 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1396 */
1397 int sqlite3BtreeSetAutoVacuum(Btree *pBt, int autoVacuum){
1398 #ifdef SQLITE_OMIT_AUTOVACUUM
1399   return SQLITE_READONLY;
1400 #else
1401   if( pBt->pageSizeFixed ){
1402     return SQLITE_READONLY;
1403   }
1404   pBt->autoVacuum = (autoVacuum?1:0);
1405   return SQLITE_OK;
1406 #endif
1407 }
1408 
1409 /*
1410 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1411 ** enabled 1 is returned. Otherwise 0.
1412 */
1413 int sqlite3BtreeGetAutoVacuum(Btree *pBt){
1414 #ifdef SQLITE_OMIT_AUTOVACUUM
1415   return 0;
1416 #else
1417   return pBt->autoVacuum;
1418 #endif
1419 }
1420 
1421 
1422 /*
1423 ** Get a reference to pPage1 of the database file.  This will
1424 ** also acquire a readlock on that file.
1425 **
1426 ** SQLITE_OK is returned on success.  If the file is not a
1427 ** well-formed database file, then SQLITE_CORRUPT is returned.
1428 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
1429 ** is returned if we run out of memory.  SQLITE_PROTOCOL is returned
1430 ** if there is a locking protocol violation.
1431 */
1432 static int lockBtree(Btree *pBt){
1433   int rc, pageSize;
1434   MemPage *pPage1;
1435   if( pBt->pPage1 ) return SQLITE_OK;
1436   rc = getPage(pBt, 1, &pPage1);
1437   if( rc!=SQLITE_OK ) return rc;
1438 
1439 
1440   /* Do some checking to help insure the file we opened really is
1441   ** a valid database file.
1442   */
1443   rc = SQLITE_NOTADB;
1444   if( sqlite3pager_pagecount(pBt->pPager)>0 ){
1445     u8 *page1 = pPage1->aData;
1446     if( memcmp(page1, zMagicHeader, 16)!=0 ){
1447       goto page1_init_failed;
1448     }
1449     if( page1[18]>1 || page1[19]>1 ){
1450       goto page1_init_failed;
1451     }
1452     pageSize = get2byte(&page1[16]);
1453     if( ((pageSize-1)&pageSize)!=0 ){
1454       goto page1_init_failed;
1455     }
1456     assert( (pageSize & 7)==0 );
1457     pBt->pageSize = pageSize;
1458     pBt->usableSize = pageSize - page1[20];
1459     if( pBt->usableSize<500 ){
1460       goto page1_init_failed;
1461     }
1462     pBt->maxEmbedFrac = page1[21];
1463     pBt->minEmbedFrac = page1[22];
1464     pBt->minLeafFrac = page1[23];
1465 #ifndef SQLITE_OMIT_AUTOVACUUM
1466     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1467 #endif
1468   }
1469 
1470   /* maxLocal is the maximum amount of payload to store locally for
1471   ** a cell.  Make sure it is small enough so that at least minFanout
1472   ** cells can will fit on one page.  We assume a 10-byte page header.
1473   ** Besides the payload, the cell must store:
1474   **     2-byte pointer to the cell
1475   **     4-byte child pointer
1476   **     9-byte nKey value
1477   **     4-byte nData value
1478   **     4-byte overflow page pointer
1479   ** So a cell consists of a 2-byte poiner, a header which is as much as
1480   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1481   ** page pointer.
1482   */
1483   pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1484   pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1485   pBt->maxLeaf = pBt->usableSize - 35;
1486   pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1487   if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1488     goto page1_init_failed;
1489   }
1490   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1491   pBt->pPage1 = pPage1;
1492   return SQLITE_OK;
1493 
1494 page1_init_failed:
1495   releasePage(pPage1);
1496   pBt->pPage1 = 0;
1497   return rc;
1498 }
1499 
1500 /*
1501 ** This routine works like lockBtree() except that it also invokes the
1502 ** busy callback if there is lock contention.
1503 */
1504 static int lockBtreeWithRetry(Btree *pBt){
1505   int rc = SQLITE_OK;
1506   if( pBt->inTrans==TRANS_NONE ){
1507     rc = sqlite3BtreeBeginTrans(pBt, 0);
1508     pBt->inTrans = TRANS_NONE;
1509   }
1510   return rc;
1511 }
1512 
1513 
1514 /*
1515 ** If there are no outstanding cursors and we are not in the middle
1516 ** of a transaction but there is a read lock on the database, then
1517 ** this routine unrefs the first page of the database file which
1518 ** has the effect of releasing the read lock.
1519 **
1520 ** If there are any outstanding cursors, this routine is a no-op.
1521 **
1522 ** If there is a transaction in progress, this routine is a no-op.
1523 */
1524 static void unlockBtreeIfUnused(Btree *pBt){
1525   if( pBt->inTrans==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1526     if( pBt->pPage1->aData==0 ){
1527       MemPage *pPage = pBt->pPage1;
1528       pPage->aData = &((char*)pPage)[-pBt->pageSize];
1529       pPage->pBt = pBt;
1530       pPage->pgno = 1;
1531     }
1532     releasePage(pBt->pPage1);
1533     pBt->pPage1 = 0;
1534     pBt->inStmt = 0;
1535   }
1536 }
1537 
1538 /*
1539 ** Create a new database by initializing the first page of the
1540 ** file.
1541 */
1542 static int newDatabase(Btree *pBt){
1543   MemPage *pP1;
1544   unsigned char *data;
1545   int rc;
1546   if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
1547   pP1 = pBt->pPage1;
1548   assert( pP1!=0 );
1549   data = pP1->aData;
1550   rc = sqlite3pager_write(data);
1551   if( rc ) return rc;
1552   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1553   assert( sizeof(zMagicHeader)==16 );
1554   put2byte(&data[16], pBt->pageSize);
1555   data[18] = 1;
1556   data[19] = 1;
1557   data[20] = pBt->pageSize - pBt->usableSize;
1558   data[21] = pBt->maxEmbedFrac;
1559   data[22] = pBt->minEmbedFrac;
1560   data[23] = pBt->minLeafFrac;
1561   memset(&data[24], 0, 100-24);
1562   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1563   pBt->pageSizeFixed = 1;
1564 #ifndef SQLITE_OMIT_AUTOVACUUM
1565   if( pBt->autoVacuum ){
1566     put4byte(&data[36 + 4*4], 1);
1567   }
1568 #endif
1569   return SQLITE_OK;
1570 }
1571 
1572 /*
1573 ** Attempt to start a new transaction. A write-transaction
1574 ** is started if the second argument is nonzero, otherwise a read-
1575 ** transaction.  If the second argument is 2 or more and exclusive
1576 ** transaction is started, meaning that no other process is allowed
1577 ** to access the database.  A preexisting transaction may not be
1578 ** upgraded to exclusive by calling this routine a second time - the
1579 ** exclusivity flag only works for a new transaction.
1580 **
1581 ** A write-transaction must be started before attempting any
1582 ** changes to the database.  None of the following routines
1583 ** will work unless a transaction is started first:
1584 **
1585 **      sqlite3BtreeCreateTable()
1586 **      sqlite3BtreeCreateIndex()
1587 **      sqlite3BtreeClearTable()
1588 **      sqlite3BtreeDropTable()
1589 **      sqlite3BtreeInsert()
1590 **      sqlite3BtreeDelete()
1591 **      sqlite3BtreeUpdateMeta()
1592 **
1593 ** If an initial attempt to acquire the lock fails because of lock contention
1594 ** and the database was previously unlocked, then invoke the busy handler
1595 ** if there is one.  But if there was previously a read-lock, do not
1596 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
1597 ** returned when there is already a read-lock in order to avoid a deadlock.
1598 **
1599 ** Suppose there are two processes A and B.  A has a read lock and B has
1600 ** a reserved lock.  B tries to promote to exclusive but is blocked because
1601 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
1602 ** One or the other of the two processes must give way or there can be
1603 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
1604 ** when A already has a read lock, we encourage A to give up and let B
1605 ** proceed.
1606 */
1607 int sqlite3BtreeBeginTrans(Btree *pBt, int wrflag){
1608   int rc = SQLITE_OK;
1609 
1610   /* If the btree is already in a write-transaction, or it
1611   ** is already in a read-transaction and a read-transaction
1612   ** is requested, this is a no-op.
1613   */
1614   if( pBt->inTrans==TRANS_WRITE || (pBt->inTrans==TRANS_READ && !wrflag) ){
1615     return SQLITE_OK;
1616   }
1617 
1618   /* Write transactions are not possible on a read-only database */
1619   if( pBt->readOnly && wrflag ){
1620     return SQLITE_READONLY;
1621   }
1622 
1623   do {
1624     if( pBt->pPage1==0 ){
1625       rc = lockBtree(pBt);
1626     }
1627 
1628     if( rc==SQLITE_OK && wrflag ){
1629       rc = sqlite3pager_begin(pBt->pPage1->aData, wrflag>1);
1630       if( rc==SQLITE_OK ){
1631         rc = newDatabase(pBt);
1632       }
1633     }
1634 
1635     if( rc==SQLITE_OK ){
1636       pBt->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1637       if( wrflag ) pBt->inStmt = 0;
1638     }else{
1639       unlockBtreeIfUnused(pBt);
1640     }
1641   }while( rc==SQLITE_BUSY && pBt->inTrans==TRANS_NONE &&
1642           sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
1643   return rc;
1644 }
1645 
1646 #ifndef SQLITE_OMIT_AUTOVACUUM
1647 
1648 /*
1649 ** Set the pointer-map entries for all children of page pPage. Also, if
1650 ** pPage contains cells that point to overflow pages, set the pointer
1651 ** map entries for the overflow pages as well.
1652 */
1653 static int setChildPtrmaps(MemPage *pPage){
1654   int i;                             /* Counter variable */
1655   int nCell;                         /* Number of cells in page pPage */
1656   int rc = SQLITE_OK;                /* Return code */
1657   Btree *pBt = pPage->pBt;
1658   int isInitOrig = pPage->isInit;
1659   Pgno pgno = pPage->pgno;
1660 
1661   initPage(pPage, 0);
1662   nCell = pPage->nCell;
1663 
1664   for(i=0; i<nCell; i++){
1665     u8 *pCell = findCell(pPage, i);
1666 
1667     rc = ptrmapPutOvflPtr(pPage, pCell);
1668     if( rc!=SQLITE_OK ){
1669       goto set_child_ptrmaps_out;
1670     }
1671 
1672     if( !pPage->leaf ){
1673       Pgno childPgno = get4byte(pCell);
1674       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1675       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1676     }
1677   }
1678 
1679   if( !pPage->leaf ){
1680     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1681     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1682   }
1683 
1684 set_child_ptrmaps_out:
1685   pPage->isInit = isInitOrig;
1686   return rc;
1687 }
1688 
1689 /*
1690 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1691 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
1692 ** iTo. Parameter eType describes the type of pointer to be modified, as
1693 ** follows:
1694 **
1695 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
1696 **                   page of pPage.
1697 **
1698 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1699 **                   page pointed to by one of the cells on pPage.
1700 **
1701 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1702 **                   overflow page in the list.
1703 */
1704 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1705   if( eType==PTRMAP_OVERFLOW2 ){
1706     /* The pointer is always the first 4 bytes of the page in this case.  */
1707     if( get4byte(pPage->aData)!=iFrom ){
1708       return SQLITE_CORRUPT;
1709     }
1710     put4byte(pPage->aData, iTo);
1711   }else{
1712     int isInitOrig = pPage->isInit;
1713     int i;
1714     int nCell;
1715 
1716     initPage(pPage, 0);
1717     nCell = pPage->nCell;
1718 
1719     for(i=0; i<nCell; i++){
1720       u8 *pCell = findCell(pPage, i);
1721       if( eType==PTRMAP_OVERFLOW1 ){
1722         CellInfo info;
1723         parseCellPtr(pPage, pCell, &info);
1724         if( info.iOverflow ){
1725           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1726             put4byte(&pCell[info.iOverflow], iTo);
1727             break;
1728           }
1729         }
1730       }else{
1731         if( get4byte(pCell)==iFrom ){
1732           put4byte(pCell, iTo);
1733           break;
1734         }
1735       }
1736     }
1737 
1738     if( i==nCell ){
1739       if( eType!=PTRMAP_BTREE ||
1740           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1741         return SQLITE_CORRUPT;
1742       }
1743       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1744     }
1745 
1746     pPage->isInit = isInitOrig;
1747   }
1748   return SQLITE_OK;
1749 }
1750 
1751 
1752 /*
1753 ** Move the open database page pDbPage to location iFreePage in the
1754 ** database. The pDbPage reference remains valid.
1755 */
1756 static int relocatePage(
1757   Btree *pBt,              /* Btree */
1758   MemPage *pDbPage,        /* Open page to move */
1759   u8 eType,                /* Pointer map 'type' entry for pDbPage */
1760   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
1761   Pgno iFreePage           /* The location to move pDbPage to */
1762 ){
1763   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
1764   Pgno iDbPage = pDbPage->pgno;
1765   Pager *pPager = pBt->pPager;
1766   int rc;
1767 
1768   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
1769       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
1770 
1771   /* Move page iDbPage from it's current location to page number iFreePage */
1772   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
1773       iDbPage, iFreePage, iPtrPage, eType));
1774   rc = sqlite3pager_movepage(pPager, pDbPage->aData, iFreePage);
1775   if( rc!=SQLITE_OK ){
1776     return rc;
1777   }
1778   pDbPage->pgno = iFreePage;
1779 
1780   /* If pDbPage was a btree-page, then it may have child pages and/or cells
1781   ** that point to overflow pages. The pointer map entries for all these
1782   ** pages need to be changed.
1783   **
1784   ** If pDbPage is an overflow page, then the first 4 bytes may store a
1785   ** pointer to a subsequent overflow page. If this is the case, then
1786   ** the pointer map needs to be updated for the subsequent overflow page.
1787   */
1788   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
1789     rc = setChildPtrmaps(pDbPage);
1790     if( rc!=SQLITE_OK ){
1791       return rc;
1792     }
1793   }else{
1794     Pgno nextOvfl = get4byte(pDbPage->aData);
1795     if( nextOvfl!=0 ){
1796       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
1797       if( rc!=SQLITE_OK ){
1798         return rc;
1799       }
1800     }
1801   }
1802 
1803   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
1804   ** that it points at iFreePage. Also fix the pointer map entry for
1805   ** iPtrPage.
1806   */
1807   if( eType!=PTRMAP_ROOTPAGE ){
1808     rc = getPage(pBt, iPtrPage, &pPtrPage);
1809     if( rc!=SQLITE_OK ){
1810       return rc;
1811     }
1812     rc = sqlite3pager_write(pPtrPage->aData);
1813     if( rc!=SQLITE_OK ){
1814       releasePage(pPtrPage);
1815       return rc;
1816     }
1817     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
1818     releasePage(pPtrPage);
1819     if( rc==SQLITE_OK ){
1820       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
1821     }
1822   }
1823   return rc;
1824 }
1825 
1826 /* Forward declaration required by autoVacuumCommit(). */
1827 static int allocatePage(Btree *, MemPage **, Pgno *, Pgno, u8);
1828 
1829 /*
1830 ** This routine is called prior to sqlite3pager_commit when a transaction
1831 ** is commited for an auto-vacuum database.
1832 */
1833 static int autoVacuumCommit(Btree *pBt, Pgno *nTrunc){
1834   Pager *pPager = pBt->pPager;
1835   Pgno nFreeList;   /* Number of pages remaining on the free-list. */
1836   int nPtrMap;      /* Number of pointer-map pages deallocated */
1837   Pgno origSize;  /* Pages in the database file */
1838   Pgno finSize;   /* Pages in the database file after truncation */
1839   int rc;           /* Return code */
1840   u8 eType;
1841   int pgsz = pBt->pageSize;  /* Page size for this database */
1842   Pgno iDbPage;              /* The database page to move */
1843   MemPage *pDbMemPage = 0;   /* "" */
1844   Pgno iPtrPage;             /* The page that contains a pointer to iDbPage */
1845   Pgno iFreePage;            /* The free-list page to move iDbPage to */
1846   MemPage *pFreeMemPage = 0; /* "" */
1847 
1848 #ifndef NDEBUG
1849   int nRef = *sqlite3pager_stats(pPager);
1850 #endif
1851 
1852   assert( pBt->autoVacuum );
1853   if( PTRMAP_ISPAGE(pgsz, sqlite3pager_pagecount(pPager)) ){
1854     return SQLITE_CORRUPT;
1855   }
1856 
1857   /* Figure out how many free-pages are in the database. If there are no
1858   ** free pages, then auto-vacuum is a no-op.
1859   */
1860   nFreeList = get4byte(&pBt->pPage1->aData[36]);
1861   if( nFreeList==0 ){
1862     *nTrunc = 0;
1863     return SQLITE_OK;
1864   }
1865 
1866   origSize = sqlite3pager_pagecount(pPager);
1867   nPtrMap = (nFreeList-origSize+PTRMAP_PAGENO(pgsz, origSize)+pgsz/5)/(pgsz/5);
1868   finSize = origSize - nFreeList - nPtrMap;
1869   if( origSize>PENDING_BYTE_PAGE(pBt) && finSize<=PENDING_BYTE_PAGE(pBt) ){
1870     finSize--;
1871     if( PTRMAP_ISPAGE(pBt->usableSize, finSize) ){
1872       finSize--;
1873     }
1874   }
1875   TRACE(("AUTOVACUUM: Begin (db size %d->%d)\n", origSize, finSize));
1876 
1877   /* Variable 'finSize' will be the size of the file in pages after
1878   ** the auto-vacuum has completed (the current file size minus the number
1879   ** of pages on the free list). Loop through the pages that lie beyond
1880   ** this mark, and if they are not already on the free list, move them
1881   ** to a free page earlier in the file (somewhere before finSize).
1882   */
1883   for( iDbPage=finSize+1; iDbPage<=origSize; iDbPage++ ){
1884     /* If iDbPage is a pointer map page, or the pending-byte page, skip it. */
1885     if( PTRMAP_ISPAGE(pgsz, iDbPage) || iDbPage==PENDING_BYTE_PAGE(pBt) ){
1886       continue;
1887     }
1888 
1889     rc = ptrmapGet(pBt, iDbPage, &eType, &iPtrPage);
1890     if( rc!=SQLITE_OK ) goto autovacuum_out;
1891     if( eType==PTRMAP_ROOTPAGE ){
1892       rc = SQLITE_CORRUPT;
1893       goto autovacuum_out;
1894     }
1895 
1896     /* If iDbPage is free, do not swap it.  */
1897     if( eType==PTRMAP_FREEPAGE ){
1898       continue;
1899     }
1900     rc = getPage(pBt, iDbPage, &pDbMemPage);
1901     if( rc!=SQLITE_OK ) goto autovacuum_out;
1902 
1903     /* Find the next page in the free-list that is not already at the end
1904     ** of the file. A page can be pulled off the free list using the
1905     ** allocatePage() routine.
1906     */
1907     do{
1908       if( pFreeMemPage ){
1909         releasePage(pFreeMemPage);
1910         pFreeMemPage = 0;
1911       }
1912       rc = allocatePage(pBt, &pFreeMemPage, &iFreePage, 0, 0);
1913       if( rc!=SQLITE_OK ){
1914         releasePage(pDbMemPage);
1915         goto autovacuum_out;
1916       }
1917       assert( iFreePage<=origSize );
1918     }while( iFreePage>finSize );
1919     releasePage(pFreeMemPage);
1920     pFreeMemPage = 0;
1921 
1922     rc = relocatePage(pBt, pDbMemPage, eType, iPtrPage, iFreePage);
1923     releasePage(pDbMemPage);
1924     if( rc!=SQLITE_OK ) goto autovacuum_out;
1925   }
1926 
1927   /* The entire free-list has been swapped to the end of the file. So
1928   ** truncate the database file to finSize pages and consider the
1929   ** free-list empty.
1930   */
1931   rc = sqlite3pager_write(pBt->pPage1->aData);
1932   if( rc!=SQLITE_OK ) goto autovacuum_out;
1933   put4byte(&pBt->pPage1->aData[32], 0);
1934   put4byte(&pBt->pPage1->aData[36], 0);
1935   if( rc!=SQLITE_OK ) goto autovacuum_out;
1936   *nTrunc = finSize;
1937 
1938 autovacuum_out:
1939   assert( nRef==*sqlite3pager_stats(pPager) );
1940   if( rc!=SQLITE_OK ){
1941     sqlite3pager_rollback(pPager);
1942   }
1943   return rc;
1944 }
1945 #endif
1946 
1947 /*
1948 ** Commit the transaction currently in progress.
1949 **
1950 ** This will release the write lock on the database file.  If there
1951 ** are no active cursors, it also releases the read lock.
1952 */
1953 int sqlite3BtreeCommit(Btree *pBt){
1954   int rc = SQLITE_OK;
1955   if( pBt->inTrans==TRANS_WRITE ){
1956     rc = sqlite3pager_commit(pBt->pPager);
1957   }
1958   pBt->inTrans = TRANS_NONE;
1959   pBt->inStmt = 0;
1960   unlockBtreeIfUnused(pBt);
1961   return rc;
1962 }
1963 
1964 #ifndef NDEBUG
1965 /*
1966 ** Return the number of write-cursors open on this handle. This is for use
1967 ** in assert() expressions, so it is only compiled if NDEBUG is not
1968 ** defined.
1969 */
1970 static int countWriteCursors(Btree *pBt){
1971   BtCursor *pCur;
1972   int r = 0;
1973   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1974     if( pCur->wrFlag ) r++;
1975   }
1976   return r;
1977 }
1978 #endif
1979 
1980 #ifdef SQLITE_TEST
1981 /*
1982 ** Print debugging information about all cursors to standard output.
1983 */
1984 void sqlite3BtreeCursorList(Btree *pBt){
1985   BtCursor *pCur;
1986   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1987     MemPage *pPage = pCur->pPage;
1988     char *zMode = pCur->wrFlag ? "rw" : "ro";
1989     sqlite3DebugPrintf("CURSOR %p rooted at %4d(%s) currently at %d.%d%s\n",
1990        pCur, pCur->pgnoRoot, zMode,
1991        pPage ? pPage->pgno : 0, pCur->idx,
1992        pCur->isValid ? "" : " eof"
1993     );
1994   }
1995 }
1996 #endif
1997 
1998 /*
1999 ** Rollback the transaction in progress.  All cursors will be
2000 ** invalided by this operation.  Any attempt to use a cursor
2001 ** that was open at the beginning of this operation will result
2002 ** in an error.
2003 **
2004 ** This will release the write lock on the database file.  If there
2005 ** are no active cursors, it also releases the read lock.
2006 */
2007 int sqlite3BtreeRollback(Btree *pBt){
2008   int rc = SQLITE_OK;
2009   MemPage *pPage1;
2010   if( pBt->inTrans==TRANS_WRITE ){
2011     rc = sqlite3pager_rollback(pBt->pPager);
2012     /* The rollback may have destroyed the pPage1->aData value.  So
2013     ** call getPage() on page 1 again to make sure pPage1->aData is
2014     ** set correctly. */
2015     if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
2016       releasePage(pPage1);
2017     }
2018     assert( countWriteCursors(pBt)==0 );
2019   }
2020   pBt->inTrans = TRANS_NONE;
2021   pBt->inStmt = 0;
2022   unlockBtreeIfUnused(pBt);
2023   return rc;
2024 }
2025 
2026 /*
2027 ** Start a statement subtransaction.  The subtransaction can
2028 ** can be rolled back independently of the main transaction.
2029 ** You must start a transaction before starting a subtransaction.
2030 ** The subtransaction is ended automatically if the main transaction
2031 ** commits or rolls back.
2032 **
2033 ** Only one subtransaction may be active at a time.  It is an error to try
2034 ** to start a new subtransaction if another subtransaction is already active.
2035 **
2036 ** Statement subtransactions are used around individual SQL statements
2037 ** that are contained within a BEGIN...COMMIT block.  If a constraint
2038 ** error occurs within the statement, the effect of that one statement
2039 ** can be rolled back without having to rollback the entire transaction.
2040 */
2041 int sqlite3BtreeBeginStmt(Btree *pBt){
2042   int rc;
2043   if( (pBt->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2044     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2045   }
2046   rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
2047   pBt->inStmt = 1;
2048   return rc;
2049 }
2050 
2051 
2052 /*
2053 ** Commit the statment subtransaction currently in progress.  If no
2054 ** subtransaction is active, this is a no-op.
2055 */
2056 int sqlite3BtreeCommitStmt(Btree *pBt){
2057   int rc;
2058   if( pBt->inStmt && !pBt->readOnly ){
2059     rc = sqlite3pager_stmt_commit(pBt->pPager);
2060   }else{
2061     rc = SQLITE_OK;
2062   }
2063   pBt->inStmt = 0;
2064   return rc;
2065 }
2066 
2067 /*
2068 ** Rollback the active statement subtransaction.  If no subtransaction
2069 ** is active this routine is a no-op.
2070 **
2071 ** All cursors will be invalidated by this operation.  Any attempt
2072 ** to use a cursor that was open at the beginning of this operation
2073 ** will result in an error.
2074 */
2075 int sqlite3BtreeRollbackStmt(Btree *pBt){
2076   int rc;
2077   if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
2078   rc = sqlite3pager_stmt_rollback(pBt->pPager);
2079   assert( countWriteCursors(pBt)==0 );
2080   pBt->inStmt = 0;
2081   return rc;
2082 }
2083 
2084 /*
2085 ** Default key comparison function to be used if no comparison function
2086 ** is specified on the sqlite3BtreeCursor() call.
2087 */
2088 static int dfltCompare(
2089   void *NotUsed,             /* User data is not used */
2090   int n1, const void *p1,    /* First key to compare */
2091   int n2, const void *p2     /* Second key to compare */
2092 ){
2093   int c;
2094   c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2095   if( c==0 ){
2096     c = n1 - n2;
2097   }
2098   return c;
2099 }
2100 
2101 /*
2102 ** Create a new cursor for the BTree whose root is on the page
2103 ** iTable.  The act of acquiring a cursor gets a read lock on
2104 ** the database file.
2105 **
2106 ** If wrFlag==0, then the cursor can only be used for reading.
2107 ** If wrFlag==1, then the cursor can be used for reading or for
2108 ** writing if other conditions for writing are also met.  These
2109 ** are the conditions that must be met in order for writing to
2110 ** be allowed:
2111 **
2112 ** 1:  The cursor must have been opened with wrFlag==1
2113 **
2114 ** 2:  No other cursors may be open with wrFlag==0 on the same table
2115 **
2116 ** 3:  The database must be writable (not on read-only media)
2117 **
2118 ** 4:  There must be an active transaction.
2119 **
2120 ** Condition 2 warrants further discussion.  If any cursor is opened
2121 ** on a table with wrFlag==0, that prevents all other cursors from
2122 ** writing to that table.  This is a kind of "read-lock".  When a cursor
2123 ** is opened with wrFlag==0 it is guaranteed that the table will not
2124 ** change as long as the cursor is open.  This allows the cursor to
2125 ** do a sequential scan of the table without having to worry about
2126 ** entries being inserted or deleted during the scan.  Cursors should
2127 ** be opened with wrFlag==0 only if this read-lock property is needed.
2128 ** That is to say, cursors should be opened with wrFlag==0 only if they
2129 ** intend to use the sqlite3BtreeNext() system call.  All other cursors
2130 ** should be opened with wrFlag==1 even if they never really intend
2131 ** to write.
2132 **
2133 ** No checking is done to make sure that page iTable really is the
2134 ** root page of a b-tree.  If it is not, then the cursor acquired
2135 ** will not work correctly.
2136 **
2137 ** The comparison function must be logically the same for every cursor
2138 ** on a particular table.  Changing the comparison function will result
2139 ** in incorrect operations.  If the comparison function is NULL, a
2140 ** default comparison function is used.  The comparison function is
2141 ** always ignored for INTKEY tables.
2142 */
2143 int sqlite3BtreeCursor(
2144   Btree *pBt,                                 /* The btree */
2145   int iTable,                                 /* Root page of table to open */
2146   int wrFlag,                                 /* 1 to write. 0 read-only */
2147   int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2148   void *pArg,                                 /* First arg to xCompare() */
2149   BtCursor **ppCur                            /* Write new cursor here */
2150 ){
2151   int rc;
2152   BtCursor *pCur;
2153 
2154   *ppCur = 0;
2155   if( wrFlag ){
2156     if( pBt->readOnly ){
2157       return SQLITE_READONLY;
2158     }
2159     if( checkReadLocks(pBt, iTable, 0) ){
2160       return SQLITE_LOCKED;
2161     }
2162   }
2163   if( pBt->pPage1==0 ){
2164     rc = lockBtreeWithRetry(pBt);
2165     if( rc!=SQLITE_OK ){
2166       return rc;
2167     }
2168   }
2169   pCur = sqliteMallocRaw( sizeof(*pCur) );
2170   if( pCur==0 ){
2171     rc = SQLITE_NOMEM;
2172     goto create_cursor_exception;
2173   }
2174   pCur->pgnoRoot = (Pgno)iTable;
2175   pCur->pPage = 0;  /* For exit-handler, in case getAndInitPage() fails. */
2176   if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
2177     rc = SQLITE_EMPTY;
2178     goto create_cursor_exception;
2179   }
2180   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2181   if( rc!=SQLITE_OK ){
2182     goto create_cursor_exception;
2183   }
2184   pCur->xCompare = xCmp ? xCmp : dfltCompare;
2185   pCur->pArg = pArg;
2186   pCur->pBt = pBt;
2187   pCur->wrFlag = wrFlag;
2188   pCur->idx = 0;
2189   memset(&pCur->info, 0, sizeof(pCur->info));
2190   pCur->pNext = pBt->pCursor;
2191   if( pCur->pNext ){
2192     pCur->pNext->pPrev = pCur;
2193   }
2194   pCur->pPrev = 0;
2195   pBt->pCursor = pCur;
2196   pCur->isValid = 0;
2197   *ppCur = pCur;
2198   return SQLITE_OK;
2199 
2200 create_cursor_exception:
2201   if( pCur ){
2202     releasePage(pCur->pPage);
2203     sqliteFree(pCur);
2204   }
2205   unlockBtreeIfUnused(pBt);
2206   return rc;
2207 }
2208 
2209 #if 0  /* Not Used */
2210 /*
2211 ** Change the value of the comparison function used by a cursor.
2212 */
2213 void sqlite3BtreeSetCompare(
2214   BtCursor *pCur,     /* The cursor to whose comparison function is changed */
2215   int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */
2216   void *pArg          /* First argument to xCmp() */
2217 ){
2218   pCur->xCompare = xCmp ? xCmp : dfltCompare;
2219   pCur->pArg = pArg;
2220 }
2221 #endif
2222 
2223 /*
2224 ** Close a cursor.  The read lock on the database file is released
2225 ** when the last cursor is closed.
2226 */
2227 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2228   Btree *pBt = pCur->pBt;
2229   if( pCur->pPrev ){
2230     pCur->pPrev->pNext = pCur->pNext;
2231   }else{
2232     pBt->pCursor = pCur->pNext;
2233   }
2234   if( pCur->pNext ){
2235     pCur->pNext->pPrev = pCur->pPrev;
2236   }
2237   releasePage(pCur->pPage);
2238   unlockBtreeIfUnused(pBt);
2239   sqliteFree(pCur);
2240   return SQLITE_OK;
2241 }
2242 
2243 /*
2244 ** Make a temporary cursor by filling in the fields of pTempCur.
2245 ** The temporary cursor is not on the cursor list for the Btree.
2246 */
2247 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2248   memcpy(pTempCur, pCur, sizeof(*pCur));
2249   pTempCur->pNext = 0;
2250   pTempCur->pPrev = 0;
2251   if( pTempCur->pPage ){
2252     sqlite3pager_ref(pTempCur->pPage->aData);
2253   }
2254 }
2255 
2256 /*
2257 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2258 ** function above.
2259 */
2260 static void releaseTempCursor(BtCursor *pCur){
2261   if( pCur->pPage ){
2262     sqlite3pager_unref(pCur->pPage->aData);
2263   }
2264 }
2265 
2266 /*
2267 ** Make sure the BtCursor.info field of the given cursor is valid.
2268 ** If it is not already valid, call parseCell() to fill it in.
2269 **
2270 ** BtCursor.info is a cache of the information in the current cell.
2271 ** Using this cache reduces the number of calls to parseCell().
2272 */
2273 static void getCellInfo(BtCursor *pCur){
2274   if( pCur->info.nSize==0 ){
2275     parseCell(pCur->pPage, pCur->idx, &pCur->info);
2276   }else{
2277 #ifndef NDEBUG
2278     CellInfo info;
2279     memset(&info, 0, sizeof(info));
2280     parseCell(pCur->pPage, pCur->idx, &info);
2281     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2282 #endif
2283   }
2284 }
2285 
2286 /*
2287 ** Set *pSize to the size of the buffer needed to hold the value of
2288 ** the key for the current entry.  If the cursor is not pointing
2289 ** to a valid entry, *pSize is set to 0.
2290 **
2291 ** For a table with the INTKEY flag set, this routine returns the key
2292 ** itself, not the number of bytes in the key.
2293 */
2294 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2295   if( !pCur->isValid ){
2296     *pSize = 0;
2297   }else{
2298     getCellInfo(pCur);
2299     *pSize = pCur->info.nKey;
2300   }
2301   return SQLITE_OK;
2302 }
2303 
2304 /*
2305 ** Set *pSize to the number of bytes of data in the entry the
2306 ** cursor currently points to.  Always return SQLITE_OK.
2307 ** Failure is not possible.  If the cursor is not currently
2308 ** pointing to an entry (which can happen, for example, if
2309 ** the database is empty) then *pSize is set to 0.
2310 */
2311 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2312   if( !pCur->isValid ){
2313     /* Not pointing at a valid entry - set *pSize to 0. */
2314     *pSize = 0;
2315   }else{
2316     getCellInfo(pCur);
2317     *pSize = pCur->info.nData;
2318   }
2319   return SQLITE_OK;
2320 }
2321 
2322 /*
2323 ** Read payload information from the entry that the pCur cursor is
2324 ** pointing to.  Begin reading the payload at "offset" and read
2325 ** a total of "amt" bytes.  Put the result in zBuf.
2326 **
2327 ** This routine does not make a distinction between key and data.
2328 ** It just reads bytes from the payload area.  Data might appear
2329 ** on the main page or be scattered out on multiple overflow pages.
2330 */
2331 static int getPayload(
2332   BtCursor *pCur,      /* Cursor pointing to entry to read from */
2333   int offset,          /* Begin reading this far into payload */
2334   int amt,             /* Read this many bytes */
2335   unsigned char *pBuf, /* Write the bytes into this buffer */
2336   int skipKey          /* offset begins at data if this is true */
2337 ){
2338   unsigned char *aPayload;
2339   Pgno nextPage;
2340   int rc;
2341   MemPage *pPage;
2342   Btree *pBt;
2343   int ovflSize;
2344   u32 nKey;
2345 
2346   assert( pCur!=0 && pCur->pPage!=0 );
2347   assert( pCur->isValid );
2348   pBt = pCur->pBt;
2349   pPage = pCur->pPage;
2350   pageIntegrity(pPage);
2351   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2352   getCellInfo(pCur);
2353   aPayload = pCur->info.pCell;
2354   aPayload += pCur->info.nHeader;
2355   if( pPage->intKey ){
2356     nKey = 0;
2357   }else{
2358     nKey = pCur->info.nKey;
2359   }
2360   assert( offset>=0 );
2361   if( skipKey ){
2362     offset += nKey;
2363   }
2364   if( offset+amt > nKey+pCur->info.nData ){
2365     return SQLITE_ERROR;
2366   }
2367   if( offset<pCur->info.nLocal ){
2368     int a = amt;
2369     if( a+offset>pCur->info.nLocal ){
2370       a = pCur->info.nLocal - offset;
2371     }
2372     memcpy(pBuf, &aPayload[offset], a);
2373     if( a==amt ){
2374       return SQLITE_OK;
2375     }
2376     offset = 0;
2377     pBuf += a;
2378     amt -= a;
2379   }else{
2380     offset -= pCur->info.nLocal;
2381   }
2382   ovflSize = pBt->usableSize - 4;
2383   if( amt>0 ){
2384     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
2385     while( amt>0 && nextPage ){
2386       rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
2387       if( rc!=0 ){
2388         return rc;
2389       }
2390       nextPage = get4byte(aPayload);
2391       if( offset<ovflSize ){
2392         int a = amt;
2393         if( a + offset > ovflSize ){
2394           a = ovflSize - offset;
2395         }
2396         memcpy(pBuf, &aPayload[offset+4], a);
2397         offset = 0;
2398         amt -= a;
2399         pBuf += a;
2400       }else{
2401         offset -= ovflSize;
2402       }
2403       sqlite3pager_unref(aPayload);
2404     }
2405   }
2406 
2407   if( amt>0 ){
2408     return SQLITE_CORRUPT; /* bkpt-CORRUPT */
2409   }
2410   return SQLITE_OK;
2411 }
2412 
2413 /*
2414 ** Read part of the key associated with cursor pCur.  Exactly
2415 ** "amt" bytes will be transfered into pBuf[].  The transfer
2416 ** begins at "offset".
2417 **
2418 ** Return SQLITE_OK on success or an error code if anything goes
2419 ** wrong.  An error is returned if "offset+amt" is larger than
2420 ** the available payload.
2421 */
2422 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2423   assert( pCur->isValid );
2424   assert( pCur->pPage!=0 );
2425   if( pCur->pPage->intKey ){
2426     return SQLITE_CORRUPT;
2427   }
2428   assert( pCur->pPage->intKey==0 );
2429   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2430   return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
2431 }
2432 
2433 /*
2434 ** Read part of the data associated with cursor pCur.  Exactly
2435 ** "amt" bytes will be transfered into pBuf[].  The transfer
2436 ** begins at "offset".
2437 **
2438 ** Return SQLITE_OK on success or an error code if anything goes
2439 ** wrong.  An error is returned if "offset+amt" is larger than
2440 ** the available payload.
2441 */
2442 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2443   assert( pCur->isValid );
2444   assert( pCur->pPage!=0 );
2445   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2446   return getPayload(pCur, offset, amt, pBuf, 1);
2447 }
2448 
2449 /*
2450 ** Return a pointer to payload information from the entry that the
2451 ** pCur cursor is pointing to.  The pointer is to the beginning of
2452 ** the key if skipKey==0 and it points to the beginning of data if
2453 ** skipKey==1.  The number of bytes of available key/data is written
2454 ** into *pAmt.  If *pAmt==0, then the value returned will not be
2455 ** a valid pointer.
2456 **
2457 ** This routine is an optimization.  It is common for the entire key
2458 ** and data to fit on the local page and for there to be no overflow
2459 ** pages.  When that is so, this routine can be used to access the
2460 ** key and data without making a copy.  If the key and/or data spills
2461 ** onto overflow pages, then getPayload() must be used to reassembly
2462 ** the key/data and copy it into a preallocated buffer.
2463 **
2464 ** The pointer returned by this routine looks directly into the cached
2465 ** page of the database.  The data might change or move the next time
2466 ** any btree routine is called.
2467 */
2468 static const unsigned char *fetchPayload(
2469   BtCursor *pCur,      /* Cursor pointing to entry to read from */
2470   int *pAmt,           /* Write the number of available bytes here */
2471   int skipKey          /* read beginning at data if this is true */
2472 ){
2473   unsigned char *aPayload;
2474   MemPage *pPage;
2475   Btree *pBt;
2476   u32 nKey;
2477   int nLocal;
2478 
2479   assert( pCur!=0 && pCur->pPage!=0 );
2480   assert( pCur->isValid );
2481   pBt = pCur->pBt;
2482   pPage = pCur->pPage;
2483   pageIntegrity(pPage);
2484   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2485   getCellInfo(pCur);
2486   aPayload = pCur->info.pCell;
2487   aPayload += pCur->info.nHeader;
2488   if( pPage->intKey ){
2489     nKey = 0;
2490   }else{
2491     nKey = pCur->info.nKey;
2492   }
2493   if( skipKey ){
2494     aPayload += nKey;
2495     nLocal = pCur->info.nLocal - nKey;
2496   }else{
2497     nLocal = pCur->info.nLocal;
2498     if( nLocal>nKey ){
2499       nLocal = nKey;
2500     }
2501   }
2502   *pAmt = nLocal;
2503   return aPayload;
2504 }
2505 
2506 
2507 /*
2508 ** For the entry that cursor pCur is point to, return as
2509 ** many bytes of the key or data as are available on the local
2510 ** b-tree page.  Write the number of available bytes into *pAmt.
2511 **
2512 ** The pointer returned is ephemeral.  The key/data may move
2513 ** or be destroyed on the next call to any Btree routine.
2514 **
2515 ** These routines is used to get quick access to key and data
2516 ** in the common case where no overflow pages are used.
2517 */
2518 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
2519   return (const void*)fetchPayload(pCur, pAmt, 0);
2520 }
2521 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
2522   return (const void*)fetchPayload(pCur, pAmt, 1);
2523 }
2524 
2525 
2526 /*
2527 ** Move the cursor down to a new child page.  The newPgno argument is the
2528 ** page number of the child page to move to.
2529 */
2530 static int moveToChild(BtCursor *pCur, u32 newPgno){
2531   int rc;
2532   MemPage *pNewPage;
2533   MemPage *pOldPage;
2534   Btree *pBt = pCur->pBt;
2535 
2536   assert( pCur->isValid );
2537   rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
2538   if( rc ) return rc;
2539   pageIntegrity(pNewPage);
2540   pNewPage->idxParent = pCur->idx;
2541   pOldPage = pCur->pPage;
2542   pOldPage->idxShift = 0;
2543   releasePage(pOldPage);
2544   pCur->pPage = pNewPage;
2545   pCur->idx = 0;
2546   pCur->info.nSize = 0;
2547   if( pNewPage->nCell<1 ){
2548     return SQLITE_CORRUPT; /* bkpt-CORRUPT */
2549   }
2550   return SQLITE_OK;
2551 }
2552 
2553 /*
2554 ** Return true if the page is the virtual root of its table.
2555 **
2556 ** The virtual root page is the root page for most tables.  But
2557 ** for the table rooted on page 1, sometime the real root page
2558 ** is empty except for the right-pointer.  In such cases the
2559 ** virtual root page is the page that the right-pointer of page
2560 ** 1 is pointing to.
2561 */
2562 static int isRootPage(MemPage *pPage){
2563   MemPage *pParent = pPage->pParent;
2564   if( pParent==0 ) return 1;
2565   if( pParent->pgno>1 ) return 0;
2566   if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
2567   return 0;
2568 }
2569 
2570 /*
2571 ** Move the cursor up to the parent page.
2572 **
2573 ** pCur->idx is set to the cell index that contains the pointer
2574 ** to the page we are coming from.  If we are coming from the
2575 ** right-most child page then pCur->idx is set to one more than
2576 ** the largest cell index.
2577 */
2578 static void moveToParent(BtCursor *pCur){
2579   Pgno oldPgno;
2580   MemPage *pParent;
2581   MemPage *pPage;
2582   int idxParent;
2583 
2584   assert( pCur->isValid );
2585   pPage = pCur->pPage;
2586   assert( pPage!=0 );
2587   assert( !isRootPage(pPage) );
2588   pageIntegrity(pPage);
2589   pParent = pPage->pParent;
2590   assert( pParent!=0 );
2591   pageIntegrity(pParent);
2592   idxParent = pPage->idxParent;
2593   sqlite3pager_ref(pParent->aData);
2594   oldPgno = pPage->pgno;
2595   releasePage(pPage);
2596   pCur->pPage = pParent;
2597   pCur->info.nSize = 0;
2598   assert( pParent->idxShift==0 );
2599   pCur->idx = idxParent;
2600 }
2601 
2602 /*
2603 ** Move the cursor to the root page
2604 */
2605 static int moveToRoot(BtCursor *pCur){
2606   MemPage *pRoot;
2607   int rc;
2608   Btree *pBt = pCur->pBt;
2609 
2610   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
2611   if( rc ){
2612     pCur->isValid = 0;
2613     return rc;
2614   }
2615   releasePage(pCur->pPage);
2616   pageIntegrity(pRoot);
2617   pCur->pPage = pRoot;
2618   pCur->idx = 0;
2619   pCur->info.nSize = 0;
2620   if( pRoot->nCell==0 && !pRoot->leaf ){
2621     Pgno subpage;
2622     assert( pRoot->pgno==1 );
2623     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
2624     assert( subpage>0 );
2625     pCur->isValid = 1;
2626     rc = moveToChild(pCur, subpage);
2627   }
2628   pCur->isValid = pCur->pPage->nCell>0;
2629   return rc;
2630 }
2631 
2632 /*
2633 ** Move the cursor down to the left-most leaf entry beneath the
2634 ** entry to which it is currently pointing.
2635 */
2636 static int moveToLeftmost(BtCursor *pCur){
2637   Pgno pgno;
2638   int rc;
2639   MemPage *pPage;
2640 
2641   assert( pCur->isValid );
2642   while( !(pPage = pCur->pPage)->leaf ){
2643     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2644     pgno = get4byte(findCell(pPage, pCur->idx));
2645     rc = moveToChild(pCur, pgno);
2646     if( rc ) return rc;
2647   }
2648   return SQLITE_OK;
2649 }
2650 
2651 /*
2652 ** Move the cursor down to the right-most leaf entry beneath the
2653 ** page to which it is currently pointing.  Notice the difference
2654 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
2655 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
2656 ** finds the right-most entry beneath the *page*.
2657 */
2658 static int moveToRightmost(BtCursor *pCur){
2659   Pgno pgno;
2660   int rc;
2661   MemPage *pPage;
2662 
2663   assert( pCur->isValid );
2664   while( !(pPage = pCur->pPage)->leaf ){
2665     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2666     pCur->idx = pPage->nCell;
2667     rc = moveToChild(pCur, pgno);
2668     if( rc ) return rc;
2669   }
2670   pCur->idx = pPage->nCell - 1;
2671   pCur->info.nSize = 0;
2672   return SQLITE_OK;
2673 }
2674 
2675 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
2676 ** on success.  Set *pRes to 0 if the cursor actually points to something
2677 ** or set *pRes to 1 if the table is empty.
2678 */
2679 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
2680   int rc;
2681   rc = moveToRoot(pCur);
2682   if( rc ) return rc;
2683   if( pCur->isValid==0 ){
2684     assert( pCur->pPage->nCell==0 );
2685     *pRes = 1;
2686     return SQLITE_OK;
2687   }
2688   assert( pCur->pPage->nCell>0 );
2689   *pRes = 0;
2690   rc = moveToLeftmost(pCur);
2691   return rc;
2692 }
2693 
2694 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
2695 ** on success.  Set *pRes to 0 if the cursor actually points to something
2696 ** or set *pRes to 1 if the table is empty.
2697 */
2698 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
2699   int rc;
2700   rc = moveToRoot(pCur);
2701   if( rc ) return rc;
2702   if( pCur->isValid==0 ){
2703     assert( pCur->pPage->nCell==0 );
2704     *pRes = 1;
2705     return SQLITE_OK;
2706   }
2707   assert( pCur->isValid );
2708   *pRes = 0;
2709   rc = moveToRightmost(pCur);
2710   return rc;
2711 }
2712 
2713 /* Move the cursor so that it points to an entry near pKey/nKey.
2714 ** Return a success code.
2715 **
2716 ** For INTKEY tables, only the nKey parameter is used.  pKey is
2717 ** ignored.  For other tables, nKey is the number of bytes of data
2718 ** in nKey.  The comparison function specified when the cursor was
2719 ** created is used to compare keys.
2720 **
2721 ** If an exact match is not found, then the cursor is always
2722 ** left pointing at a leaf page which would hold the entry if it
2723 ** were present.  The cursor might point to an entry that comes
2724 ** before or after the key.
2725 **
2726 ** The result of comparing the key with the entry to which the
2727 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
2728 ** this value is as follows:
2729 **
2730 **     *pRes<0      The cursor is left pointing at an entry that
2731 **                  is smaller than pKey or if the table is empty
2732 **                  and the cursor is therefore left point to nothing.
2733 **
2734 **     *pRes==0     The cursor is left pointing at an entry that
2735 **                  exactly matches pKey.
2736 **
2737 **     *pRes>0      The cursor is left pointing at an entry that
2738 **                  is larger than pKey.
2739 */
2740 int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){
2741   int rc;
2742   rc = moveToRoot(pCur);
2743   if( rc ) return rc;
2744   assert( pCur->pPage );
2745   assert( pCur->pPage->isInit );
2746   if( pCur->isValid==0 ){
2747     *pRes = -1;
2748     assert( pCur->pPage->nCell==0 );
2749     return SQLITE_OK;
2750   }
2751    for(;;){
2752     int lwr, upr;
2753     Pgno chldPg;
2754     MemPage *pPage = pCur->pPage;
2755     int c = -1;  /* pRes return if table is empty must be -1 */
2756     lwr = 0;
2757     upr = pPage->nCell-1;
2758     if( !pPage->intKey && pKey==0 ){
2759       return SQLITE_CORRUPT;
2760     }
2761     pageIntegrity(pPage);
2762     while( lwr<=upr ){
2763       void *pCellKey;
2764       i64 nCellKey;
2765       pCur->idx = (lwr+upr)/2;
2766       pCur->info.nSize = 0;
2767       sqlite3BtreeKeySize(pCur, &nCellKey);
2768       if( pPage->intKey ){
2769         if( nCellKey<nKey ){
2770           c = -1;
2771         }else if( nCellKey>nKey ){
2772           c = +1;
2773         }else{
2774           c = 0;
2775         }
2776       }else{
2777         int available;
2778         pCellKey = (void *)fetchPayload(pCur, &available, 0);
2779         if( available>=nCellKey ){
2780           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2781         }else{
2782           pCellKey = sqliteMallocRaw( nCellKey );
2783           if( pCellKey==0 ) return SQLITE_NOMEM;
2784           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
2785           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2786           sqliteFree(pCellKey);
2787           if( rc ) return rc;
2788         }
2789       }
2790       if( c==0 ){
2791         if( pPage->leafData && !pPage->leaf ){
2792           lwr = pCur->idx;
2793           upr = lwr - 1;
2794           break;
2795         }else{
2796           if( pRes ) *pRes = 0;
2797           return SQLITE_OK;
2798         }
2799       }
2800       if( c<0 ){
2801         lwr = pCur->idx+1;
2802       }else{
2803         upr = pCur->idx-1;
2804       }
2805     }
2806     assert( lwr==upr+1 );
2807     assert( pPage->isInit );
2808     if( pPage->leaf ){
2809       chldPg = 0;
2810     }else if( lwr>=pPage->nCell ){
2811       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2812     }else{
2813       chldPg = get4byte(findCell(pPage, lwr));
2814     }
2815     if( chldPg==0 ){
2816       assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2817       if( pRes ) *pRes = c;
2818       return SQLITE_OK;
2819     }
2820     pCur->idx = lwr;
2821     pCur->info.nSize = 0;
2822     rc = moveToChild(pCur, chldPg);
2823     if( rc ){
2824       return rc;
2825     }
2826   }
2827   /* NOT REACHED */
2828 }
2829 
2830 /*
2831 ** Return TRUE if the cursor is not pointing at an entry of the table.
2832 **
2833 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
2834 ** past the last entry in the table or sqlite3BtreePrev() moves past
2835 ** the first entry.  TRUE is also returned if the table is empty.
2836 */
2837 int sqlite3BtreeEof(BtCursor *pCur){
2838   return pCur->isValid==0;
2839 }
2840 
2841 /*
2842 ** Advance the cursor to the next entry in the database.  If
2843 ** successful then set *pRes=0.  If the cursor
2844 ** was already pointing to the last entry in the database before
2845 ** this routine was called, then set *pRes=1.
2846 */
2847 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
2848   int rc;
2849   MemPage *pPage = pCur->pPage;
2850 
2851   assert( pRes!=0 );
2852   if( pCur->isValid==0 ){
2853     *pRes = 1;
2854     return SQLITE_OK;
2855   }
2856   assert( pPage->isInit );
2857   assert( pCur->idx<pPage->nCell );
2858 
2859   pCur->idx++;
2860   pCur->info.nSize = 0;
2861   if( pCur->idx>=pPage->nCell ){
2862     if( !pPage->leaf ){
2863       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
2864       if( rc ) return rc;
2865       rc = moveToLeftmost(pCur);
2866       *pRes = 0;
2867       return rc;
2868     }
2869     do{
2870       if( isRootPage(pPage) ){
2871         *pRes = 1;
2872         pCur->isValid = 0;
2873         return SQLITE_OK;
2874       }
2875       moveToParent(pCur);
2876       pPage = pCur->pPage;
2877     }while( pCur->idx>=pPage->nCell );
2878     *pRes = 0;
2879     if( pPage->leafData ){
2880       rc = sqlite3BtreeNext(pCur, pRes);
2881     }else{
2882       rc = SQLITE_OK;
2883     }
2884     return rc;
2885   }
2886   *pRes = 0;
2887   if( pPage->leaf ){
2888     return SQLITE_OK;
2889   }
2890   rc = moveToLeftmost(pCur);
2891   return rc;
2892 }
2893 
2894 /*
2895 ** Step the cursor to the back to the previous entry in the database.  If
2896 ** successful then set *pRes=0.  If the cursor
2897 ** was already pointing to the first entry in the database before
2898 ** this routine was called, then set *pRes=1.
2899 */
2900 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
2901   int rc;
2902   Pgno pgno;
2903   MemPage *pPage;
2904   if( pCur->isValid==0 ){
2905     *pRes = 1;
2906     return SQLITE_OK;
2907   }
2908 
2909   pPage = pCur->pPage;
2910   assert( pPage->isInit );
2911   assert( pCur->idx>=0 );
2912   if( !pPage->leaf ){
2913     pgno = get4byte( findCell(pPage, pCur->idx) );
2914     rc = moveToChild(pCur, pgno);
2915     if( rc ) return rc;
2916     rc = moveToRightmost(pCur);
2917   }else{
2918     while( pCur->idx==0 ){
2919       if( isRootPage(pPage) ){
2920         pCur->isValid = 0;
2921         *pRes = 1;
2922         return SQLITE_OK;
2923       }
2924       moveToParent(pCur);
2925       pPage = pCur->pPage;
2926     }
2927     pCur->idx--;
2928     pCur->info.nSize = 0;
2929     if( pPage->leafData && !pPage->leaf ){
2930       rc = sqlite3BtreePrevious(pCur, pRes);
2931     }else{
2932       rc = SQLITE_OK;
2933     }
2934   }
2935   *pRes = 0;
2936   return rc;
2937 }
2938 
2939 /*
2940 ** Allocate a new page from the database file.
2941 **
2942 ** The new page is marked as dirty.  (In other words, sqlite3pager_write()
2943 ** has already been called on the new page.)  The new page has also
2944 ** been referenced and the calling routine is responsible for calling
2945 ** sqlite3pager_unref() on the new page when it is done.
2946 **
2947 ** SQLITE_OK is returned on success.  Any other return value indicates
2948 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
2949 ** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
2950 **
2951 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2952 ** locate a page close to the page number "nearby".  This can be used in an
2953 ** attempt to keep related pages close to each other in the database file,
2954 ** which in turn can make database access faster.
2955 **
2956 ** If the "exact" parameter is not 0, and the page-number nearby exists
2957 ** anywhere on the free-list, then it is guarenteed to be returned. This
2958 ** is only used by auto-vacuum databases when allocating a new table.
2959 */
2960 static int allocatePage(
2961   Btree *pBt,
2962   MemPage **ppPage,
2963   Pgno *pPgno,
2964   Pgno nearby,
2965   u8 exact
2966 ){
2967   MemPage *pPage1;
2968   int rc;
2969   int n;     /* Number of pages on the freelist */
2970   int k;     /* Number of leaves on the trunk of the freelist */
2971 
2972   pPage1 = pBt->pPage1;
2973   n = get4byte(&pPage1->aData[36]);
2974   if( n>0 ){
2975     /* There are pages on the freelist.  Reuse one of those pages. */
2976     MemPage *pTrunk = 0;
2977     Pgno iTrunk;
2978     MemPage *pPrevTrunk = 0;
2979     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
2980 
2981     /* If the 'exact' parameter was true and a query of the pointer-map
2982     ** shows that the page 'nearby' is somewhere on the free-list, then
2983     ** the entire-list will be searched for that page.
2984     */
2985 #ifndef SQLITE_OMIT_AUTOVACUUM
2986     if( exact ){
2987       u8 eType;
2988       assert( nearby>0 );
2989       assert( pBt->autoVacuum );
2990       rc = ptrmapGet(pBt, nearby, &eType, 0);
2991       if( rc ) return rc;
2992       if( eType==PTRMAP_FREEPAGE ){
2993         searchList = 1;
2994       }
2995       *pPgno = nearby;
2996     }
2997 #endif
2998 
2999     /* Decrement the free-list count by 1. Set iTrunk to the index of the
3000     ** first free-list trunk page. iPrevTrunk is initially 1.
3001     */
3002     rc = sqlite3pager_write(pPage1->aData);
3003     if( rc ) return rc;
3004     put4byte(&pPage1->aData[36], n-1);
3005 
3006     /* The code within this loop is run only once if the 'searchList' variable
3007     ** is not true. Otherwise, it runs once for each trunk-page on the
3008     ** free-list until the page 'nearby' is located.
3009     */
3010     do {
3011       pPrevTrunk = pTrunk;
3012       if( pPrevTrunk ){
3013         iTrunk = get4byte(&pPrevTrunk->aData[0]);
3014       }else{
3015         iTrunk = get4byte(&pPage1->aData[32]);
3016       }
3017       rc = getPage(pBt, iTrunk, &pTrunk);
3018       if( rc ){
3019         releasePage(pPrevTrunk);
3020         return rc;
3021       }
3022 
3023       /* TODO: This should move to after the loop? */
3024       rc = sqlite3pager_write(pTrunk->aData);
3025       if( rc ){
3026         releasePage(pTrunk);
3027         releasePage(pPrevTrunk);
3028         return rc;
3029       }
3030 
3031       k = get4byte(&pTrunk->aData[4]);
3032       if( k==0 && !searchList ){
3033         /* The trunk has no leaves and the list is not being searched.
3034         ** So extract the trunk page itself and use it as the newly
3035         ** allocated page */
3036         assert( pPrevTrunk==0 );
3037         *pPgno = iTrunk;
3038         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3039         *ppPage = pTrunk;
3040         pTrunk = 0;
3041         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3042       }else if( k>pBt->usableSize/4 - 8 ){
3043         /* Value of k is out of range.  Database corruption */
3044         return SQLITE_CORRUPT; /* bkpt-CORRUPT */
3045 #ifndef SQLITE_OMIT_AUTOVACUUM
3046       }else if( searchList && nearby==iTrunk ){
3047         /* The list is being searched and this trunk page is the page
3048         ** to allocate, regardless of whether it has leaves.
3049         */
3050         assert( *pPgno==iTrunk );
3051         *ppPage = pTrunk;
3052         searchList = 0;
3053         if( k==0 ){
3054           if( !pPrevTrunk ){
3055             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3056           }else{
3057             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3058           }
3059         }else{
3060           /* The trunk page is required by the caller but it contains
3061           ** pointers to free-list leaves. The first leaf becomes a trunk
3062           ** page in this case.
3063           */
3064           MemPage *pNewTrunk;
3065           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3066           rc = getPage(pBt, iNewTrunk, &pNewTrunk);
3067           if( rc!=SQLITE_OK ){
3068             releasePage(pTrunk);
3069             releasePage(pPrevTrunk);
3070             return rc;
3071           }
3072           rc = sqlite3pager_write(pNewTrunk->aData);
3073           if( rc!=SQLITE_OK ){
3074             releasePage(pNewTrunk);
3075             releasePage(pTrunk);
3076             releasePage(pPrevTrunk);
3077             return rc;
3078           }
3079           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3080           put4byte(&pNewTrunk->aData[4], k-1);
3081           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3082           if( !pPrevTrunk ){
3083             put4byte(&pPage1->aData[32], iNewTrunk);
3084           }else{
3085             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3086           }
3087           releasePage(pNewTrunk);
3088         }
3089         pTrunk = 0;
3090         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3091 #endif
3092       }else{
3093         /* Extract a leaf from the trunk */
3094         int closest;
3095         Pgno iPage;
3096         unsigned char *aData = pTrunk->aData;
3097         if( nearby>0 ){
3098           int i, dist;
3099           closest = 0;
3100           dist = get4byte(&aData[8]) - nearby;
3101           if( dist<0 ) dist = -dist;
3102           for(i=1; i<k; i++){
3103             int d2 = get4byte(&aData[8+i*4]) - nearby;
3104             if( d2<0 ) d2 = -d2;
3105             if( d2<dist ){
3106               closest = i;
3107               dist = d2;
3108             }
3109           }
3110         }else{
3111           closest = 0;
3112         }
3113 
3114         iPage = get4byte(&aData[8+closest*4]);
3115         if( !searchList || iPage==nearby ){
3116           *pPgno = iPage;
3117           if( *pPgno>sqlite3pager_pagecount(pBt->pPager) ){
3118             /* Free page off the end of the file */
3119             return SQLITE_CORRUPT; /* bkpt-CORRUPT */
3120           }
3121           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
3122                  ": %d more free pages\n",
3123                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
3124           if( closest<k-1 ){
3125             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
3126           }
3127           put4byte(&aData[4], k-1);
3128           rc = getPage(pBt, *pPgno, ppPage);
3129           if( rc==SQLITE_OK ){
3130             sqlite3pager_dont_rollback((*ppPage)->aData);
3131             rc = sqlite3pager_write((*ppPage)->aData);
3132             if( rc!=SQLITE_OK ){
3133               releasePage(*ppPage);
3134             }
3135           }
3136           searchList = 0;
3137         }
3138       }
3139       releasePage(pPrevTrunk);
3140     }while( searchList );
3141     releasePage(pTrunk);
3142   }else{
3143     /* There are no pages on the freelist, so create a new page at the
3144     ** end of the file */
3145     *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
3146 
3147 #ifndef SQLITE_OMIT_AUTOVACUUM
3148     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt->usableSize, *pPgno) ){
3149       /* If *pPgno refers to a pointer-map page, allocate two new pages
3150       ** at the end of the file instead of one. The first allocated page
3151       ** becomes a new pointer-map page, the second is used by the caller.
3152       */
3153       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
3154       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3155       (*pPgno)++;
3156     }
3157 #endif
3158 
3159     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3160     rc = getPage(pBt, *pPgno, ppPage);
3161     if( rc ) return rc;
3162     rc = sqlite3pager_write((*ppPage)->aData);
3163     if( rc!=SQLITE_OK ){
3164       releasePage(*ppPage);
3165     }
3166     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
3167   }
3168 
3169   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3170   return rc;
3171 }
3172 
3173 /*
3174 ** Add a page of the database file to the freelist.
3175 **
3176 ** sqlite3pager_unref() is NOT called for pPage.
3177 */
3178 static int freePage(MemPage *pPage){
3179   Btree *pBt = pPage->pBt;
3180   MemPage *pPage1 = pBt->pPage1;
3181   int rc, n, k;
3182 
3183   /* Prepare the page for freeing */
3184   assert( pPage->pgno>1 );
3185   pPage->isInit = 0;
3186   releasePage(pPage->pParent);
3187   pPage->pParent = 0;
3188 
3189   /* Increment the free page count on pPage1 */
3190   rc = sqlite3pager_write(pPage1->aData);
3191   if( rc ) return rc;
3192   n = get4byte(&pPage1->aData[36]);
3193   put4byte(&pPage1->aData[36], n+1);
3194 
3195 #ifndef SQLITE_OMIT_AUTOVACUUM
3196   /* If the database supports auto-vacuum, write an entry in the pointer-map
3197   ** to indicate that the page is free.
3198   */
3199   if( pBt->autoVacuum ){
3200     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
3201     if( rc ) return rc;
3202   }
3203 #endif
3204 
3205   if( n==0 ){
3206     /* This is the first free page */
3207     rc = sqlite3pager_write(pPage->aData);
3208     if( rc ) return rc;
3209     memset(pPage->aData, 0, 8);
3210     put4byte(&pPage1->aData[32], pPage->pgno);
3211     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
3212   }else{
3213     /* Other free pages already exist.  Retrive the first trunk page
3214     ** of the freelist and find out how many leaves it has. */
3215     MemPage *pTrunk;
3216     rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
3217     if( rc ) return rc;
3218     k = get4byte(&pTrunk->aData[4]);
3219     if( k>=pBt->usableSize/4 - 8 ){
3220       /* The trunk is full.  Turn the page being freed into a new
3221       ** trunk page with no leaves. */
3222       rc = sqlite3pager_write(pPage->aData);
3223       if( rc ) return rc;
3224       put4byte(pPage->aData, pTrunk->pgno);
3225       put4byte(&pPage->aData[4], 0);
3226       put4byte(&pPage1->aData[32], pPage->pgno);
3227       TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
3228               pPage->pgno, pTrunk->pgno));
3229     }else{
3230       /* Add the newly freed page as a leaf on the current trunk */
3231       rc = sqlite3pager_write(pTrunk->aData);
3232       if( rc ) return rc;
3233       put4byte(&pTrunk->aData[4], k+1);
3234       put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
3235       sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
3236       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
3237     }
3238     releasePage(pTrunk);
3239   }
3240   return rc;
3241 }
3242 
3243 /*
3244 ** Free any overflow pages associated with the given Cell.
3245 */
3246 static int clearCell(MemPage *pPage, unsigned char *pCell){
3247   Btree *pBt = pPage->pBt;
3248   CellInfo info;
3249   Pgno ovflPgno;
3250   int rc;
3251 
3252   parseCellPtr(pPage, pCell, &info);
3253   if( info.iOverflow==0 ){
3254     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
3255   }
3256   ovflPgno = get4byte(&pCell[info.iOverflow]);
3257   while( ovflPgno!=0 ){
3258     MemPage *pOvfl;
3259     if( ovflPgno>sqlite3pager_pagecount(pBt->pPager) ){
3260       return SQLITE_CORRUPT;
3261     }
3262     rc = getPage(pBt, ovflPgno, &pOvfl);
3263     if( rc ) return rc;
3264     ovflPgno = get4byte(pOvfl->aData);
3265     rc = freePage(pOvfl);
3266     sqlite3pager_unref(pOvfl->aData);
3267     if( rc ) return rc;
3268   }
3269   return SQLITE_OK;
3270 }
3271 
3272 /*
3273 ** Create the byte sequence used to represent a cell on page pPage
3274 ** and write that byte sequence into pCell[].  Overflow pages are
3275 ** allocated and filled in as necessary.  The calling procedure
3276 ** is responsible for making sure sufficient space has been allocated
3277 ** for pCell[].
3278 **
3279 ** Note that pCell does not necessary need to point to the pPage->aData
3280 ** area.  pCell might point to some temporary storage.  The cell will
3281 ** be constructed in this temporary area then copied into pPage->aData
3282 ** later.
3283 */
3284 static int fillInCell(
3285   MemPage *pPage,                /* The page that contains the cell */
3286   unsigned char *pCell,          /* Complete text of the cell */
3287   const void *pKey, i64 nKey,    /* The key */
3288   const void *pData,int nData,   /* The data */
3289   int *pnSize                    /* Write cell size here */
3290 ){
3291   int nPayload;
3292   const u8 *pSrc;
3293   int nSrc, n, rc;
3294   int spaceLeft;
3295   MemPage *pOvfl = 0;
3296   MemPage *pToRelease = 0;
3297   unsigned char *pPrior;
3298   unsigned char *pPayload;
3299   Btree *pBt = pPage->pBt;
3300   Pgno pgnoOvfl = 0;
3301   int nHeader;
3302   CellInfo info;
3303 
3304   /* Fill in the header. */
3305   nHeader = 0;
3306   if( !pPage->leaf ){
3307     nHeader += 4;
3308   }
3309   if( pPage->hasData ){
3310     nHeader += putVarint(&pCell[nHeader], nData);
3311   }else{
3312     nData = 0;
3313   }
3314   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
3315   parseCellPtr(pPage, pCell, &info);
3316   assert( info.nHeader==nHeader );
3317   assert( info.nKey==nKey );
3318   assert( info.nData==nData );
3319 
3320   /* Fill in the payload */
3321   nPayload = nData;
3322   if( pPage->intKey ){
3323     pSrc = pData;
3324     nSrc = nData;
3325     nData = 0;
3326   }else{
3327     nPayload += nKey;
3328     pSrc = pKey;
3329     nSrc = nKey;
3330   }
3331   *pnSize = info.nSize;
3332   spaceLeft = info.nLocal;
3333   pPayload = &pCell[nHeader];
3334   pPrior = &pCell[info.iOverflow];
3335 
3336   while( nPayload>0 ){
3337     if( spaceLeft==0 ){
3338 #ifndef SQLITE_OMIT_AUTOVACUUM
3339       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
3340 #endif
3341       rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
3342 #ifndef SQLITE_OMIT_AUTOVACUUM
3343       /* If the database supports auto-vacuum, and the second or subsequent
3344       ** overflow page is being allocated, add an entry to the pointer-map
3345       ** for that page now. The entry for the first overflow page will be
3346       ** added later, by the insertCell() routine.
3347       */
3348       if( pBt->autoVacuum && pgnoPtrmap!=0 && rc==SQLITE_OK ){
3349         rc = ptrmapPut(pBt, pgnoOvfl, PTRMAP_OVERFLOW2, pgnoPtrmap);
3350       }
3351 #endif
3352       if( rc ){
3353         releasePage(pToRelease);
3354         /* clearCell(pPage, pCell); */
3355         return rc;
3356       }
3357       put4byte(pPrior, pgnoOvfl);
3358       releasePage(pToRelease);
3359       pToRelease = pOvfl;
3360       pPrior = pOvfl->aData;
3361       put4byte(pPrior, 0);
3362       pPayload = &pOvfl->aData[4];
3363       spaceLeft = pBt->usableSize - 4;
3364     }
3365     n = nPayload;
3366     if( n>spaceLeft ) n = spaceLeft;
3367     if( n>nSrc ) n = nSrc;
3368     memcpy(pPayload, pSrc, n);
3369     nPayload -= n;
3370     pPayload += n;
3371     pSrc += n;
3372     nSrc -= n;
3373     spaceLeft -= n;
3374     if( nSrc==0 ){
3375       nSrc = nData;
3376       pSrc = pData;
3377     }
3378   }
3379   releasePage(pToRelease);
3380   return SQLITE_OK;
3381 }
3382 
3383 /*
3384 ** Change the MemPage.pParent pointer on the page whose number is
3385 ** given in the second argument so that MemPage.pParent holds the
3386 ** pointer in the third argument.
3387 */
3388 static int reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
3389   MemPage *pThis;
3390   unsigned char *aData;
3391 
3392   if( pgno==0 ) return SQLITE_OK;
3393   assert( pBt->pPager!=0 );
3394   aData = sqlite3pager_lookup(pBt->pPager, pgno);
3395   if( aData ){
3396     pThis = (MemPage*)&aData[pBt->pageSize];
3397     assert( pThis->aData==aData );
3398     if( pThis->isInit ){
3399       if( pThis->pParent!=pNewParent ){
3400         if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
3401         pThis->pParent = pNewParent;
3402         if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
3403       }
3404       pThis->idxParent = idx;
3405     }
3406     sqlite3pager_unref(aData);
3407   }
3408 
3409 #ifndef SQLITE_OMIT_AUTOVACUUM
3410   if( pBt->autoVacuum ){
3411     return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
3412   }
3413 #endif
3414   return SQLITE_OK;
3415 }
3416 
3417 
3418 
3419 /*
3420 ** Change the pParent pointer of all children of pPage to point back
3421 ** to pPage.
3422 **
3423 ** In other words, for every child of pPage, invoke reparentPage()
3424 ** to make sure that each child knows that pPage is its parent.
3425 **
3426 ** This routine gets called after you memcpy() one page into
3427 ** another.
3428 */
3429 static int reparentChildPages(MemPage *pPage){
3430   int i;
3431   Btree *pBt = pPage->pBt;
3432   int rc = SQLITE_OK;
3433 
3434   if( pPage->leaf ) return SQLITE_OK;
3435 
3436   for(i=0; i<pPage->nCell; i++){
3437     u8 *pCell = findCell(pPage, i);
3438     if( !pPage->leaf ){
3439       rc = reparentPage(pBt, get4byte(pCell), pPage, i);
3440       if( rc!=SQLITE_OK ) return rc;
3441     }
3442   }
3443   if( !pPage->leaf ){
3444     rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
3445        pPage, i);
3446     pPage->idxShift = 0;
3447   }
3448   return rc;
3449 }
3450 
3451 /*
3452 ** Remove the i-th cell from pPage.  This routine effects pPage only.
3453 ** The cell content is not freed or deallocated.  It is assumed that
3454 ** the cell content has been copied someplace else.  This routine just
3455 ** removes the reference to the cell from pPage.
3456 **
3457 ** "sz" must be the number of bytes in the cell.
3458 */
3459 static void dropCell(MemPage *pPage, int idx, int sz){
3460   int i;          /* Loop counter */
3461   int pc;         /* Offset to cell content of cell being deleted */
3462   u8 *data;       /* pPage->aData */
3463   u8 *ptr;        /* Used to move bytes around within data[] */
3464 
3465   assert( idx>=0 && idx<pPage->nCell );
3466   assert( sz==cellSize(pPage, idx) );
3467   assert( sqlite3pager_iswriteable(pPage->aData) );
3468   data = pPage->aData;
3469   ptr = &data[pPage->cellOffset + 2*idx];
3470   pc = get2byte(ptr);
3471   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
3472   freeSpace(pPage, pc, sz);
3473   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
3474     ptr[0] = ptr[2];
3475     ptr[1] = ptr[3];
3476   }
3477   pPage->nCell--;
3478   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
3479   pPage->nFree += 2;
3480   pPage->idxShift = 1;
3481 }
3482 
3483 /*
3484 ** Insert a new cell on pPage at cell index "i".  pCell points to the
3485 ** content of the cell.
3486 **
3487 ** If the cell content will fit on the page, then put it there.  If it
3488 ** will not fit, then make a copy of the cell content into pTemp if
3489 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
3490 ** in pPage->aOvfl[] and make it point to the cell content (either
3491 ** in pTemp or the original pCell) and also record its index.
3492 ** Allocating a new entry in pPage->aCell[] implies that
3493 ** pPage->nOverflow is incremented.
3494 **
3495 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
3496 ** cell. The caller will overwrite them after this function returns. If
3497 ** nSkip is non-zero, then pCell may not point to an invalid memory location
3498 ** (but pCell+nSkip is always valid).
3499 */
3500 static int insertCell(
3501   MemPage *pPage,   /* Page into which we are copying */
3502   int i,            /* New cell becomes the i-th cell of the page */
3503   u8 *pCell,        /* Content of the new cell */
3504   int sz,           /* Bytes of content in pCell */
3505   u8 *pTemp,        /* Temp storage space for pCell, if needed */
3506   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
3507 ){
3508   int idx;          /* Where to write new cell content in data[] */
3509   int j;            /* Loop counter */
3510   int top;          /* First byte of content for any cell in data[] */
3511   int end;          /* First byte past the last cell pointer in data[] */
3512   int ins;          /* Index in data[] where new cell pointer is inserted */
3513   int hdr;          /* Offset into data[] of the page header */
3514   int cellOffset;   /* Address of first cell pointer in data[] */
3515   u8 *data;         /* The content of the whole page */
3516   u8 *ptr;          /* Used for moving information around in data[] */
3517 
3518   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
3519   assert( sz==cellSizePtr(pPage, pCell) );
3520   assert( sqlite3pager_iswriteable(pPage->aData) );
3521   if( pPage->nOverflow || sz+2>pPage->nFree ){
3522     if( pTemp ){
3523       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
3524       pCell = pTemp;
3525     }
3526     j = pPage->nOverflow++;
3527     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
3528     pPage->aOvfl[j].pCell = pCell;
3529     pPage->aOvfl[j].idx = i;
3530     pPage->nFree = 0;
3531   }else{
3532     data = pPage->aData;
3533     hdr = pPage->hdrOffset;
3534     top = get2byte(&data[hdr+5]);
3535     cellOffset = pPage->cellOffset;
3536     end = cellOffset + 2*pPage->nCell + 2;
3537     ins = cellOffset + 2*i;
3538     if( end > top - sz ){
3539       int rc = defragmentPage(pPage);
3540       if( rc!=SQLITE_OK ) return rc;
3541       top = get2byte(&data[hdr+5]);
3542       assert( end + sz <= top );
3543     }
3544     idx = allocateSpace(pPage, sz);
3545     assert( idx>0 );
3546     assert( end <= get2byte(&data[hdr+5]) );
3547     pPage->nCell++;
3548     pPage->nFree -= 2;
3549     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
3550     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
3551       ptr[0] = ptr[-2];
3552       ptr[1] = ptr[-1];
3553     }
3554     put2byte(&data[ins], idx);
3555     put2byte(&data[hdr+3], pPage->nCell);
3556     pPage->idxShift = 1;
3557     pageIntegrity(pPage);
3558 #ifndef SQLITE_OMIT_AUTOVACUUM
3559     if( pPage->pBt->autoVacuum ){
3560       /* The cell may contain a pointer to an overflow page. If so, write
3561       ** the entry for the overflow page into the pointer map.
3562       */
3563       CellInfo info;
3564       parseCellPtr(pPage, pCell, &info);
3565       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
3566         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
3567         int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
3568         if( rc!=SQLITE_OK ) return rc;
3569       }
3570     }
3571 #endif
3572   }
3573 
3574   return SQLITE_OK;
3575 }
3576 
3577 /*
3578 ** Add a list of cells to a page.  The page should be initially empty.
3579 ** The cells are guaranteed to fit on the page.
3580 */
3581 static void assemblePage(
3582   MemPage *pPage,   /* The page to be assemblied */
3583   int nCell,        /* The number of cells to add to this page */
3584   u8 **apCell,      /* Pointers to cell bodies */
3585   int *aSize        /* Sizes of the cells */
3586 ){
3587   int i;            /* Loop counter */
3588   int totalSize;    /* Total size of all cells */
3589   int hdr;          /* Index of page header */
3590   int cellptr;      /* Address of next cell pointer */
3591   int cellbody;     /* Address of next cell body */
3592   u8 *data;         /* Data for the page */
3593 
3594   assert( pPage->nOverflow==0 );
3595   totalSize = 0;
3596   for(i=0; i<nCell; i++){
3597     totalSize += aSize[i];
3598   }
3599   assert( totalSize+2*nCell<=pPage->nFree );
3600   assert( pPage->nCell==0 );
3601   cellptr = pPage->cellOffset;
3602   data = pPage->aData;
3603   hdr = pPage->hdrOffset;
3604   put2byte(&data[hdr+3], nCell);
3605   if( nCell ){
3606     cellbody = allocateSpace(pPage, totalSize);
3607     assert( cellbody>0 );
3608     assert( pPage->nFree >= 2*nCell );
3609     pPage->nFree -= 2*nCell;
3610     for(i=0; i<nCell; i++){
3611       put2byte(&data[cellptr], cellbody);
3612       memcpy(&data[cellbody], apCell[i], aSize[i]);
3613       cellptr += 2;
3614       cellbody += aSize[i];
3615     }
3616     assert( cellbody==pPage->pBt->usableSize );
3617   }
3618   pPage->nCell = nCell;
3619 }
3620 
3621 /*
3622 ** The following parameters determine how many adjacent pages get involved
3623 ** in a balancing operation.  NN is the number of neighbors on either side
3624 ** of the page that participate in the balancing operation.  NB is the
3625 ** total number of pages that participate, including the target page and
3626 ** NN neighbors on either side.
3627 **
3628 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
3629 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
3630 ** in exchange for a larger degradation in INSERT and UPDATE performance.
3631 ** The value of NN appears to give the best results overall.
3632 */
3633 #define NN 1             /* Number of neighbors on either side of pPage */
3634 #define NB (NN*2+1)      /* Total pages involved in the balance */
3635 
3636 /* Forward reference */
3637 static int balance(MemPage*, int);
3638 
3639 #ifndef SQLITE_OMIT_QUICKBALANCE
3640 /*
3641 ** This version of balance() handles the common special case where
3642 ** a new entry is being inserted on the extreme right-end of the
3643 ** tree, in other words, when the new entry will become the largest
3644 ** entry in the tree.
3645 **
3646 ** Instead of trying balance the 3 right-most leaf pages, just add
3647 ** a new page to the right-hand side and put the one new entry in
3648 ** that page.  This leaves the right side of the tree somewhat
3649 ** unbalanced.  But odds are that we will be inserting new entries
3650 ** at the end soon afterwards so the nearly empty page will quickly
3651 ** fill up.  On average.
3652 **
3653 ** pPage is the leaf page which is the right-most page in the tree.
3654 ** pParent is its parent.  pPage must have a single overflow entry
3655 ** which is also the right-most entry on the page.
3656 */
3657 static int balance_quick(MemPage *pPage, MemPage *pParent){
3658   int rc;
3659   MemPage *pNew;
3660   Pgno pgnoNew;
3661   u8 *pCell;
3662   int szCell;
3663   CellInfo info;
3664   Btree *pBt = pPage->pBt;
3665   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
3666   int parentSize;                   /* Size of new divider cell */
3667   u8 parentCell[64];                /* Space for the new divider cell */
3668 
3669   /* Allocate a new page. Insert the overflow cell from pPage
3670   ** into it. Then remove the overflow cell from pPage.
3671   */
3672   rc = allocatePage(pBt, &pNew, &pgnoNew, 0, 0);
3673   if( rc!=SQLITE_OK ){
3674     return rc;
3675   }
3676   pCell = pPage->aOvfl[0].pCell;
3677   szCell = cellSizePtr(pPage, pCell);
3678   zeroPage(pNew, pPage->aData[0]);
3679   assemblePage(pNew, 1, &pCell, &szCell);
3680   pPage->nOverflow = 0;
3681 
3682   /* Set the parent of the newly allocated page to pParent. */
3683   pNew->pParent = pParent;
3684   sqlite3pager_ref(pParent->aData);
3685 
3686   /* pPage is currently the right-child of pParent. Change this
3687   ** so that the right-child is the new page allocated above and
3688   ** pPage is the next-to-right child.
3689   */
3690   assert( pPage->nCell>0 );
3691   parseCellPtr(pPage, findCell(pPage, pPage->nCell-1), &info);
3692   rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, &parentSize);
3693   if( rc!=SQLITE_OK ){
3694     return rc;
3695   }
3696   assert( parentSize<64 );
3697   rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
3698   if( rc!=SQLITE_OK ){
3699     return rc;
3700   }
3701   put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
3702   put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
3703 
3704 #ifndef SQLITE_OMIT_AUTOVACUUM
3705   /* If this is an auto-vacuum database, update the pointer map
3706   ** with entries for the new page, and any pointer from the
3707   ** cell on the page to an overflow page.
3708   */
3709   if( pBt->autoVacuum ){
3710     rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
3711     if( rc!=SQLITE_OK ){
3712       return rc;
3713     }
3714     rc = ptrmapPutOvfl(pNew, 0);
3715     if( rc!=SQLITE_OK ){
3716       return rc;
3717     }
3718   }
3719 #endif
3720 
3721   /* Release the reference to the new page and balance the parent page,
3722   ** in case the divider cell inserted caused it to become overfull.
3723   */
3724   releasePage(pNew);
3725   return balance(pParent, 0);
3726 }
3727 #endif /* SQLITE_OMIT_QUICKBALANCE */
3728 
3729 /*
3730 ** The ISAUTOVACUUM macro is used within balance_nonroot() to determine
3731 ** if the database supports auto-vacuum or not. Because it is used
3732 ** within an expression that is an argument to another macro
3733 ** (sqliteMallocRaw), it is not possible to use conditional compilation.
3734 ** So, this macro is defined instead.
3735 */
3736 #ifndef SQLITE_OMIT_AUTOVACUUM
3737 #define ISAUTOVACUUM (pBt->autoVacuum)
3738 #else
3739 #define ISAUTOVACUUM 0
3740 #endif
3741 
3742 /*
3743 ** This routine redistributes Cells on pPage and up to NN*2 siblings
3744 ** of pPage so that all pages have about the same amount of free space.
3745 ** Usually NN siblings on either side of pPage is used in the balancing,
3746 ** though more siblings might come from one side if pPage is the first
3747 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
3748 ** (something which can only happen if pPage is the root page or a
3749 ** child of root) then all available siblings participate in the balancing.
3750 **
3751 ** The number of siblings of pPage might be increased or decreased by one or
3752 ** two in an effort to keep pages nearly full but not over full. The root page
3753 ** is special and is allowed to be nearly empty. If pPage is
3754 ** the root page, then the depth of the tree might be increased
3755 ** or decreased by one, as necessary, to keep the root page from being
3756 ** overfull or completely empty.
3757 **
3758 ** Note that when this routine is called, some of the Cells on pPage
3759 ** might not actually be stored in pPage->aData[].  This can happen
3760 ** if the page is overfull.  Part of the job of this routine is to
3761 ** make sure all Cells for pPage once again fit in pPage->aData[].
3762 **
3763 ** In the course of balancing the siblings of pPage, the parent of pPage
3764 ** might become overfull or underfull.  If that happens, then this routine
3765 ** is called recursively on the parent.
3766 **
3767 ** If this routine fails for any reason, it might leave the database
3768 ** in a corrupted state.  So if this routine fails, the database should
3769 ** be rolled back.
3770 */
3771 static int balance_nonroot(MemPage *pPage){
3772   MemPage *pParent;            /* The parent of pPage */
3773   Btree *pBt;                  /* The whole database */
3774   int nCell = 0;               /* Number of cells in apCell[] */
3775   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
3776   int nOld;                    /* Number of pages in apOld[] */
3777   int nNew;                    /* Number of pages in apNew[] */
3778   int nDiv;                    /* Number of cells in apDiv[] */
3779   int i, j, k;                 /* Loop counters */
3780   int idx;                     /* Index of pPage in pParent->aCell[] */
3781   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
3782   int rc;                      /* The return code */
3783   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
3784   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
3785   int usableSpace;             /* Bytes in pPage beyond the header */
3786   int pageFlags;               /* Value of pPage->aData[0] */
3787   int subtotal;                /* Subtotal of bytes in cells on one page */
3788   int iSpace = 0;              /* First unused byte of aSpace[] */
3789   MemPage *apOld[NB];          /* pPage and up to two siblings */
3790   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
3791   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
3792   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
3793   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
3794   int idxDiv[NB];              /* Indices of divider cells in pParent */
3795   u8 *apDiv[NB];               /* Divider cells in pParent */
3796   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
3797   int szNew[NB+2];             /* Combined size of cells place on i-th page */
3798   u8 **apCell = 0;             /* All cells begin balanced */
3799   int *szCell;                 /* Local size of all cells in apCell[] */
3800   u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
3801   u8 *aSpace;                  /* Space to hold copies of dividers cells */
3802 #ifndef SQLITE_OMIT_AUTOVACUUM
3803   u8 *aFrom = 0;
3804 #endif
3805 
3806   /*
3807   ** Find the parent page.
3808   */
3809   assert( pPage->isInit );
3810   assert( sqlite3pager_iswriteable(pPage->aData) );
3811   pBt = pPage->pBt;
3812   pParent = pPage->pParent;
3813   sqlite3pager_write(pParent->aData);
3814   assert( pParent );
3815   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
3816 
3817 #ifndef SQLITE_OMIT_QUICKBALANCE
3818   /*
3819   ** A special case:  If a new entry has just been inserted into a
3820   ** table (that is, a btree with integer keys and all data at the leaves)
3821   ** and the new entry is the right-most entry in the tree (it has the
3822   ** largest key) then use the special balance_quick() routine for
3823   ** balancing.  balance_quick() is much faster and results in a tighter
3824   ** packing of data in the common case.
3825   */
3826   if( pPage->leaf &&
3827       pPage->intKey &&
3828       pPage->leafData &&
3829       pPage->nOverflow==1 &&
3830       pPage->aOvfl[0].idx==pPage->nCell &&
3831       pPage->pParent->pgno!=1 &&
3832       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
3833   ){
3834     /*
3835     ** TODO: Check the siblings to the left of pPage. It may be that
3836     ** they are not full and no new page is required.
3837     */
3838     return balance_quick(pPage, pParent);
3839   }
3840 #endif
3841 
3842   /*
3843   ** Find the cell in the parent page whose left child points back
3844   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
3845   ** is the rightmost child of pParent then set idx to pParent->nCell
3846   */
3847   if( pParent->idxShift ){
3848     Pgno pgno;
3849     pgno = pPage->pgno;
3850     assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
3851     for(idx=0; idx<pParent->nCell; idx++){
3852       if( get4byte(findCell(pParent, idx))==pgno ){
3853         break;
3854       }
3855     }
3856     assert( idx<pParent->nCell
3857              || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
3858   }else{
3859     idx = pPage->idxParent;
3860   }
3861 
3862   /*
3863   ** Initialize variables so that it will be safe to jump
3864   ** directly to balance_cleanup at any moment.
3865   */
3866   nOld = nNew = 0;
3867   sqlite3pager_ref(pParent->aData);
3868 
3869   /*
3870   ** Find sibling pages to pPage and the cells in pParent that divide
3871   ** the siblings.  An attempt is made to find NN siblings on either
3872   ** side of pPage.  More siblings are taken from one side, however, if
3873   ** pPage there are fewer than NN siblings on the other side.  If pParent
3874   ** has NB or fewer children then all children of pParent are taken.
3875   */
3876   nxDiv = idx - NN;
3877   if( nxDiv + NB > pParent->nCell ){
3878     nxDiv = pParent->nCell - NB + 1;
3879   }
3880   if( nxDiv<0 ){
3881     nxDiv = 0;
3882   }
3883   nDiv = 0;
3884   for(i=0, k=nxDiv; i<NB; i++, k++){
3885     if( k<pParent->nCell ){
3886       idxDiv[i] = k;
3887       apDiv[i] = findCell(pParent, k);
3888       nDiv++;
3889       assert( !pParent->leaf );
3890       pgnoOld[i] = get4byte(apDiv[i]);
3891     }else if( k==pParent->nCell ){
3892       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
3893     }else{
3894       break;
3895     }
3896     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
3897     if( rc ) goto balance_cleanup;
3898     apOld[i]->idxParent = k;
3899     apCopy[i] = 0;
3900     assert( i==nOld );
3901     nOld++;
3902     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
3903   }
3904 
3905   /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
3906   ** alignment */
3907   nMaxCells = (nMaxCells + 1)&~1;
3908 
3909   /*
3910   ** Allocate space for memory structures
3911   */
3912   apCell = sqliteMallocRaw(
3913        nMaxCells*sizeof(u8*)                           /* apCell */
3914      + nMaxCells*sizeof(int)                           /* szCell */
3915      + ROUND8(sizeof(MemPage))*NB                      /* aCopy */
3916      + pBt->pageSize*(5+NB)                            /* aSpace */
3917      + (ISAUTOVACUUM ? nMaxCells : 0)                  /* aFrom */
3918   );
3919   if( apCell==0 ){
3920     rc = SQLITE_NOMEM;
3921     goto balance_cleanup;
3922   }
3923   szCell = (int*)&apCell[nMaxCells];
3924   aCopy[0] = (u8*)&szCell[nMaxCells];
3925   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
3926   for(i=1; i<NB; i++){
3927     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
3928     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
3929   }
3930   aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
3931   assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
3932 #ifndef SQLITE_OMIT_AUTOVACUUM
3933   if( pBt->autoVacuum ){
3934     aFrom = &aSpace[5*pBt->pageSize];
3935   }
3936 #endif
3937 
3938   /*
3939   ** Make copies of the content of pPage and its siblings into aOld[].
3940   ** The rest of this function will use data from the copies rather
3941   ** that the original pages since the original pages will be in the
3942   ** process of being overwritten.
3943   */
3944   for(i=0; i<nOld; i++){
3945     MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize];
3946     p->aData = &((u8*)p)[-pBt->pageSize];
3947     memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
3948     /* The memcpy() above changes the value of p->aData so we have to
3949     ** set it again. */
3950     p->aData = &((u8*)p)[-pBt->pageSize];
3951   }
3952 
3953   /*
3954   ** Load pointers to all cells on sibling pages and the divider cells
3955   ** into the local apCell[] array.  Make copies of the divider cells
3956   ** into space obtained form aSpace[] and remove the the divider Cells
3957   ** from pParent.
3958   **
3959   ** If the siblings are on leaf pages, then the child pointers of the
3960   ** divider cells are stripped from the cells before they are copied
3961   ** into aSpace[].  In this way, all cells in apCell[] are without
3962   ** child pointers.  If siblings are not leaves, then all cell in
3963   ** apCell[] include child pointers.  Either way, all cells in apCell[]
3964   ** are alike.
3965   **
3966   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
3967   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
3968   */
3969   nCell = 0;
3970   leafCorrection = pPage->leaf*4;
3971   leafData = pPage->leafData && pPage->leaf;
3972   for(i=0; i<nOld; i++){
3973     MemPage *pOld = apCopy[i];
3974     int limit = pOld->nCell+pOld->nOverflow;
3975     for(j=0; j<limit; j++){
3976       assert( nCell<nMaxCells );
3977       apCell[nCell] = findOverflowCell(pOld, j);
3978       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
3979 #ifndef SQLITE_OMIT_AUTOVACUUM
3980       if( pBt->autoVacuum ){
3981         int a;
3982         aFrom[nCell] = i;
3983         for(a=0; a<pOld->nOverflow; a++){
3984           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
3985             aFrom[nCell] = 0xFF;
3986             break;
3987           }
3988         }
3989       }
3990 #endif
3991       nCell++;
3992     }
3993     if( i<nOld-1 ){
3994       int sz = cellSizePtr(pParent, apDiv[i]);
3995       if( leafData ){
3996         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
3997         ** are duplicates of keys on the child pages.  We need to remove
3998         ** the divider cells from pParent, but the dividers cells are not
3999         ** added to apCell[] because they are duplicates of child cells.
4000         */
4001         dropCell(pParent, nxDiv, sz);
4002       }else{
4003         u8 *pTemp;
4004         assert( nCell<nMaxCells );
4005         szCell[nCell] = sz;
4006         pTemp = &aSpace[iSpace];
4007         iSpace += sz;
4008         assert( iSpace<=pBt->pageSize*5 );
4009         memcpy(pTemp, apDiv[i], sz);
4010         apCell[nCell] = pTemp+leafCorrection;
4011 #ifndef SQLITE_OMIT_AUTOVACUUM
4012         if( pBt->autoVacuum ){
4013           aFrom[nCell] = 0xFF;
4014         }
4015 #endif
4016         dropCell(pParent, nxDiv, sz);
4017         szCell[nCell] -= leafCorrection;
4018         assert( get4byte(pTemp)==pgnoOld[i] );
4019         if( !pOld->leaf ){
4020           assert( leafCorrection==0 );
4021           /* The right pointer of the child page pOld becomes the left
4022           ** pointer of the divider cell */
4023           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4024         }else{
4025           assert( leafCorrection==4 );
4026         }
4027         nCell++;
4028       }
4029     }
4030   }
4031 
4032   /*
4033   ** Figure out the number of pages needed to hold all nCell cells.
4034   ** Store this number in "k".  Also compute szNew[] which is the total
4035   ** size of all cells on the i-th page and cntNew[] which is the index
4036   ** in apCell[] of the cell that divides page i from page i+1.
4037   ** cntNew[k] should equal nCell.
4038   **
4039   ** Values computed by this block:
4040   **
4041   **           k: The total number of sibling pages
4042   **    szNew[i]: Spaced used on the i-th sibling page.
4043   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
4044   **              the right of the i-th sibling page.
4045   ** usableSpace: Number of bytes of space available on each sibling.
4046   **
4047   */
4048   usableSpace = pBt->usableSize - 12 + leafCorrection;
4049   for(subtotal=k=i=0; i<nCell; i++){
4050     assert( i<nMaxCells );
4051     subtotal += szCell[i] + 2;
4052     if( subtotal > usableSpace ){
4053       szNew[k] = subtotal - szCell[i];
4054       cntNew[k] = i;
4055       if( leafData ){ i--; }
4056       subtotal = 0;
4057       k++;
4058     }
4059   }
4060   szNew[k] = subtotal;
4061   cntNew[k] = nCell;
4062   k++;
4063 
4064   /*
4065   ** The packing computed by the previous block is biased toward the siblings
4066   ** on the left side.  The left siblings are always nearly full, while the
4067   ** right-most sibling might be nearly empty.  This block of code attempts
4068   ** to adjust the packing of siblings to get a better balance.
4069   **
4070   ** This adjustment is more than an optimization.  The packing above might
4071   ** be so out of balance as to be illegal.  For example, the right-most
4072   ** sibling might be completely empty.  This adjustment is not optional.
4073   */
4074   for(i=k-1; i>0; i--){
4075     int szRight = szNew[i];  /* Size of sibling on the right */
4076     int szLeft = szNew[i-1]; /* Size of sibling on the left */
4077     int r;              /* Index of right-most cell in left sibling */
4078     int d;              /* Index of first cell to the left of right sibling */
4079 
4080     r = cntNew[i-1] - 1;
4081     d = r + 1 - leafData;
4082     assert( d<nMaxCells );
4083     assert( r<nMaxCells );
4084     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
4085       szRight += szCell[d] + 2;
4086       szLeft -= szCell[r] + 2;
4087       cntNew[i-1]--;
4088       r = cntNew[i-1] - 1;
4089       d = r + 1 - leafData;
4090     }
4091     szNew[i] = szRight;
4092     szNew[i-1] = szLeft;
4093   }
4094 
4095   /* Either we found one or more cells (cntnew[0])>0) or we are the
4096   ** a virtual root page.  A virtual root page is when the real root
4097   ** page is page 1 and we are the only child of that page.
4098   */
4099   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
4100 
4101   /*
4102   ** Allocate k new pages.  Reuse old pages where possible.
4103   */
4104   assert( pPage->pgno>1 );
4105   pageFlags = pPage->aData[0];
4106   for(i=0; i<k; i++){
4107     MemPage *pNew;
4108     if( i<nOld ){
4109       pNew = apNew[i] = apOld[i];
4110       pgnoNew[i] = pgnoOld[i];
4111       apOld[i] = 0;
4112       rc = sqlite3pager_write(pNew->aData);
4113       if( rc ) goto balance_cleanup;
4114     }else{
4115       rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
4116       if( rc ) goto balance_cleanup;
4117       apNew[i] = pNew;
4118     }
4119     nNew++;
4120     zeroPage(pNew, pageFlags);
4121   }
4122 
4123   /* Free any old pages that were not reused as new pages.
4124   */
4125   while( i<nOld ){
4126     rc = freePage(apOld[i]);
4127     if( rc ) goto balance_cleanup;
4128     releasePage(apOld[i]);
4129     apOld[i] = 0;
4130     i++;
4131   }
4132 
4133   /*
4134   ** Put the new pages in accending order.  This helps to
4135   ** keep entries in the disk file in order so that a scan
4136   ** of the table is a linear scan through the file.  That
4137   ** in turn helps the operating system to deliver pages
4138   ** from the disk more rapidly.
4139   **
4140   ** An O(n^2) insertion sort algorithm is used, but since
4141   ** n is never more than NB (a small constant), that should
4142   ** not be a problem.
4143   **
4144   ** When NB==3, this one optimization makes the database
4145   ** about 25% faster for large insertions and deletions.
4146   */
4147   for(i=0; i<k-1; i++){
4148     int minV = pgnoNew[i];
4149     int minI = i;
4150     for(j=i+1; j<k; j++){
4151       if( pgnoNew[j]<(unsigned)minV ){
4152         minI = j;
4153         minV = pgnoNew[j];
4154       }
4155     }
4156     if( minI>i ){
4157       int t;
4158       MemPage *pT;
4159       t = pgnoNew[i];
4160       pT = apNew[i];
4161       pgnoNew[i] = pgnoNew[minI];
4162       apNew[i] = apNew[minI];
4163       pgnoNew[minI] = t;
4164       apNew[minI] = pT;
4165     }
4166   }
4167   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
4168     pgnoOld[0],
4169     nOld>=2 ? pgnoOld[1] : 0,
4170     nOld>=3 ? pgnoOld[2] : 0,
4171     pgnoNew[0], szNew[0],
4172     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
4173     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
4174     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
4175     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
4176 
4177   /*
4178   ** Evenly distribute the data in apCell[] across the new pages.
4179   ** Insert divider cells into pParent as necessary.
4180   */
4181   j = 0;
4182   for(i=0; i<nNew; i++){
4183     /* Assemble the new sibling page. */
4184     MemPage *pNew = apNew[i];
4185     assert( j<nMaxCells );
4186     assert( pNew->pgno==pgnoNew[i] );
4187     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
4188     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
4189     assert( pNew->nOverflow==0 );
4190 
4191 #ifndef SQLITE_OMIT_AUTOVACUUM
4192     /* If this is an auto-vacuum database, update the pointer map entries
4193     ** that point to the siblings that were rearranged. These can be: left
4194     ** children of cells, the right-child of the page, or overflow pages
4195     ** pointed to by cells.
4196     */
4197     if( pBt->autoVacuum ){
4198       for(k=j; k<cntNew[i]; k++){
4199         assert( k<nMaxCells );
4200         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
4201           rc = ptrmapPutOvfl(pNew, k-j);
4202           if( rc!=SQLITE_OK ){
4203             goto balance_cleanup;
4204           }
4205         }
4206       }
4207     }
4208 #endif
4209 
4210     j = cntNew[i];
4211 
4212     /* If the sibling page assembled above was not the right-most sibling,
4213     ** insert a divider cell into the parent page.
4214     */
4215     if( i<nNew-1 && j<nCell ){
4216       u8 *pCell;
4217       u8 *pTemp;
4218       int sz;
4219 
4220       assert( j<nMaxCells );
4221       pCell = apCell[j];
4222       sz = szCell[j] + leafCorrection;
4223       if( !pNew->leaf ){
4224         memcpy(&pNew->aData[8], pCell, 4);
4225         pTemp = 0;
4226       }else if( leafData ){
4227 	/* If the tree is a leaf-data tree, and the siblings are leaves,
4228         ** then there is no divider cell in apCell[]. Instead, the divider
4229         ** cell consists of the integer key for the right-most cell of
4230         ** the sibling-page assembled above only.
4231         */
4232         CellInfo info;
4233         j--;
4234         parseCellPtr(pNew, apCell[j], &info);
4235         pCell = &aSpace[iSpace];
4236         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
4237         iSpace += sz;
4238         assert( iSpace<=pBt->pageSize*5 );
4239         pTemp = 0;
4240       }else{
4241         pCell -= 4;
4242         pTemp = &aSpace[iSpace];
4243         iSpace += sz;
4244         assert( iSpace<=pBt->pageSize*5 );
4245       }
4246       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
4247       if( rc!=SQLITE_OK ) goto balance_cleanup;
4248       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
4249 #ifndef SQLITE_OMIT_AUTOVACUUM
4250       /* If this is an auto-vacuum database, and not a leaf-data tree,
4251       ** then update the pointer map with an entry for the overflow page
4252       ** that the cell just inserted points to (if any).
4253       */
4254       if( pBt->autoVacuum && !leafData ){
4255         rc = ptrmapPutOvfl(pParent, nxDiv);
4256         if( rc!=SQLITE_OK ){
4257           goto balance_cleanup;
4258         }
4259       }
4260 #endif
4261       j++;
4262       nxDiv++;
4263     }
4264   }
4265   assert( j==nCell );
4266   if( (pageFlags & PTF_LEAF)==0 ){
4267     memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
4268   }
4269   if( nxDiv==pParent->nCell+pParent->nOverflow ){
4270     /* Right-most sibling is the right-most child of pParent */
4271     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
4272   }else{
4273     /* Right-most sibling is the left child of the first entry in pParent
4274     ** past the right-most divider entry */
4275     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
4276   }
4277 
4278   /*
4279   ** Reparent children of all cells.
4280   */
4281   for(i=0; i<nNew; i++){
4282     rc = reparentChildPages(apNew[i]);
4283     if( rc!=SQLITE_OK ) goto balance_cleanup;
4284   }
4285   rc = reparentChildPages(pParent);
4286   if( rc!=SQLITE_OK ) goto balance_cleanup;
4287 
4288   /*
4289   ** Balance the parent page.  Note that the current page (pPage) might
4290   ** have been added to the freelist so it might no longer be initialized.
4291   ** But the parent page will always be initialized.
4292   */
4293   assert( pParent->isInit );
4294   /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
4295   /* pageIntegrity(pPage);    // No! pPage might have been added to freelist */
4296   rc = balance(pParent, 0);
4297 
4298   /*
4299   ** Cleanup before returning.
4300   */
4301 balance_cleanup:
4302   sqliteFree(apCell);
4303   for(i=0; i<nOld; i++){
4304     releasePage(apOld[i]);
4305   }
4306   for(i=0; i<nNew; i++){
4307     releasePage(apNew[i]);
4308   }
4309   releasePage(pParent);
4310   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
4311           pPage->pgno, nOld, nNew, nCell));
4312   return rc;
4313 }
4314 
4315 /*
4316 ** This routine is called for the root page of a btree when the root
4317 ** page contains no cells.  This is an opportunity to make the tree
4318 ** shallower by one level.
4319 */
4320 static int balance_shallower(MemPage *pPage){
4321   MemPage *pChild;             /* The only child page of pPage */
4322   Pgno pgnoChild;              /* Page number for pChild */
4323   int rc = SQLITE_OK;          /* Return code from subprocedures */
4324   Btree *pBt;                  /* The main BTree structure */
4325   int mxCellPerPage;           /* Maximum number of cells per page */
4326   u8 **apCell;                 /* All cells from pages being balanced */
4327   int *szCell;                 /* Local size of all cells */
4328 
4329   assert( pPage->pParent==0 );
4330   assert( pPage->nCell==0 );
4331   pBt = pPage->pBt;
4332   mxCellPerPage = MX_CELL(pBt);
4333   apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
4334   if( apCell==0 ) return SQLITE_NOMEM;
4335   szCell = (int*)&apCell[mxCellPerPage];
4336   if( pPage->leaf ){
4337     /* The table is completely empty */
4338     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
4339   }else{
4340     /* The root page is empty but has one child.  Transfer the
4341     ** information from that one child into the root page if it
4342     ** will fit.  This reduces the depth of the tree by one.
4343     **
4344     ** If the root page is page 1, it has less space available than
4345     ** its child (due to the 100 byte header that occurs at the beginning
4346     ** of the database fle), so it might not be able to hold all of the
4347     ** information currently contained in the child.  If this is the
4348     ** case, then do not do the transfer.  Leave page 1 empty except
4349     ** for the right-pointer to the child page.  The child page becomes
4350     ** the virtual root of the tree.
4351     */
4352     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4353     assert( pgnoChild>0 );
4354     assert( pgnoChild<=sqlite3pager_pagecount(pPage->pBt->pPager) );
4355     rc = getPage(pPage->pBt, pgnoChild, &pChild);
4356     if( rc ) goto end_shallow_balance;
4357     if( pPage->pgno==1 ){
4358       rc = initPage(pChild, pPage);
4359       if( rc ) goto end_shallow_balance;
4360       assert( pChild->nOverflow==0 );
4361       if( pChild->nFree>=100 ){
4362         /* The child information will fit on the root page, so do the
4363         ** copy */
4364         int i;
4365         zeroPage(pPage, pChild->aData[0]);
4366         for(i=0; i<pChild->nCell; i++){
4367           apCell[i] = findCell(pChild,i);
4368           szCell[i] = cellSizePtr(pChild, apCell[i]);
4369         }
4370         assemblePage(pPage, pChild->nCell, apCell, szCell);
4371         /* Copy the right-pointer of the child to the parent. */
4372         put4byte(&pPage->aData[pPage->hdrOffset+8],
4373             get4byte(&pChild->aData[pChild->hdrOffset+8]));
4374         freePage(pChild);
4375         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
4376       }else{
4377         /* The child has more information that will fit on the root.
4378         ** The tree is already balanced.  Do nothing. */
4379         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
4380       }
4381     }else{
4382       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
4383       pPage->isInit = 0;
4384       pPage->pParent = 0;
4385       rc = initPage(pPage, 0);
4386       assert( rc==SQLITE_OK );
4387       freePage(pChild);
4388       TRACE(("BALANCE: transfer child %d into root %d\n",
4389               pChild->pgno, pPage->pgno));
4390     }
4391     rc = reparentChildPages(pPage);
4392     assert( pPage->nOverflow==0 );
4393 #ifndef SQLITE_OMIT_AUTOVACUUM
4394     if( pBt->autoVacuum ){
4395       int i;
4396       for(i=0; i<pPage->nCell; i++){
4397         rc = ptrmapPutOvfl(pPage, i);
4398         if( rc!=SQLITE_OK ){
4399           goto end_shallow_balance;
4400         }
4401       }
4402     }
4403 #endif
4404     if( rc!=SQLITE_OK ) goto end_shallow_balance;
4405     releasePage(pChild);
4406   }
4407 end_shallow_balance:
4408   sqliteFree(apCell);
4409   return rc;
4410 }
4411 
4412 
4413 /*
4414 ** The root page is overfull
4415 **
4416 ** When this happens, Create a new child page and copy the
4417 ** contents of the root into the child.  Then make the root
4418 ** page an empty page with rightChild pointing to the new
4419 ** child.   Finally, call balance_internal() on the new child
4420 ** to cause it to split.
4421 */
4422 static int balance_deeper(MemPage *pPage){
4423   int rc;             /* Return value from subprocedures */
4424   MemPage *pChild;    /* Pointer to a new child page */
4425   Pgno pgnoChild;     /* Page number of the new child page */
4426   Btree *pBt;         /* The BTree */
4427   int usableSize;     /* Total usable size of a page */
4428   u8 *data;           /* Content of the parent page */
4429   u8 *cdata;          /* Content of the child page */
4430   int hdr;            /* Offset to page header in parent */
4431   int brk;            /* Offset to content of first cell in parent */
4432 
4433   assert( pPage->pParent==0 );
4434   assert( pPage->nOverflow>0 );
4435   pBt = pPage->pBt;
4436   rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
4437   if( rc ) return rc;
4438   assert( sqlite3pager_iswriteable(pChild->aData) );
4439   usableSize = pBt->usableSize;
4440   data = pPage->aData;
4441   hdr = pPage->hdrOffset;
4442   brk = get2byte(&data[hdr+5]);
4443   cdata = pChild->aData;
4444   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
4445   memcpy(&cdata[brk], &data[brk], usableSize-brk);
4446   assert( pChild->isInit==0 );
4447   rc = initPage(pChild, pPage);
4448   if( rc ) goto balancedeeper_out;
4449   memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
4450   pChild->nOverflow = pPage->nOverflow;
4451   if( pChild->nOverflow ){
4452     pChild->nFree = 0;
4453   }
4454   assert( pChild->nCell==pPage->nCell );
4455   zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
4456   put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
4457   TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
4458 #ifndef SQLITE_OMIT_AUTOVACUUM
4459   if( pBt->autoVacuum ){
4460     int i;
4461     rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
4462     if( rc ) goto balancedeeper_out;
4463     for(i=0; i<pChild->nCell; i++){
4464       rc = ptrmapPutOvfl(pChild, i);
4465       if( rc!=SQLITE_OK ){
4466         return rc;
4467       }
4468     }
4469   }
4470 #endif
4471   rc = balance_nonroot(pChild);
4472 
4473 balancedeeper_out:
4474   releasePage(pChild);
4475   return rc;
4476 }
4477 
4478 /*
4479 ** Decide if the page pPage needs to be balanced.  If balancing is
4480 ** required, call the appropriate balancing routine.
4481 */
4482 static int balance(MemPage *pPage, int insert){
4483   int rc = SQLITE_OK;
4484   if( pPage->pParent==0 ){
4485     if( pPage->nOverflow>0 ){
4486       rc = balance_deeper(pPage);
4487     }
4488     if( rc==SQLITE_OK && pPage->nCell==0 ){
4489       rc = balance_shallower(pPage);
4490     }
4491   }else{
4492     if( pPage->nOverflow>0 ||
4493         (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
4494       rc = balance_nonroot(pPage);
4495     }
4496   }
4497   return rc;
4498 }
4499 
4500 /*
4501 ** This routine checks all cursors that point to table pgnoRoot.
4502 ** If any of those cursors other than pExclude were opened with
4503 ** wrFlag==0 then this routine returns SQLITE_LOCKED.  If all
4504 ** cursors that point to pgnoRoot were opened with wrFlag==1
4505 ** then this routine returns SQLITE_OK.
4506 **
4507 ** In addition to checking for read-locks (where a read-lock
4508 ** means a cursor opened with wrFlag==0) this routine also moves
4509 ** all cursors other than pExclude so that they are pointing to the
4510 ** first Cell on root page.  This is necessary because an insert
4511 ** or delete might change the number of cells on a page or delete
4512 ** a page entirely and we do not want to leave any cursors
4513 ** pointing to non-existant pages or cells.
4514 */
4515 static int checkReadLocks(Btree *pBt, Pgno pgnoRoot, BtCursor *pExclude){
4516   BtCursor *p;
4517   for(p=pBt->pCursor; p; p=p->pNext){
4518     if( p->pgnoRoot!=pgnoRoot || p==pExclude ) continue;
4519     if( p->wrFlag==0 ) return SQLITE_LOCKED;
4520     if( p->pPage->pgno!=p->pgnoRoot ){
4521       moveToRoot(p);
4522     }
4523   }
4524   return SQLITE_OK;
4525 }
4526 
4527 /*
4528 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
4529 ** and the data is given by (pData,nData).  The cursor is used only to
4530 ** define what table the record should be inserted into.  The cursor
4531 ** is left pointing at a random location.
4532 **
4533 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
4534 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
4535 */
4536 int sqlite3BtreeInsert(
4537   BtCursor *pCur,                /* Insert data into the table of this cursor */
4538   const void *pKey, i64 nKey,    /* The key of the new record */
4539   const void *pData, int nData   /* The data of the new record */
4540 ){
4541   int rc;
4542   int loc;
4543   int szNew;
4544   MemPage *pPage;
4545   Btree *pBt = pCur->pBt;
4546   unsigned char *oldCell;
4547   unsigned char *newCell = 0;
4548 
4549   if( pBt->inTrans!=TRANS_WRITE ){
4550     /* Must start a transaction before doing an insert */
4551     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4552   }
4553   assert( !pBt->readOnly );
4554   if( !pCur->wrFlag ){
4555     return SQLITE_PERM;   /* Cursor not open for writing */
4556   }
4557   if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
4558     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
4559   }
4560   rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
4561   if( rc ) return rc;
4562   pPage = pCur->pPage;
4563   assert( pPage->intKey || nKey>=0 );
4564   assert( pPage->leaf || !pPage->leafData );
4565   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
4566           pCur->pgnoRoot, nKey, nData, pPage->pgno,
4567           loc==0 ? "overwrite" : "new entry"));
4568   assert( pPage->isInit );
4569   rc = sqlite3pager_write(pPage->aData);
4570   if( rc ) return rc;
4571   newCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
4572   if( newCell==0 ) return SQLITE_NOMEM;
4573   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
4574   if( rc ) goto end_insert;
4575   assert( szNew==cellSizePtr(pPage, newCell) );
4576   assert( szNew<=MX_CELL_SIZE(pBt) );
4577   if( loc==0 && pCur->isValid ){
4578     int szOld;
4579     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
4580     oldCell = findCell(pPage, pCur->idx);
4581     if( !pPage->leaf ){
4582       memcpy(newCell, oldCell, 4);
4583     }
4584     szOld = cellSizePtr(pPage, oldCell);
4585     rc = clearCell(pPage, oldCell);
4586     if( rc ) goto end_insert;
4587     dropCell(pPage, pCur->idx, szOld);
4588   }else if( loc<0 && pPage->nCell>0 ){
4589     assert( pPage->leaf );
4590     pCur->idx++;
4591     pCur->info.nSize = 0;
4592   }else{
4593     assert( pPage->leaf );
4594   }
4595   rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
4596   if( rc!=SQLITE_OK ) goto end_insert;
4597   rc = balance(pPage, 1);
4598   /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
4599   /* fflush(stdout); */
4600   if( rc==SQLITE_OK ){
4601     moveToRoot(pCur);
4602   }
4603 end_insert:
4604   sqliteFree(newCell);
4605   return rc;
4606 }
4607 
4608 /*
4609 ** Delete the entry that the cursor is pointing to.  The cursor
4610 ** is left pointing at a random location.
4611 */
4612 int sqlite3BtreeDelete(BtCursor *pCur){
4613   MemPage *pPage = pCur->pPage;
4614   unsigned char *pCell;
4615   int rc;
4616   Pgno pgnoChild = 0;
4617   Btree *pBt = pCur->pBt;
4618 
4619   assert( pPage->isInit );
4620   if( pBt->inTrans!=TRANS_WRITE ){
4621     /* Must start a transaction before doing a delete */
4622     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4623   }
4624   assert( !pBt->readOnly );
4625   if( pCur->idx >= pPage->nCell ){
4626     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
4627   }
4628   if( !pCur->wrFlag ){
4629     return SQLITE_PERM;   /* Did not open this cursor for writing */
4630   }
4631   if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
4632     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
4633   }
4634   rc = sqlite3pager_write(pPage->aData);
4635   if( rc ) return rc;
4636 
4637   /* Locate the cell within it's page and leave pCell pointing to the
4638   ** data. The clearCell() call frees any overflow pages associated with the
4639   ** cell. The cell itself is still intact.
4640   */
4641   pCell = findCell(pPage, pCur->idx);
4642   if( !pPage->leaf ){
4643     pgnoChild = get4byte(pCell);
4644   }
4645   rc = clearCell(pPage, pCell);
4646   if( rc ) return rc;
4647 
4648   if( !pPage->leaf ){
4649     /*
4650     ** The entry we are about to delete is not a leaf so if we do not
4651     ** do something we will leave a hole on an internal page.
4652     ** We have to fill the hole by moving in a cell from a leaf.  The
4653     ** next Cell after the one to be deleted is guaranteed to exist and
4654     ** to be a leaf so we can use it.
4655     */
4656     BtCursor leafCur;
4657     unsigned char *pNext;
4658     int szNext;
4659     int notUsed;
4660     unsigned char *tempCell = 0;
4661     assert( !pPage->leafData );
4662     getTempCursor(pCur, &leafCur);
4663     rc = sqlite3BtreeNext(&leafCur, &notUsed);
4664     if( rc!=SQLITE_OK ){
4665       if( rc!=SQLITE_NOMEM ){
4666         rc = SQLITE_CORRUPT;  /* bkpt-CORRUPT */
4667       }
4668     }
4669     if( rc==SQLITE_OK ){
4670       rc = sqlite3pager_write(leafCur.pPage->aData);
4671     }
4672     if( rc==SQLITE_OK ){
4673       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
4674          pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
4675       dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
4676       pNext = findCell(leafCur.pPage, leafCur.idx);
4677       szNext = cellSizePtr(leafCur.pPage, pNext);
4678       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
4679       tempCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
4680       if( tempCell==0 ){
4681         rc = SQLITE_NOMEM;
4682       }
4683     }
4684     if( rc==SQLITE_OK ){
4685       rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
4686     }
4687     if( rc==SQLITE_OK ){
4688       put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
4689       rc = balance(pPage, 0);
4690     }
4691     if( rc==SQLITE_OK ){
4692       dropCell(leafCur.pPage, leafCur.idx, szNext);
4693       rc = balance(leafCur.pPage, 0);
4694     }
4695     sqliteFree(tempCell);
4696     releaseTempCursor(&leafCur);
4697   }else{
4698     TRACE(("DELETE: table=%d delete from leaf %d\n",
4699        pCur->pgnoRoot, pPage->pgno));
4700     dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
4701     rc = balance(pPage, 0);
4702   }
4703   if( rc==SQLITE_OK ){
4704     moveToRoot(pCur);
4705   }
4706   return rc;
4707 }
4708 
4709 /*
4710 ** Create a new BTree table.  Write into *piTable the page
4711 ** number for the root page of the new table.
4712 **
4713 ** The type of type is determined by the flags parameter.  Only the
4714 ** following values of flags are currently in use.  Other values for
4715 ** flags might not work:
4716 **
4717 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
4718 **     BTREE_ZERODATA                  Used for SQL indices
4719 */
4720 int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
4721   MemPage *pRoot;
4722   Pgno pgnoRoot;
4723   int rc;
4724   if( pBt->inTrans!=TRANS_WRITE ){
4725     /* Must start a transaction first */
4726     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4727   }
4728   assert( !pBt->readOnly );
4729 
4730   /* It is illegal to create a table if any cursors are open on the
4731   ** database. This is because in auto-vacuum mode the backend may
4732   ** need to move a database page to make room for the new root-page.
4733   ** If an open cursor was using the page a problem would occur.
4734   */
4735   if( pBt->pCursor ){
4736     return SQLITE_LOCKED;
4737   }
4738 
4739 #ifdef SQLITE_OMIT_AUTOVACUUM
4740   rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
4741   if( rc ) return rc;
4742 #else
4743   if( pBt->autoVacuum ){
4744     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
4745     MemPage *pPageMove; /* The page to move to. */
4746 
4747     /* Read the value of meta[3] from the database to determine where the
4748     ** root page of the new table should go. meta[3] is the largest root-page
4749     ** created so far, so the new root-page is (meta[3]+1).
4750     */
4751     rc = sqlite3BtreeGetMeta(pBt, 4, &pgnoRoot);
4752     if( rc!=SQLITE_OK ) return rc;
4753     pgnoRoot++;
4754 
4755     /* The new root-page may not be allocated on a pointer-map page, or the
4756     ** PENDING_BYTE page.
4757     */
4758     if( pgnoRoot==PTRMAP_PAGENO(pBt->usableSize, pgnoRoot) ||
4759         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
4760       pgnoRoot++;
4761     }
4762     assert( pgnoRoot>=3 );
4763 
4764     /* Allocate a page. The page that currently resides at pgnoRoot will
4765     ** be moved to the allocated page (unless the allocated page happens
4766     ** to reside at pgnoRoot).
4767     */
4768     rc = allocatePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
4769     if( rc!=SQLITE_OK ){
4770       return rc;
4771     }
4772 
4773     if( pgnoMove!=pgnoRoot ){
4774       u8 eType;
4775       Pgno iPtrPage;
4776 
4777       releasePage(pPageMove);
4778       rc = getPage(pBt, pgnoRoot, &pRoot);
4779       if( rc!=SQLITE_OK ){
4780         return rc;
4781       }
4782       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
4783       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
4784         releasePage(pRoot);
4785         return rc;
4786       }
4787       assert( eType!=PTRMAP_ROOTPAGE );
4788       assert( eType!=PTRMAP_FREEPAGE );
4789       rc = sqlite3pager_write(pRoot->aData);
4790       if( rc!=SQLITE_OK ){
4791         releasePage(pRoot);
4792         return rc;
4793       }
4794       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
4795       releasePage(pRoot);
4796       if( rc!=SQLITE_OK ){
4797         return rc;
4798       }
4799       rc = getPage(pBt, pgnoRoot, &pRoot);
4800       if( rc!=SQLITE_OK ){
4801         return rc;
4802       }
4803       rc = sqlite3pager_write(pRoot->aData);
4804       if( rc!=SQLITE_OK ){
4805         releasePage(pRoot);
4806         return rc;
4807       }
4808     }else{
4809       pRoot = pPageMove;
4810     }
4811 
4812     /* Update the pointer-map and meta-data with the new root-page number. */
4813     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
4814     if( rc ){
4815       releasePage(pRoot);
4816       return rc;
4817     }
4818     rc = sqlite3BtreeUpdateMeta(pBt, 4, pgnoRoot);
4819     if( rc ){
4820       releasePage(pRoot);
4821       return rc;
4822     }
4823 
4824   }else{
4825     rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
4826     if( rc ) return rc;
4827   }
4828 #endif
4829   assert( sqlite3pager_iswriteable(pRoot->aData) );
4830   zeroPage(pRoot, flags | PTF_LEAF);
4831   sqlite3pager_unref(pRoot->aData);
4832   *piTable = (int)pgnoRoot;
4833   return SQLITE_OK;
4834 }
4835 
4836 /*
4837 ** Erase the given database page and all its children.  Return
4838 ** the page to the freelist.
4839 */
4840 static int clearDatabasePage(
4841   Btree *pBt,           /* The BTree that contains the table */
4842   Pgno pgno,            /* Page number to clear */
4843   MemPage *pParent,     /* Parent page.  NULL for the root */
4844   int freePageFlag      /* Deallocate page if true */
4845 ){
4846   MemPage *pPage = 0;
4847   int rc;
4848   unsigned char *pCell;
4849   int i;
4850 
4851   if( pgno>sqlite3pager_pagecount(pBt->pPager) ){
4852     return SQLITE_CORRUPT;
4853   }
4854 
4855   rc = getAndInitPage(pBt, pgno, &pPage, pParent);
4856   if( rc ) goto cleardatabasepage_out;
4857   rc = sqlite3pager_write(pPage->aData);
4858   if( rc ) goto cleardatabasepage_out;
4859   for(i=0; i<pPage->nCell; i++){
4860     pCell = findCell(pPage, i);
4861     if( !pPage->leaf ){
4862       rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
4863       if( rc ) goto cleardatabasepage_out;
4864     }
4865     rc = clearCell(pPage, pCell);
4866     if( rc ) goto cleardatabasepage_out;
4867   }
4868   if( !pPage->leaf ){
4869     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
4870     if( rc ) goto cleardatabasepage_out;
4871   }
4872   if( freePageFlag ){
4873     rc = freePage(pPage);
4874   }else{
4875     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
4876   }
4877 
4878 cleardatabasepage_out:
4879   releasePage(pPage);
4880   return rc;
4881 }
4882 
4883 /*
4884 ** Delete all information from a single table in the database.  iTable is
4885 ** the page number of the root of the table.  After this routine returns,
4886 ** the root page is empty, but still exists.
4887 **
4888 ** This routine will fail with SQLITE_LOCKED if there are any open
4889 ** read cursors on the table.  Open write cursors are moved to the
4890 ** root of the table.
4891 */
4892 int sqlite3BtreeClearTable(Btree *pBt, int iTable){
4893   int rc;
4894   BtCursor *pCur;
4895   if( pBt->inTrans!=TRANS_WRITE ){
4896     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4897   }
4898   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
4899     if( pCur->pgnoRoot==(Pgno)iTable ){
4900       if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
4901       moveToRoot(pCur);
4902     }
4903   }
4904   rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
4905   if( rc ){
4906     sqlite3BtreeRollback(pBt);
4907   }
4908   return rc;
4909 }
4910 
4911 /*
4912 ** Erase all information in a table and add the root of the table to
4913 ** the freelist.  Except, the root of the principle table (the one on
4914 ** page 1) is never added to the freelist.
4915 **
4916 ** This routine will fail with SQLITE_LOCKED if there are any open
4917 ** cursors on the table.
4918 **
4919 ** If AUTOVACUUM is enabled and the page at iTable is not the last
4920 ** root page in the database file, then the last root page
4921 ** in the database file is moved into the slot formerly occupied by
4922 ** iTable and that last slot formerly occupied by the last root page
4923 ** is added to the freelist instead of iTable.  In this say, all
4924 ** root pages are kept at the beginning of the database file, which
4925 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
4926 ** page number that used to be the last root page in the file before
4927 ** the move.  If no page gets moved, *piMoved is set to 0.
4928 ** The last root page is recorded in meta[3] and the value of
4929 ** meta[3] is updated by this procedure.
4930 */
4931 int sqlite3BtreeDropTable(Btree *pBt, int iTable, int *piMoved){
4932   int rc;
4933   MemPage *pPage = 0;
4934 
4935   if( pBt->inTrans!=TRANS_WRITE ){
4936     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4937   }
4938 
4939   /* It is illegal to drop a table if any cursors are open on the
4940   ** database. This is because in auto-vacuum mode the backend may
4941   ** need to move another root-page to fill a gap left by the deleted
4942   ** root page. If an open cursor was using this page a problem would
4943   ** occur.
4944   */
4945   if( pBt->pCursor ){
4946     return SQLITE_LOCKED;
4947   }
4948 
4949   rc = getPage(pBt, (Pgno)iTable, &pPage);
4950   if( rc ) return rc;
4951   rc = sqlite3BtreeClearTable(pBt, iTable);
4952   if( rc ){
4953     releasePage(pPage);
4954     return rc;
4955   }
4956 
4957   *piMoved = 0;
4958 
4959   if( iTable>1 ){
4960 #ifdef SQLITE_OMIT_AUTOVACUUM
4961     rc = freePage(pPage);
4962     releasePage(pPage);
4963 #else
4964     if( pBt->autoVacuum ){
4965       Pgno maxRootPgno;
4966       rc = sqlite3BtreeGetMeta(pBt, 4, &maxRootPgno);
4967       if( rc!=SQLITE_OK ){
4968         releasePage(pPage);
4969         return rc;
4970       }
4971 
4972       if( iTable==maxRootPgno ){
4973         /* If the table being dropped is the table with the largest root-page
4974         ** number in the database, put the root page on the free list.
4975         */
4976         rc = freePage(pPage);
4977         releasePage(pPage);
4978         if( rc!=SQLITE_OK ){
4979           return rc;
4980         }
4981       }else{
4982         /* The table being dropped does not have the largest root-page
4983         ** number in the database. So move the page that does into the
4984         ** gap left by the deleted root-page.
4985         */
4986         MemPage *pMove;
4987         releasePage(pPage);
4988         rc = getPage(pBt, maxRootPgno, &pMove);
4989         if( rc!=SQLITE_OK ){
4990           return rc;
4991         }
4992         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
4993         releasePage(pMove);
4994         if( rc!=SQLITE_OK ){
4995           return rc;
4996         }
4997         rc = getPage(pBt, maxRootPgno, &pMove);
4998         if( rc!=SQLITE_OK ){
4999           return rc;
5000         }
5001         rc = freePage(pMove);
5002         releasePage(pMove);
5003         if( rc!=SQLITE_OK ){
5004           return rc;
5005         }
5006         *piMoved = maxRootPgno;
5007       }
5008 
5009       /* Set the new 'max-root-page' value in the database header. This
5010       ** is the old value less one, less one more if that happens to
5011       ** be a root-page number, less one again if that is the
5012       ** PENDING_BYTE_PAGE.
5013       */
5014       maxRootPgno--;
5015       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
5016         maxRootPgno--;
5017       }
5018       if( maxRootPgno==PTRMAP_PAGENO(pBt->usableSize, maxRootPgno) ){
5019         maxRootPgno--;
5020       }
5021       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
5022 
5023       rc = sqlite3BtreeUpdateMeta(pBt, 4, maxRootPgno);
5024     }else{
5025       rc = freePage(pPage);
5026       releasePage(pPage);
5027     }
5028 #endif
5029   }else{
5030     /* If sqlite3BtreeDropTable was called on page 1. */
5031     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
5032     releasePage(pPage);
5033   }
5034   return rc;
5035 }
5036 
5037 
5038 /*
5039 ** Read the meta-information out of a database file.  Meta[0]
5040 ** is the number of free pages currently in the database.  Meta[1]
5041 ** through meta[15] are available for use by higher layers.  Meta[0]
5042 ** is read-only, the others are read/write.
5043 **
5044 ** The schema layer numbers meta values differently.  At the schema
5045 ** layer (and the SetCookie and ReadCookie opcodes) the number of
5046 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
5047 */
5048 int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
5049   int rc;
5050   unsigned char *pP1;
5051 
5052   assert( idx>=0 && idx<=15 );
5053   rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
5054   if( rc ) return rc;
5055   *pMeta = get4byte(&pP1[36 + idx*4]);
5056   sqlite3pager_unref(pP1);
5057 
5058   /* If autovacuumed is disabled in this build but we are trying to
5059   ** access an autovacuumed database, then make the database readonly.
5060   */
5061 #ifdef SQLITE_OMIT_AUTOVACUUM
5062   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
5063 #endif
5064 
5065   return SQLITE_OK;
5066 }
5067 
5068 /*
5069 ** Write meta-information back into the database.  Meta[0] is
5070 ** read-only and may not be written.
5071 */
5072 int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
5073   unsigned char *pP1;
5074   int rc;
5075   assert( idx>=1 && idx<=15 );
5076   if( pBt->inTrans!=TRANS_WRITE ){
5077     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5078   }
5079   assert( pBt->pPage1!=0 );
5080   pP1 = pBt->pPage1->aData;
5081   rc = sqlite3pager_write(pP1);
5082   if( rc ) return rc;
5083   put4byte(&pP1[36 + idx*4], iMeta);
5084   return SQLITE_OK;
5085 }
5086 
5087 /*
5088 ** Return the flag byte at the beginning of the page that the cursor
5089 ** is currently pointing to.
5090 */
5091 int sqlite3BtreeFlags(BtCursor *pCur){
5092   MemPage *pPage = pCur->pPage;
5093   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
5094 }
5095 
5096 #ifdef SQLITE_DEBUG
5097 /*
5098 ** Print a disassembly of the given page on standard output.  This routine
5099 ** is used for debugging and testing only.
5100 */
5101 static int btreePageDump(Btree *pBt, int pgno, int recursive, MemPage *pParent){
5102   int rc;
5103   MemPage *pPage;
5104   int i, j, c;
5105   int nFree;
5106   u16 idx;
5107   int hdr;
5108   int nCell;
5109   int isInit;
5110   unsigned char *data;
5111   char range[20];
5112   unsigned char payload[20];
5113 
5114   rc = getPage(pBt, (Pgno)pgno, &pPage);
5115   isInit = pPage->isInit;
5116   if( pPage->isInit==0 ){
5117     initPage(pPage, pParent);
5118   }
5119   if( rc ){
5120     return rc;
5121   }
5122   hdr = pPage->hdrOffset;
5123   data = pPage->aData;
5124   c = data[hdr];
5125   pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
5126   pPage->zeroData = (c & PTF_ZERODATA)!=0;
5127   pPage->leafData = (c & PTF_LEAFDATA)!=0;
5128   pPage->leaf = (c & PTF_LEAF)!=0;
5129   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
5130   nCell = get2byte(&data[hdr+3]);
5131   sqlite3DebugPrintf("PAGE %d:  flags=0x%02x  frag=%d   parent=%d\n", pgno,
5132     data[hdr], data[hdr+7],
5133     (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
5134   assert( hdr == (pgno==1 ? 100 : 0) );
5135   idx = hdr + 12 - pPage->leaf*4;
5136   for(i=0; i<nCell; i++){
5137     CellInfo info;
5138     Pgno child;
5139     unsigned char *pCell;
5140     int sz;
5141     int addr;
5142 
5143     addr = get2byte(&data[idx + 2*i]);
5144     pCell = &data[addr];
5145     parseCellPtr(pPage, pCell, &info);
5146     sz = info.nSize;
5147     sprintf(range,"%d..%d", addr, addr+sz-1);
5148     if( pPage->leaf ){
5149       child = 0;
5150     }else{
5151       child = get4byte(pCell);
5152     }
5153     sz = info.nData;
5154     if( !pPage->intKey ) sz += info.nKey;
5155     if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
5156     memcpy(payload, &pCell[info.nHeader], sz);
5157     for(j=0; j<sz; j++){
5158       if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
5159     }
5160     payload[sz] = 0;
5161     sqlite3DebugPrintf(
5162       "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n",
5163       i, range, child, info.nKey, info.nData, payload
5164     );
5165   }
5166   if( !pPage->leaf ){
5167     sqlite3DebugPrintf("right_child: %d\n", get4byte(&data[hdr+8]));
5168   }
5169   nFree = 0;
5170   i = 0;
5171   idx = get2byte(&data[hdr+1]);
5172   while( idx>0 && idx<pPage->pBt->usableSize ){
5173     int sz = get2byte(&data[idx+2]);
5174     sprintf(range,"%d..%d", idx, idx+sz-1);
5175     nFree += sz;
5176     sqlite3DebugPrintf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
5177        i, range, sz, nFree);
5178     idx = get2byte(&data[idx]);
5179     i++;
5180   }
5181   if( idx!=0 ){
5182     sqlite3DebugPrintf("ERROR: next freeblock index out of range: %d\n", idx);
5183   }
5184   if( recursive && !pPage->leaf ){
5185     for(i=0; i<nCell; i++){
5186       unsigned char *pCell = findCell(pPage, i);
5187       btreePageDump(pBt, get4byte(pCell), 1, pPage);
5188       idx = get2byte(pCell);
5189     }
5190     btreePageDump(pBt, get4byte(&data[hdr+8]), 1, pPage);
5191   }
5192   pPage->isInit = isInit;
5193   sqlite3pager_unref(data);
5194   fflush(stdout);
5195   return SQLITE_OK;
5196 }
5197 int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
5198   return btreePageDump(pBt, pgno, recursive, 0);
5199 }
5200 #endif
5201 
5202 #ifdef SQLITE_TEST
5203 /*
5204 ** Fill aResult[] with information about the entry and page that the
5205 ** cursor is pointing to.
5206 **
5207 **   aResult[0] =  The page number
5208 **   aResult[1] =  The entry number
5209 **   aResult[2] =  Total number of entries on this page
5210 **   aResult[3] =  Cell size (local payload + header)
5211 **   aResult[4] =  Number of free bytes on this page
5212 **   aResult[5] =  Number of free blocks on the page
5213 **   aResult[6] =  Total payload size (local + overflow)
5214 **   aResult[7] =  Header size in bytes
5215 **   aResult[8] =  Local payload size
5216 **   aResult[9] =  Parent page number
5217 **
5218 ** This routine is used for testing and debugging only.
5219 */
5220 int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult, int upCnt){
5221   int cnt, idx;
5222   MemPage *pPage = pCur->pPage;
5223   BtCursor tmpCur;
5224 
5225   pageIntegrity(pPage);
5226   assert( pPage->isInit );
5227   getTempCursor(pCur, &tmpCur);
5228   while( upCnt-- ){
5229     moveToParent(&tmpCur);
5230   }
5231   pPage = tmpCur.pPage;
5232   pageIntegrity(pPage);
5233   aResult[0] = sqlite3pager_pagenumber(pPage->aData);
5234   assert( aResult[0]==pPage->pgno );
5235   aResult[1] = tmpCur.idx;
5236   aResult[2] = pPage->nCell;
5237   if( tmpCur.idx>=0 && tmpCur.idx<pPage->nCell ){
5238     getCellInfo(&tmpCur);
5239     aResult[3] = tmpCur.info.nSize;
5240     aResult[6] = tmpCur.info.nData;
5241     aResult[7] = tmpCur.info.nHeader;
5242     aResult[8] = tmpCur.info.nLocal;
5243   }else{
5244     aResult[3] = 0;
5245     aResult[6] = 0;
5246     aResult[7] = 0;
5247     aResult[8] = 0;
5248   }
5249   aResult[4] = pPage->nFree;
5250   cnt = 0;
5251   idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
5252   while( idx>0 && idx<pPage->pBt->usableSize ){
5253     cnt++;
5254     idx = get2byte(&pPage->aData[idx]);
5255   }
5256   aResult[5] = cnt;
5257   if( pPage->pParent==0 || isRootPage(pPage) ){
5258     aResult[9] = 0;
5259   }else{
5260     aResult[9] = pPage->pParent->pgno;
5261   }
5262   releaseTempCursor(&tmpCur);
5263   return SQLITE_OK;
5264 }
5265 #endif
5266 
5267 /*
5268 ** Return the pager associated with a BTree.  This routine is used for
5269 ** testing and debugging only.
5270 */
5271 Pager *sqlite3BtreePager(Btree *pBt){
5272   return pBt->pPager;
5273 }
5274 
5275 /*
5276 ** This structure is passed around through all the sanity checking routines
5277 ** in order to keep track of some global state information.
5278 */
5279 typedef struct IntegrityCk IntegrityCk;
5280 struct IntegrityCk {
5281   Btree *pBt;    /* The tree being checked out */
5282   Pager *pPager; /* The associated pager.  Also accessible by pBt->pPager */
5283   int nPage;     /* Number of pages in the database */
5284   int *anRef;    /* Number of times each page is referenced */
5285   char *zErrMsg; /* An error message.  NULL of no errors seen. */
5286 };
5287 
5288 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
5289 /*
5290 ** Append a message to the error message string.
5291 */
5292 static void checkAppendMsg(
5293   IntegrityCk *pCheck,
5294   char *zMsg1,
5295   const char *zFormat,
5296   ...
5297 ){
5298   va_list ap;
5299   char *zMsg2;
5300   va_start(ap, zFormat);
5301   zMsg2 = sqlite3VMPrintf(zFormat, ap);
5302   va_end(ap);
5303   if( zMsg1==0 ) zMsg1 = "";
5304   if( pCheck->zErrMsg ){
5305     char *zOld = pCheck->zErrMsg;
5306     pCheck->zErrMsg = 0;
5307     sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
5308     sqliteFree(zOld);
5309   }else{
5310     sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
5311   }
5312   sqliteFree(zMsg2);
5313 }
5314 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5315 
5316 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
5317 /*
5318 ** Add 1 to the reference count for page iPage.  If this is the second
5319 ** reference to the page, add an error message to pCheck->zErrMsg.
5320 ** Return 1 if there are 2 ore more references to the page and 0 if
5321 ** if this is the first reference to the page.
5322 **
5323 ** Also check that the page number is in bounds.
5324 */
5325 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
5326   if( iPage==0 ) return 1;
5327   if( iPage>pCheck->nPage || iPage<0 ){
5328     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
5329     return 1;
5330   }
5331   if( pCheck->anRef[iPage]==1 ){
5332     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
5333     return 1;
5334   }
5335   return  (pCheck->anRef[iPage]++)>1;
5336 }
5337 
5338 #ifndef SQLITE_OMIT_AUTOVACUUM
5339 /*
5340 ** Check that the entry in the pointer-map for page iChild maps to
5341 ** page iParent, pointer type ptrType. If not, append an error message
5342 ** to pCheck.
5343 */
5344 static void checkPtrmap(
5345   IntegrityCk *pCheck,   /* Integrity check context */
5346   Pgno iChild,           /* Child page number */
5347   u8 eType,              /* Expected pointer map type */
5348   Pgno iParent,          /* Expected pointer map parent page number */
5349   char *zContext         /* Context description (used for error msg) */
5350 ){
5351   int rc;
5352   u8 ePtrmapType;
5353   Pgno iPtrmapParent;
5354 
5355   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
5356   if( rc!=SQLITE_OK ){
5357     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
5358     return;
5359   }
5360 
5361   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
5362     checkAppendMsg(pCheck, zContext,
5363       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
5364       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
5365   }
5366 }
5367 #endif
5368 
5369 /*
5370 ** Check the integrity of the freelist or of an overflow page list.
5371 ** Verify that the number of pages on the list is N.
5372 */
5373 static void checkList(
5374   IntegrityCk *pCheck,  /* Integrity checking context */
5375   int isFreeList,       /* True for a freelist.  False for overflow page list */
5376   int iPage,            /* Page number for first page in the list */
5377   int N,                /* Expected number of pages in the list */
5378   char *zContext        /* Context for error messages */
5379 ){
5380   int i;
5381   int expected = N;
5382   int iFirst = iPage;
5383   while( N-- > 0 ){
5384     unsigned char *pOvfl;
5385     if( iPage<1 ){
5386       checkAppendMsg(pCheck, zContext,
5387          "%d of %d pages missing from overflow list starting at %d",
5388           N+1, expected, iFirst);
5389       break;
5390     }
5391     if( checkRef(pCheck, iPage, zContext) ) break;
5392     if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
5393       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
5394       break;
5395     }
5396     if( isFreeList ){
5397       int n = get4byte(&pOvfl[4]);
5398 #ifndef SQLITE_OMIT_AUTOVACUUM
5399       if( pCheck->pBt->autoVacuum ){
5400         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
5401       }
5402 #endif
5403       if( n>pCheck->pBt->usableSize/4-8 ){
5404         checkAppendMsg(pCheck, zContext,
5405            "freelist leaf count too big on page %d", iPage);
5406         N--;
5407       }else{
5408         for(i=0; i<n; i++){
5409           Pgno iFreePage = get4byte(&pOvfl[8+i*4]);
5410 #ifndef SQLITE_OMIT_AUTOVACUUM
5411           if( pCheck->pBt->autoVacuum ){
5412             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
5413           }
5414 #endif
5415           checkRef(pCheck, iFreePage, zContext);
5416         }
5417         N -= n;
5418       }
5419     }
5420 #ifndef SQLITE_OMIT_AUTOVACUUM
5421     else{
5422       /* If this database supports auto-vacuum and iPage is not the last
5423       ** page in this overflow list, check that the pointer-map entry for
5424       ** the following page matches iPage.
5425       */
5426       if( pCheck->pBt->autoVacuum && N>0 ){
5427         i = get4byte(pOvfl);
5428         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
5429       }
5430     }
5431 #endif
5432     iPage = get4byte(pOvfl);
5433     sqlite3pager_unref(pOvfl);
5434   }
5435 }
5436 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5437 
5438 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
5439 /*
5440 ** Do various sanity checks on a single page of a tree.  Return
5441 ** the tree depth.  Root pages return 0.  Parents of root pages
5442 ** return 1, and so forth.
5443 **
5444 ** These checks are done:
5445 **
5446 **      1.  Make sure that cells and freeblocks do not overlap
5447 **          but combine to completely cover the page.
5448 **  NO  2.  Make sure cell keys are in order.
5449 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
5450 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
5451 **      5.  Check the integrity of overflow pages.
5452 **      6.  Recursively call checkTreePage on all children.
5453 **      7.  Verify that the depth of all children is the same.
5454 **      8.  Make sure this page is at least 33% full or else it is
5455 **          the root of the tree.
5456 */
5457 static int checkTreePage(
5458   IntegrityCk *pCheck,  /* Context for the sanity check */
5459   int iPage,            /* Page number of the page to check */
5460   MemPage *pParent,     /* Parent page */
5461   char *zParentContext, /* Parent context */
5462   char *zLowerBound,    /* All keys should be greater than this, if not NULL */
5463   int nLower,           /* Number of characters in zLowerBound */
5464   char *zUpperBound,    /* All keys should be less than this, if not NULL */
5465   int nUpper            /* Number of characters in zUpperBound */
5466 ){
5467   MemPage *pPage;
5468   int i, rc, depth, d2, pgno, cnt;
5469   int hdr, cellStart;
5470   int nCell;
5471   u8 *data;
5472   BtCursor cur;
5473   Btree *pBt;
5474   int maxLocal, usableSize;
5475   char zContext[100];
5476   char *hit;
5477 
5478   sprintf(zContext, "Page %d: ", iPage);
5479 
5480   /* Check that the page exists
5481   */
5482   cur.pBt = pBt = pCheck->pBt;
5483   usableSize = pBt->usableSize;
5484   if( iPage==0 ) return 0;
5485   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
5486   if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
5487     checkAppendMsg(pCheck, zContext,
5488        "unable to get the page. error code=%d", rc);
5489     return 0;
5490   }
5491   maxLocal = pPage->leafData ? pBt->maxLeaf : pBt->maxLocal;
5492   if( (rc = initPage(pPage, pParent))!=0 ){
5493     checkAppendMsg(pCheck, zContext, "initPage() returns error code %d", rc);
5494     releasePage(pPage);
5495     return 0;
5496   }
5497 
5498   /* Check out all the cells.
5499   */
5500   depth = 0;
5501   cur.pPage = pPage;
5502   for(i=0; i<pPage->nCell; i++){
5503     u8 *pCell;
5504     int sz;
5505     CellInfo info;
5506 
5507     /* Check payload overflow pages
5508     */
5509     sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
5510     pCell = findCell(pPage,i);
5511     parseCellPtr(pPage, pCell, &info);
5512     sz = info.nData;
5513     if( !pPage->intKey ) sz += info.nKey;
5514     if( sz>info.nLocal ){
5515       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
5516       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
5517 #ifndef SQLITE_OMIT_AUTOVACUUM
5518       if( pBt->autoVacuum ){
5519         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
5520       }
5521 #endif
5522       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
5523     }
5524 
5525     /* Check sanity of left child page.
5526     */
5527     if( !pPage->leaf ){
5528       pgno = get4byte(pCell);
5529 #ifndef SQLITE_OMIT_AUTOVACUUM
5530       if( pBt->autoVacuum ){
5531         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
5532       }
5533 #endif
5534       d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
5535       if( i>0 && d2!=depth ){
5536         checkAppendMsg(pCheck, zContext, "Child page depth differs");
5537       }
5538       depth = d2;
5539     }
5540   }
5541   if( !pPage->leaf ){
5542     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5543     sprintf(zContext, "On page %d at right child: ", iPage);
5544 #ifndef SQLITE_OMIT_AUTOVACUUM
5545     if( pBt->autoVacuum ){
5546       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
5547     }
5548 #endif
5549     checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
5550   }
5551 
5552   /* Check for complete coverage of the page
5553   */
5554   data = pPage->aData;
5555   hdr = pPage->hdrOffset;
5556   hit = sqliteMalloc( usableSize );
5557   if( hit ){
5558     memset(hit, 1, get2byte(&data[hdr+5]));
5559     nCell = get2byte(&data[hdr+3]);
5560     cellStart = hdr + 12 - 4*pPage->leaf;
5561     for(i=0; i<nCell; i++){
5562       int pc = get2byte(&data[cellStart+i*2]);
5563       int size = cellSizePtr(pPage, &data[pc]);
5564       int j;
5565       if( (pc+size-1)>=usableSize || pc<0 ){
5566         checkAppendMsg(pCheck, 0,
5567             "Corruption detected in cell %d on page %d",i,iPage,0);
5568       }else{
5569         for(j=pc+size-1; j>=pc; j--) hit[j]++;
5570       }
5571     }
5572     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
5573            cnt++){
5574       int size = get2byte(&data[i+2]);
5575       int j;
5576       if( (i+size-1)>=usableSize || i<0 ){
5577         checkAppendMsg(pCheck, 0,
5578             "Corruption detected in cell %d on page %d",i,iPage,0);
5579       }else{
5580         for(j=i+size-1; j>=i; j--) hit[j]++;
5581       }
5582       i = get2byte(&data[i]);
5583     }
5584     for(i=cnt=0; i<usableSize; i++){
5585       if( hit[i]==0 ){
5586         cnt++;
5587       }else if( hit[i]>1 ){
5588         checkAppendMsg(pCheck, 0,
5589           "Multiple uses for byte %d of page %d", i, iPage);
5590         break;
5591       }
5592     }
5593     if( cnt!=data[hdr+7] ){
5594       checkAppendMsg(pCheck, 0,
5595           "Fragmented space is %d byte reported as %d on page %d",
5596           cnt, data[hdr+7], iPage);
5597     }
5598   }
5599   sqliteFree(hit);
5600 
5601   releasePage(pPage);
5602   return depth+1;
5603 }
5604 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5605 
5606 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
5607 /*
5608 ** This routine does a complete check of the given BTree file.  aRoot[] is
5609 ** an array of pages numbers were each page number is the root page of
5610 ** a table.  nRoot is the number of entries in aRoot.
5611 **
5612 ** If everything checks out, this routine returns NULL.  If something is
5613 ** amiss, an error message is written into memory obtained from malloc()
5614 ** and a pointer to that error message is returned.  The calling function
5615 ** is responsible for freeing the error message when it is done.
5616 */
5617 char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
5618   int i;
5619   int nRef;
5620   IntegrityCk sCheck;
5621 
5622   nRef = *sqlite3pager_stats(pBt->pPager);
5623   if( lockBtreeWithRetry(pBt)!=SQLITE_OK ){
5624     return sqliteStrDup("Unable to acquire a read lock on the database");
5625   }
5626   sCheck.pBt = pBt;
5627   sCheck.pPager = pBt->pPager;
5628   sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
5629   if( sCheck.nPage==0 ){
5630     unlockBtreeIfUnused(pBt);
5631     return 0;
5632   }
5633   sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
5634   if( !sCheck.anRef ){
5635     unlockBtreeIfUnused(pBt);
5636     return sqlite3MPrintf("Unable to malloc %d bytes",
5637         (sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
5638   }
5639   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
5640   i = PENDING_BYTE_PAGE(pBt);
5641   if( i<=sCheck.nPage ){
5642     sCheck.anRef[i] = 1;
5643   }
5644   sCheck.zErrMsg = 0;
5645 
5646   /* Check the integrity of the freelist
5647   */
5648   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
5649             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
5650 
5651   /* Check all the tables.
5652   */
5653   for(i=0; i<nRoot; i++){
5654     if( aRoot[i]==0 ) continue;
5655 #ifndef SQLITE_OMIT_AUTOVACUUM
5656     if( pBt->autoVacuum && aRoot[i]>1 ){
5657       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
5658     }
5659 #endif
5660     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
5661   }
5662 
5663   /* Make sure every page in the file is referenced
5664   */
5665   for(i=1; i<=sCheck.nPage; i++){
5666 #ifdef SQLITE_OMIT_AUTOVACUUM
5667     if( sCheck.anRef[i]==0 ){
5668       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
5669     }
5670 #else
5671     /* If the database supports auto-vacuum, make sure no tables contain
5672     ** references to pointer-map pages.
5673     */
5674     if( sCheck.anRef[i]==0 &&
5675        (PTRMAP_PAGENO(pBt->usableSize, i)!=i || !pBt->autoVacuum) ){
5676       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
5677     }
5678     if( sCheck.anRef[i]!=0 &&
5679        (PTRMAP_PAGENO(pBt->usableSize, i)==i && pBt->autoVacuum) ){
5680       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
5681     }
5682 #endif
5683   }
5684 
5685   /* Make sure this analysis did not leave any unref() pages
5686   */
5687   unlockBtreeIfUnused(pBt);
5688   if( nRef != *sqlite3pager_stats(pBt->pPager) ){
5689     checkAppendMsg(&sCheck, 0,
5690       "Outstanding page count goes from %d to %d during this analysis",
5691       nRef, *sqlite3pager_stats(pBt->pPager)
5692     );
5693   }
5694 
5695   /* Clean  up and report errors.
5696   */
5697   sqliteFree(sCheck.anRef);
5698   return sCheck.zErrMsg;
5699 }
5700 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5701 
5702 /*
5703 ** Return the full pathname of the underlying database file.
5704 */
5705 const char *sqlite3BtreeGetFilename(Btree *pBt){
5706   assert( pBt->pPager!=0 );
5707   return sqlite3pager_filename(pBt->pPager);
5708 }
5709 
5710 /*
5711 ** Return the pathname of the directory that contains the database file.
5712 */
5713 const char *sqlite3BtreeGetDirname(Btree *pBt){
5714   assert( pBt->pPager!=0 );
5715   return sqlite3pager_dirname(pBt->pPager);
5716 }
5717 
5718 /*
5719 ** Return the pathname of the journal file for this database. The return
5720 ** value of this routine is the same regardless of whether the journal file
5721 ** has been created or not.
5722 */
5723 const char *sqlite3BtreeGetJournalname(Btree *pBt){
5724   assert( pBt->pPager!=0 );
5725   return sqlite3pager_journalname(pBt->pPager);
5726 }
5727 
5728 #ifndef SQLITE_OMIT_VACUUM
5729 /*
5730 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
5731 ** must be active for both files.
5732 **
5733 ** The size of file pBtFrom may be reduced by this operation.
5734 ** If anything goes wrong, the transaction on pBtFrom is rolled back.
5735 */
5736 int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
5737   int rc = SQLITE_OK;
5738   Pgno i, nPage, nToPage;
5739 
5740   if( pBtTo->inTrans!=TRANS_WRITE || pBtFrom->inTrans!=TRANS_WRITE ){
5741     return SQLITE_ERROR;
5742   }
5743   if( pBtTo->pCursor ) return SQLITE_BUSY;
5744   nToPage = sqlite3pager_pagecount(pBtTo->pPager);
5745   nPage = sqlite3pager_pagecount(pBtFrom->pPager);
5746   for(i=1; rc==SQLITE_OK && i<=nPage; i++){
5747     void *pPage;
5748     rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
5749     if( rc ) break;
5750     rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
5751     if( rc ) break;
5752     sqlite3pager_unref(pPage);
5753   }
5754   for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
5755     void *pPage;
5756     rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
5757     if( rc ) break;
5758     rc = sqlite3pager_write(pPage);
5759     sqlite3pager_unref(pPage);
5760     sqlite3pager_dont_write(pBtTo->pPager, i);
5761   }
5762   if( !rc && nPage<nToPage ){
5763     rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
5764   }
5765   if( rc ){
5766     sqlite3BtreeRollback(pBtTo);
5767   }
5768   return rc;
5769 }
5770 #endif /* SQLITE_OMIT_VACUUM */
5771 
5772 /*
5773 ** Return non-zero if a transaction is active.
5774 */
5775 int sqlite3BtreeIsInTrans(Btree *pBt){
5776   return (pBt && (pBt->inTrans==TRANS_WRITE));
5777 }
5778 
5779 /*
5780 ** Return non-zero if a statement transaction is active.
5781 */
5782 int sqlite3BtreeIsInStmt(Btree *pBt){
5783   return (pBt && pBt->inStmt);
5784 }
5785 
5786 /*
5787 ** This call is a no-op if no write-transaction is currently active on pBt.
5788 **
5789 ** Otherwise, sync the database file for the btree pBt. zMaster points to
5790 ** the name of a master journal file that should be written into the
5791 ** individual journal file, or is NULL, indicating no master journal file
5792 ** (single database transaction).
5793 **
5794 ** When this is called, the master journal should already have been
5795 ** created, populated with this journal pointer and synced to disk.
5796 **
5797 ** Once this is routine has returned, the only thing required to commit
5798 ** the write-transaction for this database file is to delete the journal.
5799 */
5800 int sqlite3BtreeSync(Btree *pBt, const char *zMaster){
5801   if( pBt->inTrans==TRANS_WRITE ){
5802 #ifndef SQLITE_OMIT_AUTOVACUUM
5803     Pgno nTrunc = 0;
5804     if( pBt->autoVacuum ){
5805       int rc = autoVacuumCommit(pBt, &nTrunc);
5806       if( rc!=SQLITE_OK ) return rc;
5807     }
5808     return sqlite3pager_sync(pBt->pPager, zMaster, nTrunc);
5809 #endif
5810     return sqlite3pager_sync(pBt->pPager, zMaster, 0);
5811   }
5812   return SQLITE_OK;
5813 }
5814 
5815 #ifndef SQLITE_OMIT_GLOBALRECOVER
5816 /*
5817 ** Reset the btree and underlying pager after a malloc() failure. Any
5818 ** transaction that was active when malloc() failed is rolled back.
5819 */
5820 int sqlite3BtreeReset(Btree *pBt){
5821   if( pBt->pCursor ) return SQLITE_BUSY;
5822   pBt->inTrans = TRANS_NONE;
5823   unlockBtreeIfUnused(pBt);
5824   return sqlite3pager_reset(pBt->pPager);
5825 }
5826 #endif
5827