xref: /sqlite-3.40.0/src/btree.c (revision ef5ecb41)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.163 2004/06/09 20:03:09 drh Exp $
13 **
14 ** This file implements a external (disk-based) database using BTrees.
15 ** For a detailed discussion of BTrees, refer to
16 **
17 **     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18 **     "Sorting And Searching", pages 473-480. Addison-Wesley
19 **     Publishing Company, Reading, Massachusetts.
20 **
21 ** The basic idea is that each page of the file contains N database
22 ** entries and N+1 pointers to subpages.
23 **
24 **   ----------------------------------------------------------------
25 **   |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26 **   ----------------------------------------------------------------
27 **
28 ** All of the keys on the page that Ptr(0) points to have values less
29 ** than Key(0).  All of the keys on page Ptr(1) and its subpages have
30 ** values greater than Key(0) and less than Key(1).  All of the keys
31 ** on Ptr(N+1) and its subpages have values greater than Key(N).  And
32 ** so forth.
33 **
34 ** Finding a particular key requires reading O(log(M)) pages from the
35 ** disk where M is the number of entries in the tree.
36 **
37 ** In this implementation, a single file can hold one or more separate
38 ** BTrees.  Each BTree is identified by the index of its root page.  The
39 ** key and data for any entry are combined to form the "payload".  A
40 ** fixed amount of payload can be carried directly on the database
41 ** page.  If the payload is larger than the preset amount then surplus
42 ** bytes are stored on overflow pages.  The payload for an entry
43 ** and the preceding pointer are combined to form a "Cell".  Each
44 ** page has a small header which contains the Ptr(N+1) pointer and other
45 ** information such as the size of key and data.
46 **
47 ** FORMAT DETAILS
48 **
49 ** The file is divided into pages.  The first page is called page 1,
50 ** the second is page 2, and so forth.  A page number of zero indicates
51 ** "no such page".  The page size can be anything between 512 and 65536.
52 ** Each page can be either a btree page, a freelist page or an overflow
53 ** page.
54 **
55 ** The first page is always a btree page.  The first 100 bytes of the first
56 ** page contain a special header (the "file header") that describes the file.
57 ** The format of the file header is as follows:
58 **
59 **   OFFSET   SIZE    DESCRIPTION
60 **      0      16     Header string: "SQLite format 3\000"
61 **     16       2     Page size in bytes.
62 **     18       1     File format write version
63 **     19       1     File format read version
64 **     20       1     Bytes of unused space at the end of each page
65 **     21       1     Max embedded payload fraction
66 **     22       1     Min embedded payload fraction
67 **     23       1     Min leaf payload fraction
68 **     24       4     File change counter
69 **     28       4     Reserved for future use
70 **     32       4     First freelist page
71 **     36       4     Number of freelist pages in the file
72 **     40      60     15 4-byte meta values passed to higher layers
73 **
74 ** All of the integer values are big-endian (most significant byte first).
75 **
76 ** The file change counter is incremented when the database is changed more
77 ** than once within the same second.  This counter, together with the
78 ** modification time of the file, allows other processes to know
79 ** when the file has changed and thus when they need to flush their
80 ** cache.
81 **
82 ** The max embedded payload fraction is the amount of the total usable
83 ** space in a page that can be consumed by a single cell for standard
84 ** B-tree (non-LEAFDATA) tables.  A value of 255 means 100%.  The default
85 ** is to limit the maximum cell size so that at least 4 cells will fit
86 ** on one page.  Thus the default max embedded payload fraction is 64.
87 **
88 ** If the payload for a cell is larger than the max payload, then extra
89 ** payload is spilled to overflow pages.  Once an overflow page is allocated,
90 ** as many bytes as possible are moved into the overflow pages without letting
91 ** the cell size drop below the min embedded payload fraction.
92 **
93 ** The min leaf payload fraction is like the min embedded payload fraction
94 ** except that it applies to leaf nodes in a LEAFDATA tree.  The maximum
95 ** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
96 ** not specified in the header.
97 **
98 ** Each btree pages is divided into three sections:  The header, the
99 ** cell pointer array, and the cell area area.  Page 1 also has a 100-byte
100 ** file header that occurs before the page header.
101 **
102 **      |----------------|
103 **      | file header    |   100 bytes.  Page 1 only.
104 **      |----------------|
105 **      | page header    |   8 bytes for leaves.  12 bytes for interior nodes
106 **      |----------------|
107 **      | cell pointer   |   |  2 bytes per cell.  Sorted order.
108 **      | array          |   |  Grows downward
109 **      |                |   v
110 **      |----------------|
111 **      | unallocated    |
112 **      | space          |
113 **      |----------------|   ^  Grows upwards
114 **      | cell content   |   |  Arbitrary order interspersed with freeblocks.
115 **      | area           |   |  and free space fragments.
116 **      |----------------|
117 **
118 ** The page headers looks like this:
119 **
120 **   OFFSET   SIZE     DESCRIPTION
121 **      0       1      Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
122 **      1       2      byte offset to the first freeblock
123 **      3       2      number of cells on this page
124 **      5       2      first byte of the cell content area
125 **      7       1      number of fragmented free bytes
126 **      8       4      Right child (the Ptr(N+1) value).  Omitted on leaves.
127 **
128 ** The flags define the format of this btree page.  The leaf flag means that
129 ** this page has no children.  The zerodata flag means that this page carries
130 ** only keys and no data.  The intkey flag means that the key is a single
131 ** variable length integer at the beginning of the payload.
132 **
133 ** The cell pointer array begins on the first byte after the page header.
134 ** The cell pointer array contains zero or more 2-byte numbers which are
135 ** offsets from the beginning of the page to the cell content in the cell
136 ** content area.  The cell pointers occur in sorted order.  The system strives
137 ** to keep free space after the last cell pointer so that new cells can
138 ** be easily added without have to defragment the page.
139 **
140 ** Cell content is stored at the very end of the page and grows toward the
141 ** beginning of the page.
142 **
143 ** Unused space within the cell content area is collected into a linked list of
144 ** freeblocks.  Each freeblock is at least 4 bytes in size.  The byte offset
145 ** to the first freeblock is given in the header.  Freeblocks occur in
146 ** increasing order.  Because a freeblock must be at least 4 bytes in size,
147 ** any group of 3 or fewer unused bytes in the cell content area cannot
148 ** exist on the freeblock chain.  A group of 3 or fewer free bytes is called
149 ** a fragment.  The total number of bytes in all fragments is recorded.
150 ** in the page header at offset 7.
151 **
152 **    SIZE    DESCRIPTION
153 **      2     Byte offset of the next freeblock
154 **      2     Bytes in this freeblock
155 **
156 ** Cells are of variable length.  Cells are stored in the cell content area at
157 ** the end of the page.  Pointers to the cells are in the cell pointer array
158 ** that immediately follows the page header.  Cells is not necessarily
159 ** contiguous or in order, but cell pointers are contiguous and in order.
160 **
161 ** Cell content makes use of variable length integers.  A variable
162 ** length integer is 1 to 9 bytes where the lower 7 bits of each
163 ** byte are used.  The integer consists of all bytes that have bit 8 set and
164 ** the first byte with bit 8 clear.  The most significant byte of the integer
165 ** appears first.  A variable-length integer may not be more than 9 bytes long.
166 ** As a special case, all 8 bytes of the 9th byte are used as data.  This
167 ** allows a 64-bit integer to be encoded in 9 bytes.
168 **
169 **    0x00                      becomes  0x00000000
170 **    0x7f                      becomes  0x0000007f
171 **    0x81 0x00                 becomes  0x00000080
172 **    0x82 0x00                 becomes  0x00000100
173 **    0x80 0x7f                 becomes  0x0000007f
174 **    0x8a 0x91 0xd1 0xac 0x78  becomes  0x12345678
175 **    0x81 0x81 0x81 0x81 0x01  becomes  0x10204081
176 **
177 ** Variable length integers are used for rowids and to hold the number of
178 ** bytes of key and data in a btree cell.
179 **
180 ** The content of a cell looks like this:
181 **
182 **    SIZE    DESCRIPTION
183 **      4     Page number of the left child. Omitted if leaf flag is set.
184 **     var    Number of bytes of data. Omitted if the zerodata flag is set.
185 **     var    Number of bytes of key. Or the key itself if intkey flag is set.
186 **      *     Payload
187 **      4     First page of the overflow chain.  Omitted if no overflow
188 **
189 ** Overflow pages form a linked list.  Each page except the last is completely
190 ** filled with data (pagesize - 4 bytes).  The last page can have as little
191 ** as 1 byte of data.
192 **
193 **    SIZE    DESCRIPTION
194 **      4     Page number of next overflow page
195 **      *     Data
196 **
197 ** Freelist pages come in two subtypes: trunk pages and leaf pages.  The
198 ** file header points to first in a linked list of trunk page.  Each trunk
199 ** page points to multiple leaf pages.  The content of a leaf page is
200 ** unspecified.  A trunk page looks like this:
201 **
202 **    SIZE    DESCRIPTION
203 **      4     Page number of next trunk page
204 **      4     Number of leaf pointers on this page
205 **      *     zero or more pages numbers of leaves
206 */
207 #include "sqliteInt.h"
208 #include "pager.h"
209 #include "btree.h"
210 #include <assert.h>
211 
212 
213 /* Maximum page size.  The upper bound on this value is 65536 (a limit
214 ** imposed by the 2-byte size of cell array pointers.)  The
215 ** maximum page size determines the amount of stack space allocated
216 ** by many of the routines in this module.  On embedded architectures
217 ** or any machine where memory and especially stack memory is limited,
218 ** one may wish to chose a smaller value for the maximum page size.
219 */
220 #ifndef MX_PAGE_SIZE
221 # define MX_PAGE_SIZE 1024
222 #endif
223 
224 /* The following value is the maximum cell size assuming a maximum page
225 ** size give above.
226 */
227 #define MX_CELL_SIZE  (MX_PAGE_SIZE-8)
228 
229 /* The maximum number of cells on a single page of the database.  This
230 ** assumes a minimum cell size of 3 bytes.  Such small cells will be
231 ** exceedingly rare, but they are possible.
232 */
233 #define MX_CELL ((MX_PAGE_SIZE-8)/3)
234 
235 /* Forward declarations */
236 typedef struct MemPage MemPage;
237 
238 /*
239 ** This is a magic string that appears at the beginning of every
240 ** SQLite database in order to identify the file as a real database.
241 **                                  123456789 123456 */
242 static const char zMagicHeader[] = "SQLite format 3";
243 
244 /*
245 ** Page type flags.  An ORed combination of these flags appear as the
246 ** first byte of every BTree page.
247 */
248 #define PTF_INTKEY    0x01
249 #define PTF_ZERODATA  0x02
250 #define PTF_LEAFDATA  0x04
251 #define PTF_LEAF      0x08
252 
253 /*
254 ** As each page of the file is loaded into memory, an instance of the following
255 ** structure is appended and initialized to zero.  This structure stores
256 ** information about the page that is decoded from the raw file page.
257 **
258 ** The pParent field points back to the parent page.  This allows us to
259 ** walk up the BTree from any leaf to the root.  Care must be taken to
260 ** unref() the parent page pointer when this page is no longer referenced.
261 ** The pageDestructor() routine handles that chore.
262 */
263 struct MemPage {
264   u8 isInit;           /* True if previously initialized. MUST BE FIRST! */
265   u8 idxShift;         /* True if Cell indices have changed */
266   u8 nOverflow;        /* Number of overflow cell bodies in aCell[] */
267   u8 intKey;           /* True if intkey flag is set */
268   u8 leaf;             /* True if leaf flag is set */
269   u8 zeroData;         /* True if table stores keys only */
270   u8 leafData;         /* True if tables stores data on leaves only */
271   u8 hasData;          /* True if this page stores data */
272   u8 hdrOffset;        /* 100 for page 1.  0 otherwise */
273   u8 childPtrSize;     /* 0 if leaf==1.  4 if leaf==0 */
274   u16 maxLocal;        /* Copy of Btree.maxLocal or Btree.maxLeaf */
275   u16 minLocal;        /* Copy of Btree.minLocal or Btree.minLeaf */
276   u16 cellOffset;      /* Index in aData of first cell pointer */
277   u16 idxParent;       /* Index in parent of this node */
278   u16 nFree;           /* Number of free bytes on the page */
279   u16 nCell;           /* Number of cells on this page, local and ovfl */
280   struct _OvflCell {   /* Cells that will not fit on aData[] */
281     u8 *pCell;           /* Pointers to the body of the overflow cell */
282     u16 idx;             /* Insert this cell before idx-th non-overflow cell */
283   } aOvfl[5];
284   struct Btree *pBt;   /* Pointer back to BTree structure */
285   u8 *aData;           /* Pointer back to the start of the page */
286   Pgno pgno;           /* Page number for this page */
287   MemPage *pParent;    /* The parent of this page.  NULL for root */
288 };
289 
290 /*
291 ** The in-memory image of a disk page has the auxiliary information appended
292 ** to the end.  EXTRA_SIZE is the number of bytes of space needed to hold
293 ** that extra information.
294 */
295 #define EXTRA_SIZE sizeof(MemPage)
296 
297 /*
298 ** Everything we need to know about an open database
299 */
300 struct Btree {
301   Pager *pPager;        /* The page cache */
302   BtCursor *pCursor;    /* A list of all open cursors */
303   MemPage *pPage1;      /* First page of the database */
304   u8 inTrans;           /* True if a transaction is in progress */
305   u8 inStmt;            /* True if we are in a statement subtransaction */
306   u8 readOnly;          /* True if the underlying file is readonly */
307   u8 maxEmbedFrac;      /* Maximum payload as % of total page size */
308   u8 minEmbedFrac;      /* Minimum payload as % of total page size */
309   u8 minLeafFrac;       /* Minimum leaf payload as % of total page size */
310   u16 pageSize;         /* Total number of bytes on a page */
311   u16 usableSize;       /* Number of usable bytes on each page */
312   int maxLocal;         /* Maximum local payload in non-LEAFDATA tables */
313   int minLocal;         /* Minimum local payload in non-LEAFDATA tables */
314   int maxLeaf;          /* Maximum local payload in a LEAFDATA table */
315   int minLeaf;          /* Minimum local payload in a LEAFDATA table */
316 };
317 typedef Btree Bt;
318 
319 /*
320 ** Btree.inTrans may take one of the following values.
321 */
322 #define TRANS_NONE  0
323 #define TRANS_READ  1
324 #define TRANS_WRITE 2
325 
326 /*
327 ** An instance of the following structure is used to hold information
328 ** about a cell.  The parseCellPtr() function fills in this structure
329 ** based on information extract from the raw disk page.
330 */
331 typedef struct CellInfo CellInfo;
332 struct CellInfo {
333   u8 *pCell;     /* Pointer to the start of cell content */
334   i64 nKey;      /* The key for INTKEY tables, or number of bytes in key */
335   u32 nData;     /* Number of bytes of data */
336   u16 nHeader;   /* Size of the cell content header in bytes */
337   u16 nLocal;    /* Amount of payload held locally */
338   u16 iOverflow; /* Offset to overflow page number.  Zero if no overflow */
339   u16 nSize;     /* Size of the cell content on the main b-tree page */
340 };
341 
342 /*
343 ** A cursor is a pointer to a particular entry in the BTree.
344 ** The entry is identified by its MemPage and the index in
345 ** MemPage.aCell[] of the entry.
346 */
347 struct BtCursor {
348   Btree *pBt;               /* The Btree to which this cursor belongs */
349   BtCursor *pNext, *pPrev;  /* Forms a linked list of all cursors */
350   BtCursor *pShared;        /* Loop of cursors with the same root page */
351   int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
352   void *pArg;               /* First arg to xCompare() */
353   Pgno pgnoRoot;            /* The root page of this tree */
354   MemPage *pPage;           /* Page that contains the entry */
355   int idx;                  /* Index of the entry in pPage->aCell[] */
356   CellInfo info;            /* A parse of the cell we are pointing at */
357   u8 wrFlag;                /* True if writable */
358   u8 isValid;               /* TRUE if points to a valid entry */
359   u8 status;                /* Set to SQLITE_ABORT if cursors is invalidated */
360 };
361 
362 /*
363 ** Read or write a two- and four-byte big-endian integer values.
364 */
365 static u32 get2byte(unsigned char *p){
366   return (p[0]<<8) | p[1];
367 }
368 static u32 get4byte(unsigned char *p){
369   return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
370 }
371 static void put2byte(unsigned char *p, u32 v){
372   p[0] = v>>8;
373   p[1] = v;
374 }
375 static void put4byte(unsigned char *p, u32 v){
376   p[0] = v>>24;
377   p[1] = v>>16;
378   p[2] = v>>8;
379   p[3] = v;
380 }
381 
382 /*
383 ** Routines to read and write variable-length integers.  These used to
384 ** be defined locally, but now we use the varint routines in the util.c
385 ** file.
386 */
387 #define getVarint    sqlite3GetVarint
388 #define getVarint32  sqlite3GetVarint32
389 #define putVarint    sqlite3PutVarint
390 
391 /*
392 ** Given a btree page and a cell index (0 means the first cell on
393 ** the page, 1 means the second cell, and so forth) return a pointer
394 ** to the cell content.
395 **
396 ** This routine works only for pages that do not contain overflow cells.
397 */
398 static u8 *findCell(MemPage *pPage, int iCell){
399   u8 *data = pPage->aData;
400   assert( iCell>=0 );
401   assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
402   return data + get2byte(&data[pPage->cellOffset+2*iCell]);
403 }
404 
405 /*
406 ** This a more complex version of findCell() that works for
407 ** pages that do contain overflow cells.  See insert
408 */
409 static u8 *findOverflowCell(MemPage *pPage, int iCell){
410   int i;
411   for(i=pPage->nOverflow-1; i>=0; i--){
412     if( pPage->aOvfl[i].idx<=iCell ){
413       if( pPage->aOvfl[i].idx==iCell ){
414         return pPage->aOvfl[i].pCell;
415       }
416       iCell--;
417     }
418   }
419   return findCell(pPage, iCell);
420 }
421 
422 /*
423 ** Parse a cell content block and fill in the CellInfo structure.  There
424 ** are two versions of this function.  parseCell() takes a cell index
425 ** as the second argument and parseCellPtr() takes a pointer to the
426 ** body of the cell as its second argument.
427 */
428 static void parseCellPtr(
429   MemPage *pPage,         /* Page containing the cell */
430   u8 *pCell,              /* Pointer to the cell text. */
431   CellInfo *pInfo         /* Fill in this structure */
432 ){
433   int n;                  /* Number bytes in cell content header */
434   u32 nPayload;           /* Number of bytes of cell payload */
435 
436   pInfo->pCell = pCell;
437   assert( pPage->leaf==0 || pPage->leaf==1 );
438   n = pPage->childPtrSize;
439   assert( n==4-4*pPage->leaf );
440   if( pPage->hasData ){
441     n += getVarint32(&pCell[n], &nPayload);
442   }else{
443     nPayload = 0;
444   }
445   n += getVarint(&pCell[n], &pInfo->nKey);
446   pInfo->nHeader = n;
447   pInfo->nData = nPayload;
448   if( !pPage->intKey ){
449     nPayload += pInfo->nKey;
450   }
451   if( nPayload<=pPage->maxLocal ){
452     /* This is the (easy) common case where the entire payload fits
453     ** on the local page.  No overflow is required.
454     */
455     int nSize;          /* Total size of cell content in bytes */
456     pInfo->nLocal = nPayload;
457     pInfo->iOverflow = 0;
458     nSize = nPayload + n;
459     if( nSize<4 ){
460       nSize = 4;        /* Minimum cell size is 4 */
461     }
462     pInfo->nSize = nSize;
463   }else{
464     /* If the payload will not fit completely on the local page, we have
465     ** to decide how much to store locally and how much to spill onto
466     ** overflow pages.  The strategy is to minimize the amount of unused
467     ** space on overflow pages while keeping the amount of local storage
468     ** in between minLocal and maxLocal.
469     **
470     ** Warning:  changing the way overflow payload is distributed in any
471     ** way will result in an incompatible file format.
472     */
473     int minLocal;  /* Minimum amount of payload held locally */
474     int maxLocal;  /* Maximum amount of payload held locally */
475     int surplus;   /* Overflow payload available for local storage */
476 
477     minLocal = pPage->minLocal;
478     maxLocal = pPage->maxLocal;
479     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
480     if( surplus <= maxLocal ){
481       pInfo->nLocal = surplus;
482     }else{
483       pInfo->nLocal = minLocal;
484     }
485     pInfo->iOverflow = pInfo->nLocal + n;
486     pInfo->nSize = pInfo->iOverflow + 4;
487   }
488 }
489 static void parseCell(
490   MemPage *pPage,         /* Page containing the cell */
491   int iCell,              /* The cell index.  First cell is 0 */
492   CellInfo *pInfo         /* Fill in this structure */
493 ){
494   parseCellPtr(pPage, findCell(pPage, iCell), pInfo);
495 }
496 
497 /*
498 ** Compute the total number of bytes that a Cell needs in the cell
499 ** data area of the btree-page.  The return number includes the cell
500 ** data header and the local payload, but not any overflow page or
501 ** the space used by the cell pointer.
502 */
503 static int cellSize(MemPage *pPage, int iCell){
504   CellInfo info;
505   parseCell(pPage, iCell, &info);
506   return info.nSize;
507 }
508 static int cellSizePtr(MemPage *pPage, u8 *pCell){
509   CellInfo info;
510   parseCellPtr(pPage, pCell, &info);
511   return info.nSize;
512 }
513 
514 /*
515 ** Do sanity checking on a page.  Throw an exception if anything is
516 ** not right.
517 **
518 ** This routine is used for internal error checking only.  It is omitted
519 ** from most builds.
520 */
521 #if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
522 static void _pageIntegrity(MemPage *pPage){
523   int usableSize;
524   u8 *data;
525   int i, j, idx, c, pc, hdr, nFree;
526   int cellOffset;
527   int nCell, cellLimit;
528   u8 used[MX_PAGE_SIZE];
529 
530   usableSize = pPage->pBt->usableSize;
531   assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] );
532   hdr = pPage->hdrOffset;
533   assert( hdr==(pPage->pgno==1 ? 100 : 0) );
534   assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
535   c = pPage->aData[hdr];
536   if( pPage->isInit ){
537     assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
538     assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
539     assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) );
540     assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) );
541     assert( pPage->hasData ==
542              !(pPage->zeroData || (!pPage->leaf && pPage->leafData)) );
543     assert( pPage->cellOffset==pPage->hdrOffset+12-4*pPage->leaf );
544     assert( pPage->nCell = get2byte(&pPage->aData[hdr+3]) );
545   }
546   data = pPage->aData;
547   memset(used, 0, usableSize);
548   for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
549   nFree = 0;
550   pc = get2byte(&data[hdr+1]);
551   while( pc ){
552     int size;
553     assert( pc>0 && pc<usableSize-4 );
554     size = get2byte(&data[pc+2]);
555     assert( pc+size<=usableSize );
556     nFree += size;
557     for(i=pc; i<pc+size; i++){
558       assert( used[i]==0 );
559       used[i] = 1;
560     }
561     pc = get2byte(&data[pc]);
562   }
563   idx = 0;
564   nCell = get2byte(&data[hdr+3]);
565   cellLimit = get2byte(&data[hdr+5]);
566   assert( pPage->isInit==0
567          || pPage->nFree==nFree+data[hdr+7]+cellLimit-(cellOffset+2*nCell) );
568   cellOffset = pPage->cellOffset;
569   for(i=0; i<nCell; i++){
570     int size;
571     pc = get2byte(&data[cellOffset+2*i]);
572     assert( pc>0 && pc<usableSize-4 );
573     size = cellSize(pPage, &data[pc]);
574     assert( pc+size<=usableSize );
575     for(j=pc; j<pc+size; j++){
576       assert( used[j]==0 );
577       used[j] = 1;
578     }
579   }
580   for(i=cellOffset+2*nCell; i<cellimit; i++){
581     assert( used[i]==0 );
582     used[i] = 1;
583   }
584   nFree = 0;
585   for(i=0; i<usableSize; i++){
586     assert( used[i]<=1 );
587     if( used[i]==0 ) nFree++;
588   }
589   assert( nFree==data[hdr+7] );
590 }
591 #define pageIntegrity(X) _pageIntegrity(X)
592 #else
593 # define pageIntegrity(X)
594 #endif
595 
596 /*
597 ** Defragment the page given.  All Cells are moved to the
598 ** beginning of the page and all free space is collected
599 ** into one big FreeBlk at the end of the page.
600 */
601 static void defragmentPage(MemPage *pPage){
602   int i;                     /* Loop counter */
603   int pc;                    /* Address of a i-th cell */
604   int addr;                  /* Offset of first byte after cell pointer array */
605   int hdr;                   /* Offset to the page header */
606   int size;                  /* Size of a cell */
607   int usableSize;            /* Number of usable bytes on a page */
608   int cellOffset;            /* Offset to the cell pointer array */
609   int brk;                   /* Offset to the cell content area */
610   int nCell;                 /* Number of cells on the page */
611   unsigned char *data;               /* The page data */
612   unsigned char temp[MX_PAGE_SIZE];  /* Temp holding area for cell content */
613 
614   assert( sqlite3pager_iswriteable(pPage->aData) );
615   assert( pPage->pBt!=0 );
616   assert( pPage->pBt->usableSize <= MX_PAGE_SIZE );
617   assert( pPage->nOverflow==0 );
618   data = pPage->aData;
619   hdr = pPage->hdrOffset;
620   cellOffset = pPage->cellOffset;
621   nCell = pPage->nCell;
622   assert( nCell==get2byte(&data[hdr+3]) );
623   usableSize = pPage->pBt->usableSize;
624   brk = get2byte(&data[hdr+5]);
625   memcpy(&temp[brk], &data[brk], usableSize - brk);
626   brk = usableSize;
627   for(i=0; i<nCell; i++){
628     u8 *pAddr;     /* The i-th cell pointer */
629     pAddr = &data[cellOffset + i*2];
630     pc = get2byte(pAddr);
631     assert( pc<pPage->pBt->usableSize );
632     size = cellSizePtr(pPage, &temp[pc]);
633     brk -= size;
634     memcpy(&data[brk], &temp[pc], size);
635     put2byte(pAddr, brk);
636   }
637   assert( brk>=cellOffset+2*nCell );
638   put2byte(&data[hdr+5], brk);
639   data[hdr+1] = 0;
640   data[hdr+2] = 0;
641   data[hdr+7] = 0;
642   addr = cellOffset+2*nCell;
643   memset(&data[addr], 0, brk-addr);
644 }
645 
646 /*
647 ** Allocate nByte bytes of space on a page.
648 **
649 ** Return the index into pPage->aData[] of the first byte of
650 ** the new allocation. Or return 0 if there is not enough free
651 ** space on the page to satisfy the allocation request.
652 **
653 ** If the page contains nBytes of free space but does not contain
654 ** nBytes of contiguous free space, then this routine automatically
655 ** calls defragementPage() to consolidate all free space before
656 ** allocating the new chunk.
657 */
658 static int allocateSpace(MemPage *pPage, int nByte){
659   int addr, pc, hdr;
660   int size;
661   int nFrag;
662   int top;
663   int nCell;
664   int cellOffset;
665   unsigned char *data;
666 
667   data = pPage->aData;
668   assert( sqlite3pager_iswriteable(data) );
669   assert( pPage->pBt );
670   if( nByte<4 ) nByte = 4;
671   if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
672   pPage->nFree -= nByte;
673   hdr = pPage->hdrOffset;
674 
675   nFrag = data[hdr+7];
676   if( nFrag<60 ){
677     /* Search the freelist looking for a slot big enough to satisfy the
678     ** space request. */
679     addr = hdr+1;
680     while( (pc = get2byte(&data[addr]))>0 ){
681       size = get2byte(&data[pc+2]);
682       if( size>=nByte ){
683         if( size<nByte+4 ){
684           memcpy(&data[addr], &data[pc], 2);
685           data[hdr+7] = nFrag + size - nByte;
686           return pc;
687         }else{
688           put2byte(&data[pc+2], size-nByte);
689           return pc + size - nByte;
690         }
691       }
692       addr = pc;
693     }
694   }
695 
696   /* Allocate memory from the gap in between the cell pointer array
697   ** and the cell content area.
698   */
699   top = get2byte(&data[hdr+5]);
700   nCell = get2byte(&data[hdr+3]);
701   cellOffset = pPage->cellOffset;
702   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
703     defragmentPage(pPage);
704     top = get2byte(&data[hdr+5]);
705   }
706   top -= nByte;
707   assert( cellOffset + 2*nCell <= top );
708   put2byte(&data[hdr+5], top);
709   return top;
710 }
711 
712 /*
713 ** Return a section of the pPage->aData to the freelist.
714 ** The first byte of the new free block is pPage->aDisk[start]
715 ** and the size of the block is "size" bytes.
716 **
717 ** Most of the effort here is involved in coalesing adjacent
718 ** free blocks into a single big free block.
719 */
720 static void freeSpace(MemPage *pPage, int start, int size){
721   int end = start + size;  /* End of the segment being freed */
722   int addr, pbegin, hdr;
723   unsigned char *data = pPage->aData;
724 
725   assert( pPage->pBt!=0 );
726   assert( sqlite3pager_iswriteable(data) );
727   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
728   assert( end<=pPage->pBt->usableSize );
729   if( size<4 ) size = 4;
730 
731   /* Add the space back into the linked list of freeblocks */
732   hdr = pPage->hdrOffset;
733   addr = hdr + 1;
734   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
735     assert( pbegin<=pPage->pBt->usableSize-4 );
736     assert( pbegin>addr );
737     addr = pbegin;
738   }
739   assert( pbegin<=pPage->pBt->usableSize-4 );
740   assert( pbegin>addr || pbegin==0 );
741   put2byte(&data[addr], start);
742   put2byte(&data[start], pbegin);
743   put2byte(&data[start+2], size);
744   pPage->nFree += size;
745 
746   /* Coalesce adjacent free blocks */
747   addr = pPage->hdrOffset + 1;
748   while( (pbegin = get2byte(&data[addr]))>0 ){
749     int pnext, psize;
750     assert( pbegin>addr );
751     assert( pbegin<=pPage->pBt->usableSize-4 );
752     pnext = get2byte(&data[pbegin]);
753     psize = get2byte(&data[pbegin+2]);
754     if( pbegin + psize + 3 >= pnext && pnext>0 ){
755       int frag = pnext - (pbegin+psize);
756       assert( frag<=data[pPage->hdrOffset+7] );
757       data[pPage->hdrOffset+7] -= frag;
758       put2byte(&data[pbegin], get2byte(&data[pnext]));
759       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
760     }else{
761       addr = pbegin;
762     }
763   }
764 
765   /* If the cell content area begins with a freeblock, remove it. */
766   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
767     int top;
768     pbegin = get2byte(&data[hdr+1]);
769     memcpy(&data[hdr+1], &data[pbegin], 2);
770     top = get2byte(&data[hdr+5]);
771     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
772   }
773 }
774 
775 /*
776 ** Decode the flags byte (the first byte of the header) for a page
777 ** and initialize fields of the MemPage structure accordingly.
778 */
779 static void decodeFlags(MemPage *pPage, int flagByte){
780   Btree *pBt;     /* A copy of pPage->pBt */
781 
782   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
783   pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
784   pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
785   pPage->leaf = (flagByte & PTF_LEAF)!=0;
786   pPage->childPtrSize = 4*(pPage->leaf==0);
787   pBt = pPage->pBt;
788   if( flagByte & PTF_LEAFDATA ){
789     pPage->leafData = 1;
790     pPage->maxLocal = pBt->maxLeaf;
791     pPage->minLocal = pBt->minLeaf;
792   }else{
793     pPage->leafData = 0;
794     pPage->maxLocal = pBt->maxLocal;
795     pPage->minLocal = pBt->minLocal;
796   }
797   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
798 }
799 
800 /*
801 ** Initialize the auxiliary information for a disk block.
802 **
803 ** The pParent parameter must be a pointer to the MemPage which
804 ** is the parent of the page being initialized.  The root of a
805 ** BTree has no parent and so for that page, pParent==NULL.
806 **
807 ** Return SQLITE_OK on success.  If we see that the page does
808 ** not contain a well-formed database page, then return
809 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
810 ** guarantee that the page is well-formed.  It only shows that
811 ** we failed to detect any corruption.
812 */
813 static int initPage(
814   MemPage *pPage,        /* The page to be initialized */
815   MemPage *pParent       /* The parent.  Might be NULL */
816 ){
817   int pc;            /* Address of a freeblock within pPage->aData[] */
818   int i;             /* Loop counter */
819   int hdr;           /* Offset to beginning of page header */
820   u8 *data;          /* Equal to pPage->aData */
821   int usableSize;    /* Amount of usable space on each page */
822   int cellOffset;    /* Offset from start of page to first cell pointer */
823   int nFree;         /* Number of unused bytes on the page */
824   int top;           /* First byte of the cell content area */
825 
826   assert( pPage->pBt!=0 );
827   assert( pParent==0 || pParent->pBt==pPage->pBt );
828   assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
829   assert( pPage->aData == &((unsigned char*)pPage)[-pPage->pBt->pageSize] );
830   assert( pPage->pParent==0 || pPage->pParent==pParent );
831   assert( pPage->pParent==pParent || !pPage->isInit );
832   if( pPage->isInit ) return SQLITE_OK;
833   if( pPage->pParent==0 && pParent!=0 ){
834     pPage->pParent = pParent;
835     sqlite3pager_ref(pParent->aData);
836   }
837   hdr = pPage->hdrOffset;
838   data = pPage->aData;
839   decodeFlags(pPage, data[hdr]);
840   pPage->nOverflow = 0;
841   pPage->idxShift = 0;
842   usableSize = pPage->pBt->usableSize;
843   pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
844   top = get2byte(&data[hdr+5]);
845   pPage->nCell = get2byte(&data[hdr+3]);
846 
847   /* Compute the total free space on the page */
848   pc = get2byte(&data[hdr+1]);
849   nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
850   i = 0;
851   while( pc>0 ){
852     int next, size;
853     if( pc>=usableSize ) return SQLITE_CORRUPT;
854     if( i++>MX_PAGE_SIZE ) return SQLITE_CORRUPT;
855     next = get2byte(&data[pc]);
856     size = get2byte(&data[pc+2]);
857     if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
858     nFree += size;
859     pc = next;
860   }
861   pPage->nFree = nFree;
862   if( nFree>=usableSize ) return SQLITE_CORRUPT;
863 
864   pPage->isInit = 1;
865   pageIntegrity(pPage);
866   return SQLITE_OK;
867 }
868 
869 /*
870 ** Set up a raw page so that it looks like a database page holding
871 ** no entries.
872 */
873 static void zeroPage(MemPage *pPage, int flags){
874   unsigned char *data = pPage->aData;
875   Btree *pBt = pPage->pBt;
876   int hdr = pPage->hdrOffset;
877   int first;
878 
879   assert( sqlite3pager_pagenumber(data)==pPage->pgno );
880   assert( &data[pBt->pageSize] == (unsigned char*)pPage );
881   assert( sqlite3pager_iswriteable(data) );
882   memset(&data[hdr], 0, pBt->usableSize - hdr);
883   data[hdr] = flags;
884   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
885   memset(&data[hdr+1], 0, 4);
886   data[hdr+7] = 0;
887   put2byte(&data[hdr+5], pBt->usableSize);
888   pPage->nFree = pBt->usableSize - first;
889   decodeFlags(pPage, flags);
890   pPage->hdrOffset = hdr;
891   pPage->cellOffset = first;
892   pPage->nOverflow = 0;
893   pPage->idxShift = 0;
894   pPage->nCell = 0;
895   pPage->isInit = 1;
896   pageIntegrity(pPage);
897 }
898 
899 /*
900 ** Get a page from the pager.  Initialize the MemPage.pBt and
901 ** MemPage.aData elements if needed.
902 */
903 static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
904   int rc;
905   unsigned char *aData;
906   MemPage *pPage;
907   rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
908   if( rc ) return rc;
909   pPage = (MemPage*)&aData[pBt->pageSize];
910   pPage->aData = aData;
911   pPage->pBt = pBt;
912   pPage->pgno = pgno;
913   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
914   *ppPage = pPage;
915   return SQLITE_OK;
916 }
917 
918 /*
919 ** Get a page from the pager and initialize it.  This routine
920 ** is just a convenience wrapper around separate calls to
921 ** getPage() and initPage().
922 */
923 static int getAndInitPage(
924   Btree *pBt,          /* The database file */
925   Pgno pgno,           /* Number of the page to get */
926   MemPage **ppPage,    /* Write the page pointer here */
927   MemPage *pParent     /* Parent of the page */
928 ){
929   int rc;
930   rc = getPage(pBt, pgno, ppPage);
931   if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
932     rc = initPage(*ppPage, pParent);
933   }
934   return rc;
935 }
936 
937 /*
938 ** Release a MemPage.  This should be called once for each prior
939 ** call to getPage.
940 */
941 static void releasePage(MemPage *pPage){
942   if( pPage ){
943     assert( pPage->aData );
944     assert( pPage->pBt );
945     assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
946     sqlite3pager_unref(pPage->aData);
947   }
948 }
949 
950 /*
951 ** This routine is called when the reference count for a page
952 ** reaches zero.  We need to unref the pParent pointer when that
953 ** happens.
954 */
955 static void pageDestructor(void *pData, int pageSize){
956   MemPage *pPage = (MemPage*)&((char*)pData)[pageSize];
957   if( pPage->pParent ){
958     MemPage *pParent = pPage->pParent;
959     pPage->pParent = 0;
960     releasePage(pParent);
961   }
962   pPage->isInit = 0;
963 }
964 
965 /*
966 ** During a rollback, when the pager reloads information into the cache
967 ** so that the cache is restored to its original state at the start of
968 ** the transaction, for each page restored this routine is called.
969 **
970 ** This routine needs to reset the extra data section at the end of the
971 ** page to agree with the restored data.
972 */
973 static void pageReinit(void *pData, int pageSize){
974   MemPage *pPage = (MemPage*)&((char*)pData)[pageSize];
975   if( pPage->isInit ){
976     pPage->isInit = 0;
977     initPage(pPage, pPage->pParent);
978   }
979 }
980 
981 /*
982 ** Open a new database.
983 **
984 ** Actually, this routine just sets up the internal data structures
985 ** for accessing the database.  We do not open the database file
986 ** until the first page is loaded.
987 **
988 ** zFilename is the name of the database file.  If zFilename is NULL
989 ** a new database with a random name is created.  This randomly named
990 ** database file will be deleted when sqlite3BtreeClose() is called.
991 */
992 int sqlite3BtreeOpen(
993   const char *zFilename,  /* Name of the file containing the BTree database */
994   Btree **ppBtree,        /* Pointer to new Btree object written here */
995   int nCache,             /* Number of cache pages */
996   int flags,              /* Options */
997   void *pBusyHandler      /* Busy callback info passed to pager layer */
998 ){
999   Btree *pBt;
1000   int rc;
1001 
1002   /*
1003   ** The following asserts make sure that structures used by the btree are
1004   ** the right size.  This is to guard against size changes that result
1005   ** when compiling on a different architecture.
1006   */
1007   assert( sizeof(i64)==8 );
1008   assert( sizeof(u64)==8 );
1009   assert( sizeof(u32)==4 );
1010   assert( sizeof(u16)==2 );
1011   assert( sizeof(Pgno)==4 );
1012   assert( sizeof(ptr)==sizeof(char*) );
1013   assert( sizeof(uptr)==sizeof(ptr) );
1014 
1015   pBt = sqliteMalloc( sizeof(*pBt) );
1016   if( pBt==0 ){
1017     *ppBtree = 0;
1018     return SQLITE_NOMEM;
1019   }
1020   if( nCache<10 ) nCache = 10;
1021   rc = sqlite3pager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
1022                         (flags & BTREE_OMIT_JOURNAL)==0, pBusyHandler);
1023   if( rc!=SQLITE_OK ){
1024     if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
1025     sqliteFree(pBt);
1026     *ppBtree = 0;
1027     return rc;
1028   }
1029   sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
1030   sqlite3pager_set_reiniter(pBt->pPager, pageReinit);
1031   pBt->pCursor = 0;
1032   pBt->pPage1 = 0;
1033   pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
1034   pBt->pageSize = SQLITE_PAGE_SIZE;  /* FIX ME - read from header */
1035   pBt->usableSize = pBt->pageSize;
1036   pBt->maxEmbedFrac = 64;            /* FIX ME - read from header */
1037   pBt->minEmbedFrac = 32;            /* FIX ME - read from header */
1038   pBt->minLeafFrac = 32;             /* FIX ME - read from header */
1039 
1040   *ppBtree = pBt;
1041   return SQLITE_OK;
1042 }
1043 
1044 /*
1045 ** Close an open database and invalidate all cursors.
1046 */
1047 int sqlite3BtreeClose(Btree *pBt){
1048   while( pBt->pCursor ){
1049     sqlite3BtreeCloseCursor(pBt->pCursor);
1050   }
1051   sqlite3pager_close(pBt->pPager);
1052   sqliteFree(pBt);
1053   return SQLITE_OK;
1054 }
1055 
1056 /*
1057 ** Change the limit on the number of pages allowed in the cache.
1058 **
1059 ** The maximum number of cache pages is set to the absolute
1060 ** value of mxPage.  If mxPage is negative, the pager will
1061 ** operate asynchronously - it will not stop to do fsync()s
1062 ** to insure data is written to the disk surface before
1063 ** continuing.  Transactions still work if synchronous is off,
1064 ** and the database cannot be corrupted if this program
1065 ** crashes.  But if the operating system crashes or there is
1066 ** an abrupt power failure when synchronous is off, the database
1067 ** could be left in an inconsistent and unrecoverable state.
1068 ** Synchronous is on by default so database corruption is not
1069 ** normally a worry.
1070 */
1071 int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
1072   sqlite3pager_set_cachesize(pBt->pPager, mxPage);
1073   return SQLITE_OK;
1074 }
1075 
1076 /*
1077 ** Change the way data is synced to disk in order to increase or decrease
1078 ** how well the database resists damage due to OS crashes and power
1079 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
1080 ** there is a high probability of damage)  Level 2 is the default.  There
1081 ** is a very low but non-zero probability of damage.  Level 3 reduces the
1082 ** probability of damage to near zero but with a write performance reduction.
1083 */
1084 int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
1085   sqlite3pager_set_safety_level(pBt->pPager, level);
1086   return SQLITE_OK;
1087 }
1088 
1089 /*
1090 ** Get a reference to pPage1 of the database file.  This will
1091 ** also acquire a readlock on that file.
1092 **
1093 ** SQLITE_OK is returned on success.  If the file is not a
1094 ** well-formed database file, then SQLITE_CORRUPT is returned.
1095 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
1096 ** is returned if we run out of memory.  SQLITE_PROTOCOL is returned
1097 ** if there is a locking protocol violation.
1098 */
1099 static int lockBtree(Btree *pBt){
1100   int rc;
1101   MemPage *pPage1;
1102   if( pBt->pPage1 ) return SQLITE_OK;
1103   rc = getPage(pBt, 1, &pPage1);
1104   if( rc!=SQLITE_OK ) return rc;
1105 
1106 
1107   /* Do some checking to help insure the file we opened really is
1108   ** a valid database file.
1109   */
1110   rc = SQLITE_NOTADB;
1111   if( sqlite3pager_pagecount(pBt->pPager)>0 ){
1112     u8 *page1 = pPage1->aData;
1113     if( memcmp(page1, zMagicHeader, 16)!=0 ){
1114       goto page1_init_failed;
1115     }
1116     if( page1[18]>1 || page1[19]>1 ){
1117       goto page1_init_failed;
1118     }
1119     pBt->pageSize = get2byte(&page1[16]);
1120     pBt->usableSize = pBt->pageSize - page1[20];
1121     if( pBt->usableSize<500 ){
1122       goto page1_init_failed;
1123     }
1124     pBt->maxEmbedFrac = page1[21];
1125     pBt->minEmbedFrac = page1[22];
1126     pBt->minLeafFrac = page1[23];
1127   }
1128 
1129   /* maxLocal is the maximum amount of payload to store locally for
1130   ** a cell.  Make sure it is small enough so that at least minFanout
1131   ** cells can will fit on one page.  We assume a 10-byte page header.
1132   ** Besides the payload, the cell must store:
1133   **     2-byte pointer to the cell
1134   **     4-byte child pointer
1135   **     9-byte nKey value
1136   **     4-byte nData value
1137   **     4-byte overflow page pointer
1138   ** So a cell consists of a 2-byte poiner, a header which is as much as
1139   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1140   ** page pointer.
1141   */
1142   pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1143   pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1144   pBt->maxLeaf = pBt->usableSize - 35;
1145   pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1146   if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1147     goto page1_init_failed;
1148   }
1149   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE );
1150   pBt->pPage1 = pPage1;
1151   return SQLITE_OK;
1152 
1153 page1_init_failed:
1154   releasePage(pPage1);
1155   pBt->pPage1 = 0;
1156   return rc;
1157 }
1158 
1159 /*
1160 ** If there are no outstanding cursors and we are not in the middle
1161 ** of a transaction but there is a read lock on the database, then
1162 ** this routine unrefs the first page of the database file which
1163 ** has the effect of releasing the read lock.
1164 **
1165 ** If there are any outstanding cursors, this routine is a no-op.
1166 **
1167 ** If there is a transaction in progress, this routine is a no-op.
1168 */
1169 static void unlockBtreeIfUnused(Btree *pBt){
1170   if( pBt->inTrans==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1171     if( pBt->pPage1->aData==0 ){
1172       MemPage *pPage = pBt->pPage1;
1173       pPage->aData = &((char*)pPage)[-pBt->pageSize];
1174       pPage->pBt = pBt;
1175       pPage->pgno = 1;
1176     }
1177     releasePage(pBt->pPage1);
1178     pBt->pPage1 = 0;
1179     pBt->inStmt = 0;
1180   }
1181 }
1182 
1183 /*
1184 ** Create a new database by initializing the first page of the
1185 ** file.
1186 */
1187 static int newDatabase(Btree *pBt){
1188   MemPage *pP1;
1189   unsigned char *data;
1190   int rc;
1191   if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
1192   pP1 = pBt->pPage1;
1193   assert( pP1!=0 );
1194   data = pP1->aData;
1195   rc = sqlite3pager_write(data);
1196   if( rc ) return rc;
1197   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1198   assert( sizeof(zMagicHeader)==16 );
1199   put2byte(&data[16], pBt->pageSize);
1200   data[18] = 1;
1201   data[19] = 1;
1202   data[20] = pBt->pageSize - pBt->usableSize;
1203   data[21] = pBt->maxEmbedFrac;
1204   data[22] = pBt->minEmbedFrac;
1205   data[23] = pBt->minLeafFrac;
1206   memset(&data[24], 0, 100-24);
1207   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1208   return SQLITE_OK;
1209 }
1210 
1211 /*
1212 ** Attempt to start a new transaction. A write-transaction
1213 ** is started if the second argument is true, otherwise a read-
1214 ** transaction.
1215 **
1216 ** A write-transaction must be started before attempting any
1217 ** changes to the database.  None of the following routines
1218 ** will work unless a transaction is started first:
1219 **
1220 **      sqlite3BtreeCreateTable()
1221 **      sqlite3BtreeCreateIndex()
1222 **      sqlite3BtreeClearTable()
1223 **      sqlite3BtreeDropTable()
1224 **      sqlite3BtreeInsert()
1225 **      sqlite3BtreeDelete()
1226 **      sqlite3BtreeUpdateMeta()
1227 **
1228 ** If wrflag is true, then nMaster specifies the maximum length of
1229 ** a master journal file name supplied later via sqlite3BtreeSync().
1230 ** This is so that appropriate space can be allocated in the journal file
1231 ** when it is created..
1232 */
1233 int sqlite3BtreeBeginTrans(Btree *pBt, int wrflag, int nMaster){
1234   int rc = SQLITE_OK;
1235 
1236   /* If the btree is already in a write-transaction, or it
1237   ** is already in a read-transaction and a read-transaction
1238   ** is requested, this is a no-op.
1239   */
1240   if( pBt->inTrans==TRANS_WRITE ||
1241       (pBt->inTrans==TRANS_READ && !wrflag) ){
1242     return SQLITE_OK;
1243   }
1244   if( pBt->readOnly && wrflag ){
1245     return SQLITE_READONLY;
1246   }
1247 
1248   if( pBt->pPage1==0 ){
1249     rc = lockBtree(pBt);
1250   }
1251 
1252   if( rc==SQLITE_OK && wrflag ){
1253     rc = sqlite3pager_begin(pBt->pPage1->aData, nMaster);
1254     if( rc==SQLITE_OK ){
1255       rc = newDatabase(pBt);
1256     }
1257   }
1258 
1259   if( rc==SQLITE_OK ){
1260     pBt->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1261     if( wrflag ) pBt->inStmt = 0;
1262   }else{
1263     unlockBtreeIfUnused(pBt);
1264   }
1265   return rc;
1266 }
1267 
1268 /*
1269 ** Commit the transaction currently in progress.
1270 **
1271 ** This will release the write lock on the database file.  If there
1272 ** are no active cursors, it also releases the read lock.
1273 */
1274 int sqlite3BtreeCommit(Btree *pBt){
1275   int rc = SQLITE_OK;
1276   if( pBt->inTrans==TRANS_WRITE ){
1277     rc = sqlite3pager_commit(pBt->pPager);
1278   }
1279   pBt->inTrans = TRANS_NONE;
1280   pBt->inStmt = 0;
1281   unlockBtreeIfUnused(pBt);
1282   return rc;
1283 }
1284 
1285 /*
1286 ** Invalidate all cursors
1287 */
1288 static void invalidateCursors(Btree *pBt){
1289   BtCursor *pCur;
1290   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1291     MemPage *pPage = pCur->pPage;
1292     if( pPage /* && !pPage->isInit */ ){
1293       pageIntegrity(pPage);
1294       releasePage(pPage);
1295       pCur->pPage = 0;
1296       pCur->isValid = 0;
1297       pCur->status = SQLITE_ABORT;
1298     }
1299   }
1300 }
1301 
1302 #ifdef SQLITE_TEST
1303 /*
1304 ** Print debugging information about all cursors to standard output.
1305 */
1306 void sqlite3BtreeCursorList(Btree *pBt){
1307   BtCursor *pCur;
1308   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1309     MemPage *pPage = pCur->pPage;
1310     char *zMode = pCur->wrFlag ? "rw" : "ro";
1311     printf("CURSOR %08x rooted at %4d(%s) currently at %d.%d%s\n",
1312        (int)pCur, pCur->pgnoRoot, zMode,
1313        pPage ? pPage->pgno : 0, pCur->idx,
1314        pCur->isValid ? "" : " eof"
1315     );
1316   }
1317 }
1318 #endif
1319 
1320 /*
1321 ** Rollback the transaction in progress.  All cursors will be
1322 ** invalided by this operation.  Any attempt to use a cursor
1323 ** that was open at the beginning of this operation will result
1324 ** in an error.
1325 **
1326 ** This will release the write lock on the database file.  If there
1327 ** are no active cursors, it also releases the read lock.
1328 */
1329 int sqlite3BtreeRollback(Btree *pBt){
1330   int rc;
1331   MemPage *pPage1;
1332   if( pBt->inTrans==TRANS_WRITE ){
1333     rc = sqlite3pager_rollback(pBt->pPager);
1334     /* The rollback may have destroyed the pPage1->aData value.  So
1335     ** call getPage() on page 1 again to make sure pPage1->aData is
1336     ** set correctly. */
1337     if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
1338       releasePage(pPage1);
1339     }
1340     invalidateCursors(pBt);
1341   }
1342   pBt->inTrans = TRANS_NONE;
1343   pBt->inStmt = 0;
1344   unlockBtreeIfUnused(pBt);
1345   return rc;
1346 }
1347 
1348 /*
1349 ** Start a statement subtransaction.  The subtransaction can
1350 ** can be rolled back independently of the main transaction.
1351 ** You must start a transaction before starting a subtransaction.
1352 ** The subtransaction is ended automatically if the main transaction
1353 ** commits or rolls back.
1354 **
1355 ** Only one subtransaction may be active at a time.  It is an error to try
1356 ** to start a new subtransaction if another subtransaction is already active.
1357 **
1358 ** Statement subtransactions are used around individual SQL statements
1359 ** that are contained within a BEGIN...COMMIT block.  If a constraint
1360 ** error occurs within the statement, the effect of that one statement
1361 ** can be rolled back without having to rollback the entire transaction.
1362 */
1363 int sqlite3BtreeBeginStmt(Btree *pBt){
1364   int rc;
1365   if( (pBt->inTrans!=TRANS_WRITE) || pBt->inStmt ){
1366     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1367   }
1368   rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
1369   pBt->inStmt = 1;
1370   return rc;
1371 }
1372 
1373 
1374 /*
1375 ** Commit the statment subtransaction currently in progress.  If no
1376 ** subtransaction is active, this is a no-op.
1377 */
1378 int sqlite3BtreeCommitStmt(Btree *pBt){
1379   int rc;
1380   if( pBt->inStmt && !pBt->readOnly ){
1381     rc = sqlite3pager_stmt_commit(pBt->pPager);
1382   }else{
1383     rc = SQLITE_OK;
1384   }
1385   pBt->inStmt = 0;
1386   return rc;
1387 }
1388 
1389 /*
1390 ** Rollback the active statement subtransaction.  If no subtransaction
1391 ** is active this routine is a no-op.
1392 **
1393 ** All cursors will be invalidated by this operation.  Any attempt
1394 ** to use a cursor that was open at the beginning of this operation
1395 ** will result in an error.
1396 */
1397 int sqlite3BtreeRollbackStmt(Btree *pBt){
1398   int rc;
1399   if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
1400   rc = sqlite3pager_stmt_rollback(pBt->pPager);
1401   invalidateCursors(pBt);
1402   pBt->inStmt = 0;
1403   return rc;
1404 }
1405 
1406 /*
1407 ** Default key comparison function to be used if no comparison function
1408 ** is specified on the sqlite3BtreeCursor() call.
1409 */
1410 static int dfltCompare(
1411   void *NotUsed,             /* User data is not used */
1412   int n1, const void *p1,    /* First key to compare */
1413   int n2, const void *p2     /* Second key to compare */
1414 ){
1415   int c;
1416   c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1417   if( c==0 ){
1418     c = n1 - n2;
1419   }
1420   return c;
1421 }
1422 
1423 /*
1424 ** Create a new cursor for the BTree whose root is on the page
1425 ** iTable.  The act of acquiring a cursor gets a read lock on
1426 ** the database file.
1427 **
1428 ** If wrFlag==0, then the cursor can only be used for reading.
1429 ** If wrFlag==1, then the cursor can be used for reading or for
1430 ** writing if other conditions for writing are also met.  These
1431 ** are the conditions that must be met in order for writing to
1432 ** be allowed:
1433 **
1434 ** 1:  The cursor must have been opened with wrFlag==1
1435 **
1436 ** 2:  No other cursors may be open with wrFlag==0 on the same table
1437 **
1438 ** 3:  The database must be writable (not on read-only media)
1439 **
1440 ** 4:  There must be an active transaction.
1441 **
1442 ** Condition 2 warrants further discussion.  If any cursor is opened
1443 ** on a table with wrFlag==0, that prevents all other cursors from
1444 ** writing to that table.  This is a kind of "read-lock".  When a cursor
1445 ** is opened with wrFlag==0 it is guaranteed that the table will not
1446 ** change as long as the cursor is open.  This allows the cursor to
1447 ** do a sequential scan of the table without having to worry about
1448 ** entries being inserted or deleted during the scan.  Cursors should
1449 ** be opened with wrFlag==0 only if this read-lock property is needed.
1450 ** That is to say, cursors should be opened with wrFlag==0 only if they
1451 ** intend to use the sqlite3BtreeNext() system call.  All other cursors
1452 ** should be opened with wrFlag==1 even if they never really intend
1453 ** to write.
1454 **
1455 ** No checking is done to make sure that page iTable really is the
1456 ** root page of a b-tree.  If it is not, then the cursor acquired
1457 ** will not work correctly.
1458 **
1459 ** The comparison function must be logically the same for every cursor
1460 ** on a particular table.  Changing the comparison function will result
1461 ** in incorrect operations.  If the comparison function is NULL, a
1462 ** default comparison function is used.  The comparison function is
1463 ** always ignored for INTKEY tables.
1464 */
1465 int sqlite3BtreeCursor(
1466   Btree *pBt,                                 /* The btree */
1467   int iTable,                                 /* Root page of table to open */
1468   int wrFlag,                                 /* 1 to write. 0 read-only */
1469   int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1470   void *pArg,                                 /* First arg to xCompare() */
1471   BtCursor **ppCur                            /* Write new cursor here */
1472 ){
1473   int rc;
1474   BtCursor *pCur, *pRing;
1475 
1476   if( pBt->readOnly && wrFlag ){
1477     *ppCur = 0;
1478     return SQLITE_READONLY;
1479   }
1480   if( pBt->pPage1==0 ){
1481     rc = lockBtree(pBt);
1482     if( rc!=SQLITE_OK ){
1483       *ppCur = 0;
1484       return rc;
1485     }
1486   }
1487   pCur = sqliteMalloc( sizeof(*pCur) );
1488   if( pCur==0 ){
1489     rc = SQLITE_NOMEM;
1490     goto create_cursor_exception;
1491   }
1492   pCur->pgnoRoot = (Pgno)iTable;
1493   if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
1494     rc = SQLITE_EMPTY;
1495     goto create_cursor_exception;
1496   }
1497   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
1498   if( rc!=SQLITE_OK ){
1499     goto create_cursor_exception;
1500   }
1501   pCur->xCompare = xCmp ? xCmp : dfltCompare;
1502   pCur->pArg = pArg;
1503   pCur->pBt = pBt;
1504   pCur->wrFlag = wrFlag;
1505   pCur->idx = 0;
1506   pCur->info.nSize = 0;
1507   pCur->pNext = pBt->pCursor;
1508   if( pCur->pNext ){
1509     pCur->pNext->pPrev = pCur;
1510   }
1511   pCur->pPrev = 0;
1512   pRing = pBt->pCursor;
1513   while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1514   if( pRing ){
1515     pCur->pShared = pRing->pShared;
1516     pRing->pShared = pCur;
1517   }else{
1518     pCur->pShared = pCur;
1519   }
1520   pBt->pCursor = pCur;
1521   pCur->isValid = 0;
1522   pCur->status = SQLITE_OK;
1523   *ppCur = pCur;
1524   return SQLITE_OK;
1525 
1526 create_cursor_exception:
1527   *ppCur = 0;
1528   if( pCur ){
1529     releasePage(pCur->pPage);
1530     sqliteFree(pCur);
1531   }
1532   unlockBtreeIfUnused(pBt);
1533   return rc;
1534 }
1535 
1536 #if 0  /* Not Used */
1537 /*
1538 ** Change the value of the comparison function used by a cursor.
1539 */
1540 void sqlite3BtreeSetCompare(
1541   BtCursor *pCur,     /* The cursor to whose comparison function is changed */
1542   int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */
1543   void *pArg          /* First argument to xCmp() */
1544 ){
1545   pCur->xCompare = xCmp ? xCmp : dfltCompare;
1546   pCur->pArg = pArg;
1547 }
1548 #endif
1549 
1550 /*
1551 ** Close a cursor.  The read lock on the database file is released
1552 ** when the last cursor is closed.
1553 */
1554 int sqlite3BtreeCloseCursor(BtCursor *pCur){
1555   Btree *pBt = pCur->pBt;
1556   if( pCur->pPrev ){
1557     pCur->pPrev->pNext = pCur->pNext;
1558   }else{
1559     pBt->pCursor = pCur->pNext;
1560   }
1561   if( pCur->pNext ){
1562     pCur->pNext->pPrev = pCur->pPrev;
1563   }
1564   releasePage(pCur->pPage);
1565   if( pCur->pShared!=pCur ){
1566     BtCursor *pRing = pCur->pShared;
1567     while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1568     pRing->pShared = pCur->pShared;
1569   }
1570   unlockBtreeIfUnused(pBt);
1571   sqliteFree(pCur);
1572   return SQLITE_OK;
1573 }
1574 
1575 /*
1576 ** Make a temporary cursor by filling in the fields of pTempCur.
1577 ** The temporary cursor is not on the cursor list for the Btree.
1578 */
1579 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1580   memcpy(pTempCur, pCur, sizeof(*pCur));
1581   pTempCur->pNext = 0;
1582   pTempCur->pPrev = 0;
1583   if( pTempCur->pPage ){
1584     sqlite3pager_ref(pTempCur->pPage->aData);
1585   }
1586 }
1587 
1588 /*
1589 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
1590 ** function above.
1591 */
1592 static void releaseTempCursor(BtCursor *pCur){
1593   if( pCur->pPage ){
1594     sqlite3pager_unref(pCur->pPage->aData);
1595   }
1596 }
1597 
1598 /*
1599 ** Make sure the BtCursor.info field of the given cursor is valid.
1600 ** If it is not already valid, call parseCell() to fill it in.
1601 **
1602 ** BtCursor.info is a cache of the information in the current cell.
1603 ** Using this cache reduces the number of calls to parseCell().
1604 */
1605 static void getCellInfo(BtCursor *pCur){
1606   if( pCur->info.nSize==0 ){
1607     parseCell(pCur->pPage, pCur->idx, &pCur->info);
1608   }else{
1609 #ifndef NDEBUG
1610     CellInfo info;
1611     memset(&info, 0, sizeof(info));
1612     parseCell(pCur->pPage, pCur->idx, &info);
1613     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
1614 #endif
1615   }
1616 }
1617 
1618 /*
1619 ** Set *pSize to the size of the buffer needed to hold the value of
1620 ** the key for the current entry.  If the cursor is not pointing
1621 ** to a valid entry, *pSize is set to 0.
1622 **
1623 ** For a table with the INTKEY flag set, this routine returns the key
1624 ** itself, not the number of bytes in the key.
1625 */
1626 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
1627   if( !pCur->isValid ){
1628     *pSize = 0;
1629   }else{
1630     getCellInfo(pCur);
1631     *pSize = pCur->info.nKey;
1632   }
1633   return SQLITE_OK;
1634 }
1635 
1636 /*
1637 ** Set *pSize to the number of bytes of data in the entry the
1638 ** cursor currently points to.  Always return SQLITE_OK.
1639 ** Failure is not possible.  If the cursor is not currently
1640 ** pointing to an entry (which can happen, for example, if
1641 ** the database is empty) then *pSize is set to 0.
1642 */
1643 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
1644   if( !pCur->isValid ){
1645     /* Not pointing at a valid entry - set *pSize to 0. */
1646     *pSize = 0;
1647   }else{
1648     getCellInfo(pCur);
1649     *pSize = pCur->info.nData;
1650   }
1651   return SQLITE_OK;
1652 }
1653 
1654 /*
1655 ** Read payload information from the entry that the pCur cursor is
1656 ** pointing to.  Begin reading the payload at "offset" and read
1657 ** a total of "amt" bytes.  Put the result in zBuf.
1658 **
1659 ** This routine does not make a distinction between key and data.
1660 ** It just reads bytes from the payload area.  Data might appear
1661 ** on the main page or be scattered out on multiple overflow pages.
1662 */
1663 static int getPayload(
1664   BtCursor *pCur,      /* Cursor pointing to entry to read from */
1665   int offset,          /* Begin reading this far into payload */
1666   int amt,             /* Read this many bytes */
1667   unsigned char *pBuf, /* Write the bytes into this buffer */
1668   int skipKey          /* offset begins at data if this is true */
1669 ){
1670   unsigned char *aPayload;
1671   Pgno nextPage;
1672   int rc;
1673   MemPage *pPage;
1674   Btree *pBt;
1675   int ovflSize;
1676   u32 nKey;
1677 
1678   assert( pCur!=0 && pCur->pPage!=0 );
1679   assert( pCur->isValid );
1680   pBt = pCur->pBt;
1681   pPage = pCur->pPage;
1682   pageIntegrity(pPage);
1683   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1684   getCellInfo(pCur);
1685   aPayload = pCur->info.pCell;
1686   aPayload += pCur->info.nHeader;
1687   if( pPage->intKey ){
1688     nKey = 0;
1689   }else{
1690     nKey = pCur->info.nKey;
1691   }
1692   assert( offset>=0 );
1693   if( skipKey ){
1694     offset += nKey;
1695   }
1696   if( offset+amt > nKey+pCur->info.nData ){
1697     return SQLITE_ERROR;
1698   }
1699   if( offset<pCur->info.nLocal ){
1700     int a = amt;
1701     if( a+offset>pCur->info.nLocal ){
1702       a = pCur->info.nLocal - offset;
1703     }
1704     memcpy(pBuf, &aPayload[offset], a);
1705     if( a==amt ){
1706       return SQLITE_OK;
1707     }
1708     offset = 0;
1709     pBuf += a;
1710     amt -= a;
1711   }else{
1712     offset -= pCur->info.nLocal;
1713   }
1714   if( amt>0 ){
1715     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
1716   }
1717   ovflSize = pBt->usableSize - 4;
1718   while( amt>0 && nextPage ){
1719     rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
1720     if( rc!=0 ){
1721       return rc;
1722     }
1723     nextPage = get4byte(aPayload);
1724     if( offset<ovflSize ){
1725       int a = amt;
1726       if( a + offset > ovflSize ){
1727         a = ovflSize - offset;
1728       }
1729       memcpy(pBuf, &aPayload[offset+4], a);
1730       offset = 0;
1731       amt -= a;
1732       pBuf += a;
1733     }else{
1734       offset -= ovflSize;
1735     }
1736     sqlite3pager_unref(aPayload);
1737   }
1738   if( amt>0 ){
1739     return SQLITE_CORRUPT;
1740   }
1741   return SQLITE_OK;
1742 }
1743 
1744 /*
1745 ** Read part of the key associated with cursor pCur.  Exactly
1746 ** "amt" bytes will be transfered into pBuf[].  The transfer
1747 ** begins at "offset".
1748 **
1749 ** Return SQLITE_OK on success or an error code if anything goes
1750 ** wrong.  An error is returned if "offset+amt" is larger than
1751 ** the available payload.
1752 */
1753 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1754   assert( amt>=0 );
1755   assert( offset>=0 );
1756   if( pCur->isValid==0 ){
1757     return pCur->status;
1758   }
1759   assert( pCur->pPage!=0 );
1760   assert( pCur->pPage->intKey==0 );
1761   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1762   return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1763 }
1764 
1765 /*
1766 ** Read part of the data associated with cursor pCur.  Exactly
1767 ** "amt" bytes will be transfered into pBuf[].  The transfer
1768 ** begins at "offset".
1769 **
1770 ** Return SQLITE_OK on success or an error code if anything goes
1771 ** wrong.  An error is returned if "offset+amt" is larger than
1772 ** the available payload.
1773 */
1774 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1775   if( !pCur->isValid ){
1776     return pCur->status ? pCur->status : SQLITE_INTERNAL;
1777   }
1778   assert( amt>=0 );
1779   assert( offset>=0 );
1780   assert( pCur->pPage!=0 );
1781   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1782   return getPayload(pCur, offset, amt, pBuf, 1);
1783 }
1784 
1785 /*
1786 ** Return a pointer to payload information from the entry that the
1787 ** pCur cursor is pointing to.  The pointer is to the beginning of
1788 ** the key if skipKey==0 and it points to the beginning of data if
1789 ** skipKey==1.  The number of bytes of available key/data is written
1790 ** into *pAmt.  If *pAmt==0, then the value returned will not be
1791 ** a valid pointer.
1792 **
1793 ** This routine is an optimization.  It is common for the entire key
1794 ** and data to fit on the local page and for there to be no overflow
1795 ** pages.  When that is so, this routine can be used to access the
1796 ** key and data without making a copy.  If the key and/or data spills
1797 ** onto overflow pages, then getPayload() must be used to reassembly
1798 ** the key/data and copy it into a preallocated buffer.
1799 **
1800 ** The pointer returned by this routine looks directly into the cached
1801 ** page of the database.  The data might change or move the next time
1802 ** any btree routine is called.
1803 */
1804 static const unsigned char *fetchPayload(
1805   BtCursor *pCur,      /* Cursor pointing to entry to read from */
1806   int *pAmt,           /* Write the number of available bytes here */
1807   int skipKey          /* read beginning at data if this is true */
1808 ){
1809   unsigned char *aPayload;
1810   MemPage *pPage;
1811   Btree *pBt;
1812   u32 nKey;
1813   int nLocal;
1814 
1815   assert( pCur!=0 && pCur->pPage!=0 );
1816   assert( pCur->isValid );
1817   pBt = pCur->pBt;
1818   pPage = pCur->pPage;
1819   pageIntegrity(pPage);
1820   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1821   getCellInfo(pCur);
1822   aPayload = pCur->info.pCell;
1823   aPayload += pCur->info.nHeader;
1824   if( pPage->intKey ){
1825     nKey = 0;
1826   }else{
1827     nKey = pCur->info.nKey;
1828   }
1829   if( skipKey ){
1830     aPayload += nKey;
1831     nLocal = pCur->info.nLocal - nKey;
1832   }else{
1833     nLocal = pCur->info.nLocal;
1834     if( nLocal>nKey ){
1835       nLocal = nKey;
1836     }
1837   }
1838   *pAmt = nLocal;
1839   return aPayload;
1840 }
1841 
1842 
1843 /*
1844 ** For the entry that cursor pCur is point to, return as
1845 ** many bytes of the key or data as are available on the local
1846 ** b-tree page.  Write the number of available bytes into *pAmt.
1847 **
1848 ** The pointer returned is ephemeral.  The key/data may move
1849 ** or be destroyed on the next call to any Btree routine.
1850 **
1851 ** These routines is used to get quick access to key and data
1852 ** in the common case where no overflow pages are used.
1853 */
1854 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
1855   return (const void*)fetchPayload(pCur, pAmt, 0);
1856 }
1857 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
1858   return (const void*)fetchPayload(pCur, pAmt, 1);
1859 }
1860 
1861 
1862 /*
1863 ** Move the cursor down to a new child page.  The newPgno argument is the
1864 ** page number of the child page to move to.
1865 */
1866 static int moveToChild(BtCursor *pCur, u32 newPgno){
1867   int rc;
1868   MemPage *pNewPage;
1869   MemPage *pOldPage;
1870   Btree *pBt = pCur->pBt;
1871 
1872   assert( pCur->isValid );
1873   rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
1874   if( rc ) return rc;
1875   pageIntegrity(pNewPage);
1876   pNewPage->idxParent = pCur->idx;
1877   pOldPage = pCur->pPage;
1878   pOldPage->idxShift = 0;
1879   releasePage(pOldPage);
1880   pCur->pPage = pNewPage;
1881   pCur->idx = 0;
1882   pCur->info.nSize = 0;
1883   if( pNewPage->nCell<1 ){
1884     return SQLITE_CORRUPT;
1885   }
1886   return SQLITE_OK;
1887 }
1888 
1889 /*
1890 ** Return true if the page is the virtual root of its table.
1891 **
1892 ** The virtual root page is the root page for most tables.  But
1893 ** for the table rooted on page 1, sometime the real root page
1894 ** is empty except for the right-pointer.  In such cases the
1895 ** virtual root page is the page that the right-pointer of page
1896 ** 1 is pointing to.
1897 */
1898 static int isRootPage(MemPage *pPage){
1899   MemPage *pParent = pPage->pParent;
1900   if( pParent==0 ) return 1;
1901   if( pParent->pgno>1 ) return 0;
1902   if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
1903   return 0;
1904 }
1905 
1906 /*
1907 ** Move the cursor up to the parent page.
1908 **
1909 ** pCur->idx is set to the cell index that contains the pointer
1910 ** to the page we are coming from.  If we are coming from the
1911 ** right-most child page then pCur->idx is set to one more than
1912 ** the largest cell index.
1913 */
1914 static void moveToParent(BtCursor *pCur){
1915   Pgno oldPgno;
1916   MemPage *pParent;
1917   MemPage *pPage;
1918   int idxParent;
1919 
1920   assert( pCur->isValid );
1921   pPage = pCur->pPage;
1922   assert( pPage!=0 );
1923   assert( !isRootPage(pPage) );
1924   pageIntegrity(pPage);
1925   pParent = pPage->pParent;
1926   assert( pParent!=0 );
1927   pageIntegrity(pParent);
1928   idxParent = pPage->idxParent;
1929   sqlite3pager_ref(pParent->aData);
1930   oldPgno = pPage->pgno;
1931   releasePage(pPage);
1932   pCur->pPage = pParent;
1933   pCur->info.nSize = 0;
1934   assert( pParent->idxShift==0 );
1935   pCur->idx = idxParent;
1936 }
1937 
1938 /*
1939 ** Move the cursor to the root page
1940 */
1941 static int moveToRoot(BtCursor *pCur){
1942   MemPage *pRoot;
1943   int rc;
1944   Btree *pBt = pCur->pBt;
1945 
1946   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
1947   if( rc ){
1948     pCur->isValid = 0;
1949     return rc;
1950   }
1951   releasePage(pCur->pPage);
1952   pageIntegrity(pRoot);
1953   pCur->pPage = pRoot;
1954   pCur->idx = 0;
1955   pCur->info.nSize = 0;
1956   if( pRoot->nCell==0 && !pRoot->leaf ){
1957     Pgno subpage;
1958     assert( pRoot->pgno==1 );
1959     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
1960     assert( subpage>0 );
1961     pCur->isValid = 1;
1962     rc = moveToChild(pCur, subpage);
1963   }
1964   pCur->isValid = pCur->pPage->nCell>0;
1965   return rc;
1966 }
1967 
1968 /*
1969 ** Move the cursor down to the left-most leaf entry beneath the
1970 ** entry to which it is currently pointing.
1971 */
1972 static int moveToLeftmost(BtCursor *pCur){
1973   Pgno pgno;
1974   int rc;
1975   MemPage *pPage;
1976 
1977   assert( pCur->isValid );
1978   while( !(pPage = pCur->pPage)->leaf ){
1979     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1980     pgno = get4byte(findCell(pPage, pCur->idx));
1981     rc = moveToChild(pCur, pgno);
1982     if( rc ) return rc;
1983   }
1984   return SQLITE_OK;
1985 }
1986 
1987 /*
1988 ** Move the cursor down to the right-most leaf entry beneath the
1989 ** page to which it is currently pointing.  Notice the difference
1990 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
1991 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1992 ** finds the right-most entry beneath the *page*.
1993 */
1994 static int moveToRightmost(BtCursor *pCur){
1995   Pgno pgno;
1996   int rc;
1997   MemPage *pPage;
1998 
1999   assert( pCur->isValid );
2000   while( !(pPage = pCur->pPage)->leaf ){
2001     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2002     pCur->idx = pPage->nCell;
2003     rc = moveToChild(pCur, pgno);
2004     if( rc ) return rc;
2005   }
2006   pCur->idx = pPage->nCell - 1;
2007   pCur->info.nSize = 0;
2008   return SQLITE_OK;
2009 }
2010 
2011 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
2012 ** on success.  Set *pRes to 0 if the cursor actually points to something
2013 ** or set *pRes to 1 if the table is empty.
2014 */
2015 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
2016   int rc;
2017   if( pCur->status ){
2018     return pCur->status;
2019   }
2020   rc = moveToRoot(pCur);
2021   if( rc ) return rc;
2022   if( pCur->isValid==0 ){
2023     assert( pCur->pPage->nCell==0 );
2024     *pRes = 1;
2025     return SQLITE_OK;
2026   }
2027   assert( pCur->pPage->nCell>0 );
2028   *pRes = 0;
2029   rc = moveToLeftmost(pCur);
2030   return rc;
2031 }
2032 
2033 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
2034 ** on success.  Set *pRes to 0 if the cursor actually points to something
2035 ** or set *pRes to 1 if the table is empty.
2036 */
2037 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
2038   int rc;
2039   if( pCur->status ){
2040     return pCur->status;
2041   }
2042   rc = moveToRoot(pCur);
2043   if( rc ) return rc;
2044   if( pCur->isValid==0 ){
2045     assert( pCur->pPage->nCell==0 );
2046     *pRes = 1;
2047     return SQLITE_OK;
2048   }
2049   assert( pCur->isValid );
2050   *pRes = 0;
2051   rc = moveToRightmost(pCur);
2052   return rc;
2053 }
2054 
2055 /* Move the cursor so that it points to an entry near pKey/nKey.
2056 ** Return a success code.
2057 **
2058 ** For INTKEY tables, only the nKey parameter is used.  pKey is
2059 ** ignored.  For other tables, nKey is the number of bytes of data
2060 ** in nKey.  The comparison function specified when the cursor was
2061 ** created is used to compare keys.
2062 **
2063 ** If an exact match is not found, then the cursor is always
2064 ** left pointing at a leaf page which would hold the entry if it
2065 ** were present.  The cursor might point to an entry that comes
2066 ** before or after the key.
2067 **
2068 ** The result of comparing the key with the entry to which the
2069 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
2070 ** this value is as follows:
2071 **
2072 **     *pRes<0      The cursor is left pointing at an entry that
2073 **                  is smaller than pKey or if the table is empty
2074 **                  and the cursor is therefore left point to nothing.
2075 **
2076 **     *pRes==0     The cursor is left pointing at an entry that
2077 **                  exactly matches pKey.
2078 **
2079 **     *pRes>0      The cursor is left pointing at an entry that
2080 **                  is larger than pKey.
2081 */
2082 int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){
2083   int rc;
2084 
2085   if( pCur->status ){
2086     return pCur->status;
2087   }
2088   rc = moveToRoot(pCur);
2089   if( rc ) return rc;
2090   assert( pCur->pPage );
2091   assert( pCur->pPage->isInit );
2092   if( pCur->isValid==0 ){
2093     *pRes = -1;
2094     assert( pCur->pPage->nCell==0 );
2095     return SQLITE_OK;
2096   }
2097   for(;;){
2098     int lwr, upr;
2099     Pgno chldPg;
2100     MemPage *pPage = pCur->pPage;
2101     int c = -1;  /* pRes return if table is empty must be -1 */
2102     lwr = 0;
2103     upr = pPage->nCell-1;
2104     pageIntegrity(pPage);
2105     while( lwr<=upr ){
2106       void *pCellKey;
2107       i64 nCellKey;
2108       pCur->idx = (lwr+upr)/2;
2109       pCur->info.nSize = 0;
2110       sqlite3BtreeKeySize(pCur, &nCellKey);
2111       if( pPage->intKey ){
2112         if( nCellKey<nKey ){
2113           c = -1;
2114         }else if( nCellKey>nKey ){
2115           c = +1;
2116         }else{
2117           c = 0;
2118         }
2119       }else{
2120         int available;
2121         pCellKey = (void *)fetchPayload(pCur, &available, 0);
2122         if( available>=nCellKey ){
2123           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2124         }else{
2125           pCellKey = sqliteMallocRaw( nCellKey );
2126           if( pCellKey==0 ) return SQLITE_NOMEM;
2127           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
2128           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2129           sqliteFree(pCellKey);
2130           if( rc ) return rc;
2131         }
2132       }
2133       if( c==0 ){
2134         if( pPage->leafData && !pPage->leaf ){
2135           lwr = pCur->idx;
2136           upr = lwr - 1;
2137           break;
2138         }else{
2139           if( pRes ) *pRes = 0;
2140           return SQLITE_OK;
2141         }
2142       }
2143       if( c<0 ){
2144         lwr = pCur->idx+1;
2145       }else{
2146         upr = pCur->idx-1;
2147       }
2148     }
2149     assert( lwr==upr+1 );
2150     assert( pPage->isInit );
2151     if( pPage->leaf ){
2152       chldPg = 0;
2153     }else if( lwr>=pPage->nCell ){
2154       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2155     }else{
2156       chldPg = get4byte(findCell(pPage, lwr));
2157     }
2158     if( chldPg==0 ){
2159       assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2160       if( pRes ) *pRes = c;
2161       return SQLITE_OK;
2162     }
2163     pCur->idx = lwr;
2164     pCur->info.nSize = 0;
2165     rc = moveToChild(pCur, chldPg);
2166     if( rc ){
2167       return rc;
2168     }
2169   }
2170   /* NOT REACHED */
2171 }
2172 
2173 /*
2174 ** Return TRUE if the cursor is not pointing at an entry of the table.
2175 **
2176 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
2177 ** past the last entry in the table or sqlite3BtreePrev() moves past
2178 ** the first entry.  TRUE is also returned if the table is empty.
2179 */
2180 int sqlite3BtreeEof(BtCursor *pCur){
2181   return pCur->isValid==0;
2182 }
2183 
2184 /*
2185 ** Advance the cursor to the next entry in the database.  If
2186 ** successful then set *pRes=0.  If the cursor
2187 ** was already pointing to the last entry in the database before
2188 ** this routine was called, then set *pRes=1.
2189 */
2190 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
2191   int rc;
2192   MemPage *pPage = pCur->pPage;
2193 
2194   assert( pRes!=0 );
2195   if( pCur->isValid==0 ){
2196     *pRes = 1;
2197     return SQLITE_OK;
2198   }
2199   assert( pPage->isInit );
2200   assert( pCur->idx<pPage->nCell );
2201   pCur->idx++;
2202   pCur->info.nSize = 0;
2203   if( pCur->idx>=pPage->nCell ){
2204     if( !pPage->leaf ){
2205       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
2206       if( rc ) return rc;
2207       rc = moveToLeftmost(pCur);
2208       *pRes = 0;
2209       return rc;
2210     }
2211     do{
2212       if( isRootPage(pPage) ){
2213         *pRes = 1;
2214         pCur->isValid = 0;
2215         return SQLITE_OK;
2216       }
2217       moveToParent(pCur);
2218       pPage = pCur->pPage;
2219     }while( pCur->idx>=pPage->nCell );
2220     *pRes = 0;
2221     if( pPage->leafData ){
2222       rc = sqlite3BtreeNext(pCur, pRes);
2223     }else{
2224       rc = SQLITE_OK;
2225     }
2226     return rc;
2227   }
2228   *pRes = 0;
2229   if( pPage->leaf ){
2230     return SQLITE_OK;
2231   }
2232   rc = moveToLeftmost(pCur);
2233   return rc;
2234 }
2235 
2236 /*
2237 ** Step the cursor to the back to the previous entry in the database.  If
2238 ** successful then set *pRes=0.  If the cursor
2239 ** was already pointing to the first entry in the database before
2240 ** this routine was called, then set *pRes=1.
2241 */
2242 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
2243   int rc;
2244   Pgno pgno;
2245   MemPage *pPage;
2246   if( pCur->isValid==0 ){
2247     *pRes = 1;
2248     return SQLITE_OK;
2249   }
2250   pPage = pCur->pPage;
2251   assert( pPage->isInit );
2252   assert( pCur->idx>=0 );
2253   if( !pPage->leaf ){
2254     pgno = get4byte( findCell(pPage, pCur->idx) );
2255     rc = moveToChild(pCur, pgno);
2256     if( rc ) return rc;
2257     rc = moveToRightmost(pCur);
2258   }else{
2259     while( pCur->idx==0 ){
2260       if( isRootPage(pPage) ){
2261         pCur->isValid = 0;
2262         *pRes = 1;
2263         return SQLITE_OK;
2264       }
2265       moveToParent(pCur);
2266       pPage = pCur->pPage;
2267     }
2268     pCur->idx--;
2269     pCur->info.nSize = 0;
2270     if( pPage->leafData ){
2271       rc = sqlite3BtreePrevious(pCur, pRes);
2272     }else{
2273       rc = SQLITE_OK;
2274     }
2275   }
2276   *pRes = 0;
2277   return rc;
2278 }
2279 
2280 /*
2281 ** The TRACE macro will print high-level status information about the
2282 ** btree operation when the global variable sqlite3_btree_trace is
2283 ** enabled.
2284 */
2285 #if SQLITE_TEST
2286 # define TRACE(X)   if( sqlite3_btree_trace )\
2287                         { sqlite3DebugPrintf X; fflush(stdout); }
2288 #else
2289 # define TRACE(X)
2290 #endif
2291 int sqlite3_btree_trace=0;  /* True to enable tracing */
2292 
2293 /*
2294 ** Allocate a new page from the database file.
2295 **
2296 ** The new page is marked as dirty.  (In other words, sqlite3pager_write()
2297 ** has already been called on the new page.)  The new page has also
2298 ** been referenced and the calling routine is responsible for calling
2299 ** sqlite3pager_unref() on the new page when it is done.
2300 **
2301 ** SQLITE_OK is returned on success.  Any other return value indicates
2302 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
2303 ** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
2304 **
2305 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2306 ** locate a page close to the page number "nearby".  This can be used in an
2307 ** attempt to keep related pages close to each other in the database file,
2308 ** which in turn can make database access faster.
2309 */
2310 static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
2311   MemPage *pPage1;
2312   int rc;
2313   int n;     /* Number of pages on the freelist */
2314   int k;     /* Number of leaves on the trunk of the freelist */
2315 
2316   pPage1 = pBt->pPage1;
2317   n = get4byte(&pPage1->aData[36]);
2318   if( n>0 ){
2319     /* There are pages on the freelist.  Reuse one of those pages. */
2320     MemPage *pTrunk;
2321     rc = sqlite3pager_write(pPage1->aData);
2322     if( rc ) return rc;
2323     put4byte(&pPage1->aData[36], n-1);
2324     rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
2325     if( rc ) return rc;
2326     rc = sqlite3pager_write(pTrunk->aData);
2327     if( rc ){
2328       releasePage(pTrunk);
2329       return rc;
2330     }
2331     k = get4byte(&pTrunk->aData[4]);
2332     if( k==0 ){
2333       /* The trunk has no leaves.  So extract the trunk page itself and
2334       ** use it as the newly allocated page */
2335       *pPgno = get4byte(&pPage1->aData[32]);
2336       memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
2337       *ppPage = pTrunk;
2338       TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
2339     }else{
2340       /* Extract a leaf from the trunk */
2341       int closest;
2342       unsigned char *aData = pTrunk->aData;
2343       if( nearby>0 ){
2344         int i, dist;
2345         closest = 0;
2346         dist = get4byte(&aData[8]) - nearby;
2347         if( dist<0 ) dist = -dist;
2348         for(i=1; i<k; i++){
2349           int d2 = get4byte(&aData[8+i*4]) - nearby;
2350           if( d2<0 ) d2 = -d2;
2351           if( d2<dist ) closest = i;
2352         }
2353       }else{
2354         closest = 0;
2355       }
2356       *pPgno = get4byte(&aData[8+closest*4]);
2357       TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d: %d more free pages\n",
2358              *pPgno, closest+1, k, pTrunk->pgno, n-1));
2359       if( closest<k-1 ){
2360         memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
2361       }
2362       put4byte(&aData[4], k-1);
2363       rc = getPage(pBt, *pPgno, ppPage);
2364       releasePage(pTrunk);
2365       if( rc==SQLITE_OK ){
2366         sqlite3pager_dont_rollback((*ppPage)->aData);
2367         rc = sqlite3pager_write((*ppPage)->aData);
2368       }
2369     }
2370   }else{
2371     /* There are no pages on the freelist, so create a new page at the
2372     ** end of the file */
2373     *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
2374     rc = getPage(pBt, *pPgno, ppPage);
2375     if( rc ) return rc;
2376     rc = sqlite3pager_write((*ppPage)->aData);
2377     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
2378   }
2379   return rc;
2380 }
2381 
2382 /*
2383 ** Add a page of the database file to the freelist.
2384 **
2385 ** sqlite3pager_unref() is NOT called for pPage.
2386 */
2387 static int freePage(MemPage *pPage){
2388   Btree *pBt = pPage->pBt;
2389   MemPage *pPage1 = pBt->pPage1;
2390   int rc, n, k;
2391 
2392   /* Prepare the page for freeing */
2393   assert( pPage->pgno>1 );
2394   pPage->isInit = 0;
2395   releasePage(pPage->pParent);
2396   pPage->pParent = 0;
2397 
2398   /* Increment the free page count on pPage1 */
2399   rc = sqlite3pager_write(pPage1->aData);
2400   if( rc ) return rc;
2401   n = get4byte(&pPage1->aData[36]);
2402   put4byte(&pPage1->aData[36], n+1);
2403 
2404   if( n==0 ){
2405     /* This is the first free page */
2406     rc = sqlite3pager_write(pPage->aData);
2407     if( rc ) return rc;
2408     memset(pPage->aData, 0, 8);
2409     put4byte(&pPage1->aData[32], pPage->pgno);
2410     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
2411   }else{
2412     /* Other free pages already exist.  Retrive the first trunk page
2413     ** of the freelist and find out how many leaves it has. */
2414     MemPage *pTrunk;
2415     rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
2416     if( rc ) return rc;
2417     k = get4byte(&pTrunk->aData[4]);
2418     if( k==pBt->usableSize/4 - 8 ){
2419       /* The trunk is full.  Turn the page being freed into a new
2420       ** trunk page with no leaves. */
2421       rc = sqlite3pager_write(pPage->aData);
2422       if( rc ) return rc;
2423       put4byte(pPage->aData, pTrunk->pgno);
2424       put4byte(&pPage->aData[4], 0);
2425       put4byte(&pPage1->aData[32], pPage->pgno);
2426       TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
2427               pPage->pgno, pTrunk->pgno));
2428     }else{
2429       /* Add the newly freed page as a leaf on the current trunk */
2430       rc = sqlite3pager_write(pTrunk->aData);
2431       if( rc ) return rc;
2432       put4byte(&pTrunk->aData[4], k+1);
2433       put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
2434       sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
2435       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
2436     }
2437     releasePage(pTrunk);
2438   }
2439   return rc;
2440 }
2441 
2442 /*
2443 ** Free any overflow pages associated with the given Cell.
2444 */
2445 static int clearCell(MemPage *pPage, unsigned char *pCell){
2446   Btree *pBt = pPage->pBt;
2447   CellInfo info;
2448   Pgno ovflPgno;
2449   int rc;
2450 
2451   parseCellPtr(pPage, pCell, &info);
2452   if( info.iOverflow==0 ){
2453     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
2454   }
2455   ovflPgno = get4byte(&pCell[info.iOverflow]);
2456   while( ovflPgno!=0 ){
2457     MemPage *pOvfl;
2458     rc = getPage(pBt, ovflPgno, &pOvfl);
2459     if( rc ) return rc;
2460     ovflPgno = get4byte(pOvfl->aData);
2461     rc = freePage(pOvfl);
2462     if( rc ) return rc;
2463     sqlite3pager_unref(pOvfl->aData);
2464   }
2465   return SQLITE_OK;
2466 }
2467 
2468 /*
2469 ** Create the byte sequence used to represent a cell on page pPage
2470 ** and write that byte sequence into pCell[].  Overflow pages are
2471 ** allocated and filled in as necessary.  The calling procedure
2472 ** is responsible for making sure sufficient space has been allocated
2473 ** for pCell[].
2474 **
2475 ** Note that pCell does not necessary need to point to the pPage->aData
2476 ** area.  pCell might point to some temporary storage.  The cell will
2477 ** be constructed in this temporary area then copied into pPage->aData
2478 ** later.
2479 */
2480 static int fillInCell(
2481   MemPage *pPage,                /* The page that contains the cell */
2482   unsigned char *pCell,          /* Complete text of the cell */
2483   const void *pKey, i64 nKey,    /* The key */
2484   const void *pData,int nData,   /* The data */
2485   int *pnSize                    /* Write cell size here */
2486 ){
2487   int nPayload;
2488   const u8 *pSrc;
2489   int nSrc, n, rc;
2490   int spaceLeft;
2491   MemPage *pOvfl = 0;
2492   MemPage *pToRelease = 0;
2493   unsigned char *pPrior;
2494   unsigned char *pPayload;
2495   Btree *pBt = pPage->pBt;
2496   Pgno pgnoOvfl = 0;
2497   int nHeader;
2498   CellInfo info;
2499 
2500   /* Fill in the header. */
2501   nHeader = 0;
2502   if( !pPage->leaf ){
2503     nHeader += 4;
2504   }
2505   if( pPage->hasData ){
2506     nHeader += putVarint(&pCell[nHeader], nData);
2507   }else{
2508     nData = 0;
2509   }
2510   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
2511   parseCellPtr(pPage, pCell, &info);
2512   assert( info.nHeader==nHeader );
2513   assert( info.nKey==nKey );
2514   assert( info.nData==nData );
2515 
2516   /* Fill in the payload */
2517   nPayload = nData;
2518   if( pPage->intKey ){
2519     pSrc = pData;
2520     nSrc = nData;
2521     nData = 0;
2522   }else{
2523     nPayload += nKey;
2524     pSrc = pKey;
2525     nSrc = nKey;
2526   }
2527   *pnSize = info.nSize;
2528   spaceLeft = info.nLocal;
2529   pPayload = &pCell[nHeader];
2530   pPrior = &pCell[info.iOverflow];
2531 
2532   while( nPayload>0 ){
2533     if( spaceLeft==0 ){
2534       rc =  allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
2535       if( rc ){
2536         releasePage(pToRelease);
2537         clearCell(pPage, pCell);
2538         return rc;
2539       }
2540       put4byte(pPrior, pgnoOvfl);
2541       releasePage(pToRelease);
2542       pToRelease = pOvfl;
2543       pPrior = pOvfl->aData;
2544       put4byte(pPrior, 0);
2545       pPayload = &pOvfl->aData[4];
2546       spaceLeft = pBt->usableSize - 4;
2547     }
2548     n = nPayload;
2549     if( n>spaceLeft ) n = spaceLeft;
2550     if( n>nSrc ) n = nSrc;
2551     memcpy(pPayload, pSrc, n);
2552     nPayload -= n;
2553     pPayload += n;
2554     pSrc += n;
2555     nSrc -= n;
2556     spaceLeft -= n;
2557     if( nSrc==0 ){
2558       nSrc = nData;
2559       pSrc = pData;
2560     }
2561   }
2562   releasePage(pToRelease);
2563   return SQLITE_OK;
2564 }
2565 
2566 /*
2567 ** Change the MemPage.pParent pointer on the page whose number is
2568 ** given in the second argument so that MemPage.pParent holds the
2569 ** pointer in the third argument.
2570 */
2571 static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
2572   MemPage *pThis;
2573   unsigned char *aData;
2574 
2575   if( pgno==0 ) return;
2576   assert( pBt->pPager!=0 );
2577   aData = sqlite3pager_lookup(pBt->pPager, pgno);
2578   if( aData ){
2579     pThis = (MemPage*)&aData[pBt->usableSize];
2580     if( pThis->isInit ){
2581       if( pThis->pParent!=pNewParent ){
2582         if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
2583         pThis->pParent = pNewParent;
2584         if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
2585       }
2586       pThis->idxParent = idx;
2587     }
2588     sqlite3pager_unref(aData);
2589   }
2590 }
2591 
2592 /*
2593 ** Change the pParent pointer of all children of pPage to point back
2594 ** to pPage.
2595 **
2596 ** In other words, for every child of pPage, invoke reparentPage()
2597 ** to make sure that each child knows that pPage is its parent.
2598 **
2599 ** This routine gets called after you memcpy() one page into
2600 ** another.
2601 */
2602 static void reparentChildPages(MemPage *pPage){
2603   int i;
2604   Btree *pBt;
2605 
2606   if( pPage->leaf ) return;
2607   pBt = pPage->pBt;
2608   for(i=0; i<pPage->nCell; i++){
2609     reparentPage(pBt, get4byte(findCell(pPage,i)), pPage, i);
2610   }
2611   reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]), pPage, i);
2612   pPage->idxShift = 0;
2613 }
2614 
2615 /*
2616 ** Remove the i-th cell from pPage.  This routine effects pPage only.
2617 ** The cell content is not freed or deallocated.  It is assumed that
2618 ** the cell content has been copied someplace else.  This routine just
2619 ** removes the reference to the cell from pPage.
2620 **
2621 ** "sz" must be the number of bytes in the cell.
2622 */
2623 static void dropCell(MemPage *pPage, int idx, int sz){
2624   int i;          /* Loop counter */
2625   int pc;         /* Offset to cell content of cell being deleted */
2626   u8 *data;       /* pPage->aData */
2627   u8 *ptr;        /* Used to move bytes around within data[] */
2628 
2629   assert( idx>=0 && idx<pPage->nCell );
2630   assert( sz==cellSize(pPage, idx) );
2631   assert( sqlite3pager_iswriteable(pPage->aData) );
2632   data = pPage->aData;
2633   ptr = &data[pPage->cellOffset + 2*idx];
2634   pc = get2byte(ptr);
2635   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
2636   freeSpace(pPage, pc, sz);
2637   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
2638     ptr[0] = ptr[2];
2639     ptr[1] = ptr[3];
2640   }
2641   pPage->nCell--;
2642   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
2643   pPage->nFree += 2;
2644   pPage->idxShift = 1;
2645 }
2646 
2647 /*
2648 ** Insert a new cell on pPage at cell index "i".  pCell points to the
2649 ** content of the cell.
2650 **
2651 ** If the cell content will fit on the page, then put it there.  If it
2652 ** will not fit, then make a copy of the cell content into pTemp if
2653 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
2654 ** in pPage->aOvfl[] and make it point to the cell content (either
2655 ** in pTemp or the original pCell) and also record its index.
2656 ** Allocating a new entry in pPage->aCell[] implies that
2657 ** pPage->nOverflow is incremented.
2658 */
2659 static void insertCell(
2660   MemPage *pPage,   /* Page into which we are copying */
2661   int i,            /* New cell becomes the i-th cell of the page */
2662   u8 *pCell,        /* Content of the new cell */
2663   int sz,           /* Bytes of content in pCell */
2664   u8 *pTemp         /* Temp storage space for pCell, if needed */
2665 ){
2666   int idx;          /* Where to write new cell content in data[] */
2667   int j;            /* Loop counter */
2668   int top;          /* First byte of content for any cell in data[] */
2669   int end;          /* First byte past the last cell pointer in data[] */
2670   int ins;          /* Index in data[] where new cell pointer is inserted */
2671   int hdr;          /* Offset into data[] of the page header */
2672   int cellOffset;   /* Address of first cell pointer in data[] */
2673   u8 *data;         /* The content of the whole page */
2674   u8 *ptr;          /* Used for moving information around in data[] */
2675 
2676   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
2677   assert( sz==cellSizePtr(pPage, pCell) );
2678   assert( sqlite3pager_iswriteable(pPage->aData) );
2679   if( pPage->nOverflow || sz+2>pPage->nFree ){
2680     if( pTemp ){
2681       memcpy(pTemp, pCell, sz);
2682       pCell = pTemp;
2683     }
2684     j = pPage->nOverflow++;
2685     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
2686     pPage->aOvfl[j].pCell = pCell;
2687     pPage->aOvfl[j].idx = i;
2688     pPage->nFree = 0;
2689   }else{
2690     data = pPage->aData;
2691     hdr = pPage->hdrOffset;
2692     top = get2byte(&data[hdr+5]);
2693     cellOffset = pPage->cellOffset;
2694     end = cellOffset + 2*pPage->nCell + 2;
2695     ins = cellOffset + 2*i;
2696     if( end > top - sz ){
2697       defragmentPage(pPage);
2698       top = get2byte(&data[hdr+5]);
2699       assert( end + sz <= top );
2700     }
2701     idx = allocateSpace(pPage, sz);
2702     assert( idx>0 );
2703     assert( end <= get2byte(&data[hdr+5]) );
2704     pPage->nCell++;
2705     pPage->nFree -= 2;
2706     memcpy(&data[idx], pCell, sz);
2707     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
2708       ptr[0] = ptr[-2];
2709       ptr[1] = ptr[-1];
2710     }
2711     put2byte(&data[ins], idx);
2712     put2byte(&data[hdr+3], pPage->nCell);
2713     pPage->idxShift = 1;
2714     pageIntegrity(pPage);
2715   }
2716 }
2717 
2718 /*
2719 ** Add a list of cells to a page.  The page should be initially empty.
2720 ** The cells are guaranteed to fit on the page.
2721 */
2722 static void assemblePage(
2723   MemPage *pPage,   /* The page to be assemblied */
2724   int nCell,        /* The number of cells to add to this page */
2725   u8 **apCell,      /* Pointers to cell bodies */
2726   int *aSize        /* Sizes of the cells */
2727 ){
2728   int i;            /* Loop counter */
2729   int totalSize;    /* Total size of all cells */
2730   int hdr;          /* Index of page header */
2731   int cellptr;      /* Address of next cell pointer */
2732   int cellbody;     /* Address of next cell body */
2733   u8 *data;         /* Data for the page */
2734 
2735   assert( pPage->nOverflow==0 );
2736   totalSize = 0;
2737   for(i=0; i<nCell; i++){
2738     totalSize += aSize[i];
2739   }
2740   assert( totalSize+2*nCell<=pPage->nFree );
2741   assert( pPage->nCell==0 );
2742   cellptr = pPage->cellOffset;
2743   data = pPage->aData;
2744   hdr = pPage->hdrOffset;
2745   put2byte(&data[hdr+3], nCell);
2746   cellbody = allocateSpace(pPage, totalSize);
2747   assert( cellbody>0 );
2748   assert( pPage->nFree >= 2*nCell );
2749   pPage->nFree -= 2*nCell;
2750   for(i=0; i<nCell; i++){
2751     put2byte(&data[cellptr], cellbody);
2752     memcpy(&data[cellbody], apCell[i], aSize[i]);
2753     cellptr += 2;
2754     cellbody += aSize[i];
2755   }
2756   assert( cellbody==pPage->pBt->usableSize );
2757   pPage->nCell = nCell;
2758 }
2759 
2760 /*
2761 ** GCC does not define the offsetof() macro so we'll have to do it
2762 ** ourselves.
2763 */
2764 #ifndef offsetof
2765 #define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
2766 #endif
2767 
2768 /*
2769 ** The following parameters determine how many adjacent pages get involved
2770 ** in a balancing operation.  NN is the number of neighbors on either side
2771 ** of the page that participate in the balancing operation.  NB is the
2772 ** total number of pages that participate, including the target page and
2773 ** NN neighbors on either side.
2774 **
2775 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
2776 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2777 ** in exchange for a larger degradation in INSERT and UPDATE performance.
2778 ** The value of NN appears to give the best results overall.
2779 */
2780 #define NN 1             /* Number of neighbors on either side of pPage */
2781 #define NB (NN*2+1)      /* Total pages involved in the balance */
2782 
2783 /* Forward reference */
2784 static int balance(MemPage*);
2785 
2786 /*
2787 ** This routine redistributes Cells on pPage and up to NN*2 siblings
2788 ** of pPage so that all pages have about the same amount of free space.
2789 ** Usually one sibling on either side of pPage is used in the balancing,
2790 ** though both siblings might come from one side if pPage is the first
2791 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
2792 ** (something which can only happen if pPage is the root page or a
2793 ** child of root) then all available siblings participate in the balancing.
2794 **
2795 ** The number of siblings of pPage might be increased or decreased by
2796 ** one in an effort to keep pages nearly full but not over full. The root page
2797 ** is special and is allowed to be nearly empty. If pPage is
2798 ** the root page, then the depth of the tree might be increased
2799 ** or decreased by one, as necessary, to keep the root page from being
2800 ** overfull or completely empty.
2801 **
2802 ** Note that when this routine is called, some of the Cells on pPage
2803 ** might not actually be stored in pPage->aData[].  This can happen
2804 ** if the page is overfull.  Part of the job of this routine is to
2805 ** make sure all Cells for pPage once again fit in pPage->aData[].
2806 **
2807 ** In the course of balancing the siblings of pPage, the parent of pPage
2808 ** might become overfull or underfull.  If that happens, then this routine
2809 ** is called recursively on the parent.
2810 **
2811 ** If this routine fails for any reason, it might leave the database
2812 ** in a corrupted state.  So if this routine fails, the database should
2813 ** be rolled back.
2814 */
2815 static int balance_nonroot(MemPage *pPage){
2816   MemPage *pParent;            /* The parent of pPage */
2817   Btree *pBt;                  /* The whole database */
2818   int nCell;                   /* Number of cells in aCell[] */
2819   int nOld;                    /* Number of pages in apOld[] */
2820   int nNew;                    /* Number of pages in apNew[] */
2821   int nDiv;                    /* Number of cells in apDiv[] */
2822   int i, j, k;                 /* Loop counters */
2823   int idx;                     /* Index of pPage in pParent->aCell[] */
2824   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
2825   int rc;                      /* The return code */
2826   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
2827   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
2828   int usableSpace;             /* Bytes in pPage beyond the header */
2829   int pageFlags;               /* Value of pPage->aData[0] */
2830   int subtotal;                /* Subtotal of bytes in cells on one page */
2831   int iSpace = 0;              /* First unused byte of aSpace[] */
2832   MemPage *apOld[NB];          /* pPage and up to two siblings */
2833   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
2834   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
2835   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
2836   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
2837   int idxDiv[NB];              /* Indices of divider cells in pParent */
2838   u8 *apDiv[NB];               /* Divider cells in pParent */
2839   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
2840   int szNew[NB+2];             /* Combined size of cells place on i-th page */
2841   u8 *apCell[(MX_CELL+2)*NB];  /* All cells from pages being balanced */
2842   int szCell[(MX_CELL+2)*NB];  /* Local size of all cells */
2843   u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)];  /* Space for apCopy[] */
2844   u8 aSpace[MX_PAGE_SIZE*5];   /* Space to copies of divider cells */
2845 
2846   /*
2847   ** Find the parent page.
2848   */
2849   assert( pPage->isInit );
2850   assert( sqlite3pager_iswriteable(pPage->aData) );
2851   pBt = pPage->pBt;
2852   pParent = pPage->pParent;
2853   sqlite3pager_write(pParent->aData);
2854   assert( pParent );
2855   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
2856 
2857   /*
2858   ** Find the cell in the parent page whose left child points back
2859   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
2860   ** is the rightmost child of pParent then set idx to pParent->nCell
2861   */
2862   if( pParent->idxShift ){
2863     Pgno pgno;
2864     pgno = pPage->pgno;
2865     assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
2866     for(idx=0; idx<pParent->nCell; idx++){
2867       if( get4byte(findCell(pParent, idx))==pgno ){
2868         break;
2869       }
2870     }
2871     assert( idx<pParent->nCell
2872              || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
2873   }else{
2874     idx = pPage->idxParent;
2875   }
2876 
2877   /*
2878   ** Initialize variables so that it will be safe to jump
2879   ** directly to balance_cleanup at any moment.
2880   */
2881   nOld = nNew = 0;
2882   sqlite3pager_ref(pParent->aData);
2883 
2884   /*
2885   ** Find sibling pages to pPage and the cells in pParent that divide
2886   ** the siblings.  An attempt is made to find NN siblings on either
2887   ** side of pPage.  More siblings are taken from one side, however, if
2888   ** pPage there are fewer than NN siblings on the other side.  If pParent
2889   ** has NB or fewer children then all children of pParent are taken.
2890   */
2891   nxDiv = idx - NN;
2892   if( nxDiv + NB > pParent->nCell ){
2893     nxDiv = pParent->nCell - NB + 1;
2894   }
2895   if( nxDiv<0 ){
2896     nxDiv = 0;
2897   }
2898   nDiv = 0;
2899   for(i=0, k=nxDiv; i<NB; i++, k++){
2900     if( k<pParent->nCell ){
2901       idxDiv[i] = k;
2902       apDiv[i] = findCell(pParent, k);
2903       nDiv++;
2904       assert( !pParent->leaf );
2905       pgnoOld[i] = get4byte(apDiv[i]);
2906     }else if( k==pParent->nCell ){
2907       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
2908     }else{
2909       break;
2910     }
2911     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
2912     if( rc ) goto balance_cleanup;
2913     apOld[i]->idxParent = k;
2914     apCopy[i] = 0;
2915     assert( i==nOld );
2916     nOld++;
2917   }
2918 
2919   /*
2920   ** Make copies of the content of pPage and its siblings into aOld[].
2921   ** The rest of this function will use data from the copies rather
2922   ** that the original pages since the original pages will be in the
2923   ** process of being overwritten.
2924   */
2925   for(i=0; i<nOld; i++){
2926     MemPage *p = apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
2927     p->aData = &((u8*)p)[-pBt->pageSize];
2928     memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
2929     p->aData = &((u8*)p)[-pBt->pageSize];
2930   }
2931 
2932   /*
2933   ** Load pointers to all cells on sibling pages and the divider cells
2934   ** into the local apCell[] array.  Make copies of the divider cells
2935   ** into space obtained form aSpace[] and remove the the divider Cells
2936   ** from pParent.
2937   **
2938   ** If the siblings are on leaf pages, then the child pointers of the
2939   ** divider cells are stripped from the cells before they are copied
2940   ** into aSpace[].  In this way, all cells in apCell[] are without
2941   ** child pointers.  If siblings are not leaves, then all cell in
2942   ** apCell[] include child pointers.  Either way, all cells in apCell[]
2943   ** are alike.
2944   **
2945   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
2946   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
2947   */
2948   nCell = 0;
2949   leafCorrection = pPage->leaf*4;
2950   leafData = pPage->leafData && pPage->leaf;
2951   for(i=0; i<nOld; i++){
2952     MemPage *pOld = apCopy[i];
2953     int limit = pOld->nCell+pOld->nOverflow;
2954     for(j=0; j<limit; j++){
2955       apCell[nCell] = findOverflowCell(pOld, j);
2956       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
2957       nCell++;
2958     }
2959     if( i<nOld-1 ){
2960       int sz = cellSizePtr(pParent, apDiv[i]);
2961       if( leafData ){
2962         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
2963         ** are duplicates of keys on the child pages.  We need to remove
2964         ** the divider cells from pParent, but the dividers cells are not
2965         ** added to apCell[] because they are duplicates of child cells.
2966         */
2967         dropCell(pParent, nxDiv, sz);
2968       }else{
2969         u8 *pTemp;
2970         szCell[nCell] = sz;
2971         pTemp = &aSpace[iSpace];
2972         iSpace += sz;
2973         assert( iSpace<=sizeof(aSpace) );
2974         memcpy(pTemp, apDiv[i], sz);
2975         apCell[nCell] = pTemp+leafCorrection;
2976         dropCell(pParent, nxDiv, sz);
2977         szCell[nCell] -= leafCorrection;
2978         assert( get4byte(pTemp)==pgnoOld[i] );
2979         if( !pOld->leaf ){
2980           assert( leafCorrection==0 );
2981           /* The right pointer of the child page pOld becomes the left
2982           ** pointer of the divider cell */
2983           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
2984         }else{
2985           assert( leafCorrection==4 );
2986         }
2987         nCell++;
2988       }
2989     }
2990   }
2991 
2992   /*
2993   ** Figure out the number of pages needed to hold all nCell cells.
2994   ** Store this number in "k".  Also compute szNew[] which is the total
2995   ** size of all cells on the i-th page and cntNew[] which is the index
2996   ** in apCell[] of the cell that divides page i from page i+1.
2997   ** cntNew[k] should equal nCell.
2998   **
2999   ** Values computed by this block:
3000   **
3001   **           k: The total number of sibling pages
3002   **    szNew[i]: Spaced used on the i-th sibling page.
3003   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
3004   **              the right of the i-th sibling page.
3005   ** usableSpace: Number of bytes of space available on each sibling.
3006   **
3007   */
3008   usableSpace = pBt->usableSize - 12 + leafCorrection;
3009   for(subtotal=k=i=0; i<nCell; i++){
3010     subtotal += szCell[i] + 2;
3011     if( subtotal > usableSpace ){
3012       szNew[k] = subtotal - szCell[i];
3013       cntNew[k] = i;
3014       if( leafData ){ i--; }
3015       subtotal = 0;
3016       k++;
3017     }
3018   }
3019   szNew[k] = subtotal;
3020   cntNew[k] = nCell;
3021   k++;
3022 
3023   /*
3024   ** The packing computed by the previous block is biased toward the siblings
3025   ** on the left side.  The left siblings are always nearly full, while the
3026   ** right-most sibling might be nearly empty.  This block of code attempts
3027   ** to adjust the packing of siblings to get a better balance.
3028   **
3029   ** This adjustment is more than an optimization.  The packing above might
3030   ** be so out of balance as to be illegal.  For example, the right-most
3031   ** sibling might be completely empty.  This adjustment is not optional.
3032   */
3033   for(i=k-1; i>0; i--){
3034     int szRight = szNew[i];  /* Size of sibling on the right */
3035     int szLeft = szNew[i-1]; /* Size of sibling on the left */
3036     int r;              /* Index of right-most cell in left sibling */
3037     int d;              /* Index of first cell to the left of right sibling */
3038 
3039     r = cntNew[i-1] - 1;
3040     d = r + 1 - leafData;
3041     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
3042       szRight += szCell[d] + 2;
3043       szLeft -= szCell[r] + 2;
3044       cntNew[i-1]--;
3045       r = cntNew[i-1] - 1;
3046       d = r + 1 - leafData;
3047     }
3048     szNew[i] = szRight;
3049     szNew[i-1] = szLeft;
3050   }
3051   assert( cntNew[0]>0 );
3052 
3053   /*
3054   ** Allocate k new pages.  Reuse old pages where possible.
3055   */
3056   assert( pPage->pgno>1 );
3057   pageFlags = pPage->aData[0];
3058   for(i=0; i<k; i++){
3059     MemPage *pNew;
3060     if( i<nOld ){
3061       pNew = apNew[i] = apOld[i];
3062       pgnoNew[i] = pgnoOld[i];
3063       apOld[i] = 0;
3064       sqlite3pager_write(pNew->aData);
3065     }else{
3066       rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1]);
3067       if( rc ) goto balance_cleanup;
3068       apNew[i] = pNew;
3069     }
3070     nNew++;
3071     zeroPage(pNew, pageFlags);
3072   }
3073 
3074   /* Free any old pages that were not reused as new pages.
3075   */
3076   while( i<nOld ){
3077     rc = freePage(apOld[i]);
3078     if( rc ) goto balance_cleanup;
3079     releasePage(apOld[i]);
3080     apOld[i] = 0;
3081     i++;
3082   }
3083 
3084   /*
3085   ** Put the new pages in accending order.  This helps to
3086   ** keep entries in the disk file in order so that a scan
3087   ** of the table is a linear scan through the file.  That
3088   ** in turn helps the operating system to deliver pages
3089   ** from the disk more rapidly.
3090   **
3091   ** An O(n^2) insertion sort algorithm is used, but since
3092   ** n is never more than NB (a small constant), that should
3093   ** not be a problem.
3094   **
3095   ** When NB==3, this one optimization makes the database
3096   ** about 25% faster for large insertions and deletions.
3097   */
3098   for(i=0; i<k-1; i++){
3099     int minV = pgnoNew[i];
3100     int minI = i;
3101     for(j=i+1; j<k; j++){
3102       if( pgnoNew[j]<(unsigned)minV ){
3103         minI = j;
3104         minV = pgnoNew[j];
3105       }
3106     }
3107     if( minI>i ){
3108       int t;
3109       MemPage *pT;
3110       t = pgnoNew[i];
3111       pT = apNew[i];
3112       pgnoNew[i] = pgnoNew[minI];
3113       apNew[i] = apNew[minI];
3114       pgnoNew[minI] = t;
3115       apNew[minI] = pT;
3116     }
3117   }
3118   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
3119     pgnoOld[0],
3120     nOld>=2 ? pgnoOld[1] : 0,
3121     nOld>=3 ? pgnoOld[2] : 0,
3122     pgnoNew[0], szNew[0],
3123     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
3124     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
3125     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
3126     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
3127 
3128 
3129   /*
3130   ** Evenly distribute the data in apCell[] across the new pages.
3131   ** Insert divider cells into pParent as necessary.
3132   */
3133   j = 0;
3134   for(i=0; i<nNew; i++){
3135     MemPage *pNew = apNew[i];
3136     assert( pNew->pgno==pgnoNew[i] );
3137     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
3138     j = cntNew[i];
3139     assert( pNew->nCell>0 );
3140     assert( pNew->nOverflow==0 );
3141     if( i<nNew-1 && j<nCell ){
3142       u8 *pCell;
3143       u8 *pTemp;
3144       int sz;
3145       pCell = apCell[j];
3146       sz = szCell[j] + leafCorrection;
3147       if( !pNew->leaf ){
3148         memcpy(&pNew->aData[8], pCell, 4);
3149         pTemp = 0;
3150       }else if( leafData ){
3151         CellInfo info;
3152         j--;
3153         parseCellPtr(pNew, apCell[j], &info);
3154         pCell = &aSpace[iSpace];
3155         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
3156         iSpace += sz;
3157         assert( iSpace<=sizeof(aSpace) );
3158         pTemp = 0;
3159       }else{
3160         pCell -= 4;
3161         pTemp = &aSpace[iSpace];
3162         iSpace += sz;
3163         assert( iSpace<=sizeof(aSpace) );
3164       }
3165       insertCell(pParent, nxDiv, pCell, sz, pTemp);
3166       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
3167       j++;
3168       nxDiv++;
3169     }
3170   }
3171   assert( j==nCell );
3172   if( (pageFlags & PTF_LEAF)==0 ){
3173     memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
3174   }
3175   if( nxDiv==pParent->nCell+pParent->nOverflow ){
3176     /* Right-most sibling is the right-most child of pParent */
3177     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
3178   }else{
3179     /* Right-most sibling is the left child of the first entry in pParent
3180     ** past the right-most divider entry */
3181     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
3182   }
3183 
3184   /*
3185   ** Reparent children of all cells.
3186   */
3187   for(i=0; i<nNew; i++){
3188     reparentChildPages(apNew[i]);
3189   }
3190   reparentChildPages(pParent);
3191 
3192   /*
3193   ** Balance the parent page.  Note that the current page (pPage) might
3194   ** have been added to the freelist is it might no longer be initialized.
3195   ** But the parent page will always be initialized.
3196   */
3197   assert( pParent->isInit );
3198   /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
3199   /* pageIntegrity(pPage);    // No! pPage might have been added to freelist */
3200   rc = balance(pParent);
3201 
3202   /*
3203   ** Cleanup before returning.
3204   */
3205 balance_cleanup:
3206   for(i=0; i<nOld; i++){
3207     releasePage(apOld[i]);
3208   }
3209   for(i=0; i<nNew; i++){
3210     releasePage(apNew[i]);
3211   }
3212   releasePage(pParent);
3213   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
3214           pPage->pgno, nOld, nNew, nCell));
3215   return rc;
3216 }
3217 
3218 /*
3219 ** This routine is called for the root page of a btree when the root
3220 ** page contains no cells.  This is an opportunity to make the tree
3221 ** shallower by one level.
3222 */
3223 static int balance_shallower(MemPage *pPage){
3224   MemPage *pChild;             /* The only child page of pPage */
3225   Pgno pgnoChild;              /* Page number for pChild */
3226   int rc;                      /* Return code from subprocedures */
3227   u8 *apCell[(MX_CELL+2)*NB];  /* All cells from pages being balanced */
3228   int szCell[(MX_CELL+2)*NB];  /* Local size of all cells */
3229 
3230   assert( pPage->pParent==0 );
3231   assert( pPage->nCell==0 );
3232   if( pPage->leaf ){
3233     /* The table is completely empty */
3234     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
3235   }else{
3236     /* The root page is empty but has one child.  Transfer the
3237     ** information from that one child into the root page if it
3238     ** will fit.  This reduces the depth of the tree by one.
3239     **
3240     ** If the root page is page 1, it has less space available than
3241     ** its child (due to the 100 byte header that occurs at the beginning
3242     ** of the database fle), so it might not be able to hold all of the
3243     ** information currently contained in the child.  If this is the
3244     ** case, then do not do the transfer.  Leave page 1 empty except
3245     ** for the right-pointer to the child page.  The child page becomes
3246     ** the virtual root of the tree.
3247     */
3248     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3249     assert( pgnoChild>0 );
3250     assert( pgnoChild<=sqlite3pager_pagecount(pPage->pBt->pPager) );
3251     rc = getPage(pPage->pBt, pgnoChild, &pChild);
3252     if( rc ) return rc;
3253     if( pPage->pgno==1 ){
3254       rc = initPage(pChild, pPage);
3255       if( rc ) return rc;
3256       assert( pChild->nOverflow==0 );
3257       if( pChild->nFree>=100 ){
3258         /* The child information will fit on the root page, so do the
3259         ** copy */
3260         int i;
3261         zeroPage(pPage, pChild->aData[0]);
3262         for(i=0; i<pChild->nCell; i++){
3263           apCell[i] = findCell(pChild,i);
3264           szCell[i] = cellSizePtr(pChild, apCell[i]);
3265         }
3266         assemblePage(pPage, pChild->nCell, apCell, szCell);
3267         freePage(pChild);
3268         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
3269       }else{
3270         /* The child has more information that will fit on the root.
3271         ** The tree is already balanced.  Do nothing. */
3272         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
3273       }
3274     }else{
3275       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
3276       pPage->isInit = 0;
3277       pPage->pParent = 0;
3278       rc = initPage(pPage, 0);
3279       assert( rc==SQLITE_OK );
3280       freePage(pChild);
3281       TRACE(("BALANCE: transfer child %d into root %d\n",
3282               pChild->pgno, pPage->pgno));
3283     }
3284     reparentChildPages(pPage);
3285     releasePage(pChild);
3286   }
3287   return SQLITE_OK;
3288 }
3289 
3290 
3291 /*
3292 ** The root page is overfull
3293 **
3294 ** When this happens, Create a new child page and copy the
3295 ** contents of the root into the child.  Then make the root
3296 ** page an empty page with rightChild pointing to the new
3297 ** child.   Finally, call balance_internal() on the new child
3298 ** to cause it to split.
3299 */
3300 static int balance_deeper(MemPage *pPage){
3301   int rc;             /* Return value from subprocedures */
3302   MemPage *pChild;    /* Pointer to a new child page */
3303   Pgno pgnoChild;     /* Page number of the new child page */
3304   Btree *pBt;         /* The BTree */
3305   int usableSize;     /* Total usable size of a page */
3306   u8 *data;           /* Content of the parent page */
3307   u8 *cdata;          /* Content of the child page */
3308   int hdr;            /* Offset to page header in parent */
3309   int brk;            /* Offset to content of first cell in parent */
3310 
3311   assert( pPage->pParent==0 );
3312   assert( pPage->nOverflow>0 );
3313   pBt = pPage->pBt;
3314   rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
3315   if( rc ) return rc;
3316   assert( sqlite3pager_iswriteable(pChild->aData) );
3317   usableSize = pBt->usableSize;
3318   data = pPage->aData;
3319   hdr = pPage->hdrOffset;
3320   brk = get2byte(&data[hdr+5]);
3321   cdata = pChild->aData;
3322   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
3323   memcpy(&cdata[brk], &data[brk], usableSize-brk);
3324   rc = initPage(pChild, pPage);
3325   if( rc ) return rc;
3326   memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
3327   pChild->nOverflow = pPage->nOverflow;
3328   if( pChild->nOverflow ){
3329     pChild->nFree = 0;
3330   }
3331   assert( pChild->nCell==pPage->nCell );
3332   zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
3333   put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
3334   TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
3335   rc = balance_nonroot(pChild);
3336   releasePage(pChild);
3337   return rc;
3338 }
3339 
3340 /*
3341 ** Decide if the page pPage needs to be balanced.  If balancing is
3342 ** required, call the appropriate balancing routine.
3343 */
3344 static int balance(MemPage *pPage){
3345   int rc = SQLITE_OK;
3346   if( pPage->pParent==0 ){
3347     if( pPage->nOverflow>0 ){
3348       rc = balance_deeper(pPage);
3349     }
3350     if( pPage->nCell==0 ){
3351       rc = balance_shallower(pPage);
3352     }
3353   }else{
3354     if( pPage->nOverflow>0 || pPage->nFree>pPage->pBt->usableSize*2/3 ){
3355       rc = balance_nonroot(pPage);
3356     }
3357   }
3358   return rc;
3359 }
3360 
3361 /*
3362 ** This routine checks all cursors that point to the same table
3363 ** as pCur points to.  If any of those cursors were opened with
3364 ** wrFlag==0 then this routine returns SQLITE_LOCKED.  If all
3365 ** cursors point to the same table were opened with wrFlag==1
3366 ** then this routine returns SQLITE_OK.
3367 **
3368 ** In addition to checking for read-locks (where a read-lock
3369 ** means a cursor opened with wrFlag==0) this routine also moves
3370 ** all cursors other than pCur so that they are pointing to the
3371 ** first Cell on root page.  This is necessary because an insert
3372 ** or delete might change the number of cells on a page or delete
3373 ** a page entirely and we do not want to leave any cursors
3374 ** pointing to non-existant pages or cells.
3375 */
3376 static int checkReadLocks(BtCursor *pCur){
3377   BtCursor *p;
3378   assert( pCur->wrFlag );
3379   for(p=pCur->pShared; p!=pCur; p=p->pShared){
3380     assert( p );
3381     assert( p->pgnoRoot==pCur->pgnoRoot );
3382     assert( p->pPage->pgno==sqlite3pager_pagenumber(p->pPage->aData) );
3383     if( p->wrFlag==0 ) return SQLITE_LOCKED;
3384     if( p->pPage->pgno!=p->pgnoRoot ){
3385       moveToRoot(p);
3386     }
3387   }
3388   return SQLITE_OK;
3389 }
3390 
3391 /*
3392 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
3393 ** and the data is given by (pData,nData).  The cursor is used only to
3394 ** define what table the record should be inserted into.  The cursor
3395 ** is left pointing at a random location.
3396 **
3397 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
3398 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
3399 */
3400 int sqlite3BtreeInsert(
3401   BtCursor *pCur,                /* Insert data into the table of this cursor */
3402   const void *pKey, i64 nKey,    /* The key of the new record */
3403   const void *pData, int nData   /* The data of the new record */
3404 ){
3405   int rc;
3406   int loc;
3407   int szNew;
3408   MemPage *pPage;
3409   Btree *pBt = pCur->pBt;
3410   unsigned char *oldCell;
3411   unsigned char newCell[MX_CELL_SIZE];
3412 
3413   if( pCur->status ){
3414     return pCur->status;  /* A rollback destroyed this cursor */
3415   }
3416   if( pBt->inTrans!=TRANS_WRITE ){
3417     /* Must start a transaction before doing an insert */
3418     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3419   }
3420   assert( !pBt->readOnly );
3421   if( !pCur->wrFlag ){
3422     return SQLITE_PERM;   /* Cursor not open for writing */
3423   }
3424   if( checkReadLocks(pCur) ){
3425     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3426   }
3427   rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
3428   if( rc ) return rc;
3429   pPage = pCur->pPage;
3430   assert( pPage->intKey || nKey>=0 );
3431   assert( pPage->leaf || !pPage->leafData );
3432   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
3433           pCur->pgnoRoot, nKey, nData, pPage->pgno,
3434           loc==0 ? "overwrite" : "new entry"));
3435   assert( pPage->isInit );
3436   rc = sqlite3pager_write(pPage->aData);
3437   if( rc ) return rc;
3438   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
3439   if( rc ) return rc;
3440   assert( szNew==cellSizePtr(pPage, newCell) );
3441   assert( szNew<=sizeof(newCell) );
3442   if( loc==0 && pCur->isValid ){
3443     int szOld;
3444     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3445     oldCell = findCell(pPage, pCur->idx);
3446     if( !pPage->leaf ){
3447       memcpy(newCell, oldCell, 4);
3448     }
3449     szOld = cellSizePtr(pPage, oldCell);
3450     rc = clearCell(pPage, oldCell);
3451     if( rc ) return rc;
3452     dropCell(pPage, pCur->idx, szOld);
3453   }else if( loc<0 && pPage->nCell>0 ){
3454     assert( pPage->leaf );
3455     pCur->idx++;
3456     pCur->info.nSize = 0;
3457   }else{
3458     assert( pPage->leaf );
3459   }
3460   insertCell(pPage, pCur->idx, newCell, szNew, 0);
3461   rc = balance(pPage);
3462   /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
3463   /* fflush(stdout); */
3464   moveToRoot(pCur);
3465   return rc;
3466 }
3467 
3468 /*
3469 ** Delete the entry that the cursor is pointing to.  The cursor
3470 ** is left pointing at a random location.
3471 */
3472 int sqlite3BtreeDelete(BtCursor *pCur){
3473   MemPage *pPage = pCur->pPage;
3474   unsigned char *pCell;
3475   int rc;
3476   Pgno pgnoChild;
3477   Btree *pBt = pCur->pBt;
3478 
3479   assert( pPage->isInit );
3480   if( pCur->status ){
3481     return pCur->status;  /* A rollback destroyed this cursor */
3482   }
3483   if( pBt->inTrans!=TRANS_WRITE ){
3484     /* Must start a transaction before doing a delete */
3485     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3486   }
3487   assert( !pBt->readOnly );
3488   if( pCur->idx >= pPage->nCell ){
3489     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
3490   }
3491   if( !pCur->wrFlag ){
3492     return SQLITE_PERM;   /* Did not open this cursor for writing */
3493   }
3494   if( checkReadLocks(pCur) ){
3495     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3496   }
3497   rc = sqlite3pager_write(pPage->aData);
3498   if( rc ) return rc;
3499   pCell = findCell(pPage, pCur->idx);
3500   if( !pPage->leaf ){
3501     pgnoChild = get4byte(pCell);
3502   }
3503   clearCell(pPage, pCell);
3504   if( !pPage->leaf ){
3505     /*
3506     ** The entry we are about to delete is not a leaf so if we do not
3507     ** do something we will leave a hole on an internal page.
3508     ** We have to fill the hole by moving in a cell from a leaf.  The
3509     ** next Cell after the one to be deleted is guaranteed to exist and
3510     ** to be a leaf so we can use it.
3511     */
3512     BtCursor leafCur;
3513     unsigned char *pNext;
3514     int szNext;
3515     int notUsed;
3516     unsigned char tempCell[MX_CELL_SIZE];
3517     assert( !pPage->leafData );
3518     getTempCursor(pCur, &leafCur);
3519     rc = sqlite3BtreeNext(&leafCur, &notUsed);
3520     if( rc!=SQLITE_OK ){
3521       if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3522       return rc;
3523     }
3524     rc = sqlite3pager_write(leafCur.pPage->aData);
3525     if( rc ) return rc;
3526     TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
3527        pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
3528     dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
3529     pNext = findCell(leafCur.pPage, leafCur.idx);
3530     szNext = cellSizePtr(leafCur.pPage, pNext);
3531     assert( sizeof(tempCell)>=szNext+4 );
3532     insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell);
3533     put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
3534     rc = balance(pPage);
3535     if( rc ) return rc;
3536     dropCell(leafCur.pPage, leafCur.idx, szNext);
3537     rc = balance(leafCur.pPage);
3538     releaseTempCursor(&leafCur);
3539   }else{
3540     TRACE(("DELETE: table=%d delete from leaf %d\n",
3541        pCur->pgnoRoot, pPage->pgno));
3542     dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
3543     rc = balance(pPage);
3544   }
3545   moveToRoot(pCur);
3546   return rc;
3547 }
3548 
3549 /*
3550 ** Create a new BTree table.  Write into *piTable the page
3551 ** number for the root page of the new table.
3552 **
3553 ** The type of type is determined by the flags parameter.  Only the
3554 ** following values of flags are currently in use.  Other values for
3555 ** flags might not work:
3556 **
3557 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
3558 **     BTREE_ZERODATA                  Used for SQL indices
3559 */
3560 int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
3561   MemPage *pRoot;
3562   Pgno pgnoRoot;
3563   int rc;
3564   if( pBt->inTrans!=TRANS_WRITE ){
3565     /* Must start a transaction first */
3566     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3567   }
3568   if( pBt->readOnly ){
3569     return SQLITE_READONLY;
3570   }
3571   rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1);
3572   if( rc ) return rc;
3573   assert( sqlite3pager_iswriteable(pRoot->aData) );
3574   zeroPage(pRoot, flags | PTF_LEAF);
3575   sqlite3pager_unref(pRoot->aData);
3576   *piTable = (int)pgnoRoot;
3577   return SQLITE_OK;
3578 }
3579 
3580 /*
3581 ** Erase the given database page and all its children.  Return
3582 ** the page to the freelist.
3583 */
3584 static int clearDatabasePage(
3585   Btree *pBt,           /* The BTree that contains the table */
3586   Pgno pgno,            /* Page number to clear */
3587   MemPage *pParent,     /* Parent page.  NULL for the root */
3588   int freePageFlag      /* Deallocate page if true */
3589 ){
3590   MemPage *pPage;
3591   int rc;
3592   unsigned char *pCell;
3593   int i;
3594 
3595   rc = getAndInitPage(pBt, pgno, &pPage, pParent);
3596   if( rc ) return rc;
3597   rc = sqlite3pager_write(pPage->aData);
3598   if( rc ) return rc;
3599   for(i=0; i<pPage->nCell; i++){
3600     pCell = findCell(pPage, i);
3601     if( !pPage->leaf ){
3602       rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
3603       if( rc ) return rc;
3604     }
3605     rc = clearCell(pPage, pCell);
3606     if( rc ) return rc;
3607   }
3608   if( !pPage->leaf ){
3609     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
3610     if( rc ) return rc;
3611   }
3612   if( freePageFlag ){
3613     rc = freePage(pPage);
3614   }else{
3615     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
3616   }
3617   releasePage(pPage);
3618   return rc;
3619 }
3620 
3621 /*
3622 ** Delete all information from a single table in the database.  iTable is
3623 ** the page number of the root of the table.  After this routine returns,
3624 ** the root page is empty, but still exists.
3625 **
3626 ** This routine will fail with SQLITE_LOCKED if there are any open
3627 ** read cursors on the table.  Open write cursors are moved to the
3628 ** root of the table.
3629 */
3630 int sqlite3BtreeClearTable(Btree *pBt, int iTable){
3631   int rc;
3632   BtCursor *pCur;
3633   if( pBt->inTrans!=TRANS_WRITE ){
3634     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3635   }
3636   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3637     if( pCur->pgnoRoot==(Pgno)iTable ){
3638       if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3639       moveToRoot(pCur);
3640     }
3641   }
3642   rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
3643   if( rc ){
3644     sqlite3BtreeRollback(pBt);
3645   }
3646   return rc;
3647 }
3648 
3649 /*
3650 ** Erase all information in a table and add the root of the table to
3651 ** the freelist.  Except, the root of the principle table (the one on
3652 ** page 1) is never added to the freelist.
3653 **
3654 ** This routine will fail with SQLITE_LOCKED if there are any open
3655 ** cursors on the table.
3656 */
3657 int sqlite3BtreeDropTable(Btree *pBt, int iTable){
3658   int rc;
3659   MemPage *pPage;
3660   BtCursor *pCur;
3661   if( pBt->inTrans!=TRANS_WRITE ){
3662     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3663   }
3664   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3665     if( pCur->pgnoRoot==(Pgno)iTable ){
3666       return SQLITE_LOCKED;  /* Cannot drop a table that has a cursor */
3667     }
3668   }
3669   rc = getPage(pBt, (Pgno)iTable, &pPage);
3670   if( rc ) return rc;
3671   rc = sqlite3BtreeClearTable(pBt, iTable);
3672   if( rc ) return rc;
3673   if( iTable>1 ){
3674     rc = freePage(pPage);
3675   }else{
3676     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
3677   }
3678   releasePage(pPage);
3679   return rc;
3680 }
3681 
3682 
3683 /*
3684 ** Read the meta-information out of a database file.  Meta[0]
3685 ** is the number of free pages currently in the database.  Meta[1]
3686 ** through meta[15] are available for use by higher layers.  Meta[0]
3687 ** is read-only, the others are read/write.
3688 **
3689 ** The schema layer numbers meta values differently.  At the schema
3690 ** layer (and the SetCookie and ReadCookie opcodes) the number of
3691 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
3692 */
3693 int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
3694   int rc;
3695   unsigned char *pP1;
3696 
3697   assert( idx>=0 && idx<=15 );
3698   rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
3699   if( rc ) return rc;
3700   *pMeta = get4byte(&pP1[36 + idx*4]);
3701   sqlite3pager_unref(pP1);
3702   return SQLITE_OK;
3703 }
3704 
3705 /*
3706 ** Write meta-information back into the database.  Meta[0] is
3707 ** read-only and may not be written.
3708 */
3709 int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
3710   unsigned char *pP1;
3711   int rc;
3712   assert( idx>=1 && idx<=15 );
3713   if( pBt->inTrans!=TRANS_WRITE ){
3714     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3715   }
3716   assert( pBt->pPage1!=0 );
3717   pP1 = pBt->pPage1->aData;
3718   rc = sqlite3pager_write(pP1);
3719   if( rc ) return rc;
3720   put4byte(&pP1[36 + idx*4], iMeta);
3721   return SQLITE_OK;
3722 }
3723 
3724 /*
3725 ** Return the flag byte at the beginning of the page that the cursor
3726 ** is currently pointing to.
3727 */
3728 int sqlite3BtreeFlags(BtCursor *pCur){
3729   MemPage *pPage = pCur->pPage;
3730   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
3731 }
3732 
3733 /*
3734 ** Print a disassembly of the given page on standard output.  This routine
3735 ** is used for debugging and testing only.
3736 */
3737 #ifdef SQLITE_TEST
3738 int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
3739   int rc;
3740   MemPage *pPage;
3741   int i, j, c;
3742   int nFree;
3743   u16 idx;
3744   int hdr;
3745   int nCell;
3746   int isInit;
3747   unsigned char *data;
3748   char range[20];
3749   unsigned char payload[20];
3750 
3751   rc = getPage(pBt, (Pgno)pgno, &pPage);
3752   isInit = pPage->isInit;
3753   if( pPage->isInit==0 ){
3754     initPage(pPage, 0);
3755   }
3756   if( rc ){
3757     return rc;
3758   }
3759   hdr = pPage->hdrOffset;
3760   data = pPage->aData;
3761   c = data[hdr];
3762   pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
3763   pPage->zeroData = (c & PTF_ZERODATA)!=0;
3764   pPage->leafData = (c & PTF_LEAFDATA)!=0;
3765   pPage->leaf = (c & PTF_LEAF)!=0;
3766   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
3767   nCell = get2byte(&data[hdr+3]);
3768   printf("PAGE %d:  flags=0x%02x  frag=%d   parent=%d\n", pgno,
3769     data[hdr], data[hdr+7],
3770     (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
3771   assert( hdr == (pgno==1 ? 100 : 0) );
3772   idx = hdr + 12 - pPage->leaf*4;
3773   for(i=0; i<nCell; i++){
3774     CellInfo info;
3775     Pgno child;
3776     unsigned char *pCell;
3777     int sz;
3778     int addr;
3779 
3780     addr = get2byte(&data[idx + 2*i]);
3781     pCell = &data[addr];
3782     parseCellPtr(pPage, pCell, &info);
3783     sz = info.nSize;
3784     sprintf(range,"%d..%d", addr, addr+sz-1);
3785     if( pPage->leaf ){
3786       child = 0;
3787     }else{
3788       child = get4byte(pCell);
3789     }
3790     sz = info.nData;
3791     if( !pPage->intKey ) sz += info.nKey;
3792     if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3793     memcpy(payload, &pCell[info.nHeader], sz);
3794     for(j=0; j<sz; j++){
3795       if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3796     }
3797     payload[sz] = 0;
3798     printf(
3799       "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n",
3800       i, range, child, info.nKey, info.nData, payload
3801     );
3802   }
3803   if( !pPage->leaf ){
3804     printf("right_child: %d\n", get4byte(&data[hdr+8]));
3805   }
3806   nFree = 0;
3807   i = 0;
3808   idx = get2byte(&data[hdr+1]);
3809   while( idx>0 && idx<pPage->pBt->usableSize ){
3810     int sz = get2byte(&data[idx+2]);
3811     sprintf(range,"%d..%d", idx, idx+sz-1);
3812     nFree += sz;
3813     printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
3814        i, range, sz, nFree);
3815     idx = get2byte(&data[idx]);
3816     i++;
3817   }
3818   if( idx!=0 ){
3819     printf("ERROR: next freeblock index out of range: %d\n", idx);
3820   }
3821   if( recursive && !pPage->leaf ){
3822     for(i=0; i<nCell; i++){
3823       unsigned char *pCell = findCell(pPage, i);
3824       sqlite3BtreePageDump(pBt, get4byte(pCell), 1);
3825       idx = get2byte(pCell);
3826     }
3827     sqlite3BtreePageDump(pBt, get4byte(&data[hdr+8]), 1);
3828   }
3829   pPage->isInit = isInit;
3830   sqlite3pager_unref(data);
3831   fflush(stdout);
3832   return SQLITE_OK;
3833 }
3834 #endif
3835 
3836 #ifdef SQLITE_TEST
3837 /*
3838 ** Fill aResult[] with information about the entry and page that the
3839 ** cursor is pointing to.
3840 **
3841 **   aResult[0] =  The page number
3842 **   aResult[1] =  The entry number
3843 **   aResult[2] =  Total number of entries on this page
3844 **   aResult[3] =  Size of this entry
3845 **   aResult[4] =  Number of free bytes on this page
3846 **   aResult[5] =  Number of free blocks on the page
3847 **   aResult[6] =  Page number of the left child of this entry
3848 **   aResult[7] =  Page number of the right child for the whole page
3849 **
3850 ** This routine is used for testing and debugging only.
3851 */
3852 int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult){
3853   int cnt, idx;
3854   MemPage *pPage = pCur->pPage;
3855 
3856   pageIntegrity(pPage);
3857   assert( pPage->isInit );
3858   aResult[0] = sqlite3pager_pagenumber(pPage->aData);
3859   assert( aResult[0]==pPage->pgno );
3860   aResult[1] = pCur->idx;
3861   aResult[2] = pPage->nCell;
3862   if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
3863     u8 *pCell = findCell(pPage, pCur->idx);
3864     aResult[3] = cellSizePtr(pPage, pCell);
3865     aResult[6] = pPage->leaf ? 0 : get4byte(pCell);
3866   }else{
3867     aResult[3] = 0;
3868     aResult[6] = 0;
3869   }
3870   aResult[4] = pPage->nFree;
3871   cnt = 0;
3872   idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
3873   while( idx>0 && idx<pPage->pBt->usableSize ){
3874     cnt++;
3875     idx = get2byte(&pPage->aData[idx]);
3876   }
3877   aResult[5] = cnt;
3878   aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+8]);
3879   return SQLITE_OK;
3880 }
3881 #endif
3882 
3883 /*
3884 ** Return the pager associated with a BTree.  This routine is used for
3885 ** testing and debugging only.
3886 */
3887 Pager *sqlite3BtreePager(Btree *pBt){
3888   return pBt->pPager;
3889 }
3890 
3891 /*
3892 ** This structure is passed around through all the sanity checking routines
3893 ** in order to keep track of some global state information.
3894 */
3895 typedef struct IntegrityCk IntegrityCk;
3896 struct IntegrityCk {
3897   Btree *pBt;    /* The tree being checked out */
3898   Pager *pPager; /* The associated pager.  Also accessible by pBt->pPager */
3899   int nPage;     /* Number of pages in the database */
3900   int *anRef;    /* Number of times each page is referenced */
3901   char *zErrMsg; /* An error message.  NULL of no errors seen. */
3902 };
3903 
3904 /*
3905 ** Append a message to the error message string.
3906 */
3907 static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
3908   if( pCheck->zErrMsg ){
3909     char *zOld = pCheck->zErrMsg;
3910     pCheck->zErrMsg = 0;
3911     sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
3912     sqliteFree(zOld);
3913   }else{
3914     sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
3915   }
3916 }
3917 
3918 /*
3919 ** Add 1 to the reference count for page iPage.  If this is the second
3920 ** reference to the page, add an error message to pCheck->zErrMsg.
3921 ** Return 1 if there are 2 ore more references to the page and 0 if
3922 ** if this is the first reference to the page.
3923 **
3924 ** Also check that the page number is in bounds.
3925 */
3926 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
3927   if( iPage==0 ) return 1;
3928   if( iPage>pCheck->nPage || iPage<0 ){
3929     char zBuf[100];
3930     sprintf(zBuf, "invalid page number %d", iPage);
3931     checkAppendMsg(pCheck, zContext, zBuf);
3932     return 1;
3933   }
3934   if( pCheck->anRef[iPage]==1 ){
3935     char zBuf[100];
3936     sprintf(zBuf, "2nd reference to page %d", iPage);
3937     checkAppendMsg(pCheck, zContext, zBuf);
3938     return 1;
3939   }
3940   return  (pCheck->anRef[iPage]++)>1;
3941 }
3942 
3943 /*
3944 ** Check the integrity of the freelist or of an overflow page list.
3945 ** Verify that the number of pages on the list is N.
3946 */
3947 static void checkList(
3948   IntegrityCk *pCheck,  /* Integrity checking context */
3949   int isFreeList,       /* True for a freelist.  False for overflow page list */
3950   int iPage,            /* Page number for first page in the list */
3951   int N,                /* Expected number of pages in the list */
3952   char *zContext        /* Context for error messages */
3953 ){
3954   int i;
3955   int expected = N;
3956   int iFirst = iPage;
3957   char zMsg[100];
3958   while( N-- > 0 ){
3959     unsigned char *pOvfl;
3960     if( iPage<1 ){
3961       sprintf(zMsg, "%d of %d pages missing from overflow list starting at %d",
3962           N+1, expected, iFirst);
3963       checkAppendMsg(pCheck, zContext, zMsg);
3964       break;
3965     }
3966     if( checkRef(pCheck, iPage, zContext) ) break;
3967     if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3968       sprintf(zMsg, "failed to get page %d", iPage);
3969       checkAppendMsg(pCheck, zContext, zMsg);
3970       break;
3971     }
3972     if( isFreeList ){
3973       int n = get4byte(&pOvfl[4]);
3974       for(i=0; i<n; i++){
3975         checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
3976       }
3977       N -= n;
3978     }
3979     iPage = get4byte(pOvfl);
3980     sqlite3pager_unref(pOvfl);
3981   }
3982 }
3983 
3984 /*
3985 ** Do various sanity checks on a single page of a tree.  Return
3986 ** the tree depth.  Root pages return 0.  Parents of root pages
3987 ** return 1, and so forth.
3988 **
3989 ** These checks are done:
3990 **
3991 **      1.  Make sure that cells and freeblocks do not overlap
3992 **          but combine to completely cover the page.
3993 **  NO  2.  Make sure cell keys are in order.
3994 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
3995 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
3996 **      5.  Check the integrity of overflow pages.
3997 **      6.  Recursively call checkTreePage on all children.
3998 **      7.  Verify that the depth of all children is the same.
3999 **      8.  Make sure this page is at least 33% full or else it is
4000 **          the root of the tree.
4001 */
4002 static int checkTreePage(
4003   IntegrityCk *pCheck,  /* Context for the sanity check */
4004   int iPage,            /* Page number of the page to check */
4005   MemPage *pParent,     /* Parent page */
4006   char *zParentContext, /* Parent context */
4007   char *zLowerBound,    /* All keys should be greater than this, if not NULL */
4008   int nLower,           /* Number of characters in zLowerBound */
4009   char *zUpperBound,    /* All keys should be less than this, if not NULL */
4010   int nUpper            /* Number of characters in zUpperBound */
4011 ){
4012   MemPage *pPage;
4013   int i, rc, depth, d2, pgno, cnt;
4014   int hdr, cellStart;
4015   int nCell;
4016   u8 *data;
4017   BtCursor cur;
4018   Btree *pBt;
4019   int maxLocal, usableSize;
4020   char zMsg[100];
4021   char zContext[100];
4022   char hit[MX_PAGE_SIZE];
4023 
4024   /* Check that the page exists
4025   */
4026   cur.pBt = pBt = pCheck->pBt;
4027   usableSize = pBt->usableSize;
4028   if( iPage==0 ) return 0;
4029   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
4030   if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
4031     sprintf(zMsg, "unable to get the page. error code=%d", rc);
4032     checkAppendMsg(pCheck, zContext, zMsg);
4033     return 0;
4034   }
4035   maxLocal = pPage->leafData ? pBt->maxLeaf : pBt->maxLocal;
4036   if( (rc = initPage(pPage, pParent))!=0 ){
4037     sprintf(zMsg, "initPage() returns error code %d", rc);
4038     checkAppendMsg(pCheck, zContext, zMsg);
4039     releasePage(pPage);
4040     return 0;
4041   }
4042 
4043   /* Check out all the cells.
4044   */
4045   depth = 0;
4046   cur.pPage = pPage;
4047   for(i=0; i<pPage->nCell; i++){
4048     u8 *pCell;
4049     int sz;
4050     CellInfo info;
4051 
4052     /* Check payload overflow pages
4053     */
4054     sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
4055     pCell = findCell(pPage,i);
4056     parseCellPtr(pPage, pCell, &info);
4057     sz = info.nData;
4058     if( !pPage->intKey ) sz += info.nKey;
4059     if( sz>info.nLocal ){
4060       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
4061       checkList(pCheck, 0, get4byte(&pCell[info.iOverflow]),nPage,zContext);
4062     }
4063 
4064     /* Check sanity of left child page.
4065     */
4066     if( !pPage->leaf ){
4067       pgno = get4byte(pCell);
4068       d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
4069       if( i>0 && d2!=depth ){
4070         checkAppendMsg(pCheck, zContext, "Child page depth differs");
4071       }
4072       depth = d2;
4073     }
4074   }
4075   if( !pPage->leaf ){
4076     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4077     sprintf(zContext, "On page %d at right child: ", iPage);
4078     checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
4079   }
4080 
4081   /* Check for complete coverage of the page
4082   */
4083   data = pPage->aData;
4084   hdr = pPage->hdrOffset;
4085   memset(hit, 0, usableSize);
4086   memset(hit, 1, get2byte(&data[hdr+5]));
4087   nCell = get2byte(&data[hdr+3]);
4088   cellStart = hdr + 12 - 4*pPage->leaf;
4089   for(i=0; i<nCell; i++){
4090     int pc = get2byte(&data[cellStart+i*2]);
4091     int size = cellSizePtr(pPage, &data[pc]);
4092     int j;
4093     for(j=pc+size-1; j>=pc; j--) hit[j]++;
4094   }
4095   for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; cnt++){
4096     int size = get2byte(&data[i+2]);
4097     int j;
4098     for(j=i+size-1; j>=i; j--) hit[j]++;
4099     i = get2byte(&data[i]);
4100   }
4101   for(i=cnt=0; i<usableSize; i++){
4102     if( hit[i]==0 ){
4103       cnt++;
4104     }else if( hit[i]>1 ){
4105       sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
4106       checkAppendMsg(pCheck, zMsg, 0);
4107       break;
4108     }
4109   }
4110   if( cnt!=data[hdr+7] ){
4111     sprintf(zMsg, "Fragmented space is %d byte reported as %d on page %d",
4112         cnt, data[hdr+7], iPage);
4113     checkAppendMsg(pCheck, zMsg, 0);
4114   }
4115 
4116   releasePage(pPage);
4117   return depth+1;
4118 }
4119 
4120 /*
4121 ** This routine does a complete check of the given BTree file.  aRoot[] is
4122 ** an array of pages numbers were each page number is the root page of
4123 ** a table.  nRoot is the number of entries in aRoot.
4124 **
4125 ** If everything checks out, this routine returns NULL.  If something is
4126 ** amiss, an error message is written into memory obtained from malloc()
4127 ** and a pointer to that error message is returned.  The calling function
4128 ** is responsible for freeing the error message when it is done.
4129 */
4130 char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
4131   int i;
4132   int nRef;
4133   IntegrityCk sCheck;
4134 
4135   nRef = *sqlite3pager_stats(pBt->pPager);
4136   if( lockBtree(pBt)!=SQLITE_OK ){
4137     return sqliteStrDup("Unable to acquire a read lock on the database");
4138   }
4139   sCheck.pBt = pBt;
4140   sCheck.pPager = pBt->pPager;
4141   sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
4142   if( sCheck.nPage==0 ){
4143     unlockBtreeIfUnused(pBt);
4144     return 0;
4145   }
4146   sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
4147   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
4148   sCheck.zErrMsg = 0;
4149 
4150   /* Check the integrity of the freelist
4151   */
4152   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
4153             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
4154 
4155   /* Check all the tables.
4156   */
4157   for(i=0; i<nRoot; i++){
4158     if( aRoot[i]==0 ) continue;
4159     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
4160   }
4161 
4162   /* Make sure every page in the file is referenced
4163   */
4164   for(i=1; i<=sCheck.nPage; i++){
4165     if( sCheck.anRef[i]==0 ){
4166       char zBuf[100];
4167       sprintf(zBuf, "Page %d is never used", i);
4168       checkAppendMsg(&sCheck, zBuf, 0);
4169     }
4170   }
4171 
4172   /* Make sure this analysis did not leave any unref() pages
4173   */
4174   unlockBtreeIfUnused(pBt);
4175   if( nRef != *sqlite3pager_stats(pBt->pPager) ){
4176     char zBuf[100];
4177     sprintf(zBuf,
4178       "Outstanding page count goes from %d to %d during this analysis",
4179       nRef, *sqlite3pager_stats(pBt->pPager)
4180     );
4181     checkAppendMsg(&sCheck, zBuf, 0);
4182   }
4183 
4184   /* Clean  up and report errors.
4185   */
4186   sqliteFree(sCheck.anRef);
4187   return sCheck.zErrMsg;
4188 }
4189 
4190 /*
4191 ** Return the full pathname of the underlying database file.
4192 */
4193 const char *sqlite3BtreeGetFilename(Btree *pBt){
4194   assert( pBt->pPager!=0 );
4195   return sqlite3pager_filename(pBt->pPager);
4196 }
4197 
4198 /*
4199 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
4200 ** must be active for both files.
4201 **
4202 ** The size of file pBtFrom may be reduced by this operation.
4203 ** If anything goes wrong, the transaction on pBtFrom is rolled back.
4204 */
4205 int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
4206   int rc = SQLITE_OK;
4207   Pgno i, nPage, nToPage;
4208 
4209   if( pBtTo->inTrans!=TRANS_WRITE || pBtFrom->inTrans!=TRANS_WRITE ){
4210     return SQLITE_ERROR;
4211   }
4212   if( pBtTo->pCursor ) return SQLITE_BUSY;
4213   memcpy(pBtTo->pPage1->aData, pBtFrom->pPage1->aData, pBtFrom->usableSize);
4214   rc = sqlite3pager_overwrite(pBtTo->pPager, 1, pBtFrom->pPage1->aData);
4215   nToPage = sqlite3pager_pagecount(pBtTo->pPager);
4216   nPage = sqlite3pager_pagecount(pBtFrom->pPager);
4217   for(i=2; rc==SQLITE_OK && i<=nPage; i++){
4218     void *pPage;
4219     rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
4220     if( rc ) break;
4221     rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
4222     if( rc ) break;
4223     sqlite3pager_unref(pPage);
4224   }
4225   for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
4226     void *pPage;
4227     rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
4228     if( rc ) break;
4229     rc = sqlite3pager_write(pPage);
4230     sqlite3pager_unref(pPage);
4231     sqlite3pager_dont_write(pBtTo->pPager, i);
4232   }
4233   if( !rc && nPage<nToPage ){
4234     rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
4235   }
4236   if( rc ){
4237     sqlite3BtreeRollback(pBtTo);
4238   }
4239   return rc;
4240 }
4241 
4242 /*
4243 ** Return non-zero if a transaction is active.
4244 */
4245 int sqlite3BtreeIsInTrans(Btree *pBt){
4246   return (pBt && (pBt->inTrans==TRANS_WRITE));
4247 }
4248 
4249 /*
4250 ** Return non-zero if a statement transaction is active.
4251 */
4252 int sqlite3BtreeIsInStmt(Btree *pBt){
4253   return (pBt && pBt->inStmt);
4254 }
4255 
4256 /*
4257 ** This call is a no-op if no write-transaction is currently active on pBt.
4258 **
4259 ** Otherwise, sync the database file for the btree pBt. zMaster points to
4260 ** the name of a master journal file that should be written into the
4261 ** individual journal file, or is NULL, indicating no master journal file
4262 ** (single database transaction).
4263 **
4264 ** When this is called, the master journal should already have been
4265 ** created, populated with this journal pointer and synced to disk.
4266 **
4267 ** Once this is routine has returned, the only thing required to commit
4268 ** the write-transaction for this database file is to delete the journal.
4269 */
4270 int sqlite3BtreeSync(Btree *pBt, const char *zMaster){
4271   if( pBt->inTrans==TRANS_WRITE ){
4272     return sqlite3pager_sync(pBt->pPager, zMaster);
4273   }
4274   return SQLITE_OK;
4275 }
4276