xref: /sqlite-3.40.0/src/btree.c (revision 85b623f2)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.433 2007/12/13 21:54:11 drh Exp $
13 **
14 ** This file implements a external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
17 */
18 #include "btreeInt.h"
19 
20 /*
21 ** The header string that appears at the beginning of every
22 ** SQLite database.
23 */
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25 
26 /*
27 ** Set this global variable to 1 to enable tracing using the TRACE
28 ** macro.
29 */
30 #if SQLITE_TEST
31 int sqlite3_btree_trace=0;  /* True to enable tracing */
32 #endif
33 
34 
35 
36 #ifndef SQLITE_OMIT_SHARED_CACHE
37 /*
38 ** A flag to indicate whether or not shared cache is enabled.  Also,
39 ** a list of BtShared objects that are eligible for participation
40 ** in shared cache.  The variables have file scope during normal builds,
41 ** but the test harness needs to access these variables so we make them
42 ** global for test builds.
43 */
44 #ifdef SQLITE_TEST
45 BtShared *sqlite3SharedCacheList = 0;
46 int sqlite3SharedCacheEnabled = 0;
47 #else
48 static BtShared *sqlite3SharedCacheList = 0;
49 static int sqlite3SharedCacheEnabled = 0;
50 #endif
51 #endif /* SQLITE_OMIT_SHARED_CACHE */
52 
53 #ifndef SQLITE_OMIT_SHARED_CACHE
54 /*
55 ** Enable or disable the shared pager and schema features.
56 **
57 ** This routine has no effect on existing database connections.
58 ** The shared cache setting effects only future calls to
59 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
60 */
61 int sqlite3_enable_shared_cache(int enable){
62   sqlite3SharedCacheEnabled = enable;
63   return SQLITE_OK;
64 }
65 #endif
66 
67 
68 /*
69 ** Forward declaration
70 */
71 static int checkReadLocks(Btree*,Pgno,BtCursor*);
72 
73 
74 #ifdef SQLITE_OMIT_SHARED_CACHE
75   /*
76   ** The functions queryTableLock(), lockTable() and unlockAllTables()
77   ** manipulate entries in the BtShared.pLock linked list used to store
78   ** shared-cache table level locks. If the library is compiled with the
79   ** shared-cache feature disabled, then there is only ever one user
80   ** of each BtShared structure and so this locking is not necessary.
81   ** So define the lock related functions as no-ops.
82   */
83   #define queryTableLock(a,b,c) SQLITE_OK
84   #define lockTable(a,b,c) SQLITE_OK
85   #define unlockAllTables(a)
86 #endif
87 
88 #ifndef SQLITE_OMIT_SHARED_CACHE
89 /*
90 ** Query to see if btree handle p may obtain a lock of type eLock
91 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
92 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
93 ** SQLITE_LOCKED if not.
94 */
95 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
96   BtShared *pBt = p->pBt;
97   BtLock *pIter;
98 
99   assert( sqlite3BtreeHoldsMutex(p) );
100 
101   /* This is a no-op if the shared-cache is not enabled */
102   if( !p->sharable ){
103     return SQLITE_OK;
104   }
105 
106   /* This (along with lockTable()) is where the ReadUncommitted flag is
107   ** dealt with. If the caller is querying for a read-lock and the flag is
108   ** set, it is unconditionally granted - even if there are write-locks
109   ** on the table. If a write-lock is requested, the ReadUncommitted flag
110   ** is not considered.
111   **
112   ** In function lockTable(), if a read-lock is demanded and the
113   ** ReadUncommitted flag is set, no entry is added to the locks list
114   ** (BtShared.pLock).
115   **
116   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
117   ** not create or respect table locks. The locking procedure for a
118   ** write-cursor does not change.
119   */
120   if(
121     !p->db ||
122     0==(p->db->flags&SQLITE_ReadUncommitted) ||
123     eLock==WRITE_LOCK ||
124     iTab==MASTER_ROOT
125   ){
126     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
127       if( pIter->pBtree!=p && pIter->iTable==iTab &&
128           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
129         return SQLITE_LOCKED;
130       }
131     }
132   }
133   return SQLITE_OK;
134 }
135 #endif /* !SQLITE_OMIT_SHARED_CACHE */
136 
137 #ifndef SQLITE_OMIT_SHARED_CACHE
138 /*
139 ** Add a lock on the table with root-page iTable to the shared-btree used
140 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
141 ** WRITE_LOCK.
142 **
143 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
144 ** SQLITE_NOMEM may also be returned.
145 */
146 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
147   BtShared *pBt = p->pBt;
148   BtLock *pLock = 0;
149   BtLock *pIter;
150 
151   assert( sqlite3BtreeHoldsMutex(p) );
152 
153   /* This is a no-op if the shared-cache is not enabled */
154   if( !p->sharable ){
155     return SQLITE_OK;
156   }
157 
158   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
159 
160   /* If the read-uncommitted flag is set and a read-lock is requested,
161   ** return early without adding an entry to the BtShared.pLock list. See
162   ** comment in function queryTableLock() for more info on handling
163   ** the ReadUncommitted flag.
164   */
165   if(
166     (p->db) &&
167     (p->db->flags&SQLITE_ReadUncommitted) &&
168     (eLock==READ_LOCK) &&
169     iTable!=MASTER_ROOT
170   ){
171     return SQLITE_OK;
172   }
173 
174   /* First search the list for an existing lock on this table. */
175   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
176     if( pIter->iTable==iTable && pIter->pBtree==p ){
177       pLock = pIter;
178       break;
179     }
180   }
181 
182   /* If the above search did not find a BtLock struct associating Btree p
183   ** with table iTable, allocate one and link it into the list.
184   */
185   if( !pLock ){
186     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
187     if( !pLock ){
188       return SQLITE_NOMEM;
189     }
190     pLock->iTable = iTable;
191     pLock->pBtree = p;
192     pLock->pNext = pBt->pLock;
193     pBt->pLock = pLock;
194   }
195 
196   /* Set the BtLock.eLock variable to the maximum of the current lock
197   ** and the requested lock. This means if a write-lock was already held
198   ** and a read-lock requested, we don't incorrectly downgrade the lock.
199   */
200   assert( WRITE_LOCK>READ_LOCK );
201   if( eLock>pLock->eLock ){
202     pLock->eLock = eLock;
203   }
204 
205   return SQLITE_OK;
206 }
207 #endif /* !SQLITE_OMIT_SHARED_CACHE */
208 
209 #ifndef SQLITE_OMIT_SHARED_CACHE
210 /*
211 ** Release all the table locks (locks obtained via calls to the lockTable()
212 ** procedure) held by Btree handle p.
213 */
214 static void unlockAllTables(Btree *p){
215   BtLock **ppIter = &p->pBt->pLock;
216 
217   assert( sqlite3BtreeHoldsMutex(p) );
218   assert( p->sharable || 0==*ppIter );
219 
220   while( *ppIter ){
221     BtLock *pLock = *ppIter;
222     if( pLock->pBtree==p ){
223       *ppIter = pLock->pNext;
224       sqlite3_free(pLock);
225     }else{
226       ppIter = &pLock->pNext;
227     }
228   }
229 }
230 #endif /* SQLITE_OMIT_SHARED_CACHE */
231 
232 static void releasePage(MemPage *pPage);  /* Forward reference */
233 
234 /*
235 ** Verify that the cursor holds a mutex on the BtShared
236 */
237 #ifndef NDEBUG
238 static int cursorHoldsMutex(BtCursor *p){
239   return sqlite3_mutex_held(p->pBt->mutex);
240 }
241 #endif
242 
243 
244 #ifndef SQLITE_OMIT_INCRBLOB
245 /*
246 ** Invalidate the overflow page-list cache for cursor pCur, if any.
247 */
248 static void invalidateOverflowCache(BtCursor *pCur){
249   assert( cursorHoldsMutex(pCur) );
250   sqlite3_free(pCur->aOverflow);
251   pCur->aOverflow = 0;
252 }
253 
254 /*
255 ** Invalidate the overflow page-list cache for all cursors opened
256 ** on the shared btree structure pBt.
257 */
258 static void invalidateAllOverflowCache(BtShared *pBt){
259   BtCursor *p;
260   assert( sqlite3_mutex_held(pBt->mutex) );
261   for(p=pBt->pCursor; p; p=p->pNext){
262     invalidateOverflowCache(p);
263   }
264 }
265 #else
266   #define invalidateOverflowCache(x)
267   #define invalidateAllOverflowCache(x)
268 #endif
269 
270 /*
271 ** Save the current cursor position in the variables BtCursor.nKey
272 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
273 */
274 static int saveCursorPosition(BtCursor *pCur){
275   int rc;
276 
277   assert( CURSOR_VALID==pCur->eState );
278   assert( 0==pCur->pKey );
279   assert( cursorHoldsMutex(pCur) );
280 
281   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
282 
283   /* If this is an intKey table, then the above call to BtreeKeySize()
284   ** stores the integer key in pCur->nKey. In this case this value is
285   ** all that is required. Otherwise, if pCur is not open on an intKey
286   ** table, then malloc space for and store the pCur->nKey bytes of key
287   ** data.
288   */
289   if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
290     void *pKey = sqlite3_malloc(pCur->nKey);
291     if( pKey ){
292       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
293       if( rc==SQLITE_OK ){
294         pCur->pKey = pKey;
295       }else{
296         sqlite3_free(pKey);
297       }
298     }else{
299       rc = SQLITE_NOMEM;
300     }
301   }
302   assert( !pCur->pPage->intKey || !pCur->pKey );
303 
304   if( rc==SQLITE_OK ){
305     releasePage(pCur->pPage);
306     pCur->pPage = 0;
307     pCur->eState = CURSOR_REQUIRESEEK;
308   }
309 
310   invalidateOverflowCache(pCur);
311   return rc;
312 }
313 
314 /*
315 ** Save the positions of all cursors except pExcept open on the table
316 ** with root-page iRoot. Usually, this is called just before cursor
317 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
318 */
319 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
320   BtCursor *p;
321   assert( sqlite3_mutex_held(pBt->mutex) );
322   assert( pExcept==0 || pExcept->pBt==pBt );
323   for(p=pBt->pCursor; p; p=p->pNext){
324     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
325         p->eState==CURSOR_VALID ){
326       int rc = saveCursorPosition(p);
327       if( SQLITE_OK!=rc ){
328         return rc;
329       }
330     }
331   }
332   return SQLITE_OK;
333 }
334 
335 /*
336 ** Clear the current cursor position.
337 */
338 static void clearCursorPosition(BtCursor *pCur){
339   assert( cursorHoldsMutex(pCur) );
340   sqlite3_free(pCur->pKey);
341   pCur->pKey = 0;
342   pCur->eState = CURSOR_INVALID;
343 }
344 
345 /*
346 ** Restore the cursor to the position it was in (or as close to as possible)
347 ** when saveCursorPosition() was called. Note that this call deletes the
348 ** saved position info stored by saveCursorPosition(), so there can be
349 ** at most one effective restoreOrClearCursorPosition() call after each
350 ** saveCursorPosition().
351 **
352 ** If the second argument argument - doSeek - is false, then instead of
353 ** returning the cursor to its saved position, any saved position is deleted
354 ** and the cursor state set to CURSOR_INVALID.
355 */
356 int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){
357   int rc;
358   assert( cursorHoldsMutex(pCur) );
359   assert( pCur->eState>=CURSOR_REQUIRESEEK );
360   if( pCur->eState==CURSOR_FAULT ){
361     return pCur->skip;
362   }
363 #ifndef SQLITE_OMIT_INCRBLOB
364   if( pCur->isIncrblobHandle ){
365     return SQLITE_ABORT;
366   }
367 #endif
368   pCur->eState = CURSOR_INVALID;
369   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
370   if( rc==SQLITE_OK ){
371     sqlite3_free(pCur->pKey);
372     pCur->pKey = 0;
373     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
374   }
375   return rc;
376 }
377 
378 #define restoreOrClearCursorPosition(p) \
379   (p->eState>=CURSOR_REQUIRESEEK ? \
380          sqlite3BtreeRestoreOrClearCursorPosition(p) : \
381          SQLITE_OK)
382 
383 #ifndef SQLITE_OMIT_AUTOVACUUM
384 /*
385 ** Given a page number of a regular database page, return the page
386 ** number for the pointer-map page that contains the entry for the
387 ** input page number.
388 */
389 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
390   int nPagesPerMapPage, iPtrMap, ret;
391   assert( sqlite3_mutex_held(pBt->mutex) );
392   nPagesPerMapPage = (pBt->usableSize/5)+1;
393   iPtrMap = (pgno-2)/nPagesPerMapPage;
394   ret = (iPtrMap*nPagesPerMapPage) + 2;
395   if( ret==PENDING_BYTE_PAGE(pBt) ){
396     ret++;
397   }
398   return ret;
399 }
400 
401 /*
402 ** Write an entry into the pointer map.
403 **
404 ** This routine updates the pointer map entry for page number 'key'
405 ** so that it maps to type 'eType' and parent page number 'pgno'.
406 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
407 */
408 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
409   DbPage *pDbPage;  /* The pointer map page */
410   u8 *pPtrmap;      /* The pointer map data */
411   Pgno iPtrmap;     /* The pointer map page number */
412   int offset;       /* Offset in pointer map page */
413   int rc;
414 
415   assert( sqlite3_mutex_held(pBt->mutex) );
416   /* The master-journal page number must never be used as a pointer map page */
417   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
418 
419   assert( pBt->autoVacuum );
420   if( key==0 ){
421     return SQLITE_CORRUPT_BKPT;
422   }
423   iPtrmap = PTRMAP_PAGENO(pBt, key);
424   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
425   if( rc!=SQLITE_OK ){
426     return rc;
427   }
428   offset = PTRMAP_PTROFFSET(pBt, key);
429   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
430 
431   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
432     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
433     rc = sqlite3PagerWrite(pDbPage);
434     if( rc==SQLITE_OK ){
435       pPtrmap[offset] = eType;
436       put4byte(&pPtrmap[offset+1], parent);
437     }
438   }
439 
440   sqlite3PagerUnref(pDbPage);
441   return rc;
442 }
443 
444 /*
445 ** Read an entry from the pointer map.
446 **
447 ** This routine retrieves the pointer map entry for page 'key', writing
448 ** the type and parent page number to *pEType and *pPgno respectively.
449 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
450 */
451 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
452   DbPage *pDbPage;   /* The pointer map page */
453   int iPtrmap;       /* Pointer map page index */
454   u8 *pPtrmap;       /* Pointer map page data */
455   int offset;        /* Offset of entry in pointer map */
456   int rc;
457 
458   assert( sqlite3_mutex_held(pBt->mutex) );
459 
460   iPtrmap = PTRMAP_PAGENO(pBt, key);
461   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
462   if( rc!=0 ){
463     return rc;
464   }
465   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
466 
467   offset = PTRMAP_PTROFFSET(pBt, key);
468   assert( pEType!=0 );
469   *pEType = pPtrmap[offset];
470   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
471 
472   sqlite3PagerUnref(pDbPage);
473   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
474   return SQLITE_OK;
475 }
476 
477 #endif /* SQLITE_OMIT_AUTOVACUUM */
478 
479 /*
480 ** Given a btree page and a cell index (0 means the first cell on
481 ** the page, 1 means the second cell, and so forth) return a pointer
482 ** to the cell content.
483 **
484 ** This routine works only for pages that do not contain overflow cells.
485 */
486 #define findCell(pPage, iCell) \
487   ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)]))
488 #ifdef SQLITE_TEST
489 u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){
490   assert( iCell>=0 );
491   assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) );
492   return findCell(pPage, iCell);
493 }
494 #endif
495 
496 /*
497 ** This a more complex version of sqlite3BtreeFindCell() that works for
498 ** pages that do contain overflow cells.  See insert
499 */
500 static u8 *findOverflowCell(MemPage *pPage, int iCell){
501   int i;
502   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
503   for(i=pPage->nOverflow-1; i>=0; i--){
504     int k;
505     struct _OvflCell *pOvfl;
506     pOvfl = &pPage->aOvfl[i];
507     k = pOvfl->idx;
508     if( k<=iCell ){
509       if( k==iCell ){
510         return pOvfl->pCell;
511       }
512       iCell--;
513     }
514   }
515   return findCell(pPage, iCell);
516 }
517 
518 /*
519 ** Parse a cell content block and fill in the CellInfo structure.  There
520 ** are two versions of this function.  sqlite3BtreeParseCell() takes a
521 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
522 ** takes a pointer to the body of the cell as its second argument.
523 **
524 ** Within this file, the parseCell() macro can be called instead of
525 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
526 */
527 void sqlite3BtreeParseCellPtr(
528   MemPage *pPage,         /* Page containing the cell */
529   u8 *pCell,              /* Pointer to the cell text. */
530   CellInfo *pInfo         /* Fill in this structure */
531 ){
532   int n;                  /* Number bytes in cell content header */
533   u32 nPayload;           /* Number of bytes of cell payload */
534 
535   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
536 
537   pInfo->pCell = pCell;
538   assert( pPage->leaf==0 || pPage->leaf==1 );
539   n = pPage->childPtrSize;
540   assert( n==4-4*pPage->leaf );
541   if( pPage->hasData ){
542     n += getVarint32(&pCell[n], &nPayload);
543   }else{
544     nPayload = 0;
545   }
546   pInfo->nData = nPayload;
547   if( pPage->intKey ){
548     n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
549   }else{
550     u32 x;
551     n += getVarint32(&pCell[n], &x);
552     pInfo->nKey = x;
553     nPayload += x;
554   }
555   pInfo->nPayload = nPayload;
556   pInfo->nHeader = n;
557   if( nPayload<=pPage->maxLocal ){
558     /* This is the (easy) common case where the entire payload fits
559     ** on the local page.  No overflow is required.
560     */
561     int nSize;          /* Total size of cell content in bytes */
562     pInfo->nLocal = nPayload;
563     pInfo->iOverflow = 0;
564     nSize = nPayload + n;
565     if( nSize<4 ){
566       nSize = 4;        /* Minimum cell size is 4 */
567     }
568     pInfo->nSize = nSize;
569   }else{
570     /* If the payload will not fit completely on the local page, we have
571     ** to decide how much to store locally and how much to spill onto
572     ** overflow pages.  The strategy is to minimize the amount of unused
573     ** space on overflow pages while keeping the amount of local storage
574     ** in between minLocal and maxLocal.
575     **
576     ** Warning:  changing the way overflow payload is distributed in any
577     ** way will result in an incompatible file format.
578     */
579     int minLocal;  /* Minimum amount of payload held locally */
580     int maxLocal;  /* Maximum amount of payload held locally */
581     int surplus;   /* Overflow payload available for local storage */
582 
583     minLocal = pPage->minLocal;
584     maxLocal = pPage->maxLocal;
585     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
586     if( surplus <= maxLocal ){
587       pInfo->nLocal = surplus;
588     }else{
589       pInfo->nLocal = minLocal;
590     }
591     pInfo->iOverflow = pInfo->nLocal + n;
592     pInfo->nSize = pInfo->iOverflow + 4;
593   }
594 }
595 #define parseCell(pPage, iCell, pInfo) \
596   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
597 void sqlite3BtreeParseCell(
598   MemPage *pPage,         /* Page containing the cell */
599   int iCell,              /* The cell index.  First cell is 0 */
600   CellInfo *pInfo         /* Fill in this structure */
601 ){
602   parseCell(pPage, iCell, pInfo);
603 }
604 
605 /*
606 ** Compute the total number of bytes that a Cell needs in the cell
607 ** data area of the btree-page.  The return number includes the cell
608 ** data header and the local payload, but not any overflow page or
609 ** the space used by the cell pointer.
610 */
611 #ifndef NDEBUG
612 static int cellSize(MemPage *pPage, int iCell){
613   CellInfo info;
614   sqlite3BtreeParseCell(pPage, iCell, &info);
615   return info.nSize;
616 }
617 #endif
618 static int cellSizePtr(MemPage *pPage, u8 *pCell){
619   CellInfo info;
620   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
621   return info.nSize;
622 }
623 
624 #ifndef SQLITE_OMIT_AUTOVACUUM
625 /*
626 ** If the cell pCell, part of page pPage contains a pointer
627 ** to an overflow page, insert an entry into the pointer-map
628 ** for the overflow page.
629 */
630 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
631   if( pCell ){
632     CellInfo info;
633     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
634     assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
635     if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
636       Pgno ovfl = get4byte(&pCell[info.iOverflow]);
637       return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
638     }
639   }
640   return SQLITE_OK;
641 }
642 /*
643 ** If the cell with index iCell on page pPage contains a pointer
644 ** to an overflow page, insert an entry into the pointer-map
645 ** for the overflow page.
646 */
647 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
648   u8 *pCell;
649   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
650   pCell = findOverflowCell(pPage, iCell);
651   return ptrmapPutOvflPtr(pPage, pCell);
652 }
653 #endif
654 
655 
656 /*
657 ** Defragment the page given.  All Cells are moved to the
658 ** end of the page and all free space is collected into one
659 ** big FreeBlk that occurs in between the header and cell
660 ** pointer array and the cell content area.
661 */
662 static int defragmentPage(MemPage *pPage){
663   int i;                     /* Loop counter */
664   int pc;                    /* Address of a i-th cell */
665   int addr;                  /* Offset of first byte after cell pointer array */
666   int hdr;                   /* Offset to the page header */
667   int size;                  /* Size of a cell */
668   int usableSize;            /* Number of usable bytes on a page */
669   int cellOffset;            /* Offset to the cell pointer array */
670   int brk;                   /* Offset to the cell content area */
671   int nCell;                 /* Number of cells on the page */
672   unsigned char *data;       /* The page data */
673   unsigned char *temp;       /* Temp area for cell content */
674 
675   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
676   assert( pPage->pBt!=0 );
677   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
678   assert( pPage->nOverflow==0 );
679   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
680   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
681   data = pPage->aData;
682   hdr = pPage->hdrOffset;
683   cellOffset = pPage->cellOffset;
684   nCell = pPage->nCell;
685   assert( nCell==get2byte(&data[hdr+3]) );
686   usableSize = pPage->pBt->usableSize;
687   brk = get2byte(&data[hdr+5]);
688   memcpy(&temp[brk], &data[brk], usableSize - brk);
689   brk = usableSize;
690   for(i=0; i<nCell; i++){
691     u8 *pAddr;     /* The i-th cell pointer */
692     pAddr = &data[cellOffset + i*2];
693     pc = get2byte(pAddr);
694     assert( pc<pPage->pBt->usableSize );
695     size = cellSizePtr(pPage, &temp[pc]);
696     brk -= size;
697     memcpy(&data[brk], &temp[pc], size);
698     put2byte(pAddr, brk);
699   }
700   assert( brk>=cellOffset+2*nCell );
701   put2byte(&data[hdr+5], brk);
702   data[hdr+1] = 0;
703   data[hdr+2] = 0;
704   data[hdr+7] = 0;
705   addr = cellOffset+2*nCell;
706   memset(&data[addr], 0, brk-addr);
707   return SQLITE_OK;
708 }
709 
710 /*
711 ** Allocate nByte bytes of space on a page.
712 **
713 ** Return the index into pPage->aData[] of the first byte of
714 ** the new allocation. Or return 0 if there is not enough free
715 ** space on the page to satisfy the allocation request.
716 **
717 ** If the page contains nBytes of free space but does not contain
718 ** nBytes of contiguous free space, then this routine automatically
719 ** calls defragementPage() to consolidate all free space before
720 ** allocating the new chunk.
721 */
722 static int allocateSpace(MemPage *pPage, int nByte){
723   int addr, pc, hdr;
724   int size;
725   int nFrag;
726   int top;
727   int nCell;
728   int cellOffset;
729   unsigned char *data;
730 
731   data = pPage->aData;
732   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
733   assert( pPage->pBt );
734   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
735   if( nByte<4 ) nByte = 4;
736   if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
737   pPage->nFree -= nByte;
738   hdr = pPage->hdrOffset;
739 
740   nFrag = data[hdr+7];
741   if( nFrag<60 ){
742     /* Search the freelist looking for a slot big enough to satisfy the
743     ** space request. */
744     addr = hdr+1;
745     while( (pc = get2byte(&data[addr]))>0 ){
746       size = get2byte(&data[pc+2]);
747       if( size>=nByte ){
748         if( size<nByte+4 ){
749           memcpy(&data[addr], &data[pc], 2);
750           data[hdr+7] = nFrag + size - nByte;
751           return pc;
752         }else{
753           put2byte(&data[pc+2], size-nByte);
754           return pc + size - nByte;
755         }
756       }
757       addr = pc;
758     }
759   }
760 
761   /* Allocate memory from the gap in between the cell pointer array
762   ** and the cell content area.
763   */
764   top = get2byte(&data[hdr+5]);
765   nCell = get2byte(&data[hdr+3]);
766   cellOffset = pPage->cellOffset;
767   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
768     if( defragmentPage(pPage) ) return 0;
769     top = get2byte(&data[hdr+5]);
770   }
771   top -= nByte;
772   assert( cellOffset + 2*nCell <= top );
773   put2byte(&data[hdr+5], top);
774   return top;
775 }
776 
777 /*
778 ** Return a section of the pPage->aData to the freelist.
779 ** The first byte of the new free block is pPage->aDisk[start]
780 ** and the size of the block is "size" bytes.
781 **
782 ** Most of the effort here is involved in coalesing adjacent
783 ** free blocks into a single big free block.
784 */
785 static void freeSpace(MemPage *pPage, int start, int size){
786   int addr, pbegin, hdr;
787   unsigned char *data = pPage->aData;
788 
789   assert( pPage->pBt!=0 );
790   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
791   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
792   assert( (start + size)<=pPage->pBt->usableSize );
793   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
794   if( size<4 ) size = 4;
795 
796 #ifdef SQLITE_SECURE_DELETE
797   /* Overwrite deleted information with zeros when the SECURE_DELETE
798   ** option is enabled at compile-time */
799   memset(&data[start], 0, size);
800 #endif
801 
802   /* Add the space back into the linked list of freeblocks */
803   hdr = pPage->hdrOffset;
804   addr = hdr + 1;
805   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
806     assert( pbegin<=pPage->pBt->usableSize-4 );
807     assert( pbegin>addr );
808     addr = pbegin;
809   }
810   assert( pbegin<=pPage->pBt->usableSize-4 );
811   assert( pbegin>addr || pbegin==0 );
812   put2byte(&data[addr], start);
813   put2byte(&data[start], pbegin);
814   put2byte(&data[start+2], size);
815   pPage->nFree += size;
816 
817   /* Coalesce adjacent free blocks */
818   addr = pPage->hdrOffset + 1;
819   while( (pbegin = get2byte(&data[addr]))>0 ){
820     int pnext, psize;
821     assert( pbegin>addr );
822     assert( pbegin<=pPage->pBt->usableSize-4 );
823     pnext = get2byte(&data[pbegin]);
824     psize = get2byte(&data[pbegin+2]);
825     if( pbegin + psize + 3 >= pnext && pnext>0 ){
826       int frag = pnext - (pbegin+psize);
827       assert( frag<=data[pPage->hdrOffset+7] );
828       data[pPage->hdrOffset+7] -= frag;
829       put2byte(&data[pbegin], get2byte(&data[pnext]));
830       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
831     }else{
832       addr = pbegin;
833     }
834   }
835 
836   /* If the cell content area begins with a freeblock, remove it. */
837   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
838     int top;
839     pbegin = get2byte(&data[hdr+1]);
840     memcpy(&data[hdr+1], &data[pbegin], 2);
841     top = get2byte(&data[hdr+5]);
842     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
843   }
844 }
845 
846 /*
847 ** Decode the flags byte (the first byte of the header) for a page
848 ** and initialize fields of the MemPage structure accordingly.
849 */
850 static void decodeFlags(MemPage *pPage, int flagByte){
851   BtShared *pBt;     /* A copy of pPage->pBt */
852 
853   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
854   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
855   pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
856   pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
857   pPage->leaf = (flagByte & PTF_LEAF)!=0;
858   pPage->childPtrSize = 4*(pPage->leaf==0);
859   pBt = pPage->pBt;
860   if( flagByte & PTF_LEAFDATA ){
861     pPage->leafData = 1;
862     pPage->maxLocal = pBt->maxLeaf;
863     pPage->minLocal = pBt->minLeaf;
864   }else{
865     pPage->leafData = 0;
866     pPage->maxLocal = pBt->maxLocal;
867     pPage->minLocal = pBt->minLocal;
868   }
869   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
870 }
871 
872 /*
873 ** Initialize the auxiliary information for a disk block.
874 **
875 ** The pParent parameter must be a pointer to the MemPage which
876 ** is the parent of the page being initialized.  The root of a
877 ** BTree has no parent and so for that page, pParent==NULL.
878 **
879 ** Return SQLITE_OK on success.  If we see that the page does
880 ** not contain a well-formed database page, then return
881 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
882 ** guarantee that the page is well-formed.  It only shows that
883 ** we failed to detect any corruption.
884 */
885 int sqlite3BtreeInitPage(
886   MemPage *pPage,        /* The page to be initialized */
887   MemPage *pParent       /* The parent.  Might be NULL */
888 ){
889   int pc;            /* Address of a freeblock within pPage->aData[] */
890   int hdr;           /* Offset to beginning of page header */
891   u8 *data;          /* Equal to pPage->aData */
892   BtShared *pBt;        /* The main btree structure */
893   int usableSize;    /* Amount of usable space on each page */
894   int cellOffset;    /* Offset from start of page to first cell pointer */
895   int nFree;         /* Number of unused bytes on the page */
896   int top;           /* First byte of the cell content area */
897 
898   pBt = pPage->pBt;
899   assert( pBt!=0 );
900   assert( pParent==0 || pParent->pBt==pBt );
901   assert( sqlite3_mutex_held(pBt->mutex) );
902   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
903   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
904   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
905   if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
906     /* The parent page should never change unless the file is corrupt */
907     return SQLITE_CORRUPT_BKPT;
908   }
909   if( pPage->isInit ) return SQLITE_OK;
910   if( pPage->pParent==0 && pParent!=0 ){
911     pPage->pParent = pParent;
912     sqlite3PagerRef(pParent->pDbPage);
913   }
914   hdr = pPage->hdrOffset;
915   data = pPage->aData;
916   decodeFlags(pPage, data[hdr]);
917   pPage->nOverflow = 0;
918   pPage->idxShift = 0;
919   usableSize = pBt->usableSize;
920   pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
921   top = get2byte(&data[hdr+5]);
922   pPage->nCell = get2byte(&data[hdr+3]);
923   if( pPage->nCell>MX_CELL(pBt) ){
924     /* To many cells for a single page.  The page must be corrupt */
925     return SQLITE_CORRUPT_BKPT;
926   }
927   if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
928     /* All pages must have at least one cell, except for root pages */
929     return SQLITE_CORRUPT_BKPT;
930   }
931 
932   /* Compute the total free space on the page */
933   pc = get2byte(&data[hdr+1]);
934   nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
935   while( pc>0 ){
936     int next, size;
937     if( pc>usableSize-4 ){
938       /* Free block is off the page */
939       return SQLITE_CORRUPT_BKPT;
940     }
941     next = get2byte(&data[pc]);
942     size = get2byte(&data[pc+2]);
943     if( next>0 && next<=pc+size+3 ){
944       /* Free blocks must be in accending order */
945       return SQLITE_CORRUPT_BKPT;
946     }
947     nFree += size;
948     pc = next;
949   }
950   pPage->nFree = nFree;
951   if( nFree>=usableSize ){
952     /* Free space cannot exceed total page size */
953     return SQLITE_CORRUPT_BKPT;
954   }
955 
956   pPage->isInit = 1;
957   return SQLITE_OK;
958 }
959 
960 /*
961 ** Set up a raw page so that it looks like a database page holding
962 ** no entries.
963 */
964 static void zeroPage(MemPage *pPage, int flags){
965   unsigned char *data = pPage->aData;
966   BtShared *pBt = pPage->pBt;
967   int hdr = pPage->hdrOffset;
968   int first;
969 
970   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
971   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
972   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
973   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
974   assert( sqlite3_mutex_held(pBt->mutex) );
975   memset(&data[hdr], 0, pBt->usableSize - hdr);
976   data[hdr] = flags;
977   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
978   memset(&data[hdr+1], 0, 4);
979   data[hdr+7] = 0;
980   put2byte(&data[hdr+5], pBt->usableSize);
981   pPage->nFree = pBt->usableSize - first;
982   decodeFlags(pPage, flags);
983   pPage->hdrOffset = hdr;
984   pPage->cellOffset = first;
985   pPage->nOverflow = 0;
986   pPage->idxShift = 0;
987   pPage->nCell = 0;
988   pPage->isInit = 1;
989 }
990 
991 /*
992 ** Get a page from the pager.  Initialize the MemPage.pBt and
993 ** MemPage.aData elements if needed.
994 **
995 ** If the noContent flag is set, it means that we do not care about
996 ** the content of the page at this time.  So do not go to the disk
997 ** to fetch the content.  Just fill in the content with zeros for now.
998 ** If in the future we call sqlite3PagerWrite() on this page, that
999 ** means we have started to be concerned about content and the disk
1000 ** read should occur at that point.
1001 */
1002 int sqlite3BtreeGetPage(
1003   BtShared *pBt,       /* The btree */
1004   Pgno pgno,           /* Number of the page to fetch */
1005   MemPage **ppPage,    /* Return the page in this parameter */
1006   int noContent        /* Do not load page content if true */
1007 ){
1008   int rc;
1009   MemPage *pPage;
1010   DbPage *pDbPage;
1011 
1012   assert( sqlite3_mutex_held(pBt->mutex) );
1013   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1014   if( rc ) return rc;
1015   pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
1016   pPage->aData = sqlite3PagerGetData(pDbPage);
1017   pPage->pDbPage = pDbPage;
1018   pPage->pBt = pBt;
1019   pPage->pgno = pgno;
1020   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1021   *ppPage = pPage;
1022   return SQLITE_OK;
1023 }
1024 
1025 /*
1026 ** Get a page from the pager and initialize it.  This routine
1027 ** is just a convenience wrapper around separate calls to
1028 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1029 */
1030 static int getAndInitPage(
1031   BtShared *pBt,          /* The database file */
1032   Pgno pgno,           /* Number of the page to get */
1033   MemPage **ppPage,    /* Write the page pointer here */
1034   MemPage *pParent     /* Parent of the page */
1035 ){
1036   int rc;
1037   assert( sqlite3_mutex_held(pBt->mutex) );
1038   if( pgno==0 ){
1039     return SQLITE_CORRUPT_BKPT;
1040   }
1041   rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1042   if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1043     rc = sqlite3BtreeInitPage(*ppPage, pParent);
1044   }
1045   return rc;
1046 }
1047 
1048 /*
1049 ** Release a MemPage.  This should be called once for each prior
1050 ** call to sqlite3BtreeGetPage.
1051 */
1052 static void releasePage(MemPage *pPage){
1053   if( pPage ){
1054     assert( pPage->aData );
1055     assert( pPage->pBt );
1056     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1057     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1058     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1059     sqlite3PagerUnref(pPage->pDbPage);
1060   }
1061 }
1062 
1063 /*
1064 ** This routine is called when the reference count for a page
1065 ** reaches zero.  We need to unref the pParent pointer when that
1066 ** happens.
1067 */
1068 static void pageDestructor(DbPage *pData, int pageSize){
1069   MemPage *pPage;
1070   assert( (pageSize & 7)==0 );
1071   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1072   assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) );
1073   if( pPage->pParent ){
1074     MemPage *pParent = pPage->pParent;
1075     assert( pParent->pBt==pPage->pBt );
1076     pPage->pParent = 0;
1077     releasePage(pParent);
1078   }
1079   pPage->isInit = 0;
1080 }
1081 
1082 /*
1083 ** During a rollback, when the pager reloads information into the cache
1084 ** so that the cache is restored to its original state at the start of
1085 ** the transaction, for each page restored this routine is called.
1086 **
1087 ** This routine needs to reset the extra data section at the end of the
1088 ** page to agree with the restored data.
1089 */
1090 static void pageReinit(DbPage *pData, int pageSize){
1091   MemPage *pPage;
1092   assert( (pageSize & 7)==0 );
1093   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1094   if( pPage->isInit ){
1095     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1096     pPage->isInit = 0;
1097     sqlite3BtreeInitPage(pPage, pPage->pParent);
1098   }
1099 }
1100 
1101 /*
1102 ** Invoke the busy handler for a btree.
1103 */
1104 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1105   BtShared *pBt = (BtShared*)pArg;
1106   assert( pBt->db );
1107   assert( sqlite3_mutex_held(pBt->db->mutex) );
1108   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1109 }
1110 
1111 /*
1112 ** Open a database file.
1113 **
1114 ** zFilename is the name of the database file.  If zFilename is NULL
1115 ** a new database with a random name is created.  This randomly named
1116 ** database file will be deleted when sqlite3BtreeClose() is called.
1117 ** If zFilename is ":memory:" then an in-memory database is created
1118 ** that is automatically destroyed when it is closed.
1119 */
1120 int sqlite3BtreeOpen(
1121   const char *zFilename,  /* Name of the file containing the BTree database */
1122   sqlite3 *db,            /* Associated database handle */
1123   Btree **ppBtree,        /* Pointer to new Btree object written here */
1124   int flags,              /* Options */
1125   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1126 ){
1127   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
1128   BtShared *pBt = 0;      /* Shared part of btree structure */
1129   Btree *p;               /* Handle to return */
1130   int rc = SQLITE_OK;
1131   int nReserve;
1132   unsigned char zDbHeader[100];
1133 
1134   /* Set the variable isMemdb to true for an in-memory database, or
1135   ** false for a file-based database. This symbol is only required if
1136   ** either of the shared-data or autovacuum features are compiled
1137   ** into the library.
1138   */
1139 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1140   #ifdef SQLITE_OMIT_MEMORYDB
1141     const int isMemdb = 0;
1142   #else
1143     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1144   #endif
1145 #endif
1146 
1147   assert( db!=0 );
1148   assert( sqlite3_mutex_held(db->mutex) );
1149 
1150   pVfs = db->pVfs;
1151   p = sqlite3MallocZero(sizeof(Btree));
1152   if( !p ){
1153     return SQLITE_NOMEM;
1154   }
1155   p->inTrans = TRANS_NONE;
1156   p->db = db;
1157 
1158 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1159   /*
1160   ** If this Btree is a candidate for shared cache, try to find an
1161   ** existing BtShared object that we can share with
1162   */
1163   if( (flags & BTREE_PRIVATE)==0
1164    && isMemdb==0
1165    && (db->flags & SQLITE_Vtab)==0
1166    && zFilename && zFilename[0]
1167   ){
1168     if( sqlite3SharedCacheEnabled ){
1169       int nFullPathname = pVfs->mxPathname+1;
1170       char *zFullPathname = (char *)sqlite3_malloc(nFullPathname);
1171       sqlite3_mutex *mutexShared;
1172       p->sharable = 1;
1173       if( db ){
1174         db->flags |= SQLITE_SharedCache;
1175       }
1176       if( !zFullPathname ){
1177         sqlite3_free(p);
1178         return SQLITE_NOMEM;
1179       }
1180       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1181       mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1182       sqlite3_mutex_enter(mutexShared);
1183       for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){
1184         assert( pBt->nRef>0 );
1185         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1186                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1187           p->pBt = pBt;
1188           pBt->nRef++;
1189           break;
1190         }
1191       }
1192       sqlite3_mutex_leave(mutexShared);
1193       sqlite3_free(zFullPathname);
1194     }
1195 #ifdef SQLITE_DEBUG
1196     else{
1197       /* In debug mode, we mark all persistent databases as sharable
1198       ** even when they are not.  This exercises the locking code and
1199       ** gives more opportunity for asserts(sqlite3_mutex_held())
1200       ** statements to find locking problems.
1201       */
1202       p->sharable = 1;
1203     }
1204 #endif
1205   }
1206 #endif
1207   if( pBt==0 ){
1208     /*
1209     ** The following asserts make sure that structures used by the btree are
1210     ** the right size.  This is to guard against size changes that result
1211     ** when compiling on a different architecture.
1212     */
1213     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1214     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1215     assert( sizeof(u32)==4 );
1216     assert( sizeof(u16)==2 );
1217     assert( sizeof(Pgno)==4 );
1218 
1219     pBt = sqlite3MallocZero( sizeof(*pBt) );
1220     if( pBt==0 ){
1221       rc = SQLITE_NOMEM;
1222       goto btree_open_out;
1223     }
1224     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1225     pBt->busyHdr.pArg = pBt;
1226     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1227                           EXTRA_SIZE, flags, vfsFlags);
1228     if( rc==SQLITE_OK ){
1229       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1230     }
1231     if( rc!=SQLITE_OK ){
1232       goto btree_open_out;
1233     }
1234     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1235     p->pBt = pBt;
1236 
1237     sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1238     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1239     pBt->pCursor = 0;
1240     pBt->pPage1 = 0;
1241     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1242     pBt->pageSize = get2byte(&zDbHeader[16]);
1243     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1244          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1245       pBt->pageSize = 0;
1246       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1247       pBt->maxEmbedFrac = 64;   /* 25% */
1248       pBt->minEmbedFrac = 32;   /* 12.5% */
1249       pBt->minLeafFrac = 32;    /* 12.5% */
1250 #ifndef SQLITE_OMIT_AUTOVACUUM
1251       /* If the magic name ":memory:" will create an in-memory database, then
1252       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1253       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1254       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1255       ** regular file-name. In this case the auto-vacuum applies as per normal.
1256       */
1257       if( zFilename && !isMemdb ){
1258         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1259         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1260       }
1261 #endif
1262       nReserve = 0;
1263     }else{
1264       nReserve = zDbHeader[20];
1265       pBt->maxEmbedFrac = zDbHeader[21];
1266       pBt->minEmbedFrac = zDbHeader[22];
1267       pBt->minLeafFrac = zDbHeader[23];
1268       pBt->pageSizeFixed = 1;
1269 #ifndef SQLITE_OMIT_AUTOVACUUM
1270       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1271       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1272 #endif
1273     }
1274     pBt->usableSize = pBt->pageSize - nReserve;
1275     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1276     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1277 
1278 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1279     /* Add the new BtShared object to the linked list sharable BtShareds.
1280     */
1281     if( p->sharable ){
1282       sqlite3_mutex *mutexShared;
1283       pBt->nRef = 1;
1284       mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1285       if( SQLITE_THREADSAFE ){
1286         pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
1287         if( pBt->mutex==0 ){
1288           rc = SQLITE_NOMEM;
1289           db->mallocFailed = 0;
1290           goto btree_open_out;
1291         }
1292       }
1293       sqlite3_mutex_enter(mutexShared);
1294       pBt->pNext = sqlite3SharedCacheList;
1295       sqlite3SharedCacheList = pBt;
1296       sqlite3_mutex_leave(mutexShared);
1297     }
1298 #endif
1299   }
1300 
1301 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1302   /* If the new Btree uses a sharable pBtShared, then link the new
1303   ** Btree into the list of all sharable Btrees for the same connection.
1304   ** The list is kept in ascending order by pBt address.
1305   */
1306   if( p->sharable ){
1307     int i;
1308     Btree *pSib;
1309     for(i=0; i<db->nDb; i++){
1310       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1311         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1312         if( p->pBt<pSib->pBt ){
1313           p->pNext = pSib;
1314           p->pPrev = 0;
1315           pSib->pPrev = p;
1316         }else{
1317           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1318             pSib = pSib->pNext;
1319           }
1320           p->pNext = pSib->pNext;
1321           p->pPrev = pSib;
1322           if( p->pNext ){
1323             p->pNext->pPrev = p;
1324           }
1325           pSib->pNext = p;
1326         }
1327         break;
1328       }
1329     }
1330   }
1331 #endif
1332   *ppBtree = p;
1333 
1334 btree_open_out:
1335   if( rc!=SQLITE_OK ){
1336     if( pBt && pBt->pPager ){
1337       sqlite3PagerClose(pBt->pPager);
1338     }
1339     sqlite3_free(pBt);
1340     sqlite3_free(p);
1341     *ppBtree = 0;
1342   }
1343   return rc;
1344 }
1345 
1346 /*
1347 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1348 ** remove the BtShared structure from the sharing list.  Return
1349 ** true if the BtShared.nRef counter reaches zero and return
1350 ** false if it is still positive.
1351 */
1352 static int removeFromSharingList(BtShared *pBt){
1353 #ifndef SQLITE_OMIT_SHARED_CACHE
1354   sqlite3_mutex *pMaster;
1355   BtShared *pList;
1356   int removed = 0;
1357 
1358   assert( sqlite3_mutex_notheld(pBt->mutex) );
1359   pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1360   sqlite3_mutex_enter(pMaster);
1361   pBt->nRef--;
1362   if( pBt->nRef<=0 ){
1363     if( sqlite3SharedCacheList==pBt ){
1364       sqlite3SharedCacheList = pBt->pNext;
1365     }else{
1366       pList = sqlite3SharedCacheList;
1367       while( pList && pList->pNext!=pBt ){
1368         pList=pList->pNext;
1369       }
1370       if( pList ){
1371         pList->pNext = pBt->pNext;
1372       }
1373     }
1374     if( SQLITE_THREADSAFE ){
1375       sqlite3_mutex_free(pBt->mutex);
1376     }
1377     removed = 1;
1378   }
1379   sqlite3_mutex_leave(pMaster);
1380   return removed;
1381 #else
1382   return 1;
1383 #endif
1384 }
1385 
1386 /*
1387 ** Close an open database and invalidate all cursors.
1388 */
1389 int sqlite3BtreeClose(Btree *p){
1390   BtShared *pBt = p->pBt;
1391   BtCursor *pCur;
1392 
1393   /* Close all cursors opened via this handle.  */
1394   assert( sqlite3_mutex_held(p->db->mutex) );
1395   sqlite3BtreeEnter(p);
1396   pBt->db = p->db;
1397   pCur = pBt->pCursor;
1398   while( pCur ){
1399     BtCursor *pTmp = pCur;
1400     pCur = pCur->pNext;
1401     if( pTmp->pBtree==p ){
1402       sqlite3BtreeCloseCursor(pTmp);
1403     }
1404   }
1405 
1406   /* Rollback any active transaction and free the handle structure.
1407   ** The call to sqlite3BtreeRollback() drops any table-locks held by
1408   ** this handle.
1409   */
1410   sqlite3BtreeRollback(p);
1411   sqlite3BtreeLeave(p);
1412 
1413   /* If there are still other outstanding references to the shared-btree
1414   ** structure, return now. The remainder of this procedure cleans
1415   ** up the shared-btree.
1416   */
1417   assert( p->wantToLock==0 && p->locked==0 );
1418   if( !p->sharable || removeFromSharingList(pBt) ){
1419     /* The pBt is no longer on the sharing list, so we can access
1420     ** it without having to hold the mutex.
1421     **
1422     ** Clean out and delete the BtShared object.
1423     */
1424     assert( !pBt->pCursor );
1425     sqlite3PagerClose(pBt->pPager);
1426     if( pBt->xFreeSchema && pBt->pSchema ){
1427       pBt->xFreeSchema(pBt->pSchema);
1428     }
1429     sqlite3_free(pBt->pSchema);
1430     sqlite3_free(pBt);
1431   }
1432 
1433 #ifndef SQLITE_OMIT_SHARED_CACHE
1434   assert( p->wantToLock==0 );
1435   assert( p->locked==0 );
1436   if( p->pPrev ) p->pPrev->pNext = p->pNext;
1437   if( p->pNext ) p->pNext->pPrev = p->pPrev;
1438 #endif
1439 
1440   sqlite3_free(p);
1441   return SQLITE_OK;
1442 }
1443 
1444 /*
1445 ** Change the limit on the number of pages allowed in the cache.
1446 **
1447 ** The maximum number of cache pages is set to the absolute
1448 ** value of mxPage.  If mxPage is negative, the pager will
1449 ** operate asynchronously - it will not stop to do fsync()s
1450 ** to insure data is written to the disk surface before
1451 ** continuing.  Transactions still work if synchronous is off,
1452 ** and the database cannot be corrupted if this program
1453 ** crashes.  But if the operating system crashes or there is
1454 ** an abrupt power failure when synchronous is off, the database
1455 ** could be left in an inconsistent and unrecoverable state.
1456 ** Synchronous is on by default so database corruption is not
1457 ** normally a worry.
1458 */
1459 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1460   BtShared *pBt = p->pBt;
1461   assert( sqlite3_mutex_held(p->db->mutex) );
1462   sqlite3BtreeEnter(p);
1463   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1464   sqlite3BtreeLeave(p);
1465   return SQLITE_OK;
1466 }
1467 
1468 /*
1469 ** Change the way data is synced to disk in order to increase or decrease
1470 ** how well the database resists damage due to OS crashes and power
1471 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
1472 ** there is a high probability of damage)  Level 2 is the default.  There
1473 ** is a very low but non-zero probability of damage.  Level 3 reduces the
1474 ** probability of damage to near zero but with a write performance reduction.
1475 */
1476 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1477 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1478   BtShared *pBt = p->pBt;
1479   assert( sqlite3_mutex_held(p->db->mutex) );
1480   sqlite3BtreeEnter(p);
1481   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1482   sqlite3BtreeLeave(p);
1483   return SQLITE_OK;
1484 }
1485 #endif
1486 
1487 /*
1488 ** Return TRUE if the given btree is set to safety level 1.  In other
1489 ** words, return TRUE if no sync() occurs on the disk files.
1490 */
1491 int sqlite3BtreeSyncDisabled(Btree *p){
1492   BtShared *pBt = p->pBt;
1493   int rc;
1494   assert( sqlite3_mutex_held(p->db->mutex) );
1495   sqlite3BtreeEnter(p);
1496   assert( pBt && pBt->pPager );
1497   rc = sqlite3PagerNosync(pBt->pPager);
1498   sqlite3BtreeLeave(p);
1499   return rc;
1500 }
1501 
1502 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1503 /*
1504 ** Change the default pages size and the number of reserved bytes per page.
1505 **
1506 ** The page size must be a power of 2 between 512 and 65536.  If the page
1507 ** size supplied does not meet this constraint then the page size is not
1508 ** changed.
1509 **
1510 ** Page sizes are constrained to be a power of two so that the region
1511 ** of the database file used for locking (beginning at PENDING_BYTE,
1512 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1513 ** at the beginning of a page.
1514 **
1515 ** If parameter nReserve is less than zero, then the number of reserved
1516 ** bytes per page is left unchanged.
1517 */
1518 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1519   int rc = SQLITE_OK;
1520   BtShared *pBt = p->pBt;
1521   sqlite3BtreeEnter(p);
1522   if( pBt->pageSizeFixed ){
1523     sqlite3BtreeLeave(p);
1524     return SQLITE_READONLY;
1525   }
1526   if( nReserve<0 ){
1527     nReserve = pBt->pageSize - pBt->usableSize;
1528   }
1529   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1530         ((pageSize-1)&pageSize)==0 ){
1531     assert( (pageSize & 7)==0 );
1532     assert( !pBt->pPage1 && !pBt->pCursor );
1533     pBt->pageSize = pageSize;
1534     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1535   }
1536   pBt->usableSize = pBt->pageSize - nReserve;
1537   sqlite3BtreeLeave(p);
1538   return rc;
1539 }
1540 
1541 /*
1542 ** Return the currently defined page size
1543 */
1544 int sqlite3BtreeGetPageSize(Btree *p){
1545   return p->pBt->pageSize;
1546 }
1547 int sqlite3BtreeGetReserve(Btree *p){
1548   int n;
1549   sqlite3BtreeEnter(p);
1550   n = p->pBt->pageSize - p->pBt->usableSize;
1551   sqlite3BtreeLeave(p);
1552   return n;
1553 }
1554 
1555 /*
1556 ** Set the maximum page count for a database if mxPage is positive.
1557 ** No changes are made if mxPage is 0 or negative.
1558 ** Regardless of the value of mxPage, return the maximum page count.
1559 */
1560 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1561   int n;
1562   sqlite3BtreeEnter(p);
1563   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1564   sqlite3BtreeLeave(p);
1565   return n;
1566 }
1567 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1568 
1569 /*
1570 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1571 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1572 ** is disabled. The default value for the auto-vacuum property is
1573 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1574 */
1575 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1576 #ifdef SQLITE_OMIT_AUTOVACUUM
1577   return SQLITE_READONLY;
1578 #else
1579   BtShared *pBt = p->pBt;
1580   int rc = SQLITE_OK;
1581   int av = (autoVacuum?1:0);
1582 
1583   sqlite3BtreeEnter(p);
1584   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1585     rc = SQLITE_READONLY;
1586   }else{
1587     pBt->autoVacuum = av;
1588   }
1589   sqlite3BtreeLeave(p);
1590   return rc;
1591 #endif
1592 }
1593 
1594 /*
1595 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1596 ** enabled 1 is returned. Otherwise 0.
1597 */
1598 int sqlite3BtreeGetAutoVacuum(Btree *p){
1599 #ifdef SQLITE_OMIT_AUTOVACUUM
1600   return BTREE_AUTOVACUUM_NONE;
1601 #else
1602   int rc;
1603   sqlite3BtreeEnter(p);
1604   rc = (
1605     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1606     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1607     BTREE_AUTOVACUUM_INCR
1608   );
1609   sqlite3BtreeLeave(p);
1610   return rc;
1611 #endif
1612 }
1613 
1614 
1615 /*
1616 ** Get a reference to pPage1 of the database file.  This will
1617 ** also acquire a readlock on that file.
1618 **
1619 ** SQLITE_OK is returned on success.  If the file is not a
1620 ** well-formed database file, then SQLITE_CORRUPT is returned.
1621 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
1622 ** is returned if we run out of memory.
1623 */
1624 static int lockBtree(BtShared *pBt){
1625   int rc, pageSize;
1626   MemPage *pPage1;
1627 
1628   assert( sqlite3_mutex_held(pBt->mutex) );
1629   if( pBt->pPage1 ) return SQLITE_OK;
1630   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1631   if( rc!=SQLITE_OK ) return rc;
1632 
1633 
1634   /* Do some checking to help insure the file we opened really is
1635   ** a valid database file.
1636   */
1637   rc = SQLITE_NOTADB;
1638   if( sqlite3PagerPagecount(pBt->pPager)>0 ){
1639     u8 *page1 = pPage1->aData;
1640     if( memcmp(page1, zMagicHeader, 16)!=0 ){
1641       goto page1_init_failed;
1642     }
1643     if( page1[18]>1 ){
1644       pBt->readOnly = 1;
1645     }
1646     if( page1[19]>1 ){
1647       goto page1_init_failed;
1648     }
1649     pageSize = get2byte(&page1[16]);
1650     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1651         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1652     ){
1653       goto page1_init_failed;
1654     }
1655     assert( (pageSize & 7)==0 );
1656     pBt->pageSize = pageSize;
1657     pBt->usableSize = pageSize - page1[20];
1658     if( pBt->usableSize<500 ){
1659       goto page1_init_failed;
1660     }
1661     pBt->maxEmbedFrac = page1[21];
1662     pBt->minEmbedFrac = page1[22];
1663     pBt->minLeafFrac = page1[23];
1664 #ifndef SQLITE_OMIT_AUTOVACUUM
1665     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1666     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1667 #endif
1668   }
1669 
1670   /* maxLocal is the maximum amount of payload to store locally for
1671   ** a cell.  Make sure it is small enough so that at least minFanout
1672   ** cells can will fit on one page.  We assume a 10-byte page header.
1673   ** Besides the payload, the cell must store:
1674   **     2-byte pointer to the cell
1675   **     4-byte child pointer
1676   **     9-byte nKey value
1677   **     4-byte nData value
1678   **     4-byte overflow page pointer
1679   ** So a cell consists of a 2-byte poiner, a header which is as much as
1680   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1681   ** page pointer.
1682   */
1683   pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1684   pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1685   pBt->maxLeaf = pBt->usableSize - 35;
1686   pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1687   if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1688     goto page1_init_failed;
1689   }
1690   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1691   pBt->pPage1 = pPage1;
1692   return SQLITE_OK;
1693 
1694 page1_init_failed:
1695   releasePage(pPage1);
1696   pBt->pPage1 = 0;
1697   return rc;
1698 }
1699 
1700 /*
1701 ** This routine works like lockBtree() except that it also invokes the
1702 ** busy callback if there is lock contention.
1703 */
1704 static int lockBtreeWithRetry(Btree *pRef){
1705   int rc = SQLITE_OK;
1706 
1707   assert( sqlite3BtreeHoldsMutex(pRef) );
1708   if( pRef->inTrans==TRANS_NONE ){
1709     u8 inTransaction = pRef->pBt->inTransaction;
1710     btreeIntegrity(pRef);
1711     rc = sqlite3BtreeBeginTrans(pRef, 0);
1712     pRef->pBt->inTransaction = inTransaction;
1713     pRef->inTrans = TRANS_NONE;
1714     if( rc==SQLITE_OK ){
1715       pRef->pBt->nTransaction--;
1716     }
1717     btreeIntegrity(pRef);
1718   }
1719   return rc;
1720 }
1721 
1722 
1723 /*
1724 ** If there are no outstanding cursors and we are not in the middle
1725 ** of a transaction but there is a read lock on the database, then
1726 ** this routine unrefs the first page of the database file which
1727 ** has the effect of releasing the read lock.
1728 **
1729 ** If there are any outstanding cursors, this routine is a no-op.
1730 **
1731 ** If there is a transaction in progress, this routine is a no-op.
1732 */
1733 static void unlockBtreeIfUnused(BtShared *pBt){
1734   assert( sqlite3_mutex_held(pBt->mutex) );
1735   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1736     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1737       if( pBt->pPage1->aData==0 ){
1738         MemPage *pPage = pBt->pPage1;
1739         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1740         pPage->pBt = pBt;
1741         pPage->pgno = 1;
1742       }
1743       releasePage(pBt->pPage1);
1744     }
1745     pBt->pPage1 = 0;
1746     pBt->inStmt = 0;
1747   }
1748 }
1749 
1750 /*
1751 ** Create a new database by initializing the first page of the
1752 ** file.
1753 */
1754 static int newDatabase(BtShared *pBt){
1755   MemPage *pP1;
1756   unsigned char *data;
1757   int rc;
1758 
1759   assert( sqlite3_mutex_held(pBt->mutex) );
1760   if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK;
1761   pP1 = pBt->pPage1;
1762   assert( pP1!=0 );
1763   data = pP1->aData;
1764   rc = sqlite3PagerWrite(pP1->pDbPage);
1765   if( rc ) return rc;
1766   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1767   assert( sizeof(zMagicHeader)==16 );
1768   put2byte(&data[16], pBt->pageSize);
1769   data[18] = 1;
1770   data[19] = 1;
1771   data[20] = pBt->pageSize - pBt->usableSize;
1772   data[21] = pBt->maxEmbedFrac;
1773   data[22] = pBt->minEmbedFrac;
1774   data[23] = pBt->minLeafFrac;
1775   memset(&data[24], 0, 100-24);
1776   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1777   pBt->pageSizeFixed = 1;
1778 #ifndef SQLITE_OMIT_AUTOVACUUM
1779   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1780   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1781   put4byte(&data[36 + 4*4], pBt->autoVacuum);
1782   put4byte(&data[36 + 7*4], pBt->incrVacuum);
1783 #endif
1784   return SQLITE_OK;
1785 }
1786 
1787 /*
1788 ** Attempt to start a new transaction. A write-transaction
1789 ** is started if the second argument is nonzero, otherwise a read-
1790 ** transaction.  If the second argument is 2 or more and exclusive
1791 ** transaction is started, meaning that no other process is allowed
1792 ** to access the database.  A preexisting transaction may not be
1793 ** upgraded to exclusive by calling this routine a second time - the
1794 ** exclusivity flag only works for a new transaction.
1795 **
1796 ** A write-transaction must be started before attempting any
1797 ** changes to the database.  None of the following routines
1798 ** will work unless a transaction is started first:
1799 **
1800 **      sqlite3BtreeCreateTable()
1801 **      sqlite3BtreeCreateIndex()
1802 **      sqlite3BtreeClearTable()
1803 **      sqlite3BtreeDropTable()
1804 **      sqlite3BtreeInsert()
1805 **      sqlite3BtreeDelete()
1806 **      sqlite3BtreeUpdateMeta()
1807 **
1808 ** If an initial attempt to acquire the lock fails because of lock contention
1809 ** and the database was previously unlocked, then invoke the busy handler
1810 ** if there is one.  But if there was previously a read-lock, do not
1811 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
1812 ** returned when there is already a read-lock in order to avoid a deadlock.
1813 **
1814 ** Suppose there are two processes A and B.  A has a read lock and B has
1815 ** a reserved lock.  B tries to promote to exclusive but is blocked because
1816 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
1817 ** One or the other of the two processes must give way or there can be
1818 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
1819 ** when A already has a read lock, we encourage A to give up and let B
1820 ** proceed.
1821 */
1822 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1823   BtShared *pBt = p->pBt;
1824   int rc = SQLITE_OK;
1825 
1826   sqlite3BtreeEnter(p);
1827   pBt->db = p->db;
1828   btreeIntegrity(p);
1829 
1830   /* If the btree is already in a write-transaction, or it
1831   ** is already in a read-transaction and a read-transaction
1832   ** is requested, this is a no-op.
1833   */
1834   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1835     goto trans_begun;
1836   }
1837 
1838   /* Write transactions are not possible on a read-only database */
1839   if( pBt->readOnly && wrflag ){
1840     rc = SQLITE_READONLY;
1841     goto trans_begun;
1842   }
1843 
1844   /* If another database handle has already opened a write transaction
1845   ** on this shared-btree structure and a second write transaction is
1846   ** requested, return SQLITE_BUSY.
1847   */
1848   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1849     rc = SQLITE_BUSY;
1850     goto trans_begun;
1851   }
1852 
1853   do {
1854     if( pBt->pPage1==0 ){
1855       rc = lockBtree(pBt);
1856     }
1857 
1858     if( rc==SQLITE_OK && wrflag ){
1859       if( pBt->readOnly ){
1860         rc = SQLITE_READONLY;
1861       }else{
1862         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1863         if( rc==SQLITE_OK ){
1864           rc = newDatabase(pBt);
1865         }
1866       }
1867     }
1868 
1869     if( rc==SQLITE_OK ){
1870       if( wrflag ) pBt->inStmt = 0;
1871     }else{
1872       unlockBtreeIfUnused(pBt);
1873     }
1874   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1875           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
1876 
1877   if( rc==SQLITE_OK ){
1878     if( p->inTrans==TRANS_NONE ){
1879       pBt->nTransaction++;
1880     }
1881     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1882     if( p->inTrans>pBt->inTransaction ){
1883       pBt->inTransaction = p->inTrans;
1884     }
1885   }
1886 
1887 
1888 trans_begun:
1889   btreeIntegrity(p);
1890   sqlite3BtreeLeave(p);
1891   return rc;
1892 }
1893 
1894 #ifndef SQLITE_OMIT_AUTOVACUUM
1895 
1896 /*
1897 ** Set the pointer-map entries for all children of page pPage. Also, if
1898 ** pPage contains cells that point to overflow pages, set the pointer
1899 ** map entries for the overflow pages as well.
1900 */
1901 static int setChildPtrmaps(MemPage *pPage){
1902   int i;                             /* Counter variable */
1903   int nCell;                         /* Number of cells in page pPage */
1904   int rc;                            /* Return code */
1905   BtShared *pBt = pPage->pBt;
1906   int isInitOrig = pPage->isInit;
1907   Pgno pgno = pPage->pgno;
1908 
1909   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1910   rc = sqlite3BtreeInitPage(pPage, pPage->pParent);
1911   if( rc!=SQLITE_OK ){
1912     goto set_child_ptrmaps_out;
1913   }
1914   nCell = pPage->nCell;
1915 
1916   for(i=0; i<nCell; i++){
1917     u8 *pCell = findCell(pPage, i);
1918 
1919     rc = ptrmapPutOvflPtr(pPage, pCell);
1920     if( rc!=SQLITE_OK ){
1921       goto set_child_ptrmaps_out;
1922     }
1923 
1924     if( !pPage->leaf ){
1925       Pgno childPgno = get4byte(pCell);
1926       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1927       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1928     }
1929   }
1930 
1931   if( !pPage->leaf ){
1932     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1933     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1934   }
1935 
1936 set_child_ptrmaps_out:
1937   pPage->isInit = isInitOrig;
1938   return rc;
1939 }
1940 
1941 /*
1942 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1943 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
1944 ** iTo. Parameter eType describes the type of pointer to be modified, as
1945 ** follows:
1946 **
1947 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
1948 **                   page of pPage.
1949 **
1950 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1951 **                   page pointed to by one of the cells on pPage.
1952 **
1953 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1954 **                   overflow page in the list.
1955 */
1956 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1957   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1958   if( eType==PTRMAP_OVERFLOW2 ){
1959     /* The pointer is always the first 4 bytes of the page in this case.  */
1960     if( get4byte(pPage->aData)!=iFrom ){
1961       return SQLITE_CORRUPT_BKPT;
1962     }
1963     put4byte(pPage->aData, iTo);
1964   }else{
1965     int isInitOrig = pPage->isInit;
1966     int i;
1967     int nCell;
1968 
1969     sqlite3BtreeInitPage(pPage, 0);
1970     nCell = pPage->nCell;
1971 
1972     for(i=0; i<nCell; i++){
1973       u8 *pCell = findCell(pPage, i);
1974       if( eType==PTRMAP_OVERFLOW1 ){
1975         CellInfo info;
1976         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1977         if( info.iOverflow ){
1978           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1979             put4byte(&pCell[info.iOverflow], iTo);
1980             break;
1981           }
1982         }
1983       }else{
1984         if( get4byte(pCell)==iFrom ){
1985           put4byte(pCell, iTo);
1986           break;
1987         }
1988       }
1989     }
1990 
1991     if( i==nCell ){
1992       if( eType!=PTRMAP_BTREE ||
1993           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1994         return SQLITE_CORRUPT_BKPT;
1995       }
1996       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1997     }
1998 
1999     pPage->isInit = isInitOrig;
2000   }
2001   return SQLITE_OK;
2002 }
2003 
2004 
2005 /*
2006 ** Move the open database page pDbPage to location iFreePage in the
2007 ** database. The pDbPage reference remains valid.
2008 */
2009 static int relocatePage(
2010   BtShared *pBt,           /* Btree */
2011   MemPage *pDbPage,        /* Open page to move */
2012   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2013   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2014   Pgno iFreePage           /* The location to move pDbPage to */
2015 ){
2016   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2017   Pgno iDbPage = pDbPage->pgno;
2018   Pager *pPager = pBt->pPager;
2019   int rc;
2020 
2021   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2022       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2023   assert( sqlite3_mutex_held(pBt->mutex) );
2024   assert( pDbPage->pBt==pBt );
2025 
2026   /* Move page iDbPage from its current location to page number iFreePage */
2027   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2028       iDbPage, iFreePage, iPtrPage, eType));
2029   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage);
2030   if( rc!=SQLITE_OK ){
2031     return rc;
2032   }
2033   pDbPage->pgno = iFreePage;
2034 
2035   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2036   ** that point to overflow pages. The pointer map entries for all these
2037   ** pages need to be changed.
2038   **
2039   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2040   ** pointer to a subsequent overflow page. If this is the case, then
2041   ** the pointer map needs to be updated for the subsequent overflow page.
2042   */
2043   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2044     rc = setChildPtrmaps(pDbPage);
2045     if( rc!=SQLITE_OK ){
2046       return rc;
2047     }
2048   }else{
2049     Pgno nextOvfl = get4byte(pDbPage->aData);
2050     if( nextOvfl!=0 ){
2051       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2052       if( rc!=SQLITE_OK ){
2053         return rc;
2054       }
2055     }
2056   }
2057 
2058   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2059   ** that it points at iFreePage. Also fix the pointer map entry for
2060   ** iPtrPage.
2061   */
2062   if( eType!=PTRMAP_ROOTPAGE ){
2063     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2064     if( rc!=SQLITE_OK ){
2065       return rc;
2066     }
2067     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2068     if( rc!=SQLITE_OK ){
2069       releasePage(pPtrPage);
2070       return rc;
2071     }
2072     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2073     releasePage(pPtrPage);
2074     if( rc==SQLITE_OK ){
2075       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2076     }
2077   }
2078   return rc;
2079 }
2080 
2081 /* Forward declaration required by incrVacuumStep(). */
2082 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2083 
2084 /*
2085 ** Perform a single step of an incremental-vacuum. If successful,
2086 ** return SQLITE_OK. If there is no work to do (and therefore no
2087 ** point in calling this function again), return SQLITE_DONE.
2088 **
2089 ** More specificly, this function attempts to re-organize the
2090 ** database so that the last page of the file currently in use
2091 ** is no longer in use.
2092 **
2093 ** If the nFin parameter is non-zero, the implementation assumes
2094 ** that the caller will keep calling incrVacuumStep() until
2095 ** it returns SQLITE_DONE or an error, and that nFin is the
2096 ** number of pages the database file will contain after this
2097 ** process is complete.
2098 */
2099 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2100   Pgno iLastPg;             /* Last page in the database */
2101   Pgno nFreeList;           /* Number of pages still on the free-list */
2102 
2103   assert( sqlite3_mutex_held(pBt->mutex) );
2104   iLastPg = pBt->nTrunc;
2105   if( iLastPg==0 ){
2106     iLastPg = sqlite3PagerPagecount(pBt->pPager);
2107   }
2108 
2109   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2110     int rc;
2111     u8 eType;
2112     Pgno iPtrPage;
2113 
2114     nFreeList = get4byte(&pBt->pPage1->aData[36]);
2115     if( nFreeList==0 || nFin==iLastPg ){
2116       return SQLITE_DONE;
2117     }
2118 
2119     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2120     if( rc!=SQLITE_OK ){
2121       return rc;
2122     }
2123     if( eType==PTRMAP_ROOTPAGE ){
2124       return SQLITE_CORRUPT_BKPT;
2125     }
2126 
2127     if( eType==PTRMAP_FREEPAGE ){
2128       if( nFin==0 ){
2129         /* Remove the page from the files free-list. This is not required
2130         ** if nFin is non-zero. In that case, the free-list will be
2131         ** truncated to zero after this function returns, so it doesn't
2132         ** matter if it still contains some garbage entries.
2133         */
2134         Pgno iFreePg;
2135         MemPage *pFreePg;
2136         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2137         if( rc!=SQLITE_OK ){
2138           return rc;
2139         }
2140         assert( iFreePg==iLastPg );
2141         releasePage(pFreePg);
2142       }
2143     } else {
2144       Pgno iFreePg;             /* Index of free page to move pLastPg to */
2145       MemPage *pLastPg;
2146 
2147       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2148       if( rc!=SQLITE_OK ){
2149         return rc;
2150       }
2151 
2152       /* If nFin is zero, this loop runs exactly once and page pLastPg
2153       ** is swapped with the first free page pulled off the free list.
2154       **
2155       ** On the other hand, if nFin is greater than zero, then keep
2156       ** looping until a free-page located within the first nFin pages
2157       ** of the file is found.
2158       */
2159       do {
2160         MemPage *pFreePg;
2161         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2162         if( rc!=SQLITE_OK ){
2163           releasePage(pLastPg);
2164           return rc;
2165         }
2166         releasePage(pFreePg);
2167       }while( nFin!=0 && iFreePg>nFin );
2168       assert( iFreePg<iLastPg );
2169 
2170       rc = sqlite3PagerWrite(pLastPg->pDbPage);
2171       if( rc==SQLITE_OK ){
2172         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg);
2173       }
2174       releasePage(pLastPg);
2175       if( rc!=SQLITE_OK ){
2176         return rc;
2177       }
2178     }
2179   }
2180 
2181   pBt->nTrunc = iLastPg - 1;
2182   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2183     pBt->nTrunc--;
2184   }
2185   return SQLITE_OK;
2186 }
2187 
2188 /*
2189 ** A write-transaction must be opened before calling this function.
2190 ** It performs a single unit of work towards an incremental vacuum.
2191 **
2192 ** If the incremental vacuum is finished after this function has run,
2193 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
2194 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2195 */
2196 int sqlite3BtreeIncrVacuum(Btree *p){
2197   int rc;
2198   BtShared *pBt = p->pBt;
2199 
2200   sqlite3BtreeEnter(p);
2201   pBt->db = p->db;
2202   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2203   if( !pBt->autoVacuum ){
2204     rc = SQLITE_DONE;
2205   }else{
2206     invalidateAllOverflowCache(pBt);
2207     rc = incrVacuumStep(pBt, 0);
2208   }
2209   sqlite3BtreeLeave(p);
2210   return rc;
2211 }
2212 
2213 /*
2214 ** This routine is called prior to sqlite3PagerCommit when a transaction
2215 ** is commited for an auto-vacuum database.
2216 **
2217 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2218 ** the database file should be truncated to during the commit process.
2219 ** i.e. the database has been reorganized so that only the first *pnTrunc
2220 ** pages are in use.
2221 */
2222 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2223   int rc = SQLITE_OK;
2224   Pager *pPager = pBt->pPager;
2225 #ifndef NDEBUG
2226   int nRef = sqlite3PagerRefcount(pPager);
2227 #endif
2228 
2229   assert( sqlite3_mutex_held(pBt->mutex) );
2230   invalidateAllOverflowCache(pBt);
2231   assert(pBt->autoVacuum);
2232   if( !pBt->incrVacuum ){
2233     Pgno nFin = 0;
2234 
2235     if( pBt->nTrunc==0 ){
2236       Pgno nFree;
2237       Pgno nPtrmap;
2238       const int pgsz = pBt->pageSize;
2239       Pgno nOrig = sqlite3PagerPagecount(pBt->pPager);
2240 
2241       if( PTRMAP_ISPAGE(pBt, nOrig) ){
2242         return SQLITE_CORRUPT_BKPT;
2243       }
2244       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2245         nOrig--;
2246       }
2247       nFree = get4byte(&pBt->pPage1->aData[36]);
2248       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2249       nFin = nOrig - nFree - nPtrmap;
2250       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2251         nFin--;
2252       }
2253       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2254         nFin--;
2255       }
2256     }
2257 
2258     while( rc==SQLITE_OK ){
2259       rc = incrVacuumStep(pBt, nFin);
2260     }
2261     if( rc==SQLITE_DONE ){
2262       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2263       rc = SQLITE_OK;
2264       if( pBt->nTrunc ){
2265         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2266         put4byte(&pBt->pPage1->aData[32], 0);
2267         put4byte(&pBt->pPage1->aData[36], 0);
2268         pBt->nTrunc = nFin;
2269       }
2270     }
2271     if( rc!=SQLITE_OK ){
2272       sqlite3PagerRollback(pPager);
2273     }
2274   }
2275 
2276   if( rc==SQLITE_OK ){
2277     *pnTrunc = pBt->nTrunc;
2278     pBt->nTrunc = 0;
2279   }
2280   assert( nRef==sqlite3PagerRefcount(pPager) );
2281   return rc;
2282 }
2283 
2284 #endif
2285 
2286 /*
2287 ** This routine does the first phase of a two-phase commit.  This routine
2288 ** causes a rollback journal to be created (if it does not already exist)
2289 ** and populated with enough information so that if a power loss occurs
2290 ** the database can be restored to its original state by playing back
2291 ** the journal.  Then the contents of the journal are flushed out to
2292 ** the disk.  After the journal is safely on oxide, the changes to the
2293 ** database are written into the database file and flushed to oxide.
2294 ** At the end of this call, the rollback journal still exists on the
2295 ** disk and we are still holding all locks, so the transaction has not
2296 ** committed.  See sqlite3BtreeCommit() for the second phase of the
2297 ** commit process.
2298 **
2299 ** This call is a no-op if no write-transaction is currently active on pBt.
2300 **
2301 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2302 ** the name of a master journal file that should be written into the
2303 ** individual journal file, or is NULL, indicating no master journal file
2304 ** (single database transaction).
2305 **
2306 ** When this is called, the master journal should already have been
2307 ** created, populated with this journal pointer and synced to disk.
2308 **
2309 ** Once this is routine has returned, the only thing required to commit
2310 ** the write-transaction for this database file is to delete the journal.
2311 */
2312 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2313   int rc = SQLITE_OK;
2314   if( p->inTrans==TRANS_WRITE ){
2315     BtShared *pBt = p->pBt;
2316     Pgno nTrunc = 0;
2317     sqlite3BtreeEnter(p);
2318     pBt->db = p->db;
2319 #ifndef SQLITE_OMIT_AUTOVACUUM
2320     if( pBt->autoVacuum ){
2321       rc = autoVacuumCommit(pBt, &nTrunc);
2322       if( rc!=SQLITE_OK ){
2323         sqlite3BtreeLeave(p);
2324         return rc;
2325       }
2326     }
2327 #endif
2328     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc);
2329     sqlite3BtreeLeave(p);
2330   }
2331   return rc;
2332 }
2333 
2334 /*
2335 ** Commit the transaction currently in progress.
2336 **
2337 ** This routine implements the second phase of a 2-phase commit.  The
2338 ** sqlite3BtreeSync() routine does the first phase and should be invoked
2339 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
2340 ** all the work of writing information out to disk and flushing the
2341 ** contents so that they are written onto the disk platter.  All this
2342 ** routine has to do is delete or truncate the rollback journal
2343 ** (which causes the transaction to commit) and drop locks.
2344 **
2345 ** This will release the write lock on the database file.  If there
2346 ** are no active cursors, it also releases the read lock.
2347 */
2348 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2349   BtShared *pBt = p->pBt;
2350 
2351   sqlite3BtreeEnter(p);
2352   pBt->db = p->db;
2353   btreeIntegrity(p);
2354 
2355   /* If the handle has a write-transaction open, commit the shared-btrees
2356   ** transaction and set the shared state to TRANS_READ.
2357   */
2358   if( p->inTrans==TRANS_WRITE ){
2359     int rc;
2360     assert( pBt->inTransaction==TRANS_WRITE );
2361     assert( pBt->nTransaction>0 );
2362     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2363     if( rc!=SQLITE_OK ){
2364       sqlite3BtreeLeave(p);
2365       return rc;
2366     }
2367     pBt->inTransaction = TRANS_READ;
2368     pBt->inStmt = 0;
2369   }
2370   unlockAllTables(p);
2371 
2372   /* If the handle has any kind of transaction open, decrement the transaction
2373   ** count of the shared btree. If the transaction count reaches 0, set
2374   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2375   ** will unlock the pager.
2376   */
2377   if( p->inTrans!=TRANS_NONE ){
2378     pBt->nTransaction--;
2379     if( 0==pBt->nTransaction ){
2380       pBt->inTransaction = TRANS_NONE;
2381     }
2382   }
2383 
2384   /* Set the handles current transaction state to TRANS_NONE and unlock
2385   ** the pager if this call closed the only read or write transaction.
2386   */
2387   p->inTrans = TRANS_NONE;
2388   unlockBtreeIfUnused(pBt);
2389 
2390   btreeIntegrity(p);
2391   sqlite3BtreeLeave(p);
2392   return SQLITE_OK;
2393 }
2394 
2395 /*
2396 ** Do both phases of a commit.
2397 */
2398 int sqlite3BtreeCommit(Btree *p){
2399   int rc;
2400   sqlite3BtreeEnter(p);
2401   rc = sqlite3BtreeCommitPhaseOne(p, 0);
2402   if( rc==SQLITE_OK ){
2403     rc = sqlite3BtreeCommitPhaseTwo(p);
2404   }
2405   sqlite3BtreeLeave(p);
2406   return rc;
2407 }
2408 
2409 #ifndef NDEBUG
2410 /*
2411 ** Return the number of write-cursors open on this handle. This is for use
2412 ** in assert() expressions, so it is only compiled if NDEBUG is not
2413 ** defined.
2414 **
2415 ** For the purposes of this routine, a write-cursor is any cursor that
2416 ** is capable of writing to the databse.  That means the cursor was
2417 ** originally opened for writing and the cursor has not be disabled
2418 ** by having its state changed to CURSOR_FAULT.
2419 */
2420 static int countWriteCursors(BtShared *pBt){
2421   BtCursor *pCur;
2422   int r = 0;
2423   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2424     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2425   }
2426   return r;
2427 }
2428 #endif
2429 
2430 /*
2431 ** This routine sets the state to CURSOR_FAULT and the error
2432 ** code to errCode for every cursor on BtShared that pBtree
2433 ** references.
2434 **
2435 ** Every cursor is tripped, including cursors that belong
2436 ** to other database connections that happen to be sharing
2437 ** the cache with pBtree.
2438 **
2439 ** This routine gets called when a rollback occurs.
2440 ** All cursors using the same cache must be tripped
2441 ** to prevent them from trying to use the btree after
2442 ** the rollback.  The rollback may have deleted tables
2443 ** or moved root pages, so it is not sufficient to
2444 ** save the state of the cursor.  The cursor must be
2445 ** invalidated.
2446 */
2447 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2448   BtCursor *p;
2449   sqlite3BtreeEnter(pBtree);
2450   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2451     clearCursorPosition(p);
2452     p->eState = CURSOR_FAULT;
2453     p->skip = errCode;
2454   }
2455   sqlite3BtreeLeave(pBtree);
2456 }
2457 
2458 /*
2459 ** Rollback the transaction in progress.  All cursors will be
2460 ** invalided by this operation.  Any attempt to use a cursor
2461 ** that was open at the beginning of this operation will result
2462 ** in an error.
2463 **
2464 ** This will release the write lock on the database file.  If there
2465 ** are no active cursors, it also releases the read lock.
2466 */
2467 int sqlite3BtreeRollback(Btree *p){
2468   int rc;
2469   BtShared *pBt = p->pBt;
2470   MemPage *pPage1;
2471 
2472   sqlite3BtreeEnter(p);
2473   pBt->db = p->db;
2474   rc = saveAllCursors(pBt, 0, 0);
2475 #ifndef SQLITE_OMIT_SHARED_CACHE
2476   if( rc!=SQLITE_OK ){
2477     /* This is a horrible situation. An IO or malloc() error occured whilst
2478     ** trying to save cursor positions. If this is an automatic rollback (as
2479     ** the result of a constraint, malloc() failure or IO error) then
2480     ** the cache may be internally inconsistent (not contain valid trees) so
2481     ** we cannot simply return the error to the caller. Instead, abort
2482     ** all queries that may be using any of the cursors that failed to save.
2483     */
2484     sqlite3BtreeTripAllCursors(p, rc);
2485   }
2486 #endif
2487   btreeIntegrity(p);
2488   unlockAllTables(p);
2489 
2490   if( p->inTrans==TRANS_WRITE ){
2491     int rc2;
2492 
2493 #ifndef SQLITE_OMIT_AUTOVACUUM
2494     pBt->nTrunc = 0;
2495 #endif
2496 
2497     assert( TRANS_WRITE==pBt->inTransaction );
2498     rc2 = sqlite3PagerRollback(pBt->pPager);
2499     if( rc2!=SQLITE_OK ){
2500       rc = rc2;
2501     }
2502 
2503     /* The rollback may have destroyed the pPage1->aData value.  So
2504     ** call sqlite3BtreeGetPage() on page 1 again to make
2505     ** sure pPage1->aData is set correctly. */
2506     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2507       releasePage(pPage1);
2508     }
2509     assert( countWriteCursors(pBt)==0 );
2510     pBt->inTransaction = TRANS_READ;
2511   }
2512 
2513   if( p->inTrans!=TRANS_NONE ){
2514     assert( pBt->nTransaction>0 );
2515     pBt->nTransaction--;
2516     if( 0==pBt->nTransaction ){
2517       pBt->inTransaction = TRANS_NONE;
2518     }
2519   }
2520 
2521   p->inTrans = TRANS_NONE;
2522   pBt->inStmt = 0;
2523   unlockBtreeIfUnused(pBt);
2524 
2525   btreeIntegrity(p);
2526   sqlite3BtreeLeave(p);
2527   return rc;
2528 }
2529 
2530 /*
2531 ** Start a statement subtransaction.  The subtransaction can
2532 ** can be rolled back independently of the main transaction.
2533 ** You must start a transaction before starting a subtransaction.
2534 ** The subtransaction is ended automatically if the main transaction
2535 ** commits or rolls back.
2536 **
2537 ** Only one subtransaction may be active at a time.  It is an error to try
2538 ** to start a new subtransaction if another subtransaction is already active.
2539 **
2540 ** Statement subtransactions are used around individual SQL statements
2541 ** that are contained within a BEGIN...COMMIT block.  If a constraint
2542 ** error occurs within the statement, the effect of that one statement
2543 ** can be rolled back without having to rollback the entire transaction.
2544 */
2545 int sqlite3BtreeBeginStmt(Btree *p){
2546   int rc;
2547   BtShared *pBt = p->pBt;
2548   sqlite3BtreeEnter(p);
2549   pBt->db = p->db;
2550   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2551     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2552   }else{
2553     assert( pBt->inTransaction==TRANS_WRITE );
2554     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2555     pBt->inStmt = 1;
2556   }
2557   sqlite3BtreeLeave(p);
2558   return rc;
2559 }
2560 
2561 
2562 /*
2563 ** Commit the statment subtransaction currently in progress.  If no
2564 ** subtransaction is active, this is a no-op.
2565 */
2566 int sqlite3BtreeCommitStmt(Btree *p){
2567   int rc;
2568   BtShared *pBt = p->pBt;
2569   sqlite3BtreeEnter(p);
2570   pBt->db = p->db;
2571   if( pBt->inStmt && !pBt->readOnly ){
2572     rc = sqlite3PagerStmtCommit(pBt->pPager);
2573   }else{
2574     rc = SQLITE_OK;
2575   }
2576   pBt->inStmt = 0;
2577   sqlite3BtreeLeave(p);
2578   return rc;
2579 }
2580 
2581 /*
2582 ** Rollback the active statement subtransaction.  If no subtransaction
2583 ** is active this routine is a no-op.
2584 **
2585 ** All cursors will be invalidated by this operation.  Any attempt
2586 ** to use a cursor that was open at the beginning of this operation
2587 ** will result in an error.
2588 */
2589 int sqlite3BtreeRollbackStmt(Btree *p){
2590   int rc = SQLITE_OK;
2591   BtShared *pBt = p->pBt;
2592   sqlite3BtreeEnter(p);
2593   pBt->db = p->db;
2594   if( pBt->inStmt && !pBt->readOnly ){
2595     rc = sqlite3PagerStmtRollback(pBt->pPager);
2596     assert( countWriteCursors(pBt)==0 );
2597     pBt->inStmt = 0;
2598   }
2599   sqlite3BtreeLeave(p);
2600   return rc;
2601 }
2602 
2603 /*
2604 ** Default key comparison function to be used if no comparison function
2605 ** is specified on the sqlite3BtreeCursor() call.
2606 */
2607 static int dfltCompare(
2608   void *NotUsed,             /* User data is not used */
2609   int n1, const void *p1,    /* First key to compare */
2610   int n2, const void *p2     /* Second key to compare */
2611 ){
2612   int c;
2613   c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2614   if( c==0 ){
2615     c = n1 - n2;
2616   }
2617   return c;
2618 }
2619 
2620 /*
2621 ** Create a new cursor for the BTree whose root is on the page
2622 ** iTable.  The act of acquiring a cursor gets a read lock on
2623 ** the database file.
2624 **
2625 ** If wrFlag==0, then the cursor can only be used for reading.
2626 ** If wrFlag==1, then the cursor can be used for reading or for
2627 ** writing if other conditions for writing are also met.  These
2628 ** are the conditions that must be met in order for writing to
2629 ** be allowed:
2630 **
2631 ** 1:  The cursor must have been opened with wrFlag==1
2632 **
2633 ** 2:  Other database connections that share the same pager cache
2634 **     but which are not in the READ_UNCOMMITTED state may not have
2635 **     cursors open with wrFlag==0 on the same table.  Otherwise
2636 **     the changes made by this write cursor would be visible to
2637 **     the read cursors in the other database connection.
2638 **
2639 ** 3:  The database must be writable (not on read-only media)
2640 **
2641 ** 4:  There must be an active transaction.
2642 **
2643 ** No checking is done to make sure that page iTable really is the
2644 ** root page of a b-tree.  If it is not, then the cursor acquired
2645 ** will not work correctly.
2646 **
2647 ** The comparison function must be logically the same for every cursor
2648 ** on a particular table.  Changing the comparison function will result
2649 ** in incorrect operations.  If the comparison function is NULL, a
2650 ** default comparison function is used.  The comparison function is
2651 ** always ignored for INTKEY tables.
2652 */
2653 static int btreeCursor(
2654   Btree *p,                                   /* The btree */
2655   int iTable,                                 /* Root page of table to open */
2656   int wrFlag,                                 /* 1 to write. 0 read-only */
2657   int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2658   void *pArg,                                 /* First arg to xCompare() */
2659   BtCursor **ppCur                            /* Write new cursor here */
2660 ){
2661   int rc;
2662   BtCursor *pCur;
2663   BtShared *pBt = p->pBt;
2664 
2665   assert( sqlite3BtreeHoldsMutex(p) );
2666   *ppCur = 0;
2667   if( wrFlag ){
2668     if( pBt->readOnly ){
2669       return SQLITE_READONLY;
2670     }
2671     if( checkReadLocks(p, iTable, 0) ){
2672       return SQLITE_LOCKED;
2673     }
2674   }
2675 
2676   if( pBt->pPage1==0 ){
2677     rc = lockBtreeWithRetry(p);
2678     if( rc!=SQLITE_OK ){
2679       return rc;
2680     }
2681     if( pBt->readOnly && wrFlag ){
2682       return SQLITE_READONLY;
2683     }
2684   }
2685   pCur = sqlite3MallocZero( sizeof(*pCur) );
2686   if( pCur==0 ){
2687     rc = SQLITE_NOMEM;
2688     goto create_cursor_exception;
2689   }
2690   pCur->pgnoRoot = (Pgno)iTable;
2691   if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){
2692     rc = SQLITE_EMPTY;
2693     goto create_cursor_exception;
2694   }
2695   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2696   if( rc!=SQLITE_OK ){
2697     goto create_cursor_exception;
2698   }
2699 
2700   /* Now that no other errors can occur, finish filling in the BtCursor
2701   ** variables, link the cursor into the BtShared list and set *ppCur (the
2702   ** output argument to this function).
2703   */
2704   pCur->xCompare = xCmp ? xCmp : dfltCompare;
2705   pCur->pArg = pArg;
2706   pCur->pBtree = p;
2707   pCur->pBt = pBt;
2708   pCur->wrFlag = wrFlag;
2709   pCur->pNext = pBt->pCursor;
2710   if( pCur->pNext ){
2711     pCur->pNext->pPrev = pCur;
2712   }
2713   pBt->pCursor = pCur;
2714   pCur->eState = CURSOR_INVALID;
2715   *ppCur = pCur;
2716 
2717   return SQLITE_OK;
2718 
2719 create_cursor_exception:
2720   if( pCur ){
2721     releasePage(pCur->pPage);
2722     sqlite3_free(pCur);
2723   }
2724   unlockBtreeIfUnused(pBt);
2725   return rc;
2726 }
2727 int sqlite3BtreeCursor(
2728   Btree *p,                                   /* The btree */
2729   int iTable,                                 /* Root page of table to open */
2730   int wrFlag,                                 /* 1 to write. 0 read-only */
2731   int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2732   void *pArg,                                 /* First arg to xCompare() */
2733   BtCursor **ppCur                            /* Write new cursor here */
2734 ){
2735   int rc;
2736   sqlite3BtreeEnter(p);
2737   p->pBt->db = p->db;
2738   rc = btreeCursor(p, iTable, wrFlag, xCmp, pArg, ppCur);
2739   sqlite3BtreeLeave(p);
2740   return rc;
2741 }
2742 
2743 
2744 /*
2745 ** Close a cursor.  The read lock on the database file is released
2746 ** when the last cursor is closed.
2747 */
2748 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2749   BtShared *pBt = pCur->pBt;
2750   Btree *pBtree = pCur->pBtree;
2751 
2752   sqlite3BtreeEnter(pBtree);
2753   pBt->db = pBtree->db;
2754   clearCursorPosition(pCur);
2755   if( pCur->pPrev ){
2756     pCur->pPrev->pNext = pCur->pNext;
2757   }else{
2758     pBt->pCursor = pCur->pNext;
2759   }
2760   if( pCur->pNext ){
2761     pCur->pNext->pPrev = pCur->pPrev;
2762   }
2763   releasePage(pCur->pPage);
2764   unlockBtreeIfUnused(pBt);
2765   invalidateOverflowCache(pCur);
2766   sqlite3_free(pCur);
2767   sqlite3BtreeLeave(pBtree);
2768   return SQLITE_OK;
2769 }
2770 
2771 /*
2772 ** Make a temporary cursor by filling in the fields of pTempCur.
2773 ** The temporary cursor is not on the cursor list for the Btree.
2774 */
2775 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2776   assert( cursorHoldsMutex(pCur) );
2777   memcpy(pTempCur, pCur, sizeof(*pCur));
2778   pTempCur->pNext = 0;
2779   pTempCur->pPrev = 0;
2780   if( pTempCur->pPage ){
2781     sqlite3PagerRef(pTempCur->pPage->pDbPage);
2782   }
2783 }
2784 
2785 /*
2786 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2787 ** function above.
2788 */
2789 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2790   assert( cursorHoldsMutex(pCur) );
2791   if( pCur->pPage ){
2792     sqlite3PagerUnref(pCur->pPage->pDbPage);
2793   }
2794 }
2795 
2796 /*
2797 ** Make sure the BtCursor* given in the argument has a valid
2798 ** BtCursor.info structure.  If it is not already valid, call
2799 ** sqlite3BtreeParseCell() to fill it in.
2800 **
2801 ** BtCursor.info is a cache of the information in the current cell.
2802 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2803 **
2804 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
2805 ** compiler to crash when getCellInfo() is implemented as a macro.
2806 ** But there is a measureable speed advantage to using the macro on gcc
2807 ** (when less compiler optimizations like -Os or -O0 are used and the
2808 ** compiler is not doing agressive inlining.)  So we use a real function
2809 ** for MSVC and a macro for everything else.  Ticket #2457.
2810 */
2811 #ifndef NDEBUG
2812   static void assertCellInfo(BtCursor *pCur){
2813     CellInfo info;
2814     memset(&info, 0, sizeof(info));
2815     sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info);
2816     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2817   }
2818 #else
2819   #define assertCellInfo(x)
2820 #endif
2821 #ifdef _MSC_VER
2822   /* Use a real function in MSVC to work around bugs in that compiler. */
2823   static void getCellInfo(BtCursor *pCur){
2824     if( pCur->info.nSize==0 ){
2825       sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);
2826     }else{
2827       assertCellInfo(pCur);
2828     }
2829   }
2830 #else /* if not _MSC_VER */
2831   /* Use a macro in all other compilers so that the function is inlined */
2832 #define getCellInfo(pCur)                                               \
2833   if( pCur->info.nSize==0 ){                                            \
2834     sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);         \
2835   }else{                                                                \
2836     assertCellInfo(pCur);                                               \
2837   }
2838 #endif /* _MSC_VER */
2839 
2840 /*
2841 ** Set *pSize to the size of the buffer needed to hold the value of
2842 ** the key for the current entry.  If the cursor is not pointing
2843 ** to a valid entry, *pSize is set to 0.
2844 **
2845 ** For a table with the INTKEY flag set, this routine returns the key
2846 ** itself, not the number of bytes in the key.
2847 */
2848 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2849   int rc;
2850 
2851   assert( cursorHoldsMutex(pCur) );
2852   rc = restoreOrClearCursorPosition(pCur);
2853   if( rc==SQLITE_OK ){
2854     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2855     if( pCur->eState==CURSOR_INVALID ){
2856       *pSize = 0;
2857     }else{
2858       getCellInfo(pCur);
2859       *pSize = pCur->info.nKey;
2860     }
2861   }
2862   return rc;
2863 }
2864 
2865 /*
2866 ** Set *pSize to the number of bytes of data in the entry the
2867 ** cursor currently points to.  Always return SQLITE_OK.
2868 ** Failure is not possible.  If the cursor is not currently
2869 ** pointing to an entry (which can happen, for example, if
2870 ** the database is empty) then *pSize is set to 0.
2871 */
2872 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2873   int rc;
2874 
2875   assert( cursorHoldsMutex(pCur) );
2876   rc = restoreOrClearCursorPosition(pCur);
2877   if( rc==SQLITE_OK ){
2878     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2879     if( pCur->eState==CURSOR_INVALID ){
2880       /* Not pointing at a valid entry - set *pSize to 0. */
2881       *pSize = 0;
2882     }else{
2883       getCellInfo(pCur);
2884       *pSize = pCur->info.nData;
2885     }
2886   }
2887   return rc;
2888 }
2889 
2890 /*
2891 ** Given the page number of an overflow page in the database (parameter
2892 ** ovfl), this function finds the page number of the next page in the
2893 ** linked list of overflow pages. If possible, it uses the auto-vacuum
2894 ** pointer-map data instead of reading the content of page ovfl to do so.
2895 **
2896 ** If an error occurs an SQLite error code is returned. Otherwise:
2897 **
2898 ** Unless pPgnoNext is NULL, the page number of the next overflow
2899 ** page in the linked list is written to *pPgnoNext. If page ovfl
2900 ** is the last page in its linked list, *pPgnoNext is set to zero.
2901 **
2902 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
2903 ** for page ovfl. The underlying pager page may have been requested
2904 ** with the noContent flag set, so the page data accessable via
2905 ** this handle may not be trusted.
2906 */
2907 static int getOverflowPage(
2908   BtShared *pBt,
2909   Pgno ovfl,                   /* Overflow page */
2910   MemPage **ppPage,            /* OUT: MemPage handle */
2911   Pgno *pPgnoNext              /* OUT: Next overflow page number */
2912 ){
2913   Pgno next = 0;
2914   int rc;
2915 
2916   assert( sqlite3_mutex_held(pBt->mutex) );
2917   /* One of these must not be NULL. Otherwise, why call this function? */
2918   assert(ppPage || pPgnoNext);
2919 
2920   /* If pPgnoNext is NULL, then this function is being called to obtain
2921   ** a MemPage* reference only. No page-data is required in this case.
2922   */
2923   if( !pPgnoNext ){
2924     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
2925   }
2926 
2927 #ifndef SQLITE_OMIT_AUTOVACUUM
2928   /* Try to find the next page in the overflow list using the
2929   ** autovacuum pointer-map pages. Guess that the next page in
2930   ** the overflow list is page number (ovfl+1). If that guess turns
2931   ** out to be wrong, fall back to loading the data of page
2932   ** number ovfl to determine the next page number.
2933   */
2934   if( pBt->autoVacuum ){
2935     Pgno pgno;
2936     Pgno iGuess = ovfl+1;
2937     u8 eType;
2938 
2939     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
2940       iGuess++;
2941     }
2942 
2943     if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){
2944       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
2945       if( rc!=SQLITE_OK ){
2946         return rc;
2947       }
2948       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
2949         next = iGuess;
2950       }
2951     }
2952   }
2953 #endif
2954 
2955   if( next==0 || ppPage ){
2956     MemPage *pPage = 0;
2957 
2958     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
2959     assert(rc==SQLITE_OK || pPage==0);
2960     if( next==0 && rc==SQLITE_OK ){
2961       next = get4byte(pPage->aData);
2962     }
2963 
2964     if( ppPage ){
2965       *ppPage = pPage;
2966     }else{
2967       releasePage(pPage);
2968     }
2969   }
2970   *pPgnoNext = next;
2971 
2972   return rc;
2973 }
2974 
2975 /*
2976 ** Copy data from a buffer to a page, or from a page to a buffer.
2977 **
2978 ** pPayload is a pointer to data stored on database page pDbPage.
2979 ** If argument eOp is false, then nByte bytes of data are copied
2980 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
2981 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
2982 ** of data are copied from the buffer pBuf to pPayload.
2983 **
2984 ** SQLITE_OK is returned on success, otherwise an error code.
2985 */
2986 static int copyPayload(
2987   void *pPayload,           /* Pointer to page data */
2988   void *pBuf,               /* Pointer to buffer */
2989   int nByte,                /* Number of bytes to copy */
2990   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
2991   DbPage *pDbPage           /* Page containing pPayload */
2992 ){
2993   if( eOp ){
2994     /* Copy data from buffer to page (a write operation) */
2995     int rc = sqlite3PagerWrite(pDbPage);
2996     if( rc!=SQLITE_OK ){
2997       return rc;
2998     }
2999     memcpy(pPayload, pBuf, nByte);
3000   }else{
3001     /* Copy data from page to buffer (a read operation) */
3002     memcpy(pBuf, pPayload, nByte);
3003   }
3004   return SQLITE_OK;
3005 }
3006 
3007 /*
3008 ** This function is used to read or overwrite payload information
3009 ** for the entry that the pCur cursor is pointing to. If the eOp
3010 ** parameter is 0, this is a read operation (data copied into
3011 ** buffer pBuf). If it is non-zero, a write (data copied from
3012 ** buffer pBuf).
3013 **
3014 ** A total of "amt" bytes are read or written beginning at "offset".
3015 ** Data is read to or from the buffer pBuf.
3016 **
3017 ** This routine does not make a distinction between key and data.
3018 ** It just reads or writes bytes from the payload area.  Data might
3019 ** appear on the main page or be scattered out on multiple overflow
3020 ** pages.
3021 **
3022 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3023 ** cursor entry uses one or more overflow pages, this function
3024 ** allocates space for and lazily popluates the overflow page-list
3025 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3026 ** cache to make seeking to the supplied offset more efficient.
3027 **
3028 ** Once an overflow page-list cache has been allocated, it may be
3029 ** invalidated if some other cursor writes to the same table, or if
3030 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3031 ** mode, the following events may invalidate an overflow page-list cache.
3032 **
3033 **   * An incremental vacuum,
3034 **   * A commit in auto_vacuum="full" mode,
3035 **   * Creating a table (may require moving an overflow page).
3036 */
3037 static int accessPayload(
3038   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3039   int offset,          /* Begin reading this far into payload */
3040   int amt,             /* Read this many bytes */
3041   unsigned char *pBuf, /* Write the bytes into this buffer */
3042   int skipKey,         /* offset begins at data if this is true */
3043   int eOp              /* zero to read. non-zero to write. */
3044 ){
3045   unsigned char *aPayload;
3046   int rc = SQLITE_OK;
3047   u32 nKey;
3048   int iIdx = 0;
3049   MemPage *pPage = pCur->pPage;     /* Btree page of current cursor entry */
3050   BtShared *pBt;                   /* Btree this cursor belongs to */
3051 
3052   assert( pPage );
3053   assert( pCur->eState==CURSOR_VALID );
3054   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3055   assert( offset>=0 );
3056   assert( cursorHoldsMutex(pCur) );
3057 
3058   getCellInfo(pCur);
3059   aPayload = pCur->info.pCell + pCur->info.nHeader;
3060   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3061 
3062   if( skipKey ){
3063     offset += nKey;
3064   }
3065   if( offset+amt > nKey+pCur->info.nData ){
3066     /* Trying to read or write past the end of the data is an error */
3067     return SQLITE_ERROR;
3068   }
3069 
3070   /* Check if data must be read/written to/from the btree page itself. */
3071   if( offset<pCur->info.nLocal ){
3072     int a = amt;
3073     if( a+offset>pCur->info.nLocal ){
3074       a = pCur->info.nLocal - offset;
3075     }
3076     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3077     offset = 0;
3078     pBuf += a;
3079     amt -= a;
3080   }else{
3081     offset -= pCur->info.nLocal;
3082   }
3083 
3084   pBt = pCur->pBt;
3085   if( rc==SQLITE_OK && amt>0 ){
3086     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3087     Pgno nextPage;
3088 
3089     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3090 
3091 #ifndef SQLITE_OMIT_INCRBLOB
3092     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3093     ** has not been allocated, allocate it now. The array is sized at
3094     ** one entry for each overflow page in the overflow chain. The
3095     ** page number of the first overflow page is stored in aOverflow[0],
3096     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3097     ** (the cache is lazily populated).
3098     */
3099     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3100       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3101       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3102       if( nOvfl && !pCur->aOverflow ){
3103         rc = SQLITE_NOMEM;
3104       }
3105     }
3106 
3107     /* If the overflow page-list cache has been allocated and the
3108     ** entry for the first required overflow page is valid, skip
3109     ** directly to it.
3110     */
3111     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3112       iIdx = (offset/ovflSize);
3113       nextPage = pCur->aOverflow[iIdx];
3114       offset = (offset%ovflSize);
3115     }
3116 #endif
3117 
3118     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3119 
3120 #ifndef SQLITE_OMIT_INCRBLOB
3121       /* If required, populate the overflow page-list cache. */
3122       if( pCur->aOverflow ){
3123         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3124         pCur->aOverflow[iIdx] = nextPage;
3125       }
3126 #endif
3127 
3128       if( offset>=ovflSize ){
3129         /* The only reason to read this page is to obtain the page
3130         ** number for the next page in the overflow chain. The page
3131         ** data is not required. So first try to lookup the overflow
3132         ** page-list cache, if any, then fall back to the getOverflowPage()
3133         ** function.
3134         */
3135 #ifndef SQLITE_OMIT_INCRBLOB
3136         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3137           nextPage = pCur->aOverflow[iIdx+1];
3138         } else
3139 #endif
3140           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3141         offset -= ovflSize;
3142       }else{
3143         /* Need to read this page properly. It contains some of the
3144         ** range of data that is being read (eOp==0) or written (eOp!=0).
3145         */
3146         DbPage *pDbPage;
3147         int a = amt;
3148         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3149         if( rc==SQLITE_OK ){
3150           aPayload = sqlite3PagerGetData(pDbPage);
3151           nextPage = get4byte(aPayload);
3152           if( a + offset > ovflSize ){
3153             a = ovflSize - offset;
3154           }
3155           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3156           sqlite3PagerUnref(pDbPage);
3157           offset = 0;
3158           amt -= a;
3159           pBuf += a;
3160         }
3161       }
3162     }
3163   }
3164 
3165   if( rc==SQLITE_OK && amt>0 ){
3166     return SQLITE_CORRUPT_BKPT;
3167   }
3168   return rc;
3169 }
3170 
3171 /*
3172 ** Read part of the key associated with cursor pCur.  Exactly
3173 ** "amt" bytes will be transfered into pBuf[].  The transfer
3174 ** begins at "offset".
3175 **
3176 ** Return SQLITE_OK on success or an error code if anything goes
3177 ** wrong.  An error is returned if "offset+amt" is larger than
3178 ** the available payload.
3179 */
3180 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3181   int rc;
3182 
3183   assert( cursorHoldsMutex(pCur) );
3184   rc = restoreOrClearCursorPosition(pCur);
3185   if( rc==SQLITE_OK ){
3186     assert( pCur->eState==CURSOR_VALID );
3187     assert( pCur->pPage!=0 );
3188     if( pCur->pPage->intKey ){
3189       return SQLITE_CORRUPT_BKPT;
3190     }
3191     assert( pCur->pPage->intKey==0 );
3192     assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3193     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3194   }
3195   return rc;
3196 }
3197 
3198 /*
3199 ** Read part of the data associated with cursor pCur.  Exactly
3200 ** "amt" bytes will be transfered into pBuf[].  The transfer
3201 ** begins at "offset".
3202 **
3203 ** Return SQLITE_OK on success or an error code if anything goes
3204 ** wrong.  An error is returned if "offset+amt" is larger than
3205 ** the available payload.
3206 */
3207 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3208   int rc;
3209 
3210   assert( cursorHoldsMutex(pCur) );
3211   rc = restoreOrClearCursorPosition(pCur);
3212   if( rc==SQLITE_OK ){
3213     assert( pCur->eState==CURSOR_VALID );
3214     assert( pCur->pPage!=0 );
3215     assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3216     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3217   }
3218   return rc;
3219 }
3220 
3221 /*
3222 ** Return a pointer to payload information from the entry that the
3223 ** pCur cursor is pointing to.  The pointer is to the beginning of
3224 ** the key if skipKey==0 and it points to the beginning of data if
3225 ** skipKey==1.  The number of bytes of available key/data is written
3226 ** into *pAmt.  If *pAmt==0, then the value returned will not be
3227 ** a valid pointer.
3228 **
3229 ** This routine is an optimization.  It is common for the entire key
3230 ** and data to fit on the local page and for there to be no overflow
3231 ** pages.  When that is so, this routine can be used to access the
3232 ** key and data without making a copy.  If the key and/or data spills
3233 ** onto overflow pages, then accessPayload() must be used to reassembly
3234 ** the key/data and copy it into a preallocated buffer.
3235 **
3236 ** The pointer returned by this routine looks directly into the cached
3237 ** page of the database.  The data might change or move the next time
3238 ** any btree routine is called.
3239 */
3240 static const unsigned char *fetchPayload(
3241   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3242   int *pAmt,           /* Write the number of available bytes here */
3243   int skipKey          /* read beginning at data if this is true */
3244 ){
3245   unsigned char *aPayload;
3246   MemPage *pPage;
3247   u32 nKey;
3248   int nLocal;
3249 
3250   assert( pCur!=0 && pCur->pPage!=0 );
3251   assert( pCur->eState==CURSOR_VALID );
3252   assert( cursorHoldsMutex(pCur) );
3253   pPage = pCur->pPage;
3254   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3255   getCellInfo(pCur);
3256   aPayload = pCur->info.pCell;
3257   aPayload += pCur->info.nHeader;
3258   if( pPage->intKey ){
3259     nKey = 0;
3260   }else{
3261     nKey = pCur->info.nKey;
3262   }
3263   if( skipKey ){
3264     aPayload += nKey;
3265     nLocal = pCur->info.nLocal - nKey;
3266   }else{
3267     nLocal = pCur->info.nLocal;
3268     if( nLocal>nKey ){
3269       nLocal = nKey;
3270     }
3271   }
3272   *pAmt = nLocal;
3273   return aPayload;
3274 }
3275 
3276 
3277 /*
3278 ** For the entry that cursor pCur is point to, return as
3279 ** many bytes of the key or data as are available on the local
3280 ** b-tree page.  Write the number of available bytes into *pAmt.
3281 **
3282 ** The pointer returned is ephemeral.  The key/data may move
3283 ** or be destroyed on the next call to any Btree routine,
3284 ** including calls from other threads against the same cache.
3285 ** Hence, a mutex on the BtShared should be held prior to calling
3286 ** this routine.
3287 **
3288 ** These routines is used to get quick access to key and data
3289 ** in the common case where no overflow pages are used.
3290 */
3291 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3292   assert( cursorHoldsMutex(pCur) );
3293   if( pCur->eState==CURSOR_VALID ){
3294     return (const void*)fetchPayload(pCur, pAmt, 0);
3295   }
3296   return 0;
3297 }
3298 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3299   assert( cursorHoldsMutex(pCur) );
3300   if( pCur->eState==CURSOR_VALID ){
3301     return (const void*)fetchPayload(pCur, pAmt, 1);
3302   }
3303   return 0;
3304 }
3305 
3306 
3307 /*
3308 ** Move the cursor down to a new child page.  The newPgno argument is the
3309 ** page number of the child page to move to.
3310 */
3311 static int moveToChild(BtCursor *pCur, u32 newPgno){
3312   int rc;
3313   MemPage *pNewPage;
3314   MemPage *pOldPage;
3315   BtShared *pBt = pCur->pBt;
3316 
3317   assert( cursorHoldsMutex(pCur) );
3318   assert( pCur->eState==CURSOR_VALID );
3319   rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
3320   if( rc ) return rc;
3321   pNewPage->idxParent = pCur->idx;
3322   pOldPage = pCur->pPage;
3323   pOldPage->idxShift = 0;
3324   releasePage(pOldPage);
3325   pCur->pPage = pNewPage;
3326   pCur->idx = 0;
3327   pCur->info.nSize = 0;
3328   if( pNewPage->nCell<1 ){
3329     return SQLITE_CORRUPT_BKPT;
3330   }
3331   return SQLITE_OK;
3332 }
3333 
3334 /*
3335 ** Return true if the page is the virtual root of its table.
3336 **
3337 ** The virtual root page is the root page for most tables.  But
3338 ** for the table rooted on page 1, sometime the real root page
3339 ** is empty except for the right-pointer.  In such cases the
3340 ** virtual root page is the page that the right-pointer of page
3341 ** 1 is pointing to.
3342 */
3343 int sqlite3BtreeIsRootPage(MemPage *pPage){
3344   MemPage *pParent;
3345 
3346   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3347   pParent = pPage->pParent;
3348   if( pParent==0 ) return 1;
3349   if( pParent->pgno>1 ) return 0;
3350   if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
3351   return 0;
3352 }
3353 
3354 /*
3355 ** Move the cursor up to the parent page.
3356 **
3357 ** pCur->idx is set to the cell index that contains the pointer
3358 ** to the page we are coming from.  If we are coming from the
3359 ** right-most child page then pCur->idx is set to one more than
3360 ** the largest cell index.
3361 */
3362 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3363   MemPage *pParent;
3364   MemPage *pPage;
3365   int idxParent;
3366 
3367   assert( cursorHoldsMutex(pCur) );
3368   assert( pCur->eState==CURSOR_VALID );
3369   pPage = pCur->pPage;
3370   assert( pPage!=0 );
3371   assert( !sqlite3BtreeIsRootPage(pPage) );
3372   pParent = pPage->pParent;
3373   assert( pParent!=0 );
3374   idxParent = pPage->idxParent;
3375   sqlite3PagerRef(pParent->pDbPage);
3376   releasePage(pPage);
3377   pCur->pPage = pParent;
3378   pCur->info.nSize = 0;
3379   assert( pParent->idxShift==0 );
3380   pCur->idx = idxParent;
3381 }
3382 
3383 /*
3384 ** Move the cursor to the root page
3385 */
3386 static int moveToRoot(BtCursor *pCur){
3387   MemPage *pRoot;
3388   int rc = SQLITE_OK;
3389   Btree *p = pCur->pBtree;
3390   BtShared *pBt = p->pBt;
3391 
3392   assert( cursorHoldsMutex(pCur) );
3393   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3394   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
3395   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
3396   if( pCur->eState>=CURSOR_REQUIRESEEK ){
3397     if( pCur->eState==CURSOR_FAULT ){
3398       return pCur->skip;
3399     }
3400     clearCursorPosition(pCur);
3401   }
3402   pRoot = pCur->pPage;
3403   if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3404     assert( pRoot->isInit );
3405   }else{
3406     if(
3407       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3408     ){
3409       pCur->eState = CURSOR_INVALID;
3410       return rc;
3411     }
3412     releasePage(pCur->pPage);
3413     pCur->pPage = pRoot;
3414   }
3415   pCur->idx = 0;
3416   pCur->info.nSize = 0;
3417   if( pRoot->nCell==0 && !pRoot->leaf ){
3418     Pgno subpage;
3419     assert( pRoot->pgno==1 );
3420     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3421     assert( subpage>0 );
3422     pCur->eState = CURSOR_VALID;
3423     rc = moveToChild(pCur, subpage);
3424   }
3425   pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3426   return rc;
3427 }
3428 
3429 /*
3430 ** Move the cursor down to the left-most leaf entry beneath the
3431 ** entry to which it is currently pointing.
3432 **
3433 ** The left-most leaf is the one with the smallest key - the first
3434 ** in ascending order.
3435 */
3436 static int moveToLeftmost(BtCursor *pCur){
3437   Pgno pgno;
3438   int rc = SQLITE_OK;
3439   MemPage *pPage;
3440 
3441   assert( cursorHoldsMutex(pCur) );
3442   assert( pCur->eState==CURSOR_VALID );
3443   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3444     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3445     pgno = get4byte(findCell(pPage, pCur->idx));
3446     rc = moveToChild(pCur, pgno);
3447   }
3448   return rc;
3449 }
3450 
3451 /*
3452 ** Move the cursor down to the right-most leaf entry beneath the
3453 ** page to which it is currently pointing.  Notice the difference
3454 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
3455 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3456 ** finds the right-most entry beneath the *page*.
3457 **
3458 ** The right-most entry is the one with the largest key - the last
3459 ** key in ascending order.
3460 */
3461 static int moveToRightmost(BtCursor *pCur){
3462   Pgno pgno;
3463   int rc = SQLITE_OK;
3464   MemPage *pPage;
3465 
3466   assert( cursorHoldsMutex(pCur) );
3467   assert( pCur->eState==CURSOR_VALID );
3468   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3469     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3470     pCur->idx = pPage->nCell;
3471     rc = moveToChild(pCur, pgno);
3472   }
3473   if( rc==SQLITE_OK ){
3474     pCur->idx = pPage->nCell - 1;
3475     pCur->info.nSize = 0;
3476   }
3477   return SQLITE_OK;
3478 }
3479 
3480 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
3481 ** on success.  Set *pRes to 0 if the cursor actually points to something
3482 ** or set *pRes to 1 if the table is empty.
3483 */
3484 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3485   int rc;
3486 
3487   assert( cursorHoldsMutex(pCur) );
3488   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3489   rc = moveToRoot(pCur);
3490   if( rc==SQLITE_OK ){
3491     if( pCur->eState==CURSOR_INVALID ){
3492       assert( pCur->pPage->nCell==0 );
3493       *pRes = 1;
3494       rc = SQLITE_OK;
3495     }else{
3496       assert( pCur->pPage->nCell>0 );
3497       *pRes = 0;
3498       rc = moveToLeftmost(pCur);
3499     }
3500   }
3501   return rc;
3502 }
3503 
3504 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
3505 ** on success.  Set *pRes to 0 if the cursor actually points to something
3506 ** or set *pRes to 1 if the table is empty.
3507 */
3508 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3509   int rc;
3510 
3511   assert( cursorHoldsMutex(pCur) );
3512   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3513   rc = moveToRoot(pCur);
3514   if( rc==SQLITE_OK ){
3515     if( CURSOR_INVALID==pCur->eState ){
3516       assert( pCur->pPage->nCell==0 );
3517       *pRes = 1;
3518     }else{
3519       assert( pCur->eState==CURSOR_VALID );
3520       *pRes = 0;
3521       rc = moveToRightmost(pCur);
3522     }
3523   }
3524   return rc;
3525 }
3526 
3527 /* Move the cursor so that it points to an entry near pKey/nKey.
3528 ** Return a success code.
3529 **
3530 ** For INTKEY tables, only the nKey parameter is used.  pKey is
3531 ** ignored.  For other tables, nKey is the number of bytes of data
3532 ** in pKey.  The comparison function specified when the cursor was
3533 ** created is used to compare keys.
3534 **
3535 ** If an exact match is not found, then the cursor is always
3536 ** left pointing at a leaf page which would hold the entry if it
3537 ** were present.  The cursor might point to an entry that comes
3538 ** before or after the key.
3539 **
3540 ** The result of comparing the key with the entry to which the
3541 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
3542 ** this value is as follows:
3543 **
3544 **     *pRes<0      The cursor is left pointing at an entry that
3545 **                  is smaller than pKey or if the table is empty
3546 **                  and the cursor is therefore left point to nothing.
3547 **
3548 **     *pRes==0     The cursor is left pointing at an entry that
3549 **                  exactly matches pKey.
3550 **
3551 **     *pRes>0      The cursor is left pointing at an entry that
3552 **                  is larger than pKey.
3553 **
3554 */
3555 int sqlite3BtreeMoveto(
3556   BtCursor *pCur,        /* The cursor to be moved */
3557   const void *pKey,      /* The key content for indices.  Not used by tables */
3558   i64 nKey,              /* Size of pKey.  Or the key for tables */
3559   int biasRight,         /* If true, bias the search to the high end */
3560   int *pRes              /* Search result flag */
3561 ){
3562   int rc;
3563 
3564   assert( cursorHoldsMutex(pCur) );
3565   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3566   rc = moveToRoot(pCur);
3567   if( rc ){
3568     return rc;
3569   }
3570   assert( pCur->pPage );
3571   assert( pCur->pPage->isInit );
3572   if( pCur->eState==CURSOR_INVALID ){
3573     *pRes = -1;
3574     assert( pCur->pPage->nCell==0 );
3575     return SQLITE_OK;
3576   }
3577   for(;;){
3578     int lwr, upr;
3579     Pgno chldPg;
3580     MemPage *pPage = pCur->pPage;
3581     int c = -1;  /* pRes return if table is empty must be -1 */
3582     lwr = 0;
3583     upr = pPage->nCell-1;
3584     if( !pPage->intKey && pKey==0 ){
3585       return SQLITE_CORRUPT_BKPT;
3586     }
3587     if( biasRight ){
3588       pCur->idx = upr;
3589     }else{
3590       pCur->idx = (upr+lwr)/2;
3591     }
3592     if( lwr<=upr ) for(;;){
3593       void *pCellKey;
3594       i64 nCellKey;
3595       pCur->info.nSize = 0;
3596       if( pPage->intKey ){
3597         u8 *pCell;
3598         pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3599         if( pPage->hasData ){
3600           u32 dummy;
3601           pCell += getVarint32(pCell, &dummy);
3602         }
3603         getVarint(pCell, (u64 *)&nCellKey);
3604         if( nCellKey<nKey ){
3605           c = -1;
3606         }else if( nCellKey>nKey ){
3607           c = +1;
3608         }else{
3609           c = 0;
3610         }
3611       }else{
3612         int available;
3613         pCellKey = (void *)fetchPayload(pCur, &available, 0);
3614         nCellKey = pCur->info.nKey;
3615         if( available>=nCellKey ){
3616           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3617         }else{
3618           pCellKey = sqlite3_malloc( nCellKey );
3619           if( pCellKey==0 ) return SQLITE_NOMEM;
3620           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3621           c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3622           sqlite3_free(pCellKey);
3623           if( rc ){
3624             return rc;
3625           }
3626         }
3627       }
3628       if( c==0 ){
3629         if( pPage->leafData && !pPage->leaf ){
3630           lwr = pCur->idx;
3631           upr = lwr - 1;
3632           break;
3633         }else{
3634           if( pRes ) *pRes = 0;
3635           return SQLITE_OK;
3636         }
3637       }
3638       if( c<0 ){
3639         lwr = pCur->idx+1;
3640       }else{
3641         upr = pCur->idx-1;
3642       }
3643       if( lwr>upr ){
3644         break;
3645       }
3646       pCur->idx = (lwr+upr)/2;
3647     }
3648     assert( lwr==upr+1 );
3649     assert( pPage->isInit );
3650     if( pPage->leaf ){
3651       chldPg = 0;
3652     }else if( lwr>=pPage->nCell ){
3653       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3654     }else{
3655       chldPg = get4byte(findCell(pPage, lwr));
3656     }
3657     if( chldPg==0 ){
3658       assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3659       if( pRes ) *pRes = c;
3660       return SQLITE_OK;
3661     }
3662     pCur->idx = lwr;
3663     pCur->info.nSize = 0;
3664     rc = moveToChild(pCur, chldPg);
3665     if( rc ){
3666       return rc;
3667     }
3668   }
3669   /* NOT REACHED */
3670 }
3671 
3672 
3673 /*
3674 ** Return TRUE if the cursor is not pointing at an entry of the table.
3675 **
3676 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3677 ** past the last entry in the table or sqlite3BtreePrev() moves past
3678 ** the first entry.  TRUE is also returned if the table is empty.
3679 */
3680 int sqlite3BtreeEof(BtCursor *pCur){
3681   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3682   ** have been deleted? This API will need to change to return an error code
3683   ** as well as the boolean result value.
3684   */
3685   return (CURSOR_VALID!=pCur->eState);
3686 }
3687 
3688 /*
3689 ** Return the database connection handle for a cursor.
3690 */
3691 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3692   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3693   return pCur->pBtree->db;
3694 }
3695 
3696 /*
3697 ** Advance the cursor to the next entry in the database.  If
3698 ** successful then set *pRes=0.  If the cursor
3699 ** was already pointing to the last entry in the database before
3700 ** this routine was called, then set *pRes=1.
3701 */
3702 static int btreeNext(BtCursor *pCur, int *pRes){
3703   int rc;
3704   MemPage *pPage;
3705 
3706   assert( cursorHoldsMutex(pCur) );
3707   rc = restoreOrClearCursorPosition(pCur);
3708   if( rc!=SQLITE_OK ){
3709     return rc;
3710   }
3711   assert( pRes!=0 );
3712   pPage = pCur->pPage;
3713   if( CURSOR_INVALID==pCur->eState ){
3714     *pRes = 1;
3715     return SQLITE_OK;
3716   }
3717   if( pCur->skip>0 ){
3718     pCur->skip = 0;
3719     *pRes = 0;
3720     return SQLITE_OK;
3721   }
3722   pCur->skip = 0;
3723 
3724   assert( pPage->isInit );
3725   assert( pCur->idx<pPage->nCell );
3726 
3727   pCur->idx++;
3728   pCur->info.nSize = 0;
3729   if( pCur->idx>=pPage->nCell ){
3730     if( !pPage->leaf ){
3731       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3732       if( rc ) return rc;
3733       rc = moveToLeftmost(pCur);
3734       *pRes = 0;
3735       return rc;
3736     }
3737     do{
3738       if( sqlite3BtreeIsRootPage(pPage) ){
3739         *pRes = 1;
3740         pCur->eState = CURSOR_INVALID;
3741         return SQLITE_OK;
3742       }
3743       sqlite3BtreeMoveToParent(pCur);
3744       pPage = pCur->pPage;
3745     }while( pCur->idx>=pPage->nCell );
3746     *pRes = 0;
3747     if( pPage->leafData ){
3748       rc = sqlite3BtreeNext(pCur, pRes);
3749     }else{
3750       rc = SQLITE_OK;
3751     }
3752     return rc;
3753   }
3754   *pRes = 0;
3755   if( pPage->leaf ){
3756     return SQLITE_OK;
3757   }
3758   rc = moveToLeftmost(pCur);
3759   return rc;
3760 }
3761 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3762   int rc;
3763   assert( cursorHoldsMutex(pCur) );
3764   rc = btreeNext(pCur, pRes);
3765   return rc;
3766 }
3767 
3768 
3769 /*
3770 ** Step the cursor to the back to the previous entry in the database.  If
3771 ** successful then set *pRes=0.  If the cursor
3772 ** was already pointing to the first entry in the database before
3773 ** this routine was called, then set *pRes=1.
3774 */
3775 static int btreePrevious(BtCursor *pCur, int *pRes){
3776   int rc;
3777   Pgno pgno;
3778   MemPage *pPage;
3779 
3780   assert( cursorHoldsMutex(pCur) );
3781   rc = restoreOrClearCursorPosition(pCur);
3782   if( rc!=SQLITE_OK ){
3783     return rc;
3784   }
3785   if( CURSOR_INVALID==pCur->eState ){
3786     *pRes = 1;
3787     return SQLITE_OK;
3788   }
3789   if( pCur->skip<0 ){
3790     pCur->skip = 0;
3791     *pRes = 0;
3792     return SQLITE_OK;
3793   }
3794   pCur->skip = 0;
3795 
3796   pPage = pCur->pPage;
3797   assert( pPage->isInit );
3798   assert( pCur->idx>=0 );
3799   if( !pPage->leaf ){
3800     pgno = get4byte( findCell(pPage, pCur->idx) );
3801     rc = moveToChild(pCur, pgno);
3802     if( rc ){
3803       return rc;
3804     }
3805     rc = moveToRightmost(pCur);
3806   }else{
3807     while( pCur->idx==0 ){
3808       if( sqlite3BtreeIsRootPage(pPage) ){
3809         pCur->eState = CURSOR_INVALID;
3810         *pRes = 1;
3811         return SQLITE_OK;
3812       }
3813       sqlite3BtreeMoveToParent(pCur);
3814       pPage = pCur->pPage;
3815     }
3816     pCur->idx--;
3817     pCur->info.nSize = 0;
3818     if( pPage->leafData && !pPage->leaf ){
3819       rc = sqlite3BtreePrevious(pCur, pRes);
3820     }else{
3821       rc = SQLITE_OK;
3822     }
3823   }
3824   *pRes = 0;
3825   return rc;
3826 }
3827 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3828   int rc;
3829   assert( cursorHoldsMutex(pCur) );
3830   rc = btreePrevious(pCur, pRes);
3831   return rc;
3832 }
3833 
3834 /*
3835 ** Allocate a new page from the database file.
3836 **
3837 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
3838 ** has already been called on the new page.)  The new page has also
3839 ** been referenced and the calling routine is responsible for calling
3840 ** sqlite3PagerUnref() on the new page when it is done.
3841 **
3842 ** SQLITE_OK is returned on success.  Any other return value indicates
3843 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
3844 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
3845 **
3846 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3847 ** locate a page close to the page number "nearby".  This can be used in an
3848 ** attempt to keep related pages close to each other in the database file,
3849 ** which in turn can make database access faster.
3850 **
3851 ** If the "exact" parameter is not 0, and the page-number nearby exists
3852 ** anywhere on the free-list, then it is guarenteed to be returned. This
3853 ** is only used by auto-vacuum databases when allocating a new table.
3854 */
3855 static int allocateBtreePage(
3856   BtShared *pBt,
3857   MemPage **ppPage,
3858   Pgno *pPgno,
3859   Pgno nearby,
3860   u8 exact
3861 ){
3862   MemPage *pPage1;
3863   int rc;
3864   int n;     /* Number of pages on the freelist */
3865   int k;     /* Number of leaves on the trunk of the freelist */
3866   MemPage *pTrunk = 0;
3867   MemPage *pPrevTrunk = 0;
3868 
3869   assert( sqlite3_mutex_held(pBt->mutex) );
3870   pPage1 = pBt->pPage1;
3871   n = get4byte(&pPage1->aData[36]);
3872   if( n>0 ){
3873     /* There are pages on the freelist.  Reuse one of those pages. */
3874     Pgno iTrunk;
3875     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3876 
3877     /* If the 'exact' parameter was true and a query of the pointer-map
3878     ** shows that the page 'nearby' is somewhere on the free-list, then
3879     ** the entire-list will be searched for that page.
3880     */
3881 #ifndef SQLITE_OMIT_AUTOVACUUM
3882     if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){
3883       u8 eType;
3884       assert( nearby>0 );
3885       assert( pBt->autoVacuum );
3886       rc = ptrmapGet(pBt, nearby, &eType, 0);
3887       if( rc ) return rc;
3888       if( eType==PTRMAP_FREEPAGE ){
3889         searchList = 1;
3890       }
3891       *pPgno = nearby;
3892     }
3893 #endif
3894 
3895     /* Decrement the free-list count by 1. Set iTrunk to the index of the
3896     ** first free-list trunk page. iPrevTrunk is initially 1.
3897     */
3898     rc = sqlite3PagerWrite(pPage1->pDbPage);
3899     if( rc ) return rc;
3900     put4byte(&pPage1->aData[36], n-1);
3901 
3902     /* The code within this loop is run only once if the 'searchList' variable
3903     ** is not true. Otherwise, it runs once for each trunk-page on the
3904     ** free-list until the page 'nearby' is located.
3905     */
3906     do {
3907       pPrevTrunk = pTrunk;
3908       if( pPrevTrunk ){
3909         iTrunk = get4byte(&pPrevTrunk->aData[0]);
3910       }else{
3911         iTrunk = get4byte(&pPage1->aData[32]);
3912       }
3913       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
3914       if( rc ){
3915         pTrunk = 0;
3916         goto end_allocate_page;
3917       }
3918 
3919       k = get4byte(&pTrunk->aData[4]);
3920       if( k==0 && !searchList ){
3921         /* The trunk has no leaves and the list is not being searched.
3922         ** So extract the trunk page itself and use it as the newly
3923         ** allocated page */
3924         assert( pPrevTrunk==0 );
3925         rc = sqlite3PagerWrite(pTrunk->pDbPage);
3926         if( rc ){
3927           goto end_allocate_page;
3928         }
3929         *pPgno = iTrunk;
3930         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3931         *ppPage = pTrunk;
3932         pTrunk = 0;
3933         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3934       }else if( k>pBt->usableSize/4 - 8 ){
3935         /* Value of k is out of range.  Database corruption */
3936         rc = SQLITE_CORRUPT_BKPT;
3937         goto end_allocate_page;
3938 #ifndef SQLITE_OMIT_AUTOVACUUM
3939       }else if( searchList && nearby==iTrunk ){
3940         /* The list is being searched and this trunk page is the page
3941         ** to allocate, regardless of whether it has leaves.
3942         */
3943         assert( *pPgno==iTrunk );
3944         *ppPage = pTrunk;
3945         searchList = 0;
3946         rc = sqlite3PagerWrite(pTrunk->pDbPage);
3947         if( rc ){
3948           goto end_allocate_page;
3949         }
3950         if( k==0 ){
3951           if( !pPrevTrunk ){
3952             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3953           }else{
3954             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3955           }
3956         }else{
3957           /* The trunk page is required by the caller but it contains
3958           ** pointers to free-list leaves. The first leaf becomes a trunk
3959           ** page in this case.
3960           */
3961           MemPage *pNewTrunk;
3962           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3963           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
3964           if( rc!=SQLITE_OK ){
3965             goto end_allocate_page;
3966           }
3967           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
3968           if( rc!=SQLITE_OK ){
3969             releasePage(pNewTrunk);
3970             goto end_allocate_page;
3971           }
3972           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3973           put4byte(&pNewTrunk->aData[4], k-1);
3974           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3975           releasePage(pNewTrunk);
3976           if( !pPrevTrunk ){
3977             put4byte(&pPage1->aData[32], iNewTrunk);
3978           }else{
3979             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
3980             if( rc ){
3981               goto end_allocate_page;
3982             }
3983             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3984           }
3985         }
3986         pTrunk = 0;
3987         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3988 #endif
3989       }else{
3990         /* Extract a leaf from the trunk */
3991         int closest;
3992         Pgno iPage;
3993         unsigned char *aData = pTrunk->aData;
3994         rc = sqlite3PagerWrite(pTrunk->pDbPage);
3995         if( rc ){
3996           goto end_allocate_page;
3997         }
3998         if( nearby>0 ){
3999           int i, dist;
4000           closest = 0;
4001           dist = get4byte(&aData[8]) - nearby;
4002           if( dist<0 ) dist = -dist;
4003           for(i=1; i<k; i++){
4004             int d2 = get4byte(&aData[8+i*4]) - nearby;
4005             if( d2<0 ) d2 = -d2;
4006             if( d2<dist ){
4007               closest = i;
4008               dist = d2;
4009             }
4010           }
4011         }else{
4012           closest = 0;
4013         }
4014 
4015         iPage = get4byte(&aData[8+closest*4]);
4016         if( !searchList || iPage==nearby ){
4017           *pPgno = iPage;
4018           if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){
4019             /* Free page off the end of the file */
4020             return SQLITE_CORRUPT_BKPT;
4021           }
4022           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4023                  ": %d more free pages\n",
4024                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
4025           if( closest<k-1 ){
4026             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4027           }
4028           put4byte(&aData[4], k-1);
4029           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4030           if( rc==SQLITE_OK ){
4031             sqlite3PagerDontRollback((*ppPage)->pDbPage);
4032             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4033             if( rc!=SQLITE_OK ){
4034               releasePage(*ppPage);
4035             }
4036           }
4037           searchList = 0;
4038         }
4039       }
4040       releasePage(pPrevTrunk);
4041       pPrevTrunk = 0;
4042     }while( searchList );
4043   }else{
4044     /* There are no pages on the freelist, so create a new page at the
4045     ** end of the file */
4046     *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1;
4047 
4048 #ifndef SQLITE_OMIT_AUTOVACUUM
4049     if( pBt->nTrunc ){
4050       /* An incr-vacuum has already run within this transaction. So the
4051       ** page to allocate is not from the physical end of the file, but
4052       ** at pBt->nTrunc.
4053       */
4054       *pPgno = pBt->nTrunc+1;
4055       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4056         (*pPgno)++;
4057       }
4058     }
4059     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4060       /* If *pPgno refers to a pointer-map page, allocate two new pages
4061       ** at the end of the file instead of one. The first allocated page
4062       ** becomes a new pointer-map page, the second is used by the caller.
4063       */
4064       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4065       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4066       (*pPgno)++;
4067     }
4068     if( pBt->nTrunc ){
4069       pBt->nTrunc = *pPgno;
4070     }
4071 #endif
4072 
4073     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4074     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4075     if( rc ) return rc;
4076     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4077     if( rc!=SQLITE_OK ){
4078       releasePage(*ppPage);
4079     }
4080     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4081   }
4082 
4083   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4084 
4085 end_allocate_page:
4086   releasePage(pTrunk);
4087   releasePage(pPrevTrunk);
4088   return rc;
4089 }
4090 
4091 /*
4092 ** Add a page of the database file to the freelist.
4093 **
4094 ** sqlite3PagerUnref() is NOT called for pPage.
4095 */
4096 static int freePage(MemPage *pPage){
4097   BtShared *pBt = pPage->pBt;
4098   MemPage *pPage1 = pBt->pPage1;
4099   int rc, n, k;
4100 
4101   /* Prepare the page for freeing */
4102   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4103   assert( pPage->pgno>1 );
4104   pPage->isInit = 0;
4105   releasePage(pPage->pParent);
4106   pPage->pParent = 0;
4107 
4108   /* Increment the free page count on pPage1 */
4109   rc = sqlite3PagerWrite(pPage1->pDbPage);
4110   if( rc ) return rc;
4111   n = get4byte(&pPage1->aData[36]);
4112   put4byte(&pPage1->aData[36], n+1);
4113 
4114 #ifdef SQLITE_SECURE_DELETE
4115   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4116   ** always fully overwrite deleted information with zeros.
4117   */
4118   rc = sqlite3PagerWrite(pPage->pDbPage);
4119   if( rc ) return rc;
4120   memset(pPage->aData, 0, pPage->pBt->pageSize);
4121 #endif
4122 
4123 #ifndef SQLITE_OMIT_AUTOVACUUM
4124   /* If the database supports auto-vacuum, write an entry in the pointer-map
4125   ** to indicate that the page is free.
4126   */
4127   if( pBt->autoVacuum ){
4128     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4129     if( rc ) return rc;
4130   }
4131 #endif
4132 
4133   if( n==0 ){
4134     /* This is the first free page */
4135     rc = sqlite3PagerWrite(pPage->pDbPage);
4136     if( rc ) return rc;
4137     memset(pPage->aData, 0, 8);
4138     put4byte(&pPage1->aData[32], pPage->pgno);
4139     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4140   }else{
4141     /* Other free pages already exist.  Retrive the first trunk page
4142     ** of the freelist and find out how many leaves it has. */
4143     MemPage *pTrunk;
4144     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4145     if( rc ) return rc;
4146     k = get4byte(&pTrunk->aData[4]);
4147     if( k>=pBt->usableSize/4 - 8 ){
4148       /* The trunk is full.  Turn the page being freed into a new
4149       ** trunk page with no leaves. */
4150       rc = sqlite3PagerWrite(pPage->pDbPage);
4151       if( rc==SQLITE_OK ){
4152         put4byte(pPage->aData, pTrunk->pgno);
4153         put4byte(&pPage->aData[4], 0);
4154         put4byte(&pPage1->aData[32], pPage->pgno);
4155         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4156                 pPage->pgno, pTrunk->pgno));
4157       }
4158     }else if( k<0 ){
4159       rc = SQLITE_CORRUPT;
4160     }else{
4161       /* Add the newly freed page as a leaf on the current trunk */
4162       rc = sqlite3PagerWrite(pTrunk->pDbPage);
4163       if( rc==SQLITE_OK ){
4164         put4byte(&pTrunk->aData[4], k+1);
4165         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4166 #ifndef SQLITE_SECURE_DELETE
4167         sqlite3PagerDontWrite(pPage->pDbPage);
4168 #endif
4169       }
4170       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4171     }
4172     releasePage(pTrunk);
4173   }
4174   return rc;
4175 }
4176 
4177 /*
4178 ** Free any overflow pages associated with the given Cell.
4179 */
4180 static int clearCell(MemPage *pPage, unsigned char *pCell){
4181   BtShared *pBt = pPage->pBt;
4182   CellInfo info;
4183   Pgno ovflPgno;
4184   int rc;
4185   int nOvfl;
4186   int ovflPageSize;
4187 
4188   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4189   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4190   if( info.iOverflow==0 ){
4191     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
4192   }
4193   ovflPgno = get4byte(&pCell[info.iOverflow]);
4194   ovflPageSize = pBt->usableSize - 4;
4195   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4196   assert( ovflPgno==0 || nOvfl>0 );
4197   while( nOvfl-- ){
4198     MemPage *pOvfl;
4199     if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){
4200       return SQLITE_CORRUPT_BKPT;
4201     }
4202 
4203     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4204     if( rc ) return rc;
4205     rc = freePage(pOvfl);
4206     sqlite3PagerUnref(pOvfl->pDbPage);
4207     if( rc ) return rc;
4208   }
4209   return SQLITE_OK;
4210 }
4211 
4212 /*
4213 ** Create the byte sequence used to represent a cell on page pPage
4214 ** and write that byte sequence into pCell[].  Overflow pages are
4215 ** allocated and filled in as necessary.  The calling procedure
4216 ** is responsible for making sure sufficient space has been allocated
4217 ** for pCell[].
4218 **
4219 ** Note that pCell does not necessary need to point to the pPage->aData
4220 ** area.  pCell might point to some temporary storage.  The cell will
4221 ** be constructed in this temporary area then copied into pPage->aData
4222 ** later.
4223 */
4224 static int fillInCell(
4225   MemPage *pPage,                /* The page that contains the cell */
4226   unsigned char *pCell,          /* Complete text of the cell */
4227   const void *pKey, i64 nKey,    /* The key */
4228   const void *pData,int nData,   /* The data */
4229   int nZero,                     /* Extra zero bytes to append to pData */
4230   int *pnSize                    /* Write cell size here */
4231 ){
4232   int nPayload;
4233   const u8 *pSrc;
4234   int nSrc, n, rc;
4235   int spaceLeft;
4236   MemPage *pOvfl = 0;
4237   MemPage *pToRelease = 0;
4238   unsigned char *pPrior;
4239   unsigned char *pPayload;
4240   BtShared *pBt = pPage->pBt;
4241   Pgno pgnoOvfl = 0;
4242   int nHeader;
4243   CellInfo info;
4244 
4245   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4246 
4247   /* Fill in the header. */
4248   nHeader = 0;
4249   if( !pPage->leaf ){
4250     nHeader += 4;
4251   }
4252   if( pPage->hasData ){
4253     nHeader += putVarint(&pCell[nHeader], nData+nZero);
4254   }else{
4255     nData = nZero = 0;
4256   }
4257   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4258   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4259   assert( info.nHeader==nHeader );
4260   assert( info.nKey==nKey );
4261   assert( info.nData==nData+nZero );
4262 
4263   /* Fill in the payload */
4264   nPayload = nData + nZero;
4265   if( pPage->intKey ){
4266     pSrc = pData;
4267     nSrc = nData;
4268     nData = 0;
4269   }else{
4270     nPayload += nKey;
4271     pSrc = pKey;
4272     nSrc = nKey;
4273   }
4274   *pnSize = info.nSize;
4275   spaceLeft = info.nLocal;
4276   pPayload = &pCell[nHeader];
4277   pPrior = &pCell[info.iOverflow];
4278 
4279   while( nPayload>0 ){
4280     if( spaceLeft==0 ){
4281       int isExact = 0;
4282 #ifndef SQLITE_OMIT_AUTOVACUUM
4283       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4284       if( pBt->autoVacuum ){
4285         do{
4286           pgnoOvfl++;
4287         } while(
4288           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4289         );
4290         if( pgnoOvfl>1 ){
4291           /* isExact = 1; */
4292         }
4293       }
4294 #endif
4295       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4296 #ifndef SQLITE_OMIT_AUTOVACUUM
4297       /* If the database supports auto-vacuum, and the second or subsequent
4298       ** overflow page is being allocated, add an entry to the pointer-map
4299       ** for that page now.
4300       **
4301       ** If this is the first overflow page, then write a partial entry
4302       ** to the pointer-map. If we write nothing to this pointer-map slot,
4303       ** then the optimistic overflow chain processing in clearCell()
4304       ** may misinterpret the uninitialised values and delete the
4305       ** wrong pages from the database.
4306       */
4307       if( pBt->autoVacuum && rc==SQLITE_OK ){
4308         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4309         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4310         if( rc ){
4311           releasePage(pOvfl);
4312         }
4313       }
4314 #endif
4315       if( rc ){
4316         releasePage(pToRelease);
4317         return rc;
4318       }
4319       put4byte(pPrior, pgnoOvfl);
4320       releasePage(pToRelease);
4321       pToRelease = pOvfl;
4322       pPrior = pOvfl->aData;
4323       put4byte(pPrior, 0);
4324       pPayload = &pOvfl->aData[4];
4325       spaceLeft = pBt->usableSize - 4;
4326     }
4327     n = nPayload;
4328     if( n>spaceLeft ) n = spaceLeft;
4329     if( nSrc>0 ){
4330       if( n>nSrc ) n = nSrc;
4331       assert( pSrc );
4332       memcpy(pPayload, pSrc, n);
4333     }else{
4334       memset(pPayload, 0, n);
4335     }
4336     nPayload -= n;
4337     pPayload += n;
4338     pSrc += n;
4339     nSrc -= n;
4340     spaceLeft -= n;
4341     if( nSrc==0 ){
4342       nSrc = nData;
4343       pSrc = pData;
4344     }
4345   }
4346   releasePage(pToRelease);
4347   return SQLITE_OK;
4348 }
4349 
4350 /*
4351 ** Change the MemPage.pParent pointer on the page whose number is
4352 ** given in the second argument so that MemPage.pParent holds the
4353 ** pointer in the third argument.
4354 */
4355 static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
4356   MemPage *pThis;
4357   DbPage *pDbPage;
4358 
4359   assert( sqlite3_mutex_held(pBt->mutex) );
4360   assert( pNewParent!=0 );
4361   if( pgno==0 ) return SQLITE_OK;
4362   assert( pBt->pPager!=0 );
4363   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
4364   if( pDbPage ){
4365     pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
4366     if( pThis->isInit ){
4367       assert( pThis->aData==sqlite3PagerGetData(pDbPage) );
4368       if( pThis->pParent!=pNewParent ){
4369         if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
4370         pThis->pParent = pNewParent;
4371         sqlite3PagerRef(pNewParent->pDbPage);
4372       }
4373       pThis->idxParent = idx;
4374     }
4375     sqlite3PagerUnref(pDbPage);
4376   }
4377 
4378 #ifndef SQLITE_OMIT_AUTOVACUUM
4379   if( pBt->autoVacuum ){
4380     return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
4381   }
4382 #endif
4383   return SQLITE_OK;
4384 }
4385 
4386 
4387 
4388 /*
4389 ** Change the pParent pointer of all children of pPage to point back
4390 ** to pPage.
4391 **
4392 ** In other words, for every child of pPage, invoke reparentPage()
4393 ** to make sure that each child knows that pPage is its parent.
4394 **
4395 ** This routine gets called after you memcpy() one page into
4396 ** another.
4397 */
4398 static int reparentChildPages(MemPage *pPage){
4399   int i;
4400   BtShared *pBt = pPage->pBt;
4401   int rc = SQLITE_OK;
4402 
4403   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4404   if( pPage->leaf ) return SQLITE_OK;
4405 
4406   for(i=0; i<pPage->nCell; i++){
4407     u8 *pCell = findCell(pPage, i);
4408     if( !pPage->leaf ){
4409       rc = reparentPage(pBt, get4byte(pCell), pPage, i);
4410       if( rc!=SQLITE_OK ) return rc;
4411     }
4412   }
4413   if( !pPage->leaf ){
4414     rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
4415        pPage, i);
4416     pPage->idxShift = 0;
4417   }
4418   return rc;
4419 }
4420 
4421 /*
4422 ** Remove the i-th cell from pPage.  This routine effects pPage only.
4423 ** The cell content is not freed or deallocated.  It is assumed that
4424 ** the cell content has been copied someplace else.  This routine just
4425 ** removes the reference to the cell from pPage.
4426 **
4427 ** "sz" must be the number of bytes in the cell.
4428 */
4429 static void dropCell(MemPage *pPage, int idx, int sz){
4430   int i;          /* Loop counter */
4431   int pc;         /* Offset to cell content of cell being deleted */
4432   u8 *data;       /* pPage->aData */
4433   u8 *ptr;        /* Used to move bytes around within data[] */
4434 
4435   assert( idx>=0 && idx<pPage->nCell );
4436   assert( sz==cellSize(pPage, idx) );
4437   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4438   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4439   data = pPage->aData;
4440   ptr = &data[pPage->cellOffset + 2*idx];
4441   pc = get2byte(ptr);
4442   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4443   freeSpace(pPage, pc, sz);
4444   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4445     ptr[0] = ptr[2];
4446     ptr[1] = ptr[3];
4447   }
4448   pPage->nCell--;
4449   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4450   pPage->nFree += 2;
4451   pPage->idxShift = 1;
4452 }
4453 
4454 /*
4455 ** Insert a new cell on pPage at cell index "i".  pCell points to the
4456 ** content of the cell.
4457 **
4458 ** If the cell content will fit on the page, then put it there.  If it
4459 ** will not fit, then make a copy of the cell content into pTemp if
4460 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
4461 ** in pPage->aOvfl[] and make it point to the cell content (either
4462 ** in pTemp or the original pCell) and also record its index.
4463 ** Allocating a new entry in pPage->aCell[] implies that
4464 ** pPage->nOverflow is incremented.
4465 **
4466 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4467 ** cell. The caller will overwrite them after this function returns. If
4468 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4469 ** (but pCell+nSkip is always valid).
4470 */
4471 static int insertCell(
4472   MemPage *pPage,   /* Page into which we are copying */
4473   int i,            /* New cell becomes the i-th cell of the page */
4474   u8 *pCell,        /* Content of the new cell */
4475   int sz,           /* Bytes of content in pCell */
4476   u8 *pTemp,        /* Temp storage space for pCell, if needed */
4477   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
4478 ){
4479   int idx;          /* Where to write new cell content in data[] */
4480   int j;            /* Loop counter */
4481   int top;          /* First byte of content for any cell in data[] */
4482   int end;          /* First byte past the last cell pointer in data[] */
4483   int ins;          /* Index in data[] where new cell pointer is inserted */
4484   int hdr;          /* Offset into data[] of the page header */
4485   int cellOffset;   /* Address of first cell pointer in data[] */
4486   u8 *data;         /* The content of the whole page */
4487   u8 *ptr;          /* Used for moving information around in data[] */
4488 
4489   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4490   assert( sz==cellSizePtr(pPage, pCell) );
4491   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4492   if( pPage->nOverflow || sz+2>pPage->nFree ){
4493     if( pTemp ){
4494       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4495       pCell = pTemp;
4496     }
4497     j = pPage->nOverflow++;
4498     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4499     pPage->aOvfl[j].pCell = pCell;
4500     pPage->aOvfl[j].idx = i;
4501     pPage->nFree = 0;
4502   }else{
4503     int rc = sqlite3PagerWrite(pPage->pDbPage);
4504     if( rc!=SQLITE_OK ){
4505       return rc;
4506     }
4507     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4508     data = pPage->aData;
4509     hdr = pPage->hdrOffset;
4510     top = get2byte(&data[hdr+5]);
4511     cellOffset = pPage->cellOffset;
4512     end = cellOffset + 2*pPage->nCell + 2;
4513     ins = cellOffset + 2*i;
4514     if( end > top - sz ){
4515       rc = defragmentPage(pPage);
4516       if( rc!=SQLITE_OK ) return rc;
4517       top = get2byte(&data[hdr+5]);
4518       assert( end + sz <= top );
4519     }
4520     idx = allocateSpace(pPage, sz);
4521     assert( idx>0 );
4522     assert( end <= get2byte(&data[hdr+5]) );
4523     pPage->nCell++;
4524     pPage->nFree -= 2;
4525     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4526     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4527       ptr[0] = ptr[-2];
4528       ptr[1] = ptr[-1];
4529     }
4530     put2byte(&data[ins], idx);
4531     put2byte(&data[hdr+3], pPage->nCell);
4532     pPage->idxShift = 1;
4533 #ifndef SQLITE_OMIT_AUTOVACUUM
4534     if( pPage->pBt->autoVacuum ){
4535       /* The cell may contain a pointer to an overflow page. If so, write
4536       ** the entry for the overflow page into the pointer map.
4537       */
4538       CellInfo info;
4539       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4540       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4541       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4542         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4543         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4544         if( rc!=SQLITE_OK ) return rc;
4545       }
4546     }
4547 #endif
4548   }
4549 
4550   return SQLITE_OK;
4551 }
4552 
4553 /*
4554 ** Add a list of cells to a page.  The page should be initially empty.
4555 ** The cells are guaranteed to fit on the page.
4556 */
4557 static void assemblePage(
4558   MemPage *pPage,   /* The page to be assemblied */
4559   int nCell,        /* The number of cells to add to this page */
4560   u8 **apCell,      /* Pointers to cell bodies */
4561   int *aSize        /* Sizes of the cells */
4562 ){
4563   int i;            /* Loop counter */
4564   int totalSize;    /* Total size of all cells */
4565   int hdr;          /* Index of page header */
4566   int cellptr;      /* Address of next cell pointer */
4567   int cellbody;     /* Address of next cell body */
4568   u8 *data;         /* Data for the page */
4569 
4570   assert( pPage->nOverflow==0 );
4571   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4572   totalSize = 0;
4573   for(i=0; i<nCell; i++){
4574     totalSize += aSize[i];
4575   }
4576   assert( totalSize+2*nCell<=pPage->nFree );
4577   assert( pPage->nCell==0 );
4578   cellptr = pPage->cellOffset;
4579   data = pPage->aData;
4580   hdr = pPage->hdrOffset;
4581   put2byte(&data[hdr+3], nCell);
4582   if( nCell ){
4583     cellbody = allocateSpace(pPage, totalSize);
4584     assert( cellbody>0 );
4585     assert( pPage->nFree >= 2*nCell );
4586     pPage->nFree -= 2*nCell;
4587     for(i=0; i<nCell; i++){
4588       put2byte(&data[cellptr], cellbody);
4589       memcpy(&data[cellbody], apCell[i], aSize[i]);
4590       cellptr += 2;
4591       cellbody += aSize[i];
4592     }
4593     assert( cellbody==pPage->pBt->usableSize );
4594   }
4595   pPage->nCell = nCell;
4596 }
4597 
4598 /*
4599 ** The following parameters determine how many adjacent pages get involved
4600 ** in a balancing operation.  NN is the number of neighbors on either side
4601 ** of the page that participate in the balancing operation.  NB is the
4602 ** total number of pages that participate, including the target page and
4603 ** NN neighbors on either side.
4604 **
4605 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
4606 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4607 ** in exchange for a larger degradation in INSERT and UPDATE performance.
4608 ** The value of NN appears to give the best results overall.
4609 */
4610 #define NN 1             /* Number of neighbors on either side of pPage */
4611 #define NB (NN*2+1)      /* Total pages involved in the balance */
4612 
4613 /* Forward reference */
4614 static int balance(MemPage*, int);
4615 
4616 #ifndef SQLITE_OMIT_QUICKBALANCE
4617 /*
4618 ** This version of balance() handles the common special case where
4619 ** a new entry is being inserted on the extreme right-end of the
4620 ** tree, in other words, when the new entry will become the largest
4621 ** entry in the tree.
4622 **
4623 ** Instead of trying balance the 3 right-most leaf pages, just add
4624 ** a new page to the right-hand side and put the one new entry in
4625 ** that page.  This leaves the right side of the tree somewhat
4626 ** unbalanced.  But odds are that we will be inserting new entries
4627 ** at the end soon afterwards so the nearly empty page will quickly
4628 ** fill up.  On average.
4629 **
4630 ** pPage is the leaf page which is the right-most page in the tree.
4631 ** pParent is its parent.  pPage must have a single overflow entry
4632 ** which is also the right-most entry on the page.
4633 */
4634 static int balance_quick(MemPage *pPage, MemPage *pParent){
4635   int rc;
4636   MemPage *pNew;
4637   Pgno pgnoNew;
4638   u8 *pCell;
4639   int szCell;
4640   CellInfo info;
4641   BtShared *pBt = pPage->pBt;
4642   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
4643   int parentSize;                   /* Size of new divider cell */
4644   u8 parentCell[64];                /* Space for the new divider cell */
4645 
4646   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4647 
4648   /* Allocate a new page. Insert the overflow cell from pPage
4649   ** into it. Then remove the overflow cell from pPage.
4650   */
4651   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4652   if( rc!=SQLITE_OK ){
4653     return rc;
4654   }
4655   pCell = pPage->aOvfl[0].pCell;
4656   szCell = cellSizePtr(pPage, pCell);
4657   zeroPage(pNew, pPage->aData[0]);
4658   assemblePage(pNew, 1, &pCell, &szCell);
4659   pPage->nOverflow = 0;
4660 
4661   /* Set the parent of the newly allocated page to pParent. */
4662   pNew->pParent = pParent;
4663   sqlite3PagerRef(pParent->pDbPage);
4664 
4665   /* pPage is currently the right-child of pParent. Change this
4666   ** so that the right-child is the new page allocated above and
4667   ** pPage is the next-to-right child.
4668   */
4669   assert( pPage->nCell>0 );
4670   pCell = findCell(pPage, pPage->nCell-1);
4671   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4672   rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4673   if( rc!=SQLITE_OK ){
4674     return rc;
4675   }
4676   assert( parentSize<64 );
4677   rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4678   if( rc!=SQLITE_OK ){
4679     return rc;
4680   }
4681   put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4682   put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4683 
4684 #ifndef SQLITE_OMIT_AUTOVACUUM
4685   /* If this is an auto-vacuum database, update the pointer map
4686   ** with entries for the new page, and any pointer from the
4687   ** cell on the page to an overflow page.
4688   */
4689   if( pBt->autoVacuum ){
4690     rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4691     if( rc==SQLITE_OK ){
4692       rc = ptrmapPutOvfl(pNew, 0);
4693     }
4694     if( rc!=SQLITE_OK ){
4695       releasePage(pNew);
4696       return rc;
4697     }
4698   }
4699 #endif
4700 
4701   /* Release the reference to the new page and balance the parent page,
4702   ** in case the divider cell inserted caused it to become overfull.
4703   */
4704   releasePage(pNew);
4705   return balance(pParent, 0);
4706 }
4707 #endif /* SQLITE_OMIT_QUICKBALANCE */
4708 
4709 /*
4710 ** This routine redistributes Cells on pPage and up to NN*2 siblings
4711 ** of pPage so that all pages have about the same amount of free space.
4712 ** Usually NN siblings on either side of pPage is used in the balancing,
4713 ** though more siblings might come from one side if pPage is the first
4714 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
4715 ** (something which can only happen if pPage is the root page or a
4716 ** child of root) then all available siblings participate in the balancing.
4717 **
4718 ** The number of siblings of pPage might be increased or decreased by one or
4719 ** two in an effort to keep pages nearly full but not over full. The root page
4720 ** is special and is allowed to be nearly empty. If pPage is
4721 ** the root page, then the depth of the tree might be increased
4722 ** or decreased by one, as necessary, to keep the root page from being
4723 ** overfull or completely empty.
4724 **
4725 ** Note that when this routine is called, some of the Cells on pPage
4726 ** might not actually be stored in pPage->aData[].  This can happen
4727 ** if the page is overfull.  Part of the job of this routine is to
4728 ** make sure all Cells for pPage once again fit in pPage->aData[].
4729 **
4730 ** In the course of balancing the siblings of pPage, the parent of pPage
4731 ** might become overfull or underfull.  If that happens, then this routine
4732 ** is called recursively on the parent.
4733 **
4734 ** If this routine fails for any reason, it might leave the database
4735 ** in a corrupted state.  So if this routine fails, the database should
4736 ** be rolled back.
4737 */
4738 static int balance_nonroot(MemPage *pPage){
4739   MemPage *pParent;            /* The parent of pPage */
4740   BtShared *pBt;               /* The whole database */
4741   int nCell = 0;               /* Number of cells in apCell[] */
4742   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
4743   int nOld;                    /* Number of pages in apOld[] */
4744   int nNew;                    /* Number of pages in apNew[] */
4745   int nDiv;                    /* Number of cells in apDiv[] */
4746   int i, j, k;                 /* Loop counters */
4747   int idx;                     /* Index of pPage in pParent->aCell[] */
4748   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
4749   int rc;                      /* The return code */
4750   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
4751   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
4752   int usableSpace;             /* Bytes in pPage beyond the header */
4753   int pageFlags;               /* Value of pPage->aData[0] */
4754   int subtotal;                /* Subtotal of bytes in cells on one page */
4755   int iSpace = 0;              /* First unused byte of aSpace[] */
4756   MemPage *apOld[NB];          /* pPage and up to two siblings */
4757   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
4758   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
4759   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
4760   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
4761   u8 *apDiv[NB];               /* Divider cells in pParent */
4762   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
4763   int szNew[NB+2];             /* Combined size of cells place on i-th page */
4764   u8 **apCell = 0;             /* All cells begin balanced */
4765   int *szCell;                 /* Local size of all cells in apCell[] */
4766   u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
4767   u8 *aSpace;                  /* Space to hold copies of dividers cells */
4768 #ifndef SQLITE_OMIT_AUTOVACUUM
4769   u8 *aFrom = 0;
4770 #endif
4771 
4772   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4773 
4774   /*
4775   ** Find the parent page.
4776   */
4777   assert( pPage->isInit );
4778   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4779   pBt = pPage->pBt;
4780   pParent = pPage->pParent;
4781   assert( pParent );
4782   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4783     return rc;
4784   }
4785   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4786 
4787 #ifndef SQLITE_OMIT_QUICKBALANCE
4788   /*
4789   ** A special case:  If a new entry has just been inserted into a
4790   ** table (that is, a btree with integer keys and all data at the leaves)
4791   ** and the new entry is the right-most entry in the tree (it has the
4792   ** largest key) then use the special balance_quick() routine for
4793   ** balancing.  balance_quick() is much faster and results in a tighter
4794   ** packing of data in the common case.
4795   */
4796   if( pPage->leaf &&
4797       pPage->intKey &&
4798       pPage->leafData &&
4799       pPage->nOverflow==1 &&
4800       pPage->aOvfl[0].idx==pPage->nCell &&
4801       pPage->pParent->pgno!=1 &&
4802       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4803   ){
4804     /*
4805     ** TODO: Check the siblings to the left of pPage. It may be that
4806     ** they are not full and no new page is required.
4807     */
4808     return balance_quick(pPage, pParent);
4809   }
4810 #endif
4811 
4812   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4813     return rc;
4814   }
4815 
4816   /*
4817   ** Find the cell in the parent page whose left child points back
4818   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
4819   ** is the rightmost child of pParent then set idx to pParent->nCell
4820   */
4821   if( pParent->idxShift ){
4822     Pgno pgno;
4823     pgno = pPage->pgno;
4824     assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
4825     for(idx=0; idx<pParent->nCell; idx++){
4826       if( get4byte(findCell(pParent, idx))==pgno ){
4827         break;
4828       }
4829     }
4830     assert( idx<pParent->nCell
4831              || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
4832   }else{
4833     idx = pPage->idxParent;
4834   }
4835 
4836   /*
4837   ** Initialize variables so that it will be safe to jump
4838   ** directly to balance_cleanup at any moment.
4839   */
4840   nOld = nNew = 0;
4841   sqlite3PagerRef(pParent->pDbPage);
4842 
4843   /*
4844   ** Find sibling pages to pPage and the cells in pParent that divide
4845   ** the siblings.  An attempt is made to find NN siblings on either
4846   ** side of pPage.  More siblings are taken from one side, however, if
4847   ** pPage there are fewer than NN siblings on the other side.  If pParent
4848   ** has NB or fewer children then all children of pParent are taken.
4849   */
4850   nxDiv = idx - NN;
4851   if( nxDiv + NB > pParent->nCell ){
4852     nxDiv = pParent->nCell - NB + 1;
4853   }
4854   if( nxDiv<0 ){
4855     nxDiv = 0;
4856   }
4857   nDiv = 0;
4858   for(i=0, k=nxDiv; i<NB; i++, k++){
4859     if( k<pParent->nCell ){
4860       apDiv[i] = findCell(pParent, k);
4861       nDiv++;
4862       assert( !pParent->leaf );
4863       pgnoOld[i] = get4byte(apDiv[i]);
4864     }else if( k==pParent->nCell ){
4865       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
4866     }else{
4867       break;
4868     }
4869     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
4870     if( rc ) goto balance_cleanup;
4871     apOld[i]->idxParent = k;
4872     apCopy[i] = 0;
4873     assert( i==nOld );
4874     nOld++;
4875     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
4876   }
4877 
4878   /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
4879   ** alignment */
4880   nMaxCells = (nMaxCells + 1)&~1;
4881 
4882   /*
4883   ** Allocate space for memory structures
4884   */
4885   apCell = sqlite3_malloc(
4886        nMaxCells*sizeof(u8*)                           /* apCell */
4887      + nMaxCells*sizeof(int)                           /* szCell */
4888      + ROUND8(sizeof(MemPage))*NB                      /* aCopy */
4889      + pBt->pageSize*(5+NB)                            /* aSpace */
4890      + (ISAUTOVACUUM ? nMaxCells : 0)                  /* aFrom */
4891   );
4892   if( apCell==0 ){
4893     rc = SQLITE_NOMEM;
4894     goto balance_cleanup;
4895   }
4896   szCell = (int*)&apCell[nMaxCells];
4897   aCopy[0] = (u8*)&szCell[nMaxCells];
4898   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4899   for(i=1; i<NB; i++){
4900     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4901     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4902   }
4903   aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4904   assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4905 #ifndef SQLITE_OMIT_AUTOVACUUM
4906   if( pBt->autoVacuum ){
4907     aFrom = &aSpace[5*pBt->pageSize];
4908   }
4909 #endif
4910 
4911   /*
4912   ** Make copies of the content of pPage and its siblings into aOld[].
4913   ** The rest of this function will use data from the copies rather
4914   ** that the original pages since the original pages will be in the
4915   ** process of being overwritten.
4916   */
4917   for(i=0; i<nOld; i++){
4918     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
4919     memcpy(p, apOld[i], sizeof(MemPage));
4920     p->aData = (void*)&p[1];
4921     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
4922   }
4923 
4924   /*
4925   ** Load pointers to all cells on sibling pages and the divider cells
4926   ** into the local apCell[] array.  Make copies of the divider cells
4927   ** into space obtained form aSpace[] and remove the the divider Cells
4928   ** from pParent.
4929   **
4930   ** If the siblings are on leaf pages, then the child pointers of the
4931   ** divider cells are stripped from the cells before they are copied
4932   ** into aSpace[].  In this way, all cells in apCell[] are without
4933   ** child pointers.  If siblings are not leaves, then all cell in
4934   ** apCell[] include child pointers.  Either way, all cells in apCell[]
4935   ** are alike.
4936   **
4937   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
4938   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
4939   */
4940   nCell = 0;
4941   leafCorrection = pPage->leaf*4;
4942   leafData = pPage->leafData && pPage->leaf;
4943   for(i=0; i<nOld; i++){
4944     MemPage *pOld = apCopy[i];
4945     int limit = pOld->nCell+pOld->nOverflow;
4946     for(j=0; j<limit; j++){
4947       assert( nCell<nMaxCells );
4948       apCell[nCell] = findOverflowCell(pOld, j);
4949       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
4950 #ifndef SQLITE_OMIT_AUTOVACUUM
4951       if( pBt->autoVacuum ){
4952         int a;
4953         aFrom[nCell] = i;
4954         for(a=0; a<pOld->nOverflow; a++){
4955           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
4956             aFrom[nCell] = 0xFF;
4957             break;
4958           }
4959         }
4960       }
4961 #endif
4962       nCell++;
4963     }
4964     if( i<nOld-1 ){
4965       int sz = cellSizePtr(pParent, apDiv[i]);
4966       if( leafData ){
4967         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
4968         ** are duplicates of keys on the child pages.  We need to remove
4969         ** the divider cells from pParent, but the dividers cells are not
4970         ** added to apCell[] because they are duplicates of child cells.
4971         */
4972         dropCell(pParent, nxDiv, sz);
4973       }else{
4974         u8 *pTemp;
4975         assert( nCell<nMaxCells );
4976         szCell[nCell] = sz;
4977         pTemp = &aSpace[iSpace];
4978         iSpace += sz;
4979         assert( iSpace<=pBt->pageSize*5 );
4980         memcpy(pTemp, apDiv[i], sz);
4981         apCell[nCell] = pTemp+leafCorrection;
4982 #ifndef SQLITE_OMIT_AUTOVACUUM
4983         if( pBt->autoVacuum ){
4984           aFrom[nCell] = 0xFF;
4985         }
4986 #endif
4987         dropCell(pParent, nxDiv, sz);
4988         szCell[nCell] -= leafCorrection;
4989         assert( get4byte(pTemp)==pgnoOld[i] );
4990         if( !pOld->leaf ){
4991           assert( leafCorrection==0 );
4992           /* The right pointer of the child page pOld becomes the left
4993           ** pointer of the divider cell */
4994           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4995         }else{
4996           assert( leafCorrection==4 );
4997           if( szCell[nCell]<4 ){
4998             /* Do not allow any cells smaller than 4 bytes. */
4999             szCell[nCell] = 4;
5000           }
5001         }
5002         nCell++;
5003       }
5004     }
5005   }
5006 
5007   /*
5008   ** Figure out the number of pages needed to hold all nCell cells.
5009   ** Store this number in "k".  Also compute szNew[] which is the total
5010   ** size of all cells on the i-th page and cntNew[] which is the index
5011   ** in apCell[] of the cell that divides page i from page i+1.
5012   ** cntNew[k] should equal nCell.
5013   **
5014   ** Values computed by this block:
5015   **
5016   **           k: The total number of sibling pages
5017   **    szNew[i]: Spaced used on the i-th sibling page.
5018   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5019   **              the right of the i-th sibling page.
5020   ** usableSpace: Number of bytes of space available on each sibling.
5021   **
5022   */
5023   usableSpace = pBt->usableSize - 12 + leafCorrection;
5024   for(subtotal=k=i=0; i<nCell; i++){
5025     assert( i<nMaxCells );
5026     subtotal += szCell[i] + 2;
5027     if( subtotal > usableSpace ){
5028       szNew[k] = subtotal - szCell[i];
5029       cntNew[k] = i;
5030       if( leafData ){ i--; }
5031       subtotal = 0;
5032       k++;
5033     }
5034   }
5035   szNew[k] = subtotal;
5036   cntNew[k] = nCell;
5037   k++;
5038 
5039   /*
5040   ** The packing computed by the previous block is biased toward the siblings
5041   ** on the left side.  The left siblings are always nearly full, while the
5042   ** right-most sibling might be nearly empty.  This block of code attempts
5043   ** to adjust the packing of siblings to get a better balance.
5044   **
5045   ** This adjustment is more than an optimization.  The packing above might
5046   ** be so out of balance as to be illegal.  For example, the right-most
5047   ** sibling might be completely empty.  This adjustment is not optional.
5048   */
5049   for(i=k-1; i>0; i--){
5050     int szRight = szNew[i];  /* Size of sibling on the right */
5051     int szLeft = szNew[i-1]; /* Size of sibling on the left */
5052     int r;              /* Index of right-most cell in left sibling */
5053     int d;              /* Index of first cell to the left of right sibling */
5054 
5055     r = cntNew[i-1] - 1;
5056     d = r + 1 - leafData;
5057     assert( d<nMaxCells );
5058     assert( r<nMaxCells );
5059     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5060       szRight += szCell[d] + 2;
5061       szLeft -= szCell[r] + 2;
5062       cntNew[i-1]--;
5063       r = cntNew[i-1] - 1;
5064       d = r + 1 - leafData;
5065     }
5066     szNew[i] = szRight;
5067     szNew[i-1] = szLeft;
5068   }
5069 
5070   /* Either we found one or more cells (cntnew[0])>0) or we are the
5071   ** a virtual root page.  A virtual root page is when the real root
5072   ** page is page 1 and we are the only child of that page.
5073   */
5074   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5075 
5076   /*
5077   ** Allocate k new pages.  Reuse old pages where possible.
5078   */
5079   assert( pPage->pgno>1 );
5080   pageFlags = pPage->aData[0];
5081   for(i=0; i<k; i++){
5082     MemPage *pNew;
5083     if( i<nOld ){
5084       pNew = apNew[i] = apOld[i];
5085       pgnoNew[i] = pgnoOld[i];
5086       apOld[i] = 0;
5087       rc = sqlite3PagerWrite(pNew->pDbPage);
5088       nNew++;
5089       if( rc ) goto balance_cleanup;
5090     }else{
5091       assert( i>0 );
5092       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5093       if( rc ) goto balance_cleanup;
5094       apNew[i] = pNew;
5095       nNew++;
5096     }
5097     zeroPage(pNew, pageFlags);
5098   }
5099 
5100   /* Free any old pages that were not reused as new pages.
5101   */
5102   while( i<nOld ){
5103     rc = freePage(apOld[i]);
5104     if( rc ) goto balance_cleanup;
5105     releasePage(apOld[i]);
5106     apOld[i] = 0;
5107     i++;
5108   }
5109 
5110   /*
5111   ** Put the new pages in accending order.  This helps to
5112   ** keep entries in the disk file in order so that a scan
5113   ** of the table is a linear scan through the file.  That
5114   ** in turn helps the operating system to deliver pages
5115   ** from the disk more rapidly.
5116   **
5117   ** An O(n^2) insertion sort algorithm is used, but since
5118   ** n is never more than NB (a small constant), that should
5119   ** not be a problem.
5120   **
5121   ** When NB==3, this one optimization makes the database
5122   ** about 25% faster for large insertions and deletions.
5123   */
5124   for(i=0; i<k-1; i++){
5125     int minV = pgnoNew[i];
5126     int minI = i;
5127     for(j=i+1; j<k; j++){
5128       if( pgnoNew[j]<(unsigned)minV ){
5129         minI = j;
5130         minV = pgnoNew[j];
5131       }
5132     }
5133     if( minI>i ){
5134       int t;
5135       MemPage *pT;
5136       t = pgnoNew[i];
5137       pT = apNew[i];
5138       pgnoNew[i] = pgnoNew[minI];
5139       apNew[i] = apNew[minI];
5140       pgnoNew[minI] = t;
5141       apNew[minI] = pT;
5142     }
5143   }
5144   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5145     pgnoOld[0],
5146     nOld>=2 ? pgnoOld[1] : 0,
5147     nOld>=3 ? pgnoOld[2] : 0,
5148     pgnoNew[0], szNew[0],
5149     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5150     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5151     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5152     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5153 
5154   /*
5155   ** Evenly distribute the data in apCell[] across the new pages.
5156   ** Insert divider cells into pParent as necessary.
5157   */
5158   j = 0;
5159   for(i=0; i<nNew; i++){
5160     /* Assemble the new sibling page. */
5161     MemPage *pNew = apNew[i];
5162     assert( j<nMaxCells );
5163     assert( pNew->pgno==pgnoNew[i] );
5164     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5165     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5166     assert( pNew->nOverflow==0 );
5167 
5168 #ifndef SQLITE_OMIT_AUTOVACUUM
5169     /* If this is an auto-vacuum database, update the pointer map entries
5170     ** that point to the siblings that were rearranged. These can be: left
5171     ** children of cells, the right-child of the page, or overflow pages
5172     ** pointed to by cells.
5173     */
5174     if( pBt->autoVacuum ){
5175       for(k=j; k<cntNew[i]; k++){
5176         assert( k<nMaxCells );
5177         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5178           rc = ptrmapPutOvfl(pNew, k-j);
5179           if( rc!=SQLITE_OK ){
5180             goto balance_cleanup;
5181           }
5182         }
5183       }
5184     }
5185 #endif
5186 
5187     j = cntNew[i];
5188 
5189     /* If the sibling page assembled above was not the right-most sibling,
5190     ** insert a divider cell into the parent page.
5191     */
5192     if( i<nNew-1 && j<nCell ){
5193       u8 *pCell;
5194       u8 *pTemp;
5195       int sz;
5196 
5197       assert( j<nMaxCells );
5198       pCell = apCell[j];
5199       sz = szCell[j] + leafCorrection;
5200       if( !pNew->leaf ){
5201         memcpy(&pNew->aData[8], pCell, 4);
5202         pTemp = 0;
5203       }else if( leafData ){
5204         /* If the tree is a leaf-data tree, and the siblings are leaves,
5205         ** then there is no divider cell in apCell[]. Instead, the divider
5206         ** cell consists of the integer key for the right-most cell of
5207         ** the sibling-page assembled above only.
5208         */
5209         CellInfo info;
5210         j--;
5211         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5212         pCell = &aSpace[iSpace];
5213         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5214         iSpace += sz;
5215         assert( iSpace<=pBt->pageSize*5 );
5216         pTemp = 0;
5217       }else{
5218         pCell -= 4;
5219         pTemp = &aSpace[iSpace];
5220         iSpace += sz;
5221         assert( iSpace<=pBt->pageSize*5 );
5222         /* Obscure case for non-leaf-data trees: If the cell at pCell was
5223         ** previously stored on a leaf node, and its reported size was 4
5224         ** bytes, then it may actually be smaller than this
5225         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5226         ** any cell). But it is important to pass the correct size to
5227         ** insertCell(), so reparse the cell now.
5228         **
5229         ** Note that this can never happen in an SQLite data file, as all
5230         ** cells are at least 4 bytes. It only happens in b-trees used
5231         ** to evaluate "IN (SELECT ...)" and similar clauses.
5232         */
5233         if( szCell[j]==4 ){
5234           assert(leafCorrection==4);
5235           sz = cellSizePtr(pParent, pCell);
5236         }
5237       }
5238       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5239       if( rc!=SQLITE_OK ) goto balance_cleanup;
5240       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5241 #ifndef SQLITE_OMIT_AUTOVACUUM
5242       /* If this is an auto-vacuum database, and not a leaf-data tree,
5243       ** then update the pointer map with an entry for the overflow page
5244       ** that the cell just inserted points to (if any).
5245       */
5246       if( pBt->autoVacuum && !leafData ){
5247         rc = ptrmapPutOvfl(pParent, nxDiv);
5248         if( rc!=SQLITE_OK ){
5249           goto balance_cleanup;
5250         }
5251       }
5252 #endif
5253       j++;
5254       nxDiv++;
5255     }
5256   }
5257   assert( j==nCell );
5258   assert( nOld>0 );
5259   assert( nNew>0 );
5260   if( (pageFlags & PTF_LEAF)==0 ){
5261     memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
5262   }
5263   if( nxDiv==pParent->nCell+pParent->nOverflow ){
5264     /* Right-most sibling is the right-most child of pParent */
5265     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5266   }else{
5267     /* Right-most sibling is the left child of the first entry in pParent
5268     ** past the right-most divider entry */
5269     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5270   }
5271 
5272   /*
5273   ** Reparent children of all cells.
5274   */
5275   for(i=0; i<nNew; i++){
5276     rc = reparentChildPages(apNew[i]);
5277     if( rc!=SQLITE_OK ) goto balance_cleanup;
5278   }
5279   rc = reparentChildPages(pParent);
5280   if( rc!=SQLITE_OK ) goto balance_cleanup;
5281 
5282   /*
5283   ** Balance the parent page.  Note that the current page (pPage) might
5284   ** have been added to the freelist so it might no longer be initialized.
5285   ** But the parent page will always be initialized.
5286   */
5287   assert( pParent->isInit );
5288   rc = balance(pParent, 0);
5289 
5290   /*
5291   ** Cleanup before returning.
5292   */
5293 balance_cleanup:
5294   sqlite3_free(apCell);
5295   for(i=0; i<nOld; i++){
5296     releasePage(apOld[i]);
5297   }
5298   for(i=0; i<nNew; i++){
5299     releasePage(apNew[i]);
5300   }
5301   releasePage(pParent);
5302   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5303           pPage->pgno, nOld, nNew, nCell));
5304   return rc;
5305 }
5306 
5307 /*
5308 ** This routine is called for the root page of a btree when the root
5309 ** page contains no cells.  This is an opportunity to make the tree
5310 ** shallower by one level.
5311 */
5312 static int balance_shallower(MemPage *pPage){
5313   MemPage *pChild;             /* The only child page of pPage */
5314   Pgno pgnoChild;              /* Page number for pChild */
5315   int rc = SQLITE_OK;          /* Return code from subprocedures */
5316   BtShared *pBt;                  /* The main BTree structure */
5317   int mxCellPerPage;           /* Maximum number of cells per page */
5318   u8 **apCell;                 /* All cells from pages being balanced */
5319   int *szCell;                 /* Local size of all cells */
5320 
5321   assert( pPage->pParent==0 );
5322   assert( pPage->nCell==0 );
5323   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5324   pBt = pPage->pBt;
5325   mxCellPerPage = MX_CELL(pBt);
5326   apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
5327   if( apCell==0 ) return SQLITE_NOMEM;
5328   szCell = (int*)&apCell[mxCellPerPage];
5329   if( pPage->leaf ){
5330     /* The table is completely empty */
5331     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5332   }else{
5333     /* The root page is empty but has one child.  Transfer the
5334     ** information from that one child into the root page if it
5335     ** will fit.  This reduces the depth of the tree by one.
5336     **
5337     ** If the root page is page 1, it has less space available than
5338     ** its child (due to the 100 byte header that occurs at the beginning
5339     ** of the database fle), so it might not be able to hold all of the
5340     ** information currently contained in the child.  If this is the
5341     ** case, then do not do the transfer.  Leave page 1 empty except
5342     ** for the right-pointer to the child page.  The child page becomes
5343     ** the virtual root of the tree.
5344     */
5345     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5346     assert( pgnoChild>0 );
5347     assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) );
5348     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5349     if( rc ) goto end_shallow_balance;
5350     if( pPage->pgno==1 ){
5351       rc = sqlite3BtreeInitPage(pChild, pPage);
5352       if( rc ) goto end_shallow_balance;
5353       assert( pChild->nOverflow==0 );
5354       if( pChild->nFree>=100 ){
5355         /* The child information will fit on the root page, so do the
5356         ** copy */
5357         int i;
5358         zeroPage(pPage, pChild->aData[0]);
5359         for(i=0; i<pChild->nCell; i++){
5360           apCell[i] = findCell(pChild,i);
5361           szCell[i] = cellSizePtr(pChild, apCell[i]);
5362         }
5363         assemblePage(pPage, pChild->nCell, apCell, szCell);
5364         /* Copy the right-pointer of the child to the parent. */
5365         put4byte(&pPage->aData[pPage->hdrOffset+8],
5366             get4byte(&pChild->aData[pChild->hdrOffset+8]));
5367         freePage(pChild);
5368         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5369       }else{
5370         /* The child has more information that will fit on the root.
5371         ** The tree is already balanced.  Do nothing. */
5372         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5373       }
5374     }else{
5375       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5376       pPage->isInit = 0;
5377       pPage->pParent = 0;
5378       rc = sqlite3BtreeInitPage(pPage, 0);
5379       assert( rc==SQLITE_OK );
5380       freePage(pChild);
5381       TRACE(("BALANCE: transfer child %d into root %d\n",
5382               pChild->pgno, pPage->pgno));
5383     }
5384     rc = reparentChildPages(pPage);
5385     assert( pPage->nOverflow==0 );
5386 #ifndef SQLITE_OMIT_AUTOVACUUM
5387     if( pBt->autoVacuum ){
5388       int i;
5389       for(i=0; i<pPage->nCell; i++){
5390         rc = ptrmapPutOvfl(pPage, i);
5391         if( rc!=SQLITE_OK ){
5392           goto end_shallow_balance;
5393         }
5394       }
5395     }
5396 #endif
5397     releasePage(pChild);
5398   }
5399 end_shallow_balance:
5400   sqlite3_free(apCell);
5401   return rc;
5402 }
5403 
5404 
5405 /*
5406 ** The root page is overfull
5407 **
5408 ** When this happens, Create a new child page and copy the
5409 ** contents of the root into the child.  Then make the root
5410 ** page an empty page with rightChild pointing to the new
5411 ** child.   Finally, call balance_internal() on the new child
5412 ** to cause it to split.
5413 */
5414 static int balance_deeper(MemPage *pPage){
5415   int rc;             /* Return value from subprocedures */
5416   MemPage *pChild;    /* Pointer to a new child page */
5417   Pgno pgnoChild;     /* Page number of the new child page */
5418   BtShared *pBt;         /* The BTree */
5419   int usableSize;     /* Total usable size of a page */
5420   u8 *data;           /* Content of the parent page */
5421   u8 *cdata;          /* Content of the child page */
5422   int hdr;            /* Offset to page header in parent */
5423   int brk;            /* Offset to content of first cell in parent */
5424 
5425   assert( pPage->pParent==0 );
5426   assert( pPage->nOverflow>0 );
5427   pBt = pPage->pBt;
5428   assert( sqlite3_mutex_held(pBt->mutex) );
5429   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5430   if( rc ) return rc;
5431   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5432   usableSize = pBt->usableSize;
5433   data = pPage->aData;
5434   hdr = pPage->hdrOffset;
5435   brk = get2byte(&data[hdr+5]);
5436   cdata = pChild->aData;
5437   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5438   memcpy(&cdata[brk], &data[brk], usableSize-brk);
5439   assert( pChild->isInit==0 );
5440   rc = sqlite3BtreeInitPage(pChild, pPage);
5441   if( rc ) goto balancedeeper_out;
5442   memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
5443   pChild->nOverflow = pPage->nOverflow;
5444   if( pChild->nOverflow ){
5445     pChild->nFree = 0;
5446   }
5447   assert( pChild->nCell==pPage->nCell );
5448   zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5449   put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5450   TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5451 #ifndef SQLITE_OMIT_AUTOVACUUM
5452   if( pBt->autoVacuum ){
5453     int i;
5454     rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5455     if( rc ) goto balancedeeper_out;
5456     for(i=0; i<pChild->nCell; i++){
5457       rc = ptrmapPutOvfl(pChild, i);
5458       if( rc!=SQLITE_OK ){
5459         return rc;
5460       }
5461     }
5462   }
5463 #endif
5464   rc = balance_nonroot(pChild);
5465 
5466 balancedeeper_out:
5467   releasePage(pChild);
5468   return rc;
5469 }
5470 
5471 /*
5472 ** Decide if the page pPage needs to be balanced.  If balancing is
5473 ** required, call the appropriate balancing routine.
5474 */
5475 static int balance(MemPage *pPage, int insert){
5476   int rc = SQLITE_OK;
5477   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5478   if( pPage->pParent==0 ){
5479     rc = sqlite3PagerWrite(pPage->pDbPage);
5480     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5481       rc = balance_deeper(pPage);
5482     }
5483     if( rc==SQLITE_OK && pPage->nCell==0 ){
5484       rc = balance_shallower(pPage);
5485     }
5486   }else{
5487     if( pPage->nOverflow>0 ||
5488         (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5489       rc = balance_nonroot(pPage);
5490     }
5491   }
5492   return rc;
5493 }
5494 
5495 /*
5496 ** This routine checks all cursors that point to table pgnoRoot.
5497 ** If any of those cursors were opened with wrFlag==0 in a different
5498 ** database connection (a database connection that shares the pager
5499 ** cache with the current connection) and that other connection
5500 ** is not in the ReadUncommmitted state, then this routine returns
5501 ** SQLITE_LOCKED.
5502 **
5503 ** In addition to checking for read-locks (where a read-lock
5504 ** means a cursor opened with wrFlag==0) this routine also moves
5505 ** all write cursors so that they are pointing to the
5506 ** first Cell on the root page.  This is necessary because an insert
5507 ** or delete might change the number of cells on a page or delete
5508 ** a page entirely and we do not want to leave any cursors
5509 ** pointing to non-existant pages or cells.
5510 */
5511 static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){
5512   BtCursor *p;
5513   BtShared *pBt = pBtree->pBt;
5514   sqlite3 *db = pBtree->db;
5515   assert( sqlite3BtreeHoldsMutex(pBtree) );
5516   for(p=pBt->pCursor; p; p=p->pNext){
5517     if( p==pExclude ) continue;
5518     if( p->eState!=CURSOR_VALID ) continue;
5519     if( p->pgnoRoot!=pgnoRoot ) continue;
5520     if( p->wrFlag==0 ){
5521       sqlite3 *dbOther = p->pBtree->db;
5522       if( dbOther==0 ||
5523          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5524         return SQLITE_LOCKED;
5525       }
5526     }else if( p->pPage->pgno!=p->pgnoRoot ){
5527       moveToRoot(p);
5528     }
5529   }
5530   return SQLITE_OK;
5531 }
5532 
5533 /*
5534 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
5535 ** and the data is given by (pData,nData).  The cursor is used only to
5536 ** define what table the record should be inserted into.  The cursor
5537 ** is left pointing at a random location.
5538 **
5539 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
5540 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
5541 */
5542 int sqlite3BtreeInsert(
5543   BtCursor *pCur,                /* Insert data into the table of this cursor */
5544   const void *pKey, i64 nKey,    /* The key of the new record */
5545   const void *pData, int nData,  /* The data of the new record */
5546   int nZero,                     /* Number of extra 0 bytes to append to data */
5547   int appendBias                 /* True if this is likely an append */
5548 ){
5549   int rc;
5550   int loc;
5551   int szNew;
5552   MemPage *pPage;
5553   Btree *p = pCur->pBtree;
5554   BtShared *pBt = p->pBt;
5555   unsigned char *oldCell;
5556   unsigned char *newCell = 0;
5557 
5558   assert( cursorHoldsMutex(pCur) );
5559   if( pBt->inTransaction!=TRANS_WRITE ){
5560     /* Must start a transaction before doing an insert */
5561     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5562     return rc;
5563   }
5564   assert( !pBt->readOnly );
5565   if( !pCur->wrFlag ){
5566     return SQLITE_PERM;   /* Cursor not open for writing */
5567   }
5568   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5569     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5570   }
5571   if( pCur->eState==CURSOR_FAULT ){
5572     return pCur->skip;
5573   }
5574 
5575   /* Save the positions of any other cursors open on this table */
5576   clearCursorPosition(pCur);
5577   if(
5578     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5579     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
5580   ){
5581     return rc;
5582   }
5583 
5584   pPage = pCur->pPage;
5585   assert( pPage->intKey || nKey>=0 );
5586   assert( pPage->leaf || !pPage->leafData );
5587   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5588           pCur->pgnoRoot, nKey, nData, pPage->pgno,
5589           loc==0 ? "overwrite" : "new entry"));
5590   assert( pPage->isInit );
5591   newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
5592   if( newCell==0 ) return SQLITE_NOMEM;
5593   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5594   if( rc ) goto end_insert;
5595   assert( szNew==cellSizePtr(pPage, newCell) );
5596   assert( szNew<=MX_CELL_SIZE(pBt) );
5597   if( loc==0 && CURSOR_VALID==pCur->eState ){
5598     int szOld;
5599     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
5600     rc = sqlite3PagerWrite(pPage->pDbPage);
5601     if( rc ){
5602       goto end_insert;
5603     }
5604     oldCell = findCell(pPage, pCur->idx);
5605     if( !pPage->leaf ){
5606       memcpy(newCell, oldCell, 4);
5607     }
5608     szOld = cellSizePtr(pPage, oldCell);
5609     rc = clearCell(pPage, oldCell);
5610     if( rc ) goto end_insert;
5611     dropCell(pPage, pCur->idx, szOld);
5612   }else if( loc<0 && pPage->nCell>0 ){
5613     assert( pPage->leaf );
5614     pCur->idx++;
5615     pCur->info.nSize = 0;
5616   }else{
5617     assert( pPage->leaf );
5618   }
5619   rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
5620   if( rc!=SQLITE_OK ) goto end_insert;
5621   rc = balance(pPage, 1);
5622   /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
5623   /* fflush(stdout); */
5624   if( rc==SQLITE_OK ){
5625     moveToRoot(pCur);
5626   }
5627 end_insert:
5628   sqlite3_free(newCell);
5629   return rc;
5630 }
5631 
5632 /*
5633 ** Delete the entry that the cursor is pointing to.  The cursor
5634 ** is left pointing at a random location.
5635 */
5636 int sqlite3BtreeDelete(BtCursor *pCur){
5637   MemPage *pPage = pCur->pPage;
5638   unsigned char *pCell;
5639   int rc;
5640   Pgno pgnoChild = 0;
5641   Btree *p = pCur->pBtree;
5642   BtShared *pBt = p->pBt;
5643 
5644   assert( cursorHoldsMutex(pCur) );
5645   assert( pPage->isInit );
5646   if( pBt->inTransaction!=TRANS_WRITE ){
5647     /* Must start a transaction before doing a delete */
5648     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5649     return rc;
5650   }
5651   assert( !pBt->readOnly );
5652   if( pCur->eState==CURSOR_FAULT ){
5653     return pCur->skip;
5654   }
5655   if( pCur->idx >= pPage->nCell ){
5656     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
5657   }
5658   if( !pCur->wrFlag ){
5659     return SQLITE_PERM;   /* Did not open this cursor for writing */
5660   }
5661   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5662     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5663   }
5664 
5665   /* Restore the current cursor position (a no-op if the cursor is not in
5666   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5667   ** open on the same table. Then call sqlite3PagerWrite() on the page
5668   ** that the entry will be deleted from.
5669   */
5670   if(
5671     (rc = restoreOrClearCursorPosition(pCur))!=0 ||
5672     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5673     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5674   ){
5675     return rc;
5676   }
5677 
5678   /* Locate the cell within its page and leave pCell pointing to the
5679   ** data. The clearCell() call frees any overflow pages associated with the
5680   ** cell. The cell itself is still intact.
5681   */
5682   pCell = findCell(pPage, pCur->idx);
5683   if( !pPage->leaf ){
5684     pgnoChild = get4byte(pCell);
5685   }
5686   rc = clearCell(pPage, pCell);
5687   if( rc ){
5688     return rc;
5689   }
5690 
5691   if( !pPage->leaf ){
5692     /*
5693     ** The entry we are about to delete is not a leaf so if we do not
5694     ** do something we will leave a hole on an internal page.
5695     ** We have to fill the hole by moving in a cell from a leaf.  The
5696     ** next Cell after the one to be deleted is guaranteed to exist and
5697     ** to be a leaf so we can use it.
5698     */
5699     BtCursor leafCur;
5700     unsigned char *pNext;
5701     int szNext;  /* The compiler warning is wrong: szNext is always
5702                  ** initialized before use.  Adding an extra initialization
5703                  ** to silence the compiler slows down the code. */
5704     int notUsed;
5705     unsigned char *tempCell = 0;
5706     assert( !pPage->leafData );
5707     sqlite3BtreeGetTempCursor(pCur, &leafCur);
5708     rc = sqlite3BtreeNext(&leafCur, &notUsed);
5709     if( rc==SQLITE_OK ){
5710       rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
5711     }
5712     if( rc==SQLITE_OK ){
5713       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5714          pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
5715       dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5716       pNext = findCell(leafCur.pPage, leafCur.idx);
5717       szNext = cellSizePtr(leafCur.pPage, pNext);
5718       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5719       tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
5720       if( tempCell==0 ){
5721         rc = SQLITE_NOMEM;
5722       }
5723     }
5724     if( rc==SQLITE_OK ){
5725       rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
5726     }
5727     if( rc==SQLITE_OK ){
5728       put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
5729       rc = balance(pPage, 0);
5730     }
5731     if( rc==SQLITE_OK ){
5732       dropCell(leafCur.pPage, leafCur.idx, szNext);
5733       rc = balance(leafCur.pPage, 0);
5734     }
5735     sqlite3_free(tempCell);
5736     sqlite3BtreeReleaseTempCursor(&leafCur);
5737   }else{
5738     TRACE(("DELETE: table=%d delete from leaf %d\n",
5739        pCur->pgnoRoot, pPage->pgno));
5740     dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5741     rc = balance(pPage, 0);
5742   }
5743   if( rc==SQLITE_OK ){
5744     moveToRoot(pCur);
5745   }
5746   return rc;
5747 }
5748 
5749 /*
5750 ** Create a new BTree table.  Write into *piTable the page
5751 ** number for the root page of the new table.
5752 **
5753 ** The type of type is determined by the flags parameter.  Only the
5754 ** following values of flags are currently in use.  Other values for
5755 ** flags might not work:
5756 **
5757 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
5758 **     BTREE_ZERODATA                  Used for SQL indices
5759 */
5760 static int btreeCreateTable(Btree *p, int *piTable, int flags){
5761   BtShared *pBt = p->pBt;
5762   MemPage *pRoot;
5763   Pgno pgnoRoot;
5764   int rc;
5765 
5766   assert( sqlite3BtreeHoldsMutex(p) );
5767   if( pBt->inTransaction!=TRANS_WRITE ){
5768     /* Must start a transaction first */
5769     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5770     return rc;
5771   }
5772   assert( !pBt->readOnly );
5773 
5774 #ifdef SQLITE_OMIT_AUTOVACUUM
5775   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5776   if( rc ){
5777     return rc;
5778   }
5779 #else
5780   if( pBt->autoVacuum ){
5781     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
5782     MemPage *pPageMove; /* The page to move to. */
5783 
5784     /* Creating a new table may probably require moving an existing database
5785     ** to make room for the new tables root page. In case this page turns
5786     ** out to be an overflow page, delete all overflow page-map caches
5787     ** held by open cursors.
5788     */
5789     invalidateAllOverflowCache(pBt);
5790 
5791     /* Read the value of meta[3] from the database to determine where the
5792     ** root page of the new table should go. meta[3] is the largest root-page
5793     ** created so far, so the new root-page is (meta[3]+1).
5794     */
5795     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
5796     if( rc!=SQLITE_OK ){
5797       return rc;
5798     }
5799     pgnoRoot++;
5800 
5801     /* The new root-page may not be allocated on a pointer-map page, or the
5802     ** PENDING_BYTE page.
5803     */
5804     if( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
5805         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
5806       pgnoRoot++;
5807     }
5808     assert( pgnoRoot>=3 );
5809 
5810     /* Allocate a page. The page that currently resides at pgnoRoot will
5811     ** be moved to the allocated page (unless the allocated page happens
5812     ** to reside at pgnoRoot).
5813     */
5814     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
5815     if( rc!=SQLITE_OK ){
5816       return rc;
5817     }
5818 
5819     if( pgnoMove!=pgnoRoot ){
5820       /* pgnoRoot is the page that will be used for the root-page of
5821       ** the new table (assuming an error did not occur). But we were
5822       ** allocated pgnoMove. If required (i.e. if it was not allocated
5823       ** by extending the file), the current page at position pgnoMove
5824       ** is already journaled.
5825       */
5826       u8 eType;
5827       Pgno iPtrPage;
5828 
5829       releasePage(pPageMove);
5830 
5831       /* Move the page currently at pgnoRoot to pgnoMove. */
5832       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5833       if( rc!=SQLITE_OK ){
5834         return rc;
5835       }
5836       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
5837       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
5838         releasePage(pRoot);
5839         return rc;
5840       }
5841       assert( eType!=PTRMAP_ROOTPAGE );
5842       assert( eType!=PTRMAP_FREEPAGE );
5843       rc = sqlite3PagerWrite(pRoot->pDbPage);
5844       if( rc!=SQLITE_OK ){
5845         releasePage(pRoot);
5846         return rc;
5847       }
5848       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
5849       releasePage(pRoot);
5850 
5851       /* Obtain the page at pgnoRoot */
5852       if( rc!=SQLITE_OK ){
5853         return rc;
5854       }
5855       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5856       if( rc!=SQLITE_OK ){
5857         return rc;
5858       }
5859       rc = sqlite3PagerWrite(pRoot->pDbPage);
5860       if( rc!=SQLITE_OK ){
5861         releasePage(pRoot);
5862         return rc;
5863       }
5864     }else{
5865       pRoot = pPageMove;
5866     }
5867 
5868     /* Update the pointer-map and meta-data with the new root-page number. */
5869     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
5870     if( rc ){
5871       releasePage(pRoot);
5872       return rc;
5873     }
5874     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
5875     if( rc ){
5876       releasePage(pRoot);
5877       return rc;
5878     }
5879 
5880   }else{
5881     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5882     if( rc ) return rc;
5883   }
5884 #endif
5885   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
5886   zeroPage(pRoot, flags | PTF_LEAF);
5887   sqlite3PagerUnref(pRoot->pDbPage);
5888   *piTable = (int)pgnoRoot;
5889   return SQLITE_OK;
5890 }
5891 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
5892   int rc;
5893   sqlite3BtreeEnter(p);
5894   p->pBt->db = p->db;
5895   rc = btreeCreateTable(p, piTable, flags);
5896   sqlite3BtreeLeave(p);
5897   return rc;
5898 }
5899 
5900 /*
5901 ** Erase the given database page and all its children.  Return
5902 ** the page to the freelist.
5903 */
5904 static int clearDatabasePage(
5905   BtShared *pBt,           /* The BTree that contains the table */
5906   Pgno pgno,            /* Page number to clear */
5907   MemPage *pParent,     /* Parent page.  NULL for the root */
5908   int freePageFlag      /* Deallocate page if true */
5909 ){
5910   MemPage *pPage = 0;
5911   int rc;
5912   unsigned char *pCell;
5913   int i;
5914 
5915   assert( sqlite3_mutex_held(pBt->mutex) );
5916   if( pgno>sqlite3PagerPagecount(pBt->pPager) ){
5917     return SQLITE_CORRUPT_BKPT;
5918   }
5919 
5920   rc = getAndInitPage(pBt, pgno, &pPage, pParent);
5921   if( rc ) goto cleardatabasepage_out;
5922   for(i=0; i<pPage->nCell; i++){
5923     pCell = findCell(pPage, i);
5924     if( !pPage->leaf ){
5925       rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
5926       if( rc ) goto cleardatabasepage_out;
5927     }
5928     rc = clearCell(pPage, pCell);
5929     if( rc ) goto cleardatabasepage_out;
5930   }
5931   if( !pPage->leaf ){
5932     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
5933     if( rc ) goto cleardatabasepage_out;
5934   }
5935   if( freePageFlag ){
5936     rc = freePage(pPage);
5937   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
5938     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
5939   }
5940 
5941 cleardatabasepage_out:
5942   releasePage(pPage);
5943   return rc;
5944 }
5945 
5946 /*
5947 ** Delete all information from a single table in the database.  iTable is
5948 ** the page number of the root of the table.  After this routine returns,
5949 ** the root page is empty, but still exists.
5950 **
5951 ** This routine will fail with SQLITE_LOCKED if there are any open
5952 ** read cursors on the table.  Open write cursors are moved to the
5953 ** root of the table.
5954 */
5955 int sqlite3BtreeClearTable(Btree *p, int iTable){
5956   int rc;
5957   BtShared *pBt = p->pBt;
5958   sqlite3BtreeEnter(p);
5959   pBt->db = p->db;
5960   if( p->inTrans!=TRANS_WRITE ){
5961     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5962   }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){
5963     /* nothing to do */
5964   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
5965     /* nothing to do */
5966   }else{
5967     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
5968   }
5969   sqlite3BtreeLeave(p);
5970   return rc;
5971 }
5972 
5973 /*
5974 ** Erase all information in a table and add the root of the table to
5975 ** the freelist.  Except, the root of the principle table (the one on
5976 ** page 1) is never added to the freelist.
5977 **
5978 ** This routine will fail with SQLITE_LOCKED if there are any open
5979 ** cursors on the table.
5980 **
5981 ** If AUTOVACUUM is enabled and the page at iTable is not the last
5982 ** root page in the database file, then the last root page
5983 ** in the database file is moved into the slot formerly occupied by
5984 ** iTable and that last slot formerly occupied by the last root page
5985 ** is added to the freelist instead of iTable.  In this say, all
5986 ** root pages are kept at the beginning of the database file, which
5987 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
5988 ** page number that used to be the last root page in the file before
5989 ** the move.  If no page gets moved, *piMoved is set to 0.
5990 ** The last root page is recorded in meta[3] and the value of
5991 ** meta[3] is updated by this procedure.
5992 */
5993 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
5994   int rc;
5995   MemPage *pPage = 0;
5996   BtShared *pBt = p->pBt;
5997 
5998   assert( sqlite3BtreeHoldsMutex(p) );
5999   if( p->inTrans!=TRANS_WRITE ){
6000     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6001   }
6002 
6003   /* It is illegal to drop a table if any cursors are open on the
6004   ** database. This is because in auto-vacuum mode the backend may
6005   ** need to move another root-page to fill a gap left by the deleted
6006   ** root page. If an open cursor was using this page a problem would
6007   ** occur.
6008   */
6009   if( pBt->pCursor ){
6010     return SQLITE_LOCKED;
6011   }
6012 
6013   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6014   if( rc ) return rc;
6015   rc = sqlite3BtreeClearTable(p, iTable);
6016   if( rc ){
6017     releasePage(pPage);
6018     return rc;
6019   }
6020 
6021   *piMoved = 0;
6022 
6023   if( iTable>1 ){
6024 #ifdef SQLITE_OMIT_AUTOVACUUM
6025     rc = freePage(pPage);
6026     releasePage(pPage);
6027 #else
6028     if( pBt->autoVacuum ){
6029       Pgno maxRootPgno;
6030       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6031       if( rc!=SQLITE_OK ){
6032         releasePage(pPage);
6033         return rc;
6034       }
6035 
6036       if( iTable==maxRootPgno ){
6037         /* If the table being dropped is the table with the largest root-page
6038         ** number in the database, put the root page on the free list.
6039         */
6040         rc = freePage(pPage);
6041         releasePage(pPage);
6042         if( rc!=SQLITE_OK ){
6043           return rc;
6044         }
6045       }else{
6046         /* The table being dropped does not have the largest root-page
6047         ** number in the database. So move the page that does into the
6048         ** gap left by the deleted root-page.
6049         */
6050         MemPage *pMove;
6051         releasePage(pPage);
6052         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6053         if( rc!=SQLITE_OK ){
6054           return rc;
6055         }
6056         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
6057         releasePage(pMove);
6058         if( rc!=SQLITE_OK ){
6059           return rc;
6060         }
6061         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6062         if( rc!=SQLITE_OK ){
6063           return rc;
6064         }
6065         rc = freePage(pMove);
6066         releasePage(pMove);
6067         if( rc!=SQLITE_OK ){
6068           return rc;
6069         }
6070         *piMoved = maxRootPgno;
6071       }
6072 
6073       /* Set the new 'max-root-page' value in the database header. This
6074       ** is the old value less one, less one more if that happens to
6075       ** be a root-page number, less one again if that is the
6076       ** PENDING_BYTE_PAGE.
6077       */
6078       maxRootPgno--;
6079       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6080         maxRootPgno--;
6081       }
6082       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6083         maxRootPgno--;
6084       }
6085       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6086 
6087       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6088     }else{
6089       rc = freePage(pPage);
6090       releasePage(pPage);
6091     }
6092 #endif
6093   }else{
6094     /* If sqlite3BtreeDropTable was called on page 1. */
6095     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6096     releasePage(pPage);
6097   }
6098   return rc;
6099 }
6100 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6101   int rc;
6102   sqlite3BtreeEnter(p);
6103   p->pBt->db = p->db;
6104   rc = btreeDropTable(p, iTable, piMoved);
6105   sqlite3BtreeLeave(p);
6106   return rc;
6107 }
6108 
6109 
6110 /*
6111 ** Read the meta-information out of a database file.  Meta[0]
6112 ** is the number of free pages currently in the database.  Meta[1]
6113 ** through meta[15] are available for use by higher layers.  Meta[0]
6114 ** is read-only, the others are read/write.
6115 **
6116 ** The schema layer numbers meta values differently.  At the schema
6117 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6118 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
6119 */
6120 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6121   DbPage *pDbPage;
6122   int rc;
6123   unsigned char *pP1;
6124   BtShared *pBt = p->pBt;
6125 
6126   sqlite3BtreeEnter(p);
6127   pBt->db = p->db;
6128 
6129   /* Reading a meta-data value requires a read-lock on page 1 (and hence
6130   ** the sqlite_master table. We grab this lock regardless of whether or
6131   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6132   ** 1 is treated as a special case by queryTableLock() and lockTable()).
6133   */
6134   rc = queryTableLock(p, 1, READ_LOCK);
6135   if( rc!=SQLITE_OK ){
6136     sqlite3BtreeLeave(p);
6137     return rc;
6138   }
6139 
6140   assert( idx>=0 && idx<=15 );
6141   rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6142   if( rc ){
6143     sqlite3BtreeLeave(p);
6144     return rc;
6145   }
6146   pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6147   *pMeta = get4byte(&pP1[36 + idx*4]);
6148   sqlite3PagerUnref(pDbPage);
6149 
6150   /* If autovacuumed is disabled in this build but we are trying to
6151   ** access an autovacuumed database, then make the database readonly.
6152   */
6153 #ifdef SQLITE_OMIT_AUTOVACUUM
6154   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6155 #endif
6156 
6157   /* Grab the read-lock on page 1. */
6158   rc = lockTable(p, 1, READ_LOCK);
6159   sqlite3BtreeLeave(p);
6160   return rc;
6161 }
6162 
6163 /*
6164 ** Write meta-information back into the database.  Meta[0] is
6165 ** read-only and may not be written.
6166 */
6167 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6168   BtShared *pBt = p->pBt;
6169   unsigned char *pP1;
6170   int rc;
6171   assert( idx>=1 && idx<=15 );
6172   sqlite3BtreeEnter(p);
6173   pBt->db = p->db;
6174   if( p->inTrans!=TRANS_WRITE ){
6175     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6176   }else{
6177     assert( pBt->pPage1!=0 );
6178     pP1 = pBt->pPage1->aData;
6179     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6180     if( rc==SQLITE_OK ){
6181       put4byte(&pP1[36 + idx*4], iMeta);
6182 #ifndef SQLITE_OMIT_AUTOVACUUM
6183       if( idx==7 ){
6184         assert( pBt->autoVacuum || iMeta==0 );
6185         assert( iMeta==0 || iMeta==1 );
6186         pBt->incrVacuum = iMeta;
6187       }
6188 #endif
6189     }
6190   }
6191   sqlite3BtreeLeave(p);
6192   return rc;
6193 }
6194 
6195 /*
6196 ** Return the flag byte at the beginning of the page that the cursor
6197 ** is currently pointing to.
6198 */
6199 int sqlite3BtreeFlags(BtCursor *pCur){
6200   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6201   ** restoreOrClearCursorPosition() here.
6202   */
6203   MemPage *pPage = pCur->pPage;
6204   assert( cursorHoldsMutex(pCur) );
6205   assert( pPage->pBt==pCur->pBt );
6206   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6207 }
6208 
6209 
6210 /*
6211 ** Return the pager associated with a BTree.  This routine is used for
6212 ** testing and debugging only.
6213 */
6214 Pager *sqlite3BtreePager(Btree *p){
6215   return p->pBt->pPager;
6216 }
6217 
6218 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6219 /*
6220 ** Append a message to the error message string.
6221 */
6222 static void checkAppendMsg(
6223   IntegrityCk *pCheck,
6224   char *zMsg1,
6225   const char *zFormat,
6226   ...
6227 ){
6228   va_list ap;
6229   char *zMsg2;
6230   if( !pCheck->mxErr ) return;
6231   pCheck->mxErr--;
6232   pCheck->nErr++;
6233   va_start(ap, zFormat);
6234   zMsg2 = sqlite3VMPrintf(0, zFormat, ap);
6235   va_end(ap);
6236   if( zMsg1==0 ) zMsg1 = "";
6237   if( pCheck->zErrMsg ){
6238     char *zOld = pCheck->zErrMsg;
6239     pCheck->zErrMsg = 0;
6240     sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
6241     sqlite3_free(zOld);
6242   }else{
6243     sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
6244   }
6245   sqlite3_free(zMsg2);
6246 }
6247 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6248 
6249 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6250 /*
6251 ** Add 1 to the reference count for page iPage.  If this is the second
6252 ** reference to the page, add an error message to pCheck->zErrMsg.
6253 ** Return 1 if there are 2 ore more references to the page and 0 if
6254 ** if this is the first reference to the page.
6255 **
6256 ** Also check that the page number is in bounds.
6257 */
6258 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6259   if( iPage==0 ) return 1;
6260   if( iPage>pCheck->nPage || iPage<0 ){
6261     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6262     return 1;
6263   }
6264   if( pCheck->anRef[iPage]==1 ){
6265     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6266     return 1;
6267   }
6268   return  (pCheck->anRef[iPage]++)>1;
6269 }
6270 
6271 #ifndef SQLITE_OMIT_AUTOVACUUM
6272 /*
6273 ** Check that the entry in the pointer-map for page iChild maps to
6274 ** page iParent, pointer type ptrType. If not, append an error message
6275 ** to pCheck.
6276 */
6277 static void checkPtrmap(
6278   IntegrityCk *pCheck,   /* Integrity check context */
6279   Pgno iChild,           /* Child page number */
6280   u8 eType,              /* Expected pointer map type */
6281   Pgno iParent,          /* Expected pointer map parent page number */
6282   char *zContext         /* Context description (used for error msg) */
6283 ){
6284   int rc;
6285   u8 ePtrmapType;
6286   Pgno iPtrmapParent;
6287 
6288   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6289   if( rc!=SQLITE_OK ){
6290     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6291     return;
6292   }
6293 
6294   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6295     checkAppendMsg(pCheck, zContext,
6296       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6297       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6298   }
6299 }
6300 #endif
6301 
6302 /*
6303 ** Check the integrity of the freelist or of an overflow page list.
6304 ** Verify that the number of pages on the list is N.
6305 */
6306 static void checkList(
6307   IntegrityCk *pCheck,  /* Integrity checking context */
6308   int isFreeList,       /* True for a freelist.  False for overflow page list */
6309   int iPage,            /* Page number for first page in the list */
6310   int N,                /* Expected number of pages in the list */
6311   char *zContext        /* Context for error messages */
6312 ){
6313   int i;
6314   int expected = N;
6315   int iFirst = iPage;
6316   while( N-- > 0 && pCheck->mxErr ){
6317     DbPage *pOvflPage;
6318     unsigned char *pOvflData;
6319     if( iPage<1 ){
6320       checkAppendMsg(pCheck, zContext,
6321          "%d of %d pages missing from overflow list starting at %d",
6322           N+1, expected, iFirst);
6323       break;
6324     }
6325     if( checkRef(pCheck, iPage, zContext) ) break;
6326     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6327       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6328       break;
6329     }
6330     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6331     if( isFreeList ){
6332       int n = get4byte(&pOvflData[4]);
6333 #ifndef SQLITE_OMIT_AUTOVACUUM
6334       if( pCheck->pBt->autoVacuum ){
6335         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6336       }
6337 #endif
6338       if( n>pCheck->pBt->usableSize/4-8 ){
6339         checkAppendMsg(pCheck, zContext,
6340            "freelist leaf count too big on page %d", iPage);
6341         N--;
6342       }else{
6343         for(i=0; i<n; i++){
6344           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6345 #ifndef SQLITE_OMIT_AUTOVACUUM
6346           if( pCheck->pBt->autoVacuum ){
6347             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6348           }
6349 #endif
6350           checkRef(pCheck, iFreePage, zContext);
6351         }
6352         N -= n;
6353       }
6354     }
6355 #ifndef SQLITE_OMIT_AUTOVACUUM
6356     else{
6357       /* If this database supports auto-vacuum and iPage is not the last
6358       ** page in this overflow list, check that the pointer-map entry for
6359       ** the following page matches iPage.
6360       */
6361       if( pCheck->pBt->autoVacuum && N>0 ){
6362         i = get4byte(pOvflData);
6363         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6364       }
6365     }
6366 #endif
6367     iPage = get4byte(pOvflData);
6368     sqlite3PagerUnref(pOvflPage);
6369   }
6370 }
6371 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6372 
6373 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6374 /*
6375 ** Do various sanity checks on a single page of a tree.  Return
6376 ** the tree depth.  Root pages return 0.  Parents of root pages
6377 ** return 1, and so forth.
6378 **
6379 ** These checks are done:
6380 **
6381 **      1.  Make sure that cells and freeblocks do not overlap
6382 **          but combine to completely cover the page.
6383 **  NO  2.  Make sure cell keys are in order.
6384 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
6385 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
6386 **      5.  Check the integrity of overflow pages.
6387 **      6.  Recursively call checkTreePage on all children.
6388 **      7.  Verify that the depth of all children is the same.
6389 **      8.  Make sure this page is at least 33% full or else it is
6390 **          the root of the tree.
6391 */
6392 static int checkTreePage(
6393   IntegrityCk *pCheck,  /* Context for the sanity check */
6394   int iPage,            /* Page number of the page to check */
6395   MemPage *pParent,     /* Parent page */
6396   char *zParentContext  /* Parent context */
6397 ){
6398   MemPage *pPage;
6399   int i, rc, depth, d2, pgno, cnt;
6400   int hdr, cellStart;
6401   int nCell;
6402   u8 *data;
6403   BtShared *pBt;
6404   int usableSize;
6405   char zContext[100];
6406   char *hit;
6407 
6408   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6409 
6410   /* Check that the page exists
6411   */
6412   pBt = pCheck->pBt;
6413   usableSize = pBt->usableSize;
6414   if( iPage==0 ) return 0;
6415   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6416   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6417     checkAppendMsg(pCheck, zContext,
6418        "unable to get the page. error code=%d", rc);
6419     return 0;
6420   }
6421   if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){
6422     checkAppendMsg(pCheck, zContext,
6423                    "sqlite3BtreeInitPage() returns error code %d", rc);
6424     releasePage(pPage);
6425     return 0;
6426   }
6427 
6428   /* Check out all the cells.
6429   */
6430   depth = 0;
6431   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6432     u8 *pCell;
6433     int sz;
6434     CellInfo info;
6435 
6436     /* Check payload overflow pages
6437     */
6438     sqlite3_snprintf(sizeof(zContext), zContext,
6439              "On tree page %d cell %d: ", iPage, i);
6440     pCell = findCell(pPage,i);
6441     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6442     sz = info.nData;
6443     if( !pPage->intKey ) sz += info.nKey;
6444     assert( sz==info.nPayload );
6445     if( sz>info.nLocal ){
6446       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6447       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6448 #ifndef SQLITE_OMIT_AUTOVACUUM
6449       if( pBt->autoVacuum ){
6450         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6451       }
6452 #endif
6453       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6454     }
6455 
6456     /* Check sanity of left child page.
6457     */
6458     if( !pPage->leaf ){
6459       pgno = get4byte(pCell);
6460 #ifndef SQLITE_OMIT_AUTOVACUUM
6461       if( pBt->autoVacuum ){
6462         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6463       }
6464 #endif
6465       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6466       if( i>0 && d2!=depth ){
6467         checkAppendMsg(pCheck, zContext, "Child page depth differs");
6468       }
6469       depth = d2;
6470     }
6471   }
6472   if( !pPage->leaf ){
6473     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6474     sqlite3_snprintf(sizeof(zContext), zContext,
6475                      "On page %d at right child: ", iPage);
6476 #ifndef SQLITE_OMIT_AUTOVACUUM
6477     if( pBt->autoVacuum ){
6478       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6479     }
6480 #endif
6481     checkTreePage(pCheck, pgno, pPage, zContext);
6482   }
6483 
6484   /* Check for complete coverage of the page
6485   */
6486   data = pPage->aData;
6487   hdr = pPage->hdrOffset;
6488   hit = sqlite3MallocZero( usableSize );
6489   if( hit ){
6490     memset(hit, 1, get2byte(&data[hdr+5]));
6491     nCell = get2byte(&data[hdr+3]);
6492     cellStart = hdr + 12 - 4*pPage->leaf;
6493     for(i=0; i<nCell; i++){
6494       int pc = get2byte(&data[cellStart+i*2]);
6495       int size = cellSizePtr(pPage, &data[pc]);
6496       int j;
6497       if( (pc+size-1)>=usableSize || pc<0 ){
6498         checkAppendMsg(pCheck, 0,
6499             "Corruption detected in cell %d on page %d",i,iPage,0);
6500       }else{
6501         for(j=pc+size-1; j>=pc; j--) hit[j]++;
6502       }
6503     }
6504     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6505            cnt++){
6506       int size = get2byte(&data[i+2]);
6507       int j;
6508       if( (i+size-1)>=usableSize || i<0 ){
6509         checkAppendMsg(pCheck, 0,
6510             "Corruption detected in cell %d on page %d",i,iPage,0);
6511       }else{
6512         for(j=i+size-1; j>=i; j--) hit[j]++;
6513       }
6514       i = get2byte(&data[i]);
6515     }
6516     for(i=cnt=0; i<usableSize; i++){
6517       if( hit[i]==0 ){
6518         cnt++;
6519       }else if( hit[i]>1 ){
6520         checkAppendMsg(pCheck, 0,
6521           "Multiple uses for byte %d of page %d", i, iPage);
6522         break;
6523       }
6524     }
6525     if( cnt!=data[hdr+7] ){
6526       checkAppendMsg(pCheck, 0,
6527           "Fragmented space is %d byte reported as %d on page %d",
6528           cnt, data[hdr+7], iPage);
6529     }
6530   }
6531   sqlite3_free(hit);
6532 
6533   releasePage(pPage);
6534   return depth+1;
6535 }
6536 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6537 
6538 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6539 /*
6540 ** This routine does a complete check of the given BTree file.  aRoot[] is
6541 ** an array of pages numbers were each page number is the root page of
6542 ** a table.  nRoot is the number of entries in aRoot.
6543 **
6544 ** If everything checks out, this routine returns NULL.  If something is
6545 ** amiss, an error message is written into memory obtained from malloc()
6546 ** and a pointer to that error message is returned.  The calling function
6547 ** is responsible for freeing the error message when it is done.
6548 */
6549 char *sqlite3BtreeIntegrityCheck(
6550   Btree *p,     /* The btree to be checked */
6551   int *aRoot,   /* An array of root pages numbers for individual trees */
6552   int nRoot,    /* Number of entries in aRoot[] */
6553   int mxErr,    /* Stop reporting errors after this many */
6554   int *pnErr    /* Write number of errors seen to this variable */
6555 ){
6556   int i;
6557   int nRef;
6558   IntegrityCk sCheck;
6559   BtShared *pBt = p->pBt;
6560 
6561   sqlite3BtreeEnter(p);
6562   pBt->db = p->db;
6563   nRef = sqlite3PagerRefcount(pBt->pPager);
6564   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6565     sqlite3BtreeLeave(p);
6566     return sqlite3StrDup("Unable to acquire a read lock on the database");
6567   }
6568   sCheck.pBt = pBt;
6569   sCheck.pPager = pBt->pPager;
6570   sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager);
6571   sCheck.mxErr = mxErr;
6572   sCheck.nErr = 0;
6573   *pnErr = 0;
6574 #ifndef SQLITE_OMIT_AUTOVACUUM
6575   if( pBt->nTrunc!=0 ){
6576     sCheck.nPage = pBt->nTrunc;
6577   }
6578 #endif
6579   if( sCheck.nPage==0 ){
6580     unlockBtreeIfUnused(pBt);
6581     sqlite3BtreeLeave(p);
6582     return 0;
6583   }
6584   sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6585   if( !sCheck.anRef ){
6586     unlockBtreeIfUnused(pBt);
6587     *pnErr = 1;
6588     sqlite3BtreeLeave(p);
6589     return sqlite3MPrintf(p->db, "Unable to malloc %d bytes",
6590         (sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
6591   }
6592   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6593   i = PENDING_BYTE_PAGE(pBt);
6594   if( i<=sCheck.nPage ){
6595     sCheck.anRef[i] = 1;
6596   }
6597   sCheck.zErrMsg = 0;
6598 
6599   /* Check the integrity of the freelist
6600   */
6601   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6602             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6603 
6604   /* Check all the tables.
6605   */
6606   for(i=0; i<nRoot && sCheck.mxErr; i++){
6607     if( aRoot[i]==0 ) continue;
6608 #ifndef SQLITE_OMIT_AUTOVACUUM
6609     if( pBt->autoVacuum && aRoot[i]>1 ){
6610       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6611     }
6612 #endif
6613     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6614   }
6615 
6616   /* Make sure every page in the file is referenced
6617   */
6618   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6619 #ifdef SQLITE_OMIT_AUTOVACUUM
6620     if( sCheck.anRef[i]==0 ){
6621       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6622     }
6623 #else
6624     /* If the database supports auto-vacuum, make sure no tables contain
6625     ** references to pointer-map pages.
6626     */
6627     if( sCheck.anRef[i]==0 &&
6628        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6629       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6630     }
6631     if( sCheck.anRef[i]!=0 &&
6632        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6633       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6634     }
6635 #endif
6636   }
6637 
6638   /* Make sure this analysis did not leave any unref() pages
6639   */
6640   unlockBtreeIfUnused(pBt);
6641   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6642     checkAppendMsg(&sCheck, 0,
6643       "Outstanding page count goes from %d to %d during this analysis",
6644       nRef, sqlite3PagerRefcount(pBt->pPager)
6645     );
6646   }
6647 
6648   /* Clean  up and report errors.
6649   */
6650   sqlite3BtreeLeave(p);
6651   sqlite3_free(sCheck.anRef);
6652   *pnErr = sCheck.nErr;
6653   return sCheck.zErrMsg;
6654 }
6655 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6656 
6657 /*
6658 ** Return the full pathname of the underlying database file.
6659 **
6660 ** The pager filename is invariant as long as the pager is
6661 ** open so it is safe to access without the BtShared mutex.
6662 */
6663 const char *sqlite3BtreeGetFilename(Btree *p){
6664   assert( p->pBt->pPager!=0 );
6665   return sqlite3PagerFilename(p->pBt->pPager);
6666 }
6667 
6668 /*
6669 ** Return the pathname of the directory that contains the database file.
6670 **
6671 ** The pager directory name is invariant as long as the pager is
6672 ** open so it is safe to access without the BtShared mutex.
6673 */
6674 const char *sqlite3BtreeGetDirname(Btree *p){
6675   assert( p->pBt->pPager!=0 );
6676   return sqlite3PagerDirname(p->pBt->pPager);
6677 }
6678 
6679 /*
6680 ** Return the pathname of the journal file for this database. The return
6681 ** value of this routine is the same regardless of whether the journal file
6682 ** has been created or not.
6683 **
6684 ** The pager journal filename is invariant as long as the pager is
6685 ** open so it is safe to access without the BtShared mutex.
6686 */
6687 const char *sqlite3BtreeGetJournalname(Btree *p){
6688   assert( p->pBt->pPager!=0 );
6689   return sqlite3PagerJournalname(p->pBt->pPager);
6690 }
6691 
6692 #ifndef SQLITE_OMIT_VACUUM
6693 /*
6694 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
6695 ** must be active for both files.
6696 **
6697 ** The size of file pBtFrom may be reduced by this operation.
6698 ** If anything goes wrong, the transaction on pBtFrom is rolled back.
6699 */
6700 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
6701   int rc = SQLITE_OK;
6702   Pgno i, nPage, nToPage, iSkip;
6703 
6704   BtShared *pBtTo = pTo->pBt;
6705   BtShared *pBtFrom = pFrom->pBt;
6706   pBtTo->db = pTo->db;
6707   pBtFrom->db = pFrom->db;
6708 
6709 
6710   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
6711     return SQLITE_ERROR;
6712   }
6713   if( pBtTo->pCursor ) return SQLITE_BUSY;
6714   nToPage = sqlite3PagerPagecount(pBtTo->pPager);
6715   nPage = sqlite3PagerPagecount(pBtFrom->pPager);
6716   iSkip = PENDING_BYTE_PAGE(pBtTo);
6717   for(i=1; rc==SQLITE_OK && i<=nPage; i++){
6718     DbPage *pDbPage;
6719     if( i==iSkip ) continue;
6720     rc = sqlite3PagerGet(pBtFrom->pPager, i, &pDbPage);
6721     if( rc ) break;
6722     rc = sqlite3PagerOverwrite(pBtTo->pPager, i, sqlite3PagerGetData(pDbPage));
6723     sqlite3PagerUnref(pDbPage);
6724   }
6725 
6726   /* If the file is shrinking, journal the pages that are being truncated
6727   ** so that they can be rolled back if the commit fails.
6728   */
6729   for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
6730     DbPage *pDbPage;
6731     if( i==iSkip ) continue;
6732     rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
6733     if( rc ) break;
6734     rc = sqlite3PagerWrite(pDbPage);
6735     sqlite3PagerDontWrite(pDbPage);
6736     /* Yeah.  It seems wierd to call DontWrite() right after Write().  But
6737     ** that is because the names of those procedures do not exactly
6738     ** represent what they do.  Write() really means "put this page in the
6739     ** rollback journal and mark it as dirty so that it will be written
6740     ** to the database file later."  DontWrite() undoes the second part of
6741     ** that and prevents the page from being written to the database.  The
6742     ** page is still on the rollback journal, though.  And that is the whole
6743     ** point of this loop: to put pages on the rollback journal. */
6744     sqlite3PagerUnref(pDbPage);
6745   }
6746   if( !rc && nPage<nToPage ){
6747     rc = sqlite3PagerTruncate(pBtTo->pPager, nPage);
6748   }
6749 
6750   if( rc ){
6751     sqlite3BtreeRollback(pTo);
6752   }
6753   return rc;
6754 }
6755 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
6756   int rc;
6757   sqlite3BtreeEnter(pTo);
6758   sqlite3BtreeEnter(pFrom);
6759   rc = btreeCopyFile(pTo, pFrom);
6760   sqlite3BtreeLeave(pFrom);
6761   sqlite3BtreeLeave(pTo);
6762   return rc;
6763 }
6764 
6765 #endif /* SQLITE_OMIT_VACUUM */
6766 
6767 /*
6768 ** Return non-zero if a transaction is active.
6769 */
6770 int sqlite3BtreeIsInTrans(Btree *p){
6771   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
6772   return (p && (p->inTrans==TRANS_WRITE));
6773 }
6774 
6775 /*
6776 ** Return non-zero if a statement transaction is active.
6777 */
6778 int sqlite3BtreeIsInStmt(Btree *p){
6779   assert( sqlite3BtreeHoldsMutex(p) );
6780   return (p->pBt && p->pBt->inStmt);
6781 }
6782 
6783 /*
6784 ** Return non-zero if a read (or write) transaction is active.
6785 */
6786 int sqlite3BtreeIsInReadTrans(Btree *p){
6787   assert( sqlite3_mutex_held(p->db->mutex) );
6788   return (p && (p->inTrans!=TRANS_NONE));
6789 }
6790 
6791 /*
6792 ** This function returns a pointer to a blob of memory associated with
6793 ** a single shared-btree. The memory is used by client code for its own
6794 ** purposes (for example, to store a high-level schema associated with
6795 ** the shared-btree). The btree layer manages reference counting issues.
6796 **
6797 ** The first time this is called on a shared-btree, nBytes bytes of memory
6798 ** are allocated, zeroed, and returned to the caller. For each subsequent
6799 ** call the nBytes parameter is ignored and a pointer to the same blob
6800 ** of memory returned.
6801 **
6802 ** Just before the shared-btree is closed, the function passed as the
6803 ** xFree argument when the memory allocation was made is invoked on the
6804 ** blob of allocated memory. This function should not call sqlite3_free()
6805 ** on the memory, the btree layer does that.
6806 */
6807 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
6808   BtShared *pBt = p->pBt;
6809   sqlite3BtreeEnter(p);
6810   if( !pBt->pSchema ){
6811     pBt->pSchema = sqlite3MallocZero(nBytes);
6812     pBt->xFreeSchema = xFree;
6813   }
6814   sqlite3BtreeLeave(p);
6815   return pBt->pSchema;
6816 }
6817 
6818 /*
6819 ** Return true if another user of the same shared btree as the argument
6820 ** handle holds an exclusive lock on the sqlite_master table.
6821 */
6822 int sqlite3BtreeSchemaLocked(Btree *p){
6823   int rc;
6824   assert( sqlite3_mutex_held(p->db->mutex) );
6825   sqlite3BtreeEnter(p);
6826   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
6827   sqlite3BtreeLeave(p);
6828   return rc;
6829 }
6830 
6831 
6832 #ifndef SQLITE_OMIT_SHARED_CACHE
6833 /*
6834 ** Obtain a lock on the table whose root page is iTab.  The
6835 ** lock is a write lock if isWritelock is true or a read lock
6836 ** if it is false.
6837 */
6838 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
6839   int rc = SQLITE_OK;
6840   u8 lockType = (isWriteLock?WRITE_LOCK:READ_LOCK);
6841   sqlite3BtreeEnter(p);
6842   rc = queryTableLock(p, iTab, lockType);
6843   if( rc==SQLITE_OK ){
6844     rc = lockTable(p, iTab, lockType);
6845   }
6846   sqlite3BtreeLeave(p);
6847   return rc;
6848 }
6849 #endif
6850 
6851 #ifndef SQLITE_OMIT_INCRBLOB
6852 /*
6853 ** Argument pCsr must be a cursor opened for writing on an
6854 ** INTKEY table currently pointing at a valid table entry.
6855 ** This function modifies the data stored as part of that entry.
6856 ** Only the data content may only be modified, it is not possible
6857 ** to change the length of the data stored.
6858 */
6859 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
6860   assert( cursorHoldsMutex(pCsr) );
6861   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
6862   assert(pCsr->isIncrblobHandle);
6863   if( pCsr->eState>=CURSOR_REQUIRESEEK ){
6864     if( pCsr->eState==CURSOR_FAULT ){
6865       return pCsr->skip;
6866     }else{
6867       return SQLITE_ABORT;
6868     }
6869   }
6870 
6871   /* Check some preconditions:
6872   **   (a) the cursor is open for writing,
6873   **   (b) there is no read-lock on the table being modified and
6874   **   (c) the cursor points at a valid row of an intKey table.
6875   */
6876   if( !pCsr->wrFlag ){
6877     return SQLITE_READONLY;
6878   }
6879   assert( !pCsr->pBt->readOnly
6880           && pCsr->pBt->inTransaction==TRANS_WRITE );
6881   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){
6882     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
6883   }
6884   if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){
6885     return SQLITE_ERROR;
6886   }
6887 
6888   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
6889 }
6890 
6891 /*
6892 ** Set a flag on this cursor to cache the locations of pages from the
6893 ** overflow list for the current row. This is used by cursors opened
6894 ** for incremental blob IO only.
6895 **
6896 ** This function sets a flag only. The actual page location cache
6897 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
6898 ** accessPayload() (the worker function for sqlite3BtreeData() and
6899 ** sqlite3BtreePutData()).
6900 */
6901 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
6902   assert( cursorHoldsMutex(pCur) );
6903   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
6904   assert(!pCur->isIncrblobHandle);
6905   assert(!pCur->aOverflow);
6906   pCur->isIncrblobHandle = 1;
6907 }
6908 #endif
6909