xref: /sqlite-3.40.0/src/btree.c (revision 8a29dfde)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.451 2008/04/03 21:46:57 drh Exp $
13 **
14 ** This file implements a external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
17 */
18 #include "btreeInt.h"
19 
20 /*
21 ** The header string that appears at the beginning of every
22 ** SQLite database.
23 */
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25 
26 /*
27 ** Set this global variable to 1 to enable tracing using the TRACE
28 ** macro.
29 */
30 #if SQLITE_TEST
31 int sqlite3BtreeTrace=0;  /* True to enable tracing */
32 #endif
33 
34 
35 
36 #ifndef SQLITE_OMIT_SHARED_CACHE
37 /*
38 ** A flag to indicate whether or not shared cache is enabled.  Also,
39 ** a list of BtShared objects that are eligible for participation
40 ** in shared cache.  The variables have file scope during normal builds,
41 ** but the test harness needs to access these variables so we make them
42 ** global for test builds.
43 */
44 #ifdef SQLITE_TEST
45 BtShared *sqlite3SharedCacheList = 0;
46 int sqlite3SharedCacheEnabled = 0;
47 #else
48 static BtShared *sqlite3SharedCacheList = 0;
49 static int sqlite3SharedCacheEnabled = 0;
50 #endif
51 #endif /* SQLITE_OMIT_SHARED_CACHE */
52 
53 #ifndef SQLITE_OMIT_SHARED_CACHE
54 /*
55 ** Enable or disable the shared pager and schema features.
56 **
57 ** This routine has no effect on existing database connections.
58 ** The shared cache setting effects only future calls to
59 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
60 */
61 int sqlite3_enable_shared_cache(int enable){
62   sqlite3SharedCacheEnabled = enable;
63   return SQLITE_OK;
64 }
65 #endif
66 
67 
68 /*
69 ** Forward declaration
70 */
71 static int checkReadLocks(Btree*,Pgno,BtCursor*);
72 
73 
74 #ifdef SQLITE_OMIT_SHARED_CACHE
75   /*
76   ** The functions queryTableLock(), lockTable() and unlockAllTables()
77   ** manipulate entries in the BtShared.pLock linked list used to store
78   ** shared-cache table level locks. If the library is compiled with the
79   ** shared-cache feature disabled, then there is only ever one user
80   ** of each BtShared structure and so this locking is not necessary.
81   ** So define the lock related functions as no-ops.
82   */
83   #define queryTableLock(a,b,c) SQLITE_OK
84   #define lockTable(a,b,c) SQLITE_OK
85   #define unlockAllTables(a)
86 #endif
87 
88 #ifndef SQLITE_OMIT_SHARED_CACHE
89 /*
90 ** Query to see if btree handle p may obtain a lock of type eLock
91 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
92 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
93 ** SQLITE_LOCKED if not.
94 */
95 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
96   BtShared *pBt = p->pBt;
97   BtLock *pIter;
98 
99   assert( sqlite3BtreeHoldsMutex(p) );
100 
101   /* This is a no-op if the shared-cache is not enabled */
102   if( !p->sharable ){
103     return SQLITE_OK;
104   }
105 
106   /* If some other connection is holding an exclusive lock, the
107   ** requested lock may not be obtained.
108   */
109   if( pBt->pExclusive && pBt->pExclusive!=p ){
110     return SQLITE_LOCKED;
111   }
112 
113   /* This (along with lockTable()) is where the ReadUncommitted flag is
114   ** dealt with. If the caller is querying for a read-lock and the flag is
115   ** set, it is unconditionally granted - even if there are write-locks
116   ** on the table. If a write-lock is requested, the ReadUncommitted flag
117   ** is not considered.
118   **
119   ** In function lockTable(), if a read-lock is demanded and the
120   ** ReadUncommitted flag is set, no entry is added to the locks list
121   ** (BtShared.pLock).
122   **
123   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
124   ** not create or respect table locks. The locking procedure for a
125   ** write-cursor does not change.
126   */
127   if(
128     !p->db ||
129     0==(p->db->flags&SQLITE_ReadUncommitted) ||
130     eLock==WRITE_LOCK ||
131     iTab==MASTER_ROOT
132   ){
133     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
134       if( pIter->pBtree!=p && pIter->iTable==iTab &&
135           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
136         return SQLITE_LOCKED;
137       }
138     }
139   }
140   return SQLITE_OK;
141 }
142 #endif /* !SQLITE_OMIT_SHARED_CACHE */
143 
144 #ifndef SQLITE_OMIT_SHARED_CACHE
145 /*
146 ** Add a lock on the table with root-page iTable to the shared-btree used
147 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
148 ** WRITE_LOCK.
149 **
150 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
151 ** SQLITE_NOMEM may also be returned.
152 */
153 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
154   BtShared *pBt = p->pBt;
155   BtLock *pLock = 0;
156   BtLock *pIter;
157 
158   assert( sqlite3BtreeHoldsMutex(p) );
159 
160   /* This is a no-op if the shared-cache is not enabled */
161   if( !p->sharable ){
162     return SQLITE_OK;
163   }
164 
165   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
166 
167   /* If the read-uncommitted flag is set and a read-lock is requested,
168   ** return early without adding an entry to the BtShared.pLock list. See
169   ** comment in function queryTableLock() for more info on handling
170   ** the ReadUncommitted flag.
171   */
172   if(
173     (p->db) &&
174     (p->db->flags&SQLITE_ReadUncommitted) &&
175     (eLock==READ_LOCK) &&
176     iTable!=MASTER_ROOT
177   ){
178     return SQLITE_OK;
179   }
180 
181   /* First search the list for an existing lock on this table. */
182   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
183     if( pIter->iTable==iTable && pIter->pBtree==p ){
184       pLock = pIter;
185       break;
186     }
187   }
188 
189   /* If the above search did not find a BtLock struct associating Btree p
190   ** with table iTable, allocate one and link it into the list.
191   */
192   if( !pLock ){
193     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
194     if( !pLock ){
195       return SQLITE_NOMEM;
196     }
197     pLock->iTable = iTable;
198     pLock->pBtree = p;
199     pLock->pNext = pBt->pLock;
200     pBt->pLock = pLock;
201   }
202 
203   /* Set the BtLock.eLock variable to the maximum of the current lock
204   ** and the requested lock. This means if a write-lock was already held
205   ** and a read-lock requested, we don't incorrectly downgrade the lock.
206   */
207   assert( WRITE_LOCK>READ_LOCK );
208   if( eLock>pLock->eLock ){
209     pLock->eLock = eLock;
210   }
211 
212   return SQLITE_OK;
213 }
214 #endif /* !SQLITE_OMIT_SHARED_CACHE */
215 
216 #ifndef SQLITE_OMIT_SHARED_CACHE
217 /*
218 ** Release all the table locks (locks obtained via calls to the lockTable()
219 ** procedure) held by Btree handle p.
220 */
221 static void unlockAllTables(Btree *p){
222   BtShared *pBt = p->pBt;
223   BtLock **ppIter = &pBt->pLock;
224 
225   assert( sqlite3BtreeHoldsMutex(p) );
226   assert( p->sharable || 0==*ppIter );
227 
228   while( *ppIter ){
229     BtLock *pLock = *ppIter;
230     assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
231     if( pLock->pBtree==p ){
232       *ppIter = pLock->pNext;
233       sqlite3_free(pLock);
234     }else{
235       ppIter = &pLock->pNext;
236     }
237   }
238 
239   if( pBt->pExclusive==p ){
240     pBt->pExclusive = 0;
241   }
242 }
243 #endif /* SQLITE_OMIT_SHARED_CACHE */
244 
245 static void releasePage(MemPage *pPage);  /* Forward reference */
246 
247 /*
248 ** Verify that the cursor holds a mutex on the BtShared
249 */
250 #ifndef NDEBUG
251 static int cursorHoldsMutex(BtCursor *p){
252   return sqlite3_mutex_held(p->pBt->mutex);
253 }
254 #endif
255 
256 
257 #ifndef SQLITE_OMIT_INCRBLOB
258 /*
259 ** Invalidate the overflow page-list cache for cursor pCur, if any.
260 */
261 static void invalidateOverflowCache(BtCursor *pCur){
262   assert( cursorHoldsMutex(pCur) );
263   sqlite3_free(pCur->aOverflow);
264   pCur->aOverflow = 0;
265 }
266 
267 /*
268 ** Invalidate the overflow page-list cache for all cursors opened
269 ** on the shared btree structure pBt.
270 */
271 static void invalidateAllOverflowCache(BtShared *pBt){
272   BtCursor *p;
273   assert( sqlite3_mutex_held(pBt->mutex) );
274   for(p=pBt->pCursor; p; p=p->pNext){
275     invalidateOverflowCache(p);
276   }
277 }
278 #else
279   #define invalidateOverflowCache(x)
280   #define invalidateAllOverflowCache(x)
281 #endif
282 
283 /*
284 ** Save the current cursor position in the variables BtCursor.nKey
285 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
286 */
287 static int saveCursorPosition(BtCursor *pCur){
288   int rc;
289 
290   assert( CURSOR_VALID==pCur->eState );
291   assert( 0==pCur->pKey );
292   assert( cursorHoldsMutex(pCur) );
293 
294   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
295 
296   /* If this is an intKey table, then the above call to BtreeKeySize()
297   ** stores the integer key in pCur->nKey. In this case this value is
298   ** all that is required. Otherwise, if pCur is not open on an intKey
299   ** table, then malloc space for and store the pCur->nKey bytes of key
300   ** data.
301   */
302   if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
303     void *pKey = sqlite3_malloc(pCur->nKey);
304     if( pKey ){
305       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
306       if( rc==SQLITE_OK ){
307         pCur->pKey = pKey;
308       }else{
309         sqlite3_free(pKey);
310       }
311     }else{
312       rc = SQLITE_NOMEM;
313     }
314   }
315   assert( !pCur->pPage->intKey || !pCur->pKey );
316 
317   if( rc==SQLITE_OK ){
318     releasePage(pCur->pPage);
319     pCur->pPage = 0;
320     pCur->eState = CURSOR_REQUIRESEEK;
321   }
322 
323   invalidateOverflowCache(pCur);
324   return rc;
325 }
326 
327 /*
328 ** Save the positions of all cursors except pExcept open on the table
329 ** with root-page iRoot. Usually, this is called just before cursor
330 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
331 */
332 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
333   BtCursor *p;
334   assert( sqlite3_mutex_held(pBt->mutex) );
335   assert( pExcept==0 || pExcept->pBt==pBt );
336   for(p=pBt->pCursor; p; p=p->pNext){
337     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
338         p->eState==CURSOR_VALID ){
339       int rc = saveCursorPosition(p);
340       if( SQLITE_OK!=rc ){
341         return rc;
342       }
343     }
344   }
345   return SQLITE_OK;
346 }
347 
348 /*
349 ** Clear the current cursor position.
350 */
351 static void clearCursorPosition(BtCursor *pCur){
352   assert( cursorHoldsMutex(pCur) );
353   sqlite3_free(pCur->pKey);
354   pCur->pKey = 0;
355   pCur->eState = CURSOR_INVALID;
356 }
357 
358 /*
359 ** Restore the cursor to the position it was in (or as close to as possible)
360 ** when saveCursorPosition() was called. Note that this call deletes the
361 ** saved position info stored by saveCursorPosition(), so there can be
362 ** at most one effective restoreOrClearCursorPosition() call after each
363 ** saveCursorPosition().
364 **
365 ** If the second argument argument - doSeek - is false, then instead of
366 ** returning the cursor to its saved position, any saved position is deleted
367 ** and the cursor state set to CURSOR_INVALID.
368 */
369 int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){
370   int rc;
371   assert( cursorHoldsMutex(pCur) );
372   assert( pCur->eState>=CURSOR_REQUIRESEEK );
373   if( pCur->eState==CURSOR_FAULT ){
374     return pCur->skip;
375   }
376 #ifndef SQLITE_OMIT_INCRBLOB
377   if( pCur->isIncrblobHandle ){
378     return SQLITE_ABORT;
379   }
380 #endif
381   pCur->eState = CURSOR_INVALID;
382   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, 0, pCur->nKey, 0, &pCur->skip);
383   if( rc==SQLITE_OK ){
384     sqlite3_free(pCur->pKey);
385     pCur->pKey = 0;
386     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
387   }
388   return rc;
389 }
390 
391 #define restoreOrClearCursorPosition(p) \
392   (p->eState>=CURSOR_REQUIRESEEK ? \
393          sqlite3BtreeRestoreOrClearCursorPosition(p) : \
394          SQLITE_OK)
395 
396 #ifndef SQLITE_OMIT_AUTOVACUUM
397 /*
398 ** Given a page number of a regular database page, return the page
399 ** number for the pointer-map page that contains the entry for the
400 ** input page number.
401 */
402 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
403   int nPagesPerMapPage, iPtrMap, ret;
404   assert( sqlite3_mutex_held(pBt->mutex) );
405   nPagesPerMapPage = (pBt->usableSize/5)+1;
406   iPtrMap = (pgno-2)/nPagesPerMapPage;
407   ret = (iPtrMap*nPagesPerMapPage) + 2;
408   if( ret==PENDING_BYTE_PAGE(pBt) ){
409     ret++;
410   }
411   return ret;
412 }
413 
414 /*
415 ** Write an entry into the pointer map.
416 **
417 ** This routine updates the pointer map entry for page number 'key'
418 ** so that it maps to type 'eType' and parent page number 'pgno'.
419 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
420 */
421 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
422   DbPage *pDbPage;  /* The pointer map page */
423   u8 *pPtrmap;      /* The pointer map data */
424   Pgno iPtrmap;     /* The pointer map page number */
425   int offset;       /* Offset in pointer map page */
426   int rc;
427 
428   assert( sqlite3_mutex_held(pBt->mutex) );
429   /* The master-journal page number must never be used as a pointer map page */
430   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
431 
432   assert( pBt->autoVacuum );
433   if( key==0 ){
434     return SQLITE_CORRUPT_BKPT;
435   }
436   iPtrmap = PTRMAP_PAGENO(pBt, key);
437   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
438   if( rc!=SQLITE_OK ){
439     return rc;
440   }
441   offset = PTRMAP_PTROFFSET(pBt, key);
442   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
443 
444   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
445     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
446     rc = sqlite3PagerWrite(pDbPage);
447     if( rc==SQLITE_OK ){
448       pPtrmap[offset] = eType;
449       put4byte(&pPtrmap[offset+1], parent);
450     }
451   }
452 
453   sqlite3PagerUnref(pDbPage);
454   return rc;
455 }
456 
457 /*
458 ** Read an entry from the pointer map.
459 **
460 ** This routine retrieves the pointer map entry for page 'key', writing
461 ** the type and parent page number to *pEType and *pPgno respectively.
462 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
463 */
464 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
465   DbPage *pDbPage;   /* The pointer map page */
466   int iPtrmap;       /* Pointer map page index */
467   u8 *pPtrmap;       /* Pointer map page data */
468   int offset;        /* Offset of entry in pointer map */
469   int rc;
470 
471   assert( sqlite3_mutex_held(pBt->mutex) );
472 
473   iPtrmap = PTRMAP_PAGENO(pBt, key);
474   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
475   if( rc!=0 ){
476     return rc;
477   }
478   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
479 
480   offset = PTRMAP_PTROFFSET(pBt, key);
481   assert( pEType!=0 );
482   *pEType = pPtrmap[offset];
483   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
484 
485   sqlite3PagerUnref(pDbPage);
486   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
487   return SQLITE_OK;
488 }
489 
490 #endif /* SQLITE_OMIT_AUTOVACUUM */
491 
492 /*
493 ** Given a btree page and a cell index (0 means the first cell on
494 ** the page, 1 means the second cell, and so forth) return a pointer
495 ** to the cell content.
496 **
497 ** This routine works only for pages that do not contain overflow cells.
498 */
499 #define findCell(pPage, iCell) \
500   ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)]))
501 #ifdef SQLITE_TEST
502 u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){
503   assert( iCell>=0 );
504   assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) );
505   return findCell(pPage, iCell);
506 }
507 #endif
508 
509 /*
510 ** This a more complex version of sqlite3BtreeFindCell() that works for
511 ** pages that do contain overflow cells.  See insert
512 */
513 static u8 *findOverflowCell(MemPage *pPage, int iCell){
514   int i;
515   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
516   for(i=pPage->nOverflow-1; i>=0; i--){
517     int k;
518     struct _OvflCell *pOvfl;
519     pOvfl = &pPage->aOvfl[i];
520     k = pOvfl->idx;
521     if( k<=iCell ){
522       if( k==iCell ){
523         return pOvfl->pCell;
524       }
525       iCell--;
526     }
527   }
528   return findCell(pPage, iCell);
529 }
530 
531 /*
532 ** Parse a cell content block and fill in the CellInfo structure.  There
533 ** are two versions of this function.  sqlite3BtreeParseCell() takes a
534 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
535 ** takes a pointer to the body of the cell as its second argument.
536 **
537 ** Within this file, the parseCell() macro can be called instead of
538 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
539 */
540 void sqlite3BtreeParseCellPtr(
541   MemPage *pPage,         /* Page containing the cell */
542   u8 *pCell,              /* Pointer to the cell text. */
543   CellInfo *pInfo         /* Fill in this structure */
544 ){
545   int n;                  /* Number bytes in cell content header */
546   u32 nPayload;           /* Number of bytes of cell payload */
547 
548   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
549 
550   pInfo->pCell = pCell;
551   assert( pPage->leaf==0 || pPage->leaf==1 );
552   n = pPage->childPtrSize;
553   assert( n==4-4*pPage->leaf );
554   if( pPage->hasData ){
555     n += getVarint32(&pCell[n], &nPayload);
556   }else{
557     nPayload = 0;
558   }
559   pInfo->nData = nPayload;
560   if( pPage->intKey ){
561     n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
562   }else{
563     u32 x;
564     n += getVarint32(&pCell[n], &x);
565     pInfo->nKey = x;
566     nPayload += x;
567   }
568   pInfo->nPayload = nPayload;
569   pInfo->nHeader = n;
570   if( nPayload<=pPage->maxLocal ){
571     /* This is the (easy) common case where the entire payload fits
572     ** on the local page.  No overflow is required.
573     */
574     int nSize;          /* Total size of cell content in bytes */
575     pInfo->nLocal = nPayload;
576     pInfo->iOverflow = 0;
577     nSize = nPayload + n;
578     if( nSize<4 ){
579       nSize = 4;        /* Minimum cell size is 4 */
580     }
581     pInfo->nSize = nSize;
582   }else{
583     /* If the payload will not fit completely on the local page, we have
584     ** to decide how much to store locally and how much to spill onto
585     ** overflow pages.  The strategy is to minimize the amount of unused
586     ** space on overflow pages while keeping the amount of local storage
587     ** in between minLocal and maxLocal.
588     **
589     ** Warning:  changing the way overflow payload is distributed in any
590     ** way will result in an incompatible file format.
591     */
592     int minLocal;  /* Minimum amount of payload held locally */
593     int maxLocal;  /* Maximum amount of payload held locally */
594     int surplus;   /* Overflow payload available for local storage */
595 
596     minLocal = pPage->minLocal;
597     maxLocal = pPage->maxLocal;
598     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
599     if( surplus <= maxLocal ){
600       pInfo->nLocal = surplus;
601     }else{
602       pInfo->nLocal = minLocal;
603     }
604     pInfo->iOverflow = pInfo->nLocal + n;
605     pInfo->nSize = pInfo->iOverflow + 4;
606   }
607 }
608 #define parseCell(pPage, iCell, pInfo) \
609   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
610 void sqlite3BtreeParseCell(
611   MemPage *pPage,         /* Page containing the cell */
612   int iCell,              /* The cell index.  First cell is 0 */
613   CellInfo *pInfo         /* Fill in this structure */
614 ){
615   parseCell(pPage, iCell, pInfo);
616 }
617 
618 /*
619 ** Compute the total number of bytes that a Cell needs in the cell
620 ** data area of the btree-page.  The return number includes the cell
621 ** data header and the local payload, but not any overflow page or
622 ** the space used by the cell pointer.
623 */
624 #ifndef NDEBUG
625 static u16 cellSize(MemPage *pPage, int iCell){
626   CellInfo info;
627   sqlite3BtreeParseCell(pPage, iCell, &info);
628   return info.nSize;
629 }
630 #endif
631 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
632   CellInfo info;
633   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
634   return info.nSize;
635 }
636 
637 #ifndef SQLITE_OMIT_AUTOVACUUM
638 /*
639 ** If the cell pCell, part of page pPage contains a pointer
640 ** to an overflow page, insert an entry into the pointer-map
641 ** for the overflow page.
642 */
643 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
644   if( pCell ){
645     CellInfo info;
646     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
647     assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
648     if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
649       Pgno ovfl = get4byte(&pCell[info.iOverflow]);
650       return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
651     }
652   }
653   return SQLITE_OK;
654 }
655 /*
656 ** If the cell with index iCell on page pPage contains a pointer
657 ** to an overflow page, insert an entry into the pointer-map
658 ** for the overflow page.
659 */
660 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
661   u8 *pCell;
662   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
663   pCell = findOverflowCell(pPage, iCell);
664   return ptrmapPutOvflPtr(pPage, pCell);
665 }
666 #endif
667 
668 
669 /*
670 ** Defragment the page given.  All Cells are moved to the
671 ** end of the page and all free space is collected into one
672 ** big FreeBlk that occurs in between the header and cell
673 ** pointer array and the cell content area.
674 */
675 static int defragmentPage(MemPage *pPage){
676   int i;                     /* Loop counter */
677   int pc;                    /* Address of a i-th cell */
678   int addr;                  /* Offset of first byte after cell pointer array */
679   int hdr;                   /* Offset to the page header */
680   int size;                  /* Size of a cell */
681   int usableSize;            /* Number of usable bytes on a page */
682   int cellOffset;            /* Offset to the cell pointer array */
683   int brk;                   /* Offset to the cell content area */
684   int nCell;                 /* Number of cells on the page */
685   unsigned char *data;       /* The page data */
686   unsigned char *temp;       /* Temp area for cell content */
687 
688   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
689   assert( pPage->pBt!=0 );
690   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
691   assert( pPage->nOverflow==0 );
692   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
693   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
694   data = pPage->aData;
695   hdr = pPage->hdrOffset;
696   cellOffset = pPage->cellOffset;
697   nCell = pPage->nCell;
698   assert( nCell==get2byte(&data[hdr+3]) );
699   usableSize = pPage->pBt->usableSize;
700   brk = get2byte(&data[hdr+5]);
701   memcpy(&temp[brk], &data[brk], usableSize - brk);
702   brk = usableSize;
703   for(i=0; i<nCell; i++){
704     u8 *pAddr;     /* The i-th cell pointer */
705     pAddr = &data[cellOffset + i*2];
706     pc = get2byte(pAddr);
707     assert( pc<pPage->pBt->usableSize );
708     size = cellSizePtr(pPage, &temp[pc]);
709     brk -= size;
710     memcpy(&data[brk], &temp[pc], size);
711     put2byte(pAddr, brk);
712   }
713   assert( brk>=cellOffset+2*nCell );
714   put2byte(&data[hdr+5], brk);
715   data[hdr+1] = 0;
716   data[hdr+2] = 0;
717   data[hdr+7] = 0;
718   addr = cellOffset+2*nCell;
719   memset(&data[addr], 0, brk-addr);
720   return SQLITE_OK;
721 }
722 
723 /*
724 ** Allocate nByte bytes of space on a page.
725 **
726 ** Return the index into pPage->aData[] of the first byte of
727 ** the new allocation. Or return 0 if there is not enough free
728 ** space on the page to satisfy the allocation request.
729 **
730 ** If the page contains nBytes of free space but does not contain
731 ** nBytes of contiguous free space, then this routine automatically
732 ** calls defragementPage() to consolidate all free space before
733 ** allocating the new chunk.
734 */
735 static int allocateSpace(MemPage *pPage, int nByte){
736   int addr, pc, hdr;
737   int size;
738   int nFrag;
739   int top;
740   int nCell;
741   int cellOffset;
742   unsigned char *data;
743 
744   data = pPage->aData;
745   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
746   assert( pPage->pBt );
747   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
748   if( nByte<4 ) nByte = 4;
749   if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
750   pPage->nFree -= nByte;
751   hdr = pPage->hdrOffset;
752 
753   nFrag = data[hdr+7];
754   if( nFrag<60 ){
755     /* Search the freelist looking for a slot big enough to satisfy the
756     ** space request. */
757     addr = hdr+1;
758     while( (pc = get2byte(&data[addr]))>0 ){
759       size = get2byte(&data[pc+2]);
760       if( size>=nByte ){
761         if( size<nByte+4 ){
762           memcpy(&data[addr], &data[pc], 2);
763           data[hdr+7] = nFrag + size - nByte;
764           return pc;
765         }else{
766           put2byte(&data[pc+2], size-nByte);
767           return pc + size - nByte;
768         }
769       }
770       addr = pc;
771     }
772   }
773 
774   /* Allocate memory from the gap in between the cell pointer array
775   ** and the cell content area.
776   */
777   top = get2byte(&data[hdr+5]);
778   nCell = get2byte(&data[hdr+3]);
779   cellOffset = pPage->cellOffset;
780   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
781     if( defragmentPage(pPage) ) return 0;
782     top = get2byte(&data[hdr+5]);
783   }
784   top -= nByte;
785   assert( cellOffset + 2*nCell <= top );
786   put2byte(&data[hdr+5], top);
787   return top;
788 }
789 
790 /*
791 ** Return a section of the pPage->aData to the freelist.
792 ** The first byte of the new free block is pPage->aDisk[start]
793 ** and the size of the block is "size" bytes.
794 **
795 ** Most of the effort here is involved in coalesing adjacent
796 ** free blocks into a single big free block.
797 */
798 static void freeSpace(MemPage *pPage, int start, int size){
799   int addr, pbegin, hdr;
800   unsigned char *data = pPage->aData;
801 
802   assert( pPage->pBt!=0 );
803   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
804   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
805   assert( (start + size)<=pPage->pBt->usableSize );
806   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
807   if( size<4 ) size = 4;
808 
809 #ifdef SQLITE_SECURE_DELETE
810   /* Overwrite deleted information with zeros when the SECURE_DELETE
811   ** option is enabled at compile-time */
812   memset(&data[start], 0, size);
813 #endif
814 
815   /* Add the space back into the linked list of freeblocks */
816   hdr = pPage->hdrOffset;
817   addr = hdr + 1;
818   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
819     assert( pbegin<=pPage->pBt->usableSize-4 );
820     assert( pbegin>addr );
821     addr = pbegin;
822   }
823   assert( pbegin<=pPage->pBt->usableSize-4 );
824   assert( pbegin>addr || pbegin==0 );
825   put2byte(&data[addr], start);
826   put2byte(&data[start], pbegin);
827   put2byte(&data[start+2], size);
828   pPage->nFree += size;
829 
830   /* Coalesce adjacent free blocks */
831   addr = pPage->hdrOffset + 1;
832   while( (pbegin = get2byte(&data[addr]))>0 ){
833     int pnext, psize;
834     assert( pbegin>addr );
835     assert( pbegin<=pPage->pBt->usableSize-4 );
836     pnext = get2byte(&data[pbegin]);
837     psize = get2byte(&data[pbegin+2]);
838     if( pbegin + psize + 3 >= pnext && pnext>0 ){
839       int frag = pnext - (pbegin+psize);
840       assert( frag<=data[pPage->hdrOffset+7] );
841       data[pPage->hdrOffset+7] -= frag;
842       put2byte(&data[pbegin], get2byte(&data[pnext]));
843       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
844     }else{
845       addr = pbegin;
846     }
847   }
848 
849   /* If the cell content area begins with a freeblock, remove it. */
850   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
851     int top;
852     pbegin = get2byte(&data[hdr+1]);
853     memcpy(&data[hdr+1], &data[pbegin], 2);
854     top = get2byte(&data[hdr+5]);
855     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
856   }
857 }
858 
859 /*
860 ** Decode the flags byte (the first byte of the header) for a page
861 ** and initialize fields of the MemPage structure accordingly.
862 */
863 static void decodeFlags(MemPage *pPage, int flagByte){
864   BtShared *pBt;     /* A copy of pPage->pBt */
865 
866   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
867   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
868   pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
869   pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
870   pPage->leaf = (flagByte & PTF_LEAF)!=0;
871   pPage->childPtrSize = 4*(pPage->leaf==0);
872   pBt = pPage->pBt;
873   if( flagByte & PTF_LEAFDATA ){
874     pPage->leafData = 1;
875     pPage->maxLocal = pBt->maxLeaf;
876     pPage->minLocal = pBt->minLeaf;
877   }else{
878     pPage->leafData = 0;
879     pPage->maxLocal = pBt->maxLocal;
880     pPage->minLocal = pBt->minLocal;
881   }
882   pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
883 }
884 
885 /*
886 ** Initialize the auxiliary information for a disk block.
887 **
888 ** The pParent parameter must be a pointer to the MemPage which
889 ** is the parent of the page being initialized.  The root of a
890 ** BTree has no parent and so for that page, pParent==NULL.
891 **
892 ** Return SQLITE_OK on success.  If we see that the page does
893 ** not contain a well-formed database page, then return
894 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
895 ** guarantee that the page is well-formed.  It only shows that
896 ** we failed to detect any corruption.
897 */
898 int sqlite3BtreeInitPage(
899   MemPage *pPage,        /* The page to be initialized */
900   MemPage *pParent       /* The parent.  Might be NULL */
901 ){
902   int pc;            /* Address of a freeblock within pPage->aData[] */
903   int hdr;           /* Offset to beginning of page header */
904   u8 *data;          /* Equal to pPage->aData */
905   BtShared *pBt;        /* The main btree structure */
906   int usableSize;    /* Amount of usable space on each page */
907   int cellOffset;    /* Offset from start of page to first cell pointer */
908   int nFree;         /* Number of unused bytes on the page */
909   int top;           /* First byte of the cell content area */
910 
911   pBt = pPage->pBt;
912   assert( pBt!=0 );
913   assert( pParent==0 || pParent->pBt==pBt );
914   assert( sqlite3_mutex_held(pBt->mutex) );
915   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
916   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
917   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
918   if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
919     /* The parent page should never change unless the file is corrupt */
920     return SQLITE_CORRUPT_BKPT;
921   }
922   if( pPage->isInit ) return SQLITE_OK;
923   if( pPage->pParent==0 && pParent!=0 ){
924     pPage->pParent = pParent;
925     sqlite3PagerRef(pParent->pDbPage);
926   }
927   hdr = pPage->hdrOffset;
928   data = pPage->aData;
929   decodeFlags(pPage, data[hdr]);
930   pPage->nOverflow = 0;
931   pPage->idxShift = 0;
932   usableSize = pBt->usableSize;
933   pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
934   top = get2byte(&data[hdr+5]);
935   pPage->nCell = get2byte(&data[hdr+3]);
936   if( pPage->nCell>MX_CELL(pBt) ){
937     /* To many cells for a single page.  The page must be corrupt */
938     return SQLITE_CORRUPT_BKPT;
939   }
940   if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
941     /* All pages must have at least one cell, except for root pages */
942     return SQLITE_CORRUPT_BKPT;
943   }
944 
945   /* Compute the total free space on the page */
946   pc = get2byte(&data[hdr+1]);
947   nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
948   while( pc>0 ){
949     int next, size;
950     if( pc>usableSize-4 ){
951       /* Free block is off the page */
952       return SQLITE_CORRUPT_BKPT;
953     }
954     next = get2byte(&data[pc]);
955     size = get2byte(&data[pc+2]);
956     if( next>0 && next<=pc+size+3 ){
957       /* Free blocks must be in accending order */
958       return SQLITE_CORRUPT_BKPT;
959     }
960     nFree += size;
961     pc = next;
962   }
963   pPage->nFree = nFree;
964   if( nFree>=usableSize ){
965     /* Free space cannot exceed total page size */
966     return SQLITE_CORRUPT_BKPT;
967   }
968 
969   pPage->isInit = 1;
970   return SQLITE_OK;
971 }
972 
973 /*
974 ** Set up a raw page so that it looks like a database page holding
975 ** no entries.
976 */
977 static void zeroPage(MemPage *pPage, int flags){
978   unsigned char *data = pPage->aData;
979   BtShared *pBt = pPage->pBt;
980   int hdr = pPage->hdrOffset;
981   int first;
982 
983   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
984   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
985   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
986   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
987   assert( sqlite3_mutex_held(pBt->mutex) );
988   memset(&data[hdr], 0, pBt->usableSize - hdr);
989   data[hdr] = flags;
990   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
991   memset(&data[hdr+1], 0, 4);
992   data[hdr+7] = 0;
993   put2byte(&data[hdr+5], pBt->usableSize);
994   pPage->nFree = pBt->usableSize - first;
995   decodeFlags(pPage, flags);
996   pPage->hdrOffset = hdr;
997   pPage->cellOffset = first;
998   pPage->nOverflow = 0;
999   pPage->idxShift = 0;
1000   pPage->nCell = 0;
1001   pPage->isInit = 1;
1002 }
1003 
1004 /*
1005 ** Get a page from the pager.  Initialize the MemPage.pBt and
1006 ** MemPage.aData elements if needed.
1007 **
1008 ** If the noContent flag is set, it means that we do not care about
1009 ** the content of the page at this time.  So do not go to the disk
1010 ** to fetch the content.  Just fill in the content with zeros for now.
1011 ** If in the future we call sqlite3PagerWrite() on this page, that
1012 ** means we have started to be concerned about content and the disk
1013 ** read should occur at that point.
1014 */
1015 int sqlite3BtreeGetPage(
1016   BtShared *pBt,       /* The btree */
1017   Pgno pgno,           /* Number of the page to fetch */
1018   MemPage **ppPage,    /* Return the page in this parameter */
1019   int noContent        /* Do not load page content if true */
1020 ){
1021   int rc;
1022   MemPage *pPage;
1023   DbPage *pDbPage;
1024 
1025   assert( sqlite3_mutex_held(pBt->mutex) );
1026   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1027   if( rc ) return rc;
1028   pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
1029   pPage->aData = sqlite3PagerGetData(pDbPage);
1030   pPage->pDbPage = pDbPage;
1031   pPage->pBt = pBt;
1032   pPage->pgno = pgno;
1033   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1034   *ppPage = pPage;
1035   return SQLITE_OK;
1036 }
1037 
1038 /*
1039 ** Get a page from the pager and initialize it.  This routine
1040 ** is just a convenience wrapper around separate calls to
1041 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1042 */
1043 static int getAndInitPage(
1044   BtShared *pBt,          /* The database file */
1045   Pgno pgno,           /* Number of the page to get */
1046   MemPage **ppPage,    /* Write the page pointer here */
1047   MemPage *pParent     /* Parent of the page */
1048 ){
1049   int rc;
1050   assert( sqlite3_mutex_held(pBt->mutex) );
1051   if( pgno==0 ){
1052     return SQLITE_CORRUPT_BKPT;
1053   }
1054   rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1055   if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1056     rc = sqlite3BtreeInitPage(*ppPage, pParent);
1057   }
1058   return rc;
1059 }
1060 
1061 /*
1062 ** Release a MemPage.  This should be called once for each prior
1063 ** call to sqlite3BtreeGetPage.
1064 */
1065 static void releasePage(MemPage *pPage){
1066   if( pPage ){
1067     assert( pPage->aData );
1068     assert( pPage->pBt );
1069     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1070     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1071     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1072     sqlite3PagerUnref(pPage->pDbPage);
1073   }
1074 }
1075 
1076 /*
1077 ** This routine is called when the reference count for a page
1078 ** reaches zero.  We need to unref the pParent pointer when that
1079 ** happens.
1080 */
1081 static void pageDestructor(DbPage *pData, int pageSize){
1082   MemPage *pPage;
1083   assert( (pageSize & 7)==0 );
1084   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1085   assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) );
1086   if( pPage->pParent ){
1087     MemPage *pParent = pPage->pParent;
1088     assert( pParent->pBt==pPage->pBt );
1089     pPage->pParent = 0;
1090     releasePage(pParent);
1091   }
1092   pPage->isInit = 0;
1093 }
1094 
1095 /*
1096 ** During a rollback, when the pager reloads information into the cache
1097 ** so that the cache is restored to its original state at the start of
1098 ** the transaction, for each page restored this routine is called.
1099 **
1100 ** This routine needs to reset the extra data section at the end of the
1101 ** page to agree with the restored data.
1102 */
1103 static void pageReinit(DbPage *pData, int pageSize){
1104   MemPage *pPage;
1105   assert( (pageSize & 7)==0 );
1106   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1107   if( pPage->isInit ){
1108     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1109     pPage->isInit = 0;
1110     sqlite3BtreeInitPage(pPage, pPage->pParent);
1111   }
1112 }
1113 
1114 /*
1115 ** Invoke the busy handler for a btree.
1116 */
1117 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1118   BtShared *pBt = (BtShared*)pArg;
1119   assert( pBt->db );
1120   assert( sqlite3_mutex_held(pBt->db->mutex) );
1121   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1122 }
1123 
1124 /*
1125 ** Open a database file.
1126 **
1127 ** zFilename is the name of the database file.  If zFilename is NULL
1128 ** a new database with a random name is created.  This randomly named
1129 ** database file will be deleted when sqlite3BtreeClose() is called.
1130 ** If zFilename is ":memory:" then an in-memory database is created
1131 ** that is automatically destroyed when it is closed.
1132 */
1133 int sqlite3BtreeOpen(
1134   const char *zFilename,  /* Name of the file containing the BTree database */
1135   sqlite3 *db,            /* Associated database handle */
1136   Btree **ppBtree,        /* Pointer to new Btree object written here */
1137   int flags,              /* Options */
1138   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1139 ){
1140   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
1141   BtShared *pBt = 0;      /* Shared part of btree structure */
1142   Btree *p;               /* Handle to return */
1143   int rc = SQLITE_OK;
1144   int nReserve;
1145   unsigned char zDbHeader[100];
1146 
1147   /* Set the variable isMemdb to true for an in-memory database, or
1148   ** false for a file-based database. This symbol is only required if
1149   ** either of the shared-data or autovacuum features are compiled
1150   ** into the library.
1151   */
1152 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1153   #ifdef SQLITE_OMIT_MEMORYDB
1154     const int isMemdb = 0;
1155   #else
1156     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1157   #endif
1158 #endif
1159 
1160   assert( db!=0 );
1161   assert( sqlite3_mutex_held(db->mutex) );
1162 
1163   pVfs = db->pVfs;
1164   p = sqlite3MallocZero(sizeof(Btree));
1165   if( !p ){
1166     return SQLITE_NOMEM;
1167   }
1168   p->inTrans = TRANS_NONE;
1169   p->db = db;
1170 
1171 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1172   /*
1173   ** If this Btree is a candidate for shared cache, try to find an
1174   ** existing BtShared object that we can share with
1175   */
1176   if( (flags & BTREE_PRIVATE)==0
1177    && isMemdb==0
1178    && (db->flags & SQLITE_Vtab)==0
1179    && zFilename && zFilename[0]
1180   ){
1181     if( sqlite3SharedCacheEnabled ){
1182       int nFullPathname = pVfs->mxPathname+1;
1183       char *zFullPathname = (char *)sqlite3_malloc(nFullPathname);
1184       sqlite3_mutex *mutexShared;
1185       p->sharable = 1;
1186       if( db ){
1187         db->flags |= SQLITE_SharedCache;
1188       }
1189       if( !zFullPathname ){
1190         sqlite3_free(p);
1191         return SQLITE_NOMEM;
1192       }
1193       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1194       mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1195       sqlite3_mutex_enter(mutexShared);
1196       for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){
1197         assert( pBt->nRef>0 );
1198         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1199                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1200           p->pBt = pBt;
1201           pBt->nRef++;
1202           break;
1203         }
1204       }
1205       sqlite3_mutex_leave(mutexShared);
1206       sqlite3_free(zFullPathname);
1207     }
1208 #ifdef SQLITE_DEBUG
1209     else{
1210       /* In debug mode, we mark all persistent databases as sharable
1211       ** even when they are not.  This exercises the locking code and
1212       ** gives more opportunity for asserts(sqlite3_mutex_held())
1213       ** statements to find locking problems.
1214       */
1215       p->sharable = 1;
1216     }
1217 #endif
1218   }
1219 #endif
1220   if( pBt==0 ){
1221     /*
1222     ** The following asserts make sure that structures used by the btree are
1223     ** the right size.  This is to guard against size changes that result
1224     ** when compiling on a different architecture.
1225     */
1226     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1227     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1228     assert( sizeof(u32)==4 );
1229     assert( sizeof(u16)==2 );
1230     assert( sizeof(Pgno)==4 );
1231 
1232     pBt = sqlite3MallocZero( sizeof(*pBt) );
1233     if( pBt==0 ){
1234       rc = SQLITE_NOMEM;
1235       goto btree_open_out;
1236     }
1237     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1238     pBt->busyHdr.pArg = pBt;
1239     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1240                           EXTRA_SIZE, flags, vfsFlags);
1241     if( rc==SQLITE_OK ){
1242       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1243     }
1244     if( rc!=SQLITE_OK ){
1245       goto btree_open_out;
1246     }
1247     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1248     p->pBt = pBt;
1249 
1250     sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1251     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1252     pBt->pCursor = 0;
1253     pBt->pPage1 = 0;
1254     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1255     pBt->pageSize = get2byte(&zDbHeader[16]);
1256     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1257          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1258       pBt->pageSize = 0;
1259       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1260       pBt->maxEmbedFrac = 64;   /* 25% */
1261       pBt->minEmbedFrac = 32;   /* 12.5% */
1262       pBt->minLeafFrac = 32;    /* 12.5% */
1263 #ifndef SQLITE_OMIT_AUTOVACUUM
1264       /* If the magic name ":memory:" will create an in-memory database, then
1265       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1266       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1267       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1268       ** regular file-name. In this case the auto-vacuum applies as per normal.
1269       */
1270       if( zFilename && !isMemdb ){
1271         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1272         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1273       }
1274 #endif
1275       nReserve = 0;
1276     }else{
1277       nReserve = zDbHeader[20];
1278       pBt->maxEmbedFrac = zDbHeader[21];
1279       pBt->minEmbedFrac = zDbHeader[22];
1280       pBt->minLeafFrac = zDbHeader[23];
1281       pBt->pageSizeFixed = 1;
1282 #ifndef SQLITE_OMIT_AUTOVACUUM
1283       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1284       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1285 #endif
1286     }
1287     pBt->usableSize = pBt->pageSize - nReserve;
1288     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1289     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1290 
1291 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1292     /* Add the new BtShared object to the linked list sharable BtShareds.
1293     */
1294     if( p->sharable ){
1295       sqlite3_mutex *mutexShared;
1296       pBt->nRef = 1;
1297       mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1298       if( SQLITE_THREADSAFE ){
1299         pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
1300         if( pBt->mutex==0 ){
1301           rc = SQLITE_NOMEM;
1302           db->mallocFailed = 0;
1303           goto btree_open_out;
1304         }
1305       }
1306       sqlite3_mutex_enter(mutexShared);
1307       pBt->pNext = sqlite3SharedCacheList;
1308       sqlite3SharedCacheList = pBt;
1309       sqlite3_mutex_leave(mutexShared);
1310     }
1311 #endif
1312   }
1313 
1314 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1315   /* If the new Btree uses a sharable pBtShared, then link the new
1316   ** Btree into the list of all sharable Btrees for the same connection.
1317   ** The list is kept in ascending order by pBt address.
1318   */
1319   if( p->sharable ){
1320     int i;
1321     Btree *pSib;
1322     for(i=0; i<db->nDb; i++){
1323       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1324         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1325         if( p->pBt<pSib->pBt ){
1326           p->pNext = pSib;
1327           p->pPrev = 0;
1328           pSib->pPrev = p;
1329         }else{
1330           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1331             pSib = pSib->pNext;
1332           }
1333           p->pNext = pSib->pNext;
1334           p->pPrev = pSib;
1335           if( p->pNext ){
1336             p->pNext->pPrev = p;
1337           }
1338           pSib->pNext = p;
1339         }
1340         break;
1341       }
1342     }
1343   }
1344 #endif
1345   *ppBtree = p;
1346 
1347 btree_open_out:
1348   if( rc!=SQLITE_OK ){
1349     if( pBt && pBt->pPager ){
1350       sqlite3PagerClose(pBt->pPager);
1351     }
1352     sqlite3_free(pBt);
1353     sqlite3_free(p);
1354     *ppBtree = 0;
1355   }
1356   return rc;
1357 }
1358 
1359 /*
1360 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1361 ** remove the BtShared structure from the sharing list.  Return
1362 ** true if the BtShared.nRef counter reaches zero and return
1363 ** false if it is still positive.
1364 */
1365 static int removeFromSharingList(BtShared *pBt){
1366 #ifndef SQLITE_OMIT_SHARED_CACHE
1367   sqlite3_mutex *pMaster;
1368   BtShared *pList;
1369   int removed = 0;
1370 
1371   assert( sqlite3_mutex_notheld(pBt->mutex) );
1372   pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1373   sqlite3_mutex_enter(pMaster);
1374   pBt->nRef--;
1375   if( pBt->nRef<=0 ){
1376     if( sqlite3SharedCacheList==pBt ){
1377       sqlite3SharedCacheList = pBt->pNext;
1378     }else{
1379       pList = sqlite3SharedCacheList;
1380       while( pList && pList->pNext!=pBt ){
1381         pList=pList->pNext;
1382       }
1383       if( pList ){
1384         pList->pNext = pBt->pNext;
1385       }
1386     }
1387     if( SQLITE_THREADSAFE ){
1388       sqlite3_mutex_free(pBt->mutex);
1389     }
1390     removed = 1;
1391   }
1392   sqlite3_mutex_leave(pMaster);
1393   return removed;
1394 #else
1395   return 1;
1396 #endif
1397 }
1398 
1399 /*
1400 ** Close an open database and invalidate all cursors.
1401 */
1402 int sqlite3BtreeClose(Btree *p){
1403   BtShared *pBt = p->pBt;
1404   BtCursor *pCur;
1405 
1406   /* Close all cursors opened via this handle.  */
1407   assert( sqlite3_mutex_held(p->db->mutex) );
1408   sqlite3BtreeEnter(p);
1409   pBt->db = p->db;
1410   pCur = pBt->pCursor;
1411   while( pCur ){
1412     BtCursor *pTmp = pCur;
1413     pCur = pCur->pNext;
1414     if( pTmp->pBtree==p ){
1415       sqlite3BtreeCloseCursor(pTmp);
1416     }
1417   }
1418 
1419   /* Rollback any active transaction and free the handle structure.
1420   ** The call to sqlite3BtreeRollback() drops any table-locks held by
1421   ** this handle.
1422   */
1423   sqlite3BtreeRollback(p);
1424   sqlite3BtreeLeave(p);
1425 
1426   /* If there are still other outstanding references to the shared-btree
1427   ** structure, return now. The remainder of this procedure cleans
1428   ** up the shared-btree.
1429   */
1430   assert( p->wantToLock==0 && p->locked==0 );
1431   if( !p->sharable || removeFromSharingList(pBt) ){
1432     /* The pBt is no longer on the sharing list, so we can access
1433     ** it without having to hold the mutex.
1434     **
1435     ** Clean out and delete the BtShared object.
1436     */
1437     assert( !pBt->pCursor );
1438     sqlite3PagerClose(pBt->pPager);
1439     if( pBt->xFreeSchema && pBt->pSchema ){
1440       pBt->xFreeSchema(pBt->pSchema);
1441     }
1442     sqlite3_free(pBt->pSchema);
1443     sqlite3_free(pBt->pTmpSpace);
1444     sqlite3_free(pBt);
1445   }
1446 
1447 #ifndef SQLITE_OMIT_SHARED_CACHE
1448   assert( p->wantToLock==0 );
1449   assert( p->locked==0 );
1450   if( p->pPrev ) p->pPrev->pNext = p->pNext;
1451   if( p->pNext ) p->pNext->pPrev = p->pPrev;
1452 #endif
1453 
1454   sqlite3_free(p);
1455   return SQLITE_OK;
1456 }
1457 
1458 /*
1459 ** Change the limit on the number of pages allowed in the cache.
1460 **
1461 ** The maximum number of cache pages is set to the absolute
1462 ** value of mxPage.  If mxPage is negative, the pager will
1463 ** operate asynchronously - it will not stop to do fsync()s
1464 ** to insure data is written to the disk surface before
1465 ** continuing.  Transactions still work if synchronous is off,
1466 ** and the database cannot be corrupted if this program
1467 ** crashes.  But if the operating system crashes or there is
1468 ** an abrupt power failure when synchronous is off, the database
1469 ** could be left in an inconsistent and unrecoverable state.
1470 ** Synchronous is on by default so database corruption is not
1471 ** normally a worry.
1472 */
1473 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1474   BtShared *pBt = p->pBt;
1475   assert( sqlite3_mutex_held(p->db->mutex) );
1476   sqlite3BtreeEnter(p);
1477   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1478   sqlite3BtreeLeave(p);
1479   return SQLITE_OK;
1480 }
1481 
1482 /*
1483 ** Change the way data is synced to disk in order to increase or decrease
1484 ** how well the database resists damage due to OS crashes and power
1485 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
1486 ** there is a high probability of damage)  Level 2 is the default.  There
1487 ** is a very low but non-zero probability of damage.  Level 3 reduces the
1488 ** probability of damage to near zero but with a write performance reduction.
1489 */
1490 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1491 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1492   BtShared *pBt = p->pBt;
1493   assert( sqlite3_mutex_held(p->db->mutex) );
1494   sqlite3BtreeEnter(p);
1495   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1496   sqlite3BtreeLeave(p);
1497   return SQLITE_OK;
1498 }
1499 #endif
1500 
1501 /*
1502 ** Return TRUE if the given btree is set to safety level 1.  In other
1503 ** words, return TRUE if no sync() occurs on the disk files.
1504 */
1505 int sqlite3BtreeSyncDisabled(Btree *p){
1506   BtShared *pBt = p->pBt;
1507   int rc;
1508   assert( sqlite3_mutex_held(p->db->mutex) );
1509   sqlite3BtreeEnter(p);
1510   assert( pBt && pBt->pPager );
1511   rc = sqlite3PagerNosync(pBt->pPager);
1512   sqlite3BtreeLeave(p);
1513   return rc;
1514 }
1515 
1516 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1517 /*
1518 ** Change the default pages size and the number of reserved bytes per page.
1519 **
1520 ** The page size must be a power of 2 between 512 and 65536.  If the page
1521 ** size supplied does not meet this constraint then the page size is not
1522 ** changed.
1523 **
1524 ** Page sizes are constrained to be a power of two so that the region
1525 ** of the database file used for locking (beginning at PENDING_BYTE,
1526 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1527 ** at the beginning of a page.
1528 **
1529 ** If parameter nReserve is less than zero, then the number of reserved
1530 ** bytes per page is left unchanged.
1531 */
1532 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1533   int rc = SQLITE_OK;
1534   BtShared *pBt = p->pBt;
1535   sqlite3BtreeEnter(p);
1536   if( pBt->pageSizeFixed ){
1537     sqlite3BtreeLeave(p);
1538     return SQLITE_READONLY;
1539   }
1540   if( nReserve<0 ){
1541     nReserve = pBt->pageSize - pBt->usableSize;
1542   }
1543   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1544         ((pageSize-1)&pageSize)==0 ){
1545     assert( (pageSize & 7)==0 );
1546     assert( !pBt->pPage1 && !pBt->pCursor );
1547     pBt->pageSize = pageSize;
1548     sqlite3_free(pBt->pTmpSpace);
1549     pBt->pTmpSpace = 0;
1550     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1551   }
1552   pBt->usableSize = pBt->pageSize - nReserve;
1553   sqlite3BtreeLeave(p);
1554   return rc;
1555 }
1556 
1557 /*
1558 ** Return the currently defined page size
1559 */
1560 int sqlite3BtreeGetPageSize(Btree *p){
1561   return p->pBt->pageSize;
1562 }
1563 int sqlite3BtreeGetReserve(Btree *p){
1564   int n;
1565   sqlite3BtreeEnter(p);
1566   n = p->pBt->pageSize - p->pBt->usableSize;
1567   sqlite3BtreeLeave(p);
1568   return n;
1569 }
1570 
1571 /*
1572 ** Set the maximum page count for a database if mxPage is positive.
1573 ** No changes are made if mxPage is 0 or negative.
1574 ** Regardless of the value of mxPage, return the maximum page count.
1575 */
1576 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1577   int n;
1578   sqlite3BtreeEnter(p);
1579   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1580   sqlite3BtreeLeave(p);
1581   return n;
1582 }
1583 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1584 
1585 /*
1586 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1587 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1588 ** is disabled. The default value for the auto-vacuum property is
1589 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1590 */
1591 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1592 #ifdef SQLITE_OMIT_AUTOVACUUM
1593   return SQLITE_READONLY;
1594 #else
1595   BtShared *pBt = p->pBt;
1596   int rc = SQLITE_OK;
1597   int av = (autoVacuum?1:0);
1598 
1599   sqlite3BtreeEnter(p);
1600   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1601     rc = SQLITE_READONLY;
1602   }else{
1603     pBt->autoVacuum = av;
1604   }
1605   sqlite3BtreeLeave(p);
1606   return rc;
1607 #endif
1608 }
1609 
1610 /*
1611 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1612 ** enabled 1 is returned. Otherwise 0.
1613 */
1614 int sqlite3BtreeGetAutoVacuum(Btree *p){
1615 #ifdef SQLITE_OMIT_AUTOVACUUM
1616   return BTREE_AUTOVACUUM_NONE;
1617 #else
1618   int rc;
1619   sqlite3BtreeEnter(p);
1620   rc = (
1621     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1622     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1623     BTREE_AUTOVACUUM_INCR
1624   );
1625   sqlite3BtreeLeave(p);
1626   return rc;
1627 #endif
1628 }
1629 
1630 
1631 /*
1632 ** Get a reference to pPage1 of the database file.  This will
1633 ** also acquire a readlock on that file.
1634 **
1635 ** SQLITE_OK is returned on success.  If the file is not a
1636 ** well-formed database file, then SQLITE_CORRUPT is returned.
1637 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
1638 ** is returned if we run out of memory.
1639 */
1640 static int lockBtree(BtShared *pBt){
1641   int rc;
1642   MemPage *pPage1;
1643 
1644   assert( sqlite3_mutex_held(pBt->mutex) );
1645   if( pBt->pPage1 ) return SQLITE_OK;
1646   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1647   if( rc!=SQLITE_OK ) return rc;
1648 
1649   /* Do some checking to help insure the file we opened really is
1650   ** a valid database file.
1651   */
1652   rc = SQLITE_NOTADB;
1653   if( sqlite3PagerPagecount(pBt->pPager)>0 ){
1654     int pageSize;
1655     int usableSize;
1656     u8 *page1 = pPage1->aData;
1657     if( memcmp(page1, zMagicHeader, 16)!=0 ){
1658       goto page1_init_failed;
1659     }
1660     if( page1[18]>1 ){
1661       pBt->readOnly = 1;
1662     }
1663     if( page1[19]>1 ){
1664       goto page1_init_failed;
1665     }
1666     pageSize = get2byte(&page1[16]);
1667     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1668         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1669     ){
1670       goto page1_init_failed;
1671     }
1672     assert( (pageSize & 7)==0 );
1673     usableSize = pageSize - page1[20];
1674     if( pageSize!=pBt->pageSize ){
1675       /* After reading the first page of the database assuming a page size
1676       ** of BtShared.pageSize, we have discovered that the page-size is
1677       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1678       ** zero and return SQLITE_OK. The caller will call this function
1679       ** again with the correct page-size.
1680       */
1681       releasePage(pPage1);
1682       pBt->usableSize = usableSize;
1683       pBt->pageSize = pageSize;
1684       sqlite3_free(pBt->pTmpSpace);
1685       pBt->pTmpSpace = 0;
1686       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1687       return SQLITE_OK;
1688     }
1689     if( usableSize<500 ){
1690       goto page1_init_failed;
1691     }
1692     pBt->pageSize = pageSize;
1693     pBt->usableSize = usableSize;
1694     pBt->maxEmbedFrac = page1[21];
1695     pBt->minEmbedFrac = page1[22];
1696     pBt->minLeafFrac = page1[23];
1697 #ifndef SQLITE_OMIT_AUTOVACUUM
1698     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1699     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1700 #endif
1701   }
1702 
1703   /* maxLocal is the maximum amount of payload to store locally for
1704   ** a cell.  Make sure it is small enough so that at least minFanout
1705   ** cells can will fit on one page.  We assume a 10-byte page header.
1706   ** Besides the payload, the cell must store:
1707   **     2-byte pointer to the cell
1708   **     4-byte child pointer
1709   **     9-byte nKey value
1710   **     4-byte nData value
1711   **     4-byte overflow page pointer
1712   ** So a cell consists of a 2-byte poiner, a header which is as much as
1713   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1714   ** page pointer.
1715   */
1716   pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1717   pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1718   pBt->maxLeaf = pBt->usableSize - 35;
1719   pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1720   if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1721     goto page1_init_failed;
1722   }
1723   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1724   pBt->pPage1 = pPage1;
1725   return SQLITE_OK;
1726 
1727 page1_init_failed:
1728   releasePage(pPage1);
1729   pBt->pPage1 = 0;
1730   return rc;
1731 }
1732 
1733 /*
1734 ** This routine works like lockBtree() except that it also invokes the
1735 ** busy callback if there is lock contention.
1736 */
1737 static int lockBtreeWithRetry(Btree *pRef){
1738   int rc = SQLITE_OK;
1739 
1740   assert( sqlite3BtreeHoldsMutex(pRef) );
1741   if( pRef->inTrans==TRANS_NONE ){
1742     u8 inTransaction = pRef->pBt->inTransaction;
1743     btreeIntegrity(pRef);
1744     rc = sqlite3BtreeBeginTrans(pRef, 0);
1745     pRef->pBt->inTransaction = inTransaction;
1746     pRef->inTrans = TRANS_NONE;
1747     if( rc==SQLITE_OK ){
1748       pRef->pBt->nTransaction--;
1749     }
1750     btreeIntegrity(pRef);
1751   }
1752   return rc;
1753 }
1754 
1755 
1756 /*
1757 ** If there are no outstanding cursors and we are not in the middle
1758 ** of a transaction but there is a read lock on the database, then
1759 ** this routine unrefs the first page of the database file which
1760 ** has the effect of releasing the read lock.
1761 **
1762 ** If there are any outstanding cursors, this routine is a no-op.
1763 **
1764 ** If there is a transaction in progress, this routine is a no-op.
1765 */
1766 static void unlockBtreeIfUnused(BtShared *pBt){
1767   assert( sqlite3_mutex_held(pBt->mutex) );
1768   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1769     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1770       assert( pBt->pPage1->aData );
1771 #if 0
1772       if( pBt->pPage1->aData==0 ){
1773         MemPage *pPage = pBt->pPage1;
1774         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1775         pPage->pBt = pBt;
1776         pPage->pgno = 1;
1777       }
1778 #endif
1779       releasePage(pBt->pPage1);
1780     }
1781     pBt->pPage1 = 0;
1782     pBt->inStmt = 0;
1783   }
1784 }
1785 
1786 /*
1787 ** Create a new database by initializing the first page of the
1788 ** file.
1789 */
1790 static int newDatabase(BtShared *pBt){
1791   MemPage *pP1;
1792   unsigned char *data;
1793   int rc;
1794 
1795   assert( sqlite3_mutex_held(pBt->mutex) );
1796   if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK;
1797   pP1 = pBt->pPage1;
1798   assert( pP1!=0 );
1799   data = pP1->aData;
1800   rc = sqlite3PagerWrite(pP1->pDbPage);
1801   if( rc ) return rc;
1802   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1803   assert( sizeof(zMagicHeader)==16 );
1804   put2byte(&data[16], pBt->pageSize);
1805   data[18] = 1;
1806   data[19] = 1;
1807   data[20] = pBt->pageSize - pBt->usableSize;
1808   data[21] = pBt->maxEmbedFrac;
1809   data[22] = pBt->minEmbedFrac;
1810   data[23] = pBt->minLeafFrac;
1811   memset(&data[24], 0, 100-24);
1812   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1813   pBt->pageSizeFixed = 1;
1814 #ifndef SQLITE_OMIT_AUTOVACUUM
1815   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1816   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1817   put4byte(&data[36 + 4*4], pBt->autoVacuum);
1818   put4byte(&data[36 + 7*4], pBt->incrVacuum);
1819 #endif
1820   return SQLITE_OK;
1821 }
1822 
1823 /*
1824 ** Attempt to start a new transaction. A write-transaction
1825 ** is started if the second argument is nonzero, otherwise a read-
1826 ** transaction.  If the second argument is 2 or more and exclusive
1827 ** transaction is started, meaning that no other process is allowed
1828 ** to access the database.  A preexisting transaction may not be
1829 ** upgraded to exclusive by calling this routine a second time - the
1830 ** exclusivity flag only works for a new transaction.
1831 **
1832 ** A write-transaction must be started before attempting any
1833 ** changes to the database.  None of the following routines
1834 ** will work unless a transaction is started first:
1835 **
1836 **      sqlite3BtreeCreateTable()
1837 **      sqlite3BtreeCreateIndex()
1838 **      sqlite3BtreeClearTable()
1839 **      sqlite3BtreeDropTable()
1840 **      sqlite3BtreeInsert()
1841 **      sqlite3BtreeDelete()
1842 **      sqlite3BtreeUpdateMeta()
1843 **
1844 ** If an initial attempt to acquire the lock fails because of lock contention
1845 ** and the database was previously unlocked, then invoke the busy handler
1846 ** if there is one.  But if there was previously a read-lock, do not
1847 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
1848 ** returned when there is already a read-lock in order to avoid a deadlock.
1849 **
1850 ** Suppose there are two processes A and B.  A has a read lock and B has
1851 ** a reserved lock.  B tries to promote to exclusive but is blocked because
1852 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
1853 ** One or the other of the two processes must give way or there can be
1854 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
1855 ** when A already has a read lock, we encourage A to give up and let B
1856 ** proceed.
1857 */
1858 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1859   BtShared *pBt = p->pBt;
1860   int rc = SQLITE_OK;
1861 
1862   sqlite3BtreeEnter(p);
1863   pBt->db = p->db;
1864   btreeIntegrity(p);
1865 
1866   /* If the btree is already in a write-transaction, or it
1867   ** is already in a read-transaction and a read-transaction
1868   ** is requested, this is a no-op.
1869   */
1870   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1871     goto trans_begun;
1872   }
1873 
1874   /* Write transactions are not possible on a read-only database */
1875   if( pBt->readOnly && wrflag ){
1876     rc = SQLITE_READONLY;
1877     goto trans_begun;
1878   }
1879 
1880   /* If another database handle has already opened a write transaction
1881   ** on this shared-btree structure and a second write transaction is
1882   ** requested, return SQLITE_BUSY.
1883   */
1884   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1885     rc = SQLITE_BUSY;
1886     goto trans_begun;
1887   }
1888 
1889 #ifndef SQLITE_OMIT_SHARED_CACHE
1890   if( wrflag>1 ){
1891     BtLock *pIter;
1892     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1893       if( pIter->pBtree!=p ){
1894         rc = SQLITE_BUSY;
1895         goto trans_begun;
1896       }
1897     }
1898   }
1899 #endif
1900 
1901   do {
1902     while( rc==SQLITE_OK && pBt->pPage1==0 ){
1903       rc = lockBtree(pBt);
1904     }
1905 
1906     if( rc==SQLITE_OK && wrflag ){
1907       if( pBt->readOnly ){
1908         rc = SQLITE_READONLY;
1909       }else{
1910         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1911         if( rc==SQLITE_OK ){
1912           rc = newDatabase(pBt);
1913         }
1914       }
1915     }
1916 
1917     if( rc==SQLITE_OK ){
1918       if( wrflag ) pBt->inStmt = 0;
1919     }else{
1920       unlockBtreeIfUnused(pBt);
1921     }
1922   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1923           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
1924 
1925   if( rc==SQLITE_OK ){
1926     if( p->inTrans==TRANS_NONE ){
1927       pBt->nTransaction++;
1928     }
1929     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1930     if( p->inTrans>pBt->inTransaction ){
1931       pBt->inTransaction = p->inTrans;
1932     }
1933 #ifndef SQLITE_OMIT_SHARED_CACHE
1934     if( wrflag>1 ){
1935       assert( !pBt->pExclusive );
1936       pBt->pExclusive = p;
1937     }
1938 #endif
1939   }
1940 
1941 
1942 trans_begun:
1943   btreeIntegrity(p);
1944   sqlite3BtreeLeave(p);
1945   return rc;
1946 }
1947 
1948 #ifndef SQLITE_OMIT_AUTOVACUUM
1949 
1950 /*
1951 ** Set the pointer-map entries for all children of page pPage. Also, if
1952 ** pPage contains cells that point to overflow pages, set the pointer
1953 ** map entries for the overflow pages as well.
1954 */
1955 static int setChildPtrmaps(MemPage *pPage){
1956   int i;                             /* Counter variable */
1957   int nCell;                         /* Number of cells in page pPage */
1958   int rc;                            /* Return code */
1959   BtShared *pBt = pPage->pBt;
1960   int isInitOrig = pPage->isInit;
1961   Pgno pgno = pPage->pgno;
1962 
1963   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1964   rc = sqlite3BtreeInitPage(pPage, pPage->pParent);
1965   if( rc!=SQLITE_OK ){
1966     goto set_child_ptrmaps_out;
1967   }
1968   nCell = pPage->nCell;
1969 
1970   for(i=0; i<nCell; i++){
1971     u8 *pCell = findCell(pPage, i);
1972 
1973     rc = ptrmapPutOvflPtr(pPage, pCell);
1974     if( rc!=SQLITE_OK ){
1975       goto set_child_ptrmaps_out;
1976     }
1977 
1978     if( !pPage->leaf ){
1979       Pgno childPgno = get4byte(pCell);
1980       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1981       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1982     }
1983   }
1984 
1985   if( !pPage->leaf ){
1986     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1987     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1988   }
1989 
1990 set_child_ptrmaps_out:
1991   pPage->isInit = isInitOrig;
1992   return rc;
1993 }
1994 
1995 /*
1996 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1997 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
1998 ** iTo. Parameter eType describes the type of pointer to be modified, as
1999 ** follows:
2000 **
2001 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2002 **                   page of pPage.
2003 **
2004 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2005 **                   page pointed to by one of the cells on pPage.
2006 **
2007 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2008 **                   overflow page in the list.
2009 */
2010 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2011   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2012   if( eType==PTRMAP_OVERFLOW2 ){
2013     /* The pointer is always the first 4 bytes of the page in this case.  */
2014     if( get4byte(pPage->aData)!=iFrom ){
2015       return SQLITE_CORRUPT_BKPT;
2016     }
2017     put4byte(pPage->aData, iTo);
2018   }else{
2019     int isInitOrig = pPage->isInit;
2020     int i;
2021     int nCell;
2022 
2023     sqlite3BtreeInitPage(pPage, 0);
2024     nCell = pPage->nCell;
2025 
2026     for(i=0; i<nCell; i++){
2027       u8 *pCell = findCell(pPage, i);
2028       if( eType==PTRMAP_OVERFLOW1 ){
2029         CellInfo info;
2030         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
2031         if( info.iOverflow ){
2032           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2033             put4byte(&pCell[info.iOverflow], iTo);
2034             break;
2035           }
2036         }
2037       }else{
2038         if( get4byte(pCell)==iFrom ){
2039           put4byte(pCell, iTo);
2040           break;
2041         }
2042       }
2043     }
2044 
2045     if( i==nCell ){
2046       if( eType!=PTRMAP_BTREE ||
2047           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2048         return SQLITE_CORRUPT_BKPT;
2049       }
2050       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2051     }
2052 
2053     pPage->isInit = isInitOrig;
2054   }
2055   return SQLITE_OK;
2056 }
2057 
2058 
2059 /*
2060 ** Move the open database page pDbPage to location iFreePage in the
2061 ** database. The pDbPage reference remains valid.
2062 */
2063 static int relocatePage(
2064   BtShared *pBt,           /* Btree */
2065   MemPage *pDbPage,        /* Open page to move */
2066   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2067   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2068   Pgno iFreePage           /* The location to move pDbPage to */
2069 ){
2070   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2071   Pgno iDbPage = pDbPage->pgno;
2072   Pager *pPager = pBt->pPager;
2073   int rc;
2074 
2075   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2076       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2077   assert( sqlite3_mutex_held(pBt->mutex) );
2078   assert( pDbPage->pBt==pBt );
2079 
2080   /* Move page iDbPage from its current location to page number iFreePage */
2081   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2082       iDbPage, iFreePage, iPtrPage, eType));
2083   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage);
2084   if( rc!=SQLITE_OK ){
2085     return rc;
2086   }
2087   pDbPage->pgno = iFreePage;
2088 
2089   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2090   ** that point to overflow pages. The pointer map entries for all these
2091   ** pages need to be changed.
2092   **
2093   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2094   ** pointer to a subsequent overflow page. If this is the case, then
2095   ** the pointer map needs to be updated for the subsequent overflow page.
2096   */
2097   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2098     rc = setChildPtrmaps(pDbPage);
2099     if( rc!=SQLITE_OK ){
2100       return rc;
2101     }
2102   }else{
2103     Pgno nextOvfl = get4byte(pDbPage->aData);
2104     if( nextOvfl!=0 ){
2105       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2106       if( rc!=SQLITE_OK ){
2107         return rc;
2108       }
2109     }
2110   }
2111 
2112   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2113   ** that it points at iFreePage. Also fix the pointer map entry for
2114   ** iPtrPage.
2115   */
2116   if( eType!=PTRMAP_ROOTPAGE ){
2117     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2118     if( rc!=SQLITE_OK ){
2119       return rc;
2120     }
2121     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2122     if( rc!=SQLITE_OK ){
2123       releasePage(pPtrPage);
2124       return rc;
2125     }
2126     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2127     releasePage(pPtrPage);
2128     if( rc==SQLITE_OK ){
2129       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2130     }
2131   }
2132   return rc;
2133 }
2134 
2135 /* Forward declaration required by incrVacuumStep(). */
2136 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2137 
2138 /*
2139 ** Perform a single step of an incremental-vacuum. If successful,
2140 ** return SQLITE_OK. If there is no work to do (and therefore no
2141 ** point in calling this function again), return SQLITE_DONE.
2142 **
2143 ** More specificly, this function attempts to re-organize the
2144 ** database so that the last page of the file currently in use
2145 ** is no longer in use.
2146 **
2147 ** If the nFin parameter is non-zero, the implementation assumes
2148 ** that the caller will keep calling incrVacuumStep() until
2149 ** it returns SQLITE_DONE or an error, and that nFin is the
2150 ** number of pages the database file will contain after this
2151 ** process is complete.
2152 */
2153 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2154   Pgno iLastPg;             /* Last page in the database */
2155   Pgno nFreeList;           /* Number of pages still on the free-list */
2156 
2157   assert( sqlite3_mutex_held(pBt->mutex) );
2158   iLastPg = pBt->nTrunc;
2159   if( iLastPg==0 ){
2160     iLastPg = sqlite3PagerPagecount(pBt->pPager);
2161   }
2162 
2163   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2164     int rc;
2165     u8 eType;
2166     Pgno iPtrPage;
2167 
2168     nFreeList = get4byte(&pBt->pPage1->aData[36]);
2169     if( nFreeList==0 || nFin==iLastPg ){
2170       return SQLITE_DONE;
2171     }
2172 
2173     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2174     if( rc!=SQLITE_OK ){
2175       return rc;
2176     }
2177     if( eType==PTRMAP_ROOTPAGE ){
2178       return SQLITE_CORRUPT_BKPT;
2179     }
2180 
2181     if( eType==PTRMAP_FREEPAGE ){
2182       if( nFin==0 ){
2183         /* Remove the page from the files free-list. This is not required
2184         ** if nFin is non-zero. In that case, the free-list will be
2185         ** truncated to zero after this function returns, so it doesn't
2186         ** matter if it still contains some garbage entries.
2187         */
2188         Pgno iFreePg;
2189         MemPage *pFreePg;
2190         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2191         if( rc!=SQLITE_OK ){
2192           return rc;
2193         }
2194         assert( iFreePg==iLastPg );
2195         releasePage(pFreePg);
2196       }
2197     } else {
2198       Pgno iFreePg;             /* Index of free page to move pLastPg to */
2199       MemPage *pLastPg;
2200 
2201       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2202       if( rc!=SQLITE_OK ){
2203         return rc;
2204       }
2205 
2206       /* If nFin is zero, this loop runs exactly once and page pLastPg
2207       ** is swapped with the first free page pulled off the free list.
2208       **
2209       ** On the other hand, if nFin is greater than zero, then keep
2210       ** looping until a free-page located within the first nFin pages
2211       ** of the file is found.
2212       */
2213       do {
2214         MemPage *pFreePg;
2215         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2216         if( rc!=SQLITE_OK ){
2217           releasePage(pLastPg);
2218           return rc;
2219         }
2220         releasePage(pFreePg);
2221       }while( nFin!=0 && iFreePg>nFin );
2222       assert( iFreePg<iLastPg );
2223 
2224       rc = sqlite3PagerWrite(pLastPg->pDbPage);
2225       if( rc==SQLITE_OK ){
2226         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg);
2227       }
2228       releasePage(pLastPg);
2229       if( rc!=SQLITE_OK ){
2230         return rc;
2231       }
2232     }
2233   }
2234 
2235   pBt->nTrunc = iLastPg - 1;
2236   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2237     pBt->nTrunc--;
2238   }
2239   return SQLITE_OK;
2240 }
2241 
2242 /*
2243 ** A write-transaction must be opened before calling this function.
2244 ** It performs a single unit of work towards an incremental vacuum.
2245 **
2246 ** If the incremental vacuum is finished after this function has run,
2247 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
2248 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2249 */
2250 int sqlite3BtreeIncrVacuum(Btree *p){
2251   int rc;
2252   BtShared *pBt = p->pBt;
2253 
2254   sqlite3BtreeEnter(p);
2255   pBt->db = p->db;
2256   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2257   if( !pBt->autoVacuum ){
2258     rc = SQLITE_DONE;
2259   }else{
2260     invalidateAllOverflowCache(pBt);
2261     rc = incrVacuumStep(pBt, 0);
2262   }
2263   sqlite3BtreeLeave(p);
2264   return rc;
2265 }
2266 
2267 /*
2268 ** This routine is called prior to sqlite3PagerCommit when a transaction
2269 ** is commited for an auto-vacuum database.
2270 **
2271 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2272 ** the database file should be truncated to during the commit process.
2273 ** i.e. the database has been reorganized so that only the first *pnTrunc
2274 ** pages are in use.
2275 */
2276 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2277   int rc = SQLITE_OK;
2278   Pager *pPager = pBt->pPager;
2279 #ifndef NDEBUG
2280   int nRef = sqlite3PagerRefcount(pPager);
2281 #endif
2282 
2283   assert( sqlite3_mutex_held(pBt->mutex) );
2284   invalidateAllOverflowCache(pBt);
2285   assert(pBt->autoVacuum);
2286   if( !pBt->incrVacuum ){
2287     Pgno nFin = 0;
2288 
2289     if( pBt->nTrunc==0 ){
2290       Pgno nFree;
2291       Pgno nPtrmap;
2292       const int pgsz = pBt->pageSize;
2293       Pgno nOrig = sqlite3PagerPagecount(pBt->pPager);
2294 
2295       if( PTRMAP_ISPAGE(pBt, nOrig) ){
2296         return SQLITE_CORRUPT_BKPT;
2297       }
2298       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2299         nOrig--;
2300       }
2301       nFree = get4byte(&pBt->pPage1->aData[36]);
2302       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2303       nFin = nOrig - nFree - nPtrmap;
2304       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2305         nFin--;
2306       }
2307       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2308         nFin--;
2309       }
2310     }
2311 
2312     while( rc==SQLITE_OK ){
2313       rc = incrVacuumStep(pBt, nFin);
2314     }
2315     if( rc==SQLITE_DONE ){
2316       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2317       rc = SQLITE_OK;
2318       if( pBt->nTrunc ){
2319         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2320         put4byte(&pBt->pPage1->aData[32], 0);
2321         put4byte(&pBt->pPage1->aData[36], 0);
2322         pBt->nTrunc = nFin;
2323       }
2324     }
2325     if( rc!=SQLITE_OK ){
2326       sqlite3PagerRollback(pPager);
2327     }
2328   }
2329 
2330   if( rc==SQLITE_OK ){
2331     *pnTrunc = pBt->nTrunc;
2332     pBt->nTrunc = 0;
2333   }
2334   assert( nRef==sqlite3PagerRefcount(pPager) );
2335   return rc;
2336 }
2337 
2338 #endif
2339 
2340 /*
2341 ** This routine does the first phase of a two-phase commit.  This routine
2342 ** causes a rollback journal to be created (if it does not already exist)
2343 ** and populated with enough information so that if a power loss occurs
2344 ** the database can be restored to its original state by playing back
2345 ** the journal.  Then the contents of the journal are flushed out to
2346 ** the disk.  After the journal is safely on oxide, the changes to the
2347 ** database are written into the database file and flushed to oxide.
2348 ** At the end of this call, the rollback journal still exists on the
2349 ** disk and we are still holding all locks, so the transaction has not
2350 ** committed.  See sqlite3BtreeCommit() for the second phase of the
2351 ** commit process.
2352 **
2353 ** This call is a no-op if no write-transaction is currently active on pBt.
2354 **
2355 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2356 ** the name of a master journal file that should be written into the
2357 ** individual journal file, or is NULL, indicating no master journal file
2358 ** (single database transaction).
2359 **
2360 ** When this is called, the master journal should already have been
2361 ** created, populated with this journal pointer and synced to disk.
2362 **
2363 ** Once this is routine has returned, the only thing required to commit
2364 ** the write-transaction for this database file is to delete the journal.
2365 */
2366 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2367   int rc = SQLITE_OK;
2368   if( p->inTrans==TRANS_WRITE ){
2369     BtShared *pBt = p->pBt;
2370     Pgno nTrunc = 0;
2371     sqlite3BtreeEnter(p);
2372     pBt->db = p->db;
2373 #ifndef SQLITE_OMIT_AUTOVACUUM
2374     if( pBt->autoVacuum ){
2375       rc = autoVacuumCommit(pBt, &nTrunc);
2376       if( rc!=SQLITE_OK ){
2377         sqlite3BtreeLeave(p);
2378         return rc;
2379       }
2380     }
2381 #endif
2382     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
2383     sqlite3BtreeLeave(p);
2384   }
2385   return rc;
2386 }
2387 
2388 /*
2389 ** Commit the transaction currently in progress.
2390 **
2391 ** This routine implements the second phase of a 2-phase commit.  The
2392 ** sqlite3BtreeSync() routine does the first phase and should be invoked
2393 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
2394 ** all the work of writing information out to disk and flushing the
2395 ** contents so that they are written onto the disk platter.  All this
2396 ** routine has to do is delete or truncate the rollback journal
2397 ** (which causes the transaction to commit) and drop locks.
2398 **
2399 ** This will release the write lock on the database file.  If there
2400 ** are no active cursors, it also releases the read lock.
2401 */
2402 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2403   BtShared *pBt = p->pBt;
2404 
2405   sqlite3BtreeEnter(p);
2406   pBt->db = p->db;
2407   btreeIntegrity(p);
2408 
2409   /* If the handle has a write-transaction open, commit the shared-btrees
2410   ** transaction and set the shared state to TRANS_READ.
2411   */
2412   if( p->inTrans==TRANS_WRITE ){
2413     int rc;
2414     assert( pBt->inTransaction==TRANS_WRITE );
2415     assert( pBt->nTransaction>0 );
2416     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2417     if( rc!=SQLITE_OK ){
2418       sqlite3BtreeLeave(p);
2419       return rc;
2420     }
2421     pBt->inTransaction = TRANS_READ;
2422     pBt->inStmt = 0;
2423   }
2424   unlockAllTables(p);
2425 
2426   /* If the handle has any kind of transaction open, decrement the transaction
2427   ** count of the shared btree. If the transaction count reaches 0, set
2428   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2429   ** will unlock the pager.
2430   */
2431   if( p->inTrans!=TRANS_NONE ){
2432     pBt->nTransaction--;
2433     if( 0==pBt->nTransaction ){
2434       pBt->inTransaction = TRANS_NONE;
2435     }
2436   }
2437 
2438   /* Set the handles current transaction state to TRANS_NONE and unlock
2439   ** the pager if this call closed the only read or write transaction.
2440   */
2441   p->inTrans = TRANS_NONE;
2442   unlockBtreeIfUnused(pBt);
2443 
2444   btreeIntegrity(p);
2445   sqlite3BtreeLeave(p);
2446   return SQLITE_OK;
2447 }
2448 
2449 /*
2450 ** Do both phases of a commit.
2451 */
2452 int sqlite3BtreeCommit(Btree *p){
2453   int rc;
2454   sqlite3BtreeEnter(p);
2455   rc = sqlite3BtreeCommitPhaseOne(p, 0);
2456   if( rc==SQLITE_OK ){
2457     rc = sqlite3BtreeCommitPhaseTwo(p);
2458   }
2459   sqlite3BtreeLeave(p);
2460   return rc;
2461 }
2462 
2463 #ifndef NDEBUG
2464 /*
2465 ** Return the number of write-cursors open on this handle. This is for use
2466 ** in assert() expressions, so it is only compiled if NDEBUG is not
2467 ** defined.
2468 **
2469 ** For the purposes of this routine, a write-cursor is any cursor that
2470 ** is capable of writing to the databse.  That means the cursor was
2471 ** originally opened for writing and the cursor has not be disabled
2472 ** by having its state changed to CURSOR_FAULT.
2473 */
2474 static int countWriteCursors(BtShared *pBt){
2475   BtCursor *pCur;
2476   int r = 0;
2477   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2478     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2479   }
2480   return r;
2481 }
2482 #endif
2483 
2484 /*
2485 ** This routine sets the state to CURSOR_FAULT and the error
2486 ** code to errCode for every cursor on BtShared that pBtree
2487 ** references.
2488 **
2489 ** Every cursor is tripped, including cursors that belong
2490 ** to other database connections that happen to be sharing
2491 ** the cache with pBtree.
2492 **
2493 ** This routine gets called when a rollback occurs.
2494 ** All cursors using the same cache must be tripped
2495 ** to prevent them from trying to use the btree after
2496 ** the rollback.  The rollback may have deleted tables
2497 ** or moved root pages, so it is not sufficient to
2498 ** save the state of the cursor.  The cursor must be
2499 ** invalidated.
2500 */
2501 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2502   BtCursor *p;
2503   sqlite3BtreeEnter(pBtree);
2504   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2505     clearCursorPosition(p);
2506     p->eState = CURSOR_FAULT;
2507     p->skip = errCode;
2508   }
2509   sqlite3BtreeLeave(pBtree);
2510 }
2511 
2512 /*
2513 ** Rollback the transaction in progress.  All cursors will be
2514 ** invalided by this operation.  Any attempt to use a cursor
2515 ** that was open at the beginning of this operation will result
2516 ** in an error.
2517 **
2518 ** This will release the write lock on the database file.  If there
2519 ** are no active cursors, it also releases the read lock.
2520 */
2521 int sqlite3BtreeRollback(Btree *p){
2522   int rc;
2523   BtShared *pBt = p->pBt;
2524   MemPage *pPage1;
2525 
2526   sqlite3BtreeEnter(p);
2527   pBt->db = p->db;
2528   rc = saveAllCursors(pBt, 0, 0);
2529 #ifndef SQLITE_OMIT_SHARED_CACHE
2530   if( rc!=SQLITE_OK ){
2531     /* This is a horrible situation. An IO or malloc() error occured whilst
2532     ** trying to save cursor positions. If this is an automatic rollback (as
2533     ** the result of a constraint, malloc() failure or IO error) then
2534     ** the cache may be internally inconsistent (not contain valid trees) so
2535     ** we cannot simply return the error to the caller. Instead, abort
2536     ** all queries that may be using any of the cursors that failed to save.
2537     */
2538     sqlite3BtreeTripAllCursors(p, rc);
2539   }
2540 #endif
2541   btreeIntegrity(p);
2542   unlockAllTables(p);
2543 
2544   if( p->inTrans==TRANS_WRITE ){
2545     int rc2;
2546 
2547 #ifndef SQLITE_OMIT_AUTOVACUUM
2548     pBt->nTrunc = 0;
2549 #endif
2550 
2551     assert( TRANS_WRITE==pBt->inTransaction );
2552     rc2 = sqlite3PagerRollback(pBt->pPager);
2553     if( rc2!=SQLITE_OK ){
2554       rc = rc2;
2555     }
2556 
2557     /* The rollback may have destroyed the pPage1->aData value.  So
2558     ** call sqlite3BtreeGetPage() on page 1 again to make
2559     ** sure pPage1->aData is set correctly. */
2560     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2561       releasePage(pPage1);
2562     }
2563     assert( countWriteCursors(pBt)==0 );
2564     pBt->inTransaction = TRANS_READ;
2565   }
2566 
2567   if( p->inTrans!=TRANS_NONE ){
2568     assert( pBt->nTransaction>0 );
2569     pBt->nTransaction--;
2570     if( 0==pBt->nTransaction ){
2571       pBt->inTransaction = TRANS_NONE;
2572     }
2573   }
2574 
2575   p->inTrans = TRANS_NONE;
2576   pBt->inStmt = 0;
2577   unlockBtreeIfUnused(pBt);
2578 
2579   btreeIntegrity(p);
2580   sqlite3BtreeLeave(p);
2581   return rc;
2582 }
2583 
2584 /*
2585 ** Start a statement subtransaction.  The subtransaction can
2586 ** can be rolled back independently of the main transaction.
2587 ** You must start a transaction before starting a subtransaction.
2588 ** The subtransaction is ended automatically if the main transaction
2589 ** commits or rolls back.
2590 **
2591 ** Only one subtransaction may be active at a time.  It is an error to try
2592 ** to start a new subtransaction if another subtransaction is already active.
2593 **
2594 ** Statement subtransactions are used around individual SQL statements
2595 ** that are contained within a BEGIN...COMMIT block.  If a constraint
2596 ** error occurs within the statement, the effect of that one statement
2597 ** can be rolled back without having to rollback the entire transaction.
2598 */
2599 int sqlite3BtreeBeginStmt(Btree *p){
2600   int rc;
2601   BtShared *pBt = p->pBt;
2602   sqlite3BtreeEnter(p);
2603   pBt->db = p->db;
2604   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2605     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2606   }else{
2607     assert( pBt->inTransaction==TRANS_WRITE );
2608     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2609     pBt->inStmt = 1;
2610   }
2611   sqlite3BtreeLeave(p);
2612   return rc;
2613 }
2614 
2615 
2616 /*
2617 ** Commit the statment subtransaction currently in progress.  If no
2618 ** subtransaction is active, this is a no-op.
2619 */
2620 int sqlite3BtreeCommitStmt(Btree *p){
2621   int rc;
2622   BtShared *pBt = p->pBt;
2623   sqlite3BtreeEnter(p);
2624   pBt->db = p->db;
2625   if( pBt->inStmt && !pBt->readOnly ){
2626     rc = sqlite3PagerStmtCommit(pBt->pPager);
2627   }else{
2628     rc = SQLITE_OK;
2629   }
2630   pBt->inStmt = 0;
2631   sqlite3BtreeLeave(p);
2632   return rc;
2633 }
2634 
2635 /*
2636 ** Rollback the active statement subtransaction.  If no subtransaction
2637 ** is active this routine is a no-op.
2638 **
2639 ** All cursors will be invalidated by this operation.  Any attempt
2640 ** to use a cursor that was open at the beginning of this operation
2641 ** will result in an error.
2642 */
2643 int sqlite3BtreeRollbackStmt(Btree *p){
2644   int rc = SQLITE_OK;
2645   BtShared *pBt = p->pBt;
2646   sqlite3BtreeEnter(p);
2647   pBt->db = p->db;
2648   if( pBt->inStmt && !pBt->readOnly ){
2649     rc = sqlite3PagerStmtRollback(pBt->pPager);
2650     assert( countWriteCursors(pBt)==0 );
2651     pBt->inStmt = 0;
2652   }
2653   sqlite3BtreeLeave(p);
2654   return rc;
2655 }
2656 
2657 /*
2658 ** Create a new cursor for the BTree whose root is on the page
2659 ** iTable.  The act of acquiring a cursor gets a read lock on
2660 ** the database file.
2661 **
2662 ** If wrFlag==0, then the cursor can only be used for reading.
2663 ** If wrFlag==1, then the cursor can be used for reading or for
2664 ** writing if other conditions for writing are also met.  These
2665 ** are the conditions that must be met in order for writing to
2666 ** be allowed:
2667 **
2668 ** 1:  The cursor must have been opened with wrFlag==1
2669 **
2670 ** 2:  Other database connections that share the same pager cache
2671 **     but which are not in the READ_UNCOMMITTED state may not have
2672 **     cursors open with wrFlag==0 on the same table.  Otherwise
2673 **     the changes made by this write cursor would be visible to
2674 **     the read cursors in the other database connection.
2675 **
2676 ** 3:  The database must be writable (not on read-only media)
2677 **
2678 ** 4:  There must be an active transaction.
2679 **
2680 ** No checking is done to make sure that page iTable really is the
2681 ** root page of a b-tree.  If it is not, then the cursor acquired
2682 ** will not work correctly.
2683 */
2684 static int btreeCursor(
2685   Btree *p,                              /* The btree */
2686   int iTable,                            /* Root page of table to open */
2687   int wrFlag,                            /* 1 to write. 0 read-only */
2688   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
2689   BtCursor *pCur                         /* Space for new cursor */
2690 ){
2691   int rc;
2692   BtShared *pBt = p->pBt;
2693 
2694   assert( sqlite3BtreeHoldsMutex(p) );
2695   if( wrFlag ){
2696     if( pBt->readOnly ){
2697       return SQLITE_READONLY;
2698     }
2699     if( checkReadLocks(p, iTable, 0) ){
2700       return SQLITE_LOCKED;
2701     }
2702   }
2703 
2704   if( pBt->pPage1==0 ){
2705     rc = lockBtreeWithRetry(p);
2706     if( rc!=SQLITE_OK ){
2707       return rc;
2708     }
2709     if( pBt->readOnly && wrFlag ){
2710       return SQLITE_READONLY;
2711     }
2712   }
2713   pCur->pgnoRoot = (Pgno)iTable;
2714   if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){
2715     rc = SQLITE_EMPTY;
2716     goto create_cursor_exception;
2717   }
2718   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2719   if( rc!=SQLITE_OK ){
2720     goto create_cursor_exception;
2721   }
2722 
2723   /* Now that no other errors can occur, finish filling in the BtCursor
2724   ** variables, link the cursor into the BtShared list and set *ppCur (the
2725   ** output argument to this function).
2726   */
2727   pCur->pKeyInfo = pKeyInfo;
2728   pCur->pBtree = p;
2729   pCur->pBt = pBt;
2730   pCur->wrFlag = wrFlag;
2731   pCur->pNext = pBt->pCursor;
2732   if( pCur->pNext ){
2733     pCur->pNext->pPrev = pCur;
2734   }
2735   pBt->pCursor = pCur;
2736   pCur->eState = CURSOR_INVALID;
2737 
2738   return SQLITE_OK;
2739 
2740 create_cursor_exception:
2741   if( pCur ){
2742     releasePage(pCur->pPage);
2743   }
2744   unlockBtreeIfUnused(pBt);
2745   return rc;
2746 }
2747 int sqlite3BtreeCursor(
2748   Btree *p,                                   /* The btree */
2749   int iTable,                                 /* Root page of table to open */
2750   int wrFlag,                                 /* 1 to write. 0 read-only */
2751   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
2752   BtCursor *pCur                              /* Write new cursor here */
2753 ){
2754   int rc;
2755   sqlite3BtreeEnter(p);
2756   p->pBt->db = p->db;
2757   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
2758   sqlite3BtreeLeave(p);
2759   return rc;
2760 }
2761 int sqlite3BtreeCursorSize(){
2762   return sizeof(BtCursor);
2763 }
2764 
2765 
2766 
2767 /*
2768 ** Close a cursor.  The read lock on the database file is released
2769 ** when the last cursor is closed.
2770 */
2771 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2772   Btree *pBtree = pCur->pBtree;
2773   if( pBtree ){
2774     BtShared *pBt = pCur->pBt;
2775     sqlite3BtreeEnter(pBtree);
2776     pBt->db = pBtree->db;
2777     clearCursorPosition(pCur);
2778     if( pCur->pPrev ){
2779       pCur->pPrev->pNext = pCur->pNext;
2780     }else{
2781       pBt->pCursor = pCur->pNext;
2782     }
2783     if( pCur->pNext ){
2784       pCur->pNext->pPrev = pCur->pPrev;
2785     }
2786     releasePage(pCur->pPage);
2787     unlockBtreeIfUnused(pBt);
2788     invalidateOverflowCache(pCur);
2789     /* sqlite3_free(pCur); */
2790     sqlite3BtreeLeave(pBtree);
2791   }
2792   return SQLITE_OK;
2793 }
2794 
2795 /*
2796 ** Make a temporary cursor by filling in the fields of pTempCur.
2797 ** The temporary cursor is not on the cursor list for the Btree.
2798 */
2799 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2800   assert( cursorHoldsMutex(pCur) );
2801   memcpy(pTempCur, pCur, sizeof(*pCur));
2802   pTempCur->pNext = 0;
2803   pTempCur->pPrev = 0;
2804   if( pTempCur->pPage ){
2805     sqlite3PagerRef(pTempCur->pPage->pDbPage);
2806   }
2807 }
2808 
2809 /*
2810 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2811 ** function above.
2812 */
2813 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2814   assert( cursorHoldsMutex(pCur) );
2815   if( pCur->pPage ){
2816     sqlite3PagerUnref(pCur->pPage->pDbPage);
2817   }
2818 }
2819 
2820 /*
2821 ** Make sure the BtCursor* given in the argument has a valid
2822 ** BtCursor.info structure.  If it is not already valid, call
2823 ** sqlite3BtreeParseCell() to fill it in.
2824 **
2825 ** BtCursor.info is a cache of the information in the current cell.
2826 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2827 **
2828 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
2829 ** compiler to crash when getCellInfo() is implemented as a macro.
2830 ** But there is a measureable speed advantage to using the macro on gcc
2831 ** (when less compiler optimizations like -Os or -O0 are used and the
2832 ** compiler is not doing agressive inlining.)  So we use a real function
2833 ** for MSVC and a macro for everything else.  Ticket #2457.
2834 */
2835 #ifndef NDEBUG
2836   static void assertCellInfo(BtCursor *pCur){
2837     CellInfo info;
2838     memset(&info, 0, sizeof(info));
2839     sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info);
2840     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2841   }
2842 #else
2843   #define assertCellInfo(x)
2844 #endif
2845 #ifdef _MSC_VER
2846   /* Use a real function in MSVC to work around bugs in that compiler. */
2847   static void getCellInfo(BtCursor *pCur){
2848     if( pCur->info.nSize==0 ){
2849       sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);
2850       pCur->validNKey = 1;
2851     }else{
2852       assertCellInfo(pCur);
2853     }
2854   }
2855 #else /* if not _MSC_VER */
2856   /* Use a macro in all other compilers so that the function is inlined */
2857 #define getCellInfo(pCur)                                               \
2858   if( pCur->info.nSize==0 ){                                            \
2859     sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);         \
2860     pCur->validNKey = 1;                                                \
2861   }else{                                                                \
2862     assertCellInfo(pCur);                                               \
2863   }
2864 #endif /* _MSC_VER */
2865 
2866 /*
2867 ** Set *pSize to the size of the buffer needed to hold the value of
2868 ** the key for the current entry.  If the cursor is not pointing
2869 ** to a valid entry, *pSize is set to 0.
2870 **
2871 ** For a table with the INTKEY flag set, this routine returns the key
2872 ** itself, not the number of bytes in the key.
2873 */
2874 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2875   int rc;
2876 
2877   assert( cursorHoldsMutex(pCur) );
2878   rc = restoreOrClearCursorPosition(pCur);
2879   if( rc==SQLITE_OK ){
2880     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2881     if( pCur->eState==CURSOR_INVALID ){
2882       *pSize = 0;
2883     }else{
2884       getCellInfo(pCur);
2885       *pSize = pCur->info.nKey;
2886     }
2887   }
2888   return rc;
2889 }
2890 
2891 /*
2892 ** Set *pSize to the number of bytes of data in the entry the
2893 ** cursor currently points to.  Always return SQLITE_OK.
2894 ** Failure is not possible.  If the cursor is not currently
2895 ** pointing to an entry (which can happen, for example, if
2896 ** the database is empty) then *pSize is set to 0.
2897 */
2898 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2899   int rc;
2900 
2901   assert( cursorHoldsMutex(pCur) );
2902   rc = restoreOrClearCursorPosition(pCur);
2903   if( rc==SQLITE_OK ){
2904     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2905     if( pCur->eState==CURSOR_INVALID ){
2906       /* Not pointing at a valid entry - set *pSize to 0. */
2907       *pSize = 0;
2908     }else{
2909       getCellInfo(pCur);
2910       *pSize = pCur->info.nData;
2911     }
2912   }
2913   return rc;
2914 }
2915 
2916 /*
2917 ** Given the page number of an overflow page in the database (parameter
2918 ** ovfl), this function finds the page number of the next page in the
2919 ** linked list of overflow pages. If possible, it uses the auto-vacuum
2920 ** pointer-map data instead of reading the content of page ovfl to do so.
2921 **
2922 ** If an error occurs an SQLite error code is returned. Otherwise:
2923 **
2924 ** Unless pPgnoNext is NULL, the page number of the next overflow
2925 ** page in the linked list is written to *pPgnoNext. If page ovfl
2926 ** is the last page in its linked list, *pPgnoNext is set to zero.
2927 **
2928 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
2929 ** for page ovfl. The underlying pager page may have been requested
2930 ** with the noContent flag set, so the page data accessable via
2931 ** this handle may not be trusted.
2932 */
2933 static int getOverflowPage(
2934   BtShared *pBt,
2935   Pgno ovfl,                   /* Overflow page */
2936   MemPage **ppPage,            /* OUT: MemPage handle */
2937   Pgno *pPgnoNext              /* OUT: Next overflow page number */
2938 ){
2939   Pgno next = 0;
2940   int rc;
2941 
2942   assert( sqlite3_mutex_held(pBt->mutex) );
2943   /* One of these must not be NULL. Otherwise, why call this function? */
2944   assert(ppPage || pPgnoNext);
2945 
2946   /* If pPgnoNext is NULL, then this function is being called to obtain
2947   ** a MemPage* reference only. No page-data is required in this case.
2948   */
2949   if( !pPgnoNext ){
2950     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
2951   }
2952 
2953 #ifndef SQLITE_OMIT_AUTOVACUUM
2954   /* Try to find the next page in the overflow list using the
2955   ** autovacuum pointer-map pages. Guess that the next page in
2956   ** the overflow list is page number (ovfl+1). If that guess turns
2957   ** out to be wrong, fall back to loading the data of page
2958   ** number ovfl to determine the next page number.
2959   */
2960   if( pBt->autoVacuum ){
2961     Pgno pgno;
2962     Pgno iGuess = ovfl+1;
2963     u8 eType;
2964 
2965     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
2966       iGuess++;
2967     }
2968 
2969     if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){
2970       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
2971       if( rc!=SQLITE_OK ){
2972         return rc;
2973       }
2974       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
2975         next = iGuess;
2976       }
2977     }
2978   }
2979 #endif
2980 
2981   if( next==0 || ppPage ){
2982     MemPage *pPage = 0;
2983 
2984     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
2985     assert(rc==SQLITE_OK || pPage==0);
2986     if( next==0 && rc==SQLITE_OK ){
2987       next = get4byte(pPage->aData);
2988     }
2989 
2990     if( ppPage ){
2991       *ppPage = pPage;
2992     }else{
2993       releasePage(pPage);
2994     }
2995   }
2996   *pPgnoNext = next;
2997 
2998   return rc;
2999 }
3000 
3001 /*
3002 ** Copy data from a buffer to a page, or from a page to a buffer.
3003 **
3004 ** pPayload is a pointer to data stored on database page pDbPage.
3005 ** If argument eOp is false, then nByte bytes of data are copied
3006 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3007 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3008 ** of data are copied from the buffer pBuf to pPayload.
3009 **
3010 ** SQLITE_OK is returned on success, otherwise an error code.
3011 */
3012 static int copyPayload(
3013   void *pPayload,           /* Pointer to page data */
3014   void *pBuf,               /* Pointer to buffer */
3015   int nByte,                /* Number of bytes to copy */
3016   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3017   DbPage *pDbPage           /* Page containing pPayload */
3018 ){
3019   if( eOp ){
3020     /* Copy data from buffer to page (a write operation) */
3021     int rc = sqlite3PagerWrite(pDbPage);
3022     if( rc!=SQLITE_OK ){
3023       return rc;
3024     }
3025     memcpy(pPayload, pBuf, nByte);
3026   }else{
3027     /* Copy data from page to buffer (a read operation) */
3028     memcpy(pBuf, pPayload, nByte);
3029   }
3030   return SQLITE_OK;
3031 }
3032 
3033 /*
3034 ** This function is used to read or overwrite payload information
3035 ** for the entry that the pCur cursor is pointing to. If the eOp
3036 ** parameter is 0, this is a read operation (data copied into
3037 ** buffer pBuf). If it is non-zero, a write (data copied from
3038 ** buffer pBuf).
3039 **
3040 ** A total of "amt" bytes are read or written beginning at "offset".
3041 ** Data is read to or from the buffer pBuf.
3042 **
3043 ** This routine does not make a distinction between key and data.
3044 ** It just reads or writes bytes from the payload area.  Data might
3045 ** appear on the main page or be scattered out on multiple overflow
3046 ** pages.
3047 **
3048 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3049 ** cursor entry uses one or more overflow pages, this function
3050 ** allocates space for and lazily popluates the overflow page-list
3051 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3052 ** cache to make seeking to the supplied offset more efficient.
3053 **
3054 ** Once an overflow page-list cache has been allocated, it may be
3055 ** invalidated if some other cursor writes to the same table, or if
3056 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3057 ** mode, the following events may invalidate an overflow page-list cache.
3058 **
3059 **   * An incremental vacuum,
3060 **   * A commit in auto_vacuum="full" mode,
3061 **   * Creating a table (may require moving an overflow page).
3062 */
3063 static int accessPayload(
3064   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3065   int offset,          /* Begin reading this far into payload */
3066   int amt,             /* Read this many bytes */
3067   unsigned char *pBuf, /* Write the bytes into this buffer */
3068   int skipKey,         /* offset begins at data if this is true */
3069   int eOp              /* zero to read. non-zero to write. */
3070 ){
3071   unsigned char *aPayload;
3072   int rc = SQLITE_OK;
3073   u32 nKey;
3074   int iIdx = 0;
3075   MemPage *pPage = pCur->pPage;     /* Btree page of current cursor entry */
3076   BtShared *pBt;                   /* Btree this cursor belongs to */
3077 
3078   assert( pPage );
3079   assert( pCur->eState==CURSOR_VALID );
3080   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3081   assert( offset>=0 );
3082   assert( cursorHoldsMutex(pCur) );
3083 
3084   getCellInfo(pCur);
3085   aPayload = pCur->info.pCell + pCur->info.nHeader;
3086   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3087 
3088   if( skipKey ){
3089     offset += nKey;
3090   }
3091   if( offset+amt > nKey+pCur->info.nData ){
3092     /* Trying to read or write past the end of the data is an error */
3093     return SQLITE_ERROR;
3094   }
3095 
3096   /* Check if data must be read/written to/from the btree page itself. */
3097   if( offset<pCur->info.nLocal ){
3098     int a = amt;
3099     if( a+offset>pCur->info.nLocal ){
3100       a = pCur->info.nLocal - offset;
3101     }
3102     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3103     offset = 0;
3104     pBuf += a;
3105     amt -= a;
3106   }else{
3107     offset -= pCur->info.nLocal;
3108   }
3109 
3110   pBt = pCur->pBt;
3111   if( rc==SQLITE_OK && amt>0 ){
3112     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3113     Pgno nextPage;
3114 
3115     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3116 
3117 #ifndef SQLITE_OMIT_INCRBLOB
3118     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3119     ** has not been allocated, allocate it now. The array is sized at
3120     ** one entry for each overflow page in the overflow chain. The
3121     ** page number of the first overflow page is stored in aOverflow[0],
3122     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3123     ** (the cache is lazily populated).
3124     */
3125     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3126       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3127       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3128       if( nOvfl && !pCur->aOverflow ){
3129         rc = SQLITE_NOMEM;
3130       }
3131     }
3132 
3133     /* If the overflow page-list cache has been allocated and the
3134     ** entry for the first required overflow page is valid, skip
3135     ** directly to it.
3136     */
3137     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3138       iIdx = (offset/ovflSize);
3139       nextPage = pCur->aOverflow[iIdx];
3140       offset = (offset%ovflSize);
3141     }
3142 #endif
3143 
3144     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3145 
3146 #ifndef SQLITE_OMIT_INCRBLOB
3147       /* If required, populate the overflow page-list cache. */
3148       if( pCur->aOverflow ){
3149         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3150         pCur->aOverflow[iIdx] = nextPage;
3151       }
3152 #endif
3153 
3154       if( offset>=ovflSize ){
3155         /* The only reason to read this page is to obtain the page
3156         ** number for the next page in the overflow chain. The page
3157         ** data is not required. So first try to lookup the overflow
3158         ** page-list cache, if any, then fall back to the getOverflowPage()
3159         ** function.
3160         */
3161 #ifndef SQLITE_OMIT_INCRBLOB
3162         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3163           nextPage = pCur->aOverflow[iIdx+1];
3164         } else
3165 #endif
3166           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3167         offset -= ovflSize;
3168       }else{
3169         /* Need to read this page properly. It contains some of the
3170         ** range of data that is being read (eOp==0) or written (eOp!=0).
3171         */
3172         DbPage *pDbPage;
3173         int a = amt;
3174         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3175         if( rc==SQLITE_OK ){
3176           aPayload = sqlite3PagerGetData(pDbPage);
3177           nextPage = get4byte(aPayload);
3178           if( a + offset > ovflSize ){
3179             a = ovflSize - offset;
3180           }
3181           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3182           sqlite3PagerUnref(pDbPage);
3183           offset = 0;
3184           amt -= a;
3185           pBuf += a;
3186         }
3187       }
3188     }
3189   }
3190 
3191   if( rc==SQLITE_OK && amt>0 ){
3192     return SQLITE_CORRUPT_BKPT;
3193   }
3194   return rc;
3195 }
3196 
3197 /*
3198 ** Read part of the key associated with cursor pCur.  Exactly
3199 ** "amt" bytes will be transfered into pBuf[].  The transfer
3200 ** begins at "offset".
3201 **
3202 ** Return SQLITE_OK on success or an error code if anything goes
3203 ** wrong.  An error is returned if "offset+amt" is larger than
3204 ** the available payload.
3205 */
3206 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3207   int rc;
3208 
3209   assert( cursorHoldsMutex(pCur) );
3210   rc = restoreOrClearCursorPosition(pCur);
3211   if( rc==SQLITE_OK ){
3212     assert( pCur->eState==CURSOR_VALID );
3213     assert( pCur->pPage!=0 );
3214     if( pCur->pPage->intKey ){
3215       return SQLITE_CORRUPT_BKPT;
3216     }
3217     assert( pCur->pPage->intKey==0 );
3218     assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3219     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3220   }
3221   return rc;
3222 }
3223 
3224 /*
3225 ** Read part of the data associated with cursor pCur.  Exactly
3226 ** "amt" bytes will be transfered into pBuf[].  The transfer
3227 ** begins at "offset".
3228 **
3229 ** Return SQLITE_OK on success or an error code if anything goes
3230 ** wrong.  An error is returned if "offset+amt" is larger than
3231 ** the available payload.
3232 */
3233 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3234   int rc;
3235 
3236   assert( cursorHoldsMutex(pCur) );
3237   rc = restoreOrClearCursorPosition(pCur);
3238   if( rc==SQLITE_OK ){
3239     assert( pCur->eState==CURSOR_VALID );
3240     assert( pCur->pPage!=0 );
3241     assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3242     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3243   }
3244   return rc;
3245 }
3246 
3247 /*
3248 ** Return a pointer to payload information from the entry that the
3249 ** pCur cursor is pointing to.  The pointer is to the beginning of
3250 ** the key if skipKey==0 and it points to the beginning of data if
3251 ** skipKey==1.  The number of bytes of available key/data is written
3252 ** into *pAmt.  If *pAmt==0, then the value returned will not be
3253 ** a valid pointer.
3254 **
3255 ** This routine is an optimization.  It is common for the entire key
3256 ** and data to fit on the local page and for there to be no overflow
3257 ** pages.  When that is so, this routine can be used to access the
3258 ** key and data without making a copy.  If the key and/or data spills
3259 ** onto overflow pages, then accessPayload() must be used to reassembly
3260 ** the key/data and copy it into a preallocated buffer.
3261 **
3262 ** The pointer returned by this routine looks directly into the cached
3263 ** page of the database.  The data might change or move the next time
3264 ** any btree routine is called.
3265 */
3266 static const unsigned char *fetchPayload(
3267   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3268   int *pAmt,           /* Write the number of available bytes here */
3269   int skipKey          /* read beginning at data if this is true */
3270 ){
3271   unsigned char *aPayload;
3272   MemPage *pPage;
3273   u32 nKey;
3274   int nLocal;
3275 
3276   assert( pCur!=0 && pCur->pPage!=0 );
3277   assert( pCur->eState==CURSOR_VALID );
3278   assert( cursorHoldsMutex(pCur) );
3279   pPage = pCur->pPage;
3280   assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3281   getCellInfo(pCur);
3282   aPayload = pCur->info.pCell;
3283   aPayload += pCur->info.nHeader;
3284   if( pPage->intKey ){
3285     nKey = 0;
3286   }else{
3287     nKey = pCur->info.nKey;
3288   }
3289   if( skipKey ){
3290     aPayload += nKey;
3291     nLocal = pCur->info.nLocal - nKey;
3292   }else{
3293     nLocal = pCur->info.nLocal;
3294     if( nLocal>nKey ){
3295       nLocal = nKey;
3296     }
3297   }
3298   *pAmt = nLocal;
3299   return aPayload;
3300 }
3301 
3302 
3303 /*
3304 ** For the entry that cursor pCur is point to, return as
3305 ** many bytes of the key or data as are available on the local
3306 ** b-tree page.  Write the number of available bytes into *pAmt.
3307 **
3308 ** The pointer returned is ephemeral.  The key/data may move
3309 ** or be destroyed on the next call to any Btree routine,
3310 ** including calls from other threads against the same cache.
3311 ** Hence, a mutex on the BtShared should be held prior to calling
3312 ** this routine.
3313 **
3314 ** These routines is used to get quick access to key and data
3315 ** in the common case where no overflow pages are used.
3316 */
3317 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3318   assert( cursorHoldsMutex(pCur) );
3319   if( pCur->eState==CURSOR_VALID ){
3320     return (const void*)fetchPayload(pCur, pAmt, 0);
3321   }
3322   return 0;
3323 }
3324 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3325   assert( cursorHoldsMutex(pCur) );
3326   if( pCur->eState==CURSOR_VALID ){
3327     return (const void*)fetchPayload(pCur, pAmt, 1);
3328   }
3329   return 0;
3330 }
3331 
3332 
3333 /*
3334 ** Move the cursor down to a new child page.  The newPgno argument is the
3335 ** page number of the child page to move to.
3336 */
3337 static int moveToChild(BtCursor *pCur, u32 newPgno){
3338   int rc;
3339   MemPage *pNewPage;
3340   MemPage *pOldPage;
3341   BtShared *pBt = pCur->pBt;
3342 
3343   assert( cursorHoldsMutex(pCur) );
3344   assert( pCur->eState==CURSOR_VALID );
3345   rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
3346   if( rc ) return rc;
3347   pNewPage->idxParent = pCur->idx;
3348   pOldPage = pCur->pPage;
3349   pOldPage->idxShift = 0;
3350   releasePage(pOldPage);
3351   pCur->pPage = pNewPage;
3352   pCur->idx = 0;
3353   pCur->info.nSize = 0;
3354   pCur->validNKey = 0;
3355   if( pNewPage->nCell<1 ){
3356     return SQLITE_CORRUPT_BKPT;
3357   }
3358   return SQLITE_OK;
3359 }
3360 
3361 /*
3362 ** Return true if the page is the virtual root of its table.
3363 **
3364 ** The virtual root page is the root page for most tables.  But
3365 ** for the table rooted on page 1, sometime the real root page
3366 ** is empty except for the right-pointer.  In such cases the
3367 ** virtual root page is the page that the right-pointer of page
3368 ** 1 is pointing to.
3369 */
3370 int sqlite3BtreeIsRootPage(MemPage *pPage){
3371   MemPage *pParent;
3372 
3373   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3374   pParent = pPage->pParent;
3375   if( pParent==0 ) return 1;
3376   if( pParent->pgno>1 ) return 0;
3377   if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
3378   return 0;
3379 }
3380 
3381 /*
3382 ** Move the cursor up to the parent page.
3383 **
3384 ** pCur->idx is set to the cell index that contains the pointer
3385 ** to the page we are coming from.  If we are coming from the
3386 ** right-most child page then pCur->idx is set to one more than
3387 ** the largest cell index.
3388 */
3389 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3390   MemPage *pParent;
3391   MemPage *pPage;
3392   int idxParent;
3393 
3394   assert( cursorHoldsMutex(pCur) );
3395   assert( pCur->eState==CURSOR_VALID );
3396   pPage = pCur->pPage;
3397   assert( pPage!=0 );
3398   assert( !sqlite3BtreeIsRootPage(pPage) );
3399   pParent = pPage->pParent;
3400   assert( pParent!=0 );
3401   idxParent = pPage->idxParent;
3402   sqlite3PagerRef(pParent->pDbPage);
3403   releasePage(pPage);
3404   pCur->pPage = pParent;
3405   pCur->info.nSize = 0;
3406   pCur->validNKey = 0;
3407   assert( pParent->idxShift==0 );
3408   pCur->idx = idxParent;
3409 }
3410 
3411 /*
3412 ** Move the cursor to the root page
3413 */
3414 static int moveToRoot(BtCursor *pCur){
3415   MemPage *pRoot;
3416   int rc = SQLITE_OK;
3417   Btree *p = pCur->pBtree;
3418   BtShared *pBt = p->pBt;
3419 
3420   assert( cursorHoldsMutex(pCur) );
3421   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3422   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
3423   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
3424   if( pCur->eState>=CURSOR_REQUIRESEEK ){
3425     if( pCur->eState==CURSOR_FAULT ){
3426       return pCur->skip;
3427     }
3428     clearCursorPosition(pCur);
3429   }
3430   pRoot = pCur->pPage;
3431   if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3432     assert( pRoot->isInit );
3433   }else{
3434     if(
3435       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3436     ){
3437       pCur->eState = CURSOR_INVALID;
3438       return rc;
3439     }
3440     releasePage(pCur->pPage);
3441     pCur->pPage = pRoot;
3442   }
3443   pCur->idx = 0;
3444   pCur->info.nSize = 0;
3445   pCur->atLast = 0;
3446   pCur->validNKey = 0;
3447   if( pRoot->nCell==0 && !pRoot->leaf ){
3448     Pgno subpage;
3449     assert( pRoot->pgno==1 );
3450     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3451     assert( subpage>0 );
3452     pCur->eState = CURSOR_VALID;
3453     rc = moveToChild(pCur, subpage);
3454   }
3455   pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3456   return rc;
3457 }
3458 
3459 /*
3460 ** Move the cursor down to the left-most leaf entry beneath the
3461 ** entry to which it is currently pointing.
3462 **
3463 ** The left-most leaf is the one with the smallest key - the first
3464 ** in ascending order.
3465 */
3466 static int moveToLeftmost(BtCursor *pCur){
3467   Pgno pgno;
3468   int rc = SQLITE_OK;
3469   MemPage *pPage;
3470 
3471   assert( cursorHoldsMutex(pCur) );
3472   assert( pCur->eState==CURSOR_VALID );
3473   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3474     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3475     pgno = get4byte(findCell(pPage, pCur->idx));
3476     rc = moveToChild(pCur, pgno);
3477   }
3478   return rc;
3479 }
3480 
3481 /*
3482 ** Move the cursor down to the right-most leaf entry beneath the
3483 ** page to which it is currently pointing.  Notice the difference
3484 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
3485 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3486 ** finds the right-most entry beneath the *page*.
3487 **
3488 ** The right-most entry is the one with the largest key - the last
3489 ** key in ascending order.
3490 */
3491 static int moveToRightmost(BtCursor *pCur){
3492   Pgno pgno;
3493   int rc = SQLITE_OK;
3494   MemPage *pPage;
3495 
3496   assert( cursorHoldsMutex(pCur) );
3497   assert( pCur->eState==CURSOR_VALID );
3498   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3499     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3500     pCur->idx = pPage->nCell;
3501     rc = moveToChild(pCur, pgno);
3502   }
3503   if( rc==SQLITE_OK ){
3504     pCur->idx = pPage->nCell - 1;
3505     pCur->info.nSize = 0;
3506     pCur->validNKey = 0;
3507   }
3508   return SQLITE_OK;
3509 }
3510 
3511 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
3512 ** on success.  Set *pRes to 0 if the cursor actually points to something
3513 ** or set *pRes to 1 if the table is empty.
3514 */
3515 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3516   int rc;
3517 
3518   assert( cursorHoldsMutex(pCur) );
3519   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3520   rc = moveToRoot(pCur);
3521   if( rc==SQLITE_OK ){
3522     if( pCur->eState==CURSOR_INVALID ){
3523       assert( pCur->pPage->nCell==0 );
3524       *pRes = 1;
3525       rc = SQLITE_OK;
3526     }else{
3527       assert( pCur->pPage->nCell>0 );
3528       *pRes = 0;
3529       rc = moveToLeftmost(pCur);
3530     }
3531   }
3532   return rc;
3533 }
3534 
3535 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
3536 ** on success.  Set *pRes to 0 if the cursor actually points to something
3537 ** or set *pRes to 1 if the table is empty.
3538 */
3539 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3540   int rc;
3541 
3542   assert( cursorHoldsMutex(pCur) );
3543   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3544   rc = moveToRoot(pCur);
3545   if( rc==SQLITE_OK ){
3546     if( CURSOR_INVALID==pCur->eState ){
3547       assert( pCur->pPage->nCell==0 );
3548       *pRes = 1;
3549     }else{
3550       assert( pCur->eState==CURSOR_VALID );
3551       *pRes = 0;
3552       rc = moveToRightmost(pCur);
3553       getCellInfo(pCur);
3554       pCur->atLast = rc==SQLITE_OK;
3555     }
3556   }
3557   return rc;
3558 }
3559 
3560 /* Move the cursor so that it points to an entry near the key
3561 ** specified by pKey/nKey/pUnKey. Return a success code.
3562 **
3563 ** For INTKEY tables, only the nKey parameter is used.  pKey
3564 ** and pUnKey must be NULL.  For index tables, either pUnKey
3565 ** must point to a key that has already been unpacked, or else
3566 ** pKey/nKey describes a blob containing the key.
3567 **
3568 ** If an exact match is not found, then the cursor is always
3569 ** left pointing at a leaf page which would hold the entry if it
3570 ** were present.  The cursor might point to an entry that comes
3571 ** before or after the key.
3572 **
3573 ** The result of comparing the key with the entry to which the
3574 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
3575 ** this value is as follows:
3576 **
3577 **     *pRes<0      The cursor is left pointing at an entry that
3578 **                  is smaller than pKey or if the table is empty
3579 **                  and the cursor is therefore left point to nothing.
3580 **
3581 **     *pRes==0     The cursor is left pointing at an entry that
3582 **                  exactly matches pKey.
3583 **
3584 **     *pRes>0      The cursor is left pointing at an entry that
3585 **                  is larger than pKey.
3586 **
3587 */
3588 int sqlite3BtreeMoveto(
3589   BtCursor *pCur,        /* The cursor to be moved */
3590   const void *pKey,      /* The key content for indices.  Not used by tables */
3591   UnpackedRecord *pUnKey,/* Unpacked version of pKey */
3592   i64 nKey,              /* Size of pKey.  Or the key for tables */
3593   int biasRight,         /* If true, bias the search to the high end */
3594   int *pRes              /* Search result flag */
3595 ){
3596   int rc;
3597   char aSpace[200];
3598 
3599   assert( cursorHoldsMutex(pCur) );
3600   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3601 
3602   /* If the cursor is already positioned at the point we are trying
3603   ** to move to, then just return without doing any work */
3604   if( pCur->eState==CURSOR_VALID && pCur->validNKey && pCur->pPage->intKey ){
3605     if( pCur->info.nKey==nKey ){
3606       *pRes = 0;
3607       return SQLITE_OK;
3608     }
3609     if( pCur->atLast && pCur->info.nKey<nKey ){
3610       *pRes = -1;
3611       return SQLITE_OK;
3612     }
3613   }
3614 
3615 
3616   rc = moveToRoot(pCur);
3617   if( rc ){
3618     return rc;
3619   }
3620   assert( pCur->pPage );
3621   assert( pCur->pPage->isInit );
3622   if( pCur->eState==CURSOR_INVALID ){
3623     *pRes = -1;
3624     assert( pCur->pPage->nCell==0 );
3625     return SQLITE_OK;
3626   }
3627   if( pCur->pPage->intKey ){
3628     /* We are given an SQL table to search.  The key is the integer
3629     ** rowid contained in nKey.  pKey and pUnKey should both be NULL */
3630     assert( pUnKey==0 );
3631     assert( pKey==0 );
3632   }else if( pUnKey==0 ){
3633     /* We are to search an SQL index using a key encoded as a blob.
3634     ** The blob is found at pKey and is nKey bytes in length.  Unpack
3635     ** this key so that we can use it. */
3636     assert( pKey!=0 );
3637     pUnKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
3638                                    aSpace, sizeof(aSpace));
3639     if( pUnKey==0 ) return SQLITE_NOMEM;
3640   }else{
3641     /* We are to search an SQL index using a key that is already unpacked
3642     ** and handed to us in pUnKey. */
3643     assert( pKey==0 );
3644   }
3645   for(;;){
3646     int lwr, upr;
3647     Pgno chldPg;
3648     MemPage *pPage = pCur->pPage;
3649     int c = -1;  /* pRes return if table is empty must be -1 */
3650     lwr = 0;
3651     upr = pPage->nCell-1;
3652     if( !pPage->intKey && pUnKey==0 ){
3653       rc = SQLITE_CORRUPT_BKPT;
3654       goto moveto_finish;
3655     }
3656     if( biasRight ){
3657       pCur->idx = upr;
3658     }else{
3659       pCur->idx = (upr+lwr)/2;
3660     }
3661     if( lwr<=upr ) for(;;){
3662       void *pCellKey;
3663       i64 nCellKey;
3664       pCur->info.nSize = 0;
3665       pCur->validNKey = 1;
3666       if( pPage->intKey ){
3667         u8 *pCell;
3668         pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3669         if( pPage->hasData ){
3670           u32 dummy;
3671           pCell += getVarint32(pCell, &dummy);
3672         }
3673         getVarint(pCell, (u64*)&nCellKey);
3674         if( nCellKey==nKey ){
3675           c = 0;
3676         }else if( nCellKey<nKey ){
3677           c = -1;
3678         }else{
3679           assert( nCellKey>nKey );
3680           c = +1;
3681         }
3682       }else{
3683         int available;
3684         pCellKey = (void *)fetchPayload(pCur, &available, 0);
3685         nCellKey = pCur->info.nKey;
3686         if( available>=nCellKey ){
3687           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey);
3688         }else{
3689           pCellKey = sqlite3_malloc( nCellKey );
3690           if( pCellKey==0 ){
3691             rc = SQLITE_NOMEM;
3692             goto moveto_finish;
3693           }
3694           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3695           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey);
3696           sqlite3_free(pCellKey);
3697           if( rc ) goto moveto_finish;
3698         }
3699       }
3700       if( c==0 ){
3701         pCur->info.nKey = nCellKey;
3702         if( pPage->leafData && !pPage->leaf ){
3703           lwr = pCur->idx;
3704           upr = lwr - 1;
3705           break;
3706         }else{
3707           if( pRes ) *pRes = 0;
3708           rc = SQLITE_OK;
3709           goto moveto_finish;
3710         }
3711       }
3712       if( c<0 ){
3713         lwr = pCur->idx+1;
3714       }else{
3715         upr = pCur->idx-1;
3716       }
3717       if( lwr>upr ){
3718         pCur->info.nKey = nCellKey;
3719         break;
3720       }
3721       pCur->idx = (lwr+upr)/2;
3722     }
3723     assert( lwr==upr+1 );
3724     assert( pPage->isInit );
3725     if( pPage->leaf ){
3726       chldPg = 0;
3727     }else if( lwr>=pPage->nCell ){
3728       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3729     }else{
3730       chldPg = get4byte(findCell(pPage, lwr));
3731     }
3732     if( chldPg==0 ){
3733       assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3734       if( pRes ) *pRes = c;
3735       rc = SQLITE_OK;
3736       goto moveto_finish;
3737     }
3738     pCur->idx = lwr;
3739     pCur->info.nSize = 0;
3740     pCur->validNKey = 0;
3741     rc = moveToChild(pCur, chldPg);
3742     if( rc ) goto moveto_finish;
3743   }
3744 moveto_finish:
3745   if( pKey ){
3746     /* If we created our own unpacked key at the top of this
3747     ** procedure, then destroy that key before returning. */
3748     sqlite3VdbeDeleteUnpackedRecord(pUnKey);
3749   }
3750   return rc;
3751 }
3752 
3753 
3754 /*
3755 ** Return TRUE if the cursor is not pointing at an entry of the table.
3756 **
3757 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3758 ** past the last entry in the table or sqlite3BtreePrev() moves past
3759 ** the first entry.  TRUE is also returned if the table is empty.
3760 */
3761 int sqlite3BtreeEof(BtCursor *pCur){
3762   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3763   ** have been deleted? This API will need to change to return an error code
3764   ** as well as the boolean result value.
3765   */
3766   return (CURSOR_VALID!=pCur->eState);
3767 }
3768 
3769 /*
3770 ** Return the database connection handle for a cursor.
3771 */
3772 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3773   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3774   return pCur->pBtree->db;
3775 }
3776 
3777 /*
3778 ** Advance the cursor to the next entry in the database.  If
3779 ** successful then set *pRes=0.  If the cursor
3780 ** was already pointing to the last entry in the database before
3781 ** this routine was called, then set *pRes=1.
3782 */
3783 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3784   int rc;
3785   MemPage *pPage;
3786 
3787   assert( cursorHoldsMutex(pCur) );
3788   rc = restoreOrClearCursorPosition(pCur);
3789   if( rc!=SQLITE_OK ){
3790     return rc;
3791   }
3792   assert( pRes!=0 );
3793   pPage = pCur->pPage;
3794   if( CURSOR_INVALID==pCur->eState ){
3795     *pRes = 1;
3796     return SQLITE_OK;
3797   }
3798   if( pCur->skip>0 ){
3799     pCur->skip = 0;
3800     *pRes = 0;
3801     return SQLITE_OK;
3802   }
3803   pCur->skip = 0;
3804 
3805   assert( pPage->isInit );
3806   assert( pCur->idx<pPage->nCell );
3807 
3808   pCur->idx++;
3809   pCur->info.nSize = 0;
3810   pCur->validNKey = 0;
3811   if( pCur->idx>=pPage->nCell ){
3812     if( !pPage->leaf ){
3813       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3814       if( rc ) return rc;
3815       rc = moveToLeftmost(pCur);
3816       *pRes = 0;
3817       return rc;
3818     }
3819     do{
3820       if( sqlite3BtreeIsRootPage(pPage) ){
3821         *pRes = 1;
3822         pCur->eState = CURSOR_INVALID;
3823         return SQLITE_OK;
3824       }
3825       sqlite3BtreeMoveToParent(pCur);
3826       pPage = pCur->pPage;
3827     }while( pCur->idx>=pPage->nCell );
3828     *pRes = 0;
3829     if( pPage->leafData ){
3830       rc = sqlite3BtreeNext(pCur, pRes);
3831     }else{
3832       rc = SQLITE_OK;
3833     }
3834     return rc;
3835   }
3836   *pRes = 0;
3837   if( pPage->leaf ){
3838     return SQLITE_OK;
3839   }
3840   rc = moveToLeftmost(pCur);
3841   return rc;
3842 }
3843 
3844 
3845 /*
3846 ** Step the cursor to the back to the previous entry in the database.  If
3847 ** successful then set *pRes=0.  If the cursor
3848 ** was already pointing to the first entry in the database before
3849 ** this routine was called, then set *pRes=1.
3850 */
3851 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3852   int rc;
3853   Pgno pgno;
3854   MemPage *pPage;
3855 
3856   assert( cursorHoldsMutex(pCur) );
3857   rc = restoreOrClearCursorPosition(pCur);
3858   if( rc!=SQLITE_OK ){
3859     return rc;
3860   }
3861   pCur->atLast = 0;
3862   if( CURSOR_INVALID==pCur->eState ){
3863     *pRes = 1;
3864     return SQLITE_OK;
3865   }
3866   if( pCur->skip<0 ){
3867     pCur->skip = 0;
3868     *pRes = 0;
3869     return SQLITE_OK;
3870   }
3871   pCur->skip = 0;
3872 
3873   pPage = pCur->pPage;
3874   assert( pPage->isInit );
3875   assert( pCur->idx>=0 );
3876   if( !pPage->leaf ){
3877     pgno = get4byte( findCell(pPage, pCur->idx) );
3878     rc = moveToChild(pCur, pgno);
3879     if( rc ){
3880       return rc;
3881     }
3882     rc = moveToRightmost(pCur);
3883   }else{
3884     while( pCur->idx==0 ){
3885       if( sqlite3BtreeIsRootPage(pPage) ){
3886         pCur->eState = CURSOR_INVALID;
3887         *pRes = 1;
3888         return SQLITE_OK;
3889       }
3890       sqlite3BtreeMoveToParent(pCur);
3891       pPage = pCur->pPage;
3892     }
3893     pCur->idx--;
3894     pCur->info.nSize = 0;
3895     pCur->validNKey = 0;
3896     if( pPage->leafData && !pPage->leaf ){
3897       rc = sqlite3BtreePrevious(pCur, pRes);
3898     }else{
3899       rc = SQLITE_OK;
3900     }
3901   }
3902   *pRes = 0;
3903   return rc;
3904 }
3905 
3906 /*
3907 ** Allocate a new page from the database file.
3908 **
3909 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
3910 ** has already been called on the new page.)  The new page has also
3911 ** been referenced and the calling routine is responsible for calling
3912 ** sqlite3PagerUnref() on the new page when it is done.
3913 **
3914 ** SQLITE_OK is returned on success.  Any other return value indicates
3915 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
3916 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
3917 **
3918 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3919 ** locate a page close to the page number "nearby".  This can be used in an
3920 ** attempt to keep related pages close to each other in the database file,
3921 ** which in turn can make database access faster.
3922 **
3923 ** If the "exact" parameter is not 0, and the page-number nearby exists
3924 ** anywhere on the free-list, then it is guarenteed to be returned. This
3925 ** is only used by auto-vacuum databases when allocating a new table.
3926 */
3927 static int allocateBtreePage(
3928   BtShared *pBt,
3929   MemPage **ppPage,
3930   Pgno *pPgno,
3931   Pgno nearby,
3932   u8 exact
3933 ){
3934   MemPage *pPage1;
3935   int rc;
3936   int n;     /* Number of pages on the freelist */
3937   int k;     /* Number of leaves on the trunk of the freelist */
3938   MemPage *pTrunk = 0;
3939   MemPage *pPrevTrunk = 0;
3940 
3941   assert( sqlite3_mutex_held(pBt->mutex) );
3942   pPage1 = pBt->pPage1;
3943   n = get4byte(&pPage1->aData[36]);
3944   if( n>0 ){
3945     /* There are pages on the freelist.  Reuse one of those pages. */
3946     Pgno iTrunk;
3947     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3948 
3949     /* If the 'exact' parameter was true and a query of the pointer-map
3950     ** shows that the page 'nearby' is somewhere on the free-list, then
3951     ** the entire-list will be searched for that page.
3952     */
3953 #ifndef SQLITE_OMIT_AUTOVACUUM
3954     if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){
3955       u8 eType;
3956       assert( nearby>0 );
3957       assert( pBt->autoVacuum );
3958       rc = ptrmapGet(pBt, nearby, &eType, 0);
3959       if( rc ) return rc;
3960       if( eType==PTRMAP_FREEPAGE ){
3961         searchList = 1;
3962       }
3963       *pPgno = nearby;
3964     }
3965 #endif
3966 
3967     /* Decrement the free-list count by 1. Set iTrunk to the index of the
3968     ** first free-list trunk page. iPrevTrunk is initially 1.
3969     */
3970     rc = sqlite3PagerWrite(pPage1->pDbPage);
3971     if( rc ) return rc;
3972     put4byte(&pPage1->aData[36], n-1);
3973 
3974     /* The code within this loop is run only once if the 'searchList' variable
3975     ** is not true. Otherwise, it runs once for each trunk-page on the
3976     ** free-list until the page 'nearby' is located.
3977     */
3978     do {
3979       pPrevTrunk = pTrunk;
3980       if( pPrevTrunk ){
3981         iTrunk = get4byte(&pPrevTrunk->aData[0]);
3982       }else{
3983         iTrunk = get4byte(&pPage1->aData[32]);
3984       }
3985       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
3986       if( rc ){
3987         pTrunk = 0;
3988         goto end_allocate_page;
3989       }
3990 
3991       k = get4byte(&pTrunk->aData[4]);
3992       if( k==0 && !searchList ){
3993         /* The trunk has no leaves and the list is not being searched.
3994         ** So extract the trunk page itself and use it as the newly
3995         ** allocated page */
3996         assert( pPrevTrunk==0 );
3997         rc = sqlite3PagerWrite(pTrunk->pDbPage);
3998         if( rc ){
3999           goto end_allocate_page;
4000         }
4001         *pPgno = iTrunk;
4002         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4003         *ppPage = pTrunk;
4004         pTrunk = 0;
4005         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4006       }else if( k>pBt->usableSize/4 - 8 ){
4007         /* Value of k is out of range.  Database corruption */
4008         rc = SQLITE_CORRUPT_BKPT;
4009         goto end_allocate_page;
4010 #ifndef SQLITE_OMIT_AUTOVACUUM
4011       }else if( searchList && nearby==iTrunk ){
4012         /* The list is being searched and this trunk page is the page
4013         ** to allocate, regardless of whether it has leaves.
4014         */
4015         assert( *pPgno==iTrunk );
4016         *ppPage = pTrunk;
4017         searchList = 0;
4018         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4019         if( rc ){
4020           goto end_allocate_page;
4021         }
4022         if( k==0 ){
4023           if( !pPrevTrunk ){
4024             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4025           }else{
4026             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4027           }
4028         }else{
4029           /* The trunk page is required by the caller but it contains
4030           ** pointers to free-list leaves. The first leaf becomes a trunk
4031           ** page in this case.
4032           */
4033           MemPage *pNewTrunk;
4034           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4035           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4036           if( rc!=SQLITE_OK ){
4037             goto end_allocate_page;
4038           }
4039           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4040           if( rc!=SQLITE_OK ){
4041             releasePage(pNewTrunk);
4042             goto end_allocate_page;
4043           }
4044           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4045           put4byte(&pNewTrunk->aData[4], k-1);
4046           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4047           releasePage(pNewTrunk);
4048           if( !pPrevTrunk ){
4049             put4byte(&pPage1->aData[32], iNewTrunk);
4050           }else{
4051             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4052             if( rc ){
4053               goto end_allocate_page;
4054             }
4055             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4056           }
4057         }
4058         pTrunk = 0;
4059         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4060 #endif
4061       }else{
4062         /* Extract a leaf from the trunk */
4063         int closest;
4064         Pgno iPage;
4065         unsigned char *aData = pTrunk->aData;
4066         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4067         if( rc ){
4068           goto end_allocate_page;
4069         }
4070         if( nearby>0 ){
4071           int i, dist;
4072           closest = 0;
4073           dist = get4byte(&aData[8]) - nearby;
4074           if( dist<0 ) dist = -dist;
4075           for(i=1; i<k; i++){
4076             int d2 = get4byte(&aData[8+i*4]) - nearby;
4077             if( d2<0 ) d2 = -d2;
4078             if( d2<dist ){
4079               closest = i;
4080               dist = d2;
4081             }
4082           }
4083         }else{
4084           closest = 0;
4085         }
4086 
4087         iPage = get4byte(&aData[8+closest*4]);
4088         if( !searchList || iPage==nearby ){
4089           *pPgno = iPage;
4090           if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){
4091             /* Free page off the end of the file */
4092             return SQLITE_CORRUPT_BKPT;
4093           }
4094           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4095                  ": %d more free pages\n",
4096                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
4097           if( closest<k-1 ){
4098             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4099           }
4100           put4byte(&aData[4], k-1);
4101           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4102           if( rc==SQLITE_OK ){
4103             sqlite3PagerDontRollback((*ppPage)->pDbPage);
4104             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4105             if( rc!=SQLITE_OK ){
4106               releasePage(*ppPage);
4107             }
4108           }
4109           searchList = 0;
4110         }
4111       }
4112       releasePage(pPrevTrunk);
4113       pPrevTrunk = 0;
4114     }while( searchList );
4115   }else{
4116     /* There are no pages on the freelist, so create a new page at the
4117     ** end of the file */
4118     *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1;
4119 
4120 #ifndef SQLITE_OMIT_AUTOVACUUM
4121     if( pBt->nTrunc ){
4122       /* An incr-vacuum has already run within this transaction. So the
4123       ** page to allocate is not from the physical end of the file, but
4124       ** at pBt->nTrunc.
4125       */
4126       *pPgno = pBt->nTrunc+1;
4127       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4128         (*pPgno)++;
4129       }
4130     }
4131     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4132       /* If *pPgno refers to a pointer-map page, allocate two new pages
4133       ** at the end of the file instead of one. The first allocated page
4134       ** becomes a new pointer-map page, the second is used by the caller.
4135       */
4136       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4137       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4138       (*pPgno)++;
4139       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4140     }
4141     if( pBt->nTrunc ){
4142       pBt->nTrunc = *pPgno;
4143     }
4144 #endif
4145 
4146     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4147     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4148     if( rc ) return rc;
4149     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4150     if( rc!=SQLITE_OK ){
4151       releasePage(*ppPage);
4152     }
4153     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4154   }
4155 
4156   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4157 
4158 end_allocate_page:
4159   releasePage(pTrunk);
4160   releasePage(pPrevTrunk);
4161   return rc;
4162 }
4163 
4164 /*
4165 ** Add a page of the database file to the freelist.
4166 **
4167 ** sqlite3PagerUnref() is NOT called for pPage.
4168 */
4169 static int freePage(MemPage *pPage){
4170   BtShared *pBt = pPage->pBt;
4171   MemPage *pPage1 = pBt->pPage1;
4172   int rc, n, k;
4173 
4174   /* Prepare the page for freeing */
4175   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4176   assert( pPage->pgno>1 );
4177   pPage->isInit = 0;
4178   releasePage(pPage->pParent);
4179   pPage->pParent = 0;
4180 
4181   /* Increment the free page count on pPage1 */
4182   rc = sqlite3PagerWrite(pPage1->pDbPage);
4183   if( rc ) return rc;
4184   n = get4byte(&pPage1->aData[36]);
4185   put4byte(&pPage1->aData[36], n+1);
4186 
4187 #ifdef SQLITE_SECURE_DELETE
4188   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4189   ** always fully overwrite deleted information with zeros.
4190   */
4191   rc = sqlite3PagerWrite(pPage->pDbPage);
4192   if( rc ) return rc;
4193   memset(pPage->aData, 0, pPage->pBt->pageSize);
4194 #endif
4195 
4196 #ifndef SQLITE_OMIT_AUTOVACUUM
4197   /* If the database supports auto-vacuum, write an entry in the pointer-map
4198   ** to indicate that the page is free.
4199   */
4200   if( pBt->autoVacuum ){
4201     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4202     if( rc ) return rc;
4203   }
4204 #endif
4205 
4206   if( n==0 ){
4207     /* This is the first free page */
4208     rc = sqlite3PagerWrite(pPage->pDbPage);
4209     if( rc ) return rc;
4210     memset(pPage->aData, 0, 8);
4211     put4byte(&pPage1->aData[32], pPage->pgno);
4212     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4213   }else{
4214     /* Other free pages already exist.  Retrive the first trunk page
4215     ** of the freelist and find out how many leaves it has. */
4216     MemPage *pTrunk;
4217     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4218     if( rc ) return rc;
4219     k = get4byte(&pTrunk->aData[4]);
4220     if( k>=pBt->usableSize/4 - 8 ){
4221       /* The trunk is full.  Turn the page being freed into a new
4222       ** trunk page with no leaves. */
4223       rc = sqlite3PagerWrite(pPage->pDbPage);
4224       if( rc==SQLITE_OK ){
4225         put4byte(pPage->aData, pTrunk->pgno);
4226         put4byte(&pPage->aData[4], 0);
4227         put4byte(&pPage1->aData[32], pPage->pgno);
4228         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4229                 pPage->pgno, pTrunk->pgno));
4230       }
4231     }else if( k<0 ){
4232       rc = SQLITE_CORRUPT;
4233     }else{
4234       /* Add the newly freed page as a leaf on the current trunk */
4235       rc = sqlite3PagerWrite(pTrunk->pDbPage);
4236       if( rc==SQLITE_OK ){
4237         put4byte(&pTrunk->aData[4], k+1);
4238         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4239 #ifndef SQLITE_SECURE_DELETE
4240         sqlite3PagerDontWrite(pPage->pDbPage);
4241 #endif
4242       }
4243       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4244     }
4245     releasePage(pTrunk);
4246   }
4247   return rc;
4248 }
4249 
4250 /*
4251 ** Free any overflow pages associated with the given Cell.
4252 */
4253 static int clearCell(MemPage *pPage, unsigned char *pCell){
4254   BtShared *pBt = pPage->pBt;
4255   CellInfo info;
4256   Pgno ovflPgno;
4257   int rc;
4258   int nOvfl;
4259   int ovflPageSize;
4260 
4261   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4262   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4263   if( info.iOverflow==0 ){
4264     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
4265   }
4266   ovflPgno = get4byte(&pCell[info.iOverflow]);
4267   ovflPageSize = pBt->usableSize - 4;
4268   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4269   assert( ovflPgno==0 || nOvfl>0 );
4270   while( nOvfl-- ){
4271     MemPage *pOvfl;
4272     if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){
4273       return SQLITE_CORRUPT_BKPT;
4274     }
4275 
4276     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4277     if( rc ) return rc;
4278     rc = freePage(pOvfl);
4279     sqlite3PagerUnref(pOvfl->pDbPage);
4280     if( rc ) return rc;
4281   }
4282   return SQLITE_OK;
4283 }
4284 
4285 /*
4286 ** Create the byte sequence used to represent a cell on page pPage
4287 ** and write that byte sequence into pCell[].  Overflow pages are
4288 ** allocated and filled in as necessary.  The calling procedure
4289 ** is responsible for making sure sufficient space has been allocated
4290 ** for pCell[].
4291 **
4292 ** Note that pCell does not necessary need to point to the pPage->aData
4293 ** area.  pCell might point to some temporary storage.  The cell will
4294 ** be constructed in this temporary area then copied into pPage->aData
4295 ** later.
4296 */
4297 static int fillInCell(
4298   MemPage *pPage,                /* The page that contains the cell */
4299   unsigned char *pCell,          /* Complete text of the cell */
4300   const void *pKey, i64 nKey,    /* The key */
4301   const void *pData,int nData,   /* The data */
4302   int nZero,                     /* Extra zero bytes to append to pData */
4303   int *pnSize                    /* Write cell size here */
4304 ){
4305   int nPayload;
4306   const u8 *pSrc;
4307   int nSrc, n, rc;
4308   int spaceLeft;
4309   MemPage *pOvfl = 0;
4310   MemPage *pToRelease = 0;
4311   unsigned char *pPrior;
4312   unsigned char *pPayload;
4313   BtShared *pBt = pPage->pBt;
4314   Pgno pgnoOvfl = 0;
4315   int nHeader;
4316   CellInfo info;
4317 
4318   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4319 
4320   /* Fill in the header. */
4321   nHeader = 0;
4322   if( !pPage->leaf ){
4323     nHeader += 4;
4324   }
4325   if( pPage->hasData ){
4326     nHeader += putVarint(&pCell[nHeader], nData+nZero);
4327   }else{
4328     nData = nZero = 0;
4329   }
4330   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4331   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4332   assert( info.nHeader==nHeader );
4333   assert( info.nKey==nKey );
4334   assert( info.nData==nData+nZero );
4335 
4336   /* Fill in the payload */
4337   nPayload = nData + nZero;
4338   if( pPage->intKey ){
4339     pSrc = pData;
4340     nSrc = nData;
4341     nData = 0;
4342   }else{
4343     nPayload += nKey;
4344     pSrc = pKey;
4345     nSrc = nKey;
4346   }
4347   *pnSize = info.nSize;
4348   spaceLeft = info.nLocal;
4349   pPayload = &pCell[nHeader];
4350   pPrior = &pCell[info.iOverflow];
4351 
4352   while( nPayload>0 ){
4353     if( spaceLeft==0 ){
4354       int isExact = 0;
4355 #ifndef SQLITE_OMIT_AUTOVACUUM
4356       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4357       if( pBt->autoVacuum ){
4358         do{
4359           pgnoOvfl++;
4360         } while(
4361           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4362         );
4363         if( pgnoOvfl>1 ){
4364           /* isExact = 1; */
4365         }
4366       }
4367 #endif
4368       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4369 #ifndef SQLITE_OMIT_AUTOVACUUM
4370       /* If the database supports auto-vacuum, and the second or subsequent
4371       ** overflow page is being allocated, add an entry to the pointer-map
4372       ** for that page now.
4373       **
4374       ** If this is the first overflow page, then write a partial entry
4375       ** to the pointer-map. If we write nothing to this pointer-map slot,
4376       ** then the optimistic overflow chain processing in clearCell()
4377       ** may misinterpret the uninitialised values and delete the
4378       ** wrong pages from the database.
4379       */
4380       if( pBt->autoVacuum && rc==SQLITE_OK ){
4381         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4382         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4383         if( rc ){
4384           releasePage(pOvfl);
4385         }
4386       }
4387 #endif
4388       if( rc ){
4389         releasePage(pToRelease);
4390         return rc;
4391       }
4392       put4byte(pPrior, pgnoOvfl);
4393       releasePage(pToRelease);
4394       pToRelease = pOvfl;
4395       pPrior = pOvfl->aData;
4396       put4byte(pPrior, 0);
4397       pPayload = &pOvfl->aData[4];
4398       spaceLeft = pBt->usableSize - 4;
4399     }
4400     n = nPayload;
4401     if( n>spaceLeft ) n = spaceLeft;
4402     if( nSrc>0 ){
4403       if( n>nSrc ) n = nSrc;
4404       assert( pSrc );
4405       memcpy(pPayload, pSrc, n);
4406     }else{
4407       memset(pPayload, 0, n);
4408     }
4409     nPayload -= n;
4410     pPayload += n;
4411     pSrc += n;
4412     nSrc -= n;
4413     spaceLeft -= n;
4414     if( nSrc==0 ){
4415       nSrc = nData;
4416       pSrc = pData;
4417     }
4418   }
4419   releasePage(pToRelease);
4420   return SQLITE_OK;
4421 }
4422 
4423 /*
4424 ** Change the MemPage.pParent pointer on the page whose number is
4425 ** given in the second argument so that MemPage.pParent holds the
4426 ** pointer in the third argument.
4427 */
4428 static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
4429   MemPage *pThis;
4430   DbPage *pDbPage;
4431 
4432   assert( sqlite3_mutex_held(pBt->mutex) );
4433   assert( pNewParent!=0 );
4434   if( pgno==0 ) return SQLITE_OK;
4435   assert( pBt->pPager!=0 );
4436   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
4437   if( pDbPage ){
4438     pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
4439     if( pThis->isInit ){
4440       assert( pThis->aData==sqlite3PagerGetData(pDbPage) );
4441       if( pThis->pParent!=pNewParent ){
4442         if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
4443         pThis->pParent = pNewParent;
4444         sqlite3PagerRef(pNewParent->pDbPage);
4445       }
4446       pThis->idxParent = idx;
4447     }
4448     sqlite3PagerUnref(pDbPage);
4449   }
4450 
4451 #ifndef SQLITE_OMIT_AUTOVACUUM
4452   if( pBt->autoVacuum ){
4453     return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
4454   }
4455 #endif
4456   return SQLITE_OK;
4457 }
4458 
4459 
4460 
4461 /*
4462 ** Change the pParent pointer of all children of pPage to point back
4463 ** to pPage.
4464 **
4465 ** In other words, for every child of pPage, invoke reparentPage()
4466 ** to make sure that each child knows that pPage is its parent.
4467 **
4468 ** This routine gets called after you memcpy() one page into
4469 ** another.
4470 */
4471 static int reparentChildPages(MemPage *pPage){
4472   int i;
4473   BtShared *pBt = pPage->pBt;
4474   int rc = SQLITE_OK;
4475 
4476   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4477   if( pPage->leaf ) return SQLITE_OK;
4478 
4479   for(i=0; i<pPage->nCell; i++){
4480     u8 *pCell = findCell(pPage, i);
4481     rc = reparentPage(pBt, get4byte(pCell), pPage, i);
4482     if( rc!=SQLITE_OK ) return rc;
4483   }
4484   rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
4485                     pPage, i);
4486   pPage->idxShift = 0;
4487   return rc;
4488 }
4489 
4490 /*
4491 ** Remove the i-th cell from pPage.  This routine effects pPage only.
4492 ** The cell content is not freed or deallocated.  It is assumed that
4493 ** the cell content has been copied someplace else.  This routine just
4494 ** removes the reference to the cell from pPage.
4495 **
4496 ** "sz" must be the number of bytes in the cell.
4497 */
4498 static void dropCell(MemPage *pPage, int idx, int sz){
4499   int i;          /* Loop counter */
4500   int pc;         /* Offset to cell content of cell being deleted */
4501   u8 *data;       /* pPage->aData */
4502   u8 *ptr;        /* Used to move bytes around within data[] */
4503 
4504   assert( idx>=0 && idx<pPage->nCell );
4505   assert( sz==cellSize(pPage, idx) );
4506   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4507   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4508   data = pPage->aData;
4509   ptr = &data[pPage->cellOffset + 2*idx];
4510   pc = get2byte(ptr);
4511   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4512   freeSpace(pPage, pc, sz);
4513   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4514     ptr[0] = ptr[2];
4515     ptr[1] = ptr[3];
4516   }
4517   pPage->nCell--;
4518   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4519   pPage->nFree += 2;
4520   pPage->idxShift = 1;
4521 }
4522 
4523 /*
4524 ** Insert a new cell on pPage at cell index "i".  pCell points to the
4525 ** content of the cell.
4526 **
4527 ** If the cell content will fit on the page, then put it there.  If it
4528 ** will not fit, then make a copy of the cell content into pTemp if
4529 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
4530 ** in pPage->aOvfl[] and make it point to the cell content (either
4531 ** in pTemp or the original pCell) and also record its index.
4532 ** Allocating a new entry in pPage->aCell[] implies that
4533 ** pPage->nOverflow is incremented.
4534 **
4535 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4536 ** cell. The caller will overwrite them after this function returns. If
4537 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4538 ** (but pCell+nSkip is always valid).
4539 */
4540 static int insertCell(
4541   MemPage *pPage,   /* Page into which we are copying */
4542   int i,            /* New cell becomes the i-th cell of the page */
4543   u8 *pCell,        /* Content of the new cell */
4544   int sz,           /* Bytes of content in pCell */
4545   u8 *pTemp,        /* Temp storage space for pCell, if needed */
4546   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
4547 ){
4548   int idx;          /* Where to write new cell content in data[] */
4549   int j;            /* Loop counter */
4550   int top;          /* First byte of content for any cell in data[] */
4551   int end;          /* First byte past the last cell pointer in data[] */
4552   int ins;          /* Index in data[] where new cell pointer is inserted */
4553   int hdr;          /* Offset into data[] of the page header */
4554   int cellOffset;   /* Address of first cell pointer in data[] */
4555   u8 *data;         /* The content of the whole page */
4556   u8 *ptr;          /* Used for moving information around in data[] */
4557 
4558   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4559   assert( sz==cellSizePtr(pPage, pCell) );
4560   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4561   if( pPage->nOverflow || sz+2>pPage->nFree ){
4562     if( pTemp ){
4563       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4564       pCell = pTemp;
4565     }
4566     j = pPage->nOverflow++;
4567     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4568     pPage->aOvfl[j].pCell = pCell;
4569     pPage->aOvfl[j].idx = i;
4570     pPage->nFree = 0;
4571   }else{
4572     int rc = sqlite3PagerWrite(pPage->pDbPage);
4573     if( rc!=SQLITE_OK ){
4574       return rc;
4575     }
4576     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4577     data = pPage->aData;
4578     hdr = pPage->hdrOffset;
4579     top = get2byte(&data[hdr+5]);
4580     cellOffset = pPage->cellOffset;
4581     end = cellOffset + 2*pPage->nCell + 2;
4582     ins = cellOffset + 2*i;
4583     if( end > top - sz ){
4584       rc = defragmentPage(pPage);
4585       if( rc!=SQLITE_OK ) return rc;
4586       top = get2byte(&data[hdr+5]);
4587       assert( end + sz <= top );
4588     }
4589     idx = allocateSpace(pPage, sz);
4590     assert( idx>0 );
4591     assert( end <= get2byte(&data[hdr+5]) );
4592     pPage->nCell++;
4593     pPage->nFree -= 2;
4594     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4595     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4596       ptr[0] = ptr[-2];
4597       ptr[1] = ptr[-1];
4598     }
4599     put2byte(&data[ins], idx);
4600     put2byte(&data[hdr+3], pPage->nCell);
4601     pPage->idxShift = 1;
4602 #ifndef SQLITE_OMIT_AUTOVACUUM
4603     if( pPage->pBt->autoVacuum ){
4604       /* The cell may contain a pointer to an overflow page. If so, write
4605       ** the entry for the overflow page into the pointer map.
4606       */
4607       CellInfo info;
4608       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4609       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4610       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4611         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4612         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4613         if( rc!=SQLITE_OK ) return rc;
4614       }
4615     }
4616 #endif
4617   }
4618 
4619   return SQLITE_OK;
4620 }
4621 
4622 /*
4623 ** Add a list of cells to a page.  The page should be initially empty.
4624 ** The cells are guaranteed to fit on the page.
4625 */
4626 static void assemblePage(
4627   MemPage *pPage,   /* The page to be assemblied */
4628   int nCell,        /* The number of cells to add to this page */
4629   u8 **apCell,      /* Pointers to cell bodies */
4630   u16 *aSize        /* Sizes of the cells */
4631 ){
4632   int i;            /* Loop counter */
4633   int totalSize;    /* Total size of all cells */
4634   int hdr;          /* Index of page header */
4635   int cellptr;      /* Address of next cell pointer */
4636   int cellbody;     /* Address of next cell body */
4637   u8 *data;         /* Data for the page */
4638 
4639   assert( pPage->nOverflow==0 );
4640   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4641   totalSize = 0;
4642   for(i=0; i<nCell; i++){
4643     totalSize += aSize[i];
4644   }
4645   assert( totalSize+2*nCell<=pPage->nFree );
4646   assert( pPage->nCell==0 );
4647   cellptr = pPage->cellOffset;
4648   data = pPage->aData;
4649   hdr = pPage->hdrOffset;
4650   put2byte(&data[hdr+3], nCell);
4651   if( nCell ){
4652     cellbody = allocateSpace(pPage, totalSize);
4653     assert( cellbody>0 );
4654     assert( pPage->nFree >= 2*nCell );
4655     pPage->nFree -= 2*nCell;
4656     for(i=0; i<nCell; i++){
4657       put2byte(&data[cellptr], cellbody);
4658       memcpy(&data[cellbody], apCell[i], aSize[i]);
4659       cellptr += 2;
4660       cellbody += aSize[i];
4661     }
4662     assert( cellbody==pPage->pBt->usableSize );
4663   }
4664   pPage->nCell = nCell;
4665 }
4666 
4667 /*
4668 ** The following parameters determine how many adjacent pages get involved
4669 ** in a balancing operation.  NN is the number of neighbors on either side
4670 ** of the page that participate in the balancing operation.  NB is the
4671 ** total number of pages that participate, including the target page and
4672 ** NN neighbors on either side.
4673 **
4674 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
4675 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4676 ** in exchange for a larger degradation in INSERT and UPDATE performance.
4677 ** The value of NN appears to give the best results overall.
4678 */
4679 #define NN 1             /* Number of neighbors on either side of pPage */
4680 #define NB (NN*2+1)      /* Total pages involved in the balance */
4681 
4682 /* Forward reference */
4683 static int balance(MemPage*, int);
4684 
4685 #ifndef SQLITE_OMIT_QUICKBALANCE
4686 /*
4687 ** This version of balance() handles the common special case where
4688 ** a new entry is being inserted on the extreme right-end of the
4689 ** tree, in other words, when the new entry will become the largest
4690 ** entry in the tree.
4691 **
4692 ** Instead of trying balance the 3 right-most leaf pages, just add
4693 ** a new page to the right-hand side and put the one new entry in
4694 ** that page.  This leaves the right side of the tree somewhat
4695 ** unbalanced.  But odds are that we will be inserting new entries
4696 ** at the end soon afterwards so the nearly empty page will quickly
4697 ** fill up.  On average.
4698 **
4699 ** pPage is the leaf page which is the right-most page in the tree.
4700 ** pParent is its parent.  pPage must have a single overflow entry
4701 ** which is also the right-most entry on the page.
4702 */
4703 static int balance_quick(MemPage *pPage, MemPage *pParent){
4704   int rc;
4705   MemPage *pNew;
4706   Pgno pgnoNew;
4707   u8 *pCell;
4708   u16 szCell;
4709   CellInfo info;
4710   BtShared *pBt = pPage->pBt;
4711   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
4712   int parentSize;                   /* Size of new divider cell */
4713   u8 parentCell[64];                /* Space for the new divider cell */
4714 
4715   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4716 
4717   /* Allocate a new page. Insert the overflow cell from pPage
4718   ** into it. Then remove the overflow cell from pPage.
4719   */
4720   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4721   if( rc!=SQLITE_OK ){
4722     return rc;
4723   }
4724   pCell = pPage->aOvfl[0].pCell;
4725   szCell = cellSizePtr(pPage, pCell);
4726   zeroPage(pNew, pPage->aData[0]);
4727   assemblePage(pNew, 1, &pCell, &szCell);
4728   pPage->nOverflow = 0;
4729 
4730   /* Set the parent of the newly allocated page to pParent. */
4731   pNew->pParent = pParent;
4732   sqlite3PagerRef(pParent->pDbPage);
4733 
4734   /* pPage is currently the right-child of pParent. Change this
4735   ** so that the right-child is the new page allocated above and
4736   ** pPage is the next-to-right child.
4737   */
4738   assert( pPage->nCell>0 );
4739   pCell = findCell(pPage, pPage->nCell-1);
4740   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4741   rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4742   if( rc!=SQLITE_OK ){
4743     return rc;
4744   }
4745   assert( parentSize<64 );
4746   rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4747   if( rc!=SQLITE_OK ){
4748     return rc;
4749   }
4750   put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4751   put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4752 
4753 #ifndef SQLITE_OMIT_AUTOVACUUM
4754   /* If this is an auto-vacuum database, update the pointer map
4755   ** with entries for the new page, and any pointer from the
4756   ** cell on the page to an overflow page.
4757   */
4758   if( pBt->autoVacuum ){
4759     rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4760     if( rc==SQLITE_OK ){
4761       rc = ptrmapPutOvfl(pNew, 0);
4762     }
4763     if( rc!=SQLITE_OK ){
4764       releasePage(pNew);
4765       return rc;
4766     }
4767   }
4768 #endif
4769 
4770   /* Release the reference to the new page and balance the parent page,
4771   ** in case the divider cell inserted caused it to become overfull.
4772   */
4773   releasePage(pNew);
4774   return balance(pParent, 0);
4775 }
4776 #endif /* SQLITE_OMIT_QUICKBALANCE */
4777 
4778 /*
4779 ** This routine redistributes Cells on pPage and up to NN*2 siblings
4780 ** of pPage so that all pages have about the same amount of free space.
4781 ** Usually NN siblings on either side of pPage is used in the balancing,
4782 ** though more siblings might come from one side if pPage is the first
4783 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
4784 ** (something which can only happen if pPage is the root page or a
4785 ** child of root) then all available siblings participate in the balancing.
4786 **
4787 ** The number of siblings of pPage might be increased or decreased by one or
4788 ** two in an effort to keep pages nearly full but not over full. The root page
4789 ** is special and is allowed to be nearly empty. If pPage is
4790 ** the root page, then the depth of the tree might be increased
4791 ** or decreased by one, as necessary, to keep the root page from being
4792 ** overfull or completely empty.
4793 **
4794 ** Note that when this routine is called, some of the Cells on pPage
4795 ** might not actually be stored in pPage->aData[].  This can happen
4796 ** if the page is overfull.  Part of the job of this routine is to
4797 ** make sure all Cells for pPage once again fit in pPage->aData[].
4798 **
4799 ** In the course of balancing the siblings of pPage, the parent of pPage
4800 ** might become overfull or underfull.  If that happens, then this routine
4801 ** is called recursively on the parent.
4802 **
4803 ** If this routine fails for any reason, it might leave the database
4804 ** in a corrupted state.  So if this routine fails, the database should
4805 ** be rolled back.
4806 */
4807 static int balance_nonroot(MemPage *pPage){
4808   MemPage *pParent;            /* The parent of pPage */
4809   BtShared *pBt;               /* The whole database */
4810   int nCell = 0;               /* Number of cells in apCell[] */
4811   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
4812   int nOld;                    /* Number of pages in apOld[] */
4813   int nNew;                    /* Number of pages in apNew[] */
4814   int nDiv;                    /* Number of cells in apDiv[] */
4815   int i, j, k;                 /* Loop counters */
4816   int idx;                     /* Index of pPage in pParent->aCell[] */
4817   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
4818   int rc;                      /* The return code */
4819   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
4820   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
4821   int usableSpace;             /* Bytes in pPage beyond the header */
4822   int pageFlags;               /* Value of pPage->aData[0] */
4823   int subtotal;                /* Subtotal of bytes in cells on one page */
4824   int iSpace = 0;              /* First unused byte of aSpace[] */
4825   MemPage *apOld[NB];          /* pPage and up to two siblings */
4826   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
4827   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
4828   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
4829   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
4830   u8 *apDiv[NB];               /* Divider cells in pParent */
4831   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
4832   int szNew[NB+2];             /* Combined size of cells place on i-th page */
4833   u8 **apCell = 0;             /* All cells begin balanced */
4834   u16 *szCell;                 /* Local size of all cells in apCell[] */
4835   u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
4836   u8 *aSpace;                  /* Space to hold copies of dividers cells */
4837 #ifndef SQLITE_OMIT_AUTOVACUUM
4838   u8 *aFrom = 0;
4839 #endif
4840 
4841   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4842 
4843   /*
4844   ** Find the parent page.
4845   */
4846   assert( pPage->isInit );
4847   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4848   pBt = pPage->pBt;
4849   pParent = pPage->pParent;
4850   assert( pParent );
4851   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4852     return rc;
4853   }
4854   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4855 
4856 #ifndef SQLITE_OMIT_QUICKBALANCE
4857   /*
4858   ** A special case:  If a new entry has just been inserted into a
4859   ** table (that is, a btree with integer keys and all data at the leaves)
4860   ** and the new entry is the right-most entry in the tree (it has the
4861   ** largest key) then use the special balance_quick() routine for
4862   ** balancing.  balance_quick() is much faster and results in a tighter
4863   ** packing of data in the common case.
4864   */
4865   if( pPage->leaf &&
4866       pPage->intKey &&
4867       pPage->leafData &&
4868       pPage->nOverflow==1 &&
4869       pPage->aOvfl[0].idx==pPage->nCell &&
4870       pPage->pParent->pgno!=1 &&
4871       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4872   ){
4873     /*
4874     ** TODO: Check the siblings to the left of pPage. It may be that
4875     ** they are not full and no new page is required.
4876     */
4877     return balance_quick(pPage, pParent);
4878   }
4879 #endif
4880 
4881   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4882     return rc;
4883   }
4884 
4885   /*
4886   ** Find the cell in the parent page whose left child points back
4887   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
4888   ** is the rightmost child of pParent then set idx to pParent->nCell
4889   */
4890   if( pParent->idxShift ){
4891     Pgno pgno;
4892     pgno = pPage->pgno;
4893     assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
4894     for(idx=0; idx<pParent->nCell; idx++){
4895       if( get4byte(findCell(pParent, idx))==pgno ){
4896         break;
4897       }
4898     }
4899     assert( idx<pParent->nCell
4900              || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
4901   }else{
4902     idx = pPage->idxParent;
4903   }
4904 
4905   /*
4906   ** Initialize variables so that it will be safe to jump
4907   ** directly to balance_cleanup at any moment.
4908   */
4909   nOld = nNew = 0;
4910   sqlite3PagerRef(pParent->pDbPage);
4911 
4912   /*
4913   ** Find sibling pages to pPage and the cells in pParent that divide
4914   ** the siblings.  An attempt is made to find NN siblings on either
4915   ** side of pPage.  More siblings are taken from one side, however, if
4916   ** pPage there are fewer than NN siblings on the other side.  If pParent
4917   ** has NB or fewer children then all children of pParent are taken.
4918   */
4919   nxDiv = idx - NN;
4920   if( nxDiv + NB > pParent->nCell ){
4921     nxDiv = pParent->nCell - NB + 1;
4922   }
4923   if( nxDiv<0 ){
4924     nxDiv = 0;
4925   }
4926   nDiv = 0;
4927   for(i=0, k=nxDiv; i<NB; i++, k++){
4928     if( k<pParent->nCell ){
4929       apDiv[i] = findCell(pParent, k);
4930       nDiv++;
4931       assert( !pParent->leaf );
4932       pgnoOld[i] = get4byte(apDiv[i]);
4933     }else if( k==pParent->nCell ){
4934       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
4935     }else{
4936       break;
4937     }
4938     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
4939     if( rc ) goto balance_cleanup;
4940     apOld[i]->idxParent = k;
4941     apCopy[i] = 0;
4942     assert( i==nOld );
4943     nOld++;
4944     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
4945   }
4946 
4947   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
4948   ** alignment */
4949   nMaxCells = (nMaxCells + 3)&~3;
4950 
4951   /*
4952   ** Allocate space for memory structures
4953   */
4954   apCell = sqlite3_malloc(
4955        nMaxCells*sizeof(u8*)                       /* apCell */
4956      + nMaxCells*sizeof(u16)                       /* szCell */
4957      + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
4958      + pBt->pageSize*5                             /* aSpace */
4959      + (ISAUTOVACUUM ? nMaxCells : 0)              /* aFrom */
4960   );
4961   if( apCell==0 ){
4962     rc = SQLITE_NOMEM;
4963     goto balance_cleanup;
4964   }
4965   szCell = (u16*)&apCell[nMaxCells];
4966   aCopy[0] = (u8*)&szCell[nMaxCells];
4967   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4968   for(i=1; i<NB; i++){
4969     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4970     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4971   }
4972   aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4973   assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4974 #ifndef SQLITE_OMIT_AUTOVACUUM
4975   if( pBt->autoVacuum ){
4976     aFrom = &aSpace[5*pBt->pageSize];
4977   }
4978 #endif
4979 
4980   /*
4981   ** Make copies of the content of pPage and its siblings into aOld[].
4982   ** The rest of this function will use data from the copies rather
4983   ** that the original pages since the original pages will be in the
4984   ** process of being overwritten.
4985   */
4986   for(i=0; i<nOld; i++){
4987     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
4988     memcpy(p, apOld[i], sizeof(MemPage));
4989     p->aData = (void*)&p[1];
4990     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
4991   }
4992 
4993   /*
4994   ** Load pointers to all cells on sibling pages and the divider cells
4995   ** into the local apCell[] array.  Make copies of the divider cells
4996   ** into space obtained form aSpace[] and remove the the divider Cells
4997   ** from pParent.
4998   **
4999   ** If the siblings are on leaf pages, then the child pointers of the
5000   ** divider cells are stripped from the cells before they are copied
5001   ** into aSpace[].  In this way, all cells in apCell[] are without
5002   ** child pointers.  If siblings are not leaves, then all cell in
5003   ** apCell[] include child pointers.  Either way, all cells in apCell[]
5004   ** are alike.
5005   **
5006   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
5007   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
5008   */
5009   nCell = 0;
5010   leafCorrection = pPage->leaf*4;
5011   leafData = pPage->leafData && pPage->leaf;
5012   for(i=0; i<nOld; i++){
5013     MemPage *pOld = apCopy[i];
5014     int limit = pOld->nCell+pOld->nOverflow;
5015     for(j=0; j<limit; j++){
5016       assert( nCell<nMaxCells );
5017       apCell[nCell] = findOverflowCell(pOld, j);
5018       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
5019 #ifndef SQLITE_OMIT_AUTOVACUUM
5020       if( pBt->autoVacuum ){
5021         int a;
5022         aFrom[nCell] = i;
5023         for(a=0; a<pOld->nOverflow; a++){
5024           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5025             aFrom[nCell] = 0xFF;
5026             break;
5027           }
5028         }
5029       }
5030 #endif
5031       nCell++;
5032     }
5033     if( i<nOld-1 ){
5034       u16 sz = cellSizePtr(pParent, apDiv[i]);
5035       if( leafData ){
5036         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5037         ** are duplicates of keys on the child pages.  We need to remove
5038         ** the divider cells from pParent, but the dividers cells are not
5039         ** added to apCell[] because they are duplicates of child cells.
5040         */
5041         dropCell(pParent, nxDiv, sz);
5042       }else{
5043         u8 *pTemp;
5044         assert( nCell<nMaxCells );
5045         szCell[nCell] = sz;
5046         pTemp = &aSpace[iSpace];
5047         iSpace += sz;
5048         assert( iSpace<=pBt->pageSize*5 );
5049         memcpy(pTemp, apDiv[i], sz);
5050         apCell[nCell] = pTemp+leafCorrection;
5051 #ifndef SQLITE_OMIT_AUTOVACUUM
5052         if( pBt->autoVacuum ){
5053           aFrom[nCell] = 0xFF;
5054         }
5055 #endif
5056         dropCell(pParent, nxDiv, sz);
5057         szCell[nCell] -= leafCorrection;
5058         assert( get4byte(pTemp)==pgnoOld[i] );
5059         if( !pOld->leaf ){
5060           assert( leafCorrection==0 );
5061           /* The right pointer of the child page pOld becomes the left
5062           ** pointer of the divider cell */
5063           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
5064         }else{
5065           assert( leafCorrection==4 );
5066           if( szCell[nCell]<4 ){
5067             /* Do not allow any cells smaller than 4 bytes. */
5068             szCell[nCell] = 4;
5069           }
5070         }
5071         nCell++;
5072       }
5073     }
5074   }
5075 
5076   /*
5077   ** Figure out the number of pages needed to hold all nCell cells.
5078   ** Store this number in "k".  Also compute szNew[] which is the total
5079   ** size of all cells on the i-th page and cntNew[] which is the index
5080   ** in apCell[] of the cell that divides page i from page i+1.
5081   ** cntNew[k] should equal nCell.
5082   **
5083   ** Values computed by this block:
5084   **
5085   **           k: The total number of sibling pages
5086   **    szNew[i]: Spaced used on the i-th sibling page.
5087   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5088   **              the right of the i-th sibling page.
5089   ** usableSpace: Number of bytes of space available on each sibling.
5090   **
5091   */
5092   usableSpace = pBt->usableSize - 12 + leafCorrection;
5093   for(subtotal=k=i=0; i<nCell; i++){
5094     assert( i<nMaxCells );
5095     subtotal += szCell[i] + 2;
5096     if( subtotal > usableSpace ){
5097       szNew[k] = subtotal - szCell[i];
5098       cntNew[k] = i;
5099       if( leafData ){ i--; }
5100       subtotal = 0;
5101       k++;
5102     }
5103   }
5104   szNew[k] = subtotal;
5105   cntNew[k] = nCell;
5106   k++;
5107 
5108   /*
5109   ** The packing computed by the previous block is biased toward the siblings
5110   ** on the left side.  The left siblings are always nearly full, while the
5111   ** right-most sibling might be nearly empty.  This block of code attempts
5112   ** to adjust the packing of siblings to get a better balance.
5113   **
5114   ** This adjustment is more than an optimization.  The packing above might
5115   ** be so out of balance as to be illegal.  For example, the right-most
5116   ** sibling might be completely empty.  This adjustment is not optional.
5117   */
5118   for(i=k-1; i>0; i--){
5119     int szRight = szNew[i];  /* Size of sibling on the right */
5120     int szLeft = szNew[i-1]; /* Size of sibling on the left */
5121     int r;              /* Index of right-most cell in left sibling */
5122     int d;              /* Index of first cell to the left of right sibling */
5123 
5124     r = cntNew[i-1] - 1;
5125     d = r + 1 - leafData;
5126     assert( d<nMaxCells );
5127     assert( r<nMaxCells );
5128     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5129       szRight += szCell[d] + 2;
5130       szLeft -= szCell[r] + 2;
5131       cntNew[i-1]--;
5132       r = cntNew[i-1] - 1;
5133       d = r + 1 - leafData;
5134     }
5135     szNew[i] = szRight;
5136     szNew[i-1] = szLeft;
5137   }
5138 
5139   /* Either we found one or more cells (cntnew[0])>0) or we are the
5140   ** a virtual root page.  A virtual root page is when the real root
5141   ** page is page 1 and we are the only child of that page.
5142   */
5143   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5144 
5145   /*
5146   ** Allocate k new pages.  Reuse old pages where possible.
5147   */
5148   assert( pPage->pgno>1 );
5149   pageFlags = pPage->aData[0];
5150   for(i=0; i<k; i++){
5151     MemPage *pNew;
5152     if( i<nOld ){
5153       pNew = apNew[i] = apOld[i];
5154       pgnoNew[i] = pgnoOld[i];
5155       apOld[i] = 0;
5156       rc = sqlite3PagerWrite(pNew->pDbPage);
5157       nNew++;
5158       if( rc ) goto balance_cleanup;
5159     }else{
5160       assert( i>0 );
5161       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5162       if( rc ) goto balance_cleanup;
5163       apNew[i] = pNew;
5164       nNew++;
5165     }
5166     zeroPage(pNew, pageFlags);
5167   }
5168 
5169   /* Free any old pages that were not reused as new pages.
5170   */
5171   while( i<nOld ){
5172     rc = freePage(apOld[i]);
5173     if( rc ) goto balance_cleanup;
5174     releasePage(apOld[i]);
5175     apOld[i] = 0;
5176     i++;
5177   }
5178 
5179   /*
5180   ** Put the new pages in accending order.  This helps to
5181   ** keep entries in the disk file in order so that a scan
5182   ** of the table is a linear scan through the file.  That
5183   ** in turn helps the operating system to deliver pages
5184   ** from the disk more rapidly.
5185   **
5186   ** An O(n^2) insertion sort algorithm is used, but since
5187   ** n is never more than NB (a small constant), that should
5188   ** not be a problem.
5189   **
5190   ** When NB==3, this one optimization makes the database
5191   ** about 25% faster for large insertions and deletions.
5192   */
5193   for(i=0; i<k-1; i++){
5194     int minV = pgnoNew[i];
5195     int minI = i;
5196     for(j=i+1; j<k; j++){
5197       if( pgnoNew[j]<(unsigned)minV ){
5198         minI = j;
5199         minV = pgnoNew[j];
5200       }
5201     }
5202     if( minI>i ){
5203       int t;
5204       MemPage *pT;
5205       t = pgnoNew[i];
5206       pT = apNew[i];
5207       pgnoNew[i] = pgnoNew[minI];
5208       apNew[i] = apNew[minI];
5209       pgnoNew[minI] = t;
5210       apNew[minI] = pT;
5211     }
5212   }
5213   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5214     pgnoOld[0],
5215     nOld>=2 ? pgnoOld[1] : 0,
5216     nOld>=3 ? pgnoOld[2] : 0,
5217     pgnoNew[0], szNew[0],
5218     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5219     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5220     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5221     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5222 
5223   /*
5224   ** Evenly distribute the data in apCell[] across the new pages.
5225   ** Insert divider cells into pParent as necessary.
5226   */
5227   j = 0;
5228   for(i=0; i<nNew; i++){
5229     /* Assemble the new sibling page. */
5230     MemPage *pNew = apNew[i];
5231     assert( j<nMaxCells );
5232     assert( pNew->pgno==pgnoNew[i] );
5233     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5234     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5235     assert( pNew->nOverflow==0 );
5236 
5237 #ifndef SQLITE_OMIT_AUTOVACUUM
5238     /* If this is an auto-vacuum database, update the pointer map entries
5239     ** that point to the siblings that were rearranged. These can be: left
5240     ** children of cells, the right-child of the page, or overflow pages
5241     ** pointed to by cells.
5242     */
5243     if( pBt->autoVacuum ){
5244       for(k=j; k<cntNew[i]; k++){
5245         assert( k<nMaxCells );
5246         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5247           rc = ptrmapPutOvfl(pNew, k-j);
5248           if( rc!=SQLITE_OK ){
5249             goto balance_cleanup;
5250           }
5251         }
5252       }
5253     }
5254 #endif
5255 
5256     j = cntNew[i];
5257 
5258     /* If the sibling page assembled above was not the right-most sibling,
5259     ** insert a divider cell into the parent page.
5260     */
5261     if( i<nNew-1 && j<nCell ){
5262       u8 *pCell;
5263       u8 *pTemp;
5264       int sz;
5265 
5266       assert( j<nMaxCells );
5267       pCell = apCell[j];
5268       sz = szCell[j] + leafCorrection;
5269       if( !pNew->leaf ){
5270         memcpy(&pNew->aData[8], pCell, 4);
5271         pTemp = 0;
5272       }else if( leafData ){
5273         /* If the tree is a leaf-data tree, and the siblings are leaves,
5274         ** then there is no divider cell in apCell[]. Instead, the divider
5275         ** cell consists of the integer key for the right-most cell of
5276         ** the sibling-page assembled above only.
5277         */
5278         CellInfo info;
5279         j--;
5280         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5281         pCell = &aSpace[iSpace];
5282         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5283         iSpace += sz;
5284         assert( iSpace<=pBt->pageSize*5 );
5285         pTemp = 0;
5286       }else{
5287         pCell -= 4;
5288         pTemp = &aSpace[iSpace];
5289         iSpace += sz;
5290         assert( iSpace<=pBt->pageSize*5 );
5291         /* Obscure case for non-leaf-data trees: If the cell at pCell was
5292         ** previously stored on a leaf node, and its reported size was 4
5293         ** bytes, then it may actually be smaller than this
5294         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5295         ** any cell). But it is important to pass the correct size to
5296         ** insertCell(), so reparse the cell now.
5297         **
5298         ** Note that this can never happen in an SQLite data file, as all
5299         ** cells are at least 4 bytes. It only happens in b-trees used
5300         ** to evaluate "IN (SELECT ...)" and similar clauses.
5301         */
5302         if( szCell[j]==4 ){
5303           assert(leafCorrection==4);
5304           sz = cellSizePtr(pParent, pCell);
5305         }
5306       }
5307       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5308       if( rc!=SQLITE_OK ) goto balance_cleanup;
5309       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5310 #ifndef SQLITE_OMIT_AUTOVACUUM
5311       /* If this is an auto-vacuum database, and not a leaf-data tree,
5312       ** then update the pointer map with an entry for the overflow page
5313       ** that the cell just inserted points to (if any).
5314       */
5315       if( pBt->autoVacuum && !leafData ){
5316         rc = ptrmapPutOvfl(pParent, nxDiv);
5317         if( rc!=SQLITE_OK ){
5318           goto balance_cleanup;
5319         }
5320       }
5321 #endif
5322       j++;
5323       nxDiv++;
5324     }
5325   }
5326   assert( j==nCell );
5327   assert( nOld>0 );
5328   assert( nNew>0 );
5329   if( (pageFlags & PTF_LEAF)==0 ){
5330     memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
5331   }
5332   if( nxDiv==pParent->nCell+pParent->nOverflow ){
5333     /* Right-most sibling is the right-most child of pParent */
5334     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5335   }else{
5336     /* Right-most sibling is the left child of the first entry in pParent
5337     ** past the right-most divider entry */
5338     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5339   }
5340 
5341   /*
5342   ** Reparent children of all cells.
5343   */
5344   for(i=0; i<nNew; i++){
5345     rc = reparentChildPages(apNew[i]);
5346     if( rc!=SQLITE_OK ) goto balance_cleanup;
5347   }
5348   rc = reparentChildPages(pParent);
5349   if( rc!=SQLITE_OK ) goto balance_cleanup;
5350 
5351   /*
5352   ** Balance the parent page.  Note that the current page (pPage) might
5353   ** have been added to the freelist so it might no longer be initialized.
5354   ** But the parent page will always be initialized.
5355   */
5356   assert( pParent->isInit );
5357   rc = balance(pParent, 0);
5358 
5359   /*
5360   ** Cleanup before returning.
5361   */
5362 balance_cleanup:
5363   sqlite3_free(apCell);
5364   for(i=0; i<nOld; i++){
5365     releasePage(apOld[i]);
5366   }
5367   for(i=0; i<nNew; i++){
5368     releasePage(apNew[i]);
5369   }
5370   releasePage(pParent);
5371   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5372           pPage->pgno, nOld, nNew, nCell));
5373   return rc;
5374 }
5375 
5376 /*
5377 ** This routine is called for the root page of a btree when the root
5378 ** page contains no cells.  This is an opportunity to make the tree
5379 ** shallower by one level.
5380 */
5381 static int balance_shallower(MemPage *pPage){
5382   MemPage *pChild;             /* The only child page of pPage */
5383   Pgno pgnoChild;              /* Page number for pChild */
5384   int rc = SQLITE_OK;          /* Return code from subprocedures */
5385   BtShared *pBt;                  /* The main BTree structure */
5386   int mxCellPerPage;           /* Maximum number of cells per page */
5387   u8 **apCell;                 /* All cells from pages being balanced */
5388   u16 *szCell;                 /* Local size of all cells */
5389 
5390   assert( pPage->pParent==0 );
5391   assert( pPage->nCell==0 );
5392   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5393   pBt = pPage->pBt;
5394   mxCellPerPage = MX_CELL(pBt);
5395   apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
5396   if( apCell==0 ) return SQLITE_NOMEM;
5397   szCell = (u16*)&apCell[mxCellPerPage];
5398   if( pPage->leaf ){
5399     /* The table is completely empty */
5400     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5401   }else{
5402     /* The root page is empty but has one child.  Transfer the
5403     ** information from that one child into the root page if it
5404     ** will fit.  This reduces the depth of the tree by one.
5405     **
5406     ** If the root page is page 1, it has less space available than
5407     ** its child (due to the 100 byte header that occurs at the beginning
5408     ** of the database fle), so it might not be able to hold all of the
5409     ** information currently contained in the child.  If this is the
5410     ** case, then do not do the transfer.  Leave page 1 empty except
5411     ** for the right-pointer to the child page.  The child page becomes
5412     ** the virtual root of the tree.
5413     */
5414     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5415     assert( pgnoChild>0 );
5416     assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) );
5417     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5418     if( rc ) goto end_shallow_balance;
5419     if( pPage->pgno==1 ){
5420       rc = sqlite3BtreeInitPage(pChild, pPage);
5421       if( rc ) goto end_shallow_balance;
5422       assert( pChild->nOverflow==0 );
5423       if( pChild->nFree>=100 ){
5424         /* The child information will fit on the root page, so do the
5425         ** copy */
5426         int i;
5427         zeroPage(pPage, pChild->aData[0]);
5428         for(i=0; i<pChild->nCell; i++){
5429           apCell[i] = findCell(pChild,i);
5430           szCell[i] = cellSizePtr(pChild, apCell[i]);
5431         }
5432         assemblePage(pPage, pChild->nCell, apCell, szCell);
5433         /* Copy the right-pointer of the child to the parent. */
5434         put4byte(&pPage->aData[pPage->hdrOffset+8],
5435             get4byte(&pChild->aData[pChild->hdrOffset+8]));
5436         freePage(pChild);
5437         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5438       }else{
5439         /* The child has more information that will fit on the root.
5440         ** The tree is already balanced.  Do nothing. */
5441         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5442       }
5443     }else{
5444       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5445       pPage->isInit = 0;
5446       pPage->pParent = 0;
5447       rc = sqlite3BtreeInitPage(pPage, 0);
5448       assert( rc==SQLITE_OK );
5449       freePage(pChild);
5450       TRACE(("BALANCE: transfer child %d into root %d\n",
5451               pChild->pgno, pPage->pgno));
5452     }
5453     rc = reparentChildPages(pPage);
5454     assert( pPage->nOverflow==0 );
5455 #ifndef SQLITE_OMIT_AUTOVACUUM
5456     if( pBt->autoVacuum ){
5457       int i;
5458       for(i=0; i<pPage->nCell; i++){
5459         rc = ptrmapPutOvfl(pPage, i);
5460         if( rc!=SQLITE_OK ){
5461           goto end_shallow_balance;
5462         }
5463       }
5464     }
5465 #endif
5466     releasePage(pChild);
5467   }
5468 end_shallow_balance:
5469   sqlite3_free(apCell);
5470   return rc;
5471 }
5472 
5473 
5474 /*
5475 ** The root page is overfull
5476 **
5477 ** When this happens, Create a new child page and copy the
5478 ** contents of the root into the child.  Then make the root
5479 ** page an empty page with rightChild pointing to the new
5480 ** child.   Finally, call balance_internal() on the new child
5481 ** to cause it to split.
5482 */
5483 static int balance_deeper(MemPage *pPage){
5484   int rc;             /* Return value from subprocedures */
5485   MemPage *pChild;    /* Pointer to a new child page */
5486   Pgno pgnoChild;     /* Page number of the new child page */
5487   BtShared *pBt;         /* The BTree */
5488   int usableSize;     /* Total usable size of a page */
5489   u8 *data;           /* Content of the parent page */
5490   u8 *cdata;          /* Content of the child page */
5491   int hdr;            /* Offset to page header in parent */
5492   int brk;            /* Offset to content of first cell in parent */
5493 
5494   assert( pPage->pParent==0 );
5495   assert( pPage->nOverflow>0 );
5496   pBt = pPage->pBt;
5497   assert( sqlite3_mutex_held(pBt->mutex) );
5498   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5499   if( rc ) return rc;
5500   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5501   usableSize = pBt->usableSize;
5502   data = pPage->aData;
5503   hdr = pPage->hdrOffset;
5504   brk = get2byte(&data[hdr+5]);
5505   cdata = pChild->aData;
5506   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5507   memcpy(&cdata[brk], &data[brk], usableSize-brk);
5508   assert( pChild->isInit==0 );
5509   rc = sqlite3BtreeInitPage(pChild, pPage);
5510   if( rc ) goto balancedeeper_out;
5511   memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
5512   pChild->nOverflow = pPage->nOverflow;
5513   if( pChild->nOverflow ){
5514     pChild->nFree = 0;
5515   }
5516   assert( pChild->nCell==pPage->nCell );
5517   zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5518   put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5519   TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5520 #ifndef SQLITE_OMIT_AUTOVACUUM
5521   if( pBt->autoVacuum ){
5522     int i;
5523     rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5524     if( rc ) goto balancedeeper_out;
5525     for(i=0; i<pChild->nCell; i++){
5526       rc = ptrmapPutOvfl(pChild, i);
5527       if( rc!=SQLITE_OK ){
5528         return rc;
5529       }
5530     }
5531   }
5532 #endif
5533   rc = balance_nonroot(pChild);
5534 
5535 balancedeeper_out:
5536   releasePage(pChild);
5537   return rc;
5538 }
5539 
5540 /*
5541 ** Decide if the page pPage needs to be balanced.  If balancing is
5542 ** required, call the appropriate balancing routine.
5543 */
5544 static int balance(MemPage *pPage, int insert){
5545   int rc = SQLITE_OK;
5546   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5547   if( pPage->pParent==0 ){
5548     rc = sqlite3PagerWrite(pPage->pDbPage);
5549     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5550       rc = balance_deeper(pPage);
5551     }
5552     if( rc==SQLITE_OK && pPage->nCell==0 ){
5553       rc = balance_shallower(pPage);
5554     }
5555   }else{
5556     if( pPage->nOverflow>0 ||
5557         (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5558       rc = balance_nonroot(pPage);
5559     }
5560   }
5561   return rc;
5562 }
5563 
5564 /*
5565 ** This routine checks all cursors that point to table pgnoRoot.
5566 ** If any of those cursors were opened with wrFlag==0 in a different
5567 ** database connection (a database connection that shares the pager
5568 ** cache with the current connection) and that other connection
5569 ** is not in the ReadUncommmitted state, then this routine returns
5570 ** SQLITE_LOCKED.
5571 **
5572 ** In addition to checking for read-locks (where a read-lock
5573 ** means a cursor opened with wrFlag==0) this routine also moves
5574 ** all write cursors so that they are pointing to the
5575 ** first Cell on the root page.  This is necessary because an insert
5576 ** or delete might change the number of cells on a page or delete
5577 ** a page entirely and we do not want to leave any cursors
5578 ** pointing to non-existant pages or cells.
5579 */
5580 static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){
5581   BtCursor *p;
5582   BtShared *pBt = pBtree->pBt;
5583   sqlite3 *db = pBtree->db;
5584   assert( sqlite3BtreeHoldsMutex(pBtree) );
5585   for(p=pBt->pCursor; p; p=p->pNext){
5586     if( p==pExclude ) continue;
5587     if( p->eState!=CURSOR_VALID ) continue;
5588     if( p->pgnoRoot!=pgnoRoot ) continue;
5589     if( p->wrFlag==0 ){
5590       sqlite3 *dbOther = p->pBtree->db;
5591       if( dbOther==0 ||
5592          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5593         return SQLITE_LOCKED;
5594       }
5595     }else if( p->pPage->pgno!=p->pgnoRoot ){
5596       moveToRoot(p);
5597     }
5598   }
5599   return SQLITE_OK;
5600 }
5601 
5602 /*
5603 ** Make sure pBt->pTmpSpace points to an allocation of
5604 ** MX_CELL_SIZE(pBt) bytes.
5605 */
5606 static void allocateTempSpace(BtShared *pBt){
5607   if( !pBt->pTmpSpace ){
5608     pBt->pTmpSpace = sqlite3_malloc(MX_CELL_SIZE(pBt));
5609   }
5610 }
5611 
5612 /*
5613 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
5614 ** and the data is given by (pData,nData).  The cursor is used only to
5615 ** define what table the record should be inserted into.  The cursor
5616 ** is left pointing at a random location.
5617 **
5618 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
5619 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
5620 */
5621 int sqlite3BtreeInsert(
5622   BtCursor *pCur,                /* Insert data into the table of this cursor */
5623   const void *pKey, i64 nKey,    /* The key of the new record */
5624   const void *pData, int nData,  /* The data of the new record */
5625   int nZero,                     /* Number of extra 0 bytes to append to data */
5626   int appendBias                 /* True if this is likely an append */
5627 ){
5628   int rc;
5629   int loc;
5630   int szNew;
5631   MemPage *pPage;
5632   Btree *p = pCur->pBtree;
5633   BtShared *pBt = p->pBt;
5634   unsigned char *oldCell;
5635   unsigned char *newCell = 0;
5636 
5637   assert( cursorHoldsMutex(pCur) );
5638   if( pBt->inTransaction!=TRANS_WRITE ){
5639     /* Must start a transaction before doing an insert */
5640     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5641     return rc;
5642   }
5643   assert( !pBt->readOnly );
5644   if( !pCur->wrFlag ){
5645     return SQLITE_PERM;   /* Cursor not open for writing */
5646   }
5647   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5648     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5649   }
5650   if( pCur->eState==CURSOR_FAULT ){
5651     return pCur->skip;
5652   }
5653 
5654   /* Save the positions of any other cursors open on this table */
5655   clearCursorPosition(pCur);
5656   if(
5657     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5658     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, 0, nKey, appendBias, &loc))
5659   ){
5660     return rc;
5661   }
5662 
5663   pPage = pCur->pPage;
5664   assert( pPage->intKey || nKey>=0 );
5665   assert( pPage->leaf || !pPage->leafData );
5666   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5667           pCur->pgnoRoot, nKey, nData, pPage->pgno,
5668           loc==0 ? "overwrite" : "new entry"));
5669   assert( pPage->isInit );
5670   allocateTempSpace(pBt);
5671   newCell = pBt->pTmpSpace;
5672   if( newCell==0 ) return SQLITE_NOMEM;
5673   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5674   if( rc ) goto end_insert;
5675   assert( szNew==cellSizePtr(pPage, newCell) );
5676   assert( szNew<=MX_CELL_SIZE(pBt) );
5677   if( loc==0 && CURSOR_VALID==pCur->eState ){
5678     u16 szOld;
5679     assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
5680     rc = sqlite3PagerWrite(pPage->pDbPage);
5681     if( rc ){
5682       goto end_insert;
5683     }
5684     oldCell = findCell(pPage, pCur->idx);
5685     if( !pPage->leaf ){
5686       memcpy(newCell, oldCell, 4);
5687     }
5688     szOld = cellSizePtr(pPage, oldCell);
5689     rc = clearCell(pPage, oldCell);
5690     if( rc ) goto end_insert;
5691     dropCell(pPage, pCur->idx, szOld);
5692   }else if( loc<0 && pPage->nCell>0 ){
5693     assert( pPage->leaf );
5694     pCur->idx++;
5695     pCur->info.nSize = 0;
5696     pCur->validNKey = 0;
5697   }else{
5698     assert( pPage->leaf );
5699   }
5700   rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
5701   if( rc!=SQLITE_OK ) goto end_insert;
5702   rc = balance(pPage, 1);
5703   /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
5704   /* fflush(stdout); */
5705   if( rc==SQLITE_OK ){
5706     moveToRoot(pCur);
5707   }
5708 end_insert:
5709   return rc;
5710 }
5711 
5712 /*
5713 ** Delete the entry that the cursor is pointing to.  The cursor
5714 ** is left pointing at a random location.
5715 */
5716 int sqlite3BtreeDelete(BtCursor *pCur){
5717   MemPage *pPage = pCur->pPage;
5718   unsigned char *pCell;
5719   int rc;
5720   Pgno pgnoChild = 0;
5721   Btree *p = pCur->pBtree;
5722   BtShared *pBt = p->pBt;
5723 
5724   assert( cursorHoldsMutex(pCur) );
5725   assert( pPage->isInit );
5726   if( pBt->inTransaction!=TRANS_WRITE ){
5727     /* Must start a transaction before doing a delete */
5728     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5729     return rc;
5730   }
5731   assert( !pBt->readOnly );
5732   if( pCur->eState==CURSOR_FAULT ){
5733     return pCur->skip;
5734   }
5735   if( pCur->idx >= pPage->nCell ){
5736     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
5737   }
5738   if( !pCur->wrFlag ){
5739     return SQLITE_PERM;   /* Did not open this cursor for writing */
5740   }
5741   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5742     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5743   }
5744 
5745   /* Restore the current cursor position (a no-op if the cursor is not in
5746   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5747   ** open on the same table. Then call sqlite3PagerWrite() on the page
5748   ** that the entry will be deleted from.
5749   */
5750   if(
5751     (rc = restoreOrClearCursorPosition(pCur))!=0 ||
5752     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5753     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5754   ){
5755     return rc;
5756   }
5757 
5758   /* Locate the cell within its page and leave pCell pointing to the
5759   ** data. The clearCell() call frees any overflow pages associated with the
5760   ** cell. The cell itself is still intact.
5761   */
5762   pCell = findCell(pPage, pCur->idx);
5763   if( !pPage->leaf ){
5764     pgnoChild = get4byte(pCell);
5765   }
5766   rc = clearCell(pPage, pCell);
5767   if( rc ){
5768     return rc;
5769   }
5770 
5771   if( !pPage->leaf ){
5772     /*
5773     ** The entry we are about to delete is not a leaf so if we do not
5774     ** do something we will leave a hole on an internal page.
5775     ** We have to fill the hole by moving in a cell from a leaf.  The
5776     ** next Cell after the one to be deleted is guaranteed to exist and
5777     ** to be a leaf so we can use it.
5778     */
5779     BtCursor leafCur;
5780     unsigned char *pNext;
5781     int notUsed;
5782     unsigned char *tempCell = 0;
5783     assert( !pPage->leafData );
5784     sqlite3BtreeGetTempCursor(pCur, &leafCur);
5785     rc = sqlite3BtreeNext(&leafCur, &notUsed);
5786     if( rc==SQLITE_OK ){
5787       rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
5788     }
5789     if( rc==SQLITE_OK ){
5790       u16 szNext;
5791       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5792          pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
5793       dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5794       pNext = findCell(leafCur.pPage, leafCur.idx);
5795       szNext = cellSizePtr(leafCur.pPage, pNext);
5796       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5797       allocateTempSpace(pBt);
5798       tempCell = pBt->pTmpSpace;
5799       if( tempCell==0 ){
5800         rc = SQLITE_NOMEM;
5801       }
5802       if( rc==SQLITE_OK ){
5803         rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
5804       }
5805       if( rc==SQLITE_OK ){
5806         put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
5807         rc = balance(pPage, 0);
5808       }
5809       if( rc==SQLITE_OK ){
5810         dropCell(leafCur.pPage, leafCur.idx, szNext);
5811         rc = balance(leafCur.pPage, 0);
5812       }
5813     }
5814     sqlite3BtreeReleaseTempCursor(&leafCur);
5815   }else{
5816     TRACE(("DELETE: table=%d delete from leaf %d\n",
5817        pCur->pgnoRoot, pPage->pgno));
5818     dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5819     rc = balance(pPage, 0);
5820   }
5821   if( rc==SQLITE_OK ){
5822     moveToRoot(pCur);
5823   }
5824   return rc;
5825 }
5826 
5827 /*
5828 ** Create a new BTree table.  Write into *piTable the page
5829 ** number for the root page of the new table.
5830 **
5831 ** The type of type is determined by the flags parameter.  Only the
5832 ** following values of flags are currently in use.  Other values for
5833 ** flags might not work:
5834 **
5835 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
5836 **     BTREE_ZERODATA                  Used for SQL indices
5837 */
5838 static int btreeCreateTable(Btree *p, int *piTable, int flags){
5839   BtShared *pBt = p->pBt;
5840   MemPage *pRoot;
5841   Pgno pgnoRoot;
5842   int rc;
5843 
5844   assert( sqlite3BtreeHoldsMutex(p) );
5845   if( pBt->inTransaction!=TRANS_WRITE ){
5846     /* Must start a transaction first */
5847     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5848     return rc;
5849   }
5850   assert( !pBt->readOnly );
5851 
5852 #ifdef SQLITE_OMIT_AUTOVACUUM
5853   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5854   if( rc ){
5855     return rc;
5856   }
5857 #else
5858   if( pBt->autoVacuum ){
5859     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
5860     MemPage *pPageMove; /* The page to move to. */
5861 
5862     /* Creating a new table may probably require moving an existing database
5863     ** to make room for the new tables root page. In case this page turns
5864     ** out to be an overflow page, delete all overflow page-map caches
5865     ** held by open cursors.
5866     */
5867     invalidateAllOverflowCache(pBt);
5868 
5869     /* Read the value of meta[3] from the database to determine where the
5870     ** root page of the new table should go. meta[3] is the largest root-page
5871     ** created so far, so the new root-page is (meta[3]+1).
5872     */
5873     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
5874     if( rc!=SQLITE_OK ){
5875       return rc;
5876     }
5877     pgnoRoot++;
5878 
5879     /* The new root-page may not be allocated on a pointer-map page, or the
5880     ** PENDING_BYTE page.
5881     */
5882     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
5883         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
5884       pgnoRoot++;
5885     }
5886     assert( pgnoRoot>=3 );
5887 
5888     /* Allocate a page. The page that currently resides at pgnoRoot will
5889     ** be moved to the allocated page (unless the allocated page happens
5890     ** to reside at pgnoRoot).
5891     */
5892     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
5893     if( rc!=SQLITE_OK ){
5894       return rc;
5895     }
5896 
5897     if( pgnoMove!=pgnoRoot ){
5898       /* pgnoRoot is the page that will be used for the root-page of
5899       ** the new table (assuming an error did not occur). But we were
5900       ** allocated pgnoMove. If required (i.e. if it was not allocated
5901       ** by extending the file), the current page at position pgnoMove
5902       ** is already journaled.
5903       */
5904       u8 eType;
5905       Pgno iPtrPage;
5906 
5907       releasePage(pPageMove);
5908 
5909       /* Move the page currently at pgnoRoot to pgnoMove. */
5910       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5911       if( rc!=SQLITE_OK ){
5912         return rc;
5913       }
5914       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
5915       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
5916         releasePage(pRoot);
5917         return rc;
5918       }
5919       assert( eType!=PTRMAP_ROOTPAGE );
5920       assert( eType!=PTRMAP_FREEPAGE );
5921       rc = sqlite3PagerWrite(pRoot->pDbPage);
5922       if( rc!=SQLITE_OK ){
5923         releasePage(pRoot);
5924         return rc;
5925       }
5926       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
5927       releasePage(pRoot);
5928 
5929       /* Obtain the page at pgnoRoot */
5930       if( rc!=SQLITE_OK ){
5931         return rc;
5932       }
5933       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5934       if( rc!=SQLITE_OK ){
5935         return rc;
5936       }
5937       rc = sqlite3PagerWrite(pRoot->pDbPage);
5938       if( rc!=SQLITE_OK ){
5939         releasePage(pRoot);
5940         return rc;
5941       }
5942     }else{
5943       pRoot = pPageMove;
5944     }
5945 
5946     /* Update the pointer-map and meta-data with the new root-page number. */
5947     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
5948     if( rc ){
5949       releasePage(pRoot);
5950       return rc;
5951     }
5952     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
5953     if( rc ){
5954       releasePage(pRoot);
5955       return rc;
5956     }
5957 
5958   }else{
5959     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5960     if( rc ) return rc;
5961   }
5962 #endif
5963   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
5964   zeroPage(pRoot, flags | PTF_LEAF);
5965   sqlite3PagerUnref(pRoot->pDbPage);
5966   *piTable = (int)pgnoRoot;
5967   return SQLITE_OK;
5968 }
5969 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
5970   int rc;
5971   sqlite3BtreeEnter(p);
5972   p->pBt->db = p->db;
5973   rc = btreeCreateTable(p, piTable, flags);
5974   sqlite3BtreeLeave(p);
5975   return rc;
5976 }
5977 
5978 /*
5979 ** Erase the given database page and all its children.  Return
5980 ** the page to the freelist.
5981 */
5982 static int clearDatabasePage(
5983   BtShared *pBt,           /* The BTree that contains the table */
5984   Pgno pgno,            /* Page number to clear */
5985   MemPage *pParent,     /* Parent page.  NULL for the root */
5986   int freePageFlag      /* Deallocate page if true */
5987 ){
5988   MemPage *pPage = 0;
5989   int rc;
5990   unsigned char *pCell;
5991   int i;
5992 
5993   assert( sqlite3_mutex_held(pBt->mutex) );
5994   if( pgno>sqlite3PagerPagecount(pBt->pPager) ){
5995     return SQLITE_CORRUPT_BKPT;
5996   }
5997 
5998   rc = getAndInitPage(pBt, pgno, &pPage, pParent);
5999   if( rc ) goto cleardatabasepage_out;
6000   for(i=0; i<pPage->nCell; i++){
6001     pCell = findCell(pPage, i);
6002     if( !pPage->leaf ){
6003       rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
6004       if( rc ) goto cleardatabasepage_out;
6005     }
6006     rc = clearCell(pPage, pCell);
6007     if( rc ) goto cleardatabasepage_out;
6008   }
6009   if( !pPage->leaf ){
6010     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
6011     if( rc ) goto cleardatabasepage_out;
6012   }
6013   if( freePageFlag ){
6014     rc = freePage(pPage);
6015   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6016     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6017   }
6018 
6019 cleardatabasepage_out:
6020   releasePage(pPage);
6021   return rc;
6022 }
6023 
6024 /*
6025 ** Delete all information from a single table in the database.  iTable is
6026 ** the page number of the root of the table.  After this routine returns,
6027 ** the root page is empty, but still exists.
6028 **
6029 ** This routine will fail with SQLITE_LOCKED if there are any open
6030 ** read cursors on the table.  Open write cursors are moved to the
6031 ** root of the table.
6032 */
6033 int sqlite3BtreeClearTable(Btree *p, int iTable){
6034   int rc;
6035   BtShared *pBt = p->pBt;
6036   sqlite3BtreeEnter(p);
6037   pBt->db = p->db;
6038   if( p->inTrans!=TRANS_WRITE ){
6039     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6040   }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){
6041     /* nothing to do */
6042   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6043     /* nothing to do */
6044   }else{
6045     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
6046   }
6047   sqlite3BtreeLeave(p);
6048   return rc;
6049 }
6050 
6051 /*
6052 ** Erase all information in a table and add the root of the table to
6053 ** the freelist.  Except, the root of the principle table (the one on
6054 ** page 1) is never added to the freelist.
6055 **
6056 ** This routine will fail with SQLITE_LOCKED if there are any open
6057 ** cursors on the table.
6058 **
6059 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6060 ** root page in the database file, then the last root page
6061 ** in the database file is moved into the slot formerly occupied by
6062 ** iTable and that last slot formerly occupied by the last root page
6063 ** is added to the freelist instead of iTable.  In this say, all
6064 ** root pages are kept at the beginning of the database file, which
6065 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
6066 ** page number that used to be the last root page in the file before
6067 ** the move.  If no page gets moved, *piMoved is set to 0.
6068 ** The last root page is recorded in meta[3] and the value of
6069 ** meta[3] is updated by this procedure.
6070 */
6071 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
6072   int rc;
6073   MemPage *pPage = 0;
6074   BtShared *pBt = p->pBt;
6075 
6076   assert( sqlite3BtreeHoldsMutex(p) );
6077   if( p->inTrans!=TRANS_WRITE ){
6078     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6079   }
6080 
6081   /* It is illegal to drop a table if any cursors are open on the
6082   ** database. This is because in auto-vacuum mode the backend may
6083   ** need to move another root-page to fill a gap left by the deleted
6084   ** root page. If an open cursor was using this page a problem would
6085   ** occur.
6086   */
6087   if( pBt->pCursor ){
6088     return SQLITE_LOCKED;
6089   }
6090 
6091   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6092   if( rc ) return rc;
6093   rc = sqlite3BtreeClearTable(p, iTable);
6094   if( rc ){
6095     releasePage(pPage);
6096     return rc;
6097   }
6098 
6099   *piMoved = 0;
6100 
6101   if( iTable>1 ){
6102 #ifdef SQLITE_OMIT_AUTOVACUUM
6103     rc = freePage(pPage);
6104     releasePage(pPage);
6105 #else
6106     if( pBt->autoVacuum ){
6107       Pgno maxRootPgno;
6108       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6109       if( rc!=SQLITE_OK ){
6110         releasePage(pPage);
6111         return rc;
6112       }
6113 
6114       if( iTable==maxRootPgno ){
6115         /* If the table being dropped is the table with the largest root-page
6116         ** number in the database, put the root page on the free list.
6117         */
6118         rc = freePage(pPage);
6119         releasePage(pPage);
6120         if( rc!=SQLITE_OK ){
6121           return rc;
6122         }
6123       }else{
6124         /* The table being dropped does not have the largest root-page
6125         ** number in the database. So move the page that does into the
6126         ** gap left by the deleted root-page.
6127         */
6128         MemPage *pMove;
6129         releasePage(pPage);
6130         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6131         if( rc!=SQLITE_OK ){
6132           return rc;
6133         }
6134         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
6135         releasePage(pMove);
6136         if( rc!=SQLITE_OK ){
6137           return rc;
6138         }
6139         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6140         if( rc!=SQLITE_OK ){
6141           return rc;
6142         }
6143         rc = freePage(pMove);
6144         releasePage(pMove);
6145         if( rc!=SQLITE_OK ){
6146           return rc;
6147         }
6148         *piMoved = maxRootPgno;
6149       }
6150 
6151       /* Set the new 'max-root-page' value in the database header. This
6152       ** is the old value less one, less one more if that happens to
6153       ** be a root-page number, less one again if that is the
6154       ** PENDING_BYTE_PAGE.
6155       */
6156       maxRootPgno--;
6157       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6158         maxRootPgno--;
6159       }
6160       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6161         maxRootPgno--;
6162       }
6163       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6164 
6165       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6166     }else{
6167       rc = freePage(pPage);
6168       releasePage(pPage);
6169     }
6170 #endif
6171   }else{
6172     /* If sqlite3BtreeDropTable was called on page 1. */
6173     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6174     releasePage(pPage);
6175   }
6176   return rc;
6177 }
6178 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6179   int rc;
6180   sqlite3BtreeEnter(p);
6181   p->pBt->db = p->db;
6182   rc = btreeDropTable(p, iTable, piMoved);
6183   sqlite3BtreeLeave(p);
6184   return rc;
6185 }
6186 
6187 
6188 /*
6189 ** Read the meta-information out of a database file.  Meta[0]
6190 ** is the number of free pages currently in the database.  Meta[1]
6191 ** through meta[15] are available for use by higher layers.  Meta[0]
6192 ** is read-only, the others are read/write.
6193 **
6194 ** The schema layer numbers meta values differently.  At the schema
6195 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6196 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
6197 */
6198 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6199   DbPage *pDbPage;
6200   int rc;
6201   unsigned char *pP1;
6202   BtShared *pBt = p->pBt;
6203 
6204   sqlite3BtreeEnter(p);
6205   pBt->db = p->db;
6206 
6207   /* Reading a meta-data value requires a read-lock on page 1 (and hence
6208   ** the sqlite_master table. We grab this lock regardless of whether or
6209   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6210   ** 1 is treated as a special case by queryTableLock() and lockTable()).
6211   */
6212   rc = queryTableLock(p, 1, READ_LOCK);
6213   if( rc!=SQLITE_OK ){
6214     sqlite3BtreeLeave(p);
6215     return rc;
6216   }
6217 
6218   assert( idx>=0 && idx<=15 );
6219   rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6220   if( rc ){
6221     sqlite3BtreeLeave(p);
6222     return rc;
6223   }
6224   pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6225   *pMeta = get4byte(&pP1[36 + idx*4]);
6226   sqlite3PagerUnref(pDbPage);
6227 
6228   /* If autovacuumed is disabled in this build but we are trying to
6229   ** access an autovacuumed database, then make the database readonly.
6230   */
6231 #ifdef SQLITE_OMIT_AUTOVACUUM
6232   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6233 #endif
6234 
6235   /* Grab the read-lock on page 1. */
6236   rc = lockTable(p, 1, READ_LOCK);
6237   sqlite3BtreeLeave(p);
6238   return rc;
6239 }
6240 
6241 /*
6242 ** Write meta-information back into the database.  Meta[0] is
6243 ** read-only and may not be written.
6244 */
6245 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6246   BtShared *pBt = p->pBt;
6247   unsigned char *pP1;
6248   int rc;
6249   assert( idx>=1 && idx<=15 );
6250   sqlite3BtreeEnter(p);
6251   pBt->db = p->db;
6252   if( p->inTrans!=TRANS_WRITE ){
6253     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6254   }else{
6255     assert( pBt->pPage1!=0 );
6256     pP1 = pBt->pPage1->aData;
6257     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6258     if( rc==SQLITE_OK ){
6259       put4byte(&pP1[36 + idx*4], iMeta);
6260 #ifndef SQLITE_OMIT_AUTOVACUUM
6261       if( idx==7 ){
6262         assert( pBt->autoVacuum || iMeta==0 );
6263         assert( iMeta==0 || iMeta==1 );
6264         pBt->incrVacuum = iMeta;
6265       }
6266 #endif
6267     }
6268   }
6269   sqlite3BtreeLeave(p);
6270   return rc;
6271 }
6272 
6273 /*
6274 ** Return the flag byte at the beginning of the page that the cursor
6275 ** is currently pointing to.
6276 */
6277 int sqlite3BtreeFlags(BtCursor *pCur){
6278   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6279   ** restoreOrClearCursorPosition() here.
6280   */
6281   MemPage *pPage;
6282   restoreOrClearCursorPosition(pCur);
6283   pPage = pCur->pPage;
6284   assert( cursorHoldsMutex(pCur) );
6285   assert( pPage->pBt==pCur->pBt );
6286   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6287 }
6288 
6289 
6290 /*
6291 ** Return the pager associated with a BTree.  This routine is used for
6292 ** testing and debugging only.
6293 */
6294 Pager *sqlite3BtreePager(Btree *p){
6295   return p->pBt->pPager;
6296 }
6297 
6298 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6299 /*
6300 ** Append a message to the error message string.
6301 */
6302 static void checkAppendMsg(
6303   IntegrityCk *pCheck,
6304   char *zMsg1,
6305   const char *zFormat,
6306   ...
6307 ){
6308   va_list ap;
6309   char *zMsg2;
6310   if( !pCheck->mxErr ) return;
6311   pCheck->mxErr--;
6312   pCheck->nErr++;
6313   va_start(ap, zFormat);
6314   zMsg2 = sqlite3VMPrintf(0, zFormat, ap);
6315   va_end(ap);
6316   if( zMsg1==0 ) zMsg1 = "";
6317   if( pCheck->zErrMsg ){
6318     char *zOld = pCheck->zErrMsg;
6319     pCheck->zErrMsg = 0;
6320     sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
6321     sqlite3_free(zOld);
6322   }else{
6323     sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
6324   }
6325   sqlite3_free(zMsg2);
6326 }
6327 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6328 
6329 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6330 /*
6331 ** Add 1 to the reference count for page iPage.  If this is the second
6332 ** reference to the page, add an error message to pCheck->zErrMsg.
6333 ** Return 1 if there are 2 ore more references to the page and 0 if
6334 ** if this is the first reference to the page.
6335 **
6336 ** Also check that the page number is in bounds.
6337 */
6338 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6339   if( iPage==0 ) return 1;
6340   if( iPage>pCheck->nPage || iPage<0 ){
6341     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6342     return 1;
6343   }
6344   if( pCheck->anRef[iPage]==1 ){
6345     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6346     return 1;
6347   }
6348   return  (pCheck->anRef[iPage]++)>1;
6349 }
6350 
6351 #ifndef SQLITE_OMIT_AUTOVACUUM
6352 /*
6353 ** Check that the entry in the pointer-map for page iChild maps to
6354 ** page iParent, pointer type ptrType. If not, append an error message
6355 ** to pCheck.
6356 */
6357 static void checkPtrmap(
6358   IntegrityCk *pCheck,   /* Integrity check context */
6359   Pgno iChild,           /* Child page number */
6360   u8 eType,              /* Expected pointer map type */
6361   Pgno iParent,          /* Expected pointer map parent page number */
6362   char *zContext         /* Context description (used for error msg) */
6363 ){
6364   int rc;
6365   u8 ePtrmapType;
6366   Pgno iPtrmapParent;
6367 
6368   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6369   if( rc!=SQLITE_OK ){
6370     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6371     return;
6372   }
6373 
6374   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6375     checkAppendMsg(pCheck, zContext,
6376       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6377       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6378   }
6379 }
6380 #endif
6381 
6382 /*
6383 ** Check the integrity of the freelist or of an overflow page list.
6384 ** Verify that the number of pages on the list is N.
6385 */
6386 static void checkList(
6387   IntegrityCk *pCheck,  /* Integrity checking context */
6388   int isFreeList,       /* True for a freelist.  False for overflow page list */
6389   int iPage,            /* Page number for first page in the list */
6390   int N,                /* Expected number of pages in the list */
6391   char *zContext        /* Context for error messages */
6392 ){
6393   int i;
6394   int expected = N;
6395   int iFirst = iPage;
6396   while( N-- > 0 && pCheck->mxErr ){
6397     DbPage *pOvflPage;
6398     unsigned char *pOvflData;
6399     if( iPage<1 ){
6400       checkAppendMsg(pCheck, zContext,
6401          "%d of %d pages missing from overflow list starting at %d",
6402           N+1, expected, iFirst);
6403       break;
6404     }
6405     if( checkRef(pCheck, iPage, zContext) ) break;
6406     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6407       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6408       break;
6409     }
6410     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6411     if( isFreeList ){
6412       int n = get4byte(&pOvflData[4]);
6413 #ifndef SQLITE_OMIT_AUTOVACUUM
6414       if( pCheck->pBt->autoVacuum ){
6415         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6416       }
6417 #endif
6418       if( n>pCheck->pBt->usableSize/4-8 ){
6419         checkAppendMsg(pCheck, zContext,
6420            "freelist leaf count too big on page %d", iPage);
6421         N--;
6422       }else{
6423         for(i=0; i<n; i++){
6424           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6425 #ifndef SQLITE_OMIT_AUTOVACUUM
6426           if( pCheck->pBt->autoVacuum ){
6427             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6428           }
6429 #endif
6430           checkRef(pCheck, iFreePage, zContext);
6431         }
6432         N -= n;
6433       }
6434     }
6435 #ifndef SQLITE_OMIT_AUTOVACUUM
6436     else{
6437       /* If this database supports auto-vacuum and iPage is not the last
6438       ** page in this overflow list, check that the pointer-map entry for
6439       ** the following page matches iPage.
6440       */
6441       if( pCheck->pBt->autoVacuum && N>0 ){
6442         i = get4byte(pOvflData);
6443         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6444       }
6445     }
6446 #endif
6447     iPage = get4byte(pOvflData);
6448     sqlite3PagerUnref(pOvflPage);
6449   }
6450 }
6451 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6452 
6453 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6454 /*
6455 ** Do various sanity checks on a single page of a tree.  Return
6456 ** the tree depth.  Root pages return 0.  Parents of root pages
6457 ** return 1, and so forth.
6458 **
6459 ** These checks are done:
6460 **
6461 **      1.  Make sure that cells and freeblocks do not overlap
6462 **          but combine to completely cover the page.
6463 **  NO  2.  Make sure cell keys are in order.
6464 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
6465 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
6466 **      5.  Check the integrity of overflow pages.
6467 **      6.  Recursively call checkTreePage on all children.
6468 **      7.  Verify that the depth of all children is the same.
6469 **      8.  Make sure this page is at least 33% full or else it is
6470 **          the root of the tree.
6471 */
6472 static int checkTreePage(
6473   IntegrityCk *pCheck,  /* Context for the sanity check */
6474   int iPage,            /* Page number of the page to check */
6475   MemPage *pParent,     /* Parent page */
6476   char *zParentContext  /* Parent context */
6477 ){
6478   MemPage *pPage;
6479   int i, rc, depth, d2, pgno, cnt;
6480   int hdr, cellStart;
6481   int nCell;
6482   u8 *data;
6483   BtShared *pBt;
6484   int usableSize;
6485   char zContext[100];
6486   char *hit;
6487 
6488   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6489 
6490   /* Check that the page exists
6491   */
6492   pBt = pCheck->pBt;
6493   usableSize = pBt->usableSize;
6494   if( iPage==0 ) return 0;
6495   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6496   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6497     checkAppendMsg(pCheck, zContext,
6498        "unable to get the page. error code=%d", rc);
6499     return 0;
6500   }
6501   if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){
6502     checkAppendMsg(pCheck, zContext,
6503                    "sqlite3BtreeInitPage() returns error code %d", rc);
6504     releasePage(pPage);
6505     return 0;
6506   }
6507 
6508   /* Check out all the cells.
6509   */
6510   depth = 0;
6511   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6512     u8 *pCell;
6513     int sz;
6514     CellInfo info;
6515 
6516     /* Check payload overflow pages
6517     */
6518     sqlite3_snprintf(sizeof(zContext), zContext,
6519              "On tree page %d cell %d: ", iPage, i);
6520     pCell = findCell(pPage,i);
6521     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6522     sz = info.nData;
6523     if( !pPage->intKey ) sz += info.nKey;
6524     assert( sz==info.nPayload );
6525     if( sz>info.nLocal ){
6526       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6527       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6528 #ifndef SQLITE_OMIT_AUTOVACUUM
6529       if( pBt->autoVacuum ){
6530         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6531       }
6532 #endif
6533       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6534     }
6535 
6536     /* Check sanity of left child page.
6537     */
6538     if( !pPage->leaf ){
6539       pgno = get4byte(pCell);
6540 #ifndef SQLITE_OMIT_AUTOVACUUM
6541       if( pBt->autoVacuum ){
6542         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6543       }
6544 #endif
6545       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6546       if( i>0 && d2!=depth ){
6547         checkAppendMsg(pCheck, zContext, "Child page depth differs");
6548       }
6549       depth = d2;
6550     }
6551   }
6552   if( !pPage->leaf ){
6553     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6554     sqlite3_snprintf(sizeof(zContext), zContext,
6555                      "On page %d at right child: ", iPage);
6556 #ifndef SQLITE_OMIT_AUTOVACUUM
6557     if( pBt->autoVacuum ){
6558       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6559     }
6560 #endif
6561     checkTreePage(pCheck, pgno, pPage, zContext);
6562   }
6563 
6564   /* Check for complete coverage of the page
6565   */
6566   data = pPage->aData;
6567   hdr = pPage->hdrOffset;
6568   hit = sqlite3MallocZero( usableSize );
6569   if( hit ){
6570     memset(hit, 1, get2byte(&data[hdr+5]));
6571     nCell = get2byte(&data[hdr+3]);
6572     cellStart = hdr + 12 - 4*pPage->leaf;
6573     for(i=0; i<nCell; i++){
6574       int pc = get2byte(&data[cellStart+i*2]);
6575       u16 size = cellSizePtr(pPage, &data[pc]);
6576       int j;
6577       if( (pc+size-1)>=usableSize || pc<0 ){
6578         checkAppendMsg(pCheck, 0,
6579             "Corruption detected in cell %d on page %d",i,iPage,0);
6580       }else{
6581         for(j=pc+size-1; j>=pc; j--) hit[j]++;
6582       }
6583     }
6584     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6585            cnt++){
6586       int size = get2byte(&data[i+2]);
6587       int j;
6588       if( (i+size-1)>=usableSize || i<0 ){
6589         checkAppendMsg(pCheck, 0,
6590             "Corruption detected in cell %d on page %d",i,iPage,0);
6591       }else{
6592         for(j=i+size-1; j>=i; j--) hit[j]++;
6593       }
6594       i = get2byte(&data[i]);
6595     }
6596     for(i=cnt=0; i<usableSize; i++){
6597       if( hit[i]==0 ){
6598         cnt++;
6599       }else if( hit[i]>1 ){
6600         checkAppendMsg(pCheck, 0,
6601           "Multiple uses for byte %d of page %d", i, iPage);
6602         break;
6603       }
6604     }
6605     if( cnt!=data[hdr+7] ){
6606       checkAppendMsg(pCheck, 0,
6607           "Fragmented space is %d byte reported as %d on page %d",
6608           cnt, data[hdr+7], iPage);
6609     }
6610   }
6611   sqlite3_free(hit);
6612 
6613   releasePage(pPage);
6614   return depth+1;
6615 }
6616 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6617 
6618 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6619 /*
6620 ** This routine does a complete check of the given BTree file.  aRoot[] is
6621 ** an array of pages numbers were each page number is the root page of
6622 ** a table.  nRoot is the number of entries in aRoot.
6623 **
6624 ** If everything checks out, this routine returns NULL.  If something is
6625 ** amiss, an error message is written into memory obtained from malloc()
6626 ** and a pointer to that error message is returned.  The calling function
6627 ** is responsible for freeing the error message when it is done.
6628 */
6629 char *sqlite3BtreeIntegrityCheck(
6630   Btree *p,     /* The btree to be checked */
6631   int *aRoot,   /* An array of root pages numbers for individual trees */
6632   int nRoot,    /* Number of entries in aRoot[] */
6633   int mxErr,    /* Stop reporting errors after this many */
6634   int *pnErr    /* Write number of errors seen to this variable */
6635 ){
6636   int i;
6637   int nRef;
6638   IntegrityCk sCheck;
6639   BtShared *pBt = p->pBt;
6640 
6641   sqlite3BtreeEnter(p);
6642   pBt->db = p->db;
6643   nRef = sqlite3PagerRefcount(pBt->pPager);
6644   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6645     sqlite3BtreeLeave(p);
6646     return sqlite3StrDup("Unable to acquire a read lock on the database");
6647   }
6648   sCheck.pBt = pBt;
6649   sCheck.pPager = pBt->pPager;
6650   sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager);
6651   sCheck.mxErr = mxErr;
6652   sCheck.nErr = 0;
6653   *pnErr = 0;
6654 #ifndef SQLITE_OMIT_AUTOVACUUM
6655   if( pBt->nTrunc!=0 ){
6656     sCheck.nPage = pBt->nTrunc;
6657   }
6658 #endif
6659   if( sCheck.nPage==0 ){
6660     unlockBtreeIfUnused(pBt);
6661     sqlite3BtreeLeave(p);
6662     return 0;
6663   }
6664   sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6665   if( !sCheck.anRef ){
6666     unlockBtreeIfUnused(pBt);
6667     *pnErr = 1;
6668     sqlite3BtreeLeave(p);
6669     return sqlite3MPrintf(p->db, "Unable to malloc %d bytes",
6670         (sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
6671   }
6672   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6673   i = PENDING_BYTE_PAGE(pBt);
6674   if( i<=sCheck.nPage ){
6675     sCheck.anRef[i] = 1;
6676   }
6677   sCheck.zErrMsg = 0;
6678 
6679   /* Check the integrity of the freelist
6680   */
6681   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6682             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6683 
6684   /* Check all the tables.
6685   */
6686   for(i=0; i<nRoot && sCheck.mxErr; i++){
6687     if( aRoot[i]==0 ) continue;
6688 #ifndef SQLITE_OMIT_AUTOVACUUM
6689     if( pBt->autoVacuum && aRoot[i]>1 ){
6690       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6691     }
6692 #endif
6693     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6694   }
6695 
6696   /* Make sure every page in the file is referenced
6697   */
6698   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6699 #ifdef SQLITE_OMIT_AUTOVACUUM
6700     if( sCheck.anRef[i]==0 ){
6701       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6702     }
6703 #else
6704     /* If the database supports auto-vacuum, make sure no tables contain
6705     ** references to pointer-map pages.
6706     */
6707     if( sCheck.anRef[i]==0 &&
6708        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6709       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6710     }
6711     if( sCheck.anRef[i]!=0 &&
6712        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6713       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6714     }
6715 #endif
6716   }
6717 
6718   /* Make sure this analysis did not leave any unref() pages
6719   */
6720   unlockBtreeIfUnused(pBt);
6721   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6722     checkAppendMsg(&sCheck, 0,
6723       "Outstanding page count goes from %d to %d during this analysis",
6724       nRef, sqlite3PagerRefcount(pBt->pPager)
6725     );
6726   }
6727 
6728   /* Clean  up and report errors.
6729   */
6730   sqlite3BtreeLeave(p);
6731   sqlite3_free(sCheck.anRef);
6732   *pnErr = sCheck.nErr;
6733   return sCheck.zErrMsg;
6734 }
6735 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6736 
6737 /*
6738 ** Return the full pathname of the underlying database file.
6739 **
6740 ** The pager filename is invariant as long as the pager is
6741 ** open so it is safe to access without the BtShared mutex.
6742 */
6743 const char *sqlite3BtreeGetFilename(Btree *p){
6744   assert( p->pBt->pPager!=0 );
6745   return sqlite3PagerFilename(p->pBt->pPager);
6746 }
6747 
6748 /*
6749 ** Return the pathname of the directory that contains the database file.
6750 **
6751 ** The pager directory name is invariant as long as the pager is
6752 ** open so it is safe to access without the BtShared mutex.
6753 */
6754 const char *sqlite3BtreeGetDirname(Btree *p){
6755   assert( p->pBt->pPager!=0 );
6756   return sqlite3PagerDirname(p->pBt->pPager);
6757 }
6758 
6759 /*
6760 ** Return the pathname of the journal file for this database. The return
6761 ** value of this routine is the same regardless of whether the journal file
6762 ** has been created or not.
6763 **
6764 ** The pager journal filename is invariant as long as the pager is
6765 ** open so it is safe to access without the BtShared mutex.
6766 */
6767 const char *sqlite3BtreeGetJournalname(Btree *p){
6768   assert( p->pBt->pPager!=0 );
6769   return sqlite3PagerJournalname(p->pBt->pPager);
6770 }
6771 
6772 #ifndef SQLITE_OMIT_VACUUM
6773 /*
6774 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
6775 ** must be active for both files.
6776 **
6777 ** The size of file pTo may be reduced by this operation.
6778 ** If anything goes wrong, the transaction on pTo is rolled back.
6779 **
6780 ** If successful, CommitPhaseOne() may be called on pTo before returning.
6781 ** The caller should finish committing the transaction on pTo by calling
6782 ** sqlite3BtreeCommit().
6783 */
6784 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
6785   int rc = SQLITE_OK;
6786   Pgno i;
6787 
6788   Pgno nFromPage;     /* Number of pages in pFrom */
6789   Pgno nToPage;       /* Number of pages in pTo */
6790   Pgno nNewPage;      /* Number of pages in pTo after the copy */
6791 
6792   Pgno iSkip;         /* Pending byte page in pTo */
6793   int nToPageSize;    /* Page size of pTo in bytes */
6794   int nFromPageSize;  /* Page size of pFrom in bytes */
6795 
6796   BtShared *pBtTo = pTo->pBt;
6797   BtShared *pBtFrom = pFrom->pBt;
6798   pBtTo->db = pTo->db;
6799   pBtFrom->db = pFrom->db;
6800 
6801   nToPageSize = pBtTo->pageSize;
6802   nFromPageSize = pBtFrom->pageSize;
6803 
6804   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
6805     return SQLITE_ERROR;
6806   }
6807   if( pBtTo->pCursor ){
6808     return SQLITE_BUSY;
6809   }
6810 
6811   nToPage = sqlite3PagerPagecount(pBtTo->pPager);
6812   nFromPage = sqlite3PagerPagecount(pBtFrom->pPager);
6813   iSkip = PENDING_BYTE_PAGE(pBtTo);
6814 
6815   /* Variable nNewPage is the number of pages required to store the
6816   ** contents of pFrom using the current page-size of pTo.
6817   */
6818   nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
6819       (i64)nToPageSize;
6820 
6821   for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
6822 
6823     /* Journal the original page.
6824     **
6825     ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
6826     ** in database *pTo (before the copy). This page is never written
6827     ** into the journal file. Unless i==iSkip or the page was not
6828     ** present in pTo before the copy operation, journal page i from pTo.
6829     */
6830     if( i!=iSkip && i<=nToPage ){
6831       DbPage *pDbPage;
6832       rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
6833       if( rc ){
6834         break;
6835       }
6836       rc = sqlite3PagerWrite(pDbPage);
6837       if( rc ){
6838         break;
6839       }
6840       if( i>nFromPage ){
6841         /* Yeah.  It seems wierd to call DontWrite() right after Write(). But
6842         ** that is because the names of those procedures do not exactly
6843         ** represent what they do.  Write() really means "put this page in the
6844         ** rollback journal and mark it as dirty so that it will be written
6845         ** to the database file later."  DontWrite() undoes the second part of
6846         ** that and prevents the page from being written to the database. The
6847         ** page is still on the rollback journal, though.  And that is the
6848         ** whole point of this block: to put pages on the rollback journal.
6849         */
6850         sqlite3PagerDontWrite(pDbPage);
6851       }
6852       sqlite3PagerUnref(pDbPage);
6853     }
6854 
6855     /* Overwrite the data in page i of the target database */
6856     if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
6857 
6858       DbPage *pToPage = 0;
6859       sqlite3_int64 iOff;
6860 
6861       rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
6862       if( rc==SQLITE_OK ){
6863         rc = sqlite3PagerWrite(pToPage);
6864       }
6865 
6866       for(
6867         iOff=(i-1)*nToPageSize;
6868         rc==SQLITE_OK && iOff<i*nToPageSize;
6869         iOff += nFromPageSize
6870       ){
6871         DbPage *pFromPage = 0;
6872         Pgno iFrom = (iOff/nFromPageSize)+1;
6873 
6874         if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
6875           continue;
6876         }
6877 
6878         rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
6879         if( rc==SQLITE_OK ){
6880           char *zTo = sqlite3PagerGetData(pToPage);
6881           char *zFrom = sqlite3PagerGetData(pFromPage);
6882           int nCopy;
6883 
6884           if( nFromPageSize>=nToPageSize ){
6885             zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
6886             nCopy = nToPageSize;
6887           }else{
6888             zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
6889             nCopy = nFromPageSize;
6890           }
6891 
6892           memcpy(zTo, zFrom, nCopy);
6893 	  sqlite3PagerUnref(pFromPage);
6894         }
6895       }
6896 
6897       if( pToPage ) sqlite3PagerUnref(pToPage);
6898     }
6899   }
6900 
6901   /* If things have worked so far, the database file may need to be
6902   ** truncated. The complex part is that it may need to be truncated to
6903   ** a size that is not an integer multiple of nToPageSize - the current
6904   ** page size used by the pager associated with B-Tree pTo.
6905   **
6906   ** For example, say the page-size of pTo is 2048 bytes and the original
6907   ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
6908   ** bytes and 9 pages, then the file needs to be truncated to 9KB.
6909   */
6910   if( rc==SQLITE_OK ){
6911     if( nFromPageSize!=nToPageSize ){
6912       sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
6913       i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
6914       i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
6915       i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
6916 
6917       assert( iSize<=iNow );
6918 
6919       /* Commit phase one syncs the journal file associated with pTo
6920       ** containing the original data. It does not sync the database file
6921       ** itself. After doing this it is safe to use OsTruncate() and other
6922       ** file APIs on the database file directly.
6923       */
6924       pBtTo->db = pTo->db;
6925       rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
6926       if( iSize<iNow && rc==SQLITE_OK ){
6927         rc = sqlite3OsTruncate(pFile, iSize);
6928       }
6929 
6930       /* The loop that copied data from database pFrom to pTo did not
6931       ** populate the locking page of database pTo. If the page-size of
6932       ** pFrom is smaller than that of pTo, this means some data will
6933       ** not have been copied.
6934       **
6935       ** This block copies the missing data from database pFrom to pTo
6936       ** using file APIs. This is safe because at this point we know that
6937       ** all of the original data from pTo has been synced into the
6938       ** journal file. At this point it would be safe to do anything at
6939       ** all to the database file except truncate it to zero bytes.
6940       */
6941       if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
6942         i64 iOff;
6943         for(
6944           iOff=iPending;
6945           rc==SQLITE_OK && iOff<(iPending+nToPageSize);
6946           iOff += nFromPageSize
6947         ){
6948           DbPage *pFromPage = 0;
6949           Pgno iFrom = (iOff/nFromPageSize)+1;
6950 
6951           if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
6952             continue;
6953           }
6954 
6955           rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
6956           if( rc==SQLITE_OK ){
6957             char *zFrom = sqlite3PagerGetData(pFromPage);
6958   	  rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
6959             sqlite3PagerUnref(pFromPage);
6960           }
6961         }
6962       }
6963 
6964       /* Sync the database file */
6965       if( rc==SQLITE_OK ){
6966         rc = sqlite3PagerSync(pBtTo->pPager);
6967       }
6968     }else{
6969       rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
6970     }
6971     if( rc==SQLITE_OK ){
6972       pBtTo->pageSizeFixed = 0;
6973     }
6974   }
6975 
6976   if( rc ){
6977     sqlite3BtreeRollback(pTo);
6978   }
6979 
6980   return rc;
6981 }
6982 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
6983   int rc;
6984   sqlite3BtreeEnter(pTo);
6985   sqlite3BtreeEnter(pFrom);
6986   rc = btreeCopyFile(pTo, pFrom);
6987   sqlite3BtreeLeave(pFrom);
6988   sqlite3BtreeLeave(pTo);
6989   return rc;
6990 }
6991 
6992 #endif /* SQLITE_OMIT_VACUUM */
6993 
6994 /*
6995 ** Return non-zero if a transaction is active.
6996 */
6997 int sqlite3BtreeIsInTrans(Btree *p){
6998   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
6999   return (p && (p->inTrans==TRANS_WRITE));
7000 }
7001 
7002 /*
7003 ** Return non-zero if a statement transaction is active.
7004 */
7005 int sqlite3BtreeIsInStmt(Btree *p){
7006   assert( sqlite3BtreeHoldsMutex(p) );
7007   return (p->pBt && p->pBt->inStmt);
7008 }
7009 
7010 /*
7011 ** Return non-zero if a read (or write) transaction is active.
7012 */
7013 int sqlite3BtreeIsInReadTrans(Btree *p){
7014   assert( sqlite3_mutex_held(p->db->mutex) );
7015   return (p && (p->inTrans!=TRANS_NONE));
7016 }
7017 
7018 /*
7019 ** This function returns a pointer to a blob of memory associated with
7020 ** a single shared-btree. The memory is used by client code for its own
7021 ** purposes (for example, to store a high-level schema associated with
7022 ** the shared-btree). The btree layer manages reference counting issues.
7023 **
7024 ** The first time this is called on a shared-btree, nBytes bytes of memory
7025 ** are allocated, zeroed, and returned to the caller. For each subsequent
7026 ** call the nBytes parameter is ignored and a pointer to the same blob
7027 ** of memory returned.
7028 **
7029 ** Just before the shared-btree is closed, the function passed as the
7030 ** xFree argument when the memory allocation was made is invoked on the
7031 ** blob of allocated memory. This function should not call sqlite3_free()
7032 ** on the memory, the btree layer does that.
7033 */
7034 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7035   BtShared *pBt = p->pBt;
7036   sqlite3BtreeEnter(p);
7037   if( !pBt->pSchema ){
7038     pBt->pSchema = sqlite3MallocZero(nBytes);
7039     pBt->xFreeSchema = xFree;
7040   }
7041   sqlite3BtreeLeave(p);
7042   return pBt->pSchema;
7043 }
7044 
7045 /*
7046 ** Return true if another user of the same shared btree as the argument
7047 ** handle holds an exclusive lock on the sqlite_master table.
7048 */
7049 int sqlite3BtreeSchemaLocked(Btree *p){
7050   int rc;
7051   assert( sqlite3_mutex_held(p->db->mutex) );
7052   sqlite3BtreeEnter(p);
7053   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7054   sqlite3BtreeLeave(p);
7055   return rc;
7056 }
7057 
7058 
7059 #ifndef SQLITE_OMIT_SHARED_CACHE
7060 /*
7061 ** Obtain a lock on the table whose root page is iTab.  The
7062 ** lock is a write lock if isWritelock is true or a read lock
7063 ** if it is false.
7064 */
7065 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7066   int rc = SQLITE_OK;
7067   if( p->sharable ){
7068     u8 lockType = READ_LOCK + isWriteLock;
7069     assert( READ_LOCK+1==WRITE_LOCK );
7070     assert( isWriteLock==0 || isWriteLock==1 );
7071     sqlite3BtreeEnter(p);
7072     rc = queryTableLock(p, iTab, lockType);
7073     if( rc==SQLITE_OK ){
7074       rc = lockTable(p, iTab, lockType);
7075     }
7076     sqlite3BtreeLeave(p);
7077   }
7078   return rc;
7079 }
7080 #endif
7081 
7082 #ifndef SQLITE_OMIT_INCRBLOB
7083 /*
7084 ** Argument pCsr must be a cursor opened for writing on an
7085 ** INTKEY table currently pointing at a valid table entry.
7086 ** This function modifies the data stored as part of that entry.
7087 ** Only the data content may only be modified, it is not possible
7088 ** to change the length of the data stored.
7089 */
7090 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7091   assert( cursorHoldsMutex(pCsr) );
7092   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7093   assert(pCsr->isIncrblobHandle);
7094   if( pCsr->eState>=CURSOR_REQUIRESEEK ){
7095     if( pCsr->eState==CURSOR_FAULT ){
7096       return pCsr->skip;
7097     }else{
7098       return SQLITE_ABORT;
7099     }
7100   }
7101 
7102   /* Check some preconditions:
7103   **   (a) the cursor is open for writing,
7104   **   (b) there is no read-lock on the table being modified and
7105   **   (c) the cursor points at a valid row of an intKey table.
7106   */
7107   if( !pCsr->wrFlag ){
7108     return SQLITE_READONLY;
7109   }
7110   assert( !pCsr->pBt->readOnly
7111           && pCsr->pBt->inTransaction==TRANS_WRITE );
7112   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){
7113     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7114   }
7115   if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){
7116     return SQLITE_ERROR;
7117   }
7118 
7119   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
7120 }
7121 
7122 /*
7123 ** Set a flag on this cursor to cache the locations of pages from the
7124 ** overflow list for the current row. This is used by cursors opened
7125 ** for incremental blob IO only.
7126 **
7127 ** This function sets a flag only. The actual page location cache
7128 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7129 ** accessPayload() (the worker function for sqlite3BtreeData() and
7130 ** sqlite3BtreePutData()).
7131 */
7132 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7133   assert( cursorHoldsMutex(pCur) );
7134   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7135   assert(!pCur->isIncrblobHandle);
7136   assert(!pCur->aOverflow);
7137   pCur->isIncrblobHandle = 1;
7138 }
7139 #endif
7140