xref: /sqlite-3.40.0/src/btree.c (revision e8f2c9dc)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Debug tracing support.
**
** Change the "#if 0" below to "#if 1" to compile the tracing code in.
** The global variable sqlite3BtreeTrace then switches the output on
** (non-zero) or off (zero) at run time.  TRACE() takes one parenthesized
** printf() argument list, for example:
**
**     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
**
** The enabled form is wrapped in do{...}while(0) so that "TRACE((...));"
** is always exactly one statement, even when it is the unbraced body of
** an if/else.  When tracing is compiled out, TRACE() expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){ printf X; fflush(stdout); } }while(0)
#else
# define TRACE(X)
#endif
34 
/*
** Read the 2-byte big-endian integer at X.  A stored value of zero is
** reported as 65536; every other value is returned unchanged.
**
** The "offset to cell content area" field in a btree page header is
** decoded with this macro.  On an empty page of size 65536 the real
** offset is 65536, which does not fit in two bytes and is therefore
** stored as zero.
**
** X appears exactly once in the expansion, as the argument of get2byte().
*/
#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
45 
/*
** Allocation modes.  One of these is passed to allocateBtreePage() as
** its 5th argument.
*/
#define BTALLOC_ANY    0   /* No constraint: allocate any page */
#define BTALLOC_EXACT  1   /* Allocate the exact page requested, if possible */
#define BTALLOC_LE     2   /* Allocate any page numbered <= the parameter */
52 
/*
** IfNotOmitAV(x) evaluates to (x) in builds that include auto-vacuum
** support and to the constant 0 when SQLITE_OMIT_AUTOVACUUM is defined.
** Typical use:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
# define IfNotOmitAV(expr) 0
#else
# define IfNotOmitAV(expr) (expr)
#endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
98   /*
99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
100   ** and clearAllSharedCacheTableLocks()
101   ** manipulate entries in the BtShared.pLock linked list used to store
102   ** shared-cache table level locks. If the library is compiled with the
103   ** shared-cache feature disabled, then there is only ever one user
104   ** of each BtShared structure and so this locking is not necessary.
105   ** So define the lock related functions as no-ops.
106   */
107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
109   #define clearAllSharedCacheTableLocks(a)
110   #define downgradeAllSharedCacheTableLocks(a)
111   #define hasSharedCacheTableLock(a,b,c,d) 1
112   #define hasReadConflicts(a, b) 0
113 #endif
114 
115 #ifndef SQLITE_OMIT_SHARED_CACHE
116 
117 #ifdef SQLITE_DEBUG
118 /*
119 **** This function is only used as part of an assert() statement. ***
120 **
121 ** Check to see if pBtree holds the required locks to read or write to the
122 ** table with root page iRoot.   Return 1 if it does and 0 if not.
123 **
124 ** For example, when writing to a table with root-page iRoot via
125 ** Btree connection pBtree:
126 **
127 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
128 **
129 ** When writing to an index that resides in a sharable database, the
130 ** caller should have first obtained a lock specifying the root page of
131 ** the corresponding table. This makes things a bit more complicated,
132 ** as this module treats each table as a separate structure. To determine
133 ** the table corresponding to the index being written, this
134 ** function has to search through the database schema.
135 **
136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
137 ** hold a write-lock on the schema table (root page 1). This is also
138 ** acceptable.
139 */
140 static int hasSharedCacheTableLock(
141   Btree *pBtree,         /* Handle that must hold lock */
142   Pgno iRoot,            /* Root page of b-tree */
143   int isIndex,           /* True if iRoot is the root of an index b-tree */
144   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
145 ){
146   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
147   Pgno iTab = 0;
148   BtLock *pLock;
149 
150   /* If this database is not shareable, or if the client is reading
151   ** and has the read-uncommitted flag set, then no lock is required.
152   ** Return true immediately.
153   */
154   if( (pBtree->sharable==0)
155    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
156   ){
157     return 1;
158   }
159 
160   /* If the client is reading  or writing an index and the schema is
161   ** not loaded, then it is too difficult to actually check to see if
162   ** the correct locks are held.  So do not bother - just return true.
163   ** This case does not come up very often anyhow.
164   */
165   if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
166     return 1;
167   }
168 
169   /* Figure out the root-page that the lock should be held on. For table
170   ** b-trees, this is just the root page of the b-tree being read or
171   ** written. For index b-trees, it is the root page of the associated
172   ** table.  */
173   if( isIndex ){
174     HashElem *p;
175     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
176       Index *pIdx = (Index *)sqliteHashData(p);
177       if( pIdx->tnum==(int)iRoot ){
178         iTab = pIdx->pTable->tnum;
179       }
180     }
181   }else{
182     iTab = iRoot;
183   }
184 
185   /* Search for the required lock. Either a write-lock on root-page iTab, a
186   ** write-lock on the schema table, or (if the client is reading) a
187   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
188   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
189     if( pLock->pBtree==pBtree
190      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
191      && pLock->eLock>=eLockType
192     ){
193       return 1;
194     }
195   }
196 
197   /* Failed to find the required lock. */
198   return 0;
199 }
200 #endif /* SQLITE_DEBUG */
201 
202 #ifdef SQLITE_DEBUG
203 /*
204 **** This function may be used as part of assert() statements only. ****
205 **
206 ** Return true if it would be illegal for pBtree to write into the
207 ** table or index rooted at iRoot because other shared connections are
208 ** simultaneously reading that same table or index.
209 **
210 ** It is illegal for pBtree to write if some other Btree object that
211 ** shares the same BtShared object is currently reading or writing
212 ** the iRoot table.  Except, if the other Btree object has the
213 ** read-uncommitted flag set, then it is OK for the other object to
214 ** have a read cursor.
215 **
216 ** For example, before writing to any part of the table or index
217 ** rooted at page iRoot, one should call:
218 **
219 **    assert( !hasReadConflicts(pBtree, iRoot) );
220 */
221 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
222   BtCursor *p;
223   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
224     if( p->pgnoRoot==iRoot
225      && p->pBtree!=pBtree
226      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
227     ){
228       return 1;
229     }
230   }
231   return 0;
232 }
233 #endif    /* #ifdef SQLITE_DEBUG */
234 
235 /*
236 ** Query to see if Btree handle p may obtain a lock of type eLock
237 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
238 ** SQLITE_OK if the lock may be obtained (by calling
239 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
240 */
241 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
242   BtShared *pBt = p->pBt;
243   BtLock *pIter;
244 
245   assert( sqlite3BtreeHoldsMutex(p) );
246   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
247   assert( p->db!=0 );
248   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
249 
250   /* If requesting a write-lock, then the Btree must have an open write
251   ** transaction on this file. And, obviously, for this to be so there
252   ** must be an open write transaction on the file itself.
253   */
254   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
255   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
256 
257   /* This routine is a no-op if the shared-cache is not enabled */
258   if( !p->sharable ){
259     return SQLITE_OK;
260   }
261 
262   /* If some other connection is holding an exclusive lock, the
263   ** requested lock may not be obtained.
264   */
265   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
266     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
267     return SQLITE_LOCKED_SHAREDCACHE;
268   }
269 
270   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
271     /* The condition (pIter->eLock!=eLock) in the following if(...)
272     ** statement is a simplification of:
273     **
274     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
275     **
276     ** since we know that if eLock==WRITE_LOCK, then no other connection
277     ** may hold a WRITE_LOCK on any table in this file (since there can
278     ** only be a single writer).
279     */
280     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
281     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
282     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
283       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
284       if( eLock==WRITE_LOCK ){
285         assert( p==pBt->pWriter );
286         pBt->btsFlags |= BTS_PENDING;
287       }
288       return SQLITE_LOCKED_SHAREDCACHE;
289     }
290   }
291   return SQLITE_OK;
292 }
293 #endif /* !SQLITE_OMIT_SHARED_CACHE */
294 
295 #ifndef SQLITE_OMIT_SHARED_CACHE
296 /*
297 ** Add a lock on the table with root-page iTable to the shared-btree used
298 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
299 ** WRITE_LOCK.
300 **
301 ** This function assumes the following:
302 **
303 **   (a) The specified Btree object p is connected to a sharable
304 **       database (one with the Btree.sharable flag set), and
305 **
306 **   (b) No other Btree objects hold a lock that conflicts
307 **       with the requested lock (i.e. querySharedCacheTableLock() has
308 **       already been called and returned SQLITE_OK).
309 **
310 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
311 ** is returned if a malloc attempt fails.
312 */
313 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
314   BtShared *pBt = p->pBt;
315   BtLock *pLock = 0;
316   BtLock *pIter;
317 
318   assert( sqlite3BtreeHoldsMutex(p) );
319   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
320   assert( p->db!=0 );
321 
322   /* A connection with the read-uncommitted flag set will never try to
323   ** obtain a read-lock using this function. The only read-lock obtained
324   ** by a connection in read-uncommitted mode is on the sqlite_master
325   ** table, and that lock is obtained in BtreeBeginTrans().  */
326   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
327 
328   /* This function should only be called on a sharable b-tree after it
329   ** has been determined that no other b-tree holds a conflicting lock.  */
330   assert( p->sharable );
331   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
332 
333   /* First search the list for an existing lock on this table. */
334   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
335     if( pIter->iTable==iTable && pIter->pBtree==p ){
336       pLock = pIter;
337       break;
338     }
339   }
340 
341   /* If the above search did not find a BtLock struct associating Btree p
342   ** with table iTable, allocate one and link it into the list.
343   */
344   if( !pLock ){
345     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
346     if( !pLock ){
347       return SQLITE_NOMEM;
348     }
349     pLock->iTable = iTable;
350     pLock->pBtree = p;
351     pLock->pNext = pBt->pLock;
352     pBt->pLock = pLock;
353   }
354 
355   /* Set the BtLock.eLock variable to the maximum of the current lock
356   ** and the requested lock. This means if a write-lock was already held
357   ** and a read-lock requested, we don't incorrectly downgrade the lock.
358   */
359   assert( WRITE_LOCK>READ_LOCK );
360   if( eLock>pLock->eLock ){
361     pLock->eLock = eLock;
362   }
363 
364   return SQLITE_OK;
365 }
366 #endif /* !SQLITE_OMIT_SHARED_CACHE */
367 
368 #ifndef SQLITE_OMIT_SHARED_CACHE
369 /*
370 ** Release all the table locks (locks obtained via calls to
371 ** the setSharedCacheTableLock() procedure) held by Btree object p.
372 **
373 ** This function assumes that Btree p has an open read or write
374 ** transaction. If it does not, then the BTS_PENDING flag
375 ** may be incorrectly cleared.
376 */
377 static void clearAllSharedCacheTableLocks(Btree *p){
378   BtShared *pBt = p->pBt;
379   BtLock **ppIter = &pBt->pLock;
380 
381   assert( sqlite3BtreeHoldsMutex(p) );
382   assert( p->sharable || 0==*ppIter );
383   assert( p->inTrans>0 );
384 
385   while( *ppIter ){
386     BtLock *pLock = *ppIter;
387     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
388     assert( pLock->pBtree->inTrans>=pLock->eLock );
389     if( pLock->pBtree==p ){
390       *ppIter = pLock->pNext;
391       assert( pLock->iTable!=1 || pLock==&p->lock );
392       if( pLock->iTable!=1 ){
393         sqlite3_free(pLock);
394       }
395     }else{
396       ppIter = &pLock->pNext;
397     }
398   }
399 
400   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
401   if( pBt->pWriter==p ){
402     pBt->pWriter = 0;
403     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
404   }else if( pBt->nTransaction==2 ){
405     /* This function is called when Btree p is concluding its
406     ** transaction. If there currently exists a writer, and p is not
407     ** that writer, then the number of locks held by connections other
408     ** than the writer must be about to drop to zero. In this case
409     ** set the BTS_PENDING flag to 0.
410     **
411     ** If there is not currently a writer, then BTS_PENDING must
412     ** be zero already. So this next line is harmless in that case.
413     */
414     pBt->btsFlags &= ~BTS_PENDING;
415   }
416 }
417 
418 /*
419 ** This function changes all write-locks held by Btree p into read-locks.
420 */
421 static void downgradeAllSharedCacheTableLocks(Btree *p){
422   BtShared *pBt = p->pBt;
423   if( pBt->pWriter==p ){
424     BtLock *pLock;
425     pBt->pWriter = 0;
426     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
427     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
428       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
429       pLock->eLock = READ_LOCK;
430     }
431   }
432 }
433 
434 #endif /* SQLITE_OMIT_SHARED_CACHE */
435 
436 static void releasePage(MemPage *pPage);  /* Forward reference */
437 
438 /*
439 ***** This routine is used inside of assert() only ****
440 **
441 ** Verify that the cursor holds the mutex on its BtShared
442 */
443 #ifdef SQLITE_DEBUG
444 static int cursorHoldsMutex(BtCursor *p){
445   return sqlite3_mutex_held(p->pBt->mutex);
446 }
447 #endif
448 
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** first argument, by clearing the BTCF_ValidOvfl bit in its curFlags.
**
** The argument is parenthesized in the expansion so that any pointer
** expression (for example "pCur+1") may be passed safely.  The value of
** the macro is the updated curFlags.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
454 
455 /*
456 ** Invalidate the overflow page-list cache for all cursors opened
457 ** on the shared btree structure pBt.
458 */
459 static void invalidateAllOverflowCache(BtShared *pBt){
460   BtCursor *p;
461   assert( sqlite3_mutex_held(pBt->mutex) );
462   for(p=pBt->pCursor; p; p=p->pNext){
463     invalidateOverflowCache(p);
464   }
465 }
466 
467 #ifndef SQLITE_OMIT_INCRBLOB
468 /*
469 ** This function is called before modifying the contents of a table
470 ** to invalidate any incrblob cursors that are open on the
471 ** row or one of the rows being modified.
472 **
473 ** If argument isClearTable is true, then the entire contents of the
474 ** table is about to be deleted. In this case invalidate all incrblob
475 ** cursors open on any row within the table with root-page pgnoRoot.
476 **
477 ** Otherwise, if argument isClearTable is false, then the row with
478 ** rowid iRow is being replaced or deleted. In this case invalidate
479 ** only those incrblob cursors open on that specific row.
480 */
481 static void invalidateIncrblobCursors(
482   Btree *pBtree,          /* The database file to check */
483   i64 iRow,               /* The rowid that might be changing */
484   int isClearTable        /* True if all rows are being deleted */
485 ){
486   BtCursor *p;
487   BtShared *pBt = pBtree->pBt;
488   assert( sqlite3BtreeHoldsMutex(pBtree) );
489   for(p=pBt->pCursor; p; p=p->pNext){
490     if( (p->curFlags & BTCF_Incrblob)!=0 && (isClearTable || p->info.nKey==iRow) ){
491       p->eState = CURSOR_INVALID;
492     }
493   }
494 }
495 
496 #else
497   /* Stub function when INCRBLOB is omitted */
498   #define invalidateIncrblobCursors(x,y,z)
499 #endif /* SQLITE_OMIT_INCRBLOB */
500 
501 /*
502 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
503 ** when a page that previously contained data becomes a free-list leaf
504 ** page.
505 **
506 ** The BtShared.pHasContent bitvec exists to work around an obscure
507 ** bug caused by the interaction of two useful IO optimizations surrounding
508 ** free-list leaf pages:
509 **
510 **   1) When all data is deleted from a page and the page becomes
511 **      a free-list leaf page, the page is not written to the database
512 **      (as free-list leaf pages contain no meaningful data). Sometimes
513 **      such a page is not even journalled (as it will not be modified,
514 **      why bother journalling it?).
515 **
516 **   2) When a free-list leaf page is reused, its content is not read
517 **      from the database or written to the journal file (why should it
518 **      be, if it is not at all meaningful?).
519 **
520 ** By themselves, these optimizations work fine and provide a handy
521 ** performance boost to bulk delete or insert operations. However, if
522 ** a page is moved to the free-list and then reused within the same
523 ** transaction, a problem comes up. If the page is not journalled when
524 ** it is moved to the free-list and it is also not journalled when it
525 ** is extracted from the free-list and reused, then the original data
526 ** may be lost. In the event of a rollback, it may not be possible
527 ** to restore the database to its original configuration.
528 **
529 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
530 ** moved to become a free-list leaf page, the corresponding bit is
531 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
532 ** optimization 2 above is omitted if the corresponding bit is already
533 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
534 ** at the end of every transaction.
535 */
536 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
537   int rc = SQLITE_OK;
538   if( !pBt->pHasContent ){
539     assert( pgno<=pBt->nPage );
540     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
541     if( !pBt->pHasContent ){
542       rc = SQLITE_NOMEM;
543     }
544   }
545   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
546     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
547   }
548   return rc;
549 }
550 
551 /*
552 ** Query the BtShared.pHasContent vector.
553 **
554 ** This function is called when a free-list leaf page is removed from the
555 ** free-list for reuse. It returns false if it is safe to retrieve the
556 ** page from the pager layer with the 'no-content' flag set. True otherwise.
557 */
558 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
559   Bitvec *p = pBt->pHasContent;
560   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
561 }
562 
563 /*
564 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
565 ** invoked at the conclusion of each write-transaction.
566 */
567 static void btreeClearHasContent(BtShared *pBt){
568   sqlite3BitvecDestroy(pBt->pHasContent);
569   pBt->pHasContent = 0;
570 }
571 
572 /*
573 ** Release all of the apPage[] pages for a cursor.
574 */
575 static void btreeReleaseAllCursorPages(BtCursor *pCur){
576   int i;
577   for(i=0; i<=pCur->iPage; i++){
578     releasePage(pCur->apPage[i]);
579     pCur->apPage[i] = 0;
580   }
581   pCur->iPage = -1;
582 }
583 
584 
585 /*
586 ** Save the current cursor position in the variables BtCursor.nKey
587 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
588 **
589 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
590 ** prior to calling this routine.
591 */
592 static int saveCursorPosition(BtCursor *pCur){
593   int rc;
594 
595   assert( CURSOR_VALID==pCur->eState );
596   assert( 0==pCur->pKey );
597   assert( cursorHoldsMutex(pCur) );
598 
599   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
600   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
601 
602   /* If this is an intKey table, then the above call to BtreeKeySize()
603   ** stores the integer key in pCur->nKey. In this case this value is
604   ** all that is required. Otherwise, if pCur is not open on an intKey
605   ** table, then malloc space for and store the pCur->nKey bytes of key
606   ** data.
607   */
608   if( 0==pCur->apPage[0]->intKey ){
609     void *pKey = sqlite3Malloc( (int)pCur->nKey );
610     if( pKey ){
611       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
612       if( rc==SQLITE_OK ){
613         pCur->pKey = pKey;
614       }else{
615         sqlite3_free(pKey);
616       }
617     }else{
618       rc = SQLITE_NOMEM;
619     }
620   }
621   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
622 
623   if( rc==SQLITE_OK ){
624     btreeReleaseAllCursorPages(pCur);
625     pCur->eState = CURSOR_REQUIRESEEK;
626   }
627 
628   invalidateOverflowCache(pCur);
629   return rc;
630 }
631 
632 /*
633 ** Save the positions of all cursors (except pExcept) that are open on
634 ** the table  with root-page iRoot. Usually, this is called just before cursor
635 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
636 */
637 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
638   BtCursor *p;
639   assert( sqlite3_mutex_held(pBt->mutex) );
640   assert( pExcept==0 || pExcept->pBt==pBt );
641   for(p=pBt->pCursor; p; p=p->pNext){
642     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
643       if( p->eState==CURSOR_VALID ){
644         int rc = saveCursorPosition(p);
645         if( SQLITE_OK!=rc ){
646           return rc;
647         }
648       }else{
649         testcase( p->iPage>0 );
650         btreeReleaseAllCursorPages(p);
651       }
652     }
653   }
654   return SQLITE_OK;
655 }
656 
657 /*
658 ** Clear the current cursor position.
659 */
660 void sqlite3BtreeClearCursor(BtCursor *pCur){
661   assert( cursorHoldsMutex(pCur) );
662   sqlite3_free(pCur->pKey);
663   pCur->pKey = 0;
664   pCur->eState = CURSOR_INVALID;
665 }
666 
667 /*
668 ** In this version of BtreeMoveto, pKey is a packed index record
669 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
670 ** record and then call BtreeMovetoUnpacked() to do the work.
671 */
672 static int btreeMoveto(
673   BtCursor *pCur,     /* Cursor open on the btree to be searched */
674   const void *pKey,   /* Packed key if the btree is an index */
675   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
676   int bias,           /* Bias search to the high end */
677   int *pRes           /* Write search results here */
678 ){
679   int rc;                    /* Status code */
680   UnpackedRecord *pIdxKey;   /* Unpacked index key */
681   char aSpace[200];          /* Temp space for pIdxKey - to avoid a malloc */
682   char *pFree = 0;
683 
684   if( pKey ){
685     assert( nKey==(i64)(int)nKey );
686     pIdxKey = sqlite3VdbeAllocUnpackedRecord(
687         pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
688     );
689     if( pIdxKey==0 ) return SQLITE_NOMEM;
690     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
691     if( pIdxKey->nField==0 ){
692       sqlite3DbFree(pCur->pKeyInfo->db, pFree);
693       return SQLITE_CORRUPT_BKPT;
694     }
695   }else{
696     pIdxKey = 0;
697   }
698   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
699   if( pFree ){
700     sqlite3DbFree(pCur->pKeyInfo->db, pFree);
701   }
702   return rc;
703 }
704 
705 /*
706 ** Restore the cursor to the position it was in (or as close to as possible)
707 ** when saveCursorPosition() was called. Note that this call deletes the
708 ** saved position info stored by saveCursorPosition(), so there can be
709 ** at most one effective restoreCursorPosition() call after each
710 ** saveCursorPosition().
711 */
712 static int btreeRestoreCursorPosition(BtCursor *pCur){
713   int rc;
714   assert( cursorHoldsMutex(pCur) );
715   assert( pCur->eState>=CURSOR_REQUIRESEEK );
716   if( pCur->eState==CURSOR_FAULT ){
717     return pCur->skipNext;
718   }
719   pCur->eState = CURSOR_INVALID;
720   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
721   if( rc==SQLITE_OK ){
722     sqlite3_free(pCur->pKey);
723     pCur->pKey = 0;
724     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
725     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
726       pCur->eState = CURSOR_SKIPNEXT;
727     }
728   }
729   return rc;
730 }
731 
/*
** Restore the saved position of cursor p if, and only if, its state is
** CURSOR_REQUIRESEEK or greater.  In that case the result is the return
** code of btreeRestoreCursorPosition(); otherwise it is SQLITE_OK.
**
** The argument is parenthesized where it is dereferenced so that any
** pointer expression may be passed.  Note that p is expanded twice, so
** it should not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
736 
737 /*
738 ** Determine whether or not a cursor has moved from the position it
739 ** was last placed at.  Cursors can move when the row they are pointing
740 ** at is deleted out from under them.
741 **
742 ** This routine returns an error code if something goes wrong.  The
743 ** integer *pHasMoved is set as follows:
744 **
745 **    0:   The cursor is unchanged
746 **    1:   The cursor is still pointing at the same row, but the pointers
747 **         returned by sqlite3BtreeKeyFetch() or sqlite3BtreeDataFetch()
748 **         might now be invalid because of a balance() or other change to the
749 **         b-tree.
750 **    2:   The cursor is no longer pointing to the row.  The row might have
751 **         been deleted out from under the cursor.
752 */
753 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
754   int rc;
755 
756   if( pCur->eState==CURSOR_VALID ){
757     *pHasMoved = 0;
758     return SQLITE_OK;
759   }
760   rc = restoreCursorPosition(pCur);
761   if( rc ){
762     *pHasMoved = 2;
763     return rc;
764   }
765   if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){
766     *pHasMoved = 2;
767   }else{
768     *pHasMoved = 1;
769   }
770   return SQLITE_OK;
771 }
772 
773 #ifndef SQLITE_OMIT_AUTOVACUUM
774 /*
775 ** Given a page number of a regular database page, return the page
776 ** number for the pointer-map page that contains the entry for the
777 ** input page number.
778 **
779 ** Return 0 (not a valid page) for pgno==1 since there is
780 ** no pointer map associated with page 1.  The integrity_check logic
781 ** requires that ptrmapPageno(*,1)!=1.
782 */
783 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
784   int nPagesPerMapPage;
785   Pgno iPtrMap, ret;
786   assert( sqlite3_mutex_held(pBt->mutex) );
787   if( pgno<2 ) return 0;
788   nPagesPerMapPage = (pBt->usableSize/5)+1;
789   iPtrMap = (pgno-2)/nPagesPerMapPage;
790   ret = (iPtrMap*nPagesPerMapPage) + 2;
791   if( ret==PENDING_BYTE_PAGE(pBt) ){
792     ret++;
793   }
794   return ret;
795 }
796 
797 /*
798 ** Write an entry into the pointer map.
799 **
800 ** This routine updates the pointer map entry for page number 'key'
801 ** so that it maps to type 'eType' and parent page number 'pgno'.
802 **
803 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
804 ** a no-op.  If an error occurs, the appropriate error code is written
805 ** into *pRC.
806 */
807 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
808   DbPage *pDbPage;  /* The pointer map page */
809   u8 *pPtrmap;      /* The pointer map data */
810   Pgno iPtrmap;     /* The pointer map page number */
811   int offset;       /* Offset in pointer map page */
812   int rc;           /* Return code from subfunctions */
813 
814   if( *pRC ) return;
815 
816   assert( sqlite3_mutex_held(pBt->mutex) );
817   /* The master-journal page number must never be used as a pointer map page */
818   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
819 
820   assert( pBt->autoVacuum );
821   if( key==0 ){
822     *pRC = SQLITE_CORRUPT_BKPT;
823     return;
824   }
825   iPtrmap = PTRMAP_PAGENO(pBt, key);
826   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
827   if( rc!=SQLITE_OK ){
828     *pRC = rc;
829     return;
830   }
831   offset = PTRMAP_PTROFFSET(iPtrmap, key);
832   if( offset<0 ){
833     *pRC = SQLITE_CORRUPT_BKPT;
834     goto ptrmap_exit;
835   }
836   assert( offset <= (int)pBt->usableSize-5 );
837   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
838 
839   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
840     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
841     *pRC= rc = sqlite3PagerWrite(pDbPage);
842     if( rc==SQLITE_OK ){
843       pPtrmap[offset] = eType;
844       put4byte(&pPtrmap[offset+1], parent);
845     }
846   }
847 
848 ptrmap_exit:
849   sqlite3PagerUnref(pDbPage);
850 }
851 
852 /*
853 ** Read an entry from the pointer map.
854 **
855 ** This routine retrieves the pointer map entry for page 'key', writing
856 ** the type and parent page number to *pEType and *pPgno respectively.
857 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
858 */
859 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
860   DbPage *pDbPage;   /* The pointer map page */
861   int iPtrmap;       /* Pointer map page index */
862   u8 *pPtrmap;       /* Pointer map page data */
863   int offset;        /* Offset of entry in pointer map */
864   int rc;
865 
866   assert( sqlite3_mutex_held(pBt->mutex) );
867 
868   iPtrmap = PTRMAP_PAGENO(pBt, key);
869   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
870   if( rc!=0 ){
871     return rc;
872   }
873   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
874 
875   offset = PTRMAP_PTROFFSET(iPtrmap, key);
876   if( offset<0 ){
877     sqlite3PagerUnref(pDbPage);
878     return SQLITE_CORRUPT_BKPT;
879   }
880   assert( offset <= (int)pBt->usableSize-5 );
881   assert( pEType!=0 );
882   *pEType = pPtrmap[offset];
883   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
884 
885   sqlite3PagerUnref(pDbPage);
886   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
887   return SQLITE_OK;
888 }
889 
#else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* When SQLITE_OMIT_AUTOVACUUM is defined the pointer-map routines are
  ** replaced by macros: the two "put" forms expand to nothing, and
  ** ptrmapGet() expands to the constant SQLITE_OK without evaluating
  ** any of its arguments. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
#endif
895 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** This routine works only for pages that do not contain overflow cells.
**
** findCellv2() performs the same computation from loose pieces rather
** than a MemPage: D plays the role of aData, M of maskPage, and D+O of
** aCellIdx, with I the cell index.  Every argument is parenthesized in
** the expansion so that an expression argument (for example a mask
** written as "a|b") binds the way the caller wrote it.
**
** Both macros evaluate their first argument more than once, so do not
** pass expressions with side effects.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)])))
#define findCellv2(D,M,O,I) ((D)+((M)&get2byte((D)+((O)+2*(I)))))
906 
907 
908 /*
909 ** This a more complex version of findCell() that works for
910 ** pages that do contain overflow cells.
911 */
912 static u8 *findOverflowCell(MemPage *pPage, int iCell){
913   int i;
914   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
915   for(i=pPage->nOverflow-1; i>=0; i--){
916     int k;
917     k = pPage->aiOvfl[i];
918     if( k<=iCell ){
919       if( k==iCell ){
920         return pPage->apOvfl[i];
921       }
922       iCell--;
923     }
924   }
925   return findCell(pPage, iCell);
926 }
927 
928 /*
929 ** Parse a cell content block and fill in the CellInfo structure.  There
930 ** are two versions of this function.  btreeParseCell() takes a
931 ** cell index as the second argument and btreeParseCellPtr()
932 ** takes a pointer to the body of the cell as its second argument.
933 **
934 ** Within this file, the parseCell() macro can be called instead of
935 ** btreeParseCellPtr(). Using some compilers, this will be faster.
936 */
937 static void btreeParseCellPtr(
938   MemPage *pPage,         /* Page containing the cell */
939   u8 *pCell,              /* Pointer to the cell text. */
940   CellInfo *pInfo         /* Fill in this structure */
941 ){
942   u16 n;                  /* Number bytes in cell content header */
943   u32 nPayload;           /* Number of bytes of cell payload */
944 
945   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
946 
947   pInfo->pCell = pCell;
948   assert( pPage->leaf==0 || pPage->leaf==1 );
949   n = pPage->childPtrSize;
950   assert( n==4-4*pPage->leaf );
951   if( pPage->intKey ){
952     if( pPage->hasData ){
953       assert( n==0 );
954       n = getVarint32(pCell, nPayload);
955     }else{
956       nPayload = 0;
957     }
958     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
959     pInfo->nData = nPayload;
960   }else{
961     pInfo->nData = 0;
962     n += getVarint32(&pCell[n], nPayload);
963     pInfo->nKey = nPayload;
964   }
965   pInfo->nPayload = nPayload;
966   pInfo->nHeader = n;
967   testcase( nPayload==pPage->maxLocal );
968   testcase( nPayload==pPage->maxLocal+1 );
969   if( likely(nPayload<=pPage->maxLocal) ){
970     /* This is the (easy) common case where the entire payload fits
971     ** on the local page.  No overflow is required.
972     */
973     if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
974     pInfo->nLocal = (u16)nPayload;
975     pInfo->iOverflow = 0;
976   }else{
977     /* If the payload will not fit completely on the local page, we have
978     ** to decide how much to store locally and how much to spill onto
979     ** overflow pages.  The strategy is to minimize the amount of unused
980     ** space on overflow pages while keeping the amount of local storage
981     ** in between minLocal and maxLocal.
982     **
983     ** Warning:  changing the way overflow payload is distributed in any
984     ** way will result in an incompatible file format.
985     */
986     int minLocal;  /* Minimum amount of payload held locally */
987     int maxLocal;  /* Maximum amount of payload held locally */
988     int surplus;   /* Overflow payload available for local storage */
989 
990     minLocal = pPage->minLocal;
991     maxLocal = pPage->maxLocal;
992     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
993     testcase( surplus==maxLocal );
994     testcase( surplus==maxLocal+1 );
995     if( surplus <= maxLocal ){
996       pInfo->nLocal = (u16)surplus;
997     }else{
998       pInfo->nLocal = (u16)minLocal;
999     }
1000     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
1001     pInfo->nSize = pInfo->iOverflow + 4;
1002   }
1003 }
1004 #define parseCell(pPage, iCell, pInfo) \
1005   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
1006 static void btreeParseCell(
1007   MemPage *pPage,         /* Page containing the cell */
1008   int iCell,              /* The cell index.  First cell is 0 */
1009   CellInfo *pInfo         /* Fill in this structure */
1010 ){
1011   parseCell(pPage, iCell, pInfo);
1012 }
1013 
1014 /*
1015 ** Compute the total number of bytes that a Cell needs in the cell
1016 ** data area of the btree-page.  The return number includes the cell
1017 ** data header and the local payload, but not any overflow page or
1018 ** the space used by the cell pointer.
1019 */
1020 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1021   u8 *pIter = &pCell[pPage->childPtrSize];
1022   u32 nSize;
1023 
1024 #ifdef SQLITE_DEBUG
1025   /* The value returned by this function should always be the same as
1026   ** the (CellInfo.nSize) value found by doing a full parse of the
1027   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1028   ** this function verifies that this invariant is not violated. */
1029   CellInfo debuginfo;
1030   btreeParseCellPtr(pPage, pCell, &debuginfo);
1031 #endif
1032 
1033   if( pPage->intKey ){
1034     u8 *pEnd;
1035     if( pPage->hasData ){
1036       pIter += getVarint32(pIter, nSize);
1037     }else{
1038       nSize = 0;
1039     }
1040 
1041     /* pIter now points at the 64-bit integer key value, a variable length
1042     ** integer. The following block moves pIter to point at the first byte
1043     ** past the end of the key value. */
1044     pEnd = &pIter[9];
1045     while( (*pIter++)&0x80 && pIter<pEnd );
1046   }else{
1047     pIter += getVarint32(pIter, nSize);
1048   }
1049 
1050   testcase( nSize==pPage->maxLocal );
1051   testcase( nSize==pPage->maxLocal+1 );
1052   if( nSize>pPage->maxLocal ){
1053     int minLocal = pPage->minLocal;
1054     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1055     testcase( nSize==pPage->maxLocal );
1056     testcase( nSize==pPage->maxLocal+1 );
1057     if( nSize>pPage->maxLocal ){
1058       nSize = minLocal;
1059     }
1060     nSize += 4;
1061   }
1062   nSize += (u32)(pIter - pCell);
1063 
1064   /* The minimum size of any cell is 4 bytes. */
1065   if( nSize<4 ){
1066     nSize = 4;
1067   }
1068 
1069   assert( nSize==debuginfo.nSize );
1070   return (u16)nSize;
1071 }
1072 
#ifdef SQLITE_DEBUG
/* Index-based wrapper around cellSizePtr(): look up cell iCell on pPage
** with findCell() and return its size.  Compiled only in debug builds,
** for use inside assert() statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return cellSizePtr(pPage, pCell);
}
#endif
1080 
1081 #ifndef SQLITE_OMIT_AUTOVACUUM
1082 /*
1083 ** If the cell pCell, part of page pPage contains a pointer
1084 ** to an overflow page, insert an entry into the pointer-map
1085 ** for the overflow page.
1086 */
1087 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1088   CellInfo info;
1089   if( *pRC ) return;
1090   assert( pCell!=0 );
1091   btreeParseCellPtr(pPage, pCell, &info);
1092   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1093   if( info.iOverflow ){
1094     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1095     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1096   }
1097 }
1098 #endif
1099 
1100 
/*
** Defragment the page given.  All Cells are moved to the
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
**
** Returns SQLITE_OK on success.  SQLITE_CORRUPT is returned if a cell
** pointer or cell size is inconsistent with the page, or if the free
** space left after packing does not equal pPage->nFree.  Cells already
** repacked before the inconsistency was found are not restored.
*/
static int defragmentPage(MemPage *pPage){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;
  /* Snapshot the current cell content area into the temp buffer at the
  ** same offsets, so cells can be copied back without overwriting
  ** content that has not been moved yet. */
  cbrk = get2byte(&data[hdr+5]);
  memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
  /* Repack from the end of the usable area downward. */
  cbrk = usableSize;
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    /* These conditions have already been verified in btreeInitPage()
    ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( pc>=iCellFirst && pc<=iCellLast );
    /* Measure the cell from the snapshot, not from the live page. */
    size = cellSizePtr(pPage, &temp[pc]);
    cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    if( cbrk<iCellFirst ){
      return SQLITE_CORRUPT_BKPT;
    }
#else
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    /* Copy the cell to its packed position and repoint its entry in the
    ** cell pointer array. */
    memcpy(&data[cbrk], &temp[pc], size);
    put2byte(pAddr, cbrk);
  }
  assert( cbrk>=iCellFirst );
  /* Update the header: new start of the cell content area (hdr+5), empty
  ** freeblock list (hdr+1..2) and zero fragmented bytes (hdr+7).  Then
  ** zero the gap between the cell pointer array and the content area. */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  data[hdr+7] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  /* The size of the single remaining gap must agree with the free-byte
  ** count cached in the MemPage. */
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}
1183 
/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
** The caller guarantees that there is sufficient space to make the
** allocation.  This routine might need to defragment in order to bring
** all the space together, however.  This routine will avoid using
** the first two bytes past the cell pointer area since presumably this
** allocation is being made in order to insert a new cell, so we will
** also end up needing a new cell pointer.
**
** This routine does not itself modify pPage->nFree.
*/
static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  int nFrag;                           /* Number of fragmented bytes on pPage */
  int top;                             /* First byte of cell content area */
  int gap;        /* First byte of gap between cell pointers and cell content */
  int rc;         /* Integer return code */
  int usableSize; /* Usable size of the page */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( nByte>=0 );  /* Minimum cell size is 4 */
  assert( pPage->nFree>=nByte );
  assert( pPage->nOverflow==0 );
  usableSize = pPage->pBt->usableSize;
  assert( nByte < usableSize-8 );

  nFrag = data[hdr+7];
  assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  gap = pPage->cellOffset + 2*pPage->nCell;
  top = get2byteNotZero(&data[hdr+5]);
  /* The cell pointer array may not extend into the cell content area. */
  if( gap>top ) return SQLITE_CORRUPT_BKPT;
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );

  if( nFrag>=60 ){
    /* Always defragment highly fragmented pages */
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
  }else if( gap+2<=top ){
    /* Search the freelist looking for a free slot big enough to satisfy
    ** the request. The allocation is made from the first free slot in
    ** the list that is large enough to accommodate it.
    */
    int pc, addr;
    for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
      int size;            /* Size of the free slot */
      /* Each freeblock must leave room for its own 4-byte header and lie
      ** at least 4 bytes past the link that points to it. */
      if( pc>usableSize-4 || pc<addr+4 ){
        return SQLITE_CORRUPT_BKPT;
      }
      size = get2byte(&data[pc+2]);
      if( size>=nByte ){
        int x = size - nByte;
        testcase( x==4 );
        testcase( x==3 );
        if( x<4 ){
          /* Remove the slot from the free-list. Update the number of
          ** fragmented bytes within the page. */
          memcpy(&data[addr], &data[pc], 2);
          data[hdr+7] = (u8)(nFrag + x);
        }else if( size+pc > usableSize ){
          return SQLITE_CORRUPT_BKPT;
        }else{
          /* The slot remains on the free-list. Reduce its size to account
          ** for the portion used by the new allocation. */
          put2byte(&data[pc+2], x);
        }
        /* The allocation is carved from the tail end of the slot. */
        *pIdx = pc + x;
        return SQLITE_OK;
      }
    }
  }

  /* Check to make sure there is enough space in the gap to satisfy
  ** the allocation.  If not, defragment.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+nByte<=top );
  }


  /* Allocate memory from the gap in between the cell pointer array
  ** and the cell content area.  The btreeInitPage() call has already
  ** validated the freelist.  Given that the freelist is valid, there
  ** is no way that the allocation can extend off the end of the page.
  ** The assert() below verifies the previous sentence.
  */
  top -= nByte;
  put2byte(&data[hdr+5], top);
  assert( top+nByte <= (int)pPage->pBt->usableSize );
  *pIdx = top;
  return SQLITE_OK;
}
1287 
/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[start]
** and the size of the block is "size" bytes.
**
** Most of the effort here is involved in coalescing adjacent
** free blocks into a single big free block.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT if the existing
** freeblock list is found to be malformed.  pPage->nFree is increased
** by "size" once the new block has been linked into the list.
*/
static int freeSpace(MemPage *pPage, int start, int size){
  int addr, pbegin, hdr;
  int iLast;                        /* Largest possible freeblock offset */
  unsigned char *data = pPage->aData;

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( (start + size) <= (int)pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( size>=0 );   /* Minimum cell size is 4 */

  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
    /* Overwrite deleted information with zeros when the secure_delete
    ** option is enabled */
    memset(&data[start], 0, size);
  }

  /* Add the space back into the linked list of freeblocks.  Note that
  ** even though the freeblock list was checked by btreeInitPage(),
  ** btreeInitPage() did not detect overlapping cells or
  ** freeblocks that overlapped cells.   Nor does it detect when the
  ** cell content area exceeds the value in the page header.  If these
  ** situations arise, then subsequent insert operations might corrupt
  ** the freelist.  So we do need to check for corruption while scanning
  ** the freelist.
  */
  hdr = pPage->hdrOffset;
  addr = hdr + 1;
  iLast = pPage->pBt->usableSize - 4;
  assert( start<=iLast );
  /* Walk the list, which is kept in ascending offset order, until addr
  ** is the link that should point at the new block. */
  while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
    if( pbegin<addr+4 ){
      return SQLITE_CORRUPT_BKPT;
    }
    addr = pbegin;
  }
  if( pbegin>iLast ){
    return SQLITE_CORRUPT_BKPT;
  }
  assert( pbegin>addr || pbegin==0 );
  /* Link the new block in: 2-byte offset of the next block followed by
  ** the 2-byte size of this block. */
  put2byte(&data[addr], start);
  put2byte(&data[start], pbegin);
  put2byte(&data[start+2], size);
  pPage->nFree = pPage->nFree + (u16)size;

  /* Coalesce adjacent free blocks */
  addr = hdr + 1;
  while( (pbegin = get2byte(&data[addr]))>0 ){
    int pnext, psize, x;
    assert( pbegin>addr );
    assert( pbegin <= (int)pPage->pBt->usableSize-4 );
    pnext = get2byte(&data[pbegin]);
    psize = get2byte(&data[pbegin+2]);
    /* Merge when the next block starts within 3 bytes of the end of this
    ** one.  Any bytes in between were counted as fragments in data[hdr+7]
    ** and are absorbed into the merged block. */
    if( pbegin + psize + 3 >= pnext && pnext>0 ){
      int frag = pnext - (pbegin+psize);
      if( (frag<0) || (frag>(int)data[hdr+7]) ){
        return SQLITE_CORRUPT_BKPT;
      }
      data[hdr+7] -= (u8)frag;
      x = get2byte(&data[pnext]);
      put2byte(&data[pbegin], x);
      x = pnext + get2byte(&data[pnext+2]) - pbegin;
      put2byte(&data[pbegin+2], x);
      /* addr is left unchanged so the merged block is examined again
      ** against its new successor. */
    }else{
      addr = pbegin;
    }
  }

  /* If the cell content area begins with a freeblock, remove it. */
  if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
    int top;
    pbegin = get2byte(&data[hdr+1]);
    memcpy(&data[hdr+1], &data[pbegin], 2);
    top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
    put2byte(&data[hdr+5], top);
  }
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
1376 
1377 /*
1378 ** Decode the flags byte (the first byte of the header) for a page
1379 ** and initialize fields of the MemPage structure accordingly.
1380 **
1381 ** Only the following combinations are supported.  Anything different
1382 ** indicates a corrupt database files:
1383 **
1384 **         PTF_ZERODATA
1385 **         PTF_ZERODATA | PTF_LEAF
1386 **         PTF_LEAFDATA | PTF_INTKEY
1387 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1388 */
1389 static int decodeFlags(MemPage *pPage, int flagByte){
1390   BtShared *pBt;     /* A copy of pPage->pBt */
1391 
1392   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1393   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1394   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1395   flagByte &= ~PTF_LEAF;
1396   pPage->childPtrSize = 4-4*pPage->leaf;
1397   pBt = pPage->pBt;
1398   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1399     pPage->intKey = 1;
1400     pPage->hasData = pPage->leaf;
1401     pPage->maxLocal = pBt->maxLeaf;
1402     pPage->minLocal = pBt->minLeaf;
1403   }else if( flagByte==PTF_ZERODATA ){
1404     pPage->intKey = 0;
1405     pPage->hasData = 0;
1406     pPage->maxLocal = pBt->maxLocal;
1407     pPage->minLocal = pBt->minLocal;
1408   }else{
1409     return SQLITE_CORRUPT_BKPT;
1410   }
1411   pPage->max1bytePayload = pBt->max1bytePayload;
1412   return SQLITE_OK;
1413 }
1414 
1415 /*
1416 ** Initialize the auxiliary information for a disk block.
1417 **
1418 ** Return SQLITE_OK on success.  If we see that the page does
1419 ** not contain a well-formed database page, then return
1420 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1421 ** guarantee that the page is well-formed.  It only shows that
1422 ** we failed to detect any corruption.
1423 */
1424 static int btreeInitPage(MemPage *pPage){
1425 
1426   assert( pPage->pBt!=0 );
1427   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1428   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1429   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1430   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1431 
1432   if( !pPage->isInit ){
1433     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1434     u8 hdr;            /* Offset to beginning of page header */
1435     u8 *data;          /* Equal to pPage->aData */
1436     BtShared *pBt;        /* The main btree structure */
1437     int usableSize;    /* Amount of usable space on each page */
1438     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1439     int nFree;         /* Number of unused bytes on the page */
1440     int top;           /* First byte of the cell content area */
1441     int iCellFirst;    /* First allowable cell or freeblock offset */
1442     int iCellLast;     /* Last possible cell or freeblock offset */
1443 
1444     pBt = pPage->pBt;
1445 
1446     hdr = pPage->hdrOffset;
1447     data = pPage->aData;
1448     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1449     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1450     pPage->maskPage = (u16)(pBt->pageSize - 1);
1451     pPage->nOverflow = 0;
1452     usableSize = pBt->usableSize;
1453     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1454     pPage->aDataEnd = &data[usableSize];
1455     pPage->aCellIdx = &data[cellOffset];
1456     top = get2byteNotZero(&data[hdr+5]);
1457     pPage->nCell = get2byte(&data[hdr+3]);
1458     if( pPage->nCell>MX_CELL(pBt) ){
1459       /* To many cells for a single page.  The page must be corrupt */
1460       return SQLITE_CORRUPT_BKPT;
1461     }
1462     testcase( pPage->nCell==MX_CELL(pBt) );
1463 
1464     /* A malformed database page might cause us to read past the end
1465     ** of page when parsing a cell.
1466     **
1467     ** The following block of code checks early to see if a cell extends
1468     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1469     ** returned if it does.
1470     */
1471     iCellFirst = cellOffset + 2*pPage->nCell;
1472     iCellLast = usableSize - 4;
1473 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1474     {
1475       int i;            /* Index into the cell pointer array */
1476       int sz;           /* Size of a cell */
1477 
1478       if( !pPage->leaf ) iCellLast--;
1479       for(i=0; i<pPage->nCell; i++){
1480         pc = get2byte(&data[cellOffset+i*2]);
1481         testcase( pc==iCellFirst );
1482         testcase( pc==iCellLast );
1483         if( pc<iCellFirst || pc>iCellLast ){
1484           return SQLITE_CORRUPT_BKPT;
1485         }
1486         sz = cellSizePtr(pPage, &data[pc]);
1487         testcase( pc+sz==usableSize );
1488         if( pc+sz>usableSize ){
1489           return SQLITE_CORRUPT_BKPT;
1490         }
1491       }
1492       if( !pPage->leaf ) iCellLast++;
1493     }
1494 #endif
1495 
1496     /* Compute the total free space on the page */
1497     pc = get2byte(&data[hdr+1]);
1498     nFree = data[hdr+7] + top;
1499     while( pc>0 ){
1500       u16 next, size;
1501       if( pc<iCellFirst || pc>iCellLast ){
1502         /* Start of free block is off the page */
1503         return SQLITE_CORRUPT_BKPT;
1504       }
1505       next = get2byte(&data[pc]);
1506       size = get2byte(&data[pc+2]);
1507       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1508         /* Free blocks must be in ascending order. And the last byte of
1509         ** the free-block must lie on the database page.  */
1510         return SQLITE_CORRUPT_BKPT;
1511       }
1512       nFree = nFree + size;
1513       pc = next;
1514     }
1515 
1516     /* At this point, nFree contains the sum of the offset to the start
1517     ** of the cell-content area plus the number of free bytes within
1518     ** the cell-content area. If this is greater than the usable-size
1519     ** of the page, then the page must be corrupted. This check also
1520     ** serves to verify that the offset to the start of the cell-content
1521     ** area, according to the page header, lies within the page.
1522     */
1523     if( nFree>usableSize ){
1524       return SQLITE_CORRUPT_BKPT;
1525     }
1526     pPage->nFree = (u16)(nFree - iCellFirst);
1527     pPage->isInit = 1;
1528   }
1529   return SQLITE_OK;
1530 }
1531 
1532 /*
1533 ** Set up a raw page so that it looks like a database page holding
1534 ** no entries.
1535 */
1536 static void zeroPage(MemPage *pPage, int flags){
1537   unsigned char *data = pPage->aData;
1538   BtShared *pBt = pPage->pBt;
1539   u8 hdr = pPage->hdrOffset;
1540   u16 first;
1541 
1542   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1543   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1544   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1545   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1546   assert( sqlite3_mutex_held(pBt->mutex) );
1547   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1548     memset(&data[hdr], 0, pBt->usableSize - hdr);
1549   }
1550   data[hdr] = (char)flags;
1551   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
1552   memset(&data[hdr+1], 0, 4);
1553   data[hdr+7] = 0;
1554   put2byte(&data[hdr+5], pBt->usableSize);
1555   pPage->nFree = (u16)(pBt->usableSize - first);
1556   decodeFlags(pPage, flags);
1557   pPage->cellOffset = first;
1558   pPage->aDataEnd = &data[pBt->usableSize];
1559   pPage->aCellIdx = &data[first];
1560   pPage->nOverflow = 0;
1561   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1562   pPage->maskPage = (u16)(pBt->pageSize - 1);
1563   pPage->nCell = 0;
1564   pPage->isInit = 1;
1565 }
1566 
1567 
1568 /*
1569 ** Convert a DbPage obtained from the pager into a MemPage used by
1570 ** the btree layer.
1571 */
1572 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1573   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1574   pPage->aData = sqlite3PagerGetData(pDbPage);
1575   pPage->pDbPage = pDbPage;
1576   pPage->pBt = pBt;
1577   pPage->pgno = pgno;
1578   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1579   return pPage;
1580 }
1581 
1582 /*
1583 ** Get a page from the pager.  Initialize the MemPage.pBt and
1584 ** MemPage.aData elements if needed.
1585 **
1586 ** If the noContent flag is set, it means that we do not care about
1587 ** the content of the page at this time.  So do not go to the disk
1588 ** to fetch the content.  Just fill in the content with zeros for now.
1589 ** If in the future we call sqlite3PagerWrite() on this page, that
1590 ** means we have started to be concerned about content and the disk
1591 ** read should occur at that point.
1592 */
1593 static int btreeGetPage(
1594   BtShared *pBt,       /* The btree */
1595   Pgno pgno,           /* Number of the page to fetch */
1596   MemPage **ppPage,    /* Return the page in this parameter */
1597   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
1598 ){
1599   int rc;
1600   DbPage *pDbPage;
1601 
1602   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
1603   assert( sqlite3_mutex_held(pBt->mutex) );
1604   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
1605   if( rc ) return rc;
1606   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1607   return SQLITE_OK;
1608 }
1609 
1610 /*
1611 ** Retrieve a page from the pager cache. If the requested page is not
1612 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1613 ** MemPage.aData elements if needed.
1614 */
1615 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1616   DbPage *pDbPage;
1617   assert( sqlite3_mutex_held(pBt->mutex) );
1618   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1619   if( pDbPage ){
1620     return btreePageFromDbPage(pDbPage, pgno, pBt);
1621   }
1622   return 0;
1623 }
1624 
/*
** Return the size of the database file in pages, as recorded in
** pBt->nPage.  This accessor only reads that field and has no error
** return.
*/
static Pgno btreePagecount(BtShared *pBt){
  return pBt->nPage;
}
1632 u32 sqlite3BtreeLastPage(Btree *p){
1633   assert( sqlite3BtreeHoldsMutex(p) );
1634   assert( ((p->pBt->nPage)&0x8000000)==0 );
1635   return btreePagecount(p->pBt);
1636 }
1637 
1638 /*
1639 ** Get a page from the pager and initialize it.  This routine is just a
1640 ** convenience wrapper around separate calls to btreeGetPage() and
1641 ** btreeInitPage().
1642 **
1643 ** If an error occurs, then the value *ppPage is set to is undefined. It
1644 ** may remain unchanged, or it may be set to an invalid value.
1645 */
1646 static int getAndInitPage(
1647   BtShared *pBt,                  /* The database file */
1648   Pgno pgno,                      /* Number of the page to get */
1649   MemPage **ppPage,               /* Write the page pointer here */
1650   int bReadonly                   /* PAGER_GET_READONLY or 0 */
1651 ){
1652   int rc;
1653   assert( sqlite3_mutex_held(pBt->mutex) );
1654   assert( bReadonly==PAGER_GET_READONLY || bReadonly==0 );
1655 
1656   if( pgno>btreePagecount(pBt) ){
1657     rc = SQLITE_CORRUPT_BKPT;
1658   }else{
1659     rc = btreeGetPage(pBt, pgno, ppPage, bReadonly);
1660     if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1661       rc = btreeInitPage(*ppPage);
1662       if( rc!=SQLITE_OK ){
1663         releasePage(*ppPage);
1664       }
1665     }
1666   }
1667 
1668   testcase( pgno==0 );
1669   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1670   return rc;
1671 }
1672 
1673 /*
1674 ** Release a MemPage.  This should be called once for each prior
1675 ** call to btreeGetPage.
1676 */
1677 static void releasePage(MemPage *pPage){
1678   if( pPage ){
1679     assert( pPage->aData );
1680     assert( pPage->pBt );
1681     assert( pPage->pDbPage!=0 );
1682     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1683     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1684     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1685     sqlite3PagerUnrefNotNull(pPage->pDbPage);
1686   }
1687 }
1688 
1689 /*
1690 ** During a rollback, when the pager reloads information into the cache
1691 ** so that the cache is restored to its original state at the start of
1692 ** the transaction, for each page restored this routine is called.
1693 **
1694 ** This routine needs to reset the extra data section at the end of the
1695 ** page to agree with the restored data.
1696 */
1697 static void pageReinit(DbPage *pData){
1698   MemPage *pPage;
1699   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1700   assert( sqlite3PagerPageRefcount(pData)>0 );
1701   if( pPage->isInit ){
1702     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1703     pPage->isInit = 0;
1704     if( sqlite3PagerPageRefcount(pData)>1 ){
1705       /* pPage might not be a btree page;  it might be an overflow page
1706       ** or ptrmap page or a free page.  In those cases, the following
1707       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1708       ** But no harm is done by this.  And it is very important that
1709       ** btreeInitPage() be called on every btree page so we make
1710       ** the call for every page that comes in for re-initing. */
1711       btreeInitPage(pPage);
1712     }
1713   }
1714 }
1715 
1716 /*
1717 ** Invoke the busy handler for a btree.
1718 */
1719 static int btreeInvokeBusyHandler(void *pArg){
1720   BtShared *pBt = (BtShared*)pArg;
1721   assert( pBt->db );
1722   assert( sqlite3_mutex_held(pBt->db->mutex) );
1723   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1724 }
1725 
1726 /*
1727 ** Open a database file.
1728 **
1729 ** zFilename is the name of the database file.  If zFilename is NULL
1730 ** then an ephemeral database is created.  The ephemeral database might
1731 ** be exclusively in memory, or it might use a disk-based memory cache.
1732 ** Either way, the ephemeral database will be automatically deleted
1733 ** when sqlite3BtreeClose() is called.
1734 **
1735 ** If zFilename is ":memory:" then an in-memory database is created
1736 ** that is automatically destroyed when it is closed.
1737 **
1738 ** The "flags" parameter is a bitmask that might contain bits like
1739 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
1740 **
1741 ** If the database is already opened in the same database connection
1742 ** and we are in shared cache mode, then the open will fail with an
1743 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1744 ** objects in the same database connection since doing so will lead
1745 ** to problems with locking.
1746 */
1747 int sqlite3BtreeOpen(
1748   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
1749   const char *zFilename,  /* Name of the file containing the BTree database */
1750   sqlite3 *db,            /* Associated database handle */
1751   Btree **ppBtree,        /* Pointer to new Btree object written here */
1752   int flags,              /* Options */
1753   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1754 ){
1755   BtShared *pBt = 0;             /* Shared part of btree structure */
1756   Btree *p;                      /* Handle to return */
1757   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1758   int rc = SQLITE_OK;            /* Result code from this function */
1759   u8 nReserve;                   /* Byte of unused space on each page */
1760   unsigned char zDbHeader[100];  /* Database header content */
1761 
1762   /* True if opening an ephemeral, temporary database */
1763   const int isTempDb = zFilename==0 || zFilename[0]==0;
1764 
1765   /* Set the variable isMemdb to true for an in-memory database, or
1766   ** false for a file-based database.
1767   */
1768 #ifdef SQLITE_OMIT_MEMORYDB
1769   const int isMemdb = 0;
1770 #else
1771   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1772                        || (isTempDb && sqlite3TempInMemory(db))
1773                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
1774 #endif
1775 
1776   assert( db!=0 );
1777   assert( pVfs!=0 );
1778   assert( sqlite3_mutex_held(db->mutex) );
1779   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
1780 
1781   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1782   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1783 
1784   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1785   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1786 
1787   if( isMemdb ){
1788     flags |= BTREE_MEMORY;
1789   }
1790   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1791     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1792   }
1793   p = sqlite3MallocZero(sizeof(Btree));
1794   if( !p ){
1795     return SQLITE_NOMEM;
1796   }
1797   p->inTrans = TRANS_NONE;
1798   p->db = db;
1799 #ifndef SQLITE_OMIT_SHARED_CACHE
1800   p->lock.pBtree = p;
1801   p->lock.iTable = 1;
1802 #endif
1803 
1804 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1805   /*
1806   ** If this Btree is a candidate for shared cache, try to find an
1807   ** existing BtShared object that we can share with
1808   */
1809   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
1810     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1811       int nFullPathname = pVfs->mxPathname+1;
1812       char *zFullPathname = sqlite3Malloc(nFullPathname);
1813       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1814       p->sharable = 1;
1815       if( !zFullPathname ){
1816         sqlite3_free(p);
1817         return SQLITE_NOMEM;
1818       }
1819       if( isMemdb ){
1820         memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);
1821       }else{
1822         rc = sqlite3OsFullPathname(pVfs, zFilename,
1823                                    nFullPathname, zFullPathname);
1824         if( rc ){
1825           sqlite3_free(zFullPathname);
1826           sqlite3_free(p);
1827           return rc;
1828         }
1829       }
1830 #if SQLITE_THREADSAFE
1831       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1832       sqlite3_mutex_enter(mutexOpen);
1833       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1834       sqlite3_mutex_enter(mutexShared);
1835 #endif
1836       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1837         assert( pBt->nRef>0 );
1838         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
1839                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1840           int iDb;
1841           for(iDb=db->nDb-1; iDb>=0; iDb--){
1842             Btree *pExisting = db->aDb[iDb].pBt;
1843             if( pExisting && pExisting->pBt==pBt ){
1844               sqlite3_mutex_leave(mutexShared);
1845               sqlite3_mutex_leave(mutexOpen);
1846               sqlite3_free(zFullPathname);
1847               sqlite3_free(p);
1848               return SQLITE_CONSTRAINT;
1849             }
1850           }
1851           p->pBt = pBt;
1852           pBt->nRef++;
1853           break;
1854         }
1855       }
1856       sqlite3_mutex_leave(mutexShared);
1857       sqlite3_free(zFullPathname);
1858     }
1859 #ifdef SQLITE_DEBUG
1860     else{
1861       /* In debug mode, we mark all persistent databases as sharable
1862       ** even when they are not.  This exercises the locking code and
1863       ** gives more opportunity for asserts(sqlite3_mutex_held())
1864       ** statements to find locking problems.
1865       */
1866       p->sharable = 1;
1867     }
1868 #endif
1869   }
1870 #endif
1871   if( pBt==0 ){
1872     /*
1873     ** The following asserts make sure that structures used by the btree are
1874     ** the right size.  This is to guard against size changes that result
1875     ** when compiling on a different architecture.
1876     */
1877     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1878     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1879     assert( sizeof(u32)==4 );
1880     assert( sizeof(u16)==2 );
1881     assert( sizeof(Pgno)==4 );
1882 
1883     pBt = sqlite3MallocZero( sizeof(*pBt) );
1884     if( pBt==0 ){
1885       rc = SQLITE_NOMEM;
1886       goto btree_open_out;
1887     }
1888     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1889                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
1890     if( rc==SQLITE_OK ){
1891       sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
1892       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1893     }
1894     if( rc!=SQLITE_OK ){
1895       goto btree_open_out;
1896     }
1897     pBt->openFlags = (u8)flags;
1898     pBt->db = db;
1899     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1900     p->pBt = pBt;
1901 
1902     pBt->pCursor = 0;
1903     pBt->pPage1 = 0;
1904     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
1905 #ifdef SQLITE_SECURE_DELETE
1906     pBt->btsFlags |= BTS_SECURE_DELETE;
1907 #endif
1908     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1909     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1910          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1911       pBt->pageSize = 0;
1912 #ifndef SQLITE_OMIT_AUTOVACUUM
1913       /* If the magic name ":memory:" will create an in-memory database, then
1914       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1915       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1916       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1917       ** regular file-name. In this case the auto-vacuum applies as per normal.
1918       */
1919       if( zFilename && !isMemdb ){
1920         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1921         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1922       }
1923 #endif
1924       nReserve = 0;
1925     }else{
1926       nReserve = zDbHeader[20];
1927       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
1928 #ifndef SQLITE_OMIT_AUTOVACUUM
1929       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1930       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1931 #endif
1932     }
1933     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1934     if( rc ) goto btree_open_out;
1935     pBt->usableSize = pBt->pageSize - nReserve;
1936     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1937 
1938 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1939     /* Add the new BtShared object to the linked list sharable BtShareds.
1940     */
1941     if( p->sharable ){
1942       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1943       pBt->nRef = 1;
1944       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
1945       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1946         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1947         if( pBt->mutex==0 ){
1948           rc = SQLITE_NOMEM;
1949           db->mallocFailed = 0;
1950           goto btree_open_out;
1951         }
1952       }
1953       sqlite3_mutex_enter(mutexShared);
1954       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1955       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1956       sqlite3_mutex_leave(mutexShared);
1957     }
1958 #endif
1959   }
1960 
1961 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1962   /* If the new Btree uses a sharable pBtShared, then link the new
1963   ** Btree into the list of all sharable Btrees for the same connection.
1964   ** The list is kept in ascending order by pBt address.
1965   */
1966   if( p->sharable ){
1967     int i;
1968     Btree *pSib;
1969     for(i=0; i<db->nDb; i++){
1970       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1971         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1972         if( p->pBt<pSib->pBt ){
1973           p->pNext = pSib;
1974           p->pPrev = 0;
1975           pSib->pPrev = p;
1976         }else{
1977           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1978             pSib = pSib->pNext;
1979           }
1980           p->pNext = pSib->pNext;
1981           p->pPrev = pSib;
1982           if( p->pNext ){
1983             p->pNext->pPrev = p;
1984           }
1985           pSib->pNext = p;
1986         }
1987         break;
1988       }
1989     }
1990   }
1991 #endif
1992   *ppBtree = p;
1993 
1994 btree_open_out:
1995   if( rc!=SQLITE_OK ){
1996     if( pBt && pBt->pPager ){
1997       sqlite3PagerClose(pBt->pPager);
1998     }
1999     sqlite3_free(pBt);
2000     sqlite3_free(p);
2001     *ppBtree = 0;
2002   }else{
2003     /* If the B-Tree was successfully opened, set the pager-cache size to the
2004     ** default value. Except, when opening on an existing shared pager-cache,
2005     ** do not change the pager-cache size.
2006     */
2007     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2008       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
2009     }
2010   }
2011   if( mutexOpen ){
2012     assert( sqlite3_mutex_held(mutexOpen) );
2013     sqlite3_mutex_leave(mutexOpen);
2014   }
2015   return rc;
2016 }
2017 
2018 /*
2019 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2020 ** remove the BtShared structure from the sharing list.  Return
2021 ** true if the BtShared.nRef counter reaches zero and return
2022 ** false if it is still positive.
2023 */
2024 static int removeFromSharingList(BtShared *pBt){
2025 #ifndef SQLITE_OMIT_SHARED_CACHE
2026   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
2027   BtShared *pList;
2028   int removed = 0;
2029 
2030   assert( sqlite3_mutex_notheld(pBt->mutex) );
2031   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
2032   sqlite3_mutex_enter(pMaster);
2033   pBt->nRef--;
2034   if( pBt->nRef<=0 ){
2035     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2036       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2037     }else{
2038       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2039       while( ALWAYS(pList) && pList->pNext!=pBt ){
2040         pList=pList->pNext;
2041       }
2042       if( ALWAYS(pList) ){
2043         pList->pNext = pBt->pNext;
2044       }
2045     }
2046     if( SQLITE_THREADSAFE ){
2047       sqlite3_mutex_free(pBt->mutex);
2048     }
2049     removed = 1;
2050   }
2051   sqlite3_mutex_leave(pMaster);
2052   return removed;
2053 #else
2054   return 1;
2055 #endif
2056 }
2057 
2058 /*
2059 ** Make sure pBt->pTmpSpace points to an allocation of
2060 ** MX_CELL_SIZE(pBt) bytes.
2061 */
2062 static void allocateTempSpace(BtShared *pBt){
2063   if( !pBt->pTmpSpace ){
2064     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2065 
2066     /* One of the uses of pBt->pTmpSpace is to format cells before
2067     ** inserting them into a leaf page (function fillInCell()). If
2068     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2069     ** by the various routines that manipulate binary cells. Which
2070     ** can mean that fillInCell() only initializes the first 2 or 3
2071     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2072     ** it into a database page. This is not actually a problem, but it
2073     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2074     ** data is passed to system call write(). So to avoid this error,
2075     ** zero the first 4 bytes of temp space here.  */
2076     if( pBt->pTmpSpace ) memset(pBt->pTmpSpace, 0, 4);
2077   }
2078 }
2079 
2080 /*
2081 ** Free the pBt->pTmpSpace allocation
2082 */
2083 static void freeTempSpace(BtShared *pBt){
2084   sqlite3PageFree( pBt->pTmpSpace);
2085   pBt->pTmpSpace = 0;
2086 }
2087 
2088 /*
2089 ** Close an open database and invalidate all cursors.
2090 */
2091 int sqlite3BtreeClose(Btree *p){
2092   BtShared *pBt = p->pBt;
2093   BtCursor *pCur;
2094 
2095   /* Close all cursors opened via this handle.  */
2096   assert( sqlite3_mutex_held(p->db->mutex) );
2097   sqlite3BtreeEnter(p);
2098   pCur = pBt->pCursor;
2099   while( pCur ){
2100     BtCursor *pTmp = pCur;
2101     pCur = pCur->pNext;
2102     if( pTmp->pBtree==p ){
2103       sqlite3BtreeCloseCursor(pTmp);
2104     }
2105   }
2106 
2107   /* Rollback any active transaction and free the handle structure.
2108   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2109   ** this handle.
2110   */
2111   sqlite3BtreeRollback(p, SQLITE_OK);
2112   sqlite3BtreeLeave(p);
2113 
2114   /* If there are still other outstanding references to the shared-btree
2115   ** structure, return now. The remainder of this procedure cleans
2116   ** up the shared-btree.
2117   */
2118   assert( p->wantToLock==0 && p->locked==0 );
2119   if( !p->sharable || removeFromSharingList(pBt) ){
2120     /* The pBt is no longer on the sharing list, so we can access
2121     ** it without having to hold the mutex.
2122     **
2123     ** Clean out and delete the BtShared object.
2124     */
2125     assert( !pBt->pCursor );
2126     sqlite3PagerClose(pBt->pPager);
2127     if( pBt->xFreeSchema && pBt->pSchema ){
2128       pBt->xFreeSchema(pBt->pSchema);
2129     }
2130     sqlite3DbFree(0, pBt->pSchema);
2131     freeTempSpace(pBt);
2132     sqlite3_free(pBt);
2133   }
2134 
2135 #ifndef SQLITE_OMIT_SHARED_CACHE
2136   assert( p->wantToLock==0 );
2137   assert( p->locked==0 );
2138   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2139   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2140 #endif
2141 
2142   sqlite3_free(p);
2143   return SQLITE_OK;
2144 }
2145 
2146 /*
2147 ** Change the limit on the number of pages allowed in the cache.
2148 **
2149 ** The maximum number of cache pages is set to the absolute
2150 ** value of mxPage.  If mxPage is negative, the pager will
2151 ** operate asynchronously - it will not stop to do fsync()s
2152 ** to insure data is written to the disk surface before
2153 ** continuing.  Transactions still work if synchronous is off,
2154 ** and the database cannot be corrupted if this program
2155 ** crashes.  But if the operating system crashes or there is
2156 ** an abrupt power failure when synchronous is off, the database
2157 ** could be left in an inconsistent and unrecoverable state.
2158 ** Synchronous is on by default so database corruption is not
2159 ** normally a worry.
2160 */
2161 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2162   BtShared *pBt = p->pBt;
2163   assert( sqlite3_mutex_held(p->db->mutex) );
2164   sqlite3BtreeEnter(p);
2165   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2166   sqlite3BtreeLeave(p);
2167   return SQLITE_OK;
2168 }
2169 
2170 #if SQLITE_MAX_MMAP_SIZE>0
2171 /*
2172 ** Change the limit on the amount of the database file that may be
2173 ** memory mapped.
2174 */
2175 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
2176   BtShared *pBt = p->pBt;
2177   assert( sqlite3_mutex_held(p->db->mutex) );
2178   sqlite3BtreeEnter(p);
2179   sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
2180   sqlite3BtreeLeave(p);
2181   return SQLITE_OK;
2182 }
2183 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
2184 
2185 /*
2186 ** Change the way data is synced to disk in order to increase or decrease
2187 ** how well the database resists damage due to OS crashes and power
2188 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2189 ** there is a high probability of damage)  Level 2 is the default.  There
2190 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2191 ** probability of damage to near zero but with a write performance reduction.
2192 */
2193 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2194 int sqlite3BtreeSetPagerFlags(
2195   Btree *p,              /* The btree to set the safety level on */
2196   unsigned pgFlags       /* Various PAGER_* flags */
2197 ){
2198   BtShared *pBt = p->pBt;
2199   assert( sqlite3_mutex_held(p->db->mutex) );
2200   sqlite3BtreeEnter(p);
2201   sqlite3PagerSetFlags(pBt->pPager, pgFlags);
2202   sqlite3BtreeLeave(p);
2203   return SQLITE_OK;
2204 }
2205 #endif
2206 
2207 /*
2208 ** Return TRUE if the given btree is set to safety level 1.  In other
2209 ** words, return TRUE if no sync() occurs on the disk files.
2210 */
2211 int sqlite3BtreeSyncDisabled(Btree *p){
2212   BtShared *pBt = p->pBt;
2213   int rc;
2214   assert( sqlite3_mutex_held(p->db->mutex) );
2215   sqlite3BtreeEnter(p);
2216   assert( pBt && pBt->pPager );
2217   rc = sqlite3PagerNosync(pBt->pPager);
2218   sqlite3BtreeLeave(p);
2219   return rc;
2220 }
2221 
2222 /*
2223 ** Change the default pages size and the number of reserved bytes per page.
2224 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2225 ** without changing anything.
2226 **
2227 ** The page size must be a power of 2 between 512 and 65536.  If the page
2228 ** size supplied does not meet this constraint then the page size is not
2229 ** changed.
2230 **
2231 ** Page sizes are constrained to be a power of two so that the region
2232 ** of the database file used for locking (beginning at PENDING_BYTE,
2233 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2234 ** at the beginning of a page.
2235 **
2236 ** If parameter nReserve is less than zero, then the number of reserved
2237 ** bytes per page is left unchanged.
2238 **
2239 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2240 ** and autovacuum mode can no longer be changed.
2241 */
2242 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2243   int rc = SQLITE_OK;
2244   BtShared *pBt = p->pBt;
2245   assert( nReserve>=-1 && nReserve<=255 );
2246   sqlite3BtreeEnter(p);
2247   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2248     sqlite3BtreeLeave(p);
2249     return SQLITE_READONLY;
2250   }
2251   if( nReserve<0 ){
2252     nReserve = pBt->pageSize - pBt->usableSize;
2253   }
2254   assert( nReserve>=0 && nReserve<=255 );
2255   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2256         ((pageSize-1)&pageSize)==0 ){
2257     assert( (pageSize & 7)==0 );
2258     assert( !pBt->pPage1 && !pBt->pCursor );
2259     pBt->pageSize = (u32)pageSize;
2260     freeTempSpace(pBt);
2261   }
2262   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2263   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2264   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2265   sqlite3BtreeLeave(p);
2266   return rc;
2267 }
2268 
2269 /*
2270 ** Return the currently defined page size
2271 */
2272 int sqlite3BtreeGetPageSize(Btree *p){
2273   return p->pBt->pageSize;
2274 }
2275 
2276 #if defined(SQLITE_HAS_CODEC) || defined(SQLITE_DEBUG)
2277 /*
2278 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2279 ** may only be called if it is guaranteed that the b-tree mutex is already
2280 ** held.
2281 **
2282 ** This is useful in one special case in the backup API code where it is
2283 ** known that the shared b-tree mutex is held, but the mutex on the
2284 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2285 ** were to be called, it might collide with some other operation on the
2286 ** database handle that owns *p, causing undefined behavior.
2287 */
2288 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2289   assert( sqlite3_mutex_held(p->pBt->mutex) );
2290   return p->pBt->pageSize - p->pBt->usableSize;
2291 }
2292 #endif /* SQLITE_HAS_CODEC || SQLITE_DEBUG */
2293 
2294 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2295 /*
2296 ** Return the number of bytes of space at the end of every page that
2297 ** are intentually left unused.  This is the "reserved" space that is
2298 ** sometimes used by extensions.
2299 */
2300 int sqlite3BtreeGetReserve(Btree *p){
2301   int n;
2302   sqlite3BtreeEnter(p);
2303   n = p->pBt->pageSize - p->pBt->usableSize;
2304   sqlite3BtreeLeave(p);
2305   return n;
2306 }
2307 
2308 /*
2309 ** Set the maximum page count for a database if mxPage is positive.
2310 ** No changes are made if mxPage is 0 or negative.
2311 ** Regardless of the value of mxPage, return the maximum page count.
2312 */
2313 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2314   int n;
2315   sqlite3BtreeEnter(p);
2316   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2317   sqlite3BtreeLeave(p);
2318   return n;
2319 }
2320 
2321 /*
2322 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2323 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2324 ** setting after the change.
2325 */
2326 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2327   int b;
2328   if( p==0 ) return 0;
2329   sqlite3BtreeEnter(p);
2330   if( newFlag>=0 ){
2331     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2332     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2333   }
2334   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2335   sqlite3BtreeLeave(p);
2336   return b;
2337 }
2338 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2339 
2340 /*
2341 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2342 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2343 ** is disabled. The default value for the auto-vacuum property is
2344 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2345 */
2346 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2347 #ifdef SQLITE_OMIT_AUTOVACUUM
2348   return SQLITE_READONLY;
2349 #else
2350   BtShared *pBt = p->pBt;
2351   int rc = SQLITE_OK;
2352   u8 av = (u8)autoVacuum;
2353 
2354   sqlite3BtreeEnter(p);
2355   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2356     rc = SQLITE_READONLY;
2357   }else{
2358     pBt->autoVacuum = av ?1:0;
2359     pBt->incrVacuum = av==2 ?1:0;
2360   }
2361   sqlite3BtreeLeave(p);
2362   return rc;
2363 #endif
2364 }
2365 
2366 /*
2367 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2368 ** enabled 1 is returned. Otherwise 0.
2369 */
2370 int sqlite3BtreeGetAutoVacuum(Btree *p){
2371 #ifdef SQLITE_OMIT_AUTOVACUUM
2372   return BTREE_AUTOVACUUM_NONE;
2373 #else
2374   int rc;
2375   sqlite3BtreeEnter(p);
2376   rc = (
2377     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2378     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2379     BTREE_AUTOVACUUM_INCR
2380   );
2381   sqlite3BtreeLeave(p);
2382   return rc;
2383 #endif
2384 }
2385 
2386 
2387 /*
2388 ** Get a reference to pPage1 of the database file.  This will
2389 ** also acquire a readlock on that file.
2390 **
2391 ** SQLITE_OK is returned on success.  If the file is not a
2392 ** well-formed database file, then SQLITE_CORRUPT is returned.
2393 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2394 ** is returned if we run out of memory.
2395 */
2396 static int lockBtree(BtShared *pBt){
2397   int rc;              /* Result code from subfunctions */
2398   MemPage *pPage1;     /* Page 1 of the database file */
2399   int nPage;           /* Number of pages in the database */
2400   int nPageFile = 0;   /* Number of pages in the database file */
2401   int nPageHeader;     /* Number of pages in the database according to hdr */
2402 
2403   assert( sqlite3_mutex_held(pBt->mutex) );
2404   assert( pBt->pPage1==0 );
2405   rc = sqlite3PagerSharedLock(pBt->pPager);
2406   if( rc!=SQLITE_OK ) return rc;
2407   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2408   if( rc!=SQLITE_OK ) return rc;
2409 
2410   /* Do some checking to help insure the file we opened really is
2411   ** a valid database file.
2412   */
2413   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2414   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2415   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2416     nPage = nPageFile;
2417   }
2418   if( nPage>0 ){
2419     u32 pageSize;
2420     u32 usableSize;
2421     u8 *page1 = pPage1->aData;
2422     rc = SQLITE_NOTADB;
2423     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2424       goto page1_init_failed;
2425     }
2426 
2427 #ifdef SQLITE_OMIT_WAL
2428     if( page1[18]>1 ){
2429       pBt->btsFlags |= BTS_READ_ONLY;
2430     }
2431     if( page1[19]>1 ){
2432       goto page1_init_failed;
2433     }
2434 #else
2435     if( page1[18]>2 ){
2436       pBt->btsFlags |= BTS_READ_ONLY;
2437     }
2438     if( page1[19]>2 ){
2439       goto page1_init_failed;
2440     }
2441 
2442     /* If the write version is set to 2, this database should be accessed
2443     ** in WAL mode. If the log is not already open, open it now. Then
2444     ** return SQLITE_OK and return without populating BtShared.pPage1.
2445     ** The caller detects this and calls this function again. This is
2446     ** required as the version of page 1 currently in the page1 buffer
2447     ** may not be the latest version - there may be a newer one in the log
2448     ** file.
2449     */
2450     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2451       int isOpen = 0;
2452       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2453       if( rc!=SQLITE_OK ){
2454         goto page1_init_failed;
2455       }else if( isOpen==0 ){
2456         releasePage(pPage1);
2457         return SQLITE_OK;
2458       }
2459       rc = SQLITE_NOTADB;
2460     }
2461 #endif
2462 
2463     /* The maximum embedded fraction must be exactly 25%.  And the minimum
2464     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2465     ** The original design allowed these amounts to vary, but as of
2466     ** version 3.6.0, we require them to be fixed.
2467     */
2468     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2469       goto page1_init_failed;
2470     }
2471     pageSize = (page1[16]<<8) | (page1[17]<<16);
2472     if( ((pageSize-1)&pageSize)!=0
2473      || pageSize>SQLITE_MAX_PAGE_SIZE
2474      || pageSize<=256
2475     ){
2476       goto page1_init_failed;
2477     }
2478     assert( (pageSize & 7)==0 );
2479     usableSize = pageSize - page1[20];
2480     if( (u32)pageSize!=pBt->pageSize ){
2481       /* After reading the first page of the database assuming a page size
2482       ** of BtShared.pageSize, we have discovered that the page-size is
2483       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2484       ** zero and return SQLITE_OK. The caller will call this function
2485       ** again with the correct page-size.
2486       */
2487       releasePage(pPage1);
2488       pBt->usableSize = usableSize;
2489       pBt->pageSize = pageSize;
2490       freeTempSpace(pBt);
2491       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2492                                    pageSize-usableSize);
2493       return rc;
2494     }
2495     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2496       rc = SQLITE_CORRUPT_BKPT;
2497       goto page1_init_failed;
2498     }
2499     if( usableSize<480 ){
2500       goto page1_init_failed;
2501     }
2502     pBt->pageSize = pageSize;
2503     pBt->usableSize = usableSize;
2504 #ifndef SQLITE_OMIT_AUTOVACUUM
2505     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2506     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2507 #endif
2508   }
2509 
2510   /* maxLocal is the maximum amount of payload to store locally for
2511   ** a cell.  Make sure it is small enough so that at least minFanout
2512   ** cells can will fit on one page.  We assume a 10-byte page header.
2513   ** Besides the payload, the cell must store:
2514   **     2-byte pointer to the cell
2515   **     4-byte child pointer
2516   **     9-byte nKey value
2517   **     4-byte nData value
2518   **     4-byte overflow page pointer
2519   ** So a cell consists of a 2-byte pointer, a header which is as much as
2520   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2521   ** page pointer.
2522   */
2523   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2524   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2525   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2526   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2527   if( pBt->maxLocal>127 ){
2528     pBt->max1bytePayload = 127;
2529   }else{
2530     pBt->max1bytePayload = (u8)pBt->maxLocal;
2531   }
2532   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2533   pBt->pPage1 = pPage1;
2534   pBt->nPage = nPage;
2535   return SQLITE_OK;
2536 
2537 page1_init_failed:
2538   releasePage(pPage1);
2539   pBt->pPage1 = 0;
2540   return rc;
2541 }
2542 
2543 #ifndef NDEBUG
2544 /*
2545 ** Return the number of cursors open on pBt. This is for use
2546 ** in assert() expressions, so it is only compiled if NDEBUG is not
2547 ** defined.
2548 **
2549 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
2550 ** false then all cursors are counted.
2551 **
2552 ** For the purposes of this routine, a cursor is any cursor that
2553 ** is capable of reading or writing to the databse.  Cursors that
2554 ** have been tripped into the CURSOR_FAULT state are not counted.
2555 */
2556 static int countValidCursors(BtShared *pBt, int wrOnly){
2557   BtCursor *pCur;
2558   int r = 0;
2559   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2560     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
2561      && pCur->eState!=CURSOR_FAULT ) r++;
2562   }
2563   return r;
2564 }
2565 #endif
2566 
2567 /*
2568 ** If there are no outstanding cursors and we are not in the middle
2569 ** of a transaction but there is a read lock on the database, then
2570 ** this routine unrefs the first page of the database file which
2571 ** has the effect of releasing the read lock.
2572 **
2573 ** If there is a transaction in progress, this routine is a no-op.
2574 */
2575 static void unlockBtreeIfUnused(BtShared *pBt){
2576   assert( sqlite3_mutex_held(pBt->mutex) );
2577   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
2578   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2579     assert( pBt->pPage1->aData );
2580     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2581     assert( pBt->pPage1->aData );
2582     releasePage(pBt->pPage1);
2583     pBt->pPage1 = 0;
2584   }
2585 }
2586 
2587 /*
2588 ** If pBt points to an empty file then convert that empty file
2589 ** into a new empty database by initializing the first page of
2590 ** the database.
2591 */
2592 static int newDatabase(BtShared *pBt){
2593   MemPage *pP1;
2594   unsigned char *data;
2595   int rc;
2596 
2597   assert( sqlite3_mutex_held(pBt->mutex) );
2598   if( pBt->nPage>0 ){
2599     return SQLITE_OK;
2600   }
2601   pP1 = pBt->pPage1;
2602   assert( pP1!=0 );
2603   data = pP1->aData;
2604   rc = sqlite3PagerWrite(pP1->pDbPage);
2605   if( rc ) return rc;
2606   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2607   assert( sizeof(zMagicHeader)==16 );
2608   data[16] = (u8)((pBt->pageSize>>8)&0xff);
2609   data[17] = (u8)((pBt->pageSize>>16)&0xff);
2610   data[18] = 1;
2611   data[19] = 1;
2612   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2613   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2614   data[21] = 64;
2615   data[22] = 32;
2616   data[23] = 32;
2617   memset(&data[24], 0, 100-24);
2618   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2619   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2620 #ifndef SQLITE_OMIT_AUTOVACUUM
2621   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2622   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2623   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2624   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2625 #endif
2626   pBt->nPage = 1;
2627   data[31] = 1;
2628   return SQLITE_OK;
2629 }
2630 
2631 /*
2632 ** Initialize the first page of the database file (creating a database
2633 ** consisting of a single page and no schema objects). Return SQLITE_OK
2634 ** if successful, or an SQLite error code otherwise.
2635 */
2636 int sqlite3BtreeNewDb(Btree *p){
2637   int rc;
2638   sqlite3BtreeEnter(p);
2639   p->pBt->nPage = 0;
2640   rc = newDatabase(p->pBt);
2641   sqlite3BtreeLeave(p);
2642   return rc;
2643 }
2644 
2645 /*
2646 ** Attempt to start a new transaction. A write-transaction
2647 ** is started if the second argument is nonzero, otherwise a read-
2648 ** transaction.  If the second argument is 2 or more and exclusive
2649 ** transaction is started, meaning that no other process is allowed
2650 ** to access the database.  A preexisting transaction may not be
2651 ** upgraded to exclusive by calling this routine a second time - the
2652 ** exclusivity flag only works for a new transaction.
2653 **
2654 ** A write-transaction must be started before attempting any
2655 ** changes to the database.  None of the following routines
2656 ** will work unless a transaction is started first:
2657 **
2658 **      sqlite3BtreeCreateTable()
2659 **      sqlite3BtreeCreateIndex()
2660 **      sqlite3BtreeClearTable()
2661 **      sqlite3BtreeDropTable()
2662 **      sqlite3BtreeInsert()
2663 **      sqlite3BtreeDelete()
2664 **      sqlite3BtreeUpdateMeta()
2665 **
2666 ** If an initial attempt to acquire the lock fails because of lock contention
2667 ** and the database was previously unlocked, then invoke the busy handler
2668 ** if there is one.  But if there was previously a read-lock, do not
2669 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
2670 ** returned when there is already a read-lock in order to avoid a deadlock.
2671 **
2672 ** Suppose there are two processes A and B.  A has a read lock and B has
2673 ** a reserved lock.  B tries to promote to exclusive but is blocked because
2674 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
2675 ** One or the other of the two processes must give way or there can be
2676 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
2677 ** when A already has a read lock, we encourage A to give up and let B
2678 ** proceed.
2679 */
2680 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2681   sqlite3 *pBlock = 0;
2682   BtShared *pBt = p->pBt;
2683   int rc = SQLITE_OK;
2684 
2685   sqlite3BtreeEnter(p);
2686   btreeIntegrity(p);
2687 
2688   /* If the btree is already in a write-transaction, or it
2689   ** is already in a read-transaction and a read-transaction
2690   ** is requested, this is a no-op.
2691   */
2692   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2693     goto trans_begun;
2694   }
2695   assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
2696 
2697   /* Write transactions are not possible on a read-only database */
2698   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
2699     rc = SQLITE_READONLY;
2700     goto trans_begun;
2701   }
2702 
2703 #ifndef SQLITE_OMIT_SHARED_CACHE
2704   /* If another database handle has already opened a write transaction
2705   ** on this shared-btree structure and a second write transaction is
2706   ** requested, return SQLITE_LOCKED.
2707   */
2708   if( (wrflag && pBt->inTransaction==TRANS_WRITE)
2709    || (pBt->btsFlags & BTS_PENDING)!=0
2710   ){
2711     pBlock = pBt->pWriter->db;
2712   }else if( wrflag>1 ){
2713     BtLock *pIter;
2714     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2715       if( pIter->pBtree!=p ){
2716         pBlock = pIter->pBtree->db;
2717         break;
2718       }
2719     }
2720   }
2721   if( pBlock ){
2722     sqlite3ConnectionBlocked(p->db, pBlock);
2723     rc = SQLITE_LOCKED_SHAREDCACHE;
2724     goto trans_begun;
2725   }
2726 #endif
2727 
2728   /* Any read-only or read-write transaction implies a read-lock on
2729   ** page 1. So if some other shared-cache client already has a write-lock
2730   ** on page 1, the transaction cannot be opened. */
2731   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2732   if( SQLITE_OK!=rc ) goto trans_begun;
2733 
2734   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
2735   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
2736   do {
2737     /* Call lockBtree() until either pBt->pPage1 is populated or
2738     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2739     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2740     ** reading page 1 it discovers that the page-size of the database
2741     ** file is not pBt->pageSize. In this case lockBtree() will update
2742     ** pBt->pageSize to the page-size of the file on disk.
2743     */
2744     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2745 
2746     if( rc==SQLITE_OK && wrflag ){
2747       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
2748         rc = SQLITE_READONLY;
2749       }else{
2750         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2751         if( rc==SQLITE_OK ){
2752           rc = newDatabase(pBt);
2753         }
2754       }
2755     }
2756 
2757     if( rc!=SQLITE_OK ){
2758       unlockBtreeIfUnused(pBt);
2759     }
2760   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2761           btreeInvokeBusyHandler(pBt) );
2762 
2763   if( rc==SQLITE_OK ){
2764     if( p->inTrans==TRANS_NONE ){
2765       pBt->nTransaction++;
2766 #ifndef SQLITE_OMIT_SHARED_CACHE
2767       if( p->sharable ){
2768         assert( p->lock.pBtree==p && p->lock.iTable==1 );
2769         p->lock.eLock = READ_LOCK;
2770         p->lock.pNext = pBt->pLock;
2771         pBt->pLock = &p->lock;
2772       }
2773 #endif
2774     }
2775     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2776     if( p->inTrans>pBt->inTransaction ){
2777       pBt->inTransaction = p->inTrans;
2778     }
2779     if( wrflag ){
2780       MemPage *pPage1 = pBt->pPage1;
2781 #ifndef SQLITE_OMIT_SHARED_CACHE
2782       assert( !pBt->pWriter );
2783       pBt->pWriter = p;
2784       pBt->btsFlags &= ~BTS_EXCLUSIVE;
2785       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
2786 #endif
2787 
2788       /* If the db-size header field is incorrect (as it may be if an old
2789       ** client has been writing the database file), update it now. Doing
2790       ** this sooner rather than later means the database size can safely
2791       ** re-read the database size from page 1 if a savepoint or transaction
2792       ** rollback occurs within the transaction.
2793       */
2794       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
2795         rc = sqlite3PagerWrite(pPage1->pDbPage);
2796         if( rc==SQLITE_OK ){
2797           put4byte(&pPage1->aData[28], pBt->nPage);
2798         }
2799       }
2800     }
2801   }
2802 
2803 
2804 trans_begun:
2805   if( rc==SQLITE_OK && wrflag ){
2806     /* This call makes sure that the pager has the correct number of
2807     ** open savepoints. If the second parameter is greater than 0 and
2808     ** the sub-journal is not already open, then it will be opened here.
2809     */
2810     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2811   }
2812 
2813   btreeIntegrity(p);
2814   sqlite3BtreeLeave(p);
2815   return rc;
2816 }
2817 
2818 #ifndef SQLITE_OMIT_AUTOVACUUM
2819 
2820 /*
2821 ** Set the pointer-map entries for all children of page pPage. Also, if
2822 ** pPage contains cells that point to overflow pages, set the pointer
2823 ** map entries for the overflow pages as well.
2824 */
2825 static int setChildPtrmaps(MemPage *pPage){
2826   int i;                             /* Counter variable */
2827   int nCell;                         /* Number of cells in page pPage */
2828   int rc;                            /* Return code */
2829   BtShared *pBt = pPage->pBt;
2830   u8 isInitOrig = pPage->isInit;
2831   Pgno pgno = pPage->pgno;
2832 
2833   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2834   rc = btreeInitPage(pPage);
2835   if( rc!=SQLITE_OK ){
2836     goto set_child_ptrmaps_out;
2837   }
2838   nCell = pPage->nCell;
2839 
2840   for(i=0; i<nCell; i++){
2841     u8 *pCell = findCell(pPage, i);
2842 
2843     ptrmapPutOvflPtr(pPage, pCell, &rc);
2844 
2845     if( !pPage->leaf ){
2846       Pgno childPgno = get4byte(pCell);
2847       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2848     }
2849   }
2850 
2851   if( !pPage->leaf ){
2852     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2853     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2854   }
2855 
2856 set_child_ptrmaps_out:
2857   pPage->isInit = isInitOrig;
2858   return rc;
2859 }
2860 
2861 /*
2862 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
2863 ** that it points to iTo. Parameter eType describes the type of pointer to
2864 ** be modified, as  follows:
2865 **
2866 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2867 **                   page of pPage.
2868 **
2869 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2870 **                   page pointed to by one of the cells on pPage.
2871 **
2872 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2873 **                   overflow page in the list.
2874 */
2875 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2876   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2877   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2878   if( eType==PTRMAP_OVERFLOW2 ){
2879     /* The pointer is always the first 4 bytes of the page in this case.  */
2880     if( get4byte(pPage->aData)!=iFrom ){
2881       return SQLITE_CORRUPT_BKPT;
2882     }
2883     put4byte(pPage->aData, iTo);
2884   }else{
2885     u8 isInitOrig = pPage->isInit;
2886     int i;
2887     int nCell;
2888 
2889     btreeInitPage(pPage);
2890     nCell = pPage->nCell;
2891 
2892     for(i=0; i<nCell; i++){
2893       u8 *pCell = findCell(pPage, i);
2894       if( eType==PTRMAP_OVERFLOW1 ){
2895         CellInfo info;
2896         btreeParseCellPtr(pPage, pCell, &info);
2897         if( info.iOverflow
2898          && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
2899          && iFrom==get4byte(&pCell[info.iOverflow])
2900         ){
2901           put4byte(&pCell[info.iOverflow], iTo);
2902           break;
2903         }
2904       }else{
2905         if( get4byte(pCell)==iFrom ){
2906           put4byte(pCell, iTo);
2907           break;
2908         }
2909       }
2910     }
2911 
2912     if( i==nCell ){
2913       if( eType!=PTRMAP_BTREE ||
2914           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2915         return SQLITE_CORRUPT_BKPT;
2916       }
2917       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2918     }
2919 
2920     pPage->isInit = isInitOrig;
2921   }
2922   return SQLITE_OK;
2923 }
2924 
2925 
2926 /*
2927 ** Move the open database page pDbPage to location iFreePage in the
2928 ** database. The pDbPage reference remains valid.
2929 **
2930 ** The isCommit flag indicates that there is no need to remember that
2931 ** the journal needs to be sync()ed before database page pDbPage->pgno
2932 ** can be written to. The caller has already promised not to write to that
2933 ** page.
2934 */
2935 static int relocatePage(
2936   BtShared *pBt,           /* Btree */
2937   MemPage *pDbPage,        /* Open page to move */
2938   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2939   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2940   Pgno iFreePage,          /* The location to move pDbPage to */
2941   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
2942 ){
2943   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2944   Pgno iDbPage = pDbPage->pgno;
2945   Pager *pPager = pBt->pPager;
2946   int rc;
2947 
2948   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2949       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2950   assert( sqlite3_mutex_held(pBt->mutex) );
2951   assert( pDbPage->pBt==pBt );
2952 
2953   /* Move page iDbPage from its current location to page number iFreePage */
2954   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2955       iDbPage, iFreePage, iPtrPage, eType));
2956   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2957   if( rc!=SQLITE_OK ){
2958     return rc;
2959   }
2960   pDbPage->pgno = iFreePage;
2961 
2962   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2963   ** that point to overflow pages. The pointer map entries for all these
2964   ** pages need to be changed.
2965   **
2966   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2967   ** pointer to a subsequent overflow page. If this is the case, then
2968   ** the pointer map needs to be updated for the subsequent overflow page.
2969   */
2970   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2971     rc = setChildPtrmaps(pDbPage);
2972     if( rc!=SQLITE_OK ){
2973       return rc;
2974     }
2975   }else{
2976     Pgno nextOvfl = get4byte(pDbPage->aData);
2977     if( nextOvfl!=0 ){
2978       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2979       if( rc!=SQLITE_OK ){
2980         return rc;
2981       }
2982     }
2983   }
2984 
2985   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2986   ** that it points at iFreePage. Also fix the pointer map entry for
2987   ** iPtrPage.
2988   */
2989   if( eType!=PTRMAP_ROOTPAGE ){
2990     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2991     if( rc!=SQLITE_OK ){
2992       return rc;
2993     }
2994     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2995     if( rc!=SQLITE_OK ){
2996       releasePage(pPtrPage);
2997       return rc;
2998     }
2999     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3000     releasePage(pPtrPage);
3001     if( rc==SQLITE_OK ){
3002       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3003     }
3004   }
3005   return rc;
3006 }
3007 
3008 /* Forward declaration required by incrVacuumStep(). */
3009 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3010 
3011 /*
3012 ** Perform a single step of an incremental-vacuum. If successful, return
3013 ** SQLITE_OK. If there is no work to do (and therefore no point in
3014 ** calling this function again), return SQLITE_DONE. Or, if an error
3015 ** occurs, return some other error code.
3016 **
3017 ** More specificly, this function attempts to re-organize the database so
3018 ** that the last page of the file currently in use is no longer in use.
3019 **
3020 ** Parameter nFin is the number of pages that this database would contain
3021 ** were this function called until it returns SQLITE_DONE.
3022 **
3023 ** If the bCommit parameter is non-zero, this function assumes that the
3024 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3025 ** or an error. bCommit is passed true for an auto-vacuum-on-commmit
3026 ** operation, or false for an incremental vacuum.
3027 */
3028 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3029   Pgno nFreeList;           /* Number of pages still on the free-list */
3030   int rc;
3031 
3032   assert( sqlite3_mutex_held(pBt->mutex) );
3033   assert( iLastPg>nFin );
3034 
3035   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3036     u8 eType;
3037     Pgno iPtrPage;
3038 
3039     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3040     if( nFreeList==0 ){
3041       return SQLITE_DONE;
3042     }
3043 
3044     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3045     if( rc!=SQLITE_OK ){
3046       return rc;
3047     }
3048     if( eType==PTRMAP_ROOTPAGE ){
3049       return SQLITE_CORRUPT_BKPT;
3050     }
3051 
3052     if( eType==PTRMAP_FREEPAGE ){
3053       if( bCommit==0 ){
3054         /* Remove the page from the files free-list. This is not required
3055         ** if bCommit is non-zero. In that case, the free-list will be
3056         ** truncated to zero after this function returns, so it doesn't
3057         ** matter if it still contains some garbage entries.
3058         */
3059         Pgno iFreePg;
3060         MemPage *pFreePg;
3061         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3062         if( rc!=SQLITE_OK ){
3063           return rc;
3064         }
3065         assert( iFreePg==iLastPg );
3066         releasePage(pFreePg);
3067       }
3068     } else {
3069       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3070       MemPage *pLastPg;
3071       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3072       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3073 
3074       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3075       if( rc!=SQLITE_OK ){
3076         return rc;
3077       }
3078 
3079       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3080       ** is swapped with the first free page pulled off the free list.
3081       **
3082       ** On the other hand, if bCommit is greater than zero, then keep
3083       ** looping until a free-page located within the first nFin pages
3084       ** of the file is found.
3085       */
3086       if( bCommit==0 ){
3087         eMode = BTALLOC_LE;
3088         iNear = nFin;
3089       }
3090       do {
3091         MemPage *pFreePg;
3092         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3093         if( rc!=SQLITE_OK ){
3094           releasePage(pLastPg);
3095           return rc;
3096         }
3097         releasePage(pFreePg);
3098       }while( bCommit && iFreePg>nFin );
3099       assert( iFreePg<iLastPg );
3100 
3101       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3102       releasePage(pLastPg);
3103       if( rc!=SQLITE_OK ){
3104         return rc;
3105       }
3106     }
3107   }
3108 
3109   if( bCommit==0 ){
3110     do {
3111       iLastPg--;
3112     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3113     pBt->bDoTruncate = 1;
3114     pBt->nPage = iLastPg;
3115   }
3116   return SQLITE_OK;
3117 }
3118 
3119 /*
3120 ** The database opened by the first argument is an auto-vacuum database
3121 ** nOrig pages in size containing nFree free pages. Return the expected
3122 ** size of the database in pages following an auto-vacuum operation.
3123 */
3124 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3125   int nEntry;                     /* Number of entries on one ptrmap page */
3126   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3127   Pgno nFin;                      /* Return value */
3128 
3129   nEntry = pBt->usableSize/5;
3130   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3131   nFin = nOrig - nFree - nPtrmap;
3132   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3133     nFin--;
3134   }
3135   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3136     nFin--;
3137   }
3138 
3139   return nFin;
3140 }
3141 
3142 /*
3143 ** A write-transaction must be opened before calling this function.
3144 ** It performs a single unit of work towards an incremental vacuum.
3145 **
3146 ** If the incremental vacuum is finished after this function has run,
3147 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3148 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3149 */
3150 int sqlite3BtreeIncrVacuum(Btree *p){
3151   int rc;
3152   BtShared *pBt = p->pBt;
3153 
3154   sqlite3BtreeEnter(p);
3155   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3156   if( !pBt->autoVacuum ){
3157     rc = SQLITE_DONE;
3158   }else{
3159     Pgno nOrig = btreePagecount(pBt);
3160     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3161     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3162 
3163     if( nOrig<nFin ){
3164       rc = SQLITE_CORRUPT_BKPT;
3165     }else if( nFree>0 ){
3166       rc = saveAllCursors(pBt, 0, 0);
3167       if( rc==SQLITE_OK ){
3168         invalidateAllOverflowCache(pBt);
3169         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3170       }
3171       if( rc==SQLITE_OK ){
3172         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3173         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3174       }
3175     }else{
3176       rc = SQLITE_DONE;
3177     }
3178   }
3179   sqlite3BtreeLeave(p);
3180   return rc;
3181 }
3182 
3183 /*
3184 ** This routine is called prior to sqlite3PagerCommit when a transaction
3185 ** is committed for an auto-vacuum database.
3186 **
3187 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3188 ** the database file should be truncated to during the commit process.
3189 ** i.e. the database has been reorganized so that only the first *pnTrunc
3190 ** pages are in use.
3191 */
3192 static int autoVacuumCommit(BtShared *pBt){
3193   int rc = SQLITE_OK;
3194   Pager *pPager = pBt->pPager;
3195   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3196 
3197   assert( sqlite3_mutex_held(pBt->mutex) );
3198   invalidateAllOverflowCache(pBt);
3199   assert(pBt->autoVacuum);
3200   if( !pBt->incrVacuum ){
3201     Pgno nFin;         /* Number of pages in database after autovacuuming */
3202     Pgno nFree;        /* Number of pages on the freelist initially */
3203     Pgno iFree;        /* The next page to be freed */
3204     Pgno nOrig;        /* Database size before freeing */
3205 
3206     nOrig = btreePagecount(pBt);
3207     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3208       /* It is not possible to create a database for which the final page
3209       ** is either a pointer-map page or the pending-byte page. If one
3210       ** is encountered, this indicates corruption.
3211       */
3212       return SQLITE_CORRUPT_BKPT;
3213     }
3214 
3215     nFree = get4byte(&pBt->pPage1->aData[36]);
3216     nFin = finalDbSize(pBt, nOrig, nFree);
3217     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3218     if( nFin<nOrig ){
3219       rc = saveAllCursors(pBt, 0, 0);
3220     }
3221     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3222       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3223     }
3224     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3225       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3226       put4byte(&pBt->pPage1->aData[32], 0);
3227       put4byte(&pBt->pPage1->aData[36], 0);
3228       put4byte(&pBt->pPage1->aData[28], nFin);
3229       pBt->bDoTruncate = 1;
3230       pBt->nPage = nFin;
3231     }
3232     if( rc!=SQLITE_OK ){
3233       sqlite3PagerRollback(pPager);
3234     }
3235   }
3236 
3237   assert( nRef>=sqlite3PagerRefcount(pPager) );
3238   return rc;
3239 }
3240 
3241 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3242 # define setChildPtrmaps(x) SQLITE_OK
3243 #endif
3244 
3245 /*
3246 ** This routine does the first phase of a two-phase commit.  This routine
3247 ** causes a rollback journal to be created (if it does not already exist)
3248 ** and populated with enough information so that if a power loss occurs
3249 ** the database can be restored to its original state by playing back
3250 ** the journal.  Then the contents of the journal are flushed out to
3251 ** the disk.  After the journal is safely on oxide, the changes to the
3252 ** database are written into the database file and flushed to oxide.
3253 ** At the end of this call, the rollback journal still exists on the
3254 ** disk and we are still holding all locks, so the transaction has not
3255 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3256 ** commit process.
3257 **
3258 ** This call is a no-op if no write-transaction is currently active on pBt.
3259 **
3260 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3261 ** the name of a master journal file that should be written into the
3262 ** individual journal file, or is NULL, indicating no master journal file
3263 ** (single database transaction).
3264 **
3265 ** When this is called, the master journal should already have been
3266 ** created, populated with this journal pointer and synced to disk.
3267 **
3268 ** Once this is routine has returned, the only thing required to commit
3269 ** the write-transaction for this database file is to delete the journal.
3270 */
3271 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3272   int rc = SQLITE_OK;
3273   if( p->inTrans==TRANS_WRITE ){
3274     BtShared *pBt = p->pBt;
3275     sqlite3BtreeEnter(p);
3276 #ifndef SQLITE_OMIT_AUTOVACUUM
3277     if( pBt->autoVacuum ){
3278       rc = autoVacuumCommit(pBt);
3279       if( rc!=SQLITE_OK ){
3280         sqlite3BtreeLeave(p);
3281         return rc;
3282       }
3283     }
3284     if( pBt->bDoTruncate ){
3285       sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
3286     }
3287 #endif
3288     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3289     sqlite3BtreeLeave(p);
3290   }
3291   return rc;
3292 }
3293 
3294 /*
3295 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3296 ** at the conclusion of a transaction.
3297 */
3298 static void btreeEndTransaction(Btree *p){
3299   BtShared *pBt = p->pBt;
3300   sqlite3 *db = p->db;
3301   assert( sqlite3BtreeHoldsMutex(p) );
3302 
3303 #ifndef SQLITE_OMIT_AUTOVACUUM
3304   pBt->bDoTruncate = 0;
3305 #endif
3306   if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
3307     /* If there are other active statements that belong to this database
3308     ** handle, downgrade to a read-only transaction. The other statements
3309     ** may still be reading from the database.  */
3310     downgradeAllSharedCacheTableLocks(p);
3311     p->inTrans = TRANS_READ;
3312   }else{
3313     /* If the handle had any kind of transaction open, decrement the
3314     ** transaction count of the shared btree. If the transaction count
3315     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3316     ** call below will unlock the pager.  */
3317     if( p->inTrans!=TRANS_NONE ){
3318       clearAllSharedCacheTableLocks(p);
3319       pBt->nTransaction--;
3320       if( 0==pBt->nTransaction ){
3321         pBt->inTransaction = TRANS_NONE;
3322       }
3323     }
3324 
3325     /* Set the current transaction state to TRANS_NONE and unlock the
3326     ** pager if this call closed the only read or write transaction.  */
3327     p->inTrans = TRANS_NONE;
3328     unlockBtreeIfUnused(pBt);
3329   }
3330 
3331   btreeIntegrity(p);
3332 }
3333 
3334 /*
3335 ** Commit the transaction currently in progress.
3336 **
3337 ** This routine implements the second phase of a 2-phase commit.  The
3338 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3339 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3340 ** routine did all the work of writing information out to disk and flushing the
3341 ** contents so that they are written onto the disk platter.  All this
3342 ** routine has to do is delete or truncate or zero the header in the
3343 ** the rollback journal (which causes the transaction to commit) and
3344 ** drop locks.
3345 **
3346 ** Normally, if an error occurs while the pager layer is attempting to
3347 ** finalize the underlying journal file, this function returns an error and
3348 ** the upper layer will attempt a rollback. However, if the second argument
3349 ** is non-zero then this b-tree transaction is part of a multi-file
3350 ** transaction. In this case, the transaction has already been committed
3351 ** (by deleting a master journal file) and the caller will ignore this
3352 ** functions return code. So, even if an error occurs in the pager layer,
3353 ** reset the b-tree objects internal state to indicate that the write
3354 ** transaction has been closed. This is quite safe, as the pager will have
3355 ** transitioned to the error state.
3356 **
3357 ** This will release the write lock on the database file.  If there
3358 ** are no active cursors, it also releases the read lock.
3359 */
3360 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3361 
3362   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3363   sqlite3BtreeEnter(p);
3364   btreeIntegrity(p);
3365 
3366   /* If the handle has a write-transaction open, commit the shared-btrees
3367   ** transaction and set the shared state to TRANS_READ.
3368   */
3369   if( p->inTrans==TRANS_WRITE ){
3370     int rc;
3371     BtShared *pBt = p->pBt;
3372     assert( pBt->inTransaction==TRANS_WRITE );
3373     assert( pBt->nTransaction>0 );
3374     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3375     if( rc!=SQLITE_OK && bCleanup==0 ){
3376       sqlite3BtreeLeave(p);
3377       return rc;
3378     }
3379     pBt->inTransaction = TRANS_READ;
3380     btreeClearHasContent(pBt);
3381   }
3382 
3383   btreeEndTransaction(p);
3384   sqlite3BtreeLeave(p);
3385   return SQLITE_OK;
3386 }
3387 
3388 /*
3389 ** Do both phases of a commit.
3390 */
3391 int sqlite3BtreeCommit(Btree *p){
3392   int rc;
3393   sqlite3BtreeEnter(p);
3394   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3395   if( rc==SQLITE_OK ){
3396     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3397   }
3398   sqlite3BtreeLeave(p);
3399   return rc;
3400 }
3401 
3402 /*
3403 ** This routine sets the state to CURSOR_FAULT and the error
3404 ** code to errCode for every cursor on BtShared that pBtree
3405 ** references.
3406 **
3407 ** Every cursor is tripped, including cursors that belong
3408 ** to other database connections that happen to be sharing
3409 ** the cache with pBtree.
3410 **
3411 ** This routine gets called when a rollback occurs.
3412 ** All cursors using the same cache must be tripped
3413 ** to prevent them from trying to use the btree after
3414 ** the rollback.  The rollback may have deleted tables
3415 ** or moved root pages, so it is not sufficient to
3416 ** save the state of the cursor.  The cursor must be
3417 ** invalidated.
3418 */
3419 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3420   BtCursor *p;
3421   if( pBtree==0 ) return;
3422   sqlite3BtreeEnter(pBtree);
3423   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3424     int i;
3425     sqlite3BtreeClearCursor(p);
3426     p->eState = CURSOR_FAULT;
3427     p->skipNext = errCode;
3428     for(i=0; i<=p->iPage; i++){
3429       releasePage(p->apPage[i]);
3430       p->apPage[i] = 0;
3431     }
3432   }
3433   sqlite3BtreeLeave(pBtree);
3434 }
3435 
3436 /*
3437 ** Rollback the transaction in progress.  All cursors will be
3438 ** invalidated by this operation.  Any attempt to use a cursor
3439 ** that was open at the beginning of this operation will result
3440 ** in an error.
3441 **
3442 ** This will release the write lock on the database file.  If there
3443 ** are no active cursors, it also releases the read lock.
3444 */
3445 int sqlite3BtreeRollback(Btree *p, int tripCode){
3446   int rc;
3447   BtShared *pBt = p->pBt;
3448   MemPage *pPage1;
3449 
3450   sqlite3BtreeEnter(p);
3451   if( tripCode==SQLITE_OK ){
3452     rc = tripCode = saveAllCursors(pBt, 0, 0);
3453   }else{
3454     rc = SQLITE_OK;
3455   }
3456   if( tripCode ){
3457     sqlite3BtreeTripAllCursors(p, tripCode);
3458   }
3459   btreeIntegrity(p);
3460 
3461   if( p->inTrans==TRANS_WRITE ){
3462     int rc2;
3463 
3464     assert( TRANS_WRITE==pBt->inTransaction );
3465     rc2 = sqlite3PagerRollback(pBt->pPager);
3466     if( rc2!=SQLITE_OK ){
3467       rc = rc2;
3468     }
3469 
3470     /* The rollback may have destroyed the pPage1->aData value.  So
3471     ** call btreeGetPage() on page 1 again to make
3472     ** sure pPage1->aData is set correctly. */
3473     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3474       int nPage = get4byte(28+(u8*)pPage1->aData);
3475       testcase( nPage==0 );
3476       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3477       testcase( pBt->nPage!=nPage );
3478       pBt->nPage = nPage;
3479       releasePage(pPage1);
3480     }
3481     assert( countValidCursors(pBt, 1)==0 );
3482     pBt->inTransaction = TRANS_READ;
3483     btreeClearHasContent(pBt);
3484   }
3485 
3486   btreeEndTransaction(p);
3487   sqlite3BtreeLeave(p);
3488   return rc;
3489 }
3490 
3491 /*
3492 ** Start a statement subtransaction. The subtransaction can be rolled
3493 ** back independently of the main transaction. You must start a transaction
3494 ** before starting a subtransaction. The subtransaction is ended automatically
3495 ** if the main transaction commits or rolls back.
3496 **
3497 ** Statement subtransactions are used around individual SQL statements
3498 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3499 ** error occurs within the statement, the effect of that one statement
3500 ** can be rolled back without having to rollback the entire transaction.
3501 **
3502 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3503 ** value passed as the second parameter is the total number of savepoints,
3504 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3505 ** are no active savepoints and no other statement-transactions open,
3506 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3507 ** using the sqlite3BtreeSavepoint() function.
3508 */
3509 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3510   int rc;
3511   BtShared *pBt = p->pBt;
3512   sqlite3BtreeEnter(p);
3513   assert( p->inTrans==TRANS_WRITE );
3514   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
3515   assert( iStatement>0 );
3516   assert( iStatement>p->db->nSavepoint );
3517   assert( pBt->inTransaction==TRANS_WRITE );
3518   /* At the pager level, a statement transaction is a savepoint with
3519   ** an index greater than all savepoints created explicitly using
3520   ** SQL statements. It is illegal to open, release or rollback any
3521   ** such savepoints while the statement transaction savepoint is active.
3522   */
3523   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3524   sqlite3BtreeLeave(p);
3525   return rc;
3526 }
3527 
3528 /*
3529 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3530 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3531 ** savepoint identified by parameter iSavepoint, depending on the value
3532 ** of op.
3533 **
3534 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3535 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3536 ** contents of the entire transaction are rolled back. This is different
3537 ** from a normal transaction rollback, as no locks are released and the
3538 ** transaction remains open.
3539 */
3540 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3541   int rc = SQLITE_OK;
3542   if( p && p->inTrans==TRANS_WRITE ){
3543     BtShared *pBt = p->pBt;
3544     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3545     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3546     sqlite3BtreeEnter(p);
3547     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3548     if( rc==SQLITE_OK ){
3549       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
3550         pBt->nPage = 0;
3551       }
3552       rc = newDatabase(pBt);
3553       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3554 
3555       /* The database size was written into the offset 28 of the header
3556       ** when the transaction started, so we know that the value at offset
3557       ** 28 is nonzero. */
3558       assert( pBt->nPage>0 );
3559     }
3560     sqlite3BtreeLeave(p);
3561   }
3562   return rc;
3563 }
3564 
3565 /*
3566 ** Create a new cursor for the BTree whose root is on the page
3567 ** iTable. If a read-only cursor is requested, it is assumed that
3568 ** the caller already has at least a read-only transaction open
3569 ** on the database. If a write-cursor is requested, then
3570 ** the caller is assumed to have an open write transaction.
3571 **
3572 ** If wrFlag==0, then the cursor can only be used for reading.
3573 ** If wrFlag==1, then the cursor can be used for reading or for
3574 ** writing if other conditions for writing are also met.  These
3575 ** are the conditions that must be met in order for writing to
3576 ** be allowed:
3577 **
3578 ** 1:  The cursor must have been opened with wrFlag==1
3579 **
3580 ** 2:  Other database connections that share the same pager cache
3581 **     but which are not in the READ_UNCOMMITTED state may not have
3582 **     cursors open with wrFlag==0 on the same table.  Otherwise
3583 **     the changes made by this write cursor would be visible to
3584 **     the read cursors in the other database connection.
3585 **
3586 ** 3:  The database must be writable (not on read-only media)
3587 **
3588 ** 4:  There must be an active transaction.
3589 **
3590 ** No checking is done to make sure that page iTable really is the
3591 ** root page of a b-tree.  If it is not, then the cursor acquired
3592 ** will not work correctly.
3593 **
3594 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3595 ** on pCur to initialize the memory space prior to invoking this routine.
3596 */
3597 static int btreeCursor(
3598   Btree *p,                              /* The btree */
3599   int iTable,                            /* Root page of table to open */
3600   int wrFlag,                            /* 1 to write. 0 read-only */
3601   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3602   BtCursor *pCur                         /* Space for new cursor */
3603 ){
3604   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
3605 
3606   assert( sqlite3BtreeHoldsMutex(p) );
3607   assert( wrFlag==0 || wrFlag==1 );
3608 
3609   /* The following assert statements verify that if this is a sharable
3610   ** b-tree database, the connection is holding the required table locks,
3611   ** and that no other connection has any open cursor that conflicts with
3612   ** this lock.  */
3613   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3614   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3615 
3616   /* Assert that the caller has opened the required transaction. */
3617   assert( p->inTrans>TRANS_NONE );
3618   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3619   assert( pBt->pPage1 && pBt->pPage1->aData );
3620 
3621   if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
3622     return SQLITE_READONLY;
3623   }
3624   if( iTable==1 && btreePagecount(pBt)==0 ){
3625     assert( wrFlag==0 );
3626     iTable = 0;
3627   }
3628 
3629   /* Now that no other errors can occur, finish filling in the BtCursor
3630   ** variables and link the cursor into the BtShared list.  */
3631   pCur->pgnoRoot = (Pgno)iTable;
3632   pCur->iPage = -1;
3633   pCur->pKeyInfo = pKeyInfo;
3634   pCur->pBtree = p;
3635   pCur->pBt = pBt;
3636   assert( wrFlag==0 || wrFlag==BTCF_WriteFlag );
3637   pCur->curFlags = wrFlag;
3638   pCur->pNext = pBt->pCursor;
3639   if( pCur->pNext ){
3640     pCur->pNext->pPrev = pCur;
3641   }
3642   pBt->pCursor = pCur;
3643   pCur->eState = CURSOR_INVALID;
3644   return SQLITE_OK;
3645 }
3646 int sqlite3BtreeCursor(
3647   Btree *p,                                   /* The btree */
3648   int iTable,                                 /* Root page of table to open */
3649   int wrFlag,                                 /* 1 to write. 0 read-only */
3650   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
3651   BtCursor *pCur                              /* Write new cursor here */
3652 ){
3653   int rc;
3654   sqlite3BtreeEnter(p);
3655   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3656   sqlite3BtreeLeave(p);
3657   return rc;
3658 }
3659 
3660 /*
3661 ** Return the size of a BtCursor object in bytes.
3662 **
3663 ** This interface is needed so that users of cursors can preallocate
3664 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3665 ** to users so they cannot do the sizeof() themselves - they must call
3666 ** this routine.
3667 */
3668 int sqlite3BtreeCursorSize(void){
3669   return ROUND8(sizeof(BtCursor));
3670 }
3671 
3672 /*
3673 ** Initialize memory that will be converted into a BtCursor object.
3674 **
3675 ** The simple approach here would be to memset() the entire object
3676 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
3677 ** do not need to be zeroed and they are large, so we can save a lot
3678 ** of run-time by skipping the initialization of those elements.
3679 */
3680 void sqlite3BtreeCursorZero(BtCursor *p){
3681   memset(p, 0, offsetof(BtCursor, iPage));
3682 }
3683 
3684 /*
3685 ** Close a cursor.  The read lock on the database file is released
3686 ** when the last cursor is closed.
3687 */
3688 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3689   Btree *pBtree = pCur->pBtree;
3690   if( pBtree ){
3691     int i;
3692     BtShared *pBt = pCur->pBt;
3693     sqlite3BtreeEnter(pBtree);
3694     sqlite3BtreeClearCursor(pCur);
3695     if( pCur->pPrev ){
3696       pCur->pPrev->pNext = pCur->pNext;
3697     }else{
3698       pBt->pCursor = pCur->pNext;
3699     }
3700     if( pCur->pNext ){
3701       pCur->pNext->pPrev = pCur->pPrev;
3702     }
3703     for(i=0; i<=pCur->iPage; i++){
3704       releasePage(pCur->apPage[i]);
3705     }
3706     unlockBtreeIfUnused(pBt);
3707     sqlite3DbFree(pBtree->db, pCur->aOverflow);
3708     /* sqlite3_free(pCur); */
3709     sqlite3BtreeLeave(pBtree);
3710   }
3711   return SQLITE_OK;
3712 }
3713 
3714 /*
3715 ** Make sure the BtCursor* given in the argument has a valid
3716 ** BtCursor.info structure.  If it is not already valid, call
3717 ** btreeParseCell() to fill it in.
3718 **
3719 ** BtCursor.info is a cache of the information in the current cell.
3720 ** Using this cache reduces the number of calls to btreeParseCell().
3721 **
3722 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
3723 ** compiler to crash when getCellInfo() is implemented as a macro.
3724 ** But there is a measurable speed advantage to using the macro on gcc
3725 ** (when fewer compiler optimizations like -Os or -O0 are used and the
3726 ** compiler is not doing aggressive inlining.)  So we use a real function
3727 ** for MSVC and a macro for everything else.  Ticket #2457.
3728 */
3729 #ifndef NDEBUG
3730   static void assertCellInfo(BtCursor *pCur){
3731     CellInfo info;
3732     int iPage = pCur->iPage;
3733     memset(&info, 0, sizeof(info));
3734     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3735     assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
3736   }
3737 #else
3738   #define assertCellInfo(x)
3739 #endif
#ifdef _MSC_VER
  /* Use a real function in MSVC to work around bugs in that compiler.
  **
  ** If the cached BtCursor.info is empty (info.nSize==0), parse the cell
  ** the cursor currently points at into it and set BTCF_ValidNKey.
  ** Otherwise verify (debug builds only) that the cache is still accurate.
  */
  static void getCellInfo(BtCursor *pCur){
    if( pCur->info.nSize==0 ){
      int iPage = pCur->iPage;
      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
      pCur->curFlags |= BTCF_ValidNKey;
    }else{
      assertCellInfo(pCur);
    }
  }
#else /* if not _MSC_VER */
  /* Use a macro in all other compilers so that the function is inlined.
  **
  ** The expansion is wrapped in do{...}while(0) so that "getCellInfo(x);"
  ** is exactly one statement and is safe inside an unbraced if/else.  The
  ** argument is parenthesized everywhere it is used, and no local variable
  ** is declared, so the argument expression cannot be captured by a name
  ** inside the macro.  Note that the argument is evaluated more than once
  ** and therefore must not have side effects.
  */
#define getCellInfo(pCur)                                                      \
  do{                                                                          \
    if( (pCur)->info.nSize==0 ){                                               \
      btreeParseCell((pCur)->apPage[(pCur)->iPage],                            \
                     (pCur)->aiIdx[(pCur)->iPage], &(pCur)->info);             \
      (pCur)->curFlags |= BTCF_ValidNKey;                                      \
    }else{                                                                     \
      assertCellInfo(pCur);                                                    \
    }                                                                          \
  }while(0)
#endif /* _MSC_VER */
3762 
3763 #ifndef NDEBUG  /* The next routine used only within assert() statements */
3764 /*
3765 ** Return true if the given BtCursor is valid.  A valid cursor is one
3766 ** that is currently pointing to a row in a (non-empty) table.
3767 ** This is a verification routine is used only within assert() statements.
3768 */
3769 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3770   return pCur && pCur->eState==CURSOR_VALID;
3771 }
3772 #endif /* NDEBUG */
3773 
3774 /*
3775 ** Set *pSize to the size of the buffer needed to hold the value of
3776 ** the key for the current entry.  If the cursor is not pointing
3777 ** to a valid entry, *pSize is set to 0.
3778 **
3779 ** For a table with the INTKEY flag set, this routine returns the key
3780 ** itself, not the number of bytes in the key.
3781 **
3782 ** The caller must position the cursor prior to invoking this routine.
3783 **
3784 ** This routine cannot fail.  It always returns SQLITE_OK.
3785 */
3786 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3787   assert( cursorHoldsMutex(pCur) );
3788   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3789   if( pCur->eState!=CURSOR_VALID ){
3790     *pSize = 0;
3791   }else{
3792     getCellInfo(pCur);
3793     *pSize = pCur->info.nKey;
3794   }
3795   return SQLITE_OK;
3796 }
3797 
3798 /*
3799 ** Set *pSize to the number of bytes of data in the entry the
3800 ** cursor currently points to.
3801 **
3802 ** The caller must guarantee that the cursor is pointing to a non-NULL
3803 ** valid entry.  In other words, the calling procedure must guarantee
3804 ** that the cursor has Cursor.eState==CURSOR_VALID.
3805 **
3806 ** Failure is not possible.  This function always returns SQLITE_OK.
3807 ** It might just as well be a procedure (returning void) but we continue
3808 ** to return an integer result code for historical reasons.
3809 */
3810 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3811   assert( cursorHoldsMutex(pCur) );
3812   assert( pCur->eState==CURSOR_VALID );
3813   getCellInfo(pCur);
3814   *pSize = pCur->info.nData;
3815   return SQLITE_OK;
3816 }
3817 
3818 /*
3819 ** Given the page number of an overflow page in the database (parameter
3820 ** ovfl), this function finds the page number of the next page in the
3821 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3822 ** pointer-map data instead of reading the content of page ovfl to do so.
3823 **
3824 ** If an error occurs an SQLite error code is returned. Otherwise:
3825 **
3826 ** The page number of the next overflow page in the linked list is
3827 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3828 ** list, *pPgnoNext is set to zero.
3829 **
3830 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3831 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3832 ** reference. It is the responsibility of the caller to call releasePage()
3833 ** on *ppPage to free the reference. In no reference was obtained (because
3834 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3835 ** *ppPage is set to zero.
3836 */
3837 static int getOverflowPage(
3838   BtShared *pBt,               /* The database file */
3839   Pgno ovfl,                   /* Current overflow page number */
3840   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3841   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3842 ){
3843   Pgno next = 0;
3844   MemPage *pPage = 0;
3845   int rc = SQLITE_OK;
3846 
3847   assert( sqlite3_mutex_held(pBt->mutex) );
3848   assert(pPgnoNext);
3849 
3850 #ifndef SQLITE_OMIT_AUTOVACUUM
3851   /* Try to find the next page in the overflow list using the
3852   ** autovacuum pointer-map pages. Guess that the next page in
3853   ** the overflow list is page number (ovfl+1). If that guess turns
3854   ** out to be wrong, fall back to loading the data of page
3855   ** number ovfl to determine the next page number.
3856   */
3857   if( pBt->autoVacuum ){
3858     Pgno pgno;
3859     Pgno iGuess = ovfl+1;
3860     u8 eType;
3861 
3862     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3863       iGuess++;
3864     }
3865 
3866     if( iGuess<=btreePagecount(pBt) ){
3867       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3868       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3869         next = iGuess;
3870         rc = SQLITE_DONE;
3871       }
3872     }
3873   }
3874 #endif
3875 
3876   assert( next==0 || rc==SQLITE_DONE );
3877   if( rc==SQLITE_OK ){
3878     rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
3879     assert( rc==SQLITE_OK || pPage==0 );
3880     if( rc==SQLITE_OK ){
3881       next = get4byte(pPage->aData);
3882     }
3883   }
3884 
3885   *pPgnoNext = next;
3886   if( ppPage ){
3887     *ppPage = pPage;
3888   }else{
3889     releasePage(pPage);
3890   }
3891   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3892 }
3893 
3894 /*
3895 ** Copy data from a buffer to a page, or from a page to a buffer.
3896 **
3897 ** pPayload is a pointer to data stored on database page pDbPage.
3898 ** If argument eOp is false, then nByte bytes of data are copied
3899 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3900 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3901 ** of data are copied from the buffer pBuf to pPayload.
3902 **
3903 ** SQLITE_OK is returned on success, otherwise an error code.
3904 */
3905 static int copyPayload(
3906   void *pPayload,           /* Pointer to page data */
3907   void *pBuf,               /* Pointer to buffer */
3908   int nByte,                /* Number of bytes to copy */
3909   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3910   DbPage *pDbPage           /* Page containing pPayload */
3911 ){
3912   if( eOp ){
3913     /* Copy data from buffer to page (a write operation) */
3914     int rc = sqlite3PagerWrite(pDbPage);
3915     if( rc!=SQLITE_OK ){
3916       return rc;
3917     }
3918     memcpy(pPayload, pBuf, nByte);
3919   }else{
3920     /* Copy data from page to buffer (a read operation) */
3921     memcpy(pBuf, pPayload, nByte);
3922   }
3923   return SQLITE_OK;
3924 }
3925 
3926 /*
3927 ** This function is used to read or overwrite payload information
3928 ** for the entry that the pCur cursor is pointing to. The eOp
3929 ** argument is interpreted as follows:
3930 **
3931 **   0: The operation is a read. Populate the overflow cache.
3932 **   1: The operation is a write. Populate the overflow cache.
3933 **   2: The operation is a read. Do not populate the overflow cache.
3934 **
3935 ** A total of "amt" bytes are read or written beginning at "offset".
3936 ** Data is read to or from the buffer pBuf.
3937 **
3938 ** The content being read or written might appear on the main page
3939 ** or be scattered out on multiple overflow pages.
3940 **
3941 ** If the current cursor entry uses one or more overflow pages and the
3942 ** eOp argument is not 2, this function may allocate space for and lazily
3943 ** populate the overflow page-list cache array (BtCursor.aOverflow).
3944 ** Subsequent calls use this cache to make seeking to the supplied offset
3945 ** more efficient.
3946 **
3947 ** Once an overflow page-list cache has been allocated, it may be
3948 ** invalidated if some other cursor writes to the same table, or if
3949 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3950 ** mode, the following events may invalidate an overflow page-list cache.
3951 **
3952 **   * An incremental vacuum,
3953 **   * A commit in auto_vacuum="full" mode,
3954 **   * Creating a table (may require moving an overflow page).
3955 */
3956 static int accessPayload(
3957   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3958   u32 offset,          /* Begin reading this far into payload */
3959   u32 amt,             /* Read this many bytes */
3960   unsigned char *pBuf, /* Write the bytes into this buffer */
3961   int eOp              /* zero to read. non-zero to write. */
3962 ){
3963   unsigned char *aPayload;
3964   int rc = SQLITE_OK;
3965   u32 nKey;
3966   int iIdx = 0;
3967   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3968   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3969 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3970   int bEnd;                                   /* True if reading to end of data */
3971 #endif
3972 
3973   assert( pPage );
3974   assert( pCur->eState==CURSOR_VALID );
3975   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3976   assert( cursorHoldsMutex(pCur) );
3977   assert( eOp!=2 || offset==0 );      /* Always start from beginning for eOp==2 */
3978 
3979   getCellInfo(pCur);
3980   aPayload = pCur->info.pCell + pCur->info.nHeader;
3981   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3982 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3983   bEnd = (offset+amt==nKey+pCur->info.nData);
3984 #endif
3985 
3986   if( NEVER(offset+amt > nKey+pCur->info.nData)
3987    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3988   ){
3989     /* Trying to read or write past the end of the data is an error */
3990     return SQLITE_CORRUPT_BKPT;
3991   }
3992 
3993   /* Check if data must be read/written to/from the btree page itself. */
3994   if( offset<pCur->info.nLocal ){
3995     int a = amt;
3996     if( a+offset>pCur->info.nLocal ){
3997       a = pCur->info.nLocal - offset;
3998     }
3999     rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
4000     offset = 0;
4001     pBuf += a;
4002     amt -= a;
4003   }else{
4004     offset -= pCur->info.nLocal;
4005   }
4006 
4007   if( rc==SQLITE_OK && amt>0 ){
4008     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
4009     Pgno nextPage;
4010 
4011     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
4012 
4013     /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
4014     ** Except, do not allocate aOverflow[] for eOp==2.
4015     **
4016     ** The aOverflow[] array is sized at one entry for each overflow page
4017     ** in the overflow chain. The page number of the first overflow page is
4018     ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
4019     ** means "not yet known" (the cache is lazily populated).
4020     */
4021     if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
4022       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
4023       if( nOvfl>pCur->nOvflAlloc ){
4024         Pgno *aNew = (Pgno*)sqlite3DbRealloc(
4025             pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno)
4026         );
4027         if( aNew==0 ){
4028           rc = SQLITE_NOMEM;
4029         }else{
4030           pCur->nOvflAlloc = nOvfl*2;
4031           pCur->aOverflow = aNew;
4032         }
4033       }
4034       if( rc==SQLITE_OK ){
4035         memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
4036         pCur->curFlags |= BTCF_ValidOvfl;
4037       }
4038     }
4039 
4040     /* If the overflow page-list cache has been allocated and the
4041     ** entry for the first required overflow page is valid, skip
4042     ** directly to it.
4043     */
4044     if( (pCur->curFlags & BTCF_ValidOvfl)!=0 && pCur->aOverflow[offset/ovflSize] ){
4045       iIdx = (offset/ovflSize);
4046       nextPage = pCur->aOverflow[iIdx];
4047       offset = (offset%ovflSize);
4048     }
4049 
4050     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
4051 
4052       /* If required, populate the overflow page-list cache. */
4053       if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
4054         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
4055         pCur->aOverflow[iIdx] = nextPage;
4056       }
4057 
4058       if( offset>=ovflSize ){
4059         /* The only reason to read this page is to obtain the page
4060         ** number for the next page in the overflow chain. The page
4061         ** data is not required. So first try to lookup the overflow
4062         ** page-list cache, if any, then fall back to the getOverflowPage()
4063         ** function.
4064         **
4065         ** Note that the aOverflow[] array must be allocated because eOp!=2
4066         ** here.  If eOp==2, then offset==0 and this branch is never taken.
4067         */
4068         assert( eOp!=2 );
4069         assert( pCur->curFlags & BTCF_ValidOvfl );
4070         if( pCur->aOverflow[iIdx+1] ){
4071           nextPage = pCur->aOverflow[iIdx+1];
4072         }else{
4073           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
4074         }
4075         offset -= ovflSize;
4076       }else{
4077         /* Need to read this page properly. It contains some of the
4078         ** range of data that is being read (eOp==0) or written (eOp!=0).
4079         */
4080 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4081         sqlite3_file *fd;
4082 #endif
4083         int a = amt;
4084         if( a + offset > ovflSize ){
4085           a = ovflSize - offset;
4086         }
4087 
4088 #ifdef SQLITE_DIRECT_OVERFLOW_READ
4089         /* If all the following are true:
4090         **
4091         **   1) this is a read operation, and
4092         **   2) data is required from the start of this overflow page, and
4093         **   3) the database is file-backed, and
4094         **   4) there is no open write-transaction, and
4095         **   5) the database is not a WAL database,
4096         **   6) all data from the page is being read.
4097         **
4098         ** then data can be read directly from the database file into the
4099         ** output buffer, bypassing the page-cache altogether. This speeds
4100         ** up loading large records that span many overflow pages.
4101         */
4102         if( (eOp&0x01)==0                                      /* (1) */
4103          && offset==0                                          /* (2) */
4104          && (bEnd || a==ovflSize)                              /* (6) */
4105          && pBt->inTransaction==TRANS_READ                     /* (4) */
4106          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
4107          && pBt->pPage1->aData[19]==0x01                       /* (5) */
4108         ){
4109           u8 aSave[4];
4110           u8 *aWrite = &pBuf[-4];
4111           memcpy(aSave, aWrite, 4);
4112           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
4113           nextPage = get4byte(aWrite);
4114           memcpy(aWrite, aSave, 4);
4115         }else
4116 #endif
4117 
4118         {
4119           DbPage *pDbPage;
4120           rc = sqlite3PagerAcquire(pBt->pPager, nextPage, &pDbPage,
4121               ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
4122           );
4123           if( rc==SQLITE_OK ){
4124             aPayload = sqlite3PagerGetData(pDbPage);
4125             nextPage = get4byte(aPayload);
4126             rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
4127             sqlite3PagerUnref(pDbPage);
4128             offset = 0;
4129           }
4130         }
4131         amt -= a;
4132         pBuf += a;
4133       }
4134     }
4135   }
4136 
4137   if( rc==SQLITE_OK && amt>0 ){
4138     return SQLITE_CORRUPT_BKPT;
4139   }
4140   return rc;
4141 }
4142 
4143 /*
4144 ** Read part of the key associated with cursor pCur.  Exactly
4145 ** "amt" bytes will be transferred into pBuf[].  The transfer
4146 ** begins at "offset".
4147 **
4148 ** The caller must ensure that pCur is pointing to a valid row
4149 ** in the table.
4150 **
4151 ** Return SQLITE_OK on success or an error code if anything goes
4152 ** wrong.  An error is returned if "offset+amt" is larger than
4153 ** the available payload.
4154 */
4155 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4156   assert( cursorHoldsMutex(pCur) );
4157   assert( pCur->eState==CURSOR_VALID );
4158   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4159   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4160   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4161 }
4162 
4163 /*
4164 ** Read part of the data associated with cursor pCur.  Exactly
4165 ** "amt" bytes will be transferred into pBuf[].  The transfer
4166 ** begins at "offset".
4167 **
4168 ** Return SQLITE_OK on success or an error code if anything goes
4169 ** wrong.  An error is returned if "offset+amt" is larger than
4170 ** the available payload.
4171 */
4172 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4173   int rc;
4174 
4175 #ifndef SQLITE_OMIT_INCRBLOB
4176   if ( pCur->eState==CURSOR_INVALID ){
4177     return SQLITE_ABORT;
4178   }
4179 #endif
4180 
4181   assert( cursorHoldsMutex(pCur) );
4182   rc = restoreCursorPosition(pCur);
4183   if( rc==SQLITE_OK ){
4184     assert( pCur->eState==CURSOR_VALID );
4185     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4186     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4187     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4188   }
4189   return rc;
4190 }
4191 
4192 /*
4193 ** Return a pointer to payload information from the entry that the
4194 ** pCur cursor is pointing to.  The pointer is to the beginning of
4195 ** the key if index btrees (pPage->intKey==0) and is the data for
4196 ** table btrees (pPage->intKey==1). The number of bytes of available
4197 ** key/data is written into *pAmt.  If *pAmt==0, then the value
4198 ** returned will not be a valid pointer.
4199 **
4200 ** This routine is an optimization.  It is common for the entire key
4201 ** and data to fit on the local page and for there to be no overflow
4202 ** pages.  When that is so, this routine can be used to access the
4203 ** key and data without making a copy.  If the key and/or data spills
4204 ** onto overflow pages, then accessPayload() must be used to reassemble
4205 ** the key/data and copy it into a preallocated buffer.
4206 **
4207 ** The pointer returned by this routine looks directly into the cached
4208 ** page of the database.  The data might change or move the next time
4209 ** any btree routine is called.
4210 */
4211 static const void *fetchPayload(
4212   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4213   u32 *pAmt            /* Write the number of available bytes here */
4214 ){
4215   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4216   assert( pCur->eState==CURSOR_VALID );
4217   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4218   assert( cursorHoldsMutex(pCur) );
4219   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4220   assert( pCur->info.nSize>0 );
4221   *pAmt = pCur->info.nLocal;
4222   return (void*)(pCur->info.pCell + pCur->info.nHeader);
4223 }
4224 
4225 
4226 /*
4227 ** For the entry that cursor pCur is point to, return as
4228 ** many bytes of the key or data as are available on the local
4229 ** b-tree page.  Write the number of available bytes into *pAmt.
4230 **
4231 ** The pointer returned is ephemeral.  The key/data may move
4232 ** or be destroyed on the next call to any Btree routine,
4233 ** including calls from other threads against the same cache.
4234 ** Hence, a mutex on the BtShared should be held prior to calling
4235 ** this routine.
4236 **
4237 ** These routines is used to get quick access to key and data
4238 ** in the common case where no overflow pages are used.
4239 */
4240 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, u32 *pAmt){
4241   return fetchPayload(pCur, pAmt);
4242 }
4243 const void *sqlite3BtreeDataFetch(BtCursor *pCur, u32 *pAmt){
4244   return fetchPayload(pCur, pAmt);
4245 }
4246 
4247 
4248 /*
4249 ** Move the cursor down to a new child page.  The newPgno argument is the
4250 ** page number of the child page to move to.
4251 **
4252 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4253 ** the new child page does not match the flags field of the parent (i.e.
4254 ** if an intkey page appears to be the parent of a non-intkey page, or
4255 ** vice-versa).
4256 */
4257 static int moveToChild(BtCursor *pCur, u32 newPgno){
4258   int rc;
4259   int i = pCur->iPage;
4260   MemPage *pNewPage;
4261   BtShared *pBt = pCur->pBt;
4262 
4263   assert( cursorHoldsMutex(pCur) );
4264   assert( pCur->eState==CURSOR_VALID );
4265   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4266   assert( pCur->iPage>=0 );
4267   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4268     return SQLITE_CORRUPT_BKPT;
4269   }
4270   rc = getAndInitPage(pBt, newPgno, &pNewPage,
4271                (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0);
4272   if( rc ) return rc;
4273   pCur->apPage[i+1] = pNewPage;
4274   pCur->aiIdx[i+1] = 0;
4275   pCur->iPage++;
4276 
4277   pCur->info.nSize = 0;
4278   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4279   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4280     return SQLITE_CORRUPT_BKPT;
4281   }
4282   return SQLITE_OK;
4283 }
4284 
#if 0
/*
** Debugging aid (currently compiled out).  pParent is an internal
** (non-leaf) tree page.  Assert that page number iChild is the
** left-child of the iIdx'th cell of pParent or, when iIdx equals the
** number of cells on pParent, that iChild is the right-child of the
** page.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4304 
4305 /*
4306 ** Move the cursor up to the parent page.
4307 **
4308 ** pCur->idx is set to the cell index that contains the pointer
4309 ** to the page we are coming from.  If we are coming from the
4310 ** right-most child page then pCur->idx is set to one more than
4311 ** the largest cell index.
4312 */
4313 static void moveToParent(BtCursor *pCur){
4314   assert( cursorHoldsMutex(pCur) );
4315   assert( pCur->eState==CURSOR_VALID );
4316   assert( pCur->iPage>0 );
4317   assert( pCur->apPage[pCur->iPage] );
4318 
4319   /* UPDATE: It is actually possible for the condition tested by the assert
4320   ** below to be untrue if the database file is corrupt. This can occur if
4321   ** one cursor has modified page pParent while a reference to it is held
4322   ** by a second cursor. Which can only happen if a single page is linked
4323   ** into more than one b-tree structure in a corrupt database.  */
4324 #if 0
4325   assertParentIndex(
4326     pCur->apPage[pCur->iPage-1],
4327     pCur->aiIdx[pCur->iPage-1],
4328     pCur->apPage[pCur->iPage]->pgno
4329   );
4330 #endif
4331   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4332 
4333   releasePage(pCur->apPage[pCur->iPage]);
4334   pCur->iPage--;
4335   pCur->info.nSize = 0;
4336   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4337 }
4338 
4339 /*
4340 ** Move the cursor to point to the root page of its b-tree structure.
4341 **
4342 ** If the table has a virtual root page, then the cursor is moved to point
4343 ** to the virtual root page instead of the actual root page. A table has a
4344 ** virtual root page when the actual root page contains no cells and a
4345 ** single child page. This can only happen with the table rooted at page 1.
4346 **
4347 ** If the b-tree structure is empty, the cursor state is set to
4348 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4349 ** cell located on the root (or virtual root) page and the cursor state
4350 ** is set to CURSOR_VALID.
4351 **
4352 ** If this function returns successfully, it may be assumed that the
4353 ** page-header flags indicate that the [virtual] root-page is the expected
4354 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4355 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4356 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4357 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4358 ** b-tree).
4359 */
4360 static int moveToRoot(BtCursor *pCur){
4361   MemPage *pRoot;
4362   int rc = SQLITE_OK;
4363 
4364   assert( cursorHoldsMutex(pCur) );
4365   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4366   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4367   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4368   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4369     if( pCur->eState==CURSOR_FAULT ){
4370       assert( pCur->skipNext!=SQLITE_OK );
4371       return pCur->skipNext;
4372     }
4373     sqlite3BtreeClearCursor(pCur);
4374   }
4375 
4376   if( pCur->iPage>=0 ){
4377     while( pCur->iPage ) releasePage(pCur->apPage[pCur->iPage--]);
4378   }else if( pCur->pgnoRoot==0 ){
4379     pCur->eState = CURSOR_INVALID;
4380     return SQLITE_OK;
4381   }else{
4382     rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
4383                  (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0);
4384     if( rc!=SQLITE_OK ){
4385       pCur->eState = CURSOR_INVALID;
4386       return rc;
4387     }
4388     pCur->iPage = 0;
4389   }
4390   pRoot = pCur->apPage[0];
4391   assert( pRoot->pgno==pCur->pgnoRoot );
4392 
4393   /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4394   ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4395   ** NULL, the caller expects a table b-tree. If this is not the case,
4396   ** return an SQLITE_CORRUPT error.
4397   **
4398   ** Earlier versions of SQLite assumed that this test could not fail
4399   ** if the root page was already loaded when this function was called (i.e.
4400   ** if pCur->iPage>=0). But this is not so if the database is corrupted
4401   ** in such a way that page pRoot is linked into a second b-tree table
4402   ** (or the freelist).  */
4403   assert( pRoot->intKey==1 || pRoot->intKey==0 );
4404   if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
4405     return SQLITE_CORRUPT_BKPT;
4406   }
4407 
4408   pCur->aiIdx[0] = 0;
4409   pCur->info.nSize = 0;
4410   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
4411 
4412   if( pRoot->nCell>0 ){
4413     pCur->eState = CURSOR_VALID;
4414   }else if( !pRoot->leaf ){
4415     Pgno subpage;
4416     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4417     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4418     pCur->eState = CURSOR_VALID;
4419     rc = moveToChild(pCur, subpage);
4420   }else{
4421     pCur->eState = CURSOR_INVALID;
4422   }
4423   return rc;
4424 }
4425 
4426 /*
4427 ** Move the cursor down to the left-most leaf entry beneath the
4428 ** entry to which it is currently pointing.
4429 **
4430 ** The left-most leaf is the one with the smallest key - the first
4431 ** in ascending order.
4432 */
4433 static int moveToLeftmost(BtCursor *pCur){
4434   Pgno pgno;
4435   int rc = SQLITE_OK;
4436   MemPage *pPage;
4437 
4438   assert( cursorHoldsMutex(pCur) );
4439   assert( pCur->eState==CURSOR_VALID );
4440   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4441     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4442     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4443     rc = moveToChild(pCur, pgno);
4444   }
4445   return rc;
4446 }
4447 
4448 /*
4449 ** Move the cursor down to the right-most leaf entry beneath the
4450 ** page to which it is currently pointing.  Notice the difference
4451 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4452 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4453 ** finds the right-most entry beneath the *page*.
4454 **
4455 ** The right-most entry is the one with the largest key - the last
4456 ** key in ascending order.
4457 */
4458 static int moveToRightmost(BtCursor *pCur){
4459   Pgno pgno;
4460   int rc = SQLITE_OK;
4461   MemPage *pPage = 0;
4462 
4463   assert( cursorHoldsMutex(pCur) );
4464   assert( pCur->eState==CURSOR_VALID );
4465   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4466     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4467     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4468     rc = moveToChild(pCur, pgno);
4469   }
4470   if( rc==SQLITE_OK ){
4471     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4472     pCur->info.nSize = 0;
4473     pCur->curFlags &= ~BTCF_ValidNKey;
4474   }
4475   return rc;
4476 }
4477 
4478 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4479 ** on success.  Set *pRes to 0 if the cursor actually points to something
4480 ** or set *pRes to 1 if the table is empty.
4481 */
4482 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4483   int rc;
4484 
4485   assert( cursorHoldsMutex(pCur) );
4486   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4487   rc = moveToRoot(pCur);
4488   if( rc==SQLITE_OK ){
4489     if( pCur->eState==CURSOR_INVALID ){
4490       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4491       *pRes = 1;
4492     }else{
4493       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4494       *pRes = 0;
4495       rc = moveToLeftmost(pCur);
4496     }
4497   }
4498   return rc;
4499 }
4500 
4501 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4502 ** on success.  Set *pRes to 0 if the cursor actually points to something
4503 ** or set *pRes to 1 if the table is empty.
4504 */
4505 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4506   int rc;
4507 
4508   assert( cursorHoldsMutex(pCur) );
4509   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4510 
4511   /* If the cursor already points to the last entry, this is a no-op. */
4512   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
4513 #ifdef SQLITE_DEBUG
4514     /* This block serves to assert() that the cursor really does point
4515     ** to the last entry in the b-tree. */
4516     int ii;
4517     for(ii=0; ii<pCur->iPage; ii++){
4518       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4519     }
4520     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4521     assert( pCur->apPage[pCur->iPage]->leaf );
4522 #endif
4523     return SQLITE_OK;
4524   }
4525 
4526   rc = moveToRoot(pCur);
4527   if( rc==SQLITE_OK ){
4528     if( CURSOR_INVALID==pCur->eState ){
4529       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4530       *pRes = 1;
4531     }else{
4532       assert( pCur->eState==CURSOR_VALID );
4533       *pRes = 0;
4534       rc = moveToRightmost(pCur);
4535       if( rc==SQLITE_OK ){
4536         pCur->curFlags |= BTCF_AtLast;
4537       }else{
4538         pCur->curFlags &= ~BTCF_AtLast;
4539       }
4540 
4541     }
4542   }
4543   return rc;
4544 }
4545 
4546 /* Move the cursor so that it points to an entry near the key
4547 ** specified by pIdxKey or intKey.   Return a success code.
4548 **
4549 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4550 ** must be NULL.  For index tables, pIdxKey is used and intKey
4551 ** is ignored.
4552 **
4553 ** If an exact match is not found, then the cursor is always
4554 ** left pointing at a leaf page which would hold the entry if it
4555 ** were present.  The cursor might point to an entry that comes
4556 ** before or after the key.
4557 **
4558 ** An integer is written into *pRes which is the result of
4559 ** comparing the key with the entry to which the cursor is
4560 ** pointing.  The meaning of the integer written into
4561 ** *pRes is as follows:
4562 **
4563 **     *pRes<0      The cursor is left pointing at an entry that
4564 **                  is smaller than intKey/pIdxKey or if the table is empty
4565 **                  and the cursor is therefore left point to nothing.
4566 **
4567 **     *pRes==0     The cursor is left pointing at an entry that
4568 **                  exactly matches intKey/pIdxKey.
4569 **
4570 **     *pRes>0      The cursor is left pointing at an entry that
4571 **                  is larger than intKey/pIdxKey.
4572 **
4573 */
4574 int sqlite3BtreeMovetoUnpacked(
4575   BtCursor *pCur,          /* The cursor to be moved */
4576   UnpackedRecord *pIdxKey, /* Unpacked index key */
4577   i64 intKey,              /* The table key */
4578   int biasRight,           /* If true, bias the search to the high end */
4579   int *pRes                /* Write search results here */
4580 ){
4581   int rc;
4582   RecordCompare xRecordCompare;
4583 
4584   assert( cursorHoldsMutex(pCur) );
4585   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4586   assert( pRes );
4587   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4588 
4589   /* If the cursor is already positioned at the point we are trying
4590   ** to move to, then just return without doing any work */
4591   if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
4592    && pCur->apPage[0]->intKey
4593   ){
4594     if( pCur->info.nKey==intKey ){
4595       *pRes = 0;
4596       return SQLITE_OK;
4597     }
4598     if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){
4599       *pRes = -1;
4600       return SQLITE_OK;
4601     }
4602   }
4603 
4604   if( pIdxKey ){
4605     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
4606     pIdxKey->isCorrupt = 0;
4607     assert( pIdxKey->default_rc==1
4608          || pIdxKey->default_rc==0
4609          || pIdxKey->default_rc==-1
4610     );
4611   }else{
4612     xRecordCompare = 0; /* All keys are integers */
4613   }
4614 
4615   rc = moveToRoot(pCur);
4616   if( rc ){
4617     return rc;
4618   }
4619   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
4620   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
4621   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
4622   if( pCur->eState==CURSOR_INVALID ){
4623     *pRes = -1;
4624     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4625     return SQLITE_OK;
4626   }
4627   assert( pCur->apPage[0]->intKey || pIdxKey );
4628   for(;;){
4629     int lwr, upr, idx, c;
4630     Pgno chldPg;
4631     MemPage *pPage = pCur->apPage[pCur->iPage];
4632     u8 *pCell;                          /* Pointer to current cell in pPage */
4633 
4634     /* pPage->nCell must be greater than zero. If this is the root-page
4635     ** the cursor would have been INVALID above and this for(;;) loop
4636     ** not run. If this is not the root-page, then the moveToChild() routine
4637     ** would have already detected db corruption. Similarly, pPage must
4638     ** be the right kind (index or table) of b-tree page. Otherwise
4639     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
4640     assert( pPage->nCell>0 );
4641     assert( pPage->intKey==(pIdxKey==0) );
4642     lwr = 0;
4643     upr = pPage->nCell-1;
4644     assert( biasRight==0 || biasRight==1 );
4645     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
4646     pCur->aiIdx[pCur->iPage] = (u16)idx;
4647     if( xRecordCompare==0 ){
4648       for(;;){
4649         i64 nCellKey;
4650         pCell = findCell(pPage, idx) + pPage->childPtrSize;
4651         if( pPage->hasData ){
4652           while( 0x80 <= *(pCell++) ){
4653             if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
4654           }
4655         }
4656         getVarint(pCell, (u64*)&nCellKey);
4657         if( nCellKey<intKey ){
4658           lwr = idx+1;
4659           if( lwr>upr ){ c = -1; break; }
4660         }else if( nCellKey>intKey ){
4661           upr = idx-1;
4662           if( lwr>upr ){ c = +1; break; }
4663         }else{
4664           assert( nCellKey==intKey );
4665           pCur->curFlags |= BTCF_ValidNKey;
4666           pCur->info.nKey = nCellKey;
4667           pCur->aiIdx[pCur->iPage] = (u16)idx;
4668           if( !pPage->leaf ){
4669             lwr = idx;
4670             goto moveto_next_layer;
4671           }else{
4672             *pRes = 0;
4673             rc = SQLITE_OK;
4674             goto moveto_finish;
4675           }
4676         }
4677         assert( lwr+upr>=0 );
4678         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
4679       }
4680     }else{
4681       for(;;){
4682         int nCell;
4683         pCell = findCell(pPage, idx) + pPage->childPtrSize;
4684 
4685         /* The maximum supported page-size is 65536 bytes. This means that
4686         ** the maximum number of record bytes stored on an index B-Tree
4687         ** page is less than 16384 bytes and may be stored as a 2-byte
4688         ** varint. This information is used to attempt to avoid parsing
4689         ** the entire cell by checking for the cases where the record is
4690         ** stored entirely within the b-tree page by inspecting the first
4691         ** 2 bytes of the cell.
4692         */
4693         nCell = pCell[0];
4694         if( nCell<=pPage->max1bytePayload ){
4695           /* This branch runs if the record-size field of the cell is a
4696           ** single byte varint and the record fits entirely on the main
4697           ** b-tree page.  */
4698           testcase( pCell+nCell+1==pPage->aDataEnd );
4699           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey, 0);
4700         }else if( !(pCell[1] & 0x80)
4701           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4702         ){
4703           /* The record-size field is a 2 byte varint and the record
4704           ** fits entirely on the main b-tree page.  */
4705           testcase( pCell+nCell+2==pPage->aDataEnd );
4706           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey, 0);
4707         }else{
4708           /* The record flows over onto one or more overflow pages. In
4709           ** this case the whole cell needs to be parsed, a buffer allocated
4710           ** and accessPayload() used to retrieve the record into the
4711           ** buffer before VdbeRecordCompare() can be called. */
4712           void *pCellKey;
4713           u8 * const pCellBody = pCell - pPage->childPtrSize;
4714           btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4715           nCell = (int)pCur->info.nKey;
4716           pCellKey = sqlite3Malloc( nCell );
4717           if( pCellKey==0 ){
4718             rc = SQLITE_NOMEM;
4719             goto moveto_finish;
4720           }
4721           pCur->aiIdx[pCur->iPage] = (u16)idx;
4722           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);
4723           if( rc ){
4724             sqlite3_free(pCellKey);
4725             goto moveto_finish;
4726           }
4727           c = xRecordCompare(nCell, pCellKey, pIdxKey, 0);
4728           sqlite3_free(pCellKey);
4729         }
4730         assert( pIdxKey->isCorrupt==0 || c==0 );
4731         if( c<0 ){
4732           lwr = idx+1;
4733         }else if( c>0 ){
4734           upr = idx-1;
4735         }else{
4736           assert( c==0 );
4737           *pRes = 0;
4738           rc = SQLITE_OK;
4739           pCur->aiIdx[pCur->iPage] = (u16)idx;
4740           if( pIdxKey->isCorrupt ) rc = SQLITE_CORRUPT;
4741           goto moveto_finish;
4742         }
4743         if( lwr>upr ) break;
4744         assert( lwr+upr>=0 );
4745         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
4746       }
4747     }
4748     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
4749     assert( pPage->isInit );
4750     if( pPage->leaf ){
4751       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4752       pCur->aiIdx[pCur->iPage] = (u16)idx;
4753       *pRes = c;
4754       rc = SQLITE_OK;
4755       goto moveto_finish;
4756     }
4757 moveto_next_layer:
4758     if( lwr>=pPage->nCell ){
4759       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4760     }else{
4761       chldPg = get4byte(findCell(pPage, lwr));
4762     }
4763     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4764     rc = moveToChild(pCur, chldPg);
4765     if( rc ) break;
4766   }
4767 moveto_finish:
4768   pCur->info.nSize = 0;
4769   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4770   return rc;
4771 }
4772 
4773 
4774 /*
4775 ** Return TRUE if the cursor is not pointing at an entry of the table.
4776 **
4777 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4778 ** past the last entry in the table or sqlite3BtreePrev() moves past
4779 ** the first entry.  TRUE is also returned if the table is empty.
4780 */
4781 int sqlite3BtreeEof(BtCursor *pCur){
4782   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4783   ** have been deleted? This API will need to change to return an error code
4784   ** as well as the boolean result value.
4785   */
4786   return (CURSOR_VALID!=pCur->eState);
4787 }
4788 
4789 /*
4790 ** Advance the cursor to the next entry in the database.  If
4791 ** successful then set *pRes=0.  If the cursor
4792 ** was already pointing to the last entry in the database before
4793 ** this routine was called, then set *pRes=1.
4794 **
4795 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
4796 ** will be 1 if the cursor being stepped corresponds to an SQL index and
4797 ** if this routine could have been skipped if that SQL index had been
4798 ** a unique index.  Otherwise the caller will have set *pRes to zero.
4799 ** Zero is the common case. The btree implementation is free to use the
4800 ** initial *pRes value as a hint to improve performance, but the current
4801 ** SQLite btree implementation does not. (Note that the comdb2 btree
4802 ** implementation does use this hint, however.)
4803 */
4804 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4805   int rc;
4806   int idx;
4807   MemPage *pPage;
4808 
4809   assert( cursorHoldsMutex(pCur) );
4810   assert( pRes!=0 );
4811   assert( *pRes==0 || *pRes==1 );
4812   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
4813   if( pCur->eState!=CURSOR_VALID ){
4814     invalidateOverflowCache(pCur);
4815     rc = restoreCursorPosition(pCur);
4816     if( rc!=SQLITE_OK ){
4817       *pRes = 0;
4818       return rc;
4819     }
4820     if( CURSOR_INVALID==pCur->eState ){
4821       *pRes = 1;
4822       return SQLITE_OK;
4823     }
4824     if( pCur->skipNext ){
4825       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
4826       pCur->eState = CURSOR_VALID;
4827       if( pCur->skipNext>0 ){
4828         pCur->skipNext = 0;
4829         *pRes = 0;
4830         return SQLITE_OK;
4831       }
4832       pCur->skipNext = 0;
4833     }
4834   }
4835 
4836   pPage = pCur->apPage[pCur->iPage];
4837   idx = ++pCur->aiIdx[pCur->iPage];
4838   assert( pPage->isInit );
4839 
4840   /* If the database file is corrupt, it is possible for the value of idx
4841   ** to be invalid here. This can only occur if a second cursor modifies
4842   ** the page while cursor pCur is holding a reference to it. Which can
4843   ** only happen if the database is corrupt in such a way as to link the
4844   ** page into more than one b-tree structure. */
4845   testcase( idx>pPage->nCell );
4846 
4847   pCur->info.nSize = 0;
4848   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4849   if( idx>=pPage->nCell ){
4850     if( !pPage->leaf ){
4851       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4852       if( rc ){
4853         *pRes = 0;
4854         return rc;
4855       }
4856       rc = moveToLeftmost(pCur);
4857       *pRes = 0;
4858       return rc;
4859     }
4860     do{
4861       if( pCur->iPage==0 ){
4862         *pRes = 1;
4863         pCur->eState = CURSOR_INVALID;
4864         return SQLITE_OK;
4865       }
4866       moveToParent(pCur);
4867       pPage = pCur->apPage[pCur->iPage];
4868     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4869     *pRes = 0;
4870     if( pPage->intKey ){
4871       rc = sqlite3BtreeNext(pCur, pRes);
4872     }else{
4873       rc = SQLITE_OK;
4874     }
4875     return rc;
4876   }
4877   *pRes = 0;
4878   if( pPage->leaf ){
4879     return SQLITE_OK;
4880   }
4881   rc = moveToLeftmost(pCur);
4882   return rc;
4883 }
4884 
4885 
4886 /*
4887 ** Step the cursor to the back to the previous entry in the database.  If
4888 ** successful then set *pRes=0.  If the cursor
4889 ** was already pointing to the first entry in the database before
4890 ** this routine was called, then set *pRes=1.
4891 **
4892 ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
4893 ** will be 1 if the cursor being stepped corresponds to an SQL index and
4894 ** if this routine could have been skipped if that SQL index had been
4895 ** a unique index.  Otherwise the caller will have set *pRes to zero.
4896 ** Zero is the common case. The btree implementation is free to use the
4897 ** initial *pRes value as a hint to improve performance, but the current
4898 ** SQLite btree implementation does not. (Note that the comdb2 btree
4899 ** implementation does use this hint, however.)
4900 */
4901 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4902   int rc;
4903   MemPage *pPage;
4904 
4905   assert( cursorHoldsMutex(pCur) );
4906   assert( pRes!=0 );
4907   assert( *pRes==0 || *pRes==1 );
4908   assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
4909   pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl);
4910   if( pCur->eState!=CURSOR_VALID ){
4911     if( ALWAYS(pCur->eState>=CURSOR_REQUIRESEEK) ){
4912       rc = btreeRestoreCursorPosition(pCur);
4913       if( rc!=SQLITE_OK ){
4914         *pRes = 0;
4915         return rc;
4916       }
4917     }
4918     if( CURSOR_INVALID==pCur->eState ){
4919       *pRes = 1;
4920       return SQLITE_OK;
4921     }
4922     if( pCur->skipNext ){
4923       assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
4924       pCur->eState = CURSOR_VALID;
4925       if( pCur->skipNext<0 ){
4926         pCur->skipNext = 0;
4927         *pRes = 0;
4928         return SQLITE_OK;
4929       }
4930       pCur->skipNext = 0;
4931     }
4932   }
4933 
4934   pPage = pCur->apPage[pCur->iPage];
4935   assert( pPage->isInit );
4936   if( !pPage->leaf ){
4937     int idx = pCur->aiIdx[pCur->iPage];
4938     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4939     if( rc ){
4940       *pRes = 0;
4941       return rc;
4942     }
4943     rc = moveToRightmost(pCur);
4944   }else{
4945     while( pCur->aiIdx[pCur->iPage]==0 ){
4946       if( pCur->iPage==0 ){
4947         pCur->eState = CURSOR_INVALID;
4948         *pRes = 1;
4949         return SQLITE_OK;
4950       }
4951       moveToParent(pCur);
4952     }
4953     pCur->info.nSize = 0;
4954     pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
4955 
4956     pCur->aiIdx[pCur->iPage]--;
4957     pPage = pCur->apPage[pCur->iPage];
4958     if( pPage->intKey && !pPage->leaf ){
4959       rc = sqlite3BtreePrevious(pCur, pRes);
4960     }else{
4961       rc = SQLITE_OK;
4962     }
4963   }
4964   *pRes = 0;
4965   return rc;
4966 }
4967 
4968 /*
4969 ** Allocate a new page from the database file.
4970 **
4971 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4972 ** has already been called on the new page.)  The new page has also
4973 ** been referenced and the calling routine is responsible for calling
4974 ** sqlite3PagerUnref() on the new page when it is done.
4975 **
4976 ** SQLITE_OK is returned on success.  Any other return value indicates
4977 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4978 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4979 **
4980 ** If the "nearby" parameter is not 0, then an effort is made to
4981 ** locate a page close to the page number "nearby".  This can be used in an
4982 ** attempt to keep related pages close to each other in the database file,
4983 ** which in turn can make database access faster.
4984 **
4985 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
4986 ** anywhere on the free-list, then it is guaranteed to be returned.  If
4987 ** eMode is BTALLOC_LE then the page returned will be less than or equal
4988 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
4989 ** are no restrictions on which page is returned.
4990 */
4991 static int allocateBtreePage(
4992   BtShared *pBt,         /* The btree */
4993   MemPage **ppPage,      /* Store pointer to the allocated page here */
4994   Pgno *pPgno,           /* Store the page number here */
4995   Pgno nearby,           /* Search for a page near this one */
4996   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
4997 ){
4998   MemPage *pPage1;
4999   int rc;
5000   u32 n;     /* Number of pages on the freelist */
5001   u32 k;     /* Number of leaves on the trunk of the freelist */
5002   MemPage *pTrunk = 0;
5003   MemPage *pPrevTrunk = 0;
5004   Pgno mxPage;     /* Total size of the database file */
5005 
5006   assert( sqlite3_mutex_held(pBt->mutex) );
5007   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5008   pPage1 = pBt->pPage1;
5009   mxPage = btreePagecount(pBt);
5010   n = get4byte(&pPage1->aData[36]);
5011   testcase( n==mxPage-1 );
5012   if( n>=mxPage ){
5013     return SQLITE_CORRUPT_BKPT;
5014   }
5015   if( n>0 ){
5016     /* There are pages on the freelist.  Reuse one of those pages. */
5017     Pgno iTrunk;
5018     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5019 
5020     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5021     ** shows that the page 'nearby' is somewhere on the free-list, then
5022     ** the entire-list will be searched for that page.
5023     */
5024 #ifndef SQLITE_OMIT_AUTOVACUUM
5025     if( eMode==BTALLOC_EXACT ){
5026       if( nearby<=mxPage ){
5027         u8 eType;
5028         assert( nearby>0 );
5029         assert( pBt->autoVacuum );
5030         rc = ptrmapGet(pBt, nearby, &eType, 0);
5031         if( rc ) return rc;
5032         if( eType==PTRMAP_FREEPAGE ){
5033           searchList = 1;
5034         }
5035       }
5036     }else if( eMode==BTALLOC_LE ){
5037       searchList = 1;
5038     }
5039 #endif
5040 
5041     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5042     ** first free-list trunk page. iPrevTrunk is initially 1.
5043     */
5044     rc = sqlite3PagerWrite(pPage1->pDbPage);
5045     if( rc ) return rc;
5046     put4byte(&pPage1->aData[36], n-1);
5047 
5048     /* The code within this loop is run only once if the 'searchList' variable
5049     ** is not true. Otherwise, it runs once for each trunk-page on the
5050     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5051     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5052     */
5053     do {
5054       pPrevTrunk = pTrunk;
5055       if( pPrevTrunk ){
5056         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5057       }else{
5058         iTrunk = get4byte(&pPage1->aData[32]);
5059       }
5060       testcase( iTrunk==mxPage );
5061       if( iTrunk>mxPage ){
5062         rc = SQLITE_CORRUPT_BKPT;
5063       }else{
5064         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5065       }
5066       if( rc ){
5067         pTrunk = 0;
5068         goto end_allocate_page;
5069       }
5070       assert( pTrunk!=0 );
5071       assert( pTrunk->aData!=0 );
5072 
5073       k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
5074       if( k==0 && !searchList ){
5075         /* The trunk has no leaves and the list is not being searched.
5076         ** So extract the trunk page itself and use it as the newly
5077         ** allocated page */
5078         assert( pPrevTrunk==0 );
5079         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5080         if( rc ){
5081           goto end_allocate_page;
5082         }
5083         *pPgno = iTrunk;
5084         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5085         *ppPage = pTrunk;
5086         pTrunk = 0;
5087         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5088       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
5089         /* Value of k is out of range.  Database corruption */
5090         rc = SQLITE_CORRUPT_BKPT;
5091         goto end_allocate_page;
5092 #ifndef SQLITE_OMIT_AUTOVACUUM
5093       }else if( searchList
5094             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
5095       ){
5096         /* The list is being searched and this trunk page is the page
5097         ** to allocate, regardless of whether it has leaves.
5098         */
5099         *pPgno = iTrunk;
5100         *ppPage = pTrunk;
5101         searchList = 0;
5102         rc = sqlite3PagerWrite(pTrunk->pDbPage);
5103         if( rc ){
5104           goto end_allocate_page;
5105         }
5106         if( k==0 ){
5107           if( !pPrevTrunk ){
5108             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
5109           }else{
5110             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5111             if( rc!=SQLITE_OK ){
5112               goto end_allocate_page;
5113             }
5114             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
5115           }
5116         }else{
5117           /* The trunk page is required by the caller but it contains
5118           ** pointers to free-list leaves. The first leaf becomes a trunk
5119           ** page in this case.
5120           */
5121           MemPage *pNewTrunk;
5122           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
5123           if( iNewTrunk>mxPage ){
5124             rc = SQLITE_CORRUPT_BKPT;
5125             goto end_allocate_page;
5126           }
5127           testcase( iNewTrunk==mxPage );
5128           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
5129           if( rc!=SQLITE_OK ){
5130             goto end_allocate_page;
5131           }
5132           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
5133           if( rc!=SQLITE_OK ){
5134             releasePage(pNewTrunk);
5135             goto end_allocate_page;
5136           }
5137           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
5138           put4byte(&pNewTrunk->aData[4], k-1);
5139           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
5140           releasePage(pNewTrunk);
5141           if( !pPrevTrunk ){
5142             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
5143             put4byte(&pPage1->aData[32], iNewTrunk);
5144           }else{
5145             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
5146             if( rc ){
5147               goto end_allocate_page;
5148             }
5149             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
5150           }
5151         }
5152         pTrunk = 0;
5153         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
5154 #endif
5155       }else if( k>0 ){
5156         /* Extract a leaf from the trunk */
5157         u32 closest;
5158         Pgno iPage;
5159         unsigned char *aData = pTrunk->aData;
5160         if( nearby>0 ){
5161           u32 i;
5162           closest = 0;
5163           if( eMode==BTALLOC_LE ){
5164             for(i=0; i<k; i++){
5165               iPage = get4byte(&aData[8+i*4]);
5166               if( iPage<=nearby ){
5167                 closest = i;
5168                 break;
5169               }
5170             }
5171           }else{
5172             int dist;
5173             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5174             for(i=1; i<k; i++){
5175               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5176               if( d2<dist ){
5177                 closest = i;
5178                 dist = d2;
5179               }
5180             }
5181           }
5182         }else{
5183           closest = 0;
5184         }
5185 
5186         iPage = get4byte(&aData[8+closest*4]);
5187         testcase( iPage==mxPage );
5188         if( iPage>mxPage ){
5189           rc = SQLITE_CORRUPT_BKPT;
5190           goto end_allocate_page;
5191         }
5192         testcase( iPage==mxPage );
5193         if( !searchList
5194          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
5195         ){
5196           int noContent;
5197           *pPgno = iPage;
5198           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5199                  ": %d more free pages\n",
5200                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5201           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5202           if( rc ) goto end_allocate_page;
5203           if( closest<k-1 ){
5204             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5205           }
5206           put4byte(&aData[4], k-1);
5207           noContent = !btreeGetHasContent(pBt, *pPgno) ? PAGER_GET_NOCONTENT : 0;
5208           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
5209           if( rc==SQLITE_OK ){
5210             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5211             if( rc!=SQLITE_OK ){
5212               releasePage(*ppPage);
5213             }
5214           }
5215           searchList = 0;
5216         }
5217       }
5218       releasePage(pPrevTrunk);
5219       pPrevTrunk = 0;
5220     }while( searchList );
5221   }else{
5222     /* There are no pages on the freelist, so append a new page to the
5223     ** database image.
5224     **
5225     ** Normally, new pages allocated by this block can be requested from the
5226     ** pager layer with the 'no-content' flag set. This prevents the pager
5227     ** from trying to read the pages content from disk. However, if the
5228     ** current transaction has already run one or more incremental-vacuum
5229     ** steps, then the page we are about to allocate may contain content
5230     ** that is required in the event of a rollback. In this case, do
5231     ** not set the no-content flag. This causes the pager to load and journal
5232     ** the current page content before overwriting it.
5233     **
5234     ** Note that the pager will not actually attempt to load or journal
5235     ** content for any page that really does lie past the end of the database
5236     ** file on disk. So the effects of disabling the no-content optimization
5237     ** here are confined to those pages that lie between the end of the
5238     ** database image and the end of the database file.
5239     */
5240     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate)) ? PAGER_GET_NOCONTENT : 0;
5241 
5242     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5243     if( rc ) return rc;
5244     pBt->nPage++;
5245     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5246 
5247 #ifndef SQLITE_OMIT_AUTOVACUUM
5248     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5249       /* If *pPgno refers to a pointer-map page, allocate two new pages
5250       ** at the end of the file instead of one. The first allocated page
5251       ** becomes a new pointer-map page, the second is used by the caller.
5252       */
5253       MemPage *pPg = 0;
5254       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5255       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5256       rc = btreeGetPage(pBt, pBt->nPage, &pPg, bNoContent);
5257       if( rc==SQLITE_OK ){
5258         rc = sqlite3PagerWrite(pPg->pDbPage);
5259         releasePage(pPg);
5260       }
5261       if( rc ) return rc;
5262       pBt->nPage++;
5263       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5264     }
5265 #endif
5266     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5267     *pPgno = pBt->nPage;
5268 
5269     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5270     rc = btreeGetPage(pBt, *pPgno, ppPage, bNoContent);
5271     if( rc ) return rc;
5272     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5273     if( rc!=SQLITE_OK ){
5274       releasePage(*ppPage);
5275     }
5276     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5277   }
5278 
5279   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5280 
5281 end_allocate_page:
5282   releasePage(pTrunk);
5283   releasePage(pPrevTrunk);
5284   if( rc==SQLITE_OK ){
5285     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
5286       releasePage(*ppPage);
5287       *ppPage = 0;
5288       return SQLITE_CORRUPT_BKPT;
5289     }
5290     (*ppPage)->isInit = 0;
5291   }else{
5292     *ppPage = 0;
5293   }
5294   assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
5295   return rc;
5296 }
5297 
5298 /*
5299 ** This function is used to add page iPage to the database file free-list.
5300 ** It is assumed that the page is not already a part of the free-list.
5301 **
5302 ** The value passed as the second argument to this function is optional.
5303 ** If the caller happens to have a pointer to the MemPage object
5304 ** corresponding to page iPage handy, it may pass it as the second value.
5305 ** Otherwise, it may pass NULL.
5306 **
5307 ** If a pointer to a MemPage object is passed as the second argument,
5308 ** its reference count is not altered by this function.
5309 */
5310 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5311   MemPage *pTrunk = 0;                /* Free-list trunk page */
5312   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5313   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5314   MemPage *pPage;                     /* Page being freed. May be NULL. */
5315   int rc;                             /* Return Code */
5316   int nFree;                          /* Initial number of pages on free-list */
5317 
5318   assert( sqlite3_mutex_held(pBt->mutex) );
5319   assert( iPage>1 );
5320   assert( !pMemPage || pMemPage->pgno==iPage );
5321 
5322   if( pMemPage ){
5323     pPage = pMemPage;
5324     sqlite3PagerRef(pPage->pDbPage);
5325   }else{
5326     pPage = btreePageLookup(pBt, iPage);
5327   }
5328 
5329   /* Increment the free page count on pPage1 */
5330   rc = sqlite3PagerWrite(pPage1->pDbPage);
5331   if( rc ) goto freepage_out;
5332   nFree = get4byte(&pPage1->aData[36]);
5333   put4byte(&pPage1->aData[36], nFree+1);
5334 
5335   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5336     /* If the secure_delete option is enabled, then
5337     ** always fully overwrite deleted information with zeros.
5338     */
5339     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5340      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5341     ){
5342       goto freepage_out;
5343     }
5344     memset(pPage->aData, 0, pPage->pBt->pageSize);
5345   }
5346 
5347   /* If the database supports auto-vacuum, write an entry in the pointer-map
5348   ** to indicate that the page is free.
5349   */
5350   if( ISAUTOVACUUM ){
5351     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5352     if( rc ) goto freepage_out;
5353   }
5354 
5355   /* Now manipulate the actual database free-list structure. There are two
5356   ** possibilities. If the free-list is currently empty, or if the first
5357   ** trunk page in the free-list is full, then this page will become a
5358   ** new free-list trunk page. Otherwise, it will become a leaf of the
5359   ** first trunk page in the current free-list. This block tests if it
5360   ** is possible to add the page as a new free-list leaf.
5361   */
5362   if( nFree!=0 ){
5363     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5364 
5365     iTrunk = get4byte(&pPage1->aData[32]);
5366     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5367     if( rc!=SQLITE_OK ){
5368       goto freepage_out;
5369     }
5370 
5371     nLeaf = get4byte(&pTrunk->aData[4]);
5372     assert( pBt->usableSize>32 );
5373     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5374       rc = SQLITE_CORRUPT_BKPT;
5375       goto freepage_out;
5376     }
5377     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5378       /* In this case there is room on the trunk page to insert the page
5379       ** being freed as a new leaf.
5380       **
5381       ** Note that the trunk page is not really full until it contains
5382       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5383       ** coded.  But due to a coding error in versions of SQLite prior to
5384       ** 3.6.0, databases with freelist trunk pages holding more than
5385       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5386       ** to maintain backwards compatibility with older versions of SQLite,
5387       ** we will continue to restrict the number of entries to usableSize/4 - 8
5388       ** for now.  At some point in the future (once everyone has upgraded
5389       ** to 3.6.0 or later) we should consider fixing the conditional above
5390       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5391       */
5392       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5393       if( rc==SQLITE_OK ){
5394         put4byte(&pTrunk->aData[4], nLeaf+1);
5395         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5396         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5397           sqlite3PagerDontWrite(pPage->pDbPage);
5398         }
5399         rc = btreeSetHasContent(pBt, iPage);
5400       }
5401       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5402       goto freepage_out;
5403     }
5404   }
5405 
5406   /* If control flows to this point, then it was not possible to add the
5407   ** the page being freed as a leaf page of the first trunk in the free-list.
5408   ** Possibly because the free-list is empty, or possibly because the
5409   ** first trunk in the free-list is full. Either way, the page being freed
5410   ** will become the new first trunk page in the free-list.
5411   */
5412   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5413     goto freepage_out;
5414   }
5415   rc = sqlite3PagerWrite(pPage->pDbPage);
5416   if( rc!=SQLITE_OK ){
5417     goto freepage_out;
5418   }
5419   put4byte(pPage->aData, iTrunk);
5420   put4byte(&pPage->aData[4], 0);
5421   put4byte(&pPage1->aData[32], iPage);
5422   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5423 
5424 freepage_out:
5425   if( pPage ){
5426     pPage->isInit = 0;
5427   }
5428   releasePage(pPage);
5429   releasePage(pTrunk);
5430   return rc;
5431 }
5432 static void freePage(MemPage *pPage, int *pRC){
5433   if( (*pRC)==SQLITE_OK ){
5434     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5435   }
5436 }
5437 
5438 /*
5439 ** Free any overflow pages associated with the given Cell.
5440 */
5441 static int clearCell(MemPage *pPage, unsigned char *pCell){
5442   BtShared *pBt = pPage->pBt;
5443   CellInfo info;
5444   Pgno ovflPgno;
5445   int rc;
5446   int nOvfl;
5447   u32 ovflPageSize;
5448 
5449   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5450   btreeParseCellPtr(pPage, pCell, &info);
5451   if( info.iOverflow==0 ){
5452     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5453   }
5454   if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
5455     return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
5456   }
5457   ovflPgno = get4byte(&pCell[info.iOverflow]);
5458   assert( pBt->usableSize > 4 );
5459   ovflPageSize = pBt->usableSize - 4;
5460   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5461   assert( ovflPgno==0 || nOvfl>0 );
5462   while( nOvfl-- ){
5463     Pgno iNext = 0;
5464     MemPage *pOvfl = 0;
5465     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5466       /* 0 is not a legal page number and page 1 cannot be an
5467       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5468       ** file the database must be corrupt. */
5469       return SQLITE_CORRUPT_BKPT;
5470     }
5471     if( nOvfl ){
5472       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5473       if( rc ) return rc;
5474     }
5475 
5476     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5477      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5478     ){
5479       /* There is no reason any cursor should have an outstanding reference
5480       ** to an overflow page belonging to a cell that is being deleted/updated.
5481       ** So if there exists more than one reference to this page, then it
5482       ** must not really be an overflow page and the database must be corrupt.
5483       ** It is helpful to detect this before calling freePage2(), as
5484       ** freePage2() may zero the page contents if secure-delete mode is
5485       ** enabled. If this 'overflow' page happens to be a page that the
5486       ** caller is iterating through or using in some other way, this
5487       ** can be problematic.
5488       */
5489       rc = SQLITE_CORRUPT_BKPT;
5490     }else{
5491       rc = freePage2(pBt, pOvfl, ovflPgno);
5492     }
5493 
5494     if( pOvfl ){
5495       sqlite3PagerUnref(pOvfl->pDbPage);
5496     }
5497     if( rc ) return rc;
5498     ovflPgno = iNext;
5499   }
5500   return SQLITE_OK;
5501 }
5502 
5503 /*
5504 ** Create the byte sequence used to represent a cell on page pPage
5505 ** and write that byte sequence into pCell[].  Overflow pages are
5506 ** allocated and filled in as necessary.  The calling procedure
5507 ** is responsible for making sure sufficient space has been allocated
5508 ** for pCell[].
5509 **
5510 ** Note that pCell does not necessarily need to point to the pPage->aData
5511 ** area.  pCell might point to some temporary storage.  The cell will
5512 ** be constructed in this temporary area then copied into pPage->aData
5513 ** later.
5514 */
5515 static int fillInCell(
5516   MemPage *pPage,                /* The page that contains the cell */
5517   unsigned char *pCell,          /* Complete text of the cell */
5518   const void *pKey, i64 nKey,    /* The key */
5519   const void *pData,int nData,   /* The data */
5520   int nZero,                     /* Extra zero bytes to append to pData */
5521   int *pnSize                    /* Write cell size here */
5522 ){
5523   int nPayload;
5524   const u8 *pSrc;
5525   int nSrc, n, rc;
5526   int spaceLeft;
5527   MemPage *pOvfl = 0;
5528   MemPage *pToRelease = 0;
5529   unsigned char *pPrior;
5530   unsigned char *pPayload;
5531   BtShared *pBt = pPage->pBt;
5532   Pgno pgnoOvfl = 0;
5533   int nHeader;
5534   CellInfo info;
5535 
5536   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5537 
5538   /* pPage is not necessarily writeable since pCell might be auxiliary
5539   ** buffer space that is separate from the pPage buffer area */
5540   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5541             || sqlite3PagerIswriteable(pPage->pDbPage) );
5542 
5543   /* Fill in the header. */
5544   nHeader = 0;
5545   if( !pPage->leaf ){
5546     nHeader += 4;
5547   }
5548   if( pPage->hasData ){
5549     nHeader += putVarint32(&pCell[nHeader], nData+nZero);
5550   }else{
5551     nData = nZero = 0;
5552   }
5553   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5554   btreeParseCellPtr(pPage, pCell, &info);
5555   assert( info.nHeader==nHeader );
5556   assert( info.nKey==nKey );
5557   assert( info.nData==(u32)(nData+nZero) );
5558 
5559   /* Fill in the payload */
5560   nPayload = nData + nZero;
5561   if( pPage->intKey ){
5562     pSrc = pData;
5563     nSrc = nData;
5564     nData = 0;
5565   }else{
5566     if( NEVER(nKey>0x7fffffff || pKey==0) ){
5567       return SQLITE_CORRUPT_BKPT;
5568     }
5569     nPayload += (int)nKey;
5570     pSrc = pKey;
5571     nSrc = (int)nKey;
5572   }
5573   *pnSize = info.nSize;
5574   spaceLeft = info.nLocal;
5575   pPayload = &pCell[nHeader];
5576   pPrior = &pCell[info.iOverflow];
5577 
5578   while( nPayload>0 ){
5579     if( spaceLeft==0 ){
5580 #ifndef SQLITE_OMIT_AUTOVACUUM
5581       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5582       if( pBt->autoVacuum ){
5583         do{
5584           pgnoOvfl++;
5585         } while(
5586           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5587         );
5588       }
5589 #endif
5590       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5591 #ifndef SQLITE_OMIT_AUTOVACUUM
5592       /* If the database supports auto-vacuum, and the second or subsequent
5593       ** overflow page is being allocated, add an entry to the pointer-map
5594       ** for that page now.
5595       **
5596       ** If this is the first overflow page, then write a partial entry
5597       ** to the pointer-map. If we write nothing to this pointer-map slot,
5598       ** then the optimistic overflow chain processing in clearCell()
5599       ** may misinterpret the uninitialized values and delete the
5600       ** wrong pages from the database.
5601       */
5602       if( pBt->autoVacuum && rc==SQLITE_OK ){
5603         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5604         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5605         if( rc ){
5606           releasePage(pOvfl);
5607         }
5608       }
5609 #endif
5610       if( rc ){
5611         releasePage(pToRelease);
5612         return rc;
5613       }
5614 
5615       /* If pToRelease is not zero than pPrior points into the data area
5616       ** of pToRelease.  Make sure pToRelease is still writeable. */
5617       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5618 
5619       /* If pPrior is part of the data area of pPage, then make sure pPage
5620       ** is still writeable */
5621       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5622             || sqlite3PagerIswriteable(pPage->pDbPage) );
5623 
5624       put4byte(pPrior, pgnoOvfl);
5625       releasePage(pToRelease);
5626       pToRelease = pOvfl;
5627       pPrior = pOvfl->aData;
5628       put4byte(pPrior, 0);
5629       pPayload = &pOvfl->aData[4];
5630       spaceLeft = pBt->usableSize - 4;
5631     }
5632     n = nPayload;
5633     if( n>spaceLeft ) n = spaceLeft;
5634 
5635     /* If pToRelease is not zero than pPayload points into the data area
5636     ** of pToRelease.  Make sure pToRelease is still writeable. */
5637     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5638 
5639     /* If pPayload is part of the data area of pPage, then make sure pPage
5640     ** is still writeable */
5641     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5642             || sqlite3PagerIswriteable(pPage->pDbPage) );
5643 
5644     if( nSrc>0 ){
5645       if( n>nSrc ) n = nSrc;
5646       assert( pSrc );
5647       memcpy(pPayload, pSrc, n);
5648     }else{
5649       memset(pPayload, 0, n);
5650     }
5651     nPayload -= n;
5652     pPayload += n;
5653     pSrc += n;
5654     nSrc -= n;
5655     spaceLeft -= n;
5656     if( nSrc==0 ){
5657       nSrc = nData;
5658       pSrc = pData;
5659     }
5660   }
5661   releasePage(pToRelease);
5662   return SQLITE_OK;
5663 }
5664 
5665 /*
5666 ** Remove the i-th cell from pPage.  This routine affects pPage only.
5667 ** The cell content is not freed or deallocated.  It is assumed that
5668 ** the cell content has been copied someplace else.  This routine just
5669 ** removes the reference to the cell from pPage.
5670 **
5671 ** "sz" must be the number of bytes in the cell.
5672 */
5673 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5674   u32 pc;         /* Offset to cell content of cell being deleted */
5675   u8 *data;       /* pPage->aData */
5676   u8 *ptr;        /* Used to move bytes around within data[] */
5677   int rc;         /* The return code */
5678   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
5679 
5680   if( *pRC ) return;
5681 
5682   assert( idx>=0 && idx<pPage->nCell );
5683   assert( sz==cellSize(pPage, idx) );
5684   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5685   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5686   data = pPage->aData;
5687   ptr = &pPage->aCellIdx[2*idx];
5688   pc = get2byte(ptr);
5689   hdr = pPage->hdrOffset;
5690   testcase( pc==get2byte(&data[hdr+5]) );
5691   testcase( pc+sz==pPage->pBt->usableSize );
5692   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5693     *pRC = SQLITE_CORRUPT_BKPT;
5694     return;
5695   }
5696   rc = freeSpace(pPage, pc, sz);
5697   if( rc ){
5698     *pRC = rc;
5699     return;
5700   }
5701   pPage->nCell--;
5702   memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
5703   put2byte(&data[hdr+3], pPage->nCell);
5704   pPage->nFree += 2;
5705 }
5706 
5707 /*
5708 ** Insert a new cell on pPage at cell index "i".  pCell points to the
5709 ** content of the cell.
5710 **
5711 ** If the cell content will fit on the page, then put it there.  If it
5712 ** will not fit, then make a copy of the cell content into pTemp if
5713 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
5714 ** in pPage->apOvfl[] and make it point to the cell content (either
5715 ** in pTemp or the original pCell) and also record its index.
5716 ** Allocating a new entry in pPage->apOvfl[] implies that
5717 ** pPage->nOverflow is incremented.
5718 **
5719 ** nSkip is 4 when iChild is non-zero and 0 otherwise.  If nSkip is
5720 ** non-zero, then do not copy the first nSkip bytes of the cell; the child
5721 ** page number iChild is stored there instead.  In that case pCell might
5722 ** point to an invalid memory location (but pCell+nSkip is always valid).
5723 */
5724 static void insertCell(
5725   MemPage *pPage,   /* Page into which we are copying */
5726   int i,            /* New cell becomes the i-th cell of the page */
5727   u8 *pCell,        /* Content of the new cell */
5728   int sz,           /* Bytes of content in pCell */
5729   u8 *pTemp,        /* Temp storage space for pCell, if needed */
5730   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
5731   int *pRC          /* Read and write return code from here */
5732 ){
5733   int idx = 0;      /* Where to write new cell content in data[] */
5734   int j;            /* Loop counter */
5735   int end;          /* First byte past the last cell pointer in data[] */
5736   int ins;          /* Index in data[] where new cell pointer is inserted */
5737   int cellOffset;   /* Address of first cell pointer in data[] */
5738   u8 *data;         /* The content of the whole page */
5739   int nSkip = (iChild ? 4 : 0);
5740 
5741   if( *pRC ) return;
5742 
5743   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5744   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5745   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
5746   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
5747   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5748   /* The cell should normally be sized correctly.  However, when moving a
5749   ** malformed cell from a leaf page to an interior page, if the cell size
5750   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5751   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
5752   ** the term after the || in the following assert(). */
5753   assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5754   if( pPage->nOverflow || sz+2>pPage->nFree ){
5755     if( pTemp ){
5756       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5757       pCell = pTemp;
5758     }
5759     if( iChild ){
5760       put4byte(pCell, iChild);
5761     }
5762     j = pPage->nOverflow++;
5763     assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
5764     pPage->apOvfl[j] = pCell;
5765     pPage->aiOvfl[j] = (u16)i;
5766   }else{
5767     int rc = sqlite3PagerWrite(pPage->pDbPage);
5768     if( rc!=SQLITE_OK ){
5769       *pRC = rc;
5770       return;
5771     }
5772     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5773     data = pPage->aData;
5774     cellOffset = pPage->cellOffset;
5775     end = cellOffset + 2*pPage->nCell;
5776     ins = cellOffset + 2*i;
5777     rc = allocateSpace(pPage, sz, &idx);
5778     if( rc ){ *pRC = rc; return; }
5779     /* The allocateSpace() routine guarantees the following two properties
5780     ** if it returns success */
5781     assert( idx >= end+2 );
5782     assert( idx+sz <= (int)pPage->pBt->usableSize );
5783     pPage->nCell++;
5784     pPage->nFree -= (u16)(2 + sz);
5785     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5786     if( iChild ){
5787       put4byte(&data[idx], iChild);
5788     }
5789     memmove(&data[ins+2], &data[ins], end-ins);
5790     put2byte(&data[ins], idx);
5791     put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5792 #ifndef SQLITE_OMIT_AUTOVACUUM
5793     if( pPage->pBt->autoVacuum ){
5794       /* The cell may contain a pointer to an overflow page. If so, write
5795       ** the entry for the overflow page into the pointer map.
5796       */
5797       ptrmapPutOvflPtr(pPage, pCell, pRC);
5798     }
5799 #endif
5800   }
5801 }
5802 
5803 /*
5804 ** Add a list of cells to a page.  The page should be initially empty.
5805 ** The cells are guaranteed to fit on the page.
5806 */
5807 static void assemblePage(
5808   MemPage *pPage,   /* The page to be assemblied */
5809   int nCell,        /* The number of cells to add to this page */
5810   u8 **apCell,      /* Pointers to cell bodies */
5811   u16 *aSize        /* Sizes of the cells */
5812 ){
5813   int i;            /* Loop counter */
5814   u8 *pCellptr;     /* Address of next cell pointer */
5815   int cellbody;     /* Address of next cell body */
5816   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5817   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5818   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5819 
5820   assert( pPage->nOverflow==0 );
5821   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5822   assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
5823             && (int)MX_CELL(pPage->pBt)<=10921);
5824   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5825 
5826   /* Check that the page has just been zeroed by zeroPage() */
5827   assert( pPage->nCell==0 );
5828   assert( get2byteNotZero(&data[hdr+5])==nUsable );
5829 
5830   pCellptr = &pPage->aCellIdx[nCell*2];
5831   cellbody = nUsable;
5832   for(i=nCell-1; i>=0; i--){
5833     u16 sz = aSize[i];
5834     pCellptr -= 2;
5835     cellbody -= sz;
5836     put2byte(pCellptr, cellbody);
5837     memcpy(&data[cellbody], apCell[i], sz);
5838   }
5839   put2byte(&data[hdr+3], nCell);
5840   put2byte(&data[hdr+5], cellbody);
5841   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5842   pPage->nCell = (u16)nCell;
5843 }
5844 
5845 /*
5846 ** The following parameters determine how many adjacent pages get involved
5847 ** in a balancing operation.  NN is the number of neighbors on either side
5848 ** of the page that participate in the balancing operation.  NB is the
5849 ** total number of pages that participate, including the target page and
5850 ** NN neighbors on either side.
5851 **
5852 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
5853 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
5854 ** in exchange for a larger degradation in INSERT and UPDATE performance.
5855 ** A value of NN==1 appears to give the best results overall.
5856 */
5857 #define NN 1             /* Number of neighbors on either side of pPage */
5858 #define NB (NN*2+1)      /* Total pages balanced: pPage plus NN per side */
5859 
5860 
5861 #ifndef SQLITE_OMIT_QUICKBALANCE
5862 /*
5863 ** This version of balance() handles the common special case where
5864 ** a new entry is being inserted on the extreme right-end of the
5865 ** tree, in other words, when the new entry will become the largest
5866 ** entry in the tree.
5867 **
5868 ** Instead of trying to balance the 3 right-most leaf pages, just add
5869 ** a new page to the right-hand side and put the one new entry in
5870 ** that page.  This leaves the right side of the tree somewhat
5871 ** unbalanced.  But odds are that we will be inserting new entries
5872 ** at the end soon afterwards so the nearly empty page will quickly
5873 ** fill up.  On average.
5874 **
5875 ** pPage is the leaf page which is the right-most page in the tree.
5876 ** pParent is its parent.  pPage must have a single overflow entry
5877 ** which is also the right-most entry on the page.
5878 **
5879 ** The pSpace buffer is used to store a temporary copy of the divider
5880 ** cell that will be inserted into pParent. Such a cell consists of a 4
5881 ** byte page number followed by a variable length integer. In other
5882 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5883 ** least 13 bytes in size.
5884 */
5885 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5886   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
5887   MemPage *pNew;                       /* Newly allocated page */
5888   int rc;                              /* Return Code */
5889   Pgno pgnoNew;                        /* Page number of pNew */
5890 
5891   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5892   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5893   assert( pPage->nOverflow==1 );
5894 
5895   /* This error condition is now caught prior to reaching this function */
5896   if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;
5897 
5898   /* Allocate a new page. This page will become the right-sibling of
5899   ** pPage. Make the parent page writable, so that the new divider cell
5900   ** may be inserted. If both these operations are successful, proceed.
5901   */
5902   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5903 
5904   if( rc==SQLITE_OK ){
5905 
5906     u8 *pOut = &pSpace[4];
5907     u8 *pCell = pPage->apOvfl[0];
5908     u16 szCell = cellSizePtr(pPage, pCell);
5909     u8 *pStop;
5910 
5911     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5912     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5913     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5914     assemblePage(pNew, 1, &pCell, &szCell);
5915 
5916     /* If this is an auto-vacuum database, update the pointer map
5917     ** with entries for the new page, and any pointer from the
5918     ** cell on the page to an overflow page. If either of these
5919     ** operations fails, the return code is set, but the contents
5920     ** of the parent page are still manipulated by thh code below.
5921     ** That is Ok, at this point the parent page is guaranteed to
5922     ** be marked as dirty. Returning an error code will cause a
5923     ** rollback, undoing any changes made to the parent page.
5924     */
5925     if( ISAUTOVACUUM ){
5926       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5927       if( szCell>pNew->minLocal ){
5928         ptrmapPutOvflPtr(pNew, pCell, &rc);
5929       }
5930     }
5931 
5932     /* Create a divider cell to insert into pParent. The divider cell
5933     ** consists of a 4-byte page number (the page number of pPage) and
5934     ** a variable length key value (which must be the same value as the
5935     ** largest key on pPage).
5936     **
5937     ** To find the largest key value on pPage, first find the right-most
5938     ** cell on pPage. The first two fields of this cell are the
5939     ** record-length (a variable length integer at most 32-bits in size)
5940     ** and the key value (a variable length integer, may have any value).
5941     ** The first of the while(...) loops below skips over the record-length
5942     ** field. The second while(...) loop copies the key value from the
5943     ** cell on pPage into the pSpace buffer.
5944     */
5945     pCell = findCell(pPage, pPage->nCell-1);
5946     pStop = &pCell[9];
5947     while( (*(pCell++)&0x80) && pCell<pStop );
5948     pStop = &pCell[9];
5949     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5950 
5951     /* Insert the new divider cell into pParent. */
5952     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5953                0, pPage->pgno, &rc);
5954 
5955     /* Set the right-child pointer of pParent to point to the new page. */
5956     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5957 
5958     /* Release the reference to the new page. */
5959     releasePage(pNew);
5960   }
5961 
5962   return rc;
5963 }
5964 #endif /* SQLITE_OMIT_QUICKBALANCE */
5965 
5966 #if 0
5967 /*
5968 ** This function does not contribute anything to the operation of SQLite.
5969 ** it is sometimes activated temporarily while debugging code responsible
5970 ** for setting pointer-map entries.
5971 */
5972 static int ptrmapCheckPages(MemPage **apPage, int nPage){
5973   int i, j;
5974   for(i=0; i<nPage; i++){
5975     Pgno n;
5976     u8 e;
5977     MemPage *pPage = apPage[i];
5978     BtShared *pBt = pPage->pBt;
5979     assert( pPage->isInit );
5980 
5981     for(j=0; j<pPage->nCell; j++){
5982       CellInfo info;
5983       u8 *z;
5984 
5985       z = findCell(pPage, j);
5986       btreeParseCellPtr(pPage, z, &info);
5987       if( info.iOverflow ){
5988         Pgno ovfl = get4byte(&z[info.iOverflow]);
5989         ptrmapGet(pBt, ovfl, &e, &n);
5990         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
5991       }
5992       if( !pPage->leaf ){
5993         Pgno child = get4byte(z);
5994         ptrmapGet(pBt, child, &e, &n);
5995         assert( n==pPage->pgno && e==PTRMAP_BTREE );
5996       }
5997     }
5998     if( !pPage->leaf ){
5999       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6000       ptrmapGet(pBt, child, &e, &n);
6001       assert( n==pPage->pgno && e==PTRMAP_BTREE );
6002     }
6003   }
6004   return 1;
6005 }
6006 #endif
6007 
6008 /*
6009 ** This function is used to copy the contents of the b-tree node stored
6010 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
6011 ** the pointer-map entries for each child page are updated so that the
6012 ** parent page stored in the pointer map is page pTo. If pFrom contained
6013 ** any cells with overflow page pointers, then the corresponding pointer
6014 ** map entries are also updated so that the parent page is page pTo.
6015 **
6016 ** If pFrom is currently carrying any overflow cells (entries in the
6017 ** MemPage.apOvfl[] array), they are not copied to pTo.
6018 **
6019 ** Before returning, page pTo is reinitialized using btreeInitPage().
6020 **
6021 ** The performance of this function is not critical. It is only used by
6022 ** the balance_shallower() and balance_deeper() procedures, neither of
6023 ** which are called often under normal circumstances.
6024 */
6025 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
6026   if( (*pRC)==SQLITE_OK ){
6027     BtShared * const pBt = pFrom->pBt;
6028     u8 * const aFrom = pFrom->aData;
6029     u8 * const aTo = pTo->aData;
6030     int const iFromHdr = pFrom->hdrOffset;
6031     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
6032     int rc;
6033     int iData;
6034 
6035 
6036     assert( pFrom->isInit );
6037     assert( pFrom->nFree>=iToHdr );
6038     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
6039 
6040     /* Copy the b-tree node content from page pFrom to page pTo. */
6041     iData = get2byte(&aFrom[iFromHdr+5]);
6042     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
6043     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
6044 
6045     /* Reinitialize page pTo so that the contents of the MemPage structure
6046     ** match the new data. The initialization of pTo can actually fail under
6047     ** fairly obscure circumstances, even though it is a copy of initialized
6048     ** page pFrom.
6049     */
6050     pTo->isInit = 0;
6051     rc = btreeInitPage(pTo);
6052     if( rc!=SQLITE_OK ){
6053       *pRC = rc;
6054       return;
6055     }
6056 
6057     /* If this is an auto-vacuum database, update the pointer-map entries
6058     ** for any b-tree or overflow pages that pTo now contains the pointers to.
6059     */
6060     if( ISAUTOVACUUM ){
6061       *pRC = setChildPtrmaps(pTo);
6062     }
6063   }
6064 }
6065 
6066 /*
6067 ** This routine redistributes cells on the iParentIdx'th child of pParent
6068 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
6069 ** same amount of free space. Usually a single sibling on either side of the
6070 ** page is used in the balancing, though both siblings might come from one
6071 ** side if the page is the first or last child of its parent. If the page
6072 ** has fewer than 2 siblings (something which can only happen if the page
6073 ** is a root page or a child of a root page) then all available siblings
6074 ** participate in the balancing.
6075 **
6076 ** The number of siblings of the page might be increased or decreased by
6077 ** one or two in an effort to keep pages nearly full but not over full.
6078 **
6079 ** Note that when this routine is called, some of the cells on the page
6080 ** might not actually be stored in MemPage.aData[]. This can happen
6081 ** if the page is overfull. This routine ensures that all cells allocated
6082 ** to the page and its siblings fit into MemPage.aData[] before returning.
6083 **
6084 ** In the course of balancing the page and its siblings, cells may be
6085 ** inserted into or removed from the parent page (pParent). Doing so
6086 ** may cause the parent page to become overfull or underfull. If this
6087 ** happens, it is the responsibility of the caller to invoke the correct
6088 ** balancing routine to fix this problem (see the balance() routine).
6089 **
6090 ** If this routine fails for any reason, it might leave the database
6091 ** in a corrupted state. So if this routine fails, the database should
6092 ** be rolled back.
6093 **
6094 ** The third argument to this function, aOvflSpace, is a pointer to a
6095 ** buffer big enough to hold one page. If while inserting cells into the parent
6096 ** page (pParent) the parent page becomes overfull, this buffer is
6097 ** used to store the parent's overflow cells. Because this function inserts
6098 ** a maximum of four divider cells into the parent page, and the maximum
6099 ** size of a cell stored within an internal node is always less than 1/4
6100 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
6101 ** enough for all overflow cells.
6102 **
6103 ** If aOvflSpace is set to a null pointer, this function returns
6104 ** SQLITE_NOMEM.
6105 */
6106 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
6107 #pragma optimize("", off)
6108 #endif
6109 static int balance_nonroot(
6110   MemPage *pParent,               /* Parent page of siblings being balanced */
6111   int iParentIdx,                 /* Index of "the page" in pParent */
6112   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
6113   int isRoot,                     /* True if pParent is a root-page */
6114   int bBulk                       /* True if this call is part of a bulk load */
6115 ){
6116   BtShared *pBt;               /* The whole database */
6117   int nCell = 0;               /* Number of cells in apCell[] */
6118   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
6119   int nNew = 0;                /* Number of pages in apNew[] */
6120   int nOld;                    /* Number of pages in apOld[] */
6121   int i, j, k;                 /* Loop counters */
6122   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
6123   int rc = SQLITE_OK;          /* The return code */
6124   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
6125   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
6126   int usableSpace;             /* Bytes in pPage beyond the header */
6127   int pageFlags;               /* Value of pPage->aData[0] */
6128   int subtotal;                /* Subtotal of bytes in cells on one page */
6129   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
6130   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
6131   int szScratch;               /* Size of scratch memory requested */
6132   MemPage *apOld[NB];          /* pPage and up to two siblings */
6133   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
6134   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
6135   u8 *pRight;                  /* Location in parent of right-sibling pointer */
6136   u8 *apDiv[NB-1];             /* Divider cells in pParent */
6137   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
6138   int szNew[NB+2];             /* Combined size of cells place on i-th page */
6139   u8 **apCell = 0;             /* All cells begin balanced */
6140   u16 *szCell;                 /* Local size of all cells in apCell[] */
6141   u8 *aSpace1;                 /* Space for copies of dividers cells */
6142   Pgno pgno;                   /* Temp var to store a page number in */
6143 
6144   pBt = pParent->pBt;
6145   assert( sqlite3_mutex_held(pBt->mutex) );
6146   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6147 
6148 #if 0
6149   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
6150 #endif
6151 
6152   /* At this point pParent may have at most one overflow cell. And if
6153   ** this overflow cell is present, it must be the cell with
6154   ** index iParentIdx. This scenario comes about when this function
6155   ** is called (indirectly) from sqlite3BtreeDelete().
6156   */
6157   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
6158   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
6159 
6160   if( !aOvflSpace ){
6161     return SQLITE_NOMEM;
6162   }
6163 
6164   /* Find the sibling pages to balance. Also locate the cells in pParent
6165   ** that divide the siblings. An attempt is made to find NN siblings on
6166   ** either side of pPage. More siblings are taken from one side, however,
6167   ** if there are fewer than NN siblings on the other side. If pParent
6168   ** has NB or fewer children then all children of pParent are taken.
6169   **
6170   ** This loop also drops the divider cells from the parent page. This
6171   ** way, the remainder of the function does not have to deal with any
6172   ** overflow cells in the parent page, since if any existed they will
6173   ** have already been removed.
6174   */
6175   i = pParent->nOverflow + pParent->nCell;
6176   if( i<2 ){
6177     nxDiv = 0;
6178   }else{
6179     assert( bBulk==0 || bBulk==1 );
6180     if( iParentIdx==0 ){
6181       nxDiv = 0;
6182     }else if( iParentIdx==i ){
6183       nxDiv = i-2+bBulk;
6184     }else{
6185       assert( bBulk==0 );
6186       nxDiv = iParentIdx-1;
6187     }
6188     i = 2-bBulk;
6189   }
6190   nOld = i+1;
6191   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
6192     pRight = &pParent->aData[pParent->hdrOffset+8];
6193   }else{
6194     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
6195   }
6196   pgno = get4byte(pRight);
6197   while( 1 ){
6198     rc = getAndInitPage(pBt, pgno, &apOld[i], 0);
6199     if( rc ){
6200       memset(apOld, 0, (i+1)*sizeof(MemPage*));
6201       goto balance_cleanup;
6202     }
6203     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
6204     if( (i--)==0 ) break;
6205 
6206     if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){
6207       apDiv[i] = pParent->apOvfl[0];
6208       pgno = get4byte(apDiv[i]);
6209       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6210       pParent->nOverflow = 0;
6211     }else{
6212       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
6213       pgno = get4byte(apDiv[i]);
6214       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6215 
6216       /* Drop the cell from the parent page. apDiv[i] still points to
6217       ** the cell within the parent, even though it has been dropped.
6218       ** This is safe because dropping a cell only overwrites the first
6219       ** four bytes of it, and this function does not need the first
6220       ** four bytes of the divider cell. So the pointer is safe to use
6221       ** later on.
6222       **
6223       ** But not if we are in secure-delete mode. In secure-delete mode,
6224       ** the dropCell() routine will overwrite the entire cell with zeroes.
6225       ** In this case, temporarily copy the cell into the aOvflSpace[]
6226       ** buffer. It will be copied out again as soon as the aSpace[] buffer
6227       ** is allocated.  */
6228       if( pBt->btsFlags & BTS_SECURE_DELETE ){
6229         int iOff;
6230 
6231         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
6232         if( (iOff+szNew[i])>(int)pBt->usableSize ){
6233           rc = SQLITE_CORRUPT_BKPT;
6234           memset(apOld, 0, (i+1)*sizeof(MemPage*));
6235           goto balance_cleanup;
6236         }else{
6237           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
6238           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
6239         }
6240       }
6241       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
6242     }
6243   }
6244 
6245   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
6246   ** alignment */
6247   nMaxCells = (nMaxCells + 3)&~3;
6248 
6249   /*
6250   ** Allocate space for memory structures
6251   */
6252   k = pBt->pageSize + ROUND8(sizeof(MemPage));
6253   szScratch =
6254        nMaxCells*sizeof(u8*)                       /* apCell */
6255      + nMaxCells*sizeof(u16)                       /* szCell */
6256      + pBt->pageSize                               /* aSpace1 */
6257      + k*nOld;                                     /* Page copies (apCopy) */
6258   apCell = sqlite3ScratchMalloc( szScratch );
6259   if( apCell==0 ){
6260     rc = SQLITE_NOMEM;
6261     goto balance_cleanup;
6262   }
6263   szCell = (u16*)&apCell[nMaxCells];
6264   aSpace1 = (u8*)&szCell[nMaxCells];
6265   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
6266 
6267   /*
6268   ** Load pointers to all cells on sibling pages and the divider cells
6269   ** into the local apCell[] array.  Make copies of the divider cells
6270   ** into space obtained from aSpace1[] and remove the divider cells
6271   ** from pParent.
6272   **
6273   ** If the siblings are on leaf pages, then the child pointers of the
6274   ** divider cells are stripped from the cells before they are copied
6275   ** into aSpace1[].  In this way, all cells in apCell[] are without
6276   ** child pointers.  If siblings are not leaves, then all cell in
6277   ** apCell[] include child pointers.  Either way, all cells in apCell[]
6278   ** are alike.
6279   **
6280   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
6281   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
6282   */
6283   leafCorrection = apOld[0]->leaf*4;
6284   leafData = apOld[0]->hasData;
6285   for(i=0; i<nOld; i++){
6286     int limit;
6287 
6288     /* Before doing anything else, take a copy of the i'th original sibling
6289     ** The rest of this function will use data from the copies rather
6290     ** that the original pages since the original pages will be in the
6291     ** process of being overwritten.  */
6292     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6293     memcpy(pOld, apOld[i], sizeof(MemPage));
6294     pOld->aData = (void*)&pOld[1];
6295     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6296 
6297     limit = pOld->nCell+pOld->nOverflow;
6298     if( pOld->nOverflow>0 ){
6299       for(j=0; j<limit; j++){
6300         assert( nCell<nMaxCells );
6301         apCell[nCell] = findOverflowCell(pOld, j);
6302         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6303         nCell++;
6304       }
6305     }else{
6306       u8 *aData = pOld->aData;
6307       u16 maskPage = pOld->maskPage;
6308       u16 cellOffset = pOld->cellOffset;
6309       for(j=0; j<limit; j++){
6310         assert( nCell<nMaxCells );
6311         apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
6312         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6313         nCell++;
6314       }
6315     }
6316     if( i<nOld-1 && !leafData){
6317       u16 sz = (u16)szNew[i];
6318       u8 *pTemp;
6319       assert( nCell<nMaxCells );
6320       szCell[nCell] = sz;
6321       pTemp = &aSpace1[iSpace1];
6322       iSpace1 += sz;
6323       assert( sz<=pBt->maxLocal+23 );
6324       assert( iSpace1 <= (int)pBt->pageSize );
6325       memcpy(pTemp, apDiv[i], sz);
6326       apCell[nCell] = pTemp+leafCorrection;
6327       assert( leafCorrection==0 || leafCorrection==4 );
6328       szCell[nCell] = szCell[nCell] - leafCorrection;
6329       if( !pOld->leaf ){
6330         assert( leafCorrection==0 );
6331         assert( pOld->hdrOffset==0 );
6332         /* The right pointer of the child page pOld becomes the left
6333         ** pointer of the divider cell */
6334         memcpy(apCell[nCell], &pOld->aData[8], 4);
6335       }else{
6336         assert( leafCorrection==4 );
6337         if( szCell[nCell]<4 ){
6338           /* Do not allow any cells smaller than 4 bytes. */
6339           szCell[nCell] = 4;
6340         }
6341       }
6342       nCell++;
6343     }
6344   }
6345 
6346   /*
6347   ** Figure out the number of pages needed to hold all nCell cells.
6348   ** Store this number in "k".  Also compute szNew[] which is the total
6349   ** size of all cells on the i-th page and cntNew[] which is the index
6350   ** in apCell[] of the cell that divides page i from page i+1.
6351   ** cntNew[k] should equal nCell.
6352   **
6353   ** Values computed by this block:
6354   **
6355   **           k: The total number of sibling pages
6356   **    szNew[i]: Spaced used on the i-th sibling page.
6357   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6358   **              the right of the i-th sibling page.
6359   ** usableSpace: Number of bytes of space available on each sibling.
6360   **
6361   */
6362   usableSpace = pBt->usableSize - 12 + leafCorrection;
6363   for(subtotal=k=i=0; i<nCell; i++){
6364     assert( i<nMaxCells );
6365     subtotal += szCell[i] + 2;
6366     if( subtotal > usableSpace ){
6367       szNew[k] = subtotal - szCell[i];
6368       cntNew[k] = i;
6369       if( leafData ){ i--; }
6370       subtotal = 0;
6371       k++;
6372       if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6373     }
6374   }
6375   szNew[k] = subtotal;
6376   cntNew[k] = nCell;
6377   k++;
6378 
6379   /*
6380   ** The packing computed by the previous block is biased toward the siblings
6381   ** on the left side.  The left siblings are always nearly full, while the
6382   ** right-most sibling might be nearly empty.  This block of code attempts
6383   ** to adjust the packing of siblings to get a better balance.
6384   **
6385   ** This adjustment is more than an optimization.  The packing above might
6386   ** be so out of balance as to be illegal.  For example, the right-most
6387   ** sibling might be completely empty.  This adjustment is not optional.
6388   */
6389   for(i=k-1; i>0; i--){
6390     int szRight = szNew[i];  /* Size of sibling on the right */
6391     int szLeft = szNew[i-1]; /* Size of sibling on the left */
6392     int r;              /* Index of right-most cell in left sibling */
6393     int d;              /* Index of first cell to the left of right sibling */
6394 
6395     r = cntNew[i-1] - 1;
6396     d = r + 1 - leafData;
6397     assert( d<nMaxCells );
6398     assert( r<nMaxCells );
6399     while( szRight==0
6400        || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2))
6401     ){
6402       szRight += szCell[d] + 2;
6403       szLeft -= szCell[r] + 2;
6404       cntNew[i-1]--;
6405       r = cntNew[i-1] - 1;
6406       d = r + 1 - leafData;
6407     }
6408     szNew[i] = szRight;
6409     szNew[i-1] = szLeft;
6410   }
6411 
6412   /* Either we found one or more cells (cntnew[0])>0) or pPage is
6413   ** a virtual root page.  A virtual root page is when the real root
6414   ** page is page 1 and we are the only child of that page.
6415   **
6416   ** UPDATE:  The assert() below is not necessarily true if the database
6417   ** file is corrupt.  The corruption will be detected and reported later
6418   ** in this procedure so there is no need to act upon it now.
6419   */
6420 #if 0
6421   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6422 #endif
6423 
6424   TRACE(("BALANCE: old: %d %d %d  ",
6425     apOld[0]->pgno,
6426     nOld>=2 ? apOld[1]->pgno : 0,
6427     nOld>=3 ? apOld[2]->pgno : 0
6428   ));
6429 
6430   /*
6431   ** Allocate k new pages.  Reuse old pages where possible.
6432   */
6433   if( apOld[0]->pgno<=1 ){
6434     rc = SQLITE_CORRUPT_BKPT;
6435     goto balance_cleanup;
6436   }
6437   pageFlags = apOld[0]->aData[0];
6438   for(i=0; i<k; i++){
6439     MemPage *pNew;
6440     if( i<nOld ){
6441       pNew = apNew[i] = apOld[i];
6442       apOld[i] = 0;
6443       rc = sqlite3PagerWrite(pNew->pDbPage);
6444       nNew++;
6445       if( rc ) goto balance_cleanup;
6446     }else{
6447       assert( i>0 );
6448       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
6449       if( rc ) goto balance_cleanup;
6450       apNew[i] = pNew;
6451       nNew++;
6452 
6453       /* Set the pointer-map entry for the new sibling page. */
6454       if( ISAUTOVACUUM ){
6455         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6456         if( rc!=SQLITE_OK ){
6457           goto balance_cleanup;
6458         }
6459       }
6460     }
6461   }
6462 
6463   /* Free any old pages that were not reused as new pages.
6464   */
6465   while( i<nOld ){
6466     freePage(apOld[i], &rc);
6467     if( rc ) goto balance_cleanup;
6468     releasePage(apOld[i]);
6469     apOld[i] = 0;
6470     i++;
6471   }
6472 
6473   /*
6474   ** Put the new pages in accending order.  This helps to
6475   ** keep entries in the disk file in order so that a scan
6476   ** of the table is a linear scan through the file.  That
6477   ** in turn helps the operating system to deliver pages
6478   ** from the disk more rapidly.
6479   **
6480   ** An O(n^2) insertion sort algorithm is used, but since
6481   ** n is never more than NB (a small constant), that should
6482   ** not be a problem.
6483   **
6484   ** When NB==3, this one optimization makes the database
6485   ** about 25% faster for large insertions and deletions.
6486   */
6487   for(i=0; i<k-1; i++){
6488     int minV = apNew[i]->pgno;
6489     int minI = i;
6490     for(j=i+1; j<k; j++){
6491       if( apNew[j]->pgno<(unsigned)minV ){
6492         minI = j;
6493         minV = apNew[j]->pgno;
6494       }
6495     }
6496     if( minI>i ){
6497       MemPage *pT;
6498       pT = apNew[i];
6499       apNew[i] = apNew[minI];
6500       apNew[minI] = pT;
6501     }
6502   }
6503   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6504     apNew[0]->pgno, szNew[0],
6505     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6506     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6507     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6508     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6509 
6510   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6511   put4byte(pRight, apNew[nNew-1]->pgno);
6512 
6513   /*
6514   ** Evenly distribute the data in apCell[] across the new pages.
6515   ** Insert divider cells into pParent as necessary.
6516   */
6517   j = 0;
6518   for(i=0; i<nNew; i++){
6519     /* Assemble the new sibling page. */
6520     MemPage *pNew = apNew[i];
6521     assert( j<nMaxCells );
6522     zeroPage(pNew, pageFlags);
6523     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6524     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6525     assert( pNew->nOverflow==0 );
6526 
6527     j = cntNew[i];
6528 
6529     /* If the sibling page assembled above was not the right-most sibling,
6530     ** insert a divider cell into the parent page.
6531     */
6532     assert( i<nNew-1 || j==nCell );
6533     if( j<nCell ){
6534       u8 *pCell;
6535       u8 *pTemp;
6536       int sz;
6537 
6538       assert( j<nMaxCells );
6539       pCell = apCell[j];
6540       sz = szCell[j] + leafCorrection;
6541       pTemp = &aOvflSpace[iOvflSpace];
6542       if( !pNew->leaf ){
6543         memcpy(&pNew->aData[8], pCell, 4);
6544       }else if( leafData ){
6545         /* If the tree is a leaf-data tree, and the siblings are leaves,
6546         ** then there is no divider cell in apCell[]. Instead, the divider
6547         ** cell consists of the integer key for the right-most cell of
6548         ** the sibling-page assembled above only.
6549         */
6550         CellInfo info;
6551         j--;
6552         btreeParseCellPtr(pNew, apCell[j], &info);
6553         pCell = pTemp;
6554         sz = 4 + putVarint(&pCell[4], info.nKey);
6555         pTemp = 0;
6556       }else{
6557         pCell -= 4;
6558         /* Obscure case for non-leaf-data trees: If the cell at pCell was
6559         ** previously stored on a leaf node, and its reported size was 4
6560         ** bytes, then it may actually be smaller than this
6561         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6562         ** any cell). But it is important to pass the correct size to
6563         ** insertCell(), so reparse the cell now.
6564         **
6565         ** Note that this can never happen in an SQLite data file, as all
6566         ** cells are at least 4 bytes. It only happens in b-trees used
6567         ** to evaluate "IN (SELECT ...)" and similar clauses.
6568         */
6569         if( szCell[j]==4 ){
6570           assert(leafCorrection==4);
6571           sz = cellSizePtr(pParent, pCell);
6572         }
6573       }
6574       iOvflSpace += sz;
6575       assert( sz<=pBt->maxLocal+23 );
6576       assert( iOvflSpace <= (int)pBt->pageSize );
6577       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6578       if( rc!=SQLITE_OK ) goto balance_cleanup;
6579       assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6580 
6581       j++;
6582       nxDiv++;
6583     }
6584   }
6585   assert( j==nCell );
6586   assert( nOld>0 );
6587   assert( nNew>0 );
6588   if( (pageFlags & PTF_LEAF)==0 ){
6589     u8 *zChild = &apCopy[nOld-1]->aData[8];
6590     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6591   }
6592 
6593   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6594     /* The root page of the b-tree now contains no cells. The only sibling
6595     ** page is the right-child of the parent. Copy the contents of the
6596     ** child page into the parent, decreasing the overall height of the
6597     ** b-tree structure by one. This is described as the "balance-shallower"
6598     ** sub-algorithm in some documentation.
6599     **
6600     ** If this is an auto-vacuum database, the call to copyNodeContent()
6601     ** sets all pointer-map entries corresponding to database image pages
6602     ** for which the pointer is stored within the content being copied.
6603     **
6604     ** The second assert below verifies that the child page is defragmented
6605     ** (it must be, as it was just reconstructed using assemblePage()). This
6606     ** is important if the parent page happens to be page 1 of the database
6607     ** image.  */
6608     assert( nNew==1 );
6609     assert( apNew[0]->nFree ==
6610         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6611     );
6612     copyNodeContent(apNew[0], pParent, &rc);
6613     freePage(apNew[0], &rc);
6614   }else if( ISAUTOVACUUM ){
6615     /* Fix the pointer-map entries for all the cells that were shifted around.
6616     ** There are several different types of pointer-map entries that need to
6617     ** be dealt with by this routine. Some of these have been set already, but
6618     ** many have not. The following is a summary:
6619     **
6620     **   1) The entries associated with new sibling pages that were not
6621     **      siblings when this function was called. These have already
6622     **      been set. We don't need to worry about old siblings that were
6623     **      moved to the free-list - the freePage() code has taken care
6624     **      of those.
6625     **
6626     **   2) The pointer-map entries associated with the first overflow
6627     **      page in any overflow chains used by new divider cells. These
6628     **      have also already been taken care of by the insertCell() code.
6629     **
6630     **   3) If the sibling pages are not leaves, then the child pages of
6631     **      cells stored on the sibling pages may need to be updated.
6632     **
6633     **   4) If the sibling pages are not internal intkey nodes, then any
6634     **      overflow pages used by these cells may need to be updated
6635     **      (internal intkey nodes never contain pointers to overflow pages).
6636     **
6637     **   5) If the sibling pages are not leaves, then the pointer-map
6638     **      entries for the right-child pages of each sibling may need
6639     **      to be updated.
6640     **
6641     ** Cases 1 and 2 are dealt with above by other code. The next
6642     ** block deals with cases 3 and 4 and the one after that, case 5. Since
6643     ** setting a pointer map entry is a relatively expensive operation, this
6644     ** code only sets pointer map entries for child or overflow pages that have
6645     ** actually moved between pages.  */
6646     MemPage *pNew = apNew[0];
6647     MemPage *pOld = apCopy[0];
6648     int nOverflow = pOld->nOverflow;
6649     int iNextOld = pOld->nCell + nOverflow;
6650     int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
6651     j = 0;                             /* Current 'old' sibling page */
6652     k = 0;                             /* Current 'new' sibling page */
6653     for(i=0; i<nCell; i++){
6654       int isDivider = 0;
6655       while( i==iNextOld ){
6656         /* Cell i is the cell immediately following the last cell on old
6657         ** sibling page j. If the siblings are not leaf pages of an
6658         ** intkey b-tree, then cell i was a divider cell. */
6659         assert( j+1 < ArraySize(apCopy) );
6660         assert( j+1 < nOld );
6661         pOld = apCopy[++j];
6662         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6663         if( pOld->nOverflow ){
6664           nOverflow = pOld->nOverflow;
6665           iOverflow = i + !leafData + pOld->aiOvfl[0];
6666         }
6667         isDivider = !leafData;
6668       }
6669 
6670       assert(nOverflow>0 || iOverflow<i );
6671       assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
6672       assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
6673       if( i==iOverflow ){
6674         isDivider = 1;
6675         if( (--nOverflow)>0 ){
6676           iOverflow++;
6677         }
6678       }
6679 
6680       if( i==cntNew[k] ){
6681         /* Cell i is the cell immediately following the last cell on new
6682         ** sibling page k. If the siblings are not leaf pages of an
6683         ** intkey b-tree, then cell i is a divider cell.  */
6684         pNew = apNew[++k];
6685         if( !leafData ) continue;
6686       }
6687       assert( j<nOld );
6688       assert( k<nNew );
6689 
6690       /* If the cell was originally divider cell (and is not now) or
6691       ** an overflow cell, or if the cell was located on a different sibling
6692       ** page before the balancing, then the pointer map entries associated
6693       ** with any child or overflow pages need to be updated.  */
6694       if( isDivider || pOld->pgno!=pNew->pgno ){
6695         if( !leafCorrection ){
6696           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6697         }
6698         if( szCell[i]>pNew->minLocal ){
6699           ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6700         }
6701       }
6702     }
6703 
6704     if( !leafCorrection ){
6705       for(i=0; i<nNew; i++){
6706         u32 key = get4byte(&apNew[i]->aData[8]);
6707         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6708       }
6709     }
6710 
6711 #if 0
6712     /* The ptrmapCheckPages() contains assert() statements that verify that
6713     ** all pointer map pages are set correctly. This is helpful while
6714     ** debugging. This is usually disabled because a corrupt database may
6715     ** cause an assert() statement to fail.  */
6716     ptrmapCheckPages(apNew, nNew);
6717     ptrmapCheckPages(&pParent, 1);
6718 #endif
6719   }
6720 
6721   assert( pParent->isInit );
6722   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6723           nOld, nNew, nCell));
6724 
6725   /*
6726   ** Cleanup before returning.
6727   */
6728 balance_cleanup:
6729   sqlite3ScratchFree(apCell);
6730   for(i=0; i<nOld; i++){
6731     releasePage(apOld[i]);
6732   }
6733   for(i=0; i<nNew; i++){
6734     releasePage(apNew[i]);
6735   }
6736 
6737   return rc;
6738 }
6739 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
6740 #pragma optimize("", on)
6741 #endif
6742 
6743 
6744 /*
6745 ** This function is called when the root page of a b-tree structure is
6746 ** overfull (has one or more overflow pages).
6747 **
6748 ** A new child page is allocated and the contents of the current root
6749 ** page, including overflow cells, are copied into the child. The root
6750 ** page is then overwritten to make it an empty page with the right-child
6751 ** pointer pointing to the new page.
6752 **
6753 ** Before returning, all pointer-map entries corresponding to pages
6754 ** that the new child-page now contains pointers to are updated. The
6755 ** entry corresponding to the new right-child pointer of the root
6756 ** page is also updated.
6757 **
6758 ** If successful, *ppChild is set to contain a reference to the child
6759 ** page and SQLITE_OK is returned. In this case the caller is required
6760 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6761 ** an error code is returned and *ppChild is set to 0.
6762 */
6763 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6764   int rc;                        /* Return value from subprocedures */
6765   MemPage *pChild = 0;           /* Pointer to a new child page */
6766   Pgno pgnoChild = 0;            /* Page number of the new child page */
6767   BtShared *pBt = pRoot->pBt;    /* The BTree */
6768 
6769   assert( pRoot->nOverflow>0 );
6770   assert( sqlite3_mutex_held(pBt->mutex) );
6771 
6772   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6773   ** page that will become the new right-child of pPage. Copy the contents
6774   ** of the node stored on pRoot into the new child page.
6775   */
6776   rc = sqlite3PagerWrite(pRoot->pDbPage);
6777   if( rc==SQLITE_OK ){
6778     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6779     copyNodeContent(pRoot, pChild, &rc);
6780     if( ISAUTOVACUUM ){
6781       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6782     }
6783   }
6784   if( rc ){
6785     *ppChild = 0;
6786     releasePage(pChild);
6787     return rc;
6788   }
6789   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6790   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6791   assert( pChild->nCell==pRoot->nCell );
6792 
6793   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6794 
6795   /* Copy the overflow cells from pRoot to pChild */
6796   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
6797          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
6798   memcpy(pChild->apOvfl, pRoot->apOvfl,
6799          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
6800   pChild->nOverflow = pRoot->nOverflow;
6801 
6802   /* Zero the contents of pRoot. Then install pChild as the right-child. */
6803   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6804   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6805 
6806   *ppChild = pChild;
6807   return SQLITE_OK;
6808 }
6809 
6810 /*
6811 ** The page that pCur currently points to has just been modified in
6812 ** some way. This function figures out if this modification means the
6813 ** tree needs to be balanced, and if so calls the appropriate balancing
6814 ** routine. Balancing routines are:
6815 **
6816 **   balance_quick()
6817 **   balance_deeper()
6818 **   balance_nonroot()
6819 */
6820 static int balance(BtCursor *pCur){
6821   int rc = SQLITE_OK;
6822   const int nMin = pCur->pBt->usableSize * 2 / 3;
6823   u8 aBalanceQuickSpace[13];
6824   u8 *pFree = 0;
6825 
6826   TESTONLY( int balance_quick_called = 0 );
6827   TESTONLY( int balance_deeper_called = 0 );
6828 
6829   do {
6830     int iPage = pCur->iPage;
6831     MemPage *pPage = pCur->apPage[iPage];
6832 
6833     if( iPage==0 ){
6834       if( pPage->nOverflow ){
6835         /* The root page of the b-tree is overfull. In this case call the
6836         ** balance_deeper() function to create a new child for the root-page
6837         ** and copy the current contents of the root-page to it. The
6838         ** next iteration of the do-loop will balance the child page.
6839         */
6840         assert( (balance_deeper_called++)==0 );
6841         rc = balance_deeper(pPage, &pCur->apPage[1]);
6842         if( rc==SQLITE_OK ){
6843           pCur->iPage = 1;
6844           pCur->aiIdx[0] = 0;
6845           pCur->aiIdx[1] = 0;
6846           assert( pCur->apPage[1]->nOverflow );
6847         }
6848       }else{
6849         break;
6850       }
6851     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6852       break;
6853     }else{
6854       MemPage * const pParent = pCur->apPage[iPage-1];
6855       int const iIdx = pCur->aiIdx[iPage-1];
6856 
6857       rc = sqlite3PagerWrite(pParent->pDbPage);
6858       if( rc==SQLITE_OK ){
6859 #ifndef SQLITE_OMIT_QUICKBALANCE
6860         if( pPage->hasData
6861          && pPage->nOverflow==1
6862          && pPage->aiOvfl[0]==pPage->nCell
6863          && pParent->pgno!=1
6864          && pParent->nCell==iIdx
6865         ){
6866           /* Call balance_quick() to create a new sibling of pPage on which
6867           ** to store the overflow cell. balance_quick() inserts a new cell
6868           ** into pParent, which may cause pParent overflow. If this
6869           ** happens, the next interation of the do-loop will balance pParent
6870           ** use either balance_nonroot() or balance_deeper(). Until this
6871           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6872           ** buffer.
6873           **
6874           ** The purpose of the following assert() is to check that only a
6875           ** single call to balance_quick() is made for each call to this
6876           ** function. If this were not verified, a subtle bug involving reuse
6877           ** of the aBalanceQuickSpace[] might sneak in.
6878           */
6879           assert( (balance_quick_called++)==0 );
6880           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6881         }else
6882 #endif
6883         {
6884           /* In this case, call balance_nonroot() to redistribute cells
6885           ** between pPage and up to 2 of its sibling pages. This involves
6886           ** modifying the contents of pParent, which may cause pParent to
6887           ** become overfull or underfull. The next iteration of the do-loop
6888           ** will balance the parent page to correct this.
6889           **
6890           ** If the parent page becomes overfull, the overflow cell or cells
6891           ** are stored in the pSpace buffer allocated immediately below.
6892           ** A subsequent iteration of the do-loop will deal with this by
6893           ** calling balance_nonroot() (balance_deeper() may be called first,
6894           ** but it doesn't deal with overflow cells - just moves them to a
6895           ** different page). Once this subsequent call to balance_nonroot()
6896           ** has completed, it is safe to release the pSpace buffer used by
6897           ** the previous call, as the overflow cell data will have been
6898           ** copied either into the body of a database page or into the new
6899           ** pSpace buffer passed to the latter call to balance_nonroot().
6900           */
6901           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6902           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
6903           if( pFree ){
6904             /* If pFree is not NULL, it points to the pSpace buffer used
6905             ** by a previous call to balance_nonroot(). Its contents are
6906             ** now stored either on real database pages or within the
6907             ** new pSpace buffer, so it may be safely freed here. */
6908             sqlite3PageFree(pFree);
6909           }
6910 
6911           /* The pSpace buffer will be freed after the next call to
6912           ** balance_nonroot(), or just before this function returns, whichever
6913           ** comes first. */
6914           pFree = pSpace;
6915         }
6916       }
6917 
6918       pPage->nOverflow = 0;
6919 
6920       /* The next iteration of the do-loop balances the parent page. */
6921       releasePage(pPage);
6922       pCur->iPage--;
6923     }
6924   }while( rc==SQLITE_OK );
6925 
6926   if( pFree ){
6927     sqlite3PageFree(pFree);
6928   }
6929   return rc;
6930 }
6931 
6932 
6933 /*
6934 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6935 ** and the data is given by (pData,nData).  The cursor is used only to
6936 ** define what table the record should be inserted into.  The cursor
6937 ** is left pointing at a random location.
6938 **
6939 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6940 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6941 **
6942 ** If the seekResult parameter is non-zero, then a successful call to
6943 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6944 ** been performed. seekResult is the search result returned (a negative
6945 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6946 ** a positive value if pCur points at an etry that is larger than
6947 ** (pKey, nKey)).
6948 **
6949 ** If the seekResult parameter is non-zero, then the caller guarantees that
6950 ** cursor pCur is pointing at the existing copy of a row that is to be
6951 ** overwritten.  If the seekResult parameter is 0, then cursor pCur may
6952 ** point to any entry or to no entry at all and so this function has to seek
6953 ** the cursor before the new key can be inserted.
6954 */
6955 int sqlite3BtreeInsert(
6956   BtCursor *pCur,                /* Insert data into the table of this cursor */
6957   const void *pKey, i64 nKey,    /* The key of the new record */
6958   const void *pData, int nData,  /* The data of the new record */
6959   int nZero,                     /* Number of extra 0 bytes to append to data */
6960   int appendBias,                /* True if this is likely an append */
6961   int seekResult                 /* Result of prior MovetoUnpacked() call */
6962 ){
6963   int rc;
6964   int loc = seekResult;          /* -1: before desired location  +1: after */
6965   int szNew = 0;
6966   int idx;
6967   MemPage *pPage;
6968   Btree *p = pCur->pBtree;
6969   BtShared *pBt = p->pBt;
6970   unsigned char *oldCell;
6971   unsigned char *newCell = 0;
6972 
6973   if( pCur->eState==CURSOR_FAULT ){
6974     assert( pCur->skipNext!=SQLITE_OK );
6975     return pCur->skipNext;
6976   }
6977 
6978   assert( cursorHoldsMutex(pCur) );
6979   assert( (pCur->curFlags & BTCF_WriteFlag)!=0 && pBt->inTransaction==TRANS_WRITE
6980               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
6981   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6982 
6983   /* Assert that the caller has been consistent. If this cursor was opened
6984   ** expecting an index b-tree, then the caller should be inserting blob
6985   ** keys with no associated data. If the cursor was opened expecting an
6986   ** intkey table, the caller should be inserting integer keys with a
6987   ** blob of associated data.  */
6988   assert( (pKey==0)==(pCur->pKeyInfo==0) );
6989 
6990   /* Save the positions of any other cursors open on this table.
6991   **
6992   ** In some cases, the call to btreeMoveto() below is a no-op. For
6993   ** example, when inserting data into a table with auto-generated integer
6994   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6995   ** integer key to use. It then calls this function to actually insert the
6996   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6997   ** that the cursor is already where it needs to be and returns without
6998   ** doing any work. To avoid thwarting these optimizations, it is important
6999   ** not to clear the cursor here.
7000   */
7001   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
7002   if( rc ) return rc;
7003 
7004   if( pCur->pKeyInfo==0 ){
7005     /* If this is an insert into a table b-tree, invalidate any incrblob
7006     ** cursors open on the row being replaced */
7007     invalidateIncrblobCursors(p, nKey, 0);
7008 
7009     /* If the cursor is currently on the last row and we are appending a
7010     ** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto()
7011     ** call */
7012     if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 && pCur->info.nKey==nKey-1 ){
7013       loc = -1;
7014     }
7015   }
7016 
7017   if( !loc ){
7018     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
7019     if( rc ) return rc;
7020   }
7021   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
7022 
7023   pPage = pCur->apPage[pCur->iPage];
7024   assert( pPage->intKey || nKey>=0 );
7025   assert( pPage->leaf || !pPage->intKey );
7026 
7027   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
7028           pCur->pgnoRoot, nKey, nData, pPage->pgno,
7029           loc==0 ? "overwrite" : "new entry"));
7030   assert( pPage->isInit );
7031   allocateTempSpace(pBt);
7032   newCell = pBt->pTmpSpace;
7033   if( newCell==0 ) return SQLITE_NOMEM;
7034   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
7035   if( rc ) goto end_insert;
7036   assert( szNew==cellSizePtr(pPage, newCell) );
7037   assert( szNew <= MX_CELL_SIZE(pBt) );
7038   idx = pCur->aiIdx[pCur->iPage];
7039   if( loc==0 ){
7040     u16 szOld;
7041     assert( idx<pPage->nCell );
7042     rc = sqlite3PagerWrite(pPage->pDbPage);
7043     if( rc ){
7044       goto end_insert;
7045     }
7046     oldCell = findCell(pPage, idx);
7047     if( !pPage->leaf ){
7048       memcpy(newCell, oldCell, 4);
7049     }
7050     szOld = cellSizePtr(pPage, oldCell);
7051     rc = clearCell(pPage, oldCell);
7052     dropCell(pPage, idx, szOld, &rc);
7053     if( rc ) goto end_insert;
7054   }else if( loc<0 && pPage->nCell>0 ){
7055     assert( pPage->leaf );
7056     idx = ++pCur->aiIdx[pCur->iPage];
7057   }else{
7058     assert( pPage->leaf );
7059   }
7060   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
7061   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
7062 
7063   /* If no error has occurred and pPage has an overflow cell, call balance()
7064   ** to redistribute the cells within the tree. Since balance() may move
7065   ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
7066   ** variables.
7067   **
7068   ** Previous versions of SQLite called moveToRoot() to move the cursor
7069   ** back to the root page as balance() used to invalidate the contents
7070   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
7071   ** set the cursor state to "invalid". This makes common insert operations
7072   ** slightly faster.
7073   **
7074   ** There is a subtle but important optimization here too. When inserting
7075   ** multiple records into an intkey b-tree using a single cursor (as can
7076   ** happen while processing an "INSERT INTO ... SELECT" statement), it
7077   ** is advantageous to leave the cursor pointing to the last entry in
7078   ** the b-tree if possible. If the cursor is left pointing to the last
7079   ** entry in the table, and the next row inserted has an integer key
7080   ** larger than the largest existing key, it is possible to insert the
7081   ** row without seeking the cursor. This can be a big performance boost.
7082   */
7083   pCur->info.nSize = 0;
7084   if( rc==SQLITE_OK && pPage->nOverflow ){
7085     pCur->curFlags &= ~(BTCF_ValidNKey);
7086     rc = balance(pCur);
7087 
7088     /* Must make sure nOverflow is reset to zero even if the balance()
7089     ** fails. Internal data structure corruption will result otherwise.
7090     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
7091     ** from trying to save the current position of the cursor.  */
7092     pCur->apPage[pCur->iPage]->nOverflow = 0;
7093     pCur->eState = CURSOR_INVALID;
7094   }
7095   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
7096 
7097 end_insert:
7098   return rc;
7099 }
7100 
7101 /*
7102 ** Delete the entry that the cursor is pointing to.  The cursor
7103 ** is left pointing at a arbitrary location.
7104 */
7105 int sqlite3BtreeDelete(BtCursor *pCur){
7106   Btree *p = pCur->pBtree;
7107   BtShared *pBt = p->pBt;
7108   int rc;                              /* Return code */
7109   MemPage *pPage;                      /* Page to delete cell from */
7110   unsigned char *pCell;                /* Pointer to cell to delete */
7111   int iCellIdx;                        /* Index of cell to delete */
7112   int iCellDepth;                      /* Depth of node containing pCell */
7113 
7114   assert( cursorHoldsMutex(pCur) );
7115   assert( pBt->inTransaction==TRANS_WRITE );
7116   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
7117   assert( pCur->curFlags & BTCF_WriteFlag );
7118   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
7119   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
7120 
7121   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
7122    || NEVER(pCur->eState!=CURSOR_VALID)
7123   ){
7124     return SQLITE_ERROR;  /* Something has gone awry. */
7125   }
7126 
7127   iCellDepth = pCur->iPage;
7128   iCellIdx = pCur->aiIdx[iCellDepth];
7129   pPage = pCur->apPage[iCellDepth];
7130   pCell = findCell(pPage, iCellIdx);
7131 
7132   /* If the page containing the entry to delete is not a leaf page, move
7133   ** the cursor to the largest entry in the tree that is smaller than
7134   ** the entry being deleted. This cell will replace the cell being deleted
7135   ** from the internal node. The 'previous' entry is used for this instead
7136   ** of the 'next' entry, as the previous entry is always a part of the
7137   ** sub-tree headed by the child page of the cell being deleted. This makes
7138   ** balancing the tree following the delete operation easier.  */
7139   if( !pPage->leaf ){
7140     int notUsed = 0;
7141     rc = sqlite3BtreePrevious(pCur, &notUsed);
7142     if( rc ) return rc;
7143   }
7144 
7145   /* Save the positions of any other cursors open on this table before
7146   ** making any modifications. Make the page containing the entry to be
7147   ** deleted writable. Then free any overflow pages associated with the
7148   ** entry and finally remove the cell itself from within the page.
7149   */
7150   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
7151   if( rc ) return rc;
7152 
7153   /* If this is a delete operation to remove a row from a table b-tree,
7154   ** invalidate any incrblob cursors open on the row being deleted.  */
7155   if( pCur->pKeyInfo==0 ){
7156     invalidateIncrblobCursors(p, pCur->info.nKey, 0);
7157   }
7158 
7159   rc = sqlite3PagerWrite(pPage->pDbPage);
7160   if( rc ) return rc;
7161   rc = clearCell(pPage, pCell);
7162   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
7163   if( rc ) return rc;
7164 
7165   /* If the cell deleted was not located on a leaf page, then the cursor
7166   ** is currently pointing to the largest entry in the sub-tree headed
7167   ** by the child-page of the cell that was just deleted from an internal
7168   ** node. The cell from the leaf node needs to be moved to the internal
7169   ** node to replace the deleted cell.  */
7170   if( !pPage->leaf ){
7171     MemPage *pLeaf = pCur->apPage[pCur->iPage];
7172     int nCell;
7173     Pgno n = pCur->apPage[iCellDepth+1]->pgno;
7174     unsigned char *pTmp;
7175 
7176     pCell = findCell(pLeaf, pLeaf->nCell-1);
7177     nCell = cellSizePtr(pLeaf, pCell);
7178     assert( MX_CELL_SIZE(pBt) >= nCell );
7179 
7180     allocateTempSpace(pBt);
7181     pTmp = pBt->pTmpSpace;
7182 
7183     rc = sqlite3PagerWrite(pLeaf->pDbPage);
7184     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
7185     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
7186     if( rc ) return rc;
7187   }
7188 
7189   /* Balance the tree. If the entry deleted was located on a leaf page,
7190   ** then the cursor still points to that page. In this case the first
7191   ** call to balance() repairs the tree, and the if(...) condition is
7192   ** never true.
7193   **
7194   ** Otherwise, if the entry deleted was on an internal node page, then
7195   ** pCur is pointing to the leaf page from which a cell was removed to
7196   ** replace the cell deleted from the internal node. This is slightly
7197   ** tricky as the leaf node may be underfull, and the internal node may
7198   ** be either under or overfull. In this case run the balancing algorithm
7199   ** on the leaf node first. If the balance proceeds far enough up the
7200   ** tree that we can be sure that any problem in the internal node has
7201   ** been corrected, so be it. Otherwise, after balancing the leaf node,
7202   ** walk the cursor up the tree to the internal node and balance it as
7203   ** well.  */
7204   rc = balance(pCur);
7205   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
7206     while( pCur->iPage>iCellDepth ){
7207       releasePage(pCur->apPage[pCur->iPage--]);
7208     }
7209     rc = balance(pCur);
7210   }
7211 
7212   if( rc==SQLITE_OK ){
7213     moveToRoot(pCur);
7214   }
7215   return rc;
7216 }
7217 
7218 /*
7219 ** Create a new BTree table.  Write into *piTable the page
7220 ** number for the root page of the new table.
7221 **
7222 ** The type of table is determined by the flags parameter.  Only the
7223 ** following values of flags are currently in use.  Other values for
7224 ** flags might not work:
7225 **
7226 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
7227 **     BTREE_ZERODATA                  Used for SQL indices
7228 */
7229 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
7230   BtShared *pBt = p->pBt;
7231   MemPage *pRoot;
7232   Pgno pgnoRoot;
7233   int rc;
7234   int ptfFlags;          /* Page-type flage for the root page of new table */
7235 
7236   assert( sqlite3BtreeHoldsMutex(p) );
7237   assert( pBt->inTransaction==TRANS_WRITE );
7238   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
7239 
7240 #ifdef SQLITE_OMIT_AUTOVACUUM
7241   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7242   if( rc ){
7243     return rc;
7244   }
7245 #else
7246   if( pBt->autoVacuum ){
7247     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
7248     MemPage *pPageMove; /* The page to move to. */
7249 
7250     /* Creating a new table may probably require moving an existing database
7251     ** to make room for the new tables root page. In case this page turns
7252     ** out to be an overflow page, delete all overflow page-map caches
7253     ** held by open cursors.
7254     */
7255     invalidateAllOverflowCache(pBt);
7256 
7257     /* Read the value of meta[3] from the database to determine where the
7258     ** root page of the new table should go. meta[3] is the largest root-page
7259     ** created so far, so the new root-page is (meta[3]+1).
7260     */
7261     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
7262     pgnoRoot++;
7263 
7264     /* The new root-page may not be allocated on a pointer-map page, or the
7265     ** PENDING_BYTE page.
7266     */
7267     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
7268         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
7269       pgnoRoot++;
7270     }
7271     assert( pgnoRoot>=3 );
7272 
7273     /* Allocate a page. The page that currently resides at pgnoRoot will
7274     ** be moved to the allocated page (unless the allocated page happens
7275     ** to reside at pgnoRoot).
7276     */
7277     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
7278     if( rc!=SQLITE_OK ){
7279       return rc;
7280     }
7281 
7282     if( pgnoMove!=pgnoRoot ){
7283       /* pgnoRoot is the page that will be used for the root-page of
7284       ** the new table (assuming an error did not occur). But we were
7285       ** allocated pgnoMove. If required (i.e. if it was not allocated
7286       ** by extending the file), the current page at position pgnoMove
7287       ** is already journaled.
7288       */
7289       u8 eType = 0;
7290       Pgno iPtrPage = 0;
7291 
7292       /* Save the positions of any open cursors. This is required in
7293       ** case they are holding a reference to an xFetch reference
7294       ** corresponding to page pgnoRoot.  */
7295       rc = saveAllCursors(pBt, 0, 0);
7296       releasePage(pPageMove);
7297       if( rc!=SQLITE_OK ){
7298         return rc;
7299       }
7300 
7301       /* Move the page currently at pgnoRoot to pgnoMove. */
7302       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7303       if( rc!=SQLITE_OK ){
7304         return rc;
7305       }
7306       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
7307       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
7308         rc = SQLITE_CORRUPT_BKPT;
7309       }
7310       if( rc!=SQLITE_OK ){
7311         releasePage(pRoot);
7312         return rc;
7313       }
7314       assert( eType!=PTRMAP_ROOTPAGE );
7315       assert( eType!=PTRMAP_FREEPAGE );
7316       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
7317       releasePage(pRoot);
7318 
7319       /* Obtain the page at pgnoRoot */
7320       if( rc!=SQLITE_OK ){
7321         return rc;
7322       }
7323       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7324       if( rc!=SQLITE_OK ){
7325         return rc;
7326       }
7327       rc = sqlite3PagerWrite(pRoot->pDbPage);
7328       if( rc!=SQLITE_OK ){
7329         releasePage(pRoot);
7330         return rc;
7331       }
7332     }else{
7333       pRoot = pPageMove;
7334     }
7335 
7336     /* Update the pointer-map and meta-data with the new root-page number. */
7337     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7338     if( rc ){
7339       releasePage(pRoot);
7340       return rc;
7341     }
7342 
7343     /* When the new root page was allocated, page 1 was made writable in
7344     ** order either to increase the database filesize, or to decrement the
7345     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7346     */
7347     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7348     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7349     if( NEVER(rc) ){
7350       releasePage(pRoot);
7351       return rc;
7352     }
7353 
7354   }else{
7355     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7356     if( rc ) return rc;
7357   }
7358 #endif
7359   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7360   if( createTabFlags & BTREE_INTKEY ){
7361     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7362   }else{
7363     ptfFlags = PTF_ZERODATA | PTF_LEAF;
7364   }
7365   zeroPage(pRoot, ptfFlags);
7366   sqlite3PagerUnref(pRoot->pDbPage);
7367   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7368   *piTable = (int)pgnoRoot;
7369   return SQLITE_OK;
7370 }
7371 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
7372   int rc;
7373   sqlite3BtreeEnter(p);
7374   rc = btreeCreateTable(p, piTable, flags);
7375   sqlite3BtreeLeave(p);
7376   return rc;
7377 }
7378 
7379 /*
7380 ** Erase the given database page and all its children.  Return
7381 ** the page to the freelist.
7382 */
7383 static int clearDatabasePage(
7384   BtShared *pBt,           /* The BTree that contains the table */
7385   Pgno pgno,               /* Page number to clear */
7386   int freePageFlag,        /* Deallocate page if true */
7387   int *pnChange            /* Add number of Cells freed to this counter */
7388 ){
7389   MemPage *pPage;
7390   int rc;
7391   unsigned char *pCell;
7392   int i;
7393   int hdr;
7394 
7395   assert( sqlite3_mutex_held(pBt->mutex) );
7396   if( pgno>btreePagecount(pBt) ){
7397     return SQLITE_CORRUPT_BKPT;
7398   }
7399 
7400   rc = getAndInitPage(pBt, pgno, &pPage, 0);
7401   if( rc ) return rc;
7402   hdr = pPage->hdrOffset;
7403   for(i=0; i<pPage->nCell; i++){
7404     pCell = findCell(pPage, i);
7405     if( !pPage->leaf ){
7406       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7407       if( rc ) goto cleardatabasepage_out;
7408     }
7409     rc = clearCell(pPage, pCell);
7410     if( rc ) goto cleardatabasepage_out;
7411   }
7412   if( !pPage->leaf ){
7413     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
7414     if( rc ) goto cleardatabasepage_out;
7415   }else if( pnChange ){
7416     assert( pPage->intKey );
7417     *pnChange += pPage->nCell;
7418   }
7419   if( freePageFlag ){
7420     freePage(pPage, &rc);
7421   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7422     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
7423   }
7424 
7425 cleardatabasepage_out:
7426   releasePage(pPage);
7427   return rc;
7428 }
7429 
7430 /*
7431 ** Delete all information from a single table in the database.  iTable is
7432 ** the page number of the root of the table.  After this routine returns,
7433 ** the root page is empty, but still exists.
7434 **
7435 ** This routine will fail with SQLITE_LOCKED if there are any open
7436 ** read cursors on the table.  Open write cursors are moved to the
7437 ** root of the table.
7438 **
7439 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7440 ** integer value pointed to by pnChange is incremented by the number of
7441 ** entries in the table.
7442 */
7443 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7444   int rc;
7445   BtShared *pBt = p->pBt;
7446   sqlite3BtreeEnter(p);
7447   assert( p->inTrans==TRANS_WRITE );
7448 
7449   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7450 
7451   if( SQLITE_OK==rc ){
7452     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7453     ** is the root of a table b-tree - if it is not, the following call is
7454     ** a no-op).  */
7455     invalidateIncrblobCursors(p, 0, 1);
7456     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7457   }
7458   sqlite3BtreeLeave(p);
7459   return rc;
7460 }
7461 
7462 /*
7463 ** Delete all information from the single table that pCur is open on.
7464 **
7465 ** This routine only work for pCur on an ephemeral table.
7466 */
7467 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
7468   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
7469 }
7470 
7471 /*
7472 ** Erase all information in a table and add the root of the table to
7473 ** the freelist.  Except, the root of the principle table (the one on
7474 ** page 1) is never added to the freelist.
7475 **
7476 ** This routine will fail with SQLITE_LOCKED if there are any open
7477 ** cursors on the table.
7478 **
7479 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7480 ** root page in the database file, then the last root page
7481 ** in the database file is moved into the slot formerly occupied by
7482 ** iTable and that last slot formerly occupied by the last root page
7483 ** is added to the freelist instead of iTable.  In this say, all
7484 ** root pages are kept at the beginning of the database file, which
7485 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
7486 ** page number that used to be the last root page in the file before
7487 ** the move.  If no page gets moved, *piMoved is set to 0.
7488 ** The last root page is recorded in meta[3] and the value of
7489 ** meta[3] is updated by this procedure.
7490 */
7491 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
7492   int rc;
7493   MemPage *pPage = 0;
7494   BtShared *pBt = p->pBt;
7495 
7496   assert( sqlite3BtreeHoldsMutex(p) );
7497   assert( p->inTrans==TRANS_WRITE );
7498 
7499   /* It is illegal to drop a table if any cursors are open on the
7500   ** database. This is because in auto-vacuum mode the backend may
7501   ** need to move another root-page to fill a gap left by the deleted
7502   ** root page. If an open cursor was using this page a problem would
7503   ** occur.
7504   **
7505   ** This error is caught long before control reaches this point.
7506   */
7507   if( NEVER(pBt->pCursor) ){
7508     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
7509     return SQLITE_LOCKED_SHAREDCACHE;
7510   }
7511 
7512   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
7513   if( rc ) return rc;
7514   rc = sqlite3BtreeClearTable(p, iTable, 0);
7515   if( rc ){
7516     releasePage(pPage);
7517     return rc;
7518   }
7519 
7520   *piMoved = 0;
7521 
7522   if( iTable>1 ){
7523 #ifdef SQLITE_OMIT_AUTOVACUUM
7524     freePage(pPage, &rc);
7525     releasePage(pPage);
7526 #else
7527     if( pBt->autoVacuum ){
7528       Pgno maxRootPgno;
7529       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
7530 
7531       if( iTable==maxRootPgno ){
7532         /* If the table being dropped is the table with the largest root-page
7533         ** number in the database, put the root page on the free list.
7534         */
7535         freePage(pPage, &rc);
7536         releasePage(pPage);
7537         if( rc!=SQLITE_OK ){
7538           return rc;
7539         }
7540       }else{
7541         /* The table being dropped does not have the largest root-page
7542         ** number in the database. So move the page that does into the
7543         ** gap left by the deleted root-page.
7544         */
7545         MemPage *pMove;
7546         releasePage(pPage);
7547         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7548         if( rc!=SQLITE_OK ){
7549           return rc;
7550         }
7551         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
7552         releasePage(pMove);
7553         if( rc!=SQLITE_OK ){
7554           return rc;
7555         }
7556         pMove = 0;
7557         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7558         freePage(pMove, &rc);
7559         releasePage(pMove);
7560         if( rc!=SQLITE_OK ){
7561           return rc;
7562         }
7563         *piMoved = maxRootPgno;
7564       }
7565 
7566       /* Set the new 'max-root-page' value in the database header. This
7567       ** is the old value less one, less one more if that happens to
7568       ** be a root-page number, less one again if that is the
7569       ** PENDING_BYTE_PAGE.
7570       */
7571       maxRootPgno--;
7572       while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7573              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7574         maxRootPgno--;
7575       }
7576       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7577 
7578       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7579     }else{
7580       freePage(pPage, &rc);
7581       releasePage(pPage);
7582     }
7583 #endif
7584   }else{
7585     /* If sqlite3BtreeDropTable was called on page 1.
7586     ** This really never should happen except in a corrupt
7587     ** database.
7588     */
7589     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7590     releasePage(pPage);
7591   }
7592   return rc;
7593 }
7594 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7595   int rc;
7596   sqlite3BtreeEnter(p);
7597   rc = btreeDropTable(p, iTable, piMoved);
7598   sqlite3BtreeLeave(p);
7599   return rc;
7600 }
7601 
7602 
7603 /*
7604 ** This function may only be called if the b-tree connection already
7605 ** has a read or write transaction open on the database.
7606 **
7607 ** Read the meta-information out of a database file.  Meta[0]
7608 ** is the number of free pages currently in the database.  Meta[1]
7609 ** through meta[15] are available for use by higher layers.  Meta[0]
7610 ** is read-only, the others are read/write.
7611 **
7612 ** The schema layer numbers meta values differently.  At the schema
7613 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7614 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
7615 */
7616 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7617   BtShared *pBt = p->pBt;
7618 
7619   sqlite3BtreeEnter(p);
7620   assert( p->inTrans>TRANS_NONE );
7621   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7622   assert( pBt->pPage1 );
7623   assert( idx>=0 && idx<=15 );
7624 
7625   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7626 
7627   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7628   ** database, mark the database as read-only.  */
7629 #ifdef SQLITE_OMIT_AUTOVACUUM
7630   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
7631     pBt->btsFlags |= BTS_READ_ONLY;
7632   }
7633 #endif
7634 
7635   sqlite3BtreeLeave(p);
7636 }
7637 
7638 /*
7639 ** Write meta-information back into the database.  Meta[0] is
7640 ** read-only and may not be written.
7641 */
7642 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7643   BtShared *pBt = p->pBt;
7644   unsigned char *pP1;
7645   int rc;
7646   assert( idx>=1 && idx<=15 );
7647   sqlite3BtreeEnter(p);
7648   assert( p->inTrans==TRANS_WRITE );
7649   assert( pBt->pPage1!=0 );
7650   pP1 = pBt->pPage1->aData;
7651   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7652   if( rc==SQLITE_OK ){
7653     put4byte(&pP1[36 + idx*4], iMeta);
7654 #ifndef SQLITE_OMIT_AUTOVACUUM
7655     if( idx==BTREE_INCR_VACUUM ){
7656       assert( pBt->autoVacuum || iMeta==0 );
7657       assert( iMeta==0 || iMeta==1 );
7658       pBt->incrVacuum = (u8)iMeta;
7659     }
7660 #endif
7661   }
7662   sqlite3BtreeLeave(p);
7663   return rc;
7664 }
7665 
7666 #ifndef SQLITE_OMIT_BTREECOUNT
7667 /*
7668 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7669 ** number of entries in the b-tree and write the result to *pnEntry.
7670 **
7671 ** SQLITE_OK is returned if the operation is successfully executed.
7672 ** Otherwise, if an error is encountered (i.e. an IO error or database
7673 ** corruption) an SQLite error code is returned.
7674 */
7675 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7676   i64 nEntry = 0;                      /* Value to return in *pnEntry */
7677   int rc;                              /* Return code */
7678 
7679   if( pCur->pgnoRoot==0 ){
7680     *pnEntry = 0;
7681     return SQLITE_OK;
7682   }
7683   rc = moveToRoot(pCur);
7684 
7685   /* Unless an error occurs, the following loop runs one iteration for each
7686   ** page in the B-Tree structure (not including overflow pages).
7687   */
7688   while( rc==SQLITE_OK ){
7689     int iIdx;                          /* Index of child node in parent */
7690     MemPage *pPage;                    /* Current page of the b-tree */
7691 
7692     /* If this is a leaf page or the tree is not an int-key tree, then
7693     ** this page contains countable entries. Increment the entry counter
7694     ** accordingly.
7695     */
7696     pPage = pCur->apPage[pCur->iPage];
7697     if( pPage->leaf || !pPage->intKey ){
7698       nEntry += pPage->nCell;
7699     }
7700 
7701     /* pPage is a leaf node. This loop navigates the cursor so that it
7702     ** points to the first interior cell that it points to the parent of
7703     ** the next page in the tree that has not yet been visited. The
7704     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7705     ** of the page, or to the number of cells in the page if the next page
7706     ** to visit is the right-child of its parent.
7707     **
7708     ** If all pages in the tree have been visited, return SQLITE_OK to the
7709     ** caller.
7710     */
7711     if( pPage->leaf ){
7712       do {
7713         if( pCur->iPage==0 ){
7714           /* All pages of the b-tree have been visited. Return successfully. */
7715           *pnEntry = nEntry;
7716           return SQLITE_OK;
7717         }
7718         moveToParent(pCur);
7719       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7720 
7721       pCur->aiIdx[pCur->iPage]++;
7722       pPage = pCur->apPage[pCur->iPage];
7723     }
7724 
7725     /* Descend to the child node of the cell that the cursor currently
7726     ** points at. This is the right-child if (iIdx==pPage->nCell).
7727     */
7728     iIdx = pCur->aiIdx[pCur->iPage];
7729     if( iIdx==pPage->nCell ){
7730       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7731     }else{
7732       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7733     }
7734   }
7735 
7736   /* An error has occurred. Return an error code. */
7737   return rc;
7738 }
7739 #endif
7740 
7741 /*
7742 ** Return the pager associated with a BTree.  This routine is used for
7743 ** testing and debugging only.
7744 */
7745 Pager *sqlite3BtreePager(Btree *p){
7746   return p->pBt->pPager;
7747 }
7748 
7749 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7750 /*
7751 ** Append a message to the error message string.
7752 */
7753 static void checkAppendMsg(
7754   IntegrityCk *pCheck,
7755   char *zMsg1,
7756   const char *zFormat,
7757   ...
7758 ){
7759   va_list ap;
7760   if( !pCheck->mxErr ) return;
7761   pCheck->mxErr--;
7762   pCheck->nErr++;
7763   va_start(ap, zFormat);
7764   if( pCheck->errMsg.nChar ){
7765     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7766   }
7767   if( zMsg1 ){
7768     sqlite3StrAccumAppendAll(&pCheck->errMsg, zMsg1);
7769   }
7770   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7771   va_end(ap);
7772   if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
7773     pCheck->mallocFailed = 1;
7774   }
7775 }
7776 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7777 
7778 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7779 
7780 /*
7781 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
7782 ** corresponds to page iPg is already set.
7783 */
7784 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7785   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7786   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
7787 }
7788 
7789 /*
7790 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
7791 */
7792 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7793   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7794   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
7795 }
7796 
7797 
7798 /*
7799 ** Add 1 to the reference count for page iPage.  If this is the second
7800 ** reference to the page, add an error message to pCheck->zErrMsg.
7801 ** Return 1 if there are 2 ore more references to the page and 0 if
7802 ** if this is the first reference to the page.
7803 **
7804 ** Also check that the page number is in bounds.
7805 */
7806 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7807   if( iPage==0 ) return 1;
7808   if( iPage>pCheck->nPage ){
7809     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7810     return 1;
7811   }
7812   if( getPageReferenced(pCheck, iPage) ){
7813     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7814     return 1;
7815   }
7816   setPageReferenced(pCheck, iPage);
7817   return 0;
7818 }
7819 
7820 #ifndef SQLITE_OMIT_AUTOVACUUM
7821 /*
7822 ** Check that the entry in the pointer-map for page iChild maps to
7823 ** page iParent, pointer type ptrType. If not, append an error message
7824 ** to pCheck.
7825 */
7826 static void checkPtrmap(
7827   IntegrityCk *pCheck,   /* Integrity check context */
7828   Pgno iChild,           /* Child page number */
7829   u8 eType,              /* Expected pointer map type */
7830   Pgno iParent,          /* Expected pointer map parent page number */
7831   char *zContext         /* Context description (used for error msg) */
7832 ){
7833   int rc;
7834   u8 ePtrmapType;
7835   Pgno iPtrmapParent;
7836 
7837   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7838   if( rc!=SQLITE_OK ){
7839     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7840     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7841     return;
7842   }
7843 
7844   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7845     checkAppendMsg(pCheck, zContext,
7846       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7847       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7848   }
7849 }
7850 #endif
7851 
7852 /*
7853 ** Check the integrity of the freelist or of an overflow page list.
7854 ** Verify that the number of pages on the list is N.
7855 */
7856 static void checkList(
7857   IntegrityCk *pCheck,  /* Integrity checking context */
7858   int isFreeList,       /* True for a freelist.  False for overflow page list */
7859   int iPage,            /* Page number for first page in the list */
7860   int N,                /* Expected number of pages in the list */
7861   char *zContext        /* Context for error messages */
7862 ){
7863   int i;
7864   int expected = N;
7865   int iFirst = iPage;
7866   while( N-- > 0 && pCheck->mxErr ){
7867     DbPage *pOvflPage;
7868     unsigned char *pOvflData;
7869     if( iPage<1 ){
7870       checkAppendMsg(pCheck, zContext,
7871          "%d of %d pages missing from overflow list starting at %d",
7872           N+1, expected, iFirst);
7873       break;
7874     }
7875     if( checkRef(pCheck, iPage, zContext) ) break;
7876     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7877       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7878       break;
7879     }
7880     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7881     if( isFreeList ){
7882       int n = get4byte(&pOvflData[4]);
7883 #ifndef SQLITE_OMIT_AUTOVACUUM
7884       if( pCheck->pBt->autoVacuum ){
7885         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7886       }
7887 #endif
7888       if( n>(int)pCheck->pBt->usableSize/4-2 ){
7889         checkAppendMsg(pCheck, zContext,
7890            "freelist leaf count too big on page %d", iPage);
7891         N--;
7892       }else{
7893         for(i=0; i<n; i++){
7894           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7895 #ifndef SQLITE_OMIT_AUTOVACUUM
7896           if( pCheck->pBt->autoVacuum ){
7897             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7898           }
7899 #endif
7900           checkRef(pCheck, iFreePage, zContext);
7901         }
7902         N -= n;
7903       }
7904     }
7905 #ifndef SQLITE_OMIT_AUTOVACUUM
7906     else{
7907       /* If this database supports auto-vacuum and iPage is not the last
7908       ** page in this overflow list, check that the pointer-map entry for
7909       ** the following page matches iPage.
7910       */
7911       if( pCheck->pBt->autoVacuum && N>0 ){
7912         i = get4byte(pOvflData);
7913         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7914       }
7915     }
7916 #endif
7917     iPage = get4byte(pOvflData);
7918     sqlite3PagerUnref(pOvflPage);
7919   }
7920 }
7921 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7922 
7923 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7924 /*
7925 ** Do various sanity checks on a single page of a tree.  Return
7926 ** the tree depth.  Root pages return 0.  Parents of root pages
7927 ** return 1, and so forth.
7928 **
7929 ** These checks are done:
7930 **
7931 **      1.  Make sure that cells and freeblocks do not overlap
7932 **          but combine to completely cover the page.
7933 **  NO  2.  Make sure cell keys are in order.
7934 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
7935 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
7936 **      5.  Check the integrity of overflow pages.
7937 **      6.  Recursively call checkTreePage on all children.
7938 **      7.  Verify that the depth of all children is the same.
7939 **      8.  Make sure this page is at least 33% full or else it is
7940 **          the root of the tree.
7941 */
7942 static int checkTreePage(
7943   IntegrityCk *pCheck,  /* Context for the sanity check */
7944   int iPage,            /* Page number of the page to check */
7945   char *zParentContext, /* Parent context */
7946   i64 *pnParentMinKey,
7947   i64 *pnParentMaxKey
7948 ){
7949   MemPage *pPage;
7950   int i, rc, depth, d2, pgno, cnt;
7951   int hdr, cellStart;
7952   int nCell;
7953   u8 *data;
7954   BtShared *pBt;
7955   int usableSize;
7956   char zContext[100];
7957   char *hit = 0;
7958   i64 nMinKey = 0;
7959   i64 nMaxKey = 0;
7960 
7961   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7962 
7963   /* Check that the page exists
7964   */
7965   pBt = pCheck->pBt;
7966   usableSize = pBt->usableSize;
7967   if( iPage==0 ) return 0;
7968   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7969   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7970     checkAppendMsg(pCheck, zContext,
7971        "unable to get the page. error code=%d", rc);
7972     return 0;
7973   }
7974 
7975   /* Clear MemPage.isInit to make sure the corruption detection code in
7976   ** btreeInitPage() is executed.  */
7977   pPage->isInit = 0;
7978   if( (rc = btreeInitPage(pPage))!=0 ){
7979     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
7980     checkAppendMsg(pCheck, zContext,
7981                    "btreeInitPage() returns error code %d", rc);
7982     releasePage(pPage);
7983     return 0;
7984   }
7985 
7986   /* Check out all the cells.
7987   */
7988   depth = 0;
7989   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7990     u8 *pCell;
7991     u32 sz;
7992     CellInfo info;
7993 
7994     /* Check payload overflow pages
7995     */
7996     sqlite3_snprintf(sizeof(zContext), zContext,
7997              "On tree page %d cell %d: ", iPage, i);
7998     pCell = findCell(pPage,i);
7999     btreeParseCellPtr(pPage, pCell, &info);
8000     sz = info.nData;
8001     if( !pPage->intKey ) sz += (int)info.nKey;
8002     /* For intKey pages, check that the keys are in order.
8003     */
8004     else if( i==0 ) nMinKey = nMaxKey = info.nKey;
8005     else{
8006       if( info.nKey <= nMaxKey ){
8007         checkAppendMsg(pCheck, zContext,
8008             "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
8009       }
8010       nMaxKey = info.nKey;
8011     }
8012     assert( sz==info.nPayload );
8013     if( (sz>info.nLocal)
8014      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
8015     ){
8016       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
8017       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
8018 #ifndef SQLITE_OMIT_AUTOVACUUM
8019       if( pBt->autoVacuum ){
8020         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
8021       }
8022 #endif
8023       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
8024     }
8025 
8026     /* Check sanity of left child page.
8027     */
8028     if( !pPage->leaf ){
8029       pgno = get4byte(pCell);
8030 #ifndef SQLITE_OMIT_AUTOVACUUM
8031       if( pBt->autoVacuum ){
8032         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
8033       }
8034 #endif
8035       d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
8036       if( i>0 && d2!=depth ){
8037         checkAppendMsg(pCheck, zContext, "Child page depth differs");
8038       }
8039       depth = d2;
8040     }
8041   }
8042 
8043   if( !pPage->leaf ){
8044     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
8045     sqlite3_snprintf(sizeof(zContext), zContext,
8046                      "On page %d at right child: ", iPage);
8047 #ifndef SQLITE_OMIT_AUTOVACUUM
8048     if( pBt->autoVacuum ){
8049       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
8050     }
8051 #endif
8052     checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
8053   }
8054 
8055   /* For intKey leaf pages, check that the min/max keys are in order
8056   ** with any left/parent/right pages.
8057   */
8058   if( pPage->leaf && pPage->intKey ){
8059     /* if we are a left child page */
8060     if( pnParentMinKey ){
8061       /* if we are the left most child page */
8062       if( !pnParentMaxKey ){
8063         if( nMaxKey > *pnParentMinKey ){
8064           checkAppendMsg(pCheck, zContext,
8065               "Rowid %lld out of order (max larger than parent min of %lld)",
8066               nMaxKey, *pnParentMinKey);
8067         }
8068       }else{
8069         if( nMinKey <= *pnParentMinKey ){
8070           checkAppendMsg(pCheck, zContext,
8071               "Rowid %lld out of order (min less than parent min of %lld)",
8072               nMinKey, *pnParentMinKey);
8073         }
8074         if( nMaxKey > *pnParentMaxKey ){
8075           checkAppendMsg(pCheck, zContext,
8076               "Rowid %lld out of order (max larger than parent max of %lld)",
8077               nMaxKey, *pnParentMaxKey);
8078         }
8079         *pnParentMinKey = nMaxKey;
8080       }
8081     /* else if we're a right child page */
8082     } else if( pnParentMaxKey ){
8083       if( nMinKey <= *pnParentMaxKey ){
8084         checkAppendMsg(pCheck, zContext,
8085             "Rowid %lld out of order (min less than parent max of %lld)",
8086             nMinKey, *pnParentMaxKey);
8087       }
8088     }
8089   }
8090 
8091   /* Check for complete coverage of the page
8092   */
8093   data = pPage->aData;
8094   hdr = pPage->hdrOffset;
8095   hit = sqlite3PageMalloc( pBt->pageSize );
8096   if( hit==0 ){
8097     pCheck->mallocFailed = 1;
8098   }else{
8099     int contentOffset = get2byteNotZero(&data[hdr+5]);
8100     assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
8101     memset(hit+contentOffset, 0, usableSize-contentOffset);
8102     memset(hit, 1, contentOffset);
8103     nCell = get2byte(&data[hdr+3]);
8104     cellStart = hdr + 12 - 4*pPage->leaf;
8105     for(i=0; i<nCell; i++){
8106       int pc = get2byte(&data[cellStart+i*2]);
8107       u32 size = 65536;
8108       int j;
8109       if( pc<=usableSize-4 ){
8110         size = cellSizePtr(pPage, &data[pc]);
8111       }
8112       if( (int)(pc+size-1)>=usableSize ){
8113         checkAppendMsg(pCheck, 0,
8114             "Corruption detected in cell %d on page %d",i,iPage);
8115       }else{
8116         for(j=pc+size-1; j>=pc; j--) hit[j]++;
8117       }
8118     }
8119     i = get2byte(&data[hdr+1]);
8120     while( i>0 ){
8121       int size, j;
8122       assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
8123       size = get2byte(&data[i+2]);
8124       assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
8125       for(j=i+size-1; j>=i; j--) hit[j]++;
8126       j = get2byte(&data[i]);
8127       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
8128       assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
8129       i = j;
8130     }
8131     for(i=cnt=0; i<usableSize; i++){
8132       if( hit[i]==0 ){
8133         cnt++;
8134       }else if( hit[i]>1 ){
8135         checkAppendMsg(pCheck, 0,
8136           "Multiple uses for byte %d of page %d", i, iPage);
8137         break;
8138       }
8139     }
8140     if( cnt!=data[hdr+7] ){
8141       checkAppendMsg(pCheck, 0,
8142           "Fragmentation of %d bytes reported as %d on page %d",
8143           cnt, data[hdr+7], iPage);
8144     }
8145   }
8146   sqlite3PageFree(hit);
8147   releasePage(pPage);
8148   return depth+1;
8149 }
8150 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8151 
8152 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
8153 /*
8154 ** This routine does a complete check of the given BTree file.  aRoot[] is
8155 ** an array of pages numbers were each page number is the root page of
8156 ** a table.  nRoot is the number of entries in aRoot.
8157 **
8158 ** A read-only or read-write transaction must be opened before calling
8159 ** this function.
8160 **
8161 ** Write the number of error seen in *pnErr.  Except for some memory
8162 ** allocation errors,  an error message held in memory obtained from
8163 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
8164 ** returned.  If a memory allocation error occurs, NULL is returned.
8165 */
8166 char *sqlite3BtreeIntegrityCheck(
8167   Btree *p,     /* The btree to be checked */
8168   int *aRoot,   /* An array of root pages numbers for individual trees */
8169   int nRoot,    /* Number of entries in aRoot[] */
8170   int mxErr,    /* Stop reporting errors after this many */
8171   int *pnErr    /* Write number of errors seen to this variable */
8172 ){
8173   Pgno i;
8174   int nRef;
8175   IntegrityCk sCheck;
8176   BtShared *pBt = p->pBt;
8177   char zErr[100];
8178 
8179   sqlite3BtreeEnter(p);
8180   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
8181   nRef = sqlite3PagerRefcount(pBt->pPager);
8182   sCheck.pBt = pBt;
8183   sCheck.pPager = pBt->pPager;
8184   sCheck.nPage = btreePagecount(sCheck.pBt);
8185   sCheck.mxErr = mxErr;
8186   sCheck.nErr = 0;
8187   sCheck.mallocFailed = 0;
8188   *pnErr = 0;
8189   if( sCheck.nPage==0 ){
8190     sqlite3BtreeLeave(p);
8191     return 0;
8192   }
8193 
8194   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
8195   if( !sCheck.aPgRef ){
8196     *pnErr = 1;
8197     sqlite3BtreeLeave(p);
8198     return 0;
8199   }
8200   i = PENDING_BYTE_PAGE(pBt);
8201   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
8202   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
8203   sCheck.errMsg.useMalloc = 2;
8204 
8205   /* Check the integrity of the freelist
8206   */
8207   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
8208             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
8209 
8210   /* Check all the tables.
8211   */
8212   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
8213     if( aRoot[i]==0 ) continue;
8214 #ifndef SQLITE_OMIT_AUTOVACUUM
8215     if( pBt->autoVacuum && aRoot[i]>1 ){
8216       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
8217     }
8218 #endif
8219     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
8220   }
8221 
8222   /* Make sure every page in the file is referenced
8223   */
8224   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
8225 #ifdef SQLITE_OMIT_AUTOVACUUM
8226     if( getPageReferenced(&sCheck, i)==0 ){
8227       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8228     }
8229 #else
8230     /* If the database supports auto-vacuum, make sure no tables contain
8231     ** references to pointer-map pages.
8232     */
8233     if( getPageReferenced(&sCheck, i)==0 &&
8234        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
8235       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8236     }
8237     if( getPageReferenced(&sCheck, i)!=0 &&
8238        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
8239       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
8240     }
8241 #endif
8242   }
8243 
8244   /* Make sure this analysis did not leave any unref() pages.
8245   ** This is an internal consistency check; an integrity check
8246   ** of the integrity check.
8247   */
8248   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
8249     checkAppendMsg(&sCheck, 0,
8250       "Outstanding page count goes from %d to %d during this analysis",
8251       nRef, sqlite3PagerRefcount(pBt->pPager)
8252     );
8253   }
8254 
8255   /* Clean  up and report errors.
8256   */
8257   sqlite3BtreeLeave(p);
8258   sqlite3_free(sCheck.aPgRef);
8259   if( sCheck.mallocFailed ){
8260     sqlite3StrAccumReset(&sCheck.errMsg);
8261     *pnErr = sCheck.nErr+1;
8262     return 0;
8263   }
8264   *pnErr = sCheck.nErr;
8265   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
8266   return sqlite3StrAccumFinish(&sCheck.errMsg);
8267 }
8268 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8269 
8270 /*
8271 ** Return the full pathname of the underlying database file.  Return
8272 ** an empty string if the database is in-memory or a TEMP database.
8273 **
8274 ** The pager filename is invariant as long as the pager is
8275 ** open so it is safe to access without the BtShared mutex.
8276 */
8277 const char *sqlite3BtreeGetFilename(Btree *p){
8278   assert( p->pBt->pPager!=0 );
8279   return sqlite3PagerFilename(p->pBt->pPager, 1);
8280 }
8281 
8282 /*
8283 ** Return the pathname of the journal file for this database. The return
8284 ** value of this routine is the same regardless of whether the journal file
8285 ** has been created or not.
8286 **
8287 ** The pager journal filename is invariant as long as the pager is
8288 ** open so it is safe to access without the BtShared mutex.
8289 */
8290 const char *sqlite3BtreeGetJournalname(Btree *p){
8291   assert( p->pBt->pPager!=0 );
8292   return sqlite3PagerJournalname(p->pBt->pPager);
8293 }
8294 
8295 /*
8296 ** Return non-zero if a transaction is active.
8297 */
8298 int sqlite3BtreeIsInTrans(Btree *p){
8299   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
8300   return (p && (p->inTrans==TRANS_WRITE));
8301 }
8302 
8303 #ifndef SQLITE_OMIT_WAL
8304 /*
8305 ** Run a checkpoint on the Btree passed as the first argument.
8306 **
8307 ** Return SQLITE_LOCKED if this or any other connection has an open
8308 ** transaction on the shared-cache the argument Btree is connected to.
8309 **
8310 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
8311 */
8312 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
8313   int rc = SQLITE_OK;
8314   if( p ){
8315     BtShared *pBt = p->pBt;
8316     sqlite3BtreeEnter(p);
8317     if( pBt->inTransaction!=TRANS_NONE ){
8318       rc = SQLITE_LOCKED;
8319     }else{
8320       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
8321     }
8322     sqlite3BtreeLeave(p);
8323   }
8324   return rc;
8325 }
8326 #endif
8327 
8328 /*
8329 ** Return non-zero if a read (or write) transaction is active.
8330 */
8331 int sqlite3BtreeIsInReadTrans(Btree *p){
8332   assert( p );
8333   assert( sqlite3_mutex_held(p->db->mutex) );
8334   return p->inTrans!=TRANS_NONE;
8335 }
8336 
8337 int sqlite3BtreeIsInBackup(Btree *p){
8338   assert( p );
8339   assert( sqlite3_mutex_held(p->db->mutex) );
8340   return p->nBackup!=0;
8341 }
8342 
8343 /*
8344 ** This function returns a pointer to a blob of memory associated with
8345 ** a single shared-btree. The memory is used by client code for its own
8346 ** purposes (for example, to store a high-level schema associated with
8347 ** the shared-btree). The btree layer manages reference counting issues.
8348 **
8349 ** The first time this is called on a shared-btree, nBytes bytes of memory
8350 ** are allocated, zeroed, and returned to the caller. For each subsequent
8351 ** call the nBytes parameter is ignored and a pointer to the same blob
8352 ** of memory returned.
8353 **
8354 ** If the nBytes parameter is 0 and the blob of memory has not yet been
8355 ** allocated, a null pointer is returned. If the blob has already been
8356 ** allocated, it is returned as normal.
8357 **
8358 ** Just before the shared-btree is closed, the function passed as the
8359 ** xFree argument when the memory allocation was made is invoked on the
8360 ** blob of allocated memory. The xFree function should not call sqlite3_free()
8361 ** on the memory, the btree layer does that.
8362 */
8363 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
8364   BtShared *pBt = p->pBt;
8365   sqlite3BtreeEnter(p);
8366   if( !pBt->pSchema && nBytes ){
8367     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
8368     pBt->xFreeSchema = xFree;
8369   }
8370   sqlite3BtreeLeave(p);
8371   return pBt->pSchema;
8372 }
8373 
8374 /*
8375 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8376 ** btree as the argument handle holds an exclusive lock on the
8377 ** sqlite_master table. Otherwise SQLITE_OK.
8378 */
8379 int sqlite3BtreeSchemaLocked(Btree *p){
8380   int rc;
8381   assert( sqlite3_mutex_held(p->db->mutex) );
8382   sqlite3BtreeEnter(p);
8383   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8384   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8385   sqlite3BtreeLeave(p);
8386   return rc;
8387 }
8388 
8389 
8390 #ifndef SQLITE_OMIT_SHARED_CACHE
8391 /*
8392 ** Obtain a lock on the table whose root page is iTab.  The
8393 ** lock is a write lock if isWritelock is true or a read lock
8394 ** if it is false.
8395 */
8396 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8397   int rc = SQLITE_OK;
8398   assert( p->inTrans!=TRANS_NONE );
8399   if( p->sharable ){
8400     u8 lockType = READ_LOCK + isWriteLock;
8401     assert( READ_LOCK+1==WRITE_LOCK );
8402     assert( isWriteLock==0 || isWriteLock==1 );
8403 
8404     sqlite3BtreeEnter(p);
8405     rc = querySharedCacheTableLock(p, iTab, lockType);
8406     if( rc==SQLITE_OK ){
8407       rc = setSharedCacheTableLock(p, iTab, lockType);
8408     }
8409     sqlite3BtreeLeave(p);
8410   }
8411   return rc;
8412 }
8413 #endif
8414 
8415 #ifndef SQLITE_OMIT_INCRBLOB
8416 /*
8417 ** Argument pCsr must be a cursor opened for writing on an
8418 ** INTKEY table currently pointing at a valid table entry.
8419 ** This function modifies the data stored as part of that entry.
8420 **
8421 ** Only the data content may only be modified, it is not possible to
8422 ** change the length of the data stored. If this function is called with
8423 ** parameters that attempt to write past the end of the existing data,
8424 ** no modifications are made and SQLITE_CORRUPT is returned.
8425 */
8426 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8427   int rc;
8428   assert( cursorHoldsMutex(pCsr) );
8429   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8430   assert( pCsr->curFlags & BTCF_Incrblob );
8431 
8432   rc = restoreCursorPosition(pCsr);
8433   if( rc!=SQLITE_OK ){
8434     return rc;
8435   }
8436   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8437   if( pCsr->eState!=CURSOR_VALID ){
8438     return SQLITE_ABORT;
8439   }
8440 
8441   /* Save the positions of all other cursors open on this table. This is
8442   ** required in case any of them are holding references to an xFetch
8443   ** version of the b-tree page modified by the accessPayload call below.
8444   **
8445   ** Note that pCsr must be open on a BTREE_INTKEY table and saveCursorPosition()
8446   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
8447   ** saveAllCursors can only return SQLITE_OK.
8448   */
8449   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
8450   assert( rc==SQLITE_OK );
8451 
8452   /* Check some assumptions:
8453   **   (a) the cursor is open for writing,
8454   **   (b) there is a read/write transaction open,
8455   **   (c) the connection holds a write-lock on the table (if required),
8456   **   (d) there are no conflicting read-locks, and
8457   **   (e) the cursor points at a valid row of an intKey table.
8458   */
8459   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
8460     return SQLITE_READONLY;
8461   }
8462   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
8463               && pCsr->pBt->inTransaction==TRANS_WRITE );
8464   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8465   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8466   assert( pCsr->apPage[pCsr->iPage]->intKey );
8467 
8468   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8469 }
8470 
8471 /*
8472 ** Mark this cursor as an incremental blob cursor.
8473 */
8474 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
8475   pCur->curFlags |= BTCF_Incrblob;
8476 }
8477 #endif
8478 
8479 /*
8480 ** Set both the "read version" (single byte at byte offset 18) and
8481 ** "write version" (single byte at byte offset 19) fields in the database
8482 ** header to iVersion.
8483 */
8484 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8485   BtShared *pBt = pBtree->pBt;
8486   int rc;                         /* Return code */
8487 
8488   assert( iVersion==1 || iVersion==2 );
8489 
8490   /* If setting the version fields to 1, do not automatically open the
8491   ** WAL connection, even if the version fields are currently set to 2.
8492   */
8493   pBt->btsFlags &= ~BTS_NO_WAL;
8494   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
8495 
8496   rc = sqlite3BtreeBeginTrans(pBtree, 0);
8497   if( rc==SQLITE_OK ){
8498     u8 *aData = pBt->pPage1->aData;
8499     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8500       rc = sqlite3BtreeBeginTrans(pBtree, 2);
8501       if( rc==SQLITE_OK ){
8502         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8503         if( rc==SQLITE_OK ){
8504           aData[18] = (u8)iVersion;
8505           aData[19] = (u8)iVersion;
8506         }
8507       }
8508     }
8509   }
8510 
8511   pBt->btsFlags &= ~BTS_NO_WAL;
8512   return rc;
8513 }
8514 
8515 /*
8516 ** set the mask of hint flags for cursor pCsr. Currently the only valid
8517 ** values are 0 and BTREE_BULKLOAD.
8518 */
8519 void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
8520   assert( mask==BTREE_BULKLOAD || mask==0 );
8521   pCsr->hints = mask;
8522 }
8523 
8524 /*
8525 ** Return true if the given Btree is read-only.
8526 */
8527 int sqlite3BtreeIsReadonly(Btree *p){
8528   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
8529 }
8530