xref: /sqlite-3.40.0/src/btree.c (revision 754d3adf)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
/* File-scope, read-only copy of SQLITE_FILE_HEADER (a macro defined
** outside this file). */
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
/* TRACE(X) takes a parenthesized printf() argument list, for example
** TRACE(("value=%d\n", v));  The expansion is wrapped in do{}while(0)
** so that it is a single statement: a bare "if(...){...}" expansion
** would capture the "else" of an enclosing un-braced if/else. */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){printf X;fflush(stdout);} }while(0)
#else
/* Tracing disabled: TRACE(X) expands to nothing. */
# define TRACE(X)
#endif
34 
35 /*
36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
37 ** But if the value is zero, make it 65536.
38 **
39 ** This routine is used to extract the "offset to cell content area" value
40 ** from the header of a btree page.  If the page size is 65536 and the page
41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
42 ** This routine makes the necessary adjustment to 65536.
43 */
/* For a get2byte() result in the range 0..65535: subtracting 1, masking
** to 16 bits and adding 1 back maps 0 to 65536 and leaves every value
** in 1..65535 unchanged.  X is expanded only once. */
#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
45 
46 #ifndef SQLITE_OMIT_SHARED_CACHE
47 /*
48 ** A list of BtShared objects that are eligible for participation
49 ** in shared cache.  This variable has file scope during normal builds,
50 ** but the test harness needs to access it so we make it global for
51 ** test builds.
52 **
53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
54 */
#ifdef SQLITE_TEST
/* Test builds: external linkage, so code outside this file can see it. */
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
/* Normal builds: visible in this file only. */
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
60 #endif /* SQLITE_OMIT_SHARED_CACHE */
61 
62 #ifndef SQLITE_OMIT_SHARED_CACHE
63 /*
64 ** Enable or disable the shared pager and schema features.
65 **
66 ** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
69 */
int sqlite3_enable_shared_cache(int enable){
  /* The value is stored exactly as given; it is not validated here. */
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;  /* This routine always reports success */
}
74 #endif
75 
76 
77 
78 #ifdef SQLITE_OMIT_SHARED_CACHE
79   /*
80   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
81   ** and clearAllSharedCacheTableLocks()
82   ** manipulate entries in the BtShared.pLock linked list used to store
83   ** shared-cache table level locks. If the library is compiled with the
84   ** shared-cache feature disabled, then there is only ever one user
85   ** of each BtShared structure and so this locking is not necessary.
86   ** So define the lock related functions as no-ops.
87   */
  /* Querying for and acquiring a table lock always report success. */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  /* Releasing and downgrading locks expand to nothing. */
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  /* Predicates: the required lock is always considered held, and there
  ** are never any read conflicts. */
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
94 #endif
95 
96 #ifndef SQLITE_OMIT_SHARED_CACHE
97 
98 #ifdef SQLITE_DEBUG
99 /*
100 **** This function is only used as part of an assert() statement. ***
101 **
102 ** Check to see if pBtree holds the required locks to read or write to the
103 ** table with root page iRoot.   Return 1 if it does and 0 if not.
104 **
105 ** For example, when writing to a table with root-page iRoot via
106 ** Btree connection pBtree:
107 **
108 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
109 **
110 ** When writing to an index that resides in a sharable database, the
111 ** caller should have first obtained a lock specifying the root page of
112 ** the corresponding table. This makes things a bit more complicated,
113 ** as this module treats each table as a separate structure. To determine
114 ** the table corresponding to the index being written, this
115 ** function has to search through the database schema.
116 **
117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
118 ** hold a write-lock on the schema table (root page 1). This is also
119 ** acceptable.
120 */
121 static int hasSharedCacheTableLock(
122   Btree *pBtree,         /* Handle that must hold lock */
123   Pgno iRoot,            /* Root page of b-tree */
124   int isIndex,           /* True if iRoot is the root of an index b-tree */
125   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
126 ){
127   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
128   Pgno iTab = 0;
129   BtLock *pLock;
130 
131   /* If this database is not shareable, or if the client is reading
132   ** and has the read-uncommitted flag set, then no lock is required.
133   ** Return true immediately.
134   */
135   if( (pBtree->sharable==0)
136    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
137   ){
138     return 1;
139   }
140 
141   /* If the client is reading  or writing an index and the schema is
142   ** not loaded, then it is too difficult to actually check to see if
143   ** the correct locks are held.  So do not bother - just return true.
144   ** This case does not come up very often anyhow.
145   */
146   if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
147     return 1;
148   }
149 
150   /* Figure out the root-page that the lock should be held on. For table
151   ** b-trees, this is just the root page of the b-tree being read or
152   ** written. For index b-trees, it is the root page of the associated
153   ** table.  */
154   if( isIndex ){
155     HashElem *p;
156     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
157       Index *pIdx = (Index *)sqliteHashData(p);
158       if( pIdx->tnum==(int)iRoot ){
159         iTab = pIdx->pTable->tnum;
160       }
161     }
162   }else{
163     iTab = iRoot;
164   }
165 
166   /* Search for the required lock. Either a write-lock on root-page iTab, a
167   ** write-lock on the schema table, or (if the client is reading) a
168   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
169   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
170     if( pLock->pBtree==pBtree
171      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
172      && pLock->eLock>=eLockType
173     ){
174       return 1;
175     }
176   }
177 
178   /* Failed to find the required lock. */
179   return 0;
180 }
181 #endif /* SQLITE_DEBUG */
182 
183 #ifdef SQLITE_DEBUG
184 /*
185 **** This function may be used as part of assert() statements only. ****
186 **
187 ** Return true if it would be illegal for pBtree to write into the
188 ** table or index rooted at iRoot because other shared connections are
189 ** simultaneously reading that same table or index.
190 **
191 ** It is illegal for pBtree to write if some other Btree object that
192 ** shares the same BtShared object is currently reading or writing
193 ** the iRoot table.  Except, if the other Btree object has the
194 ** read-uncommitted flag set, then it is OK for the other object to
195 ** have a read cursor.
196 **
197 ** For example, before writing to any part of the table or index
198 ** rooted at page iRoot, one should call:
199 **
200 **    assert( !hasReadConflicts(pBtree, iRoot) );
201 */
202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
203   BtCursor *p;
204   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
205     if( p->pgnoRoot==iRoot
206      && p->pBtree!=pBtree
207      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
208     ){
209       return 1;
210     }
211   }
212   return 0;
213 }
214 #endif    /* #ifdef SQLITE_DEBUG */
215 
216 /*
217 ** Query to see if Btree handle p may obtain a lock of type eLock
218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
219 ** SQLITE_OK if the lock may be obtained (by calling
220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
221 */
222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
223   BtShared *pBt = p->pBt;
224   BtLock *pIter;
225 
226   assert( sqlite3BtreeHoldsMutex(p) );
227   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
228   assert( p->db!=0 );
229   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
230 
231   /* If requesting a write-lock, then the Btree must have an open write
232   ** transaction on this file. And, obviously, for this to be so there
233   ** must be an open write transaction on the file itself.
234   */
235   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
236   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
237 
238   /* This routine is a no-op if the shared-cache is not enabled */
239   if( !p->sharable ){
240     return SQLITE_OK;
241   }
242 
243   /* If some other connection is holding an exclusive lock, the
244   ** requested lock may not be obtained.
245   */
246   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
247     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
248     return SQLITE_LOCKED_SHAREDCACHE;
249   }
250 
251   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
252     /* The condition (pIter->eLock!=eLock) in the following if(...)
253     ** statement is a simplification of:
254     **
255     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
256     **
257     ** since we know that if eLock==WRITE_LOCK, then no other connection
258     ** may hold a WRITE_LOCK on any table in this file (since there can
259     ** only be a single writer).
260     */
261     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
262     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
263     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
264       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
265       if( eLock==WRITE_LOCK ){
266         assert( p==pBt->pWriter );
267         pBt->btsFlags |= BTS_PENDING;
268       }
269       return SQLITE_LOCKED_SHAREDCACHE;
270     }
271   }
272   return SQLITE_OK;
273 }
274 #endif /* !SQLITE_OMIT_SHARED_CACHE */
275 
276 #ifndef SQLITE_OMIT_SHARED_CACHE
277 /*
278 ** Add a lock on the table with root-page iTable to the shared-btree used
279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
280 ** WRITE_LOCK.
281 **
282 ** This function assumes the following:
283 **
**   (a) The specified Btree object p is connected to a sharable
**       database (one with the Btree.sharable flag set), and
286 **
287 **   (b) No other Btree objects hold a lock that conflicts
288 **       with the requested lock (i.e. querySharedCacheTableLock() has
289 **       already been called and returned SQLITE_OK).
290 **
291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
292 ** is returned if a malloc attempt fails.
293 */
294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
295   BtShared *pBt = p->pBt;
296   BtLock *pLock = 0;
297   BtLock *pIter;
298 
299   assert( sqlite3BtreeHoldsMutex(p) );
300   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
301   assert( p->db!=0 );
302 
303   /* A connection with the read-uncommitted flag set will never try to
304   ** obtain a read-lock using this function. The only read-lock obtained
305   ** by a connection in read-uncommitted mode is on the sqlite_master
306   ** table, and that lock is obtained in BtreeBeginTrans().  */
307   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
308 
309   /* This function should only be called on a sharable b-tree after it
310   ** has been determined that no other b-tree holds a conflicting lock.  */
311   assert( p->sharable );
312   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
313 
314   /* First search the list for an existing lock on this table. */
315   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
316     if( pIter->iTable==iTable && pIter->pBtree==p ){
317       pLock = pIter;
318       break;
319     }
320   }
321 
322   /* If the above search did not find a BtLock struct associating Btree p
323   ** with table iTable, allocate one and link it into the list.
324   */
325   if( !pLock ){
326     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
327     if( !pLock ){
328       return SQLITE_NOMEM;
329     }
330     pLock->iTable = iTable;
331     pLock->pBtree = p;
332     pLock->pNext = pBt->pLock;
333     pBt->pLock = pLock;
334   }
335 
336   /* Set the BtLock.eLock variable to the maximum of the current lock
337   ** and the requested lock. This means if a write-lock was already held
338   ** and a read-lock requested, we don't incorrectly downgrade the lock.
339   */
340   assert( WRITE_LOCK>READ_LOCK );
341   if( eLock>pLock->eLock ){
342     pLock->eLock = eLock;
343   }
344 
345   return SQLITE_OK;
346 }
347 #endif /* !SQLITE_OMIT_SHARED_CACHE */
348 
349 #ifndef SQLITE_OMIT_SHARED_CACHE
350 /*
351 ** Release all the table locks (locks obtained via calls to
352 ** the setSharedCacheTableLock() procedure) held by Btree object p.
353 **
354 ** This function assumes that Btree p has an open read or write
355 ** transaction. If it does not, then the BTS_PENDING flag
356 ** may be incorrectly cleared.
357 */
358 static void clearAllSharedCacheTableLocks(Btree *p){
359   BtShared *pBt = p->pBt;
360   BtLock **ppIter = &pBt->pLock;
361 
362   assert( sqlite3BtreeHoldsMutex(p) );
363   assert( p->sharable || 0==*ppIter );
364   assert( p->inTrans>0 );
365 
366   while( *ppIter ){
367     BtLock *pLock = *ppIter;
368     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
369     assert( pLock->pBtree->inTrans>=pLock->eLock );
370     if( pLock->pBtree==p ){
371       *ppIter = pLock->pNext;
372       assert( pLock->iTable!=1 || pLock==&p->lock );
373       if( pLock->iTable!=1 ){
374         sqlite3_free(pLock);
375       }
376     }else{
377       ppIter = &pLock->pNext;
378     }
379   }
380 
381   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
382   if( pBt->pWriter==p ){
383     pBt->pWriter = 0;
384     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
385   }else if( pBt->nTransaction==2 ){
386     /* This function is called when Btree p is concluding its
387     ** transaction. If there currently exists a writer, and p is not
388     ** that writer, then the number of locks held by connections other
389     ** than the writer must be about to drop to zero. In this case
390     ** set the BTS_PENDING flag to 0.
391     **
392     ** If there is not currently a writer, then BTS_PENDING must
393     ** be zero already. So this next line is harmless in that case.
394     */
395     pBt->btsFlags &= ~BTS_PENDING;
396   }
397 }
398 
399 /*
400 ** This function changes all write-locks held by Btree p into read-locks.
401 */
402 static void downgradeAllSharedCacheTableLocks(Btree *p){
403   BtShared *pBt = p->pBt;
404   if( pBt->pWriter==p ){
405     BtLock *pLock;
406     pBt->pWriter = 0;
407     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
408     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
409       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
410       pLock->eLock = READ_LOCK;
411     }
412   }
413 }
414 
415 #endif /* SQLITE_OMIT_SHARED_CACHE */
416 
417 static void releasePage(MemPage *pPage);  /* Forward reference */
418 
419 /*
420 ***** This routine is used inside of assert() only ****
421 **
422 ** Verify that the cursor holds the mutex on its BtShared
423 */
424 #ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  /* Forwards the result of sqlite3_mutex_held() for the mutex of the
  ** BtShared that cursor p belongs to. */
  return sqlite3_mutex_held(p->pBt->mutex);
}
428 #endif
429 
430 
431 #ifndef SQLITE_OMIT_INCRBLOB
432 /*
433 ** Invalidate the overflow page-list cache for cursor pCur, if any.
434 */
static void invalidateOverflowCache(BtCursor *pCur){
  assert( cursorHoldsMutex(pCur) );
  sqlite3_free(pCur->aOverflow);
  pCur->aOverflow = 0;  /* Clear the pointer so the freed array is not reused */
}
440 
441 /*
442 ** Invalidate the overflow page-list cache for all cursors opened
443 ** on the shared btree structure pBt.
444 */
445 static void invalidateAllOverflowCache(BtShared *pBt){
446   BtCursor *p;
447   assert( sqlite3_mutex_held(pBt->mutex) );
448   for(p=pBt->pCursor; p; p=p->pNext){
449     invalidateOverflowCache(p);
450   }
451 }
452 
453 /*
454 ** This function is called before modifying the contents of a table
455 ** to invalidate any incrblob cursors that are open on the
456 ** row or one of the rows being modified.
457 **
** If argument isClearTable is true, then the entire contents of the
** table is about to be deleted. In this case invalidate every incrblob
** cursor open on the shared b-tree. This routine takes no root-page
** argument, so the cursors are not filtered by table.
461 **
462 ** Otherwise, if argument isClearTable is false, then the row with
463 ** rowid iRow is being replaced or deleted. In this case invalidate
464 ** only those incrblob cursors open on that specific row.
465 */
466 static void invalidateIncrblobCursors(
467   Btree *pBtree,          /* The database file to check */
468   i64 iRow,               /* The rowid that might be changing */
469   int isClearTable        /* True if all rows are being deleted */
470 ){
471   BtCursor *p;
472   BtShared *pBt = pBtree->pBt;
473   assert( sqlite3BtreeHoldsMutex(pBtree) );
474   for(p=pBt->pCursor; p; p=p->pNext){
475     if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
476       p->eState = CURSOR_INVALID;
477     }
478   }
479 }
480 
481 #else
482   /* Stub functions when INCRBLOB is omitted */
483   #define invalidateOverflowCache(x)
484   #define invalidateAllOverflowCache(x)
485   #define invalidateIncrblobCursors(x,y,z)
486 #endif /* SQLITE_OMIT_INCRBLOB */
487 
488 /*
489 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
490 ** when a page that previously contained data becomes a free-list leaf
491 ** page.
492 **
493 ** The BtShared.pHasContent bitvec exists to work around an obscure
494 ** bug caused by the interaction of two useful IO optimizations surrounding
495 ** free-list leaf pages:
496 **
497 **   1) When all data is deleted from a page and the page becomes
498 **      a free-list leaf page, the page is not written to the database
499 **      (as free-list leaf pages contain no meaningful data). Sometimes
500 **      such a page is not even journalled (as it will not be modified,
501 **      why bother journalling it?).
502 **
503 **   2) When a free-list leaf page is reused, its content is not read
504 **      from the database or written to the journal file (why should it
505 **      be, if it is not at all meaningful?).
506 **
507 ** By themselves, these optimizations work fine and provide a handy
508 ** performance boost to bulk delete or insert operations. However, if
509 ** a page is moved to the free-list and then reused within the same
510 ** transaction, a problem comes up. If the page is not journalled when
511 ** it is moved to the free-list and it is also not journalled when it
512 ** is extracted from the free-list and reused, then the original data
513 ** may be lost. In the event of a rollback, it may not be possible
514 ** to restore the database to its original configuration.
515 **
516 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
517 ** moved to become a free-list leaf page, the corresponding bit is
518 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
519 ** optimization 2 above is omitted if the corresponding bit is already
520 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
521 ** at the end of every transaction.
522 */
523 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
524   int rc = SQLITE_OK;
525   if( !pBt->pHasContent ){
526     assert( pgno<=pBt->nPage );
527     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
528     if( !pBt->pHasContent ){
529       rc = SQLITE_NOMEM;
530     }
531   }
532   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
533     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
534   }
535   return rc;
536 }
537 
538 /*
539 ** Query the BtShared.pHasContent vector.
540 **
541 ** This function is called when a free-list leaf page is removed from the
542 ** free-list for reuse. It returns false if it is safe to retrieve the
543 ** page from the pager layer with the 'no-content' flag set. True otherwise.
544 */
545 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
546   Bitvec *p = pBt->pHasContent;
547   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
548 }
549 
550 /*
551 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
552 ** invoked at the conclusion of each write-transaction.
553 */
static void btreeClearHasContent(BtShared *pBt){
  sqlite3BitvecDestroy(pBt->pHasContent);
  pBt->pHasContent = 0;  /* btreeSetHasContent() creates a new bitvec on demand */
}
558 
559 /*
560 ** Save the current cursor position in the variables BtCursor.nKey
561 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
562 **
563 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
564 ** prior to calling this routine.
565 */
566 static int saveCursorPosition(BtCursor *pCur){
567   int rc;
568 
569   assert( CURSOR_VALID==pCur->eState );
570   assert( 0==pCur->pKey );
571   assert( cursorHoldsMutex(pCur) );
572 
573   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
574   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
575 
576   /* If this is an intKey table, then the above call to BtreeKeySize()
577   ** stores the integer key in pCur->nKey. In this case this value is
578   ** all that is required. Otherwise, if pCur is not open on an intKey
579   ** table, then malloc space for and store the pCur->nKey bytes of key
580   ** data.
581   */
582   if( 0==pCur->apPage[0]->intKey ){
583     void *pKey = sqlite3Malloc( (int)pCur->nKey );
584     if( pKey ){
585       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
586       if( rc==SQLITE_OK ){
587         pCur->pKey = pKey;
588       }else{
589         sqlite3_free(pKey);
590       }
591     }else{
592       rc = SQLITE_NOMEM;
593     }
594   }
595   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
596 
597   if( rc==SQLITE_OK ){
598     int i;
599     for(i=0; i<=pCur->iPage; i++){
600       releasePage(pCur->apPage[i]);
601       pCur->apPage[i] = 0;
602     }
603     pCur->iPage = -1;
604     pCur->eState = CURSOR_REQUIRESEEK;
605   }
606 
607   invalidateOverflowCache(pCur);
608   return rc;
609 }
610 
611 /*
612 ** Save the positions of all cursors (except pExcept) that are open on
613 ** the table  with root-page iRoot. Usually, this is called just before cursor
614 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
615 */
616 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
617   BtCursor *p;
618   assert( sqlite3_mutex_held(pBt->mutex) );
619   assert( pExcept==0 || pExcept->pBt==pBt );
620   for(p=pBt->pCursor; p; p=p->pNext){
621     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
622         p->eState==CURSOR_VALID ){
623       int rc = saveCursorPosition(p);
624       if( SQLITE_OK!=rc ){
625         return rc;
626       }
627     }
628   }
629   return SQLITE_OK;
630 }
631 
632 /*
633 ** Clear the current cursor position.
634 */
void sqlite3BtreeClearCursor(BtCursor *pCur){
  assert( cursorHoldsMutex(pCur) );
  /* Release the saved key (if any) and clear the pointer to it. */
  sqlite3_free(pCur->pKey);
  pCur->pKey = 0;
  pCur->eState = CURSOR_INVALID;
}
641 
642 /*
643 ** In this version of BtreeMoveto, pKey is a packed index record
644 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
645 ** record and then call BtreeMovetoUnpacked() to do the work.
646 */
647 static int btreeMoveto(
648   BtCursor *pCur,     /* Cursor open on the btree to be searched */
649   const void *pKey,   /* Packed key if the btree is an index */
650   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
651   int bias,           /* Bias search to the high end */
652   int *pRes           /* Write search results here */
653 ){
654   int rc;                    /* Status code */
655   UnpackedRecord *pIdxKey;   /* Unpacked index key */
656   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */
657   char *pFree = 0;
658 
659   if( pKey ){
660     assert( nKey==(i64)(int)nKey );
661     pIdxKey = sqlite3VdbeAllocUnpackedRecord(
662         pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
663     );
664     if( pIdxKey==0 ) return SQLITE_NOMEM;
665     sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
666   }else{
667     pIdxKey = 0;
668   }
669   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
670   if( pFree ){
671     sqlite3DbFree(pCur->pKeyInfo->db, pFree);
672   }
673   return rc;
674 }
675 
676 /*
677 ** Restore the cursor to the position it was in (or as close to as possible)
678 ** when saveCursorPosition() was called. Note that this call deletes the
679 ** saved position info stored by saveCursorPosition(), so there can be
680 ** at most one effective restoreCursorPosition() call after each
681 ** saveCursorPosition().
682 */
683 static int btreeRestoreCursorPosition(BtCursor *pCur){
684   int rc;
685   assert( cursorHoldsMutex(pCur) );
686   assert( pCur->eState>=CURSOR_REQUIRESEEK );
687   if( pCur->eState==CURSOR_FAULT ){
688     return pCur->skipNext;
689   }
690   pCur->eState = CURSOR_INVALID;
691   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
692   if( rc==SQLITE_OK ){
693     sqlite3_free(pCur->pKey);
694     pCur->pKey = 0;
695     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
696   }
697   return rc;
698 }
699 
/* Restore cursor p only if its state says a restore is needed (eState at
** or above CURSOR_REQUIRESEEK); otherwise evaluate to SQLITE_OK.  The
** argument is parenthesized so that any pointer expression (for example
** &cur or aCur+1) expands correctly.  Note that p is expanded twice, so
** it must not have side effects. */
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
704 
705 /*
706 ** Determine whether or not a cursor has moved from the position it
707 ** was last placed at.  Cursors can move when the row they are pointing
708 ** at is deleted out from under them.
709 **
710 ** This routine returns an error code if something goes wrong.  The
711 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
712 */
713 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
714   int rc;
715 
716   rc = restoreCursorPosition(pCur);
717   if( rc ){
718     *pHasMoved = 1;
719     return rc;
720   }
721   if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
722     *pHasMoved = 1;
723   }else{
724     *pHasMoved = 0;
725   }
726   return SQLITE_OK;
727 }
728 
729 #ifndef SQLITE_OMIT_AUTOVACUUM
730 /*
731 ** Given a page number of a regular database page, return the page
732 ** number for the pointer-map page that contains the entry for the
733 ** input page number.
734 **
735 ** Return 0 (not a valid page) for pgno==1 since there is
736 ** no pointer map associated with page 1.  The integrity_check logic
737 ** requires that ptrmapPageno(*,1)!=1.
738 */
739 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
740   int nPagesPerMapPage;
741   Pgno iPtrMap, ret;
742   assert( sqlite3_mutex_held(pBt->mutex) );
743   if( pgno<2 ) return 0;
744   nPagesPerMapPage = (pBt->usableSize/5)+1;
745   iPtrMap = (pgno-2)/nPagesPerMapPage;
746   ret = (iPtrMap*nPagesPerMapPage) + 2;
747   if( ret==PENDING_BYTE_PAGE(pBt) ){
748     ret++;
749   }
750   return ret;
751 }
752 
753 /*
754 ** Write an entry into the pointer map.
755 **
756 ** This routine updates the pointer map entry for page number 'key'
757 ** so that it maps to type 'eType' and parent page number 'pgno'.
758 **
759 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
760 ** a no-op.  If an error occurs, the appropriate error code is written
761 ** into *pRC.
762 */
763 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
764   DbPage *pDbPage;  /* The pointer map page */
765   u8 *pPtrmap;      /* The pointer map data */
766   Pgno iPtrmap;     /* The pointer map page number */
767   int offset;       /* Offset in pointer map page */
768   int rc;           /* Return code from subfunctions */
769 
770   if( *pRC ) return;
771 
772   assert( sqlite3_mutex_held(pBt->mutex) );
773   /* The master-journal page number must never be used as a pointer map page */
774   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
775 
776   assert( pBt->autoVacuum );
777   if( key==0 ){
778     *pRC = SQLITE_CORRUPT_BKPT;
779     return;
780   }
781   iPtrmap = PTRMAP_PAGENO(pBt, key);
782   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
783   if( rc!=SQLITE_OK ){
784     *pRC = rc;
785     return;
786   }
787   offset = PTRMAP_PTROFFSET(iPtrmap, key);
788   if( offset<0 ){
789     *pRC = SQLITE_CORRUPT_BKPT;
790     goto ptrmap_exit;
791   }
792   assert( offset <= (int)pBt->usableSize-5 );
793   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
794 
795   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
796     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
797     *pRC= rc = sqlite3PagerWrite(pDbPage);
798     if( rc==SQLITE_OK ){
799       pPtrmap[offset] = eType;
800       put4byte(&pPtrmap[offset+1], parent);
801     }
802   }
803 
804 ptrmap_exit:
805   sqlite3PagerUnref(pDbPage);
806 }
807 
808 /*
809 ** Read an entry from the pointer map.
810 **
811 ** This routine retrieves the pointer map entry for page 'key', writing
812 ** the type and parent page number to *pEType and *pPgno respectively.
813 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
814 */
815 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
816   DbPage *pDbPage;   /* The pointer map page */
817   int iPtrmap;       /* Pointer map page index */
818   u8 *pPtrmap;       /* Pointer map page data */
819   int offset;        /* Offset of entry in pointer map */
820   int rc;
821 
822   assert( sqlite3_mutex_held(pBt->mutex) );
823 
824   iPtrmap = PTRMAP_PAGENO(pBt, key);
825   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
826   if( rc!=0 ){
827     return rc;
828   }
829   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
830 
831   offset = PTRMAP_PTROFFSET(iPtrmap, key);
832   if( offset<0 ){
833     sqlite3PagerUnref(pDbPage);
834     return SQLITE_CORRUPT_BKPT;
835   }
836   assert( offset <= (int)pBt->usableSize-5 );
837   assert( pEType!=0 );
838   *pEType = pPtrmap[offset];
839   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
840 
841   sqlite3PagerUnref(pDbPage);
842   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
843   return SQLITE_OK;
844 }
845 
846 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Without auto-vacuum, writes to the pointer map expand to nothing and
  ** reads always evaluate to SQLITE_OK. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
850 #endif
851 
852 /*
853 ** Given a btree page and a cell index (0 means the first cell on
854 ** the page, 1 means the second cell, and so forth) return a pointer
855 ** to the cell content.
856 **
857 ** This routine works only for pages that do not contain overflow cells.
858 */
/* findCell(P,I): pointer to cell I of page P.  The 2-byte entry at
** aCellIdx[2*I] is read, masked with P->maskPage, and added to P->aData. */
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)])))
/* findCellv2(D,M,O,I): the same computation on raw values: D is the page
** data, M the mask, O the offset within D of the cell-pointer array and
** I the cell index.  Every parameter is parenthesized so that compound
** argument expressions (for example a|b for M) expand correctly. */
#define findCellv2(D,M,O,I) ((D)+((M)&get2byte((D)+((O)+2*(I)))))
862 
863 
864 /*
** This is a more complex version of findCell() that works for
866 ** pages that do contain overflow cells.
867 */
868 static u8 *findOverflowCell(MemPage *pPage, int iCell){
869   int i;
870   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
871   for(i=pPage->nOverflow-1; i>=0; i--){
872     int k;
873     k = pPage->aiOvfl[i];
874     if( k<=iCell ){
875       if( k==iCell ){
876         return pPage->apOvfl[i];
877       }
878       iCell--;
879     }
880   }
881   return findCell(pPage, iCell);
882 }
883 
884 /*
885 ** Parse a cell content block and fill in the CellInfo structure.  There
886 ** are two versions of this function.  btreeParseCell() takes a
887 ** cell index as the second argument and btreeParseCellPtr()
888 ** takes a pointer to the body of the cell as its second argument.
889 **
890 ** Within this file, the parseCell() macro can be called instead of
891 ** btreeParseCellPtr(). Using some compilers, this will be faster.
892 */
893 static void btreeParseCellPtr(
894   MemPage *pPage,         /* Page containing the cell */
895   u8 *pCell,              /* Pointer to the cell text. */
896   CellInfo *pInfo         /* Fill in this structure */
897 ){
898   u16 n;                  /* Number bytes in cell content header */
899   u32 nPayload;           /* Number of bytes of cell payload */
900 
901   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
902 
903   pInfo->pCell = pCell;
904   assert( pPage->leaf==0 || pPage->leaf==1 );
905   n = pPage->childPtrSize;
906   assert( n==4-4*pPage->leaf );
907   if( pPage->intKey ){
908     if( pPage->hasData ){
909       n += getVarint32(&pCell[n], nPayload);
910     }else{
911       nPayload = 0;
912     }
913     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
914     pInfo->nData = nPayload;
915   }else{
916     pInfo->nData = 0;
917     n += getVarint32(&pCell[n], nPayload);
918     pInfo->nKey = nPayload;
919   }
920   pInfo->nPayload = nPayload;
921   pInfo->nHeader = n;
922   testcase( nPayload==pPage->maxLocal );
923   testcase( nPayload==pPage->maxLocal+1 );
924   if( likely(nPayload<=pPage->maxLocal) ){
925     /* This is the (easy) common case where the entire payload fits
926     ** on the local page.  No overflow is required.
927     */
928     if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
929     pInfo->nLocal = (u16)nPayload;
930     pInfo->iOverflow = 0;
931   }else{
932     /* If the payload will not fit completely on the local page, we have
933     ** to decide how much to store locally and how much to spill onto
934     ** overflow pages.  The strategy is to minimize the amount of unused
935     ** space on overflow pages while keeping the amount of local storage
936     ** in between minLocal and maxLocal.
937     **
938     ** Warning:  changing the way overflow payload is distributed in any
939     ** way will result in an incompatible file format.
940     */
941     int minLocal;  /* Minimum amount of payload held locally */
942     int maxLocal;  /* Maximum amount of payload held locally */
943     int surplus;   /* Overflow payload available for local storage */
944 
945     minLocal = pPage->minLocal;
946     maxLocal = pPage->maxLocal;
947     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
948     testcase( surplus==maxLocal );
949     testcase( surplus==maxLocal+1 );
950     if( surplus <= maxLocal ){
951       pInfo->nLocal = (u16)surplus;
952     }else{
953       pInfo->nLocal = (u16)minLocal;
954     }
955     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
956     pInfo->nSize = pInfo->iOverflow + 4;
957   }
958 }
959 #define parseCell(pPage, iCell, pInfo) \
960   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
961 static void btreeParseCell(
962   MemPage *pPage,         /* Page containing the cell */
963   int iCell,              /* The cell index.  First cell is 0 */
964   CellInfo *pInfo         /* Fill in this structure */
965 ){
966   parseCell(pPage, iCell, pInfo);
967 }
968 
969 /*
970 ** Compute the total number of bytes that a Cell needs in the cell
971 ** data area of the btree-page.  The return number includes the cell
972 ** data header and the local payload, but not any overflow page or
973 ** the space used by the cell pointer.
974 */
975 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
976   u8 *pIter = &pCell[pPage->childPtrSize];
977   u32 nSize;
978 
979 #ifdef SQLITE_DEBUG
980   /* The value returned by this function should always be the same as
981   ** the (CellInfo.nSize) value found by doing a full parse of the
982   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
983   ** this function verifies that this invariant is not violated. */
984   CellInfo debuginfo;
985   btreeParseCellPtr(pPage, pCell, &debuginfo);
986 #endif
987 
988   if( pPage->intKey ){
989     u8 *pEnd;
990     if( pPage->hasData ){
991       pIter += getVarint32(pIter, nSize);
992     }else{
993       nSize = 0;
994     }
995 
996     /* pIter now points at the 64-bit integer key value, a variable length
997     ** integer. The following block moves pIter to point at the first byte
998     ** past the end of the key value. */
999     pEnd = &pIter[9];
1000     while( (*pIter++)&0x80 && pIter<pEnd );
1001   }else{
1002     pIter += getVarint32(pIter, nSize);
1003   }
1004 
1005   testcase( nSize==pPage->maxLocal );
1006   testcase( nSize==pPage->maxLocal+1 );
1007   if( nSize>pPage->maxLocal ){
1008     int minLocal = pPage->minLocal;
1009     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1010     testcase( nSize==pPage->maxLocal );
1011     testcase( nSize==pPage->maxLocal+1 );
1012     if( nSize>pPage->maxLocal ){
1013       nSize = minLocal;
1014     }
1015     nSize += 4;
1016   }
1017   nSize += (u32)(pIter - pCell);
1018 
1019   /* The minimum size of any cell is 4 bytes. */
1020   if( nSize<4 ){
1021     nSize = 4;
1022   }
1023 
1024   assert( nSize==debuginfo.nSize );
1025   return (u16)nSize;
1026 }
1027 
#ifdef SQLITE_DEBUG
/* Index-based form of cellSizePtr(): return the size of cell number
** iCell on pPage.  Compiled only when SQLITE_DEBUG is defined, for use
** inside assert() statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u16 nByte;
  nByte = cellSizePtr(pPage, findCell(pPage, iCell));
  return nByte;
}
#endif
1035 
1036 #ifndef SQLITE_OMIT_AUTOVACUUM
1037 /*
1038 ** If the cell pCell, part of page pPage contains a pointer
1039 ** to an overflow page, insert an entry into the pointer-map
1040 ** for the overflow page.
1041 */
1042 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1043   CellInfo info;
1044   if( *pRC ) return;
1045   assert( pCell!=0 );
1046   btreeParseCellPtr(pPage, pCell, &info);
1047   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1048   if( info.iOverflow ){
1049     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1050     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1051   }
1052 }
1053 #endif
1054 
1055 
1056 /*
1057 ** Defragment the page given.  All Cells are moved to the
1058 ** end of the page and all free space is collected into one
1059 ** big FreeBlk that occurs in between the header and cell
1060 ** pointer array and the cell content area.
1061 */
1062 static int defragmentPage(MemPage *pPage){
1063   int i;                     /* Loop counter */
1064   int pc;                    /* Address of a i-th cell */
1065   int hdr;                   /* Offset to the page header */
1066   int size;                  /* Size of a cell */
1067   int usableSize;            /* Number of usable bytes on a page */
1068   int cellOffset;            /* Offset to the cell pointer array */
1069   int cbrk;                  /* Offset to the cell content area */
1070   int nCell;                 /* Number of cells on the page */
1071   unsigned char *data;       /* The page data */
1072   unsigned char *temp;       /* Temp area for cell content */
1073   int iCellFirst;            /* First allowable cell index */
1074   int iCellLast;             /* Last possible cell index */
1075 
1076 
1077   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1078   assert( pPage->pBt!=0 );
1079   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1080   assert( pPage->nOverflow==0 );
1081   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1082   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1083   data = pPage->aData;
1084   hdr = pPage->hdrOffset;
1085   cellOffset = pPage->cellOffset;
1086   nCell = pPage->nCell;
1087   assert( nCell==get2byte(&data[hdr+3]) );
1088   usableSize = pPage->pBt->usableSize;
1089   cbrk = get2byte(&data[hdr+5]);
1090   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1091   cbrk = usableSize;
1092   iCellFirst = cellOffset + 2*nCell;
1093   iCellLast = usableSize - 4;
1094   for(i=0; i<nCell; i++){
1095     u8 *pAddr;     /* The i-th cell pointer */
1096     pAddr = &data[cellOffset + i*2];
1097     pc = get2byte(pAddr);
1098     testcase( pc==iCellFirst );
1099     testcase( pc==iCellLast );
1100 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1101     /* These conditions have already been verified in btreeInitPage()
1102     ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1103     */
1104     if( pc<iCellFirst || pc>iCellLast ){
1105       return SQLITE_CORRUPT_BKPT;
1106     }
1107 #endif
1108     assert( pc>=iCellFirst && pc<=iCellLast );
1109     size = cellSizePtr(pPage, &temp[pc]);
1110     cbrk -= size;
1111 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1112     if( cbrk<iCellFirst ){
1113       return SQLITE_CORRUPT_BKPT;
1114     }
1115 #else
1116     if( cbrk<iCellFirst || pc+size>usableSize ){
1117       return SQLITE_CORRUPT_BKPT;
1118     }
1119 #endif
1120     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1121     testcase( cbrk+size==usableSize );
1122     testcase( pc+size==usableSize );
1123     memcpy(&data[cbrk], &temp[pc], size);
1124     put2byte(pAddr, cbrk);
1125   }
1126   assert( cbrk>=iCellFirst );
1127   put2byte(&data[hdr+5], cbrk);
1128   data[hdr+1] = 0;
1129   data[hdr+2] = 0;
1130   data[hdr+7] = 0;
1131   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1132   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1133   if( cbrk-iCellFirst!=pPage->nFree ){
1134     return SQLITE_CORRUPT_BKPT;
1135   }
1136   return SQLITE_OK;
1137 }
1138 
1139 /*
1140 ** Allocate nByte bytes of space from within the B-Tree page passed
1141 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1142 ** of the first byte of allocated space. Return either SQLITE_OK or
1143 ** an error code (usually SQLITE_CORRUPT).
1144 **
1145 ** The caller guarantees that there is sufficient space to make the
1146 ** allocation.  This routine might need to defragment in order to bring
1147 ** all the space together, however.  This routine will avoid using
1148 ** the first two bytes past the cell pointer area since presumably this
1149 ** allocation is being made in order to insert a new cell, so we will
1150 ** also end up needing a new cell pointer.
1151 */
1152 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1153   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1154   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1155   int nFrag;                           /* Number of fragmented bytes on pPage */
1156   int top;                             /* First byte of cell content area */
1157   int gap;        /* First byte of gap between cell pointers and cell content */
1158   int rc;         /* Integer return code */
1159   int usableSize; /* Usable size of the page */
1160 
1161   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1162   assert( pPage->pBt );
1163   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1164   assert( nByte>=0 );  /* Minimum cell size is 4 */
1165   assert( pPage->nFree>=nByte );
1166   assert( pPage->nOverflow==0 );
1167   usableSize = pPage->pBt->usableSize;
1168   assert( nByte < usableSize-8 );
1169 
1170   nFrag = data[hdr+7];
1171   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1172   gap = pPage->cellOffset + 2*pPage->nCell;
1173   top = get2byteNotZero(&data[hdr+5]);
1174   if( gap>top ) return SQLITE_CORRUPT_BKPT;
1175   testcase( gap+2==top );
1176   testcase( gap+1==top );
1177   testcase( gap==top );
1178 
1179   if( nFrag>=60 ){
1180     /* Always defragment highly fragmented pages */
1181     rc = defragmentPage(pPage);
1182     if( rc ) return rc;
1183     top = get2byteNotZero(&data[hdr+5]);
1184   }else if( gap+2<=top ){
1185     /* Search the freelist looking for a free slot big enough to satisfy
1186     ** the request. The allocation is made from the first free slot in
1187     ** the list that is large enough to accomadate it.
1188     */
1189     int pc, addr;
1190     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1191       int size;            /* Size of the free slot */
1192       if( pc>usableSize-4 || pc<addr+4 ){
1193         return SQLITE_CORRUPT_BKPT;
1194       }
1195       size = get2byte(&data[pc+2]);
1196       if( size>=nByte ){
1197         int x = size - nByte;
1198         testcase( x==4 );
1199         testcase( x==3 );
1200         if( x<4 ){
1201           /* Remove the slot from the free-list. Update the number of
1202           ** fragmented bytes within the page. */
1203           memcpy(&data[addr], &data[pc], 2);
1204           data[hdr+7] = (u8)(nFrag + x);
1205         }else if( size+pc > usableSize ){
1206           return SQLITE_CORRUPT_BKPT;
1207         }else{
1208           /* The slot remains on the free-list. Reduce its size to account
1209           ** for the portion used by the new allocation. */
1210           put2byte(&data[pc+2], x);
1211         }
1212         *pIdx = pc + x;
1213         return SQLITE_OK;
1214       }
1215     }
1216   }
1217 
1218   /* Check to make sure there is enough space in the gap to satisfy
1219   ** the allocation.  If not, defragment.
1220   */
1221   testcase( gap+2+nByte==top );
1222   if( gap+2+nByte>top ){
1223     rc = defragmentPage(pPage);
1224     if( rc ) return rc;
1225     top = get2byteNotZero(&data[hdr+5]);
1226     assert( gap+nByte<=top );
1227   }
1228 
1229 
1230   /* Allocate memory from the gap in between the cell pointer array
1231   ** and the cell content area.  The btreeInitPage() call has already
1232   ** validated the freelist.  Given that the freelist is valid, there
1233   ** is no way that the allocation can extend off the end of the page.
1234   ** The assert() below verifies the previous sentence.
1235   */
1236   top -= nByte;
1237   put2byte(&data[hdr+5], top);
1238   assert( top+nByte <= (int)pPage->pBt->usableSize );
1239   *pIdx = top;
1240   return SQLITE_OK;
1241 }
1242 
1243 /*
1244 ** Return a section of the pPage->aData to the freelist.
1245 ** The first byte of the new free block is pPage->aDisk[start]
1246 ** and the size of the block is "size" bytes.
1247 **
1248 ** Most of the effort here is involved in coalesing adjacent
1249 ** free blocks into a single big free block.
1250 */
1251 static int freeSpace(MemPage *pPage, int start, int size){
1252   int addr, pbegin, hdr;
1253   int iLast;                        /* Largest possible freeblock offset */
1254   unsigned char *data = pPage->aData;
1255 
1256   assert( pPage->pBt!=0 );
1257   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1258   assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
1259   assert( (start + size) <= (int)pPage->pBt->usableSize );
1260   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1261   assert( size>=0 );   /* Minimum cell size is 4 */
1262 
1263   if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
1264     /* Overwrite deleted information with zeros when the secure_delete
1265     ** option is enabled */
1266     memset(&data[start], 0, size);
1267   }
1268 
1269   /* Add the space back into the linked list of freeblocks.  Note that
1270   ** even though the freeblock list was checked by btreeInitPage(),
1271   ** btreeInitPage() did not detect overlapping cells or
1272   ** freeblocks that overlapped cells.   Nor does it detect when the
1273   ** cell content area exceeds the value in the page header.  If these
1274   ** situations arise, then subsequent insert operations might corrupt
1275   ** the freelist.  So we do need to check for corruption while scanning
1276   ** the freelist.
1277   */
1278   hdr = pPage->hdrOffset;
1279   addr = hdr + 1;
1280   iLast = pPage->pBt->usableSize - 4;
1281   assert( start<=iLast );
1282   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1283     if( pbegin<addr+4 ){
1284       return SQLITE_CORRUPT_BKPT;
1285     }
1286     addr = pbegin;
1287   }
1288   if( pbegin>iLast ){
1289     return SQLITE_CORRUPT_BKPT;
1290   }
1291   assert( pbegin>addr || pbegin==0 );
1292   put2byte(&data[addr], start);
1293   put2byte(&data[start], pbegin);
1294   put2byte(&data[start+2], size);
1295   pPage->nFree = pPage->nFree + (u16)size;
1296 
1297   /* Coalesce adjacent free blocks */
1298   addr = hdr + 1;
1299   while( (pbegin = get2byte(&data[addr]))>0 ){
1300     int pnext, psize, x;
1301     assert( pbegin>addr );
1302     assert( pbegin <= (int)pPage->pBt->usableSize-4 );
1303     pnext = get2byte(&data[pbegin]);
1304     psize = get2byte(&data[pbegin+2]);
1305     if( pbegin + psize + 3 >= pnext && pnext>0 ){
1306       int frag = pnext - (pbegin+psize);
1307       if( (frag<0) || (frag>(int)data[hdr+7]) ){
1308         return SQLITE_CORRUPT_BKPT;
1309       }
1310       data[hdr+7] -= (u8)frag;
1311       x = get2byte(&data[pnext]);
1312       put2byte(&data[pbegin], x);
1313       x = pnext + get2byte(&data[pnext+2]) - pbegin;
1314       put2byte(&data[pbegin+2], x);
1315     }else{
1316       addr = pbegin;
1317     }
1318   }
1319 
1320   /* If the cell content area begins with a freeblock, remove it. */
1321   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1322     int top;
1323     pbegin = get2byte(&data[hdr+1]);
1324     memcpy(&data[hdr+1], &data[pbegin], 2);
1325     top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1326     put2byte(&data[hdr+5], top);
1327   }
1328   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1329   return SQLITE_OK;
1330 }
1331 
1332 /*
1333 ** Decode the flags byte (the first byte of the header) for a page
1334 ** and initialize fields of the MemPage structure accordingly.
1335 **
1336 ** Only the following combinations are supported.  Anything different
1337 ** indicates a corrupt database files:
1338 **
1339 **         PTF_ZERODATA
1340 **         PTF_ZERODATA | PTF_LEAF
1341 **         PTF_LEAFDATA | PTF_INTKEY
1342 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1343 */
1344 static int decodeFlags(MemPage *pPage, int flagByte){
1345   BtShared *pBt;     /* A copy of pPage->pBt */
1346 
1347   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1348   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1349   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1350   flagByte &= ~PTF_LEAF;
1351   pPage->childPtrSize = 4-4*pPage->leaf;
1352   pBt = pPage->pBt;
1353   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1354     pPage->intKey = 1;
1355     pPage->hasData = pPage->leaf;
1356     pPage->maxLocal = pBt->maxLeaf;
1357     pPage->minLocal = pBt->minLeaf;
1358   }else if( flagByte==PTF_ZERODATA ){
1359     pPage->intKey = 0;
1360     pPage->hasData = 0;
1361     pPage->maxLocal = pBt->maxLocal;
1362     pPage->minLocal = pBt->minLocal;
1363   }else{
1364     return SQLITE_CORRUPT_BKPT;
1365   }
1366   pPage->max1bytePayload = pBt->max1bytePayload;
1367   return SQLITE_OK;
1368 }
1369 
1370 /*
1371 ** Initialize the auxiliary information for a disk block.
1372 **
1373 ** Return SQLITE_OK on success.  If we see that the page does
1374 ** not contain a well-formed database page, then return
1375 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1376 ** guarantee that the page is well-formed.  It only shows that
1377 ** we failed to detect any corruption.
1378 */
1379 static int btreeInitPage(MemPage *pPage){
1380 
1381   assert( pPage->pBt!=0 );
1382   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1383   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1384   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1385   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1386 
1387   if( !pPage->isInit ){
1388     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1389     u8 hdr;            /* Offset to beginning of page header */
1390     u8 *data;          /* Equal to pPage->aData */
1391     BtShared *pBt;        /* The main btree structure */
1392     int usableSize;    /* Amount of usable space on each page */
1393     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1394     int nFree;         /* Number of unused bytes on the page */
1395     int top;           /* First byte of the cell content area */
1396     int iCellFirst;    /* First allowable cell or freeblock offset */
1397     int iCellLast;     /* Last possible cell or freeblock offset */
1398 
1399     pBt = pPage->pBt;
1400 
1401     hdr = pPage->hdrOffset;
1402     data = pPage->aData;
1403     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1404     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1405     pPage->maskPage = (u16)(pBt->pageSize - 1);
1406     pPage->nOverflow = 0;
1407     usableSize = pBt->usableSize;
1408     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1409     pPage->aDataEnd = &data[usableSize];
1410     pPage->aCellIdx = &data[cellOffset];
1411     top = get2byteNotZero(&data[hdr+5]);
1412     pPage->nCell = get2byte(&data[hdr+3]);
1413     if( pPage->nCell>MX_CELL(pBt) ){
1414       /* To many cells for a single page.  The page must be corrupt */
1415       return SQLITE_CORRUPT_BKPT;
1416     }
1417     testcase( pPage->nCell==MX_CELL(pBt) );
1418 
1419     /* A malformed database page might cause us to read past the end
1420     ** of page when parsing a cell.
1421     **
1422     ** The following block of code checks early to see if a cell extends
1423     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1424     ** returned if it does.
1425     */
1426     iCellFirst = cellOffset + 2*pPage->nCell;
1427     iCellLast = usableSize - 4;
1428 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1429     {
1430       int i;            /* Index into the cell pointer array */
1431       int sz;           /* Size of a cell */
1432 
1433       if( !pPage->leaf ) iCellLast--;
1434       for(i=0; i<pPage->nCell; i++){
1435         pc = get2byte(&data[cellOffset+i*2]);
1436         testcase( pc==iCellFirst );
1437         testcase( pc==iCellLast );
1438         if( pc<iCellFirst || pc>iCellLast ){
1439           return SQLITE_CORRUPT_BKPT;
1440         }
1441         sz = cellSizePtr(pPage, &data[pc]);
1442         testcase( pc+sz==usableSize );
1443         if( pc+sz>usableSize ){
1444           return SQLITE_CORRUPT_BKPT;
1445         }
1446       }
1447       if( !pPage->leaf ) iCellLast++;
1448     }
1449 #endif
1450 
1451     /* Compute the total free space on the page */
1452     pc = get2byte(&data[hdr+1]);
1453     nFree = data[hdr+7] + top;
1454     while( pc>0 ){
1455       u16 next, size;
1456       if( pc<iCellFirst || pc>iCellLast ){
1457         /* Start of free block is off the page */
1458         return SQLITE_CORRUPT_BKPT;
1459       }
1460       next = get2byte(&data[pc]);
1461       size = get2byte(&data[pc+2]);
1462       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1463         /* Free blocks must be in ascending order. And the last byte of
1464 	** the free-block must lie on the database page.  */
1465         return SQLITE_CORRUPT_BKPT;
1466       }
1467       nFree = nFree + size;
1468       pc = next;
1469     }
1470 
1471     /* At this point, nFree contains the sum of the offset to the start
1472     ** of the cell-content area plus the number of free bytes within
1473     ** the cell-content area. If this is greater than the usable-size
1474     ** of the page, then the page must be corrupted. This check also
1475     ** serves to verify that the offset to the start of the cell-content
1476     ** area, according to the page header, lies within the page.
1477     */
1478     if( nFree>usableSize ){
1479       return SQLITE_CORRUPT_BKPT;
1480     }
1481     pPage->nFree = (u16)(nFree - iCellFirst);
1482     pPage->isInit = 1;
1483   }
1484   return SQLITE_OK;
1485 }
1486 
1487 /*
1488 ** Set up a raw page so that it looks like a database page holding
1489 ** no entries.
1490 */
1491 static void zeroPage(MemPage *pPage, int flags){
1492   unsigned char *data = pPage->aData;
1493   BtShared *pBt = pPage->pBt;
1494   u8 hdr = pPage->hdrOffset;
1495   u16 first;
1496 
1497   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1498   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1499   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1500   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1501   assert( sqlite3_mutex_held(pBt->mutex) );
1502   if( pBt->btsFlags & BTS_SECURE_DELETE ){
1503     memset(&data[hdr], 0, pBt->usableSize - hdr);
1504   }
1505   data[hdr] = (char)flags;
1506   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1507   memset(&data[hdr+1], 0, 4);
1508   data[hdr+7] = 0;
1509   put2byte(&data[hdr+5], pBt->usableSize);
1510   pPage->nFree = (u16)(pBt->usableSize - first);
1511   decodeFlags(pPage, flags);
1512   pPage->hdrOffset = hdr;
1513   pPage->cellOffset = first;
1514   pPage->aDataEnd = &data[pBt->usableSize];
1515   pPage->aCellIdx = &data[first];
1516   pPage->nOverflow = 0;
1517   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1518   pPage->maskPage = (u16)(pBt->pageSize - 1);
1519   pPage->nCell = 0;
1520   pPage->isInit = 1;
1521 }
1522 
1523 
1524 /*
1525 ** Convert a DbPage obtained from the pager into a MemPage used by
1526 ** the btree layer.
1527 */
1528 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1529   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1530   pPage->aData = sqlite3PagerGetData(pDbPage);
1531   pPage->pDbPage = pDbPage;
1532   pPage->pBt = pBt;
1533   pPage->pgno = pgno;
1534   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1535   return pPage;
1536 }
1537 
1538 /*
1539 ** Get a page from the pager.  Initialize the MemPage.pBt and
1540 ** MemPage.aData elements if needed.
1541 **
1542 ** If the noContent flag is set, it means that we do not care about
1543 ** the content of the page at this time.  So do not go to the disk
1544 ** to fetch the content.  Just fill in the content with zeros for now.
1545 ** If in the future we call sqlite3PagerWrite() on this page, that
1546 ** means we have started to be concerned about content and the disk
1547 ** read should occur at that point.
1548 */
1549 static int btreeGetPage(
1550   BtShared *pBt,       /* The btree */
1551   Pgno pgno,           /* Number of the page to fetch */
1552   MemPage **ppPage,    /* Return the page in this parameter */
1553   int noContent        /* Do not load page content if true */
1554 ){
1555   int rc;
1556   DbPage *pDbPage;
1557 
1558   assert( sqlite3_mutex_held(pBt->mutex) );
1559   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1560   if( rc ) return rc;
1561   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1562   return SQLITE_OK;
1563 }
1564 
1565 /*
1566 ** Retrieve a page from the pager cache. If the requested page is not
1567 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1568 ** MemPage.aData elements if needed.
1569 */
1570 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1571   DbPage *pDbPage;
1572   assert( sqlite3_mutex_held(pBt->mutex) );
1573   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1574   if( pDbPage ){
1575     return btreePageFromDbPage(pDbPage, pgno, pBt);
1576   }
1577   return 0;
1578 }
1579 
1580 /*
1581 ** Return the size of the database file in pages. If there is any kind of
1582 ** error, return ((unsigned int)-1).
1583 */
1584 static Pgno btreePagecount(BtShared *pBt){
1585   return pBt->nPage;
1586 }
1587 u32 sqlite3BtreeLastPage(Btree *p){
1588   assert( sqlite3BtreeHoldsMutex(p) );
1589   assert( ((p->pBt->nPage)&0x8000000)==0 );
1590   return (int)btreePagecount(p->pBt);
1591 }
1592 
1593 /*
1594 ** Get a page from the pager and initialize it.  This routine is just a
1595 ** convenience wrapper around separate calls to btreeGetPage() and
1596 ** btreeInitPage().
1597 **
1598 ** If an error occurs, then the value *ppPage is set to is undefined. It
1599 ** may remain unchanged, or it may be set to an invalid value.
1600 */
1601 static int getAndInitPage(
1602   BtShared *pBt,          /* The database file */
1603   Pgno pgno,           /* Number of the page to get */
1604   MemPage **ppPage     /* Write the page pointer here */
1605 ){
1606   int rc;
1607   assert( sqlite3_mutex_held(pBt->mutex) );
1608 
1609   if( pgno>btreePagecount(pBt) ){
1610     rc = SQLITE_CORRUPT_BKPT;
1611   }else{
1612     rc = btreeGetPage(pBt, pgno, ppPage, 0);
1613     if( rc==SQLITE_OK ){
1614       rc = btreeInitPage(*ppPage);
1615       if( rc!=SQLITE_OK ){
1616         releasePage(*ppPage);
1617       }
1618     }
1619   }
1620 
1621   testcase( pgno==0 );
1622   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1623   return rc;
1624 }
1625 
1626 /*
1627 ** Release a MemPage.  This should be called once for each prior
1628 ** call to btreeGetPage.
1629 */
1630 static void releasePage(MemPage *pPage){
1631   if( pPage ){
1632     assert( pPage->aData );
1633     assert( pPage->pBt );
1634     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1635     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1636     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1637     sqlite3PagerUnref(pPage->pDbPage);
1638   }
1639 }
1640 
1641 /*
1642 ** During a rollback, when the pager reloads information into the cache
1643 ** so that the cache is restored to its original state at the start of
1644 ** the transaction, for each page restored this routine is called.
1645 **
1646 ** This routine needs to reset the extra data section at the end of the
1647 ** page to agree with the restored data.
1648 */
1649 static void pageReinit(DbPage *pData){
1650   MemPage *pPage;
1651   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1652   assert( sqlite3PagerPageRefcount(pData)>0 );
1653   if( pPage->isInit ){
1654     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1655     pPage->isInit = 0;
1656     if( sqlite3PagerPageRefcount(pData)>1 ){
1657       /* pPage might not be a btree page;  it might be an overflow page
1658       ** or ptrmap page or a free page.  In those cases, the following
1659       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1660       ** But no harm is done by this.  And it is very important that
1661       ** btreeInitPage() be called on every btree page so we make
1662       ** the call for every page that comes in for re-initing. */
1663       btreeInitPage(pPage);
1664     }
1665   }
1666 }
1667 
1668 /*
1669 ** Invoke the busy handler for a btree.
1670 */
1671 static int btreeInvokeBusyHandler(void *pArg){
1672   BtShared *pBt = (BtShared*)pArg;
1673   assert( pBt->db );
1674   assert( sqlite3_mutex_held(pBt->db->mutex) );
1675   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1676 }
1677 
1678 /*
1679 ** Open a database file.
1680 **
1681 ** zFilename is the name of the database file.  If zFilename is NULL
1682 ** then an ephemeral database is created.  The ephemeral database might
1683 ** be exclusively in memory, or it might use a disk-based memory cache.
1684 ** Either way, the ephemeral database will be automatically deleted
1685 ** when sqlite3BtreeClose() is called.
1686 **
1687 ** If zFilename is ":memory:" then an in-memory database is created
1688 ** that is automatically destroyed when it is closed.
1689 **
1690 ** The "flags" parameter is a bitmask that might contain bits like
1691 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
1692 **
1693 ** If the database is already opened in the same database connection
1694 ** and we are in shared cache mode, then the open will fail with an
1695 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1696 ** objects in the same database connection since doing so will lead
1697 ** to problems with locking.
1698 */
1699 int sqlite3BtreeOpen(
1700   sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
1701   const char *zFilename,  /* Name of the file containing the BTree database */
1702   sqlite3 *db,            /* Associated database handle */
1703   Btree **ppBtree,        /* Pointer to new Btree object written here */
1704   int flags,              /* Options */
1705   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1706 ){
1707   BtShared *pBt = 0;             /* Shared part of btree structure */
1708   Btree *p;                      /* Handle to return */
1709   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1710   int rc = SQLITE_OK;            /* Result code from this function */
1711   u8 nReserve;                   /* Byte of unused space on each page */
1712   unsigned char zDbHeader[100];  /* Database header content */
1713 
1714   /* True if opening an ephemeral, temporary database */
1715   const int isTempDb = zFilename==0 || zFilename[0]==0;
1716 
1717   /* Set the variable isMemdb to true for an in-memory database, or
1718   ** false for a file-based database.
1719   */
1720 #ifdef SQLITE_OMIT_MEMORYDB
1721   const int isMemdb = 0;
1722 #else
1723   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1724                        || (isTempDb && sqlite3TempInMemory(db))
1725                        || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
1726 #endif
1727 
1728   assert( db!=0 );
1729   assert( pVfs!=0 );
1730   assert( sqlite3_mutex_held(db->mutex) );
1731   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
1732 
1733   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1734   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1735 
1736   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1737   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1738 
1739   if( isMemdb ){
1740     flags |= BTREE_MEMORY;
1741   }
1742   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1743     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1744   }
1745   p = sqlite3MallocZero(sizeof(Btree));
1746   if( !p ){
1747     return SQLITE_NOMEM;
1748   }
1749   p->inTrans = TRANS_NONE;
1750   p->db = db;
1751 #ifndef SQLITE_OMIT_SHARED_CACHE
1752   p->lock.pBtree = p;
1753   p->lock.iTable = 1;
1754 #endif
1755 
1756 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1757   /*
1758   ** If this Btree is a candidate for shared cache, try to find an
1759   ** existing BtShared object that we can share with
1760   */
1761   if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
1762     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1763       int nFullPathname = pVfs->mxPathname+1;
1764       char *zFullPathname = sqlite3Malloc(nFullPathname);
1765       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1766       p->sharable = 1;
1767       if( !zFullPathname ){
1768         sqlite3_free(p);
1769         return SQLITE_NOMEM;
1770       }
1771       if( isMemdb ){
1772         memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);
1773       }else{
1774         rc = sqlite3OsFullPathname(pVfs, zFilename,
1775                                    nFullPathname, zFullPathname);
1776         if( rc ){
1777           sqlite3_free(zFullPathname);
1778           sqlite3_free(p);
1779           return rc;
1780         }
1781       }
1782 #if SQLITE_THREADSAFE
1783       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1784       sqlite3_mutex_enter(mutexOpen);
1785       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1786       sqlite3_mutex_enter(mutexShared);
1787 #endif
1788       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1789         assert( pBt->nRef>0 );
1790         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
1791                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1792           int iDb;
1793           for(iDb=db->nDb-1; iDb>=0; iDb--){
1794             Btree *pExisting = db->aDb[iDb].pBt;
1795             if( pExisting && pExisting->pBt==pBt ){
1796               sqlite3_mutex_leave(mutexShared);
1797               sqlite3_mutex_leave(mutexOpen);
1798               sqlite3_free(zFullPathname);
1799               sqlite3_free(p);
1800               return SQLITE_CONSTRAINT;
1801             }
1802           }
1803           p->pBt = pBt;
1804           pBt->nRef++;
1805           break;
1806         }
1807       }
1808       sqlite3_mutex_leave(mutexShared);
1809       sqlite3_free(zFullPathname);
1810     }
1811 #ifdef SQLITE_DEBUG
1812     else{
1813       /* In debug mode, we mark all persistent databases as sharable
1814       ** even when they are not.  This exercises the locking code and
1815       ** gives more opportunity for asserts(sqlite3_mutex_held())
1816       ** statements to find locking problems.
1817       */
1818       p->sharable = 1;
1819     }
1820 #endif
1821   }
1822 #endif
1823   if( pBt==0 ){
1824     /*
1825     ** The following asserts make sure that structures used by the btree are
1826     ** the right size.  This is to guard against size changes that result
1827     ** when compiling on a different architecture.
1828     */
1829     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1830     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1831     assert( sizeof(u32)==4 );
1832     assert( sizeof(u16)==2 );
1833     assert( sizeof(Pgno)==4 );
1834 
1835     pBt = sqlite3MallocZero( sizeof(*pBt) );
1836     if( pBt==0 ){
1837       rc = SQLITE_NOMEM;
1838       goto btree_open_out;
1839     }
1840     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1841                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
1842     if( rc==SQLITE_OK ){
1843       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1844     }
1845     if( rc!=SQLITE_OK ){
1846       goto btree_open_out;
1847     }
1848     pBt->openFlags = (u8)flags;
1849     pBt->db = db;
1850     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1851     p->pBt = pBt;
1852 
1853     pBt->pCursor = 0;
1854     pBt->pPage1 = 0;
1855     if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
1856 #ifdef SQLITE_SECURE_DELETE
1857     pBt->btsFlags |= BTS_SECURE_DELETE;
1858 #endif
1859     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1860     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1861          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1862       pBt->pageSize = 0;
1863 #ifndef SQLITE_OMIT_AUTOVACUUM
1864       /* If the magic name ":memory:" will create an in-memory database, then
1865       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1866       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1867       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1868       ** regular file-name. In this case the auto-vacuum applies as per normal.
1869       */
1870       if( zFilename && !isMemdb ){
1871         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1872         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1873       }
1874 #endif
1875       nReserve = 0;
1876     }else{
1877       nReserve = zDbHeader[20];
1878       pBt->btsFlags |= BTS_PAGESIZE_FIXED;
1879 #ifndef SQLITE_OMIT_AUTOVACUUM
1880       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1881       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1882 #endif
1883     }
1884     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1885     if( rc ) goto btree_open_out;
1886     pBt->usableSize = pBt->pageSize - nReserve;
1887     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1888 
1889 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1890     /* Add the new BtShared object to the linked list sharable BtShareds.
1891     */
1892     if( p->sharable ){
1893       MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
1894       pBt->nRef = 1;
1895       MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
1896       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1897         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1898         if( pBt->mutex==0 ){
1899           rc = SQLITE_NOMEM;
1900           db->mallocFailed = 0;
1901           goto btree_open_out;
1902         }
1903       }
1904       sqlite3_mutex_enter(mutexShared);
1905       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1906       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1907       sqlite3_mutex_leave(mutexShared);
1908     }
1909 #endif
1910   }
1911 
1912 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1913   /* If the new Btree uses a sharable pBtShared, then link the new
1914   ** Btree into the list of all sharable Btrees for the same connection.
1915   ** The list is kept in ascending order by pBt address.
1916   */
1917   if( p->sharable ){
1918     int i;
1919     Btree *pSib;
1920     for(i=0; i<db->nDb; i++){
1921       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1922         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1923         if( p->pBt<pSib->pBt ){
1924           p->pNext = pSib;
1925           p->pPrev = 0;
1926           pSib->pPrev = p;
1927         }else{
1928           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1929             pSib = pSib->pNext;
1930           }
1931           p->pNext = pSib->pNext;
1932           p->pPrev = pSib;
1933           if( p->pNext ){
1934             p->pNext->pPrev = p;
1935           }
1936           pSib->pNext = p;
1937         }
1938         break;
1939       }
1940     }
1941   }
1942 #endif
1943   *ppBtree = p;
1944 
1945 btree_open_out:
1946   if( rc!=SQLITE_OK ){
1947     if( pBt && pBt->pPager ){
1948       sqlite3PagerClose(pBt->pPager);
1949     }
1950     sqlite3_free(pBt);
1951     sqlite3_free(p);
1952     *ppBtree = 0;
1953   }else{
1954     /* If the B-Tree was successfully opened, set the pager-cache size to the
1955     ** default value. Except, when opening on an existing shared pager-cache,
1956     ** do not change the pager-cache size.
1957     */
1958     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
1959       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
1960     }
1961   }
1962   if( mutexOpen ){
1963     assert( sqlite3_mutex_held(mutexOpen) );
1964     sqlite3_mutex_leave(mutexOpen);
1965   }
1966   return rc;
1967 }
1968 
1969 /*
1970 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1971 ** remove the BtShared structure from the sharing list.  Return
1972 ** true if the BtShared.nRef counter reaches zero and return
1973 ** false if it is still positive.
1974 */
1975 static int removeFromSharingList(BtShared *pBt){
1976 #ifndef SQLITE_OMIT_SHARED_CACHE
1977   MUTEX_LOGIC( sqlite3_mutex *pMaster; )
1978   BtShared *pList;
1979   int removed = 0;
1980 
1981   assert( sqlite3_mutex_notheld(pBt->mutex) );
1982   MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
1983   sqlite3_mutex_enter(pMaster);
1984   pBt->nRef--;
1985   if( pBt->nRef<=0 ){
1986     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1987       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1988     }else{
1989       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1990       while( ALWAYS(pList) && pList->pNext!=pBt ){
1991         pList=pList->pNext;
1992       }
1993       if( ALWAYS(pList) ){
1994         pList->pNext = pBt->pNext;
1995       }
1996     }
1997     if( SQLITE_THREADSAFE ){
1998       sqlite3_mutex_free(pBt->mutex);
1999     }
2000     removed = 1;
2001   }
2002   sqlite3_mutex_leave(pMaster);
2003   return removed;
2004 #else
2005   return 1;
2006 #endif
2007 }
2008 
2009 /*
2010 ** Make sure pBt->pTmpSpace points to an allocation of
2011 ** MX_CELL_SIZE(pBt) bytes.
2012 */
2013 static void allocateTempSpace(BtShared *pBt){
2014   if( !pBt->pTmpSpace ){
2015     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2016   }
2017 }
2018 
2019 /*
2020 ** Free the pBt->pTmpSpace allocation
2021 */
2022 static void freeTempSpace(BtShared *pBt){
2023   sqlite3PageFree( pBt->pTmpSpace);
2024   pBt->pTmpSpace = 0;
2025 }
2026 
2027 /*
2028 ** Close an open database and invalidate all cursors.
2029 */
2030 int sqlite3BtreeClose(Btree *p){
2031   BtShared *pBt = p->pBt;
2032   BtCursor *pCur;
2033 
2034   /* Close all cursors opened via this handle.  */
2035   assert( sqlite3_mutex_held(p->db->mutex) );
2036   sqlite3BtreeEnter(p);
2037   pCur = pBt->pCursor;
2038   while( pCur ){
2039     BtCursor *pTmp = pCur;
2040     pCur = pCur->pNext;
2041     if( pTmp->pBtree==p ){
2042       sqlite3BtreeCloseCursor(pTmp);
2043     }
2044   }
2045 
2046   /* Rollback any active transaction and free the handle structure.
2047   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2048   ** this handle.
2049   */
2050   sqlite3BtreeRollback(p, SQLITE_OK);
2051   sqlite3BtreeLeave(p);
2052 
2053   /* If there are still other outstanding references to the shared-btree
2054   ** structure, return now. The remainder of this procedure cleans
2055   ** up the shared-btree.
2056   */
2057   assert( p->wantToLock==0 && p->locked==0 );
2058   if( !p->sharable || removeFromSharingList(pBt) ){
2059     /* The pBt is no longer on the sharing list, so we can access
2060     ** it without having to hold the mutex.
2061     **
2062     ** Clean out and delete the BtShared object.
2063     */
2064     assert( !pBt->pCursor );
2065     sqlite3PagerClose(pBt->pPager);
2066     if( pBt->xFreeSchema && pBt->pSchema ){
2067       pBt->xFreeSchema(pBt->pSchema);
2068     }
2069     sqlite3DbFree(0, pBt->pSchema);
2070     freeTempSpace(pBt);
2071     sqlite3_free(pBt);
2072   }
2073 
2074 #ifndef SQLITE_OMIT_SHARED_CACHE
2075   assert( p->wantToLock==0 );
2076   assert( p->locked==0 );
2077   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2078   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2079 #endif
2080 
2081   sqlite3_free(p);
2082   return SQLITE_OK;
2083 }
2084 
2085 /*
2086 ** Change the limit on the number of pages allowed in the cache.
2087 **
2088 ** The maximum number of cache pages is set to the absolute
2089 ** value of mxPage.  If mxPage is negative, the pager will
2090 ** operate asynchronously - it will not stop to do fsync()s
2091 ** to insure data is written to the disk surface before
2092 ** continuing.  Transactions still work if synchronous is off,
2093 ** and the database cannot be corrupted if this program
2094 ** crashes.  But if the operating system crashes or there is
2095 ** an abrupt power failure when synchronous is off, the database
2096 ** could be left in an inconsistent and unrecoverable state.
2097 ** Synchronous is on by default so database corruption is not
2098 ** normally a worry.
2099 */
2100 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2101   BtShared *pBt = p->pBt;
2102   assert( sqlite3_mutex_held(p->db->mutex) );
2103   sqlite3BtreeEnter(p);
2104   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2105   sqlite3BtreeLeave(p);
2106   return SQLITE_OK;
2107 }
2108 
2109 /*
2110 ** Change the way data is synced to disk in order to increase or decrease
2111 ** how well the database resists damage due to OS crashes and power
2112 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
** there is a high probability of damage).  Level 2 is the default.  There
2114 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2115 ** probability of damage to near zero but with a write performance reduction.
2116 */
2117 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2118 int sqlite3BtreeSetSafetyLevel(
2119   Btree *p,              /* The btree to set the safety level on */
2120   int level,             /* PRAGMA synchronous.  1=OFF, 2=NORMAL, 3=FULL */
2121   int fullSync,          /* PRAGMA fullfsync. */
2122   int ckptFullSync       /* PRAGMA checkpoint_fullfync */
2123 ){
2124   BtShared *pBt = p->pBt;
2125   assert( sqlite3_mutex_held(p->db->mutex) );
2126   assert( level>=1 && level<=3 );
2127   sqlite3BtreeEnter(p);
2128   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync);
2129   sqlite3BtreeLeave(p);
2130   return SQLITE_OK;
2131 }
2132 #endif
2133 
2134 /*
2135 ** Return TRUE if the given btree is set to safety level 1.  In other
2136 ** words, return TRUE if no sync() occurs on the disk files.
2137 */
2138 int sqlite3BtreeSyncDisabled(Btree *p){
2139   BtShared *pBt = p->pBt;
2140   int rc;
2141   assert( sqlite3_mutex_held(p->db->mutex) );
2142   sqlite3BtreeEnter(p);
2143   assert( pBt && pBt->pPager );
2144   rc = sqlite3PagerNosync(pBt->pPager);
2145   sqlite3BtreeLeave(p);
2146   return rc;
2147 }
2148 
2149 /*
2150 ** Change the default pages size and the number of reserved bytes per page.
2151 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2152 ** without changing anything.
2153 **
2154 ** The page size must be a power of 2 between 512 and 65536.  If the page
2155 ** size supplied does not meet this constraint then the page size is not
2156 ** changed.
2157 **
2158 ** Page sizes are constrained to be a power of two so that the region
2159 ** of the database file used for locking (beginning at PENDING_BYTE,
2160 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2161 ** at the beginning of a page.
2162 **
2163 ** If parameter nReserve is less than zero, then the number of reserved
2164 ** bytes per page is left unchanged.
2165 **
2166 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2167 ** and autovacuum mode can no longer be changed.
2168 */
2169 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2170   int rc = SQLITE_OK;
2171   BtShared *pBt = p->pBt;
2172   assert( nReserve>=-1 && nReserve<=255 );
2173   sqlite3BtreeEnter(p);
2174   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2175     sqlite3BtreeLeave(p);
2176     return SQLITE_READONLY;
2177   }
2178   if( nReserve<0 ){
2179     nReserve = pBt->pageSize - pBt->usableSize;
2180   }
2181   assert( nReserve>=0 && nReserve<=255 );
2182   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2183         ((pageSize-1)&pageSize)==0 ){
2184     assert( (pageSize & 7)==0 );
2185     assert( !pBt->pPage1 && !pBt->pCursor );
2186     pBt->pageSize = (u32)pageSize;
2187     freeTempSpace(pBt);
2188   }
2189   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2190   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2191   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2192   sqlite3BtreeLeave(p);
2193   return rc;
2194 }
2195 
2196 /*
2197 ** Return the currently defined page size
2198 */
2199 int sqlite3BtreeGetPageSize(Btree *p){
2200   return p->pBt->pageSize;
2201 }
2202 
2203 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2204 /*
2205 ** Return the number of bytes of space at the end of every page that
** are intentionally left unused.  This is the "reserved" space that is
2207 ** sometimes used by extensions.
2208 */
2209 int sqlite3BtreeGetReserve(Btree *p){
2210   int n;
2211   sqlite3BtreeEnter(p);
2212   n = p->pBt->pageSize - p->pBt->usableSize;
2213   sqlite3BtreeLeave(p);
2214   return n;
2215 }
2216 
2217 /*
2218 ** Set the maximum page count for a database if mxPage is positive.
2219 ** No changes are made if mxPage is 0 or negative.
2220 ** Regardless of the value of mxPage, return the maximum page count.
2221 */
2222 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2223   int n;
2224   sqlite3BtreeEnter(p);
2225   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2226   sqlite3BtreeLeave(p);
2227   return n;
2228 }
2229 
2230 /*
2231 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
2232 ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
2233 ** setting after the change.
2234 */
2235 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2236   int b;
2237   if( p==0 ) return 0;
2238   sqlite3BtreeEnter(p);
2239   if( newFlag>=0 ){
2240     p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
2241     if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
2242   }
2243   b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
2244   sqlite3BtreeLeave(p);
2245   return b;
2246 }
2247 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2248 
2249 /*
2250 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2251 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2252 ** is disabled. The default value for the auto-vacuum property is
2253 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2254 */
2255 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2256 #ifdef SQLITE_OMIT_AUTOVACUUM
2257   return SQLITE_READONLY;
2258 #else
2259   BtShared *pBt = p->pBt;
2260   int rc = SQLITE_OK;
2261   u8 av = (u8)autoVacuum;
2262 
2263   sqlite3BtreeEnter(p);
2264   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
2265     rc = SQLITE_READONLY;
2266   }else{
2267     pBt->autoVacuum = av ?1:0;
2268     pBt->incrVacuum = av==2 ?1:0;
2269   }
2270   sqlite3BtreeLeave(p);
2271   return rc;
2272 #endif
2273 }
2274 
2275 /*
2276 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2277 ** enabled 1 is returned. Otherwise 0.
2278 */
2279 int sqlite3BtreeGetAutoVacuum(Btree *p){
2280 #ifdef SQLITE_OMIT_AUTOVACUUM
2281   return BTREE_AUTOVACUUM_NONE;
2282 #else
2283   int rc;
2284   sqlite3BtreeEnter(p);
2285   rc = (
2286     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2287     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2288     BTREE_AUTOVACUUM_INCR
2289   );
2290   sqlite3BtreeLeave(p);
2291   return rc;
2292 #endif
2293 }
2294 
2295 
2296 /*
2297 ** Get a reference to pPage1 of the database file.  This will
2298 ** also acquire a readlock on that file.
2299 **
2300 ** SQLITE_OK is returned on success.  If the file is not a
2301 ** well-formed database file, then SQLITE_CORRUPT is returned.
2302 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2303 ** is returned if we run out of memory.
2304 */
2305 static int lockBtree(BtShared *pBt){
2306   int rc;              /* Result code from subfunctions */
2307   MemPage *pPage1;     /* Page 1 of the database file */
2308   int nPage;           /* Number of pages in the database */
2309   int nPageFile = 0;   /* Number of pages in the database file */
2310   int nPageHeader;     /* Number of pages in the database according to hdr */
2311 
2312   assert( sqlite3_mutex_held(pBt->mutex) );
2313   assert( pBt->pPage1==0 );
2314   rc = sqlite3PagerSharedLock(pBt->pPager);
2315   if( rc!=SQLITE_OK ) return rc;
2316   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2317   if( rc!=SQLITE_OK ) return rc;
2318 
2319   /* Do some checking to help insure the file we opened really is
2320   ** a valid database file.
2321   */
2322   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2323   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2324   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2325     nPage = nPageFile;
2326   }
2327   if( nPage>0 ){
2328     u32 pageSize;
2329     u32 usableSize;
2330     u8 *page1 = pPage1->aData;
2331     rc = SQLITE_NOTADB;
2332     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2333       goto page1_init_failed;
2334     }
2335 
2336 #ifdef SQLITE_OMIT_WAL
2337     if( page1[18]>1 ){
2338       pBt->btsFlags |= BTS_READ_ONLY;
2339     }
2340     if( page1[19]>1 ){
2341       goto page1_init_failed;
2342     }
2343 #else
2344     if( page1[18]>2 ){
2345       pBt->btsFlags |= BTS_READ_ONLY;
2346     }
2347     if( page1[19]>2 ){
2348       goto page1_init_failed;
2349     }
2350 
2351     /* If the write version is set to 2, this database should be accessed
2352     ** in WAL mode. If the log is not already open, open it now. Then
2353     ** return SQLITE_OK and return without populating BtShared.pPage1.
2354     ** The caller detects this and calls this function again. This is
2355     ** required as the version of page 1 currently in the page1 buffer
2356     ** may not be the latest version - there may be a newer one in the log
2357     ** file.
2358     */
2359     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
2360       int isOpen = 0;
2361       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2362       if( rc!=SQLITE_OK ){
2363         goto page1_init_failed;
2364       }else if( isOpen==0 ){
2365         releasePage(pPage1);
2366         return SQLITE_OK;
2367       }
2368       rc = SQLITE_NOTADB;
2369     }
2370 #endif
2371 
2372     /* The maximum embedded fraction must be exactly 25%.  And the minimum
2373     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2374     ** The original design allowed these amounts to vary, but as of
2375     ** version 3.6.0, we require them to be fixed.
2376     */
2377     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2378       goto page1_init_failed;
2379     }
2380     pageSize = (page1[16]<<8) | (page1[17]<<16);
2381     if( ((pageSize-1)&pageSize)!=0
2382      || pageSize>SQLITE_MAX_PAGE_SIZE
2383      || pageSize<=256
2384     ){
2385       goto page1_init_failed;
2386     }
2387     assert( (pageSize & 7)==0 );
2388     usableSize = pageSize - page1[20];
2389     if( (u32)pageSize!=pBt->pageSize ){
2390       /* After reading the first page of the database assuming a page size
2391       ** of BtShared.pageSize, we have discovered that the page-size is
2392       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2393       ** zero and return SQLITE_OK. The caller will call this function
2394       ** again with the correct page-size.
2395       */
2396       releasePage(pPage1);
2397       pBt->usableSize = usableSize;
2398       pBt->pageSize = pageSize;
2399       freeTempSpace(pBt);
2400       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2401                                    pageSize-usableSize);
2402       return rc;
2403     }
2404     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2405       rc = SQLITE_CORRUPT_BKPT;
2406       goto page1_init_failed;
2407     }
2408     if( usableSize<480 ){
2409       goto page1_init_failed;
2410     }
2411     pBt->pageSize = pageSize;
2412     pBt->usableSize = usableSize;
2413 #ifndef SQLITE_OMIT_AUTOVACUUM
2414     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2415     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2416 #endif
2417   }
2418 
2419   /* maxLocal is the maximum amount of payload to store locally for
2420   ** a cell.  Make sure it is small enough so that at least minFanout
2421   ** cells can will fit on one page.  We assume a 10-byte page header.
2422   ** Besides the payload, the cell must store:
2423   **     2-byte pointer to the cell
2424   **     4-byte child pointer
2425   **     9-byte nKey value
2426   **     4-byte nData value
2427   **     4-byte overflow page pointer
2428   ** So a cell consists of a 2-byte pointer, a header which is as much as
2429   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2430   ** page pointer.
2431   */
2432   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2433   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2434   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2435   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2436   if( pBt->maxLocal>127 ){
2437     pBt->max1bytePayload = 127;
2438   }else{
2439     pBt->max1bytePayload = (u8)pBt->maxLocal;
2440   }
2441   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2442   pBt->pPage1 = pPage1;
2443   pBt->nPage = nPage;
2444   return SQLITE_OK;
2445 
2446 page1_init_failed:
2447   releasePage(pPage1);
2448   pBt->pPage1 = 0;
2449   return rc;
2450 }
2451 
2452 /*
2453 ** If there are no outstanding cursors and we are not in the middle
2454 ** of a transaction but there is a read lock on the database, then
2455 ** this routine unrefs the first page of the database file which
2456 ** has the effect of releasing the read lock.
2457 **
2458 ** If there is a transaction in progress, this routine is a no-op.
2459 */
2460 static void unlockBtreeIfUnused(BtShared *pBt){
2461   assert( sqlite3_mutex_held(pBt->mutex) );
2462   assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2463   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2464     assert( pBt->pPage1->aData );
2465     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2466     assert( pBt->pPage1->aData );
2467     releasePage(pBt->pPage1);
2468     pBt->pPage1 = 0;
2469   }
2470 }
2471 
2472 /*
2473 ** If pBt points to an empty file then convert that empty file
2474 ** into a new empty database by initializing the first page of
2475 ** the database.
2476 */
2477 static int newDatabase(BtShared *pBt){
2478   MemPage *pP1;
2479   unsigned char *data;
2480   int rc;
2481 
2482   assert( sqlite3_mutex_held(pBt->mutex) );
2483   if( pBt->nPage>0 ){
2484     return SQLITE_OK;
2485   }
2486   pP1 = pBt->pPage1;
2487   assert( pP1!=0 );
2488   data = pP1->aData;
2489   rc = sqlite3PagerWrite(pP1->pDbPage);
2490   if( rc ) return rc;
2491   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2492   assert( sizeof(zMagicHeader)==16 );
2493   data[16] = (u8)((pBt->pageSize>>8)&0xff);
2494   data[17] = (u8)((pBt->pageSize>>16)&0xff);
2495   data[18] = 1;
2496   data[19] = 1;
2497   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2498   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2499   data[21] = 64;
2500   data[22] = 32;
2501   data[23] = 32;
2502   memset(&data[24], 0, 100-24);
2503   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2504   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2505 #ifndef SQLITE_OMIT_AUTOVACUUM
2506   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2507   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2508   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2509   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2510 #endif
2511   pBt->nPage = 1;
2512   data[31] = 1;
2513   return SQLITE_OK;
2514 }
2515 
2516 /*
2517 ** Attempt to start a new transaction. A write-transaction
2518 ** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
2520 ** transaction is started, meaning that no other process is allowed
2521 ** to access the database.  A preexisting transaction may not be
2522 ** upgraded to exclusive by calling this routine a second time - the
2523 ** exclusivity flag only works for a new transaction.
2524 **
2525 ** A write-transaction must be started before attempting any
2526 ** changes to the database.  None of the following routines
2527 ** will work unless a transaction is started first:
2528 **
2529 **      sqlite3BtreeCreateTable()
2530 **      sqlite3BtreeCreateIndex()
2531 **      sqlite3BtreeClearTable()
2532 **      sqlite3BtreeDropTable()
2533 **      sqlite3BtreeInsert()
2534 **      sqlite3BtreeDelete()
2535 **      sqlite3BtreeUpdateMeta()
2536 **
2537 ** If an initial attempt to acquire the lock fails because of lock contention
2538 ** and the database was previously unlocked, then invoke the busy handler
2539 ** if there is one.  But if there was previously a read-lock, do not
2540 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
2541 ** returned when there is already a read-lock in order to avoid a deadlock.
2542 **
2543 ** Suppose there are two processes A and B.  A has a read lock and B has
2544 ** a reserved lock.  B tries to promote to exclusive but is blocked because
2545 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
2546 ** One or the other of the two processes must give way or there can be
2547 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
2548 ** when A already has a read lock, we encourage A to give up and let B
2549 ** proceed.
2550 */
2551 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2552   sqlite3 *pBlock = 0;
2553   BtShared *pBt = p->pBt;
2554   int rc = SQLITE_OK;
2555 
2556   sqlite3BtreeEnter(p);
2557   btreeIntegrity(p);
2558 
2559   /* If the btree is already in a write-transaction, or it
2560   ** is already in a read-transaction and a read-transaction
2561   ** is requested, this is a no-op.
2562   */
2563   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2564     goto trans_begun;
2565   }
2566 
2567   /* Write transactions are not possible on a read-only database */
2568   if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
2569     rc = SQLITE_READONLY;
2570     goto trans_begun;
2571   }
2572 
2573 #ifndef SQLITE_OMIT_SHARED_CACHE
2574   /* If another database handle has already opened a write transaction
2575   ** on this shared-btree structure and a second write transaction is
2576   ** requested, return SQLITE_LOCKED.
2577   */
2578   if( (wrflag && pBt->inTransaction==TRANS_WRITE)
2579    || (pBt->btsFlags & BTS_PENDING)!=0
2580   ){
2581     pBlock = pBt->pWriter->db;
2582   }else if( wrflag>1 ){
2583     BtLock *pIter;
2584     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2585       if( pIter->pBtree!=p ){
2586         pBlock = pIter->pBtree->db;
2587         break;
2588       }
2589     }
2590   }
2591   if( pBlock ){
2592     sqlite3ConnectionBlocked(p->db, pBlock);
2593     rc = SQLITE_LOCKED_SHAREDCACHE;
2594     goto trans_begun;
2595   }
2596 #endif
2597 
2598   /* Any read-only or read-write transaction implies a read-lock on
2599   ** page 1. So if some other shared-cache client already has a write-lock
2600   ** on page 1, the transaction cannot be opened. */
2601   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2602   if( SQLITE_OK!=rc ) goto trans_begun;
2603 
2604   pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
2605   if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
2606   do {
2607     /* Call lockBtree() until either pBt->pPage1 is populated or
2608     ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2609     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2610     ** reading page 1 it discovers that the page-size of the database
2611     ** file is not pBt->pageSize. In this case lockBtree() will update
2612     ** pBt->pageSize to the page-size of the file on disk.
2613     */
2614     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2615 
2616     if( rc==SQLITE_OK && wrflag ){
2617       if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
2618         rc = SQLITE_READONLY;
2619       }else{
2620         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2621         if( rc==SQLITE_OK ){
2622           rc = newDatabase(pBt);
2623         }
2624       }
2625     }
2626 
2627     if( rc!=SQLITE_OK ){
2628       unlockBtreeIfUnused(pBt);
2629     }
2630   }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2631           btreeInvokeBusyHandler(pBt) );
2632 
2633   if( rc==SQLITE_OK ){
2634     if( p->inTrans==TRANS_NONE ){
2635       pBt->nTransaction++;
2636 #ifndef SQLITE_OMIT_SHARED_CACHE
2637       if( p->sharable ){
2638 	assert( p->lock.pBtree==p && p->lock.iTable==1 );
2639         p->lock.eLock = READ_LOCK;
2640         p->lock.pNext = pBt->pLock;
2641         pBt->pLock = &p->lock;
2642       }
2643 #endif
2644     }
2645     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2646     if( p->inTrans>pBt->inTransaction ){
2647       pBt->inTransaction = p->inTrans;
2648     }
2649     if( wrflag ){
2650       MemPage *pPage1 = pBt->pPage1;
2651 #ifndef SQLITE_OMIT_SHARED_CACHE
2652       assert( !pBt->pWriter );
2653       pBt->pWriter = p;
2654       pBt->btsFlags &= ~BTS_EXCLUSIVE;
2655       if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
2656 #endif
2657 
2658       /* If the db-size header field is incorrect (as it may be if an old
2659       ** client has been writing the database file), update it now. Doing
2660       ** this sooner rather than later means the database size can safely
2661       ** re-read the database size from page 1 if a savepoint or transaction
2662       ** rollback occurs within the transaction.
2663       */
2664       if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
2665         rc = sqlite3PagerWrite(pPage1->pDbPage);
2666         if( rc==SQLITE_OK ){
2667           put4byte(&pPage1->aData[28], pBt->nPage);
2668         }
2669       }
2670     }
2671   }
2672 
2673 
2674 trans_begun:
2675   if( rc==SQLITE_OK && wrflag ){
2676     /* This call makes sure that the pager has the correct number of
2677     ** open savepoints. If the second parameter is greater than 0 and
2678     ** the sub-journal is not already open, then it will be opened here.
2679     */
2680     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2681   }
2682 
2683   btreeIntegrity(p);
2684   sqlite3BtreeLeave(p);
2685   return rc;
2686 }
2687 
2688 #ifndef SQLITE_OMIT_AUTOVACUUM
2689 
2690 /*
2691 ** Set the pointer-map entries for all children of page pPage. Also, if
2692 ** pPage contains cells that point to overflow pages, set the pointer
2693 ** map entries for the overflow pages as well.
2694 */
2695 static int setChildPtrmaps(MemPage *pPage){
2696   int i;                             /* Counter variable */
2697   int nCell;                         /* Number of cells in page pPage */
2698   int rc;                            /* Return code */
2699   BtShared *pBt = pPage->pBt;
2700   u8 isInitOrig = pPage->isInit;
2701   Pgno pgno = pPage->pgno;
2702 
2703   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2704   rc = btreeInitPage(pPage);
2705   if( rc!=SQLITE_OK ){
2706     goto set_child_ptrmaps_out;
2707   }
2708   nCell = pPage->nCell;
2709 
2710   for(i=0; i<nCell; i++){
2711     u8 *pCell = findCell(pPage, i);
2712 
2713     ptrmapPutOvflPtr(pPage, pCell, &rc);
2714 
2715     if( !pPage->leaf ){
2716       Pgno childPgno = get4byte(pCell);
2717       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2718     }
2719   }
2720 
2721   if( !pPage->leaf ){
2722     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2723     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2724   }
2725 
2726 set_child_ptrmaps_out:
2727   pPage->isInit = isInitOrig;
2728   return rc;
2729 }
2730 
2731 /*
2732 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
2733 ** that it points to iTo. Parameter eType describes the type of pointer to
2734 ** be modified, as  follows:
2735 **
2736 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2737 **                   page of pPage.
2738 **
2739 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2740 **                   page pointed to by one of the cells on pPage.
2741 **
2742 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2743 **                   overflow page in the list.
2744 */
2745 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2746   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2747   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2748   if( eType==PTRMAP_OVERFLOW2 ){
2749     /* The pointer is always the first 4 bytes of the page in this case.  */
2750     if( get4byte(pPage->aData)!=iFrom ){
2751       return SQLITE_CORRUPT_BKPT;
2752     }
2753     put4byte(pPage->aData, iTo);
2754   }else{
2755     u8 isInitOrig = pPage->isInit;
2756     int i;
2757     int nCell;
2758 
2759     btreeInitPage(pPage);
2760     nCell = pPage->nCell;
2761 
2762     for(i=0; i<nCell; i++){
2763       u8 *pCell = findCell(pPage, i);
2764       if( eType==PTRMAP_OVERFLOW1 ){
2765         CellInfo info;
2766         btreeParseCellPtr(pPage, pCell, &info);
2767         if( info.iOverflow
2768          && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
2769          && iFrom==get4byte(&pCell[info.iOverflow])
2770         ){
2771           put4byte(&pCell[info.iOverflow], iTo);
2772           break;
2773         }
2774       }else{
2775         if( get4byte(pCell)==iFrom ){
2776           put4byte(pCell, iTo);
2777           break;
2778         }
2779       }
2780     }
2781 
2782     if( i==nCell ){
2783       if( eType!=PTRMAP_BTREE ||
2784           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2785         return SQLITE_CORRUPT_BKPT;
2786       }
2787       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2788     }
2789 
2790     pPage->isInit = isInitOrig;
2791   }
2792   return SQLITE_OK;
2793 }
2794 
2795 
2796 /*
2797 ** Move the open database page pDbPage to location iFreePage in the
2798 ** database. The pDbPage reference remains valid.
2799 **
2800 ** The isCommit flag indicates that there is no need to remember that
2801 ** the journal needs to be sync()ed before database page pDbPage->pgno
2802 ** can be written to. The caller has already promised not to write to that
2803 ** page.
2804 */
2805 static int relocatePage(
2806   BtShared *pBt,           /* Btree */
2807   MemPage *pDbPage,        /* Open page to move */
2808   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2809   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2810   Pgno iFreePage,          /* The location to move pDbPage to */
2811   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
2812 ){
2813   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2814   Pgno iDbPage = pDbPage->pgno;
2815   Pager *pPager = pBt->pPager;
2816   int rc;
2817 
2818   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2819       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2820   assert( sqlite3_mutex_held(pBt->mutex) );
2821   assert( pDbPage->pBt==pBt );
2822 
2823   /* Move page iDbPage from its current location to page number iFreePage */
2824   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2825       iDbPage, iFreePage, iPtrPage, eType));
2826   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2827   if( rc!=SQLITE_OK ){
2828     return rc;
2829   }
2830   pDbPage->pgno = iFreePage;
2831 
2832   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2833   ** that point to overflow pages. The pointer map entries for all these
2834   ** pages need to be changed.
2835   **
2836   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2837   ** pointer to a subsequent overflow page. If this is the case, then
2838   ** the pointer map needs to be updated for the subsequent overflow page.
2839   */
2840   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2841     rc = setChildPtrmaps(pDbPage);
2842     if( rc!=SQLITE_OK ){
2843       return rc;
2844     }
2845   }else{
2846     Pgno nextOvfl = get4byte(pDbPage->aData);
2847     if( nextOvfl!=0 ){
2848       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2849       if( rc!=SQLITE_OK ){
2850         return rc;
2851       }
2852     }
2853   }
2854 
2855   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2856   ** that it points at iFreePage. Also fix the pointer map entry for
2857   ** iPtrPage.
2858   */
2859   if( eType!=PTRMAP_ROOTPAGE ){
2860     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2861     if( rc!=SQLITE_OK ){
2862       return rc;
2863     }
2864     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2865     if( rc!=SQLITE_OK ){
2866       releasePage(pPtrPage);
2867       return rc;
2868     }
2869     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2870     releasePage(pPtrPage);
2871     if( rc==SQLITE_OK ){
2872       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2873     }
2874   }
2875   return rc;
2876 }
2877 
2878 /* Forward declaration required by incrVacuumStep(). */
2879 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2880 
2881 /*
2882 ** Perform a single step of an incremental-vacuum. If successful,
2883 ** return SQLITE_OK. If there is no work to do (and therefore no
2884 ** point in calling this function again), return SQLITE_DONE.
2885 **
2886 ** More specificly, this function attempts to re-organize the
2887 ** database so that the last page of the file currently in use
2888 ** is no longer in use.
2889 **
2890 ** If the nFin parameter is non-zero, this function assumes
2891 ** that the caller will keep calling incrVacuumStep() until
2892 ** it returns SQLITE_DONE or an error, and that nFin is the
2893 ** number of pages the database file will contain after this
2894 ** process is complete.  If nFin is zero, it is assumed that
2895 ** incrVacuumStep() will be called a finite amount of times
2896 ** which may or may not empty the freelist.  A full autovacuum
2897 ** has nFin>0.  A "PRAGMA incremental_vacuum" has nFin==0.
2898 */
2899 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
2900   Pgno nFreeList;           /* Number of pages still on the free-list */
2901   int rc;
2902 
2903   assert( sqlite3_mutex_held(pBt->mutex) );
2904   assert( iLastPg>nFin );
2905 
2906   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2907     u8 eType;
2908     Pgno iPtrPage;
2909 
2910     nFreeList = get4byte(&pBt->pPage1->aData[36]);
2911     if( nFreeList==0 ){
2912       return SQLITE_DONE;
2913     }
2914 
2915     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2916     if( rc!=SQLITE_OK ){
2917       return rc;
2918     }
2919     if( eType==PTRMAP_ROOTPAGE ){
2920       return SQLITE_CORRUPT_BKPT;
2921     }
2922 
2923     if( eType==PTRMAP_FREEPAGE ){
2924       if( nFin==0 ){
2925         /* Remove the page from the files free-list. This is not required
2926         ** if nFin is non-zero. In that case, the free-list will be
2927         ** truncated to zero after this function returns, so it doesn't
2928         ** matter if it still contains some garbage entries.
2929         */
2930         Pgno iFreePg;
2931         MemPage *pFreePg;
2932         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2933         if( rc!=SQLITE_OK ){
2934           return rc;
2935         }
2936         assert( iFreePg==iLastPg );
2937         releasePage(pFreePg);
2938       }
2939     } else {
2940       Pgno iFreePg;             /* Index of free page to move pLastPg to */
2941       MemPage *pLastPg;
2942 
2943       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
2944       if( rc!=SQLITE_OK ){
2945         return rc;
2946       }
2947 
2948       /* If nFin is zero, this loop runs exactly once and page pLastPg
2949       ** is swapped with the first free page pulled off the free list.
2950       **
2951       ** On the other hand, if nFin is greater than zero, then keep
2952       ** looping until a free-page located within the first nFin pages
2953       ** of the file is found.
2954       */
2955       do {
2956         MemPage *pFreePg;
2957         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2958         if( rc!=SQLITE_OK ){
2959           releasePage(pLastPg);
2960           return rc;
2961         }
2962         releasePage(pFreePg);
2963       }while( nFin!=0 && iFreePg>nFin );
2964       assert( iFreePg<iLastPg );
2965 
2966       rc = sqlite3PagerWrite(pLastPg->pDbPage);
2967       if( rc==SQLITE_OK ){
2968         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2969       }
2970       releasePage(pLastPg);
2971       if( rc!=SQLITE_OK ){
2972         return rc;
2973       }
2974     }
2975   }
2976 
2977   if( nFin==0 ){
2978     iLastPg--;
2979     while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
2980       if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2981         MemPage *pPg;
2982         rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
2983         if( rc!=SQLITE_OK ){
2984           return rc;
2985         }
2986         rc = sqlite3PagerWrite(pPg->pDbPage);
2987         releasePage(pPg);
2988         if( rc!=SQLITE_OK ){
2989           return rc;
2990         }
2991       }
2992       iLastPg--;
2993     }
2994     sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
2995     pBt->nPage = iLastPg;
2996   }
2997   return SQLITE_OK;
2998 }
2999 
3000 /*
3001 ** A write-transaction must be opened before calling this function.
3002 ** It performs a single unit of work towards an incremental vacuum.
3003 **
3004 ** If the incremental vacuum is finished after this function has run,
3005 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3006 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3007 */
3008 int sqlite3BtreeIncrVacuum(Btree *p){
3009   int rc;
3010   BtShared *pBt = p->pBt;
3011 
3012   sqlite3BtreeEnter(p);
3013   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3014   if( !pBt->autoVacuum ){
3015     rc = SQLITE_DONE;
3016   }else{
3017     invalidateAllOverflowCache(pBt);
3018     rc = incrVacuumStep(pBt, 0, btreePagecount(pBt));
3019     if( rc==SQLITE_OK ){
3020       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3021       put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3022     }
3023   }
3024   sqlite3BtreeLeave(p);
3025   return rc;
3026 }
3027 
3028 /*
3029 ** This routine is called prior to sqlite3PagerCommit when a transaction
3030 ** is commited for an auto-vacuum database.
3031 **
3032 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3033 ** the database file should be truncated to during the commit process.
3034 ** i.e. the database has been reorganized so that only the first *pnTrunc
3035 ** pages are in use.
3036 */
3037 static int autoVacuumCommit(BtShared *pBt){
3038   int rc = SQLITE_OK;
3039   Pager *pPager = pBt->pPager;
3040   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3041 
3042   assert( sqlite3_mutex_held(pBt->mutex) );
3043   invalidateAllOverflowCache(pBt);
3044   assert(pBt->autoVacuum);
3045   if( !pBt->incrVacuum ){
3046     Pgno nFin;         /* Number of pages in database after autovacuuming */
3047     Pgno nFree;        /* Number of pages on the freelist initially */
3048     Pgno nPtrmap;      /* Number of PtrMap pages to be freed */
3049     Pgno iFree;        /* The next page to be freed */
3050     int nEntry;        /* Number of entries on one ptrmap page */
3051     Pgno nOrig;        /* Database size before freeing */
3052 
3053     nOrig = btreePagecount(pBt);
3054     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3055       /* It is not possible to create a database for which the final page
3056       ** is either a pointer-map page or the pending-byte page. If one
3057       ** is encountered, this indicates corruption.
3058       */
3059       return SQLITE_CORRUPT_BKPT;
3060     }
3061 
3062     nFree = get4byte(&pBt->pPage1->aData[36]);
3063     nEntry = pBt->usableSize/5;
3064     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3065     nFin = nOrig - nFree - nPtrmap;
3066     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3067       nFin--;
3068     }
3069     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3070       nFin--;
3071     }
3072     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3073 
3074     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3075       rc = incrVacuumStep(pBt, nFin, iFree);
3076     }
3077     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3078       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3079       put4byte(&pBt->pPage1->aData[32], 0);
3080       put4byte(&pBt->pPage1->aData[36], 0);
3081       put4byte(&pBt->pPage1->aData[28], nFin);
3082       sqlite3PagerTruncateImage(pBt->pPager, nFin);
3083       pBt->nPage = nFin;
3084     }
3085     if( rc!=SQLITE_OK ){
3086       sqlite3PagerRollback(pPager);
3087     }
3088   }
3089 
3090   assert( nRef==sqlite3PagerRefcount(pPager) );
3091   return rc;
3092 }
3093 
3094 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3095 # define setChildPtrmaps(x) SQLITE_OK
3096 #endif
3097 
3098 /*
3099 ** This routine does the first phase of a two-phase commit.  This routine
3100 ** causes a rollback journal to be created (if it does not already exist)
3101 ** and populated with enough information so that if a power loss occurs
3102 ** the database can be restored to its original state by playing back
3103 ** the journal.  Then the contents of the journal are flushed out to
3104 ** the disk.  After the journal is safely on oxide, the changes to the
3105 ** database are written into the database file and flushed to oxide.
3106 ** At the end of this call, the rollback journal still exists on the
3107 ** disk and we are still holding all locks, so the transaction has not
3108 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3109 ** commit process.
3110 **
3111 ** This call is a no-op if no write-transaction is currently active on pBt.
3112 **
3113 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3114 ** the name of a master journal file that should be written into the
3115 ** individual journal file, or is NULL, indicating no master journal file
3116 ** (single database transaction).
3117 **
3118 ** When this is called, the master journal should already have been
3119 ** created, populated with this journal pointer and synced to disk.
3120 **
3121 ** Once this is routine has returned, the only thing required to commit
3122 ** the write-transaction for this database file is to delete the journal.
3123 */
3124 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3125   int rc = SQLITE_OK;
3126   if( p->inTrans==TRANS_WRITE ){
3127     BtShared *pBt = p->pBt;
3128     sqlite3BtreeEnter(p);
3129 #ifndef SQLITE_OMIT_AUTOVACUUM
3130     if( pBt->autoVacuum ){
3131       rc = autoVacuumCommit(pBt);
3132       if( rc!=SQLITE_OK ){
3133         sqlite3BtreeLeave(p);
3134         return rc;
3135       }
3136     }
3137 #endif
3138     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3139     sqlite3BtreeLeave(p);
3140   }
3141   return rc;
3142 }
3143 
3144 /*
3145 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3146 ** at the conclusion of a transaction.
3147 */
3148 static void btreeEndTransaction(Btree *p){
3149   BtShared *pBt = p->pBt;
3150   assert( sqlite3BtreeHoldsMutex(p) );
3151 
3152   btreeClearHasContent(pBt);
3153   if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
3154     /* If there are other active statements that belong to this database
3155     ** handle, downgrade to a read-only transaction. The other statements
3156     ** may still be reading from the database.  */
3157     downgradeAllSharedCacheTableLocks(p);
3158     p->inTrans = TRANS_READ;
3159   }else{
3160     /* If the handle had any kind of transaction open, decrement the
3161     ** transaction count of the shared btree. If the transaction count
3162     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3163     ** call below will unlock the pager.  */
3164     if( p->inTrans!=TRANS_NONE ){
3165       clearAllSharedCacheTableLocks(p);
3166       pBt->nTransaction--;
3167       if( 0==pBt->nTransaction ){
3168         pBt->inTransaction = TRANS_NONE;
3169       }
3170     }
3171 
3172     /* Set the current transaction state to TRANS_NONE and unlock the
3173     ** pager if this call closed the only read or write transaction.  */
3174     p->inTrans = TRANS_NONE;
3175     unlockBtreeIfUnused(pBt);
3176   }
3177 
3178   btreeIntegrity(p);
3179 }
3180 
3181 /*
3182 ** Commit the transaction currently in progress.
3183 **
3184 ** This routine implements the second phase of a 2-phase commit.  The
3185 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3186 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3187 ** routine did all the work of writing information out to disk and flushing the
3188 ** contents so that they are written onto the disk platter.  All this
3189 ** routine has to do is delete or truncate or zero the header in the
3190 ** the rollback journal (which causes the transaction to commit) and
3191 ** drop locks.
3192 **
3193 ** Normally, if an error occurs while the pager layer is attempting to
3194 ** finalize the underlying journal file, this function returns an error and
3195 ** the upper layer will attempt a rollback. However, if the second argument
3196 ** is non-zero then this b-tree transaction is part of a multi-file
3197 ** transaction. In this case, the transaction has already been committed
3198 ** (by deleting a master journal file) and the caller will ignore this
3199 ** functions return code. So, even if an error occurs in the pager layer,
3200 ** reset the b-tree objects internal state to indicate that the write
3201 ** transaction has been closed. This is quite safe, as the pager will have
3202 ** transitioned to the error state.
3203 **
3204 ** This will release the write lock on the database file.  If there
3205 ** are no active cursors, it also releases the read lock.
3206 */
3207 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3208 
3209   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3210   sqlite3BtreeEnter(p);
3211   btreeIntegrity(p);
3212 
3213   /* If the handle has a write-transaction open, commit the shared-btrees
3214   ** transaction and set the shared state to TRANS_READ.
3215   */
3216   if( p->inTrans==TRANS_WRITE ){
3217     int rc;
3218     BtShared *pBt = p->pBt;
3219     assert( pBt->inTransaction==TRANS_WRITE );
3220     assert( pBt->nTransaction>0 );
3221     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3222     if( rc!=SQLITE_OK && bCleanup==0 ){
3223       sqlite3BtreeLeave(p);
3224       return rc;
3225     }
3226     pBt->inTransaction = TRANS_READ;
3227   }
3228 
3229   btreeEndTransaction(p);
3230   sqlite3BtreeLeave(p);
3231   return SQLITE_OK;
3232 }
3233 
3234 /*
3235 ** Do both phases of a commit.
3236 */
3237 int sqlite3BtreeCommit(Btree *p){
3238   int rc;
3239   sqlite3BtreeEnter(p);
3240   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3241   if( rc==SQLITE_OK ){
3242     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3243   }
3244   sqlite3BtreeLeave(p);
3245   return rc;
3246 }
3247 
3248 #ifndef NDEBUG
3249 /*
3250 ** Return the number of write-cursors open on this handle. This is for use
3251 ** in assert() expressions, so it is only compiled if NDEBUG is not
3252 ** defined.
3253 **
3254 ** For the purposes of this routine, a write-cursor is any cursor that
3255 ** is capable of writing to the databse.  That means the cursor was
3256 ** originally opened for writing and the cursor has not be disabled
3257 ** by having its state changed to CURSOR_FAULT.
3258 */
3259 static int countWriteCursors(BtShared *pBt){
3260   BtCursor *pCur;
3261   int r = 0;
3262   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3263     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3264   }
3265   return r;
3266 }
3267 #endif
3268 
3269 /*
3270 ** This routine sets the state to CURSOR_FAULT and the error
3271 ** code to errCode for every cursor on BtShared that pBtree
3272 ** references.
3273 **
3274 ** Every cursor is tripped, including cursors that belong
3275 ** to other database connections that happen to be sharing
3276 ** the cache with pBtree.
3277 **
3278 ** This routine gets called when a rollback occurs.
3279 ** All cursors using the same cache must be tripped
3280 ** to prevent them from trying to use the btree after
3281 ** the rollback.  The rollback may have deleted tables
3282 ** or moved root pages, so it is not sufficient to
3283 ** save the state of the cursor.  The cursor must be
3284 ** invalidated.
3285 */
3286 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3287   BtCursor *p;
3288   if( pBtree==0 ) return;
3289   sqlite3BtreeEnter(pBtree);
3290   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3291     int i;
3292     sqlite3BtreeClearCursor(p);
3293     p->eState = CURSOR_FAULT;
3294     p->skipNext = errCode;
3295     for(i=0; i<=p->iPage; i++){
3296       releasePage(p->apPage[i]);
3297       p->apPage[i] = 0;
3298     }
3299   }
3300   sqlite3BtreeLeave(pBtree);
3301 }
3302 
3303 /*
3304 ** Rollback the transaction in progress.  All cursors will be
3305 ** invalided by this operation.  Any attempt to use a cursor
3306 ** that was open at the beginning of this operation will result
3307 ** in an error.
3308 **
3309 ** This will release the write lock on the database file.  If there
3310 ** are no active cursors, it also releases the read lock.
3311 */
3312 int sqlite3BtreeRollback(Btree *p, int tripCode){
3313   int rc;
3314   BtShared *pBt = p->pBt;
3315   MemPage *pPage1;
3316 
3317   sqlite3BtreeEnter(p);
3318   if( tripCode==SQLITE_OK ){
3319     rc = tripCode = saveAllCursors(pBt, 0, 0);
3320   }else{
3321     rc = SQLITE_OK;
3322   }
3323   if( tripCode ){
3324     sqlite3BtreeTripAllCursors(p, tripCode);
3325   }
3326   btreeIntegrity(p);
3327 
3328   if( p->inTrans==TRANS_WRITE ){
3329     int rc2;
3330 
3331     assert( TRANS_WRITE==pBt->inTransaction );
3332     rc2 = sqlite3PagerRollback(pBt->pPager);
3333     if( rc2!=SQLITE_OK ){
3334       rc = rc2;
3335     }
3336 
3337     /* The rollback may have destroyed the pPage1->aData value.  So
3338     ** call btreeGetPage() on page 1 again to make
3339     ** sure pPage1->aData is set correctly. */
3340     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3341       int nPage = get4byte(28+(u8*)pPage1->aData);
3342       testcase( nPage==0 );
3343       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3344       testcase( pBt->nPage!=nPage );
3345       pBt->nPage = nPage;
3346       releasePage(pPage1);
3347     }
3348     assert( countWriteCursors(pBt)==0 );
3349     pBt->inTransaction = TRANS_READ;
3350   }
3351 
3352   btreeEndTransaction(p);
3353   sqlite3BtreeLeave(p);
3354   return rc;
3355 }
3356 
3357 /*
3358 ** Start a statement subtransaction. The subtransaction can can be rolled
3359 ** back independently of the main transaction. You must start a transaction
3360 ** before starting a subtransaction. The subtransaction is ended automatically
3361 ** if the main transaction commits or rolls back.
3362 **
3363 ** Statement subtransactions are used around individual SQL statements
3364 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3365 ** error occurs within the statement, the effect of that one statement
3366 ** can be rolled back without having to rollback the entire transaction.
3367 **
3368 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3369 ** value passed as the second parameter is the total number of savepoints,
3370 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3371 ** are no active savepoints and no other statement-transactions open,
3372 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3373 ** using the sqlite3BtreeSavepoint() function.
3374 */
3375 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3376   int rc;
3377   BtShared *pBt = p->pBt;
3378   sqlite3BtreeEnter(p);
3379   assert( p->inTrans==TRANS_WRITE );
3380   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
3381   assert( iStatement>0 );
3382   assert( iStatement>p->db->nSavepoint );
3383   assert( pBt->inTransaction==TRANS_WRITE );
3384   /* At the pager level, a statement transaction is a savepoint with
3385   ** an index greater than all savepoints created explicitly using
3386   ** SQL statements. It is illegal to open, release or rollback any
3387   ** such savepoints while the statement transaction savepoint is active.
3388   */
3389   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3390   sqlite3BtreeLeave(p);
3391   return rc;
3392 }
3393 
3394 /*
3395 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3396 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3397 ** savepoint identified by parameter iSavepoint, depending on the value
3398 ** of op.
3399 **
3400 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3401 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3402 ** contents of the entire transaction are rolled back. This is different
3403 ** from a normal transaction rollback, as no locks are released and the
3404 ** transaction remains open.
3405 */
3406 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3407   int rc = SQLITE_OK;
3408   if( p && p->inTrans==TRANS_WRITE ){
3409     BtShared *pBt = p->pBt;
3410     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3411     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3412     sqlite3BtreeEnter(p);
3413     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3414     if( rc==SQLITE_OK ){
3415       if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
3416         pBt->nPage = 0;
3417       }
3418       rc = newDatabase(pBt);
3419       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3420 
3421       /* The database size was written into the offset 28 of the header
3422       ** when the transaction started, so we know that the value at offset
3423       ** 28 is nonzero. */
3424       assert( pBt->nPage>0 );
3425     }
3426     sqlite3BtreeLeave(p);
3427   }
3428   return rc;
3429 }
3430 
3431 /*
3432 ** Create a new cursor for the BTree whose root is on the page
3433 ** iTable. If a read-only cursor is requested, it is assumed that
3434 ** the caller already has at least a read-only transaction open
3435 ** on the database already. If a write-cursor is requested, then
3436 ** the caller is assumed to have an open write transaction.
3437 **
3438 ** If wrFlag==0, then the cursor can only be used for reading.
3439 ** If wrFlag==1, then the cursor can be used for reading or for
3440 ** writing if other conditions for writing are also met.  These
3441 ** are the conditions that must be met in order for writing to
3442 ** be allowed:
3443 **
3444 ** 1:  The cursor must have been opened with wrFlag==1
3445 **
3446 ** 2:  Other database connections that share the same pager cache
3447 **     but which are not in the READ_UNCOMMITTED state may not have
3448 **     cursors open with wrFlag==0 on the same table.  Otherwise
3449 **     the changes made by this write cursor would be visible to
3450 **     the read cursors in the other database connection.
3451 **
3452 ** 3:  The database must be writable (not on read-only media)
3453 **
3454 ** 4:  There must be an active transaction.
3455 **
3456 ** No checking is done to make sure that page iTable really is the
3457 ** root page of a b-tree.  If it is not, then the cursor acquired
3458 ** will not work correctly.
3459 **
3460 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3461 ** on pCur to initialize the memory space prior to invoking this routine.
3462 */
3463 static int btreeCursor(
3464   Btree *p,                              /* The btree */
3465   int iTable,                            /* Root page of table to open */
3466   int wrFlag,                            /* 1 to write. 0 read-only */
3467   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3468   BtCursor *pCur                         /* Space for new cursor */
3469 ){
3470   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
3471 
3472   assert( sqlite3BtreeHoldsMutex(p) );
3473   assert( wrFlag==0 || wrFlag==1 );
3474 
3475   /* The following assert statements verify that if this is a sharable
3476   ** b-tree database, the connection is holding the required table locks,
3477   ** and that no other connection has any open cursor that conflicts with
3478   ** this lock.  */
3479   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3480   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3481 
3482   /* Assert that the caller has opened the required transaction. */
3483   assert( p->inTrans>TRANS_NONE );
3484   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3485   assert( pBt->pPage1 && pBt->pPage1->aData );
3486 
3487   if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
3488     return SQLITE_READONLY;
3489   }
3490   if( iTable==1 && btreePagecount(pBt)==0 ){
3491     assert( wrFlag==0 );
3492     iTable = 0;
3493   }
3494 
3495   /* Now that no other errors can occur, finish filling in the BtCursor
3496   ** variables and link the cursor into the BtShared list.  */
3497   pCur->pgnoRoot = (Pgno)iTable;
3498   pCur->iPage = -1;
3499   pCur->pKeyInfo = pKeyInfo;
3500   pCur->pBtree = p;
3501   pCur->pBt = pBt;
3502   pCur->wrFlag = (u8)wrFlag;
3503   pCur->pNext = pBt->pCursor;
3504   if( pCur->pNext ){
3505     pCur->pNext->pPrev = pCur;
3506   }
3507   pBt->pCursor = pCur;
3508   pCur->eState = CURSOR_INVALID;
3509   pCur->cachedRowid = 0;
3510   return SQLITE_OK;
3511 }
3512 int sqlite3BtreeCursor(
3513   Btree *p,                                   /* The btree */
3514   int iTable,                                 /* Root page of table to open */
3515   int wrFlag,                                 /* 1 to write. 0 read-only */
3516   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
3517   BtCursor *pCur                              /* Write new cursor here */
3518 ){
3519   int rc;
3520   sqlite3BtreeEnter(p);
3521   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3522   sqlite3BtreeLeave(p);
3523   return rc;
3524 }
3525 
3526 /*
3527 ** Return the size of a BtCursor object in bytes.
3528 **
3529 ** This interfaces is needed so that users of cursors can preallocate
3530 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3531 ** to users so they cannot do the sizeof() themselves - they must call
3532 ** this routine.
3533 */
3534 int sqlite3BtreeCursorSize(void){
3535   return ROUND8(sizeof(BtCursor));
3536 }
3537 
3538 /*
3539 ** Initialize memory that will be converted into a BtCursor object.
3540 **
3541 ** The simple approach here would be to memset() the entire object
3542 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
3543 ** do not need to be zeroed and they are large, so we can save a lot
3544 ** of run-time by skipping the initialization of those elements.
3545 */
3546 void sqlite3BtreeCursorZero(BtCursor *p){
3547   memset(p, 0, offsetof(BtCursor, iPage));
3548 }
3549 
3550 /*
3551 ** Set the cached rowid value of every cursor in the same database file
3552 ** as pCur and having the same root page number as pCur.  The value is
3553 ** set to iRowid.
3554 **
3555 ** Only positive rowid values are considered valid for this cache.
3556 ** The cache is initialized to zero, indicating an invalid cache.
3557 ** A btree will work fine with zero or negative rowids.  We just cannot
3558 ** cache zero or negative rowids, which means tables that use zero or
3559 ** negative rowids might run a little slower.  But in practice, zero
3560 ** or negative rowids are very uncommon so this should not be a problem.
3561 */
3562 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3563   BtCursor *p;
3564   for(p=pCur->pBt->pCursor; p; p=p->pNext){
3565     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3566   }
3567   assert( pCur->cachedRowid==iRowid );
3568 }
3569 
3570 /*
3571 ** Return the cached rowid for the given cursor.  A negative or zero
3572 ** return value indicates that the rowid cache is invalid and should be
3573 ** ignored.  If the rowid cache has never before been set, then a
3574 ** zero is returned.
3575 */
3576 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3577   return pCur->cachedRowid;
3578 }
3579 
3580 /*
3581 ** Close a cursor.  The read lock on the database file is released
3582 ** when the last cursor is closed.
3583 */
3584 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3585   Btree *pBtree = pCur->pBtree;
3586   if( pBtree ){
3587     int i;
3588     BtShared *pBt = pCur->pBt;
3589     sqlite3BtreeEnter(pBtree);
3590     sqlite3BtreeClearCursor(pCur);
3591     if( pCur->pPrev ){
3592       pCur->pPrev->pNext = pCur->pNext;
3593     }else{
3594       pBt->pCursor = pCur->pNext;
3595     }
3596     if( pCur->pNext ){
3597       pCur->pNext->pPrev = pCur->pPrev;
3598     }
3599     for(i=0; i<=pCur->iPage; i++){
3600       releasePage(pCur->apPage[i]);
3601     }
3602     unlockBtreeIfUnused(pBt);
3603     invalidateOverflowCache(pCur);
3604     /* sqlite3_free(pCur); */
3605     sqlite3BtreeLeave(pBtree);
3606   }
3607   return SQLITE_OK;
3608 }
3609 
3610 /*
3611 ** Make sure the BtCursor* given in the argument has a valid
3612 ** BtCursor.info structure.  If it is not already valid, call
3613 ** btreeParseCell() to fill it in.
3614 **
3615 ** BtCursor.info is a cache of the information in the current cell.
3616 ** Using this cache reduces the number of calls to btreeParseCell().
3617 **
3618 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
3619 ** compiler to crash when getCellInfo() is implemented as a macro.
3620 ** But there is a measureable speed advantage to using the macro on gcc
3621 ** (when less compiler optimizations like -Os or -O0 are used and the
3622 ** compiler is not doing agressive inlining.)  So we use a real function
3623 ** for MSVC and a macro for everything else.  Ticket #2457.
3624 */
3625 #ifndef NDEBUG
3626   static void assertCellInfo(BtCursor *pCur){
3627     CellInfo info;
3628     int iPage = pCur->iPage;
3629     memset(&info, 0, sizeof(info));
3630     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3631     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3632   }
3633 #else
3634   #define assertCellInfo(x)
3635 #endif
3636 #ifdef _MSC_VER
3637   /* Use a real function in MSVC to work around bugs in that compiler. */
3638   static void getCellInfo(BtCursor *pCur){
3639     if( pCur->info.nSize==0 ){
3640       int iPage = pCur->iPage;
3641       btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
3642       pCur->validNKey = 1;
3643     }else{
3644       assertCellInfo(pCur);
3645     }
3646   }
3647 #else /* if not _MSC_VER */
3648   /* Use a macro in all other compilers so that the function is inlined */
3649 #define getCellInfo(pCur)                                                      \
3650   if( pCur->info.nSize==0 ){                                                   \
3651     int iPage = pCur->iPage;                                                   \
3652     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3653     pCur->validNKey = 1;                                                       \
3654   }else{                                                                       \
3655     assertCellInfo(pCur);                                                      \
3656   }
3657 #endif /* _MSC_VER */
3658 
3659 #ifndef NDEBUG  /* The next routine used only within assert() statements */
3660 /*
3661 ** Return true if the given BtCursor is valid.  A valid cursor is one
3662 ** that is currently pointing to a row in a (non-empty) table.
3663 ** This is a verification routine is used only within assert() statements.
3664 */
3665 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3666   return pCur && pCur->eState==CURSOR_VALID;
3667 }
3668 #endif /* NDEBUG */
3669 
3670 /*
3671 ** Set *pSize to the size of the buffer needed to hold the value of
3672 ** the key for the current entry.  If the cursor is not pointing
3673 ** to a valid entry, *pSize is set to 0.
3674 **
3675 ** For a table with the INTKEY flag set, this routine returns the key
3676 ** itself, not the number of bytes in the key.
3677 **
3678 ** The caller must position the cursor prior to invoking this routine.
3679 **
3680 ** This routine cannot fail.  It always returns SQLITE_OK.
3681 */
3682 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3683   assert( cursorHoldsMutex(pCur) );
3684   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3685   if( pCur->eState!=CURSOR_VALID ){
3686     *pSize = 0;
3687   }else{
3688     getCellInfo(pCur);
3689     *pSize = pCur->info.nKey;
3690   }
3691   return SQLITE_OK;
3692 }
3693 
3694 /*
3695 ** Set *pSize to the number of bytes of data in the entry the
3696 ** cursor currently points to.
3697 **
3698 ** The caller must guarantee that the cursor is pointing to a non-NULL
3699 ** valid entry.  In other words, the calling procedure must guarantee
3700 ** that the cursor has Cursor.eState==CURSOR_VALID.
3701 **
3702 ** Failure is not possible.  This function always returns SQLITE_OK.
3703 ** It might just as well be a procedure (returning void) but we continue
3704 ** to return an integer result code for historical reasons.
3705 */
3706 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3707   assert( cursorHoldsMutex(pCur) );
3708   assert( pCur->eState==CURSOR_VALID );
3709   getCellInfo(pCur);
3710   *pSize = pCur->info.nData;
3711   return SQLITE_OK;
3712 }
3713 
3714 /*
3715 ** Given the page number of an overflow page in the database (parameter
3716 ** ovfl), this function finds the page number of the next page in the
3717 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3718 ** pointer-map data instead of reading the content of page ovfl to do so.
3719 **
3720 ** If an error occurs an SQLite error code is returned. Otherwise:
3721 **
3722 ** The page number of the next overflow page in the linked list is
3723 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3724 ** list, *pPgnoNext is set to zero.
3725 **
3726 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3727 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3728 ** reference. It is the responsibility of the caller to call releasePage()
3729 ** on *ppPage to free the reference. In no reference was obtained (because
3730 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3731 ** *ppPage is set to zero.
3732 */
3733 static int getOverflowPage(
3734   BtShared *pBt,               /* The database file */
3735   Pgno ovfl,                   /* Current overflow page number */
3736   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3737   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3738 ){
3739   Pgno next = 0;
3740   MemPage *pPage = 0;
3741   int rc = SQLITE_OK;
3742 
3743   assert( sqlite3_mutex_held(pBt->mutex) );
3744   assert(pPgnoNext);
3745 
3746 #ifndef SQLITE_OMIT_AUTOVACUUM
3747   /* Try to find the next page in the overflow list using the
3748   ** autovacuum pointer-map pages. Guess that the next page in
3749   ** the overflow list is page number (ovfl+1). If that guess turns
3750   ** out to be wrong, fall back to loading the data of page
3751   ** number ovfl to determine the next page number.
3752   */
3753   if( pBt->autoVacuum ){
3754     Pgno pgno;
3755     Pgno iGuess = ovfl+1;
3756     u8 eType;
3757 
3758     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3759       iGuess++;
3760     }
3761 
3762     if( iGuess<=btreePagecount(pBt) ){
3763       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3764       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3765         next = iGuess;
3766         rc = SQLITE_DONE;
3767       }
3768     }
3769   }
3770 #endif
3771 
3772   assert( next==0 || rc==SQLITE_DONE );
3773   if( rc==SQLITE_OK ){
3774     rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3775     assert( rc==SQLITE_OK || pPage==0 );
3776     if( rc==SQLITE_OK ){
3777       next = get4byte(pPage->aData);
3778     }
3779   }
3780 
3781   *pPgnoNext = next;
3782   if( ppPage ){
3783     *ppPage = pPage;
3784   }else{
3785     releasePage(pPage);
3786   }
3787   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3788 }
3789 
3790 /*
3791 ** Copy data from a buffer to a page, or from a page to a buffer.
3792 **
3793 ** pPayload is a pointer to data stored on database page pDbPage.
3794 ** If argument eOp is false, then nByte bytes of data are copied
3795 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3796 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3797 ** of data are copied from the buffer pBuf to pPayload.
3798 **
3799 ** SQLITE_OK is returned on success, otherwise an error code.
3800 */
3801 static int copyPayload(
3802   void *pPayload,           /* Pointer to page data */
3803   void *pBuf,               /* Pointer to buffer */
3804   int nByte,                /* Number of bytes to copy */
3805   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3806   DbPage *pDbPage           /* Page containing pPayload */
3807 ){
3808   if( eOp ){
3809     /* Copy data from buffer to page (a write operation) */
3810     int rc = sqlite3PagerWrite(pDbPage);
3811     if( rc!=SQLITE_OK ){
3812       return rc;
3813     }
3814     memcpy(pPayload, pBuf, nByte);
3815   }else{
3816     /* Copy data from page to buffer (a read operation) */
3817     memcpy(pBuf, pPayload, nByte);
3818   }
3819   return SQLITE_OK;
3820 }
3821 
3822 /*
3823 ** This function is used to read or overwrite payload information
3824 ** for the entry that the pCur cursor is pointing to. If the eOp
3825 ** parameter is 0, this is a read operation (data copied into
3826 ** buffer pBuf). If it is non-zero, a write (data copied from
3827 ** buffer pBuf).
3828 **
3829 ** A total of "amt" bytes are read or written beginning at "offset".
3830 ** Data is read to or from the buffer pBuf.
3831 **
3832 ** The content being read or written might appear on the main page
3833 ** or be scattered out on multiple overflow pages.
3834 **
3835 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3836 ** cursor entry uses one or more overflow pages, this function
3837 ** allocates space for and lazily popluates the overflow page-list
3838 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3839 ** cache to make seeking to the supplied offset more efficient.
3840 **
3841 ** Once an overflow page-list cache has been allocated, it may be
3842 ** invalidated if some other cursor writes to the same table, or if
3843 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3844 ** mode, the following events may invalidate an overflow page-list cache.
3845 **
3846 **   * An incremental vacuum,
3847 **   * A commit in auto_vacuum="full" mode,
3848 **   * Creating a table (may require moving an overflow page).
3849 */
3850 static int accessPayload(
3851   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3852   u32 offset,          /* Begin reading this far into payload */
3853   u32 amt,             /* Read this many bytes */
3854   unsigned char *pBuf, /* Write the bytes into this buffer */
3855   int eOp              /* zero to read. non-zero to write. */
3856 ){
3857   unsigned char *aPayload;
3858   int rc = SQLITE_OK;
3859   u32 nKey;
3860   int iIdx = 0;
3861   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3862   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3863 
3864   assert( pPage );
3865   assert( pCur->eState==CURSOR_VALID );
3866   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3867   assert( cursorHoldsMutex(pCur) );
3868 
3869   getCellInfo(pCur);
3870   aPayload = pCur->info.pCell + pCur->info.nHeader;
3871   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3872 
3873   if( NEVER(offset+amt > nKey+pCur->info.nData)
3874    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3875   ){
3876     /* Trying to read or write past the end of the data is an error */
3877     return SQLITE_CORRUPT_BKPT;
3878   }
3879 
3880   /* Check if data must be read/written to/from the btree page itself. */
3881   if( offset<pCur->info.nLocal ){
3882     int a = amt;
3883     if( a+offset>pCur->info.nLocal ){
3884       a = pCur->info.nLocal - offset;
3885     }
3886     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3887     offset = 0;
3888     pBuf += a;
3889     amt -= a;
3890   }else{
3891     offset -= pCur->info.nLocal;
3892   }
3893 
3894   if( rc==SQLITE_OK && amt>0 ){
3895     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3896     Pgno nextPage;
3897 
3898     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3899 
3900 #ifndef SQLITE_OMIT_INCRBLOB
3901     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3902     ** has not been allocated, allocate it now. The array is sized at
3903     ** one entry for each overflow page in the overflow chain. The
3904     ** page number of the first overflow page is stored in aOverflow[0],
3905     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3906     ** (the cache is lazily populated).
3907     */
3908     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3909       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3910       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3911       /* nOvfl is always positive.  If it were zero, fetchPayload would have
3912       ** been used instead of this routine. */
3913       if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3914         rc = SQLITE_NOMEM;
3915       }
3916     }
3917 
3918     /* If the overflow page-list cache has been allocated and the
3919     ** entry for the first required overflow page is valid, skip
3920     ** directly to it.
3921     */
3922     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3923       iIdx = (offset/ovflSize);
3924       nextPage = pCur->aOverflow[iIdx];
3925       offset = (offset%ovflSize);
3926     }
3927 #endif
3928 
3929     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3930 
3931 #ifndef SQLITE_OMIT_INCRBLOB
3932       /* If required, populate the overflow page-list cache. */
3933       if( pCur->aOverflow ){
3934         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3935         pCur->aOverflow[iIdx] = nextPage;
3936       }
3937 #endif
3938 
3939       if( offset>=ovflSize ){
3940         /* The only reason to read this page is to obtain the page
3941         ** number for the next page in the overflow chain. The page
3942         ** data is not required. So first try to lookup the overflow
3943         ** page-list cache, if any, then fall back to the getOverflowPage()
3944         ** function.
3945         */
3946 #ifndef SQLITE_OMIT_INCRBLOB
3947         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3948           nextPage = pCur->aOverflow[iIdx+1];
3949         } else
3950 #endif
3951           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3952         offset -= ovflSize;
3953       }else{
3954         /* Need to read this page properly. It contains some of the
3955         ** range of data that is being read (eOp==0) or written (eOp!=0).
3956         */
3957 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3958         sqlite3_file *fd;
3959 #endif
3960         int a = amt;
3961         if( a + offset > ovflSize ){
3962           a = ovflSize - offset;
3963         }
3964 
3965 #ifdef SQLITE_DIRECT_OVERFLOW_READ
3966         /* If all the following are true:
3967         **
3968         **   1) this is a read operation, and
3969         **   2) data is required from the start of this overflow page, and
3970         **   3) the database is file-backed, and
3971         **   4) there is no open write-transaction, and
3972         **   5) the database is not a WAL database,
3973         **
3974         ** then data can be read directly from the database file into the
3975         ** output buffer, bypassing the page-cache altogether. This speeds
3976         ** up loading large records that span many overflow pages.
3977         */
3978         if( eOp==0                                             /* (1) */
3979          && offset==0                                          /* (2) */
3980          && pBt->inTransaction==TRANS_READ                     /* (4) */
3981          && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
3982          && pBt->pPage1->aData[19]==0x01                       /* (5) */
3983         ){
3984           u8 aSave[4];
3985           u8 *aWrite = &pBuf[-4];
3986           memcpy(aSave, aWrite, 4);
3987           rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
3988           nextPage = get4byte(aWrite);
3989           memcpy(aWrite, aSave, 4);
3990         }else
3991 #endif
3992 
3993         {
3994           DbPage *pDbPage;
3995           rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3996           if( rc==SQLITE_OK ){
3997             aPayload = sqlite3PagerGetData(pDbPage);
3998             nextPage = get4byte(aPayload);
3999             rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
4000             sqlite3PagerUnref(pDbPage);
4001             offset = 0;
4002           }
4003         }
4004         amt -= a;
4005         pBuf += a;
4006       }
4007     }
4008   }
4009 
4010   if( rc==SQLITE_OK && amt>0 ){
4011     return SQLITE_CORRUPT_BKPT;
4012   }
4013   return rc;
4014 }
4015 
4016 /*
4017 ** Read part of the key associated with cursor pCur.  Exactly
4018 ** "amt" bytes will be transfered into pBuf[].  The transfer
4019 ** begins at "offset".
4020 **
4021 ** The caller must ensure that pCur is pointing to a valid row
4022 ** in the table.
4023 **
4024 ** Return SQLITE_OK on success or an error code if anything goes
4025 ** wrong.  An error is returned if "offset+amt" is larger than
4026 ** the available payload.
4027 */
4028 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4029   assert( cursorHoldsMutex(pCur) );
4030   assert( pCur->eState==CURSOR_VALID );
4031   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4032   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4033   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
4034 }
4035 
4036 /*
4037 ** Read part of the data associated with cursor pCur.  Exactly
4038 ** "amt" bytes will be transfered into pBuf[].  The transfer
4039 ** begins at "offset".
4040 **
4041 ** Return SQLITE_OK on success or an error code if anything goes
4042 ** wrong.  An error is returned if "offset+amt" is larger than
4043 ** the available payload.
4044 */
4045 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
4046   int rc;
4047 
4048 #ifndef SQLITE_OMIT_INCRBLOB
4049   if ( pCur->eState==CURSOR_INVALID ){
4050     return SQLITE_ABORT;
4051   }
4052 #endif
4053 
4054   assert( cursorHoldsMutex(pCur) );
4055   rc = restoreCursorPosition(pCur);
4056   if( rc==SQLITE_OK ){
4057     assert( pCur->eState==CURSOR_VALID );
4058     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
4059     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4060     rc = accessPayload(pCur, offset, amt, pBuf, 0);
4061   }
4062   return rc;
4063 }
4064 
4065 /*
4066 ** Return a pointer to payload information from the entry that the
4067 ** pCur cursor is pointing to.  The pointer is to the beginning of
4068 ** the key if skipKey==0 and it points to the beginning of data if
4069 ** skipKey==1.  The number of bytes of available key/data is written
4070 ** into *pAmt.  If *pAmt==0, then the value returned will not be
4071 ** a valid pointer.
4072 **
4073 ** This routine is an optimization.  It is common for the entire key
4074 ** and data to fit on the local page and for there to be no overflow
4075 ** pages.  When that is so, this routine can be used to access the
4076 ** key and data without making a copy.  If the key and/or data spills
4077 ** onto overflow pages, then accessPayload() must be used to reassemble
4078 ** the key/data and copy it into a preallocated buffer.
4079 **
4080 ** The pointer returned by this routine looks directly into the cached
4081 ** page of the database.  The data might change or move the next time
4082 ** any btree routine is called.
4083 */
4084 static const unsigned char *fetchPayload(
4085   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4086   int *pAmt,           /* Write the number of available bytes here */
4087   int skipKey          /* read beginning at data if this is true */
4088 ){
4089   unsigned char *aPayload;
4090   MemPage *pPage;
4091   u32 nKey;
4092   u32 nLocal;
4093 
4094   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4095   assert( pCur->eState==CURSOR_VALID );
4096   assert( cursorHoldsMutex(pCur) );
4097   pPage = pCur->apPage[pCur->iPage];
4098   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4099   if( NEVER(pCur->info.nSize==0) ){
4100     btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
4101                    &pCur->info);
4102   }
4103   aPayload = pCur->info.pCell;
4104   aPayload += pCur->info.nHeader;
4105   if( pPage->intKey ){
4106     nKey = 0;
4107   }else{
4108     nKey = (int)pCur->info.nKey;
4109   }
4110   if( skipKey ){
4111     aPayload += nKey;
4112     nLocal = pCur->info.nLocal - nKey;
4113   }else{
4114     nLocal = pCur->info.nLocal;
4115     assert( nLocal<=nKey );
4116   }
4117   *pAmt = nLocal;
4118   return aPayload;
4119 }
4120 
4121 
4122 /*
4123 ** For the entry that cursor pCur is point to, return as
4124 ** many bytes of the key or data as are available on the local
4125 ** b-tree page.  Write the number of available bytes into *pAmt.
4126 **
4127 ** The pointer returned is ephemeral.  The key/data may move
4128 ** or be destroyed on the next call to any Btree routine,
4129 ** including calls from other threads against the same cache.
4130 ** Hence, a mutex on the BtShared should be held prior to calling
4131 ** this routine.
4132 **
4133 ** These routines is used to get quick access to key and data
4134 ** in the common case where no overflow pages are used.
4135 */
4136 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
4137   const void *p = 0;
4138   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4139   assert( cursorHoldsMutex(pCur) );
4140   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4141     p = (const void*)fetchPayload(pCur, pAmt, 0);
4142   }
4143   return p;
4144 }
4145 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
4146   const void *p = 0;
4147   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4148   assert( cursorHoldsMutex(pCur) );
4149   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4150     p = (const void*)fetchPayload(pCur, pAmt, 1);
4151   }
4152   return p;
4153 }
4154 
4155 
4156 /*
4157 ** Move the cursor down to a new child page.  The newPgno argument is the
4158 ** page number of the child page to move to.
4159 **
4160 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4161 ** the new child page does not match the flags field of the parent (i.e.
4162 ** if an intkey page appears to be the parent of a non-intkey page, or
4163 ** vice-versa).
4164 */
4165 static int moveToChild(BtCursor *pCur, u32 newPgno){
4166   int rc;
4167   int i = pCur->iPage;
4168   MemPage *pNewPage;
4169   BtShared *pBt = pCur->pBt;
4170 
4171   assert( cursorHoldsMutex(pCur) );
4172   assert( pCur->eState==CURSOR_VALID );
4173   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4174   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4175     return SQLITE_CORRUPT_BKPT;
4176   }
4177   rc = getAndInitPage(pBt, newPgno, &pNewPage);
4178   if( rc ) return rc;
4179   pCur->apPage[i+1] = pNewPage;
4180   pCur->aiIdx[i+1] = 0;
4181   pCur->iPage++;
4182 
4183   pCur->info.nSize = 0;
4184   pCur->validNKey = 0;
4185   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4186     return SQLITE_CORRUPT_BKPT;
4187   }
4188   return SQLITE_OK;
4189 }
4190 
#if 0
/*
** Debugging aid (currently compiled out).  pParent is an internal
** (non-leaf) tree page.  Assert that page number iChild is the left-child
** of the iIdx'th cell of pParent or, when iIdx equals the number of cells
** on pParent, that iChild is the right-child of the page.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  assert( iIdx<=pParent->nCell );
  if( iIdx<pParent->nCell ){
    /* An ordinary cell: its first four bytes hold the left-child pointer */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* One past the last cell: check the right-child pointer in the header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
4210 
4211 /*
4212 ** Move the cursor up to the parent page.
4213 **
4214 ** pCur->idx is set to the cell index that contains the pointer
4215 ** to the page we are coming from.  If we are coming from the
4216 ** right-most child page then pCur->idx is set to one more than
4217 ** the largest cell index.
4218 */
4219 static void moveToParent(BtCursor *pCur){
4220   assert( cursorHoldsMutex(pCur) );
4221   assert( pCur->eState==CURSOR_VALID );
4222   assert( pCur->iPage>0 );
4223   assert( pCur->apPage[pCur->iPage] );
4224 
4225   /* UPDATE: It is actually possible for the condition tested by the assert
4226   ** below to be untrue if the database file is corrupt. This can occur if
4227   ** one cursor has modified page pParent while a reference to it is held
4228   ** by a second cursor. Which can only happen if a single page is linked
4229   ** into more than one b-tree structure in a corrupt database.  */
4230 #if 0
4231   assertParentIndex(
4232     pCur->apPage[pCur->iPage-1],
4233     pCur->aiIdx[pCur->iPage-1],
4234     pCur->apPage[pCur->iPage]->pgno
4235   );
4236 #endif
4237   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
4238 
4239   releasePage(pCur->apPage[pCur->iPage]);
4240   pCur->iPage--;
4241   pCur->info.nSize = 0;
4242   pCur->validNKey = 0;
4243 }
4244 
4245 /*
4246 ** Move the cursor to point to the root page of its b-tree structure.
4247 **
4248 ** If the table has a virtual root page, then the cursor is moved to point
4249 ** to the virtual root page instead of the actual root page. A table has a
4250 ** virtual root page when the actual root page contains no cells and a
4251 ** single child page. This can only happen with the table rooted at page 1.
4252 **
4253 ** If the b-tree structure is empty, the cursor state is set to
4254 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4255 ** cell located on the root (or virtual root) page and the cursor state
4256 ** is set to CURSOR_VALID.
4257 **
4258 ** If this function returns successfully, it may be assumed that the
4259 ** page-header flags indicate that the [virtual] root-page is the expected
4260 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4261 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4262 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4263 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4264 ** b-tree).
4265 */
4266 static int moveToRoot(BtCursor *pCur){
4267   MemPage *pRoot;
4268   int rc = SQLITE_OK;
4269   Btree *p = pCur->pBtree;
4270   BtShared *pBt = p->pBt;
4271 
4272   assert( cursorHoldsMutex(pCur) );
4273   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4274   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4275   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4276   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4277     if( pCur->eState==CURSOR_FAULT ){
4278       assert( pCur->skipNext!=SQLITE_OK );
4279       return pCur->skipNext;
4280     }
4281     sqlite3BtreeClearCursor(pCur);
4282   }
4283 
4284   if( pCur->iPage>=0 ){
4285     int i;
4286     for(i=1; i<=pCur->iPage; i++){
4287       releasePage(pCur->apPage[i]);
4288     }
4289     pCur->iPage = 0;
4290   }else if( pCur->pgnoRoot==0 ){
4291     pCur->eState = CURSOR_INVALID;
4292     return SQLITE_OK;
4293   }else{
4294     rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4295     if( rc!=SQLITE_OK ){
4296       pCur->eState = CURSOR_INVALID;
4297       return rc;
4298     }
4299     pCur->iPage = 0;
4300 
4301     /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4302     ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4303     ** NULL, the caller expects a table b-tree. If this is not the case,
4304     ** return an SQLITE_CORRUPT error.  */
4305     assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4306     if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4307       return SQLITE_CORRUPT_BKPT;
4308     }
4309   }
4310 
4311   /* Assert that the root page is of the correct type. This must be the
4312   ** case as the call to this function that loaded the root-page (either
4313   ** this call or a previous invocation) would have detected corruption
4314   ** if the assumption were not true, and it is not possible for the flags
4315   ** byte to have been modified while this cursor is holding a reference
4316   ** to the page.  */
4317   pRoot = pCur->apPage[0];
4318   assert( pRoot->pgno==pCur->pgnoRoot );
4319   assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4320 
4321   pCur->aiIdx[0] = 0;
4322   pCur->info.nSize = 0;
4323   pCur->atLast = 0;
4324   pCur->validNKey = 0;
4325 
4326   if( pRoot->nCell==0 && !pRoot->leaf ){
4327     Pgno subpage;
4328     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4329     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4330     pCur->eState = CURSOR_VALID;
4331     rc = moveToChild(pCur, subpage);
4332   }else{
4333     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4334   }
4335   return rc;
4336 }
4337 
4338 /*
4339 ** Move the cursor down to the left-most leaf entry beneath the
4340 ** entry to which it is currently pointing.
4341 **
4342 ** The left-most leaf is the one with the smallest key - the first
4343 ** in ascending order.
4344 */
4345 static int moveToLeftmost(BtCursor *pCur){
4346   Pgno pgno;
4347   int rc = SQLITE_OK;
4348   MemPage *pPage;
4349 
4350   assert( cursorHoldsMutex(pCur) );
4351   assert( pCur->eState==CURSOR_VALID );
4352   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4353     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4354     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4355     rc = moveToChild(pCur, pgno);
4356   }
4357   return rc;
4358 }
4359 
4360 /*
4361 ** Move the cursor down to the right-most leaf entry beneath the
4362 ** page to which it is currently pointing.  Notice the difference
4363 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4364 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4365 ** finds the right-most entry beneath the *page*.
4366 **
4367 ** The right-most entry is the one with the largest key - the last
4368 ** key in ascending order.
4369 */
4370 static int moveToRightmost(BtCursor *pCur){
4371   Pgno pgno;
4372   int rc = SQLITE_OK;
4373   MemPage *pPage = 0;
4374 
4375   assert( cursorHoldsMutex(pCur) );
4376   assert( pCur->eState==CURSOR_VALID );
4377   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4378     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4379     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4380     rc = moveToChild(pCur, pgno);
4381   }
4382   if( rc==SQLITE_OK ){
4383     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4384     pCur->info.nSize = 0;
4385     pCur->validNKey = 0;
4386   }
4387   return rc;
4388 }
4389 
4390 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4391 ** on success.  Set *pRes to 0 if the cursor actually points to something
4392 ** or set *pRes to 1 if the table is empty.
4393 */
4394 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4395   int rc;
4396 
4397   assert( cursorHoldsMutex(pCur) );
4398   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4399   rc = moveToRoot(pCur);
4400   if( rc==SQLITE_OK ){
4401     if( pCur->eState==CURSOR_INVALID ){
4402       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4403       *pRes = 1;
4404     }else{
4405       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4406       *pRes = 0;
4407       rc = moveToLeftmost(pCur);
4408     }
4409   }
4410   return rc;
4411 }
4412 
4413 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4414 ** on success.  Set *pRes to 0 if the cursor actually points to something
4415 ** or set *pRes to 1 if the table is empty.
4416 */
4417 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4418   int rc;
4419 
4420   assert( cursorHoldsMutex(pCur) );
4421   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4422 
4423   /* If the cursor already points to the last entry, this is a no-op. */
4424   if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4425 #ifdef SQLITE_DEBUG
4426     /* This block serves to assert() that the cursor really does point
4427     ** to the last entry in the b-tree. */
4428     int ii;
4429     for(ii=0; ii<pCur->iPage; ii++){
4430       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4431     }
4432     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4433     assert( pCur->apPage[pCur->iPage]->leaf );
4434 #endif
4435     return SQLITE_OK;
4436   }
4437 
4438   rc = moveToRoot(pCur);
4439   if( rc==SQLITE_OK ){
4440     if( CURSOR_INVALID==pCur->eState ){
4441       assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4442       *pRes = 1;
4443     }else{
4444       assert( pCur->eState==CURSOR_VALID );
4445       *pRes = 0;
4446       rc = moveToRightmost(pCur);
4447       pCur->atLast = rc==SQLITE_OK ?1:0;
4448     }
4449   }
4450   return rc;
4451 }
4452 
4453 /* Move the cursor so that it points to an entry near the key
4454 ** specified by pIdxKey or intKey.   Return a success code.
4455 **
4456 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4457 ** must be NULL.  For index tables, pIdxKey is used and intKey
4458 ** is ignored.
4459 **
4460 ** If an exact match is not found, then the cursor is always
4461 ** left pointing at a leaf page which would hold the entry if it
4462 ** were present.  The cursor might point to an entry that comes
4463 ** before or after the key.
4464 **
4465 ** An integer is written into *pRes which is the result of
4466 ** comparing the key with the entry to which the cursor is
4467 ** pointing.  The meaning of the integer written into
4468 ** *pRes is as follows:
4469 **
4470 **     *pRes<0      The cursor is left pointing at an entry that
4471 **                  is smaller than intKey/pIdxKey or if the table is empty
4472 **                  and the cursor is therefore left point to nothing.
4473 **
4474 **     *pRes==0     The cursor is left pointing at an entry that
4475 **                  exactly matches intKey/pIdxKey.
4476 **
4477 **     *pRes>0      The cursor is left pointing at an entry that
4478 **                  is larger than intKey/pIdxKey.
4479 **
4480 */
4481 int sqlite3BtreeMovetoUnpacked(
4482   BtCursor *pCur,          /* The cursor to be moved */
4483   UnpackedRecord *pIdxKey, /* Unpacked index key */
4484   i64 intKey,              /* The table key */
4485   int biasRight,           /* If true, bias the search to the high end */
4486   int *pRes                /* Write search results here */
4487 ){
4488   int rc;
4489 
4490   assert( cursorHoldsMutex(pCur) );
4491   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4492   assert( pRes );
4493   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4494 
4495   /* If the cursor is already positioned at the point we are trying
4496   ** to move to, then just return without doing any work */
4497   if( pCur->eState==CURSOR_VALID && pCur->validNKey
4498    && pCur->apPage[0]->intKey
4499   ){
4500     if( pCur->info.nKey==intKey ){
4501       *pRes = 0;
4502       return SQLITE_OK;
4503     }
4504     if( pCur->atLast && pCur->info.nKey<intKey ){
4505       *pRes = -1;
4506       return SQLITE_OK;
4507     }
4508   }
4509 
4510   rc = moveToRoot(pCur);
4511   if( rc ){
4512     return rc;
4513   }
4514   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
4515   assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
4516   assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
4517   if( pCur->eState==CURSOR_INVALID ){
4518     *pRes = -1;
4519     assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
4520     return SQLITE_OK;
4521   }
4522   assert( pCur->apPage[0]->intKey || pIdxKey );
4523   for(;;){
4524     int lwr, upr, idx;
4525     Pgno chldPg;
4526     MemPage *pPage = pCur->apPage[pCur->iPage];
4527     int c;
4528 
4529     /* pPage->nCell must be greater than zero. If this is the root-page
4530     ** the cursor would have been INVALID above and this for(;;) loop
4531     ** not run. If this is not the root-page, then the moveToChild() routine
4532     ** would have already detected db corruption. Similarly, pPage must
4533     ** be the right kind (index or table) of b-tree page. Otherwise
4534     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
4535     assert( pPage->nCell>0 );
4536     assert( pPage->intKey==(pIdxKey==0) );
4537     lwr = 0;
4538     upr = pPage->nCell-1;
4539     if( biasRight ){
4540       pCur->aiIdx[pCur->iPage] = (u16)(idx = upr);
4541     }else{
4542       pCur->aiIdx[pCur->iPage] = (u16)(idx = (upr+lwr)/2);
4543     }
4544     for(;;){
4545       u8 *pCell;                          /* Pointer to current cell in pPage */
4546 
4547       assert( idx==pCur->aiIdx[pCur->iPage] );
4548       pCur->info.nSize = 0;
4549       pCell = findCell(pPage, idx) + pPage->childPtrSize;
4550       if( pPage->intKey ){
4551         i64 nCellKey;
4552         if( pPage->hasData ){
4553           u32 dummy;
4554           pCell += getVarint32(pCell, dummy);
4555         }
4556         getVarint(pCell, (u64*)&nCellKey);
4557         if( nCellKey==intKey ){
4558           c = 0;
4559         }else if( nCellKey<intKey ){
4560           c = -1;
4561         }else{
4562           assert( nCellKey>intKey );
4563           c = +1;
4564         }
4565         pCur->validNKey = 1;
4566         pCur->info.nKey = nCellKey;
4567       }else{
4568         /* The maximum supported page-size is 65536 bytes. This means that
4569         ** the maximum number of record bytes stored on an index B-Tree
4570         ** page is less than 16384 bytes and may be stored as a 2-byte
4571         ** varint. This information is used to attempt to avoid parsing
4572         ** the entire cell by checking for the cases where the record is
4573         ** stored entirely within the b-tree page by inspecting the first
4574         ** 2 bytes of the cell.
4575         */
4576         int nCell = pCell[0];
4577         if( nCell<=pPage->max1bytePayload
4578          /* && (pCell+nCell)<pPage->aDataEnd */
4579         ){
4580           /* This branch runs if the record-size field of the cell is a
4581           ** single byte varint and the record fits entirely on the main
4582           ** b-tree page.  */
4583           testcase( pCell+nCell+1==pPage->aDataEnd );
4584           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4585         }else if( !(pCell[1] & 0x80)
4586           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4587           /* && (pCell+nCell+2)<=pPage->aDataEnd */
4588         ){
4589           /* The record-size field is a 2 byte varint and the record
4590           ** fits entirely on the main b-tree page.  */
4591           testcase( pCell+nCell+2==pPage->aDataEnd );
4592           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4593         }else{
4594           /* The record flows over onto one or more overflow pages. In
4595           ** this case the whole cell needs to be parsed, a buffer allocated
4596           ** and accessPayload() used to retrieve the record into the
4597           ** buffer before VdbeRecordCompare() can be called. */
4598           void *pCellKey;
4599           u8 * const pCellBody = pCell - pPage->childPtrSize;
4600           btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4601           nCell = (int)pCur->info.nKey;
4602           pCellKey = sqlite3Malloc( nCell );
4603           if( pCellKey==0 ){
4604             rc = SQLITE_NOMEM;
4605             goto moveto_finish;
4606           }
4607           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4608           if( rc ){
4609             sqlite3_free(pCellKey);
4610             goto moveto_finish;
4611           }
4612           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4613           sqlite3_free(pCellKey);
4614         }
4615       }
4616       if( c==0 ){
4617         if( pPage->intKey && !pPage->leaf ){
4618           lwr = idx;
4619           break;
4620         }else{
4621           *pRes = 0;
4622           rc = SQLITE_OK;
4623           goto moveto_finish;
4624         }
4625       }
4626       if( c<0 ){
4627         lwr = idx+1;
4628       }else{
4629         upr = idx-1;
4630       }
4631       if( lwr>upr ){
4632         break;
4633       }
4634       pCur->aiIdx[pCur->iPage] = (u16)(idx = (lwr+upr)/2);
4635     }
4636     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
4637     assert( pPage->isInit );
4638     if( pPage->leaf ){
4639       chldPg = 0;
4640     }else if( lwr>=pPage->nCell ){
4641       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4642     }else{
4643       chldPg = get4byte(findCell(pPage, lwr));
4644     }
4645     if( chldPg==0 ){
4646       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4647       *pRes = c;
4648       rc = SQLITE_OK;
4649       goto moveto_finish;
4650     }
4651     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4652     pCur->info.nSize = 0;
4653     pCur->validNKey = 0;
4654     rc = moveToChild(pCur, chldPg);
4655     if( rc ) goto moveto_finish;
4656   }
4657 moveto_finish:
4658   return rc;
4659 }
4660 
4661 
4662 /*
4663 ** Return TRUE if the cursor is not pointing at an entry of the table.
4664 **
4665 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4666 ** past the last entry in the table or sqlite3BtreePrev() moves past
4667 ** the first entry.  TRUE is also returned if the table is empty.
4668 */
4669 int sqlite3BtreeEof(BtCursor *pCur){
4670   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4671   ** have been deleted? This API will need to change to return an error code
4672   ** as well as the boolean result value.
4673   */
4674   return (CURSOR_VALID!=pCur->eState);
4675 }
4676 
4677 /*
4678 ** Advance the cursor to the next entry in the database.  If
4679 ** successful then set *pRes=0.  If the cursor
4680 ** was already pointing to the last entry in the database before
4681 ** this routine was called, then set *pRes=1.
4682 */
4683 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4684   int rc;
4685   int idx;
4686   MemPage *pPage;
4687 
4688   assert( cursorHoldsMutex(pCur) );
4689   rc = restoreCursorPosition(pCur);
4690   if( rc!=SQLITE_OK ){
4691     return rc;
4692   }
4693   assert( pRes!=0 );
4694   if( CURSOR_INVALID==pCur->eState ){
4695     *pRes = 1;
4696     return SQLITE_OK;
4697   }
4698   if( pCur->skipNext>0 ){
4699     pCur->skipNext = 0;
4700     *pRes = 0;
4701     return SQLITE_OK;
4702   }
4703   pCur->skipNext = 0;
4704 
4705   pPage = pCur->apPage[pCur->iPage];
4706   idx = ++pCur->aiIdx[pCur->iPage];
4707   assert( pPage->isInit );
4708 
4709   /* If the database file is corrupt, it is possible for the value of idx
4710   ** to be invalid here. This can only occur if a second cursor modifies
4711   ** the page while cursor pCur is holding a reference to it. Which can
4712   ** only happen if the database is corrupt in such a way as to link the
4713   ** page into more than one b-tree structure. */
4714   testcase( idx>pPage->nCell );
4715 
4716   pCur->info.nSize = 0;
4717   pCur->validNKey = 0;
4718   if( idx>=pPage->nCell ){
4719     if( !pPage->leaf ){
4720       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4721       if( rc ) return rc;
4722       rc = moveToLeftmost(pCur);
4723       *pRes = 0;
4724       return rc;
4725     }
4726     do{
4727       if( pCur->iPage==0 ){
4728         *pRes = 1;
4729         pCur->eState = CURSOR_INVALID;
4730         return SQLITE_OK;
4731       }
4732       moveToParent(pCur);
4733       pPage = pCur->apPage[pCur->iPage];
4734     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4735     *pRes = 0;
4736     if( pPage->intKey ){
4737       rc = sqlite3BtreeNext(pCur, pRes);
4738     }else{
4739       rc = SQLITE_OK;
4740     }
4741     return rc;
4742   }
4743   *pRes = 0;
4744   if( pPage->leaf ){
4745     return SQLITE_OK;
4746   }
4747   rc = moveToLeftmost(pCur);
4748   return rc;
4749 }
4750 
4751 
4752 /*
4753 ** Step the cursor to the back to the previous entry in the database.  If
4754 ** successful then set *pRes=0.  If the cursor
4755 ** was already pointing to the first entry in the database before
4756 ** this routine was called, then set *pRes=1.
4757 */
4758 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4759   int rc;
4760   MemPage *pPage;
4761 
4762   assert( cursorHoldsMutex(pCur) );
4763   rc = restoreCursorPosition(pCur);
4764   if( rc!=SQLITE_OK ){
4765     return rc;
4766   }
4767   pCur->atLast = 0;
4768   if( CURSOR_INVALID==pCur->eState ){
4769     *pRes = 1;
4770     return SQLITE_OK;
4771   }
4772   if( pCur->skipNext<0 ){
4773     pCur->skipNext = 0;
4774     *pRes = 0;
4775     return SQLITE_OK;
4776   }
4777   pCur->skipNext = 0;
4778 
4779   pPage = pCur->apPage[pCur->iPage];
4780   assert( pPage->isInit );
4781   if( !pPage->leaf ){
4782     int idx = pCur->aiIdx[pCur->iPage];
4783     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4784     if( rc ){
4785       return rc;
4786     }
4787     rc = moveToRightmost(pCur);
4788   }else{
4789     while( pCur->aiIdx[pCur->iPage]==0 ){
4790       if( pCur->iPage==0 ){
4791         pCur->eState = CURSOR_INVALID;
4792         *pRes = 1;
4793         return SQLITE_OK;
4794       }
4795       moveToParent(pCur);
4796     }
4797     pCur->info.nSize = 0;
4798     pCur->validNKey = 0;
4799 
4800     pCur->aiIdx[pCur->iPage]--;
4801     pPage = pCur->apPage[pCur->iPage];
4802     if( pPage->intKey && !pPage->leaf ){
4803       rc = sqlite3BtreePrevious(pCur, pRes);
4804     }else{
4805       rc = SQLITE_OK;
4806     }
4807   }
4808   *pRes = 0;
4809   return rc;
4810 }
4811 
4812 /*
4813 ** Allocate a new page from the database file.
4814 **
4815 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4816 ** has already been called on the new page.)  The new page has also
4817 ** been referenced and the calling routine is responsible for calling
4818 ** sqlite3PagerUnref() on the new page when it is done.
4819 **
4820 ** SQLITE_OK is returned on success.  Any other return value indicates
4821 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4822 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4823 **
4824 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4825 ** locate a page close to the page number "nearby".  This can be used in an
4826 ** attempt to keep related pages close to each other in the database file,
4827 ** which in turn can make database access faster.
4828 **
4829 ** If the "exact" parameter is not 0, and the page-number nearby exists
4830 ** anywhere on the free-list, then it is guaranteed to be returned. This
4831 ** is only used by auto-vacuum databases when allocating a new table.
4832 */
4833 static int allocateBtreePage(
4834   BtShared *pBt,
4835   MemPage **ppPage,
4836   Pgno *pPgno,
4837   Pgno nearby,
4838   u8 exact
4839 ){
4840   MemPage *pPage1;
4841   int rc;
4842   u32 n;     /* Number of pages on the freelist */
4843   u32 k;     /* Number of leaves on the trunk of the freelist */
4844   MemPage *pTrunk = 0;
4845   MemPage *pPrevTrunk = 0;
4846   Pgno mxPage;     /* Total size of the database file */
4847 
4848   assert( sqlite3_mutex_held(pBt->mutex) );
4849   pPage1 = pBt->pPage1;
4850   mxPage = btreePagecount(pBt);
4851   n = get4byte(&pPage1->aData[36]);
4852   testcase( n==mxPage-1 );
4853   if( n>=mxPage ){
4854     return SQLITE_CORRUPT_BKPT;
4855   }
4856   if( n>0 ){
4857     /* There are pages on the freelist.  Reuse one of those pages. */
4858     Pgno iTrunk;
4859     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4860 
4861     /* If the 'exact' parameter was true and a query of the pointer-map
4862     ** shows that the page 'nearby' is somewhere on the free-list, then
4863     ** the entire-list will be searched for that page.
4864     */
4865 #ifndef SQLITE_OMIT_AUTOVACUUM
4866     if( exact && nearby<=mxPage ){
4867       u8 eType;
4868       assert( nearby>0 );
4869       assert( pBt->autoVacuum );
4870       rc = ptrmapGet(pBt, nearby, &eType, 0);
4871       if( rc ) return rc;
4872       if( eType==PTRMAP_FREEPAGE ){
4873         searchList = 1;
4874       }
4875       *pPgno = nearby;
4876     }
4877 #endif
4878 
4879     /* Decrement the free-list count by 1. Set iTrunk to the index of the
4880     ** first free-list trunk page. iPrevTrunk is initially 1.
4881     */
4882     rc = sqlite3PagerWrite(pPage1->pDbPage);
4883     if( rc ) return rc;
4884     put4byte(&pPage1->aData[36], n-1);
4885 
4886     /* The code within this loop is run only once if the 'searchList' variable
4887     ** is not true. Otherwise, it runs once for each trunk-page on the
4888     ** free-list until the page 'nearby' is located.
4889     */
4890     do {
4891       pPrevTrunk = pTrunk;
4892       if( pPrevTrunk ){
4893         iTrunk = get4byte(&pPrevTrunk->aData[0]);
4894       }else{
4895         iTrunk = get4byte(&pPage1->aData[32]);
4896       }
4897       testcase( iTrunk==mxPage );
4898       if( iTrunk>mxPage ){
4899         rc = SQLITE_CORRUPT_BKPT;
4900       }else{
4901         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4902       }
4903       if( rc ){
4904         pTrunk = 0;
4905         goto end_allocate_page;
4906       }
4907       assert( pTrunk!=0 );
4908       assert( pTrunk->aData!=0 );
4909 
4910       k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
4911       if( k==0 && !searchList ){
4912         /* The trunk has no leaves and the list is not being searched.
4913         ** So extract the trunk page itself and use it as the newly
4914         ** allocated page */
4915         assert( pPrevTrunk==0 );
4916         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4917         if( rc ){
4918           goto end_allocate_page;
4919         }
4920         *pPgno = iTrunk;
4921         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4922         *ppPage = pTrunk;
4923         pTrunk = 0;
4924         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4925       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4926         /* Value of k is out of range.  Database corruption */
4927         rc = SQLITE_CORRUPT_BKPT;
4928         goto end_allocate_page;
4929 #ifndef SQLITE_OMIT_AUTOVACUUM
4930       }else if( searchList && nearby==iTrunk ){
4931         /* The list is being searched and this trunk page is the page
4932         ** to allocate, regardless of whether it has leaves.
4933         */
4934         assert( *pPgno==iTrunk );
4935         *ppPage = pTrunk;
4936         searchList = 0;
4937         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4938         if( rc ){
4939           goto end_allocate_page;
4940         }
4941         if( k==0 ){
4942           if( !pPrevTrunk ){
4943             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4944           }else{
4945             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4946             if( rc!=SQLITE_OK ){
4947               goto end_allocate_page;
4948             }
4949             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4950           }
4951         }else{
4952           /* The trunk page is required by the caller but it contains
4953           ** pointers to free-list leaves. The first leaf becomes a trunk
4954           ** page in this case.
4955           */
4956           MemPage *pNewTrunk;
4957           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4958           if( iNewTrunk>mxPage ){
4959             rc = SQLITE_CORRUPT_BKPT;
4960             goto end_allocate_page;
4961           }
4962           testcase( iNewTrunk==mxPage );
4963           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4964           if( rc!=SQLITE_OK ){
4965             goto end_allocate_page;
4966           }
4967           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4968           if( rc!=SQLITE_OK ){
4969             releasePage(pNewTrunk);
4970             goto end_allocate_page;
4971           }
4972           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4973           put4byte(&pNewTrunk->aData[4], k-1);
4974           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4975           releasePage(pNewTrunk);
4976           if( !pPrevTrunk ){
4977             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4978             put4byte(&pPage1->aData[32], iNewTrunk);
4979           }else{
4980             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4981             if( rc ){
4982               goto end_allocate_page;
4983             }
4984             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4985           }
4986         }
4987         pTrunk = 0;
4988         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4989 #endif
4990       }else if( k>0 ){
4991         /* Extract a leaf from the trunk */
4992         u32 closest;
4993         Pgno iPage;
4994         unsigned char *aData = pTrunk->aData;
4995         if( nearby>0 ){
4996           u32 i;
4997           int dist;
4998           closest = 0;
4999           dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
5000           for(i=1; i<k; i++){
5001             int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
5002             if( d2<dist ){
5003               closest = i;
5004               dist = d2;
5005             }
5006           }
5007         }else{
5008           closest = 0;
5009         }
5010 
5011         iPage = get4byte(&aData[8+closest*4]);
5012         testcase( iPage==mxPage );
5013         if( iPage>mxPage ){
5014           rc = SQLITE_CORRUPT_BKPT;
5015           goto end_allocate_page;
5016         }
5017         testcase( iPage==mxPage );
5018         if( !searchList || iPage==nearby ){
5019           int noContent;
5020           *pPgno = iPage;
5021           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
5022                  ": %d more free pages\n",
5023                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
5024           rc = sqlite3PagerWrite(pTrunk->pDbPage);
5025           if( rc ) goto end_allocate_page;
5026           if( closest<k-1 ){
5027             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
5028           }
5029           put4byte(&aData[4], k-1);
5030           noContent = !btreeGetHasContent(pBt, *pPgno);
5031           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
5032           if( rc==SQLITE_OK ){
5033             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5034             if( rc!=SQLITE_OK ){
5035               releasePage(*ppPage);
5036             }
5037           }
5038           searchList = 0;
5039         }
5040       }
5041       releasePage(pPrevTrunk);
5042       pPrevTrunk = 0;
5043     }while( searchList );
5044   }else{
5045     /* There are no pages on the freelist, so create a new page at the
5046     ** end of the file */
5047     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
5048     if( rc ) return rc;
5049     pBt->nPage++;
5050     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
5051 
5052 #ifndef SQLITE_OMIT_AUTOVACUUM
5053     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
5054       /* If *pPgno refers to a pointer-map page, allocate two new pages
5055       ** at the end of the file instead of one. The first allocated page
5056       ** becomes a new pointer-map page, the second is used by the caller.
5057       */
5058       MemPage *pPg = 0;
5059       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
5060       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
5061       rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1);
5062       if( rc==SQLITE_OK ){
5063         rc = sqlite3PagerWrite(pPg->pDbPage);
5064         releasePage(pPg);
5065       }
5066       if( rc ) return rc;
5067       pBt->nPage++;
5068       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
5069     }
5070 #endif
5071     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
5072     *pPgno = pBt->nPage;
5073 
5074     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5075     rc = btreeGetPage(pBt, *pPgno, ppPage, 1);
5076     if( rc ) return rc;
5077     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
5078     if( rc!=SQLITE_OK ){
5079       releasePage(*ppPage);
5080     }
5081     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
5082   }
5083 
5084   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
5085 
5086 end_allocate_page:
5087   releasePage(pTrunk);
5088   releasePage(pPrevTrunk);
5089   if( rc==SQLITE_OK ){
5090     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
5091       releasePage(*ppPage);
5092       return SQLITE_CORRUPT_BKPT;
5093     }
5094     (*ppPage)->isInit = 0;
5095   }else{
5096     *ppPage = 0;
5097   }
5098   assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
5099   return rc;
5100 }
5101 
5102 /*
5103 ** This function is used to add page iPage to the database file free-list.
5104 ** It is assumed that the page is not already a part of the free-list.
5105 **
5106 ** The value passed as the second argument to this function is optional.
5107 ** If the caller happens to have a pointer to the MemPage object
5108 ** corresponding to page iPage handy, it may pass it as the second value.
5109 ** Otherwise, it may pass NULL.
5110 **
5111 ** If a pointer to a MemPage object is passed as the second argument,
5112 ** its reference count is not altered by this function.
5113 */
5114 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5115   MemPage *pTrunk = 0;                /* Free-list trunk page */
5116   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5117   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5118   MemPage *pPage;                     /* Page being freed. May be NULL. */
5119   int rc;                             /* Return Code */
5120   int nFree;                          /* Initial number of pages on free-list */
5121 
5122   assert( sqlite3_mutex_held(pBt->mutex) );
5123   assert( iPage>1 );
5124   assert( !pMemPage || pMemPage->pgno==iPage );
5125 
5126   if( pMemPage ){
5127     pPage = pMemPage;
5128     sqlite3PagerRef(pPage->pDbPage);
5129   }else{
5130     pPage = btreePageLookup(pBt, iPage);
5131   }
5132 
5133   /* Increment the free page count on pPage1 */
5134   rc = sqlite3PagerWrite(pPage1->pDbPage);
5135   if( rc ) goto freepage_out;
5136   nFree = get4byte(&pPage1->aData[36]);
5137   put4byte(&pPage1->aData[36], nFree+1);
5138 
5139   if( pBt->btsFlags & BTS_SECURE_DELETE ){
5140     /* If the secure_delete option is enabled, then
5141     ** always fully overwrite deleted information with zeros.
5142     */
5143     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5144      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5145     ){
5146       goto freepage_out;
5147     }
5148     memset(pPage->aData, 0, pPage->pBt->pageSize);
5149   }
5150 
5151   /* If the database supports auto-vacuum, write an entry in the pointer-map
5152   ** to indicate that the page is free.
5153   */
5154   if( ISAUTOVACUUM ){
5155     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5156     if( rc ) goto freepage_out;
5157   }
5158 
5159   /* Now manipulate the actual database free-list structure. There are two
5160   ** possibilities. If the free-list is currently empty, or if the first
5161   ** trunk page in the free-list is full, then this page will become a
5162   ** new free-list trunk page. Otherwise, it will become a leaf of the
5163   ** first trunk page in the current free-list. This block tests if it
5164   ** is possible to add the page as a new free-list leaf.
5165   */
5166   if( nFree!=0 ){
5167     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5168 
5169     iTrunk = get4byte(&pPage1->aData[32]);
5170     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5171     if( rc!=SQLITE_OK ){
5172       goto freepage_out;
5173     }
5174 
5175     nLeaf = get4byte(&pTrunk->aData[4]);
5176     assert( pBt->usableSize>32 );
5177     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5178       rc = SQLITE_CORRUPT_BKPT;
5179       goto freepage_out;
5180     }
5181     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5182       /* In this case there is room on the trunk page to insert the page
5183       ** being freed as a new leaf.
5184       **
5185       ** Note that the trunk page is not really full until it contains
5186       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5187       ** coded.  But due to a coding error in versions of SQLite prior to
5188       ** 3.6.0, databases with freelist trunk pages holding more than
5189       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5190       ** to maintain backwards compatibility with older versions of SQLite,
5191       ** we will continue to restrict the number of entries to usableSize/4 - 8
5192       ** for now.  At some point in the future (once everyone has upgraded
5193       ** to 3.6.0 or later) we should consider fixing the conditional above
5194       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5195       */
5196       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5197       if( rc==SQLITE_OK ){
5198         put4byte(&pTrunk->aData[4], nLeaf+1);
5199         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5200         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
5201           sqlite3PagerDontWrite(pPage->pDbPage);
5202         }
5203         rc = btreeSetHasContent(pBt, iPage);
5204       }
5205       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5206       goto freepage_out;
5207     }
5208   }
5209 
5210   /* If control flows to this point, then it was not possible to add the
5211   ** the page being freed as a leaf page of the first trunk in the free-list.
5212   ** Possibly because the free-list is empty, or possibly because the
5213   ** first trunk in the free-list is full. Either way, the page being freed
5214   ** will become the new first trunk page in the free-list.
5215   */
5216   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5217     goto freepage_out;
5218   }
5219   rc = sqlite3PagerWrite(pPage->pDbPage);
5220   if( rc!=SQLITE_OK ){
5221     goto freepage_out;
5222   }
5223   put4byte(pPage->aData, iTrunk);
5224   put4byte(&pPage->aData[4], 0);
5225   put4byte(&pPage1->aData[32], iPage);
5226   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5227 
5228 freepage_out:
5229   if( pPage ){
5230     pPage->isInit = 0;
5231   }
5232   releasePage(pPage);
5233   releasePage(pTrunk);
5234   return rc;
5235 }
5236 static void freePage(MemPage *pPage, int *pRC){
5237   if( (*pRC)==SQLITE_OK ){
5238     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5239   }
5240 }
5241 
5242 /*
5243 ** Free any overflow pages associated with the given Cell.
5244 */
5245 static int clearCell(MemPage *pPage, unsigned char *pCell){
5246   BtShared *pBt = pPage->pBt;
5247   CellInfo info;
5248   Pgno ovflPgno;
5249   int rc;
5250   int nOvfl;
5251   u32 ovflPageSize;
5252 
5253   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5254   btreeParseCellPtr(pPage, pCell, &info);
5255   if( info.iOverflow==0 ){
5256     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5257   }
5258   if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
5259     return SQLITE_CORRUPT;  /* Cell extends past end of page */
5260   }
5261   ovflPgno = get4byte(&pCell[info.iOverflow]);
5262   assert( pBt->usableSize > 4 );
5263   ovflPageSize = pBt->usableSize - 4;
5264   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5265   assert( ovflPgno==0 || nOvfl>0 );
5266   while( nOvfl-- ){
5267     Pgno iNext = 0;
5268     MemPage *pOvfl = 0;
5269     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5270       /* 0 is not a legal page number and page 1 cannot be an
5271       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5272       ** file the database must be corrupt. */
5273       return SQLITE_CORRUPT_BKPT;
5274     }
5275     if( nOvfl ){
5276       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5277       if( rc ) return rc;
5278     }
5279 
5280     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5281      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5282     ){
5283       /* There is no reason any cursor should have an outstanding reference
5284       ** to an overflow page belonging to a cell that is being deleted/updated.
5285       ** So if there exists more than one reference to this page, then it
5286       ** must not really be an overflow page and the database must be corrupt.
5287       ** It is helpful to detect this before calling freePage2(), as
5288       ** freePage2() may zero the page contents if secure-delete mode is
5289       ** enabled. If this 'overflow' page happens to be a page that the
5290       ** caller is iterating through or using in some other way, this
5291       ** can be problematic.
5292       */
5293       rc = SQLITE_CORRUPT_BKPT;
5294     }else{
5295       rc = freePage2(pBt, pOvfl, ovflPgno);
5296     }
5297 
5298     if( pOvfl ){
5299       sqlite3PagerUnref(pOvfl->pDbPage);
5300     }
5301     if( rc ) return rc;
5302     ovflPgno = iNext;
5303   }
5304   return SQLITE_OK;
5305 }
5306 
5307 /*
5308 ** Create the byte sequence used to represent a cell on page pPage
5309 ** and write that byte sequence into pCell[].  Overflow pages are
5310 ** allocated and filled in as necessary.  The calling procedure
5311 ** is responsible for making sure sufficient space has been allocated
5312 ** for pCell[].
5313 **
5314 ** Note that pCell does not necessary need to point to the pPage->aData
5315 ** area.  pCell might point to some temporary storage.  The cell will
5316 ** be constructed in this temporary area then copied into pPage->aData
5317 ** later.
5318 */
5319 static int fillInCell(
5320   MemPage *pPage,                /* The page that contains the cell */
5321   unsigned char *pCell,          /* Complete text of the cell */
5322   const void *pKey, i64 nKey,    /* The key */
5323   const void *pData,int nData,   /* The data */
5324   int nZero,                     /* Extra zero bytes to append to pData */
5325   int *pnSize                    /* Write cell size here */
5326 ){
5327   int nPayload;
5328   const u8 *pSrc;
5329   int nSrc, n, rc;
5330   int spaceLeft;
5331   MemPage *pOvfl = 0;
5332   MemPage *pToRelease = 0;
5333   unsigned char *pPrior;
5334   unsigned char *pPayload;
5335   BtShared *pBt = pPage->pBt;
5336   Pgno pgnoOvfl = 0;
5337   int nHeader;
5338   CellInfo info;
5339 
5340   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5341 
5342   /* pPage is not necessarily writeable since pCell might be auxiliary
5343   ** buffer space that is separate from the pPage buffer area */
5344   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5345             || sqlite3PagerIswriteable(pPage->pDbPage) );
5346 
5347   /* Fill in the header. */
5348   nHeader = 0;
5349   if( !pPage->leaf ){
5350     nHeader += 4;
5351   }
5352   if( pPage->hasData ){
5353     nHeader += putVarint(&pCell[nHeader], nData+nZero);
5354   }else{
5355     nData = nZero = 0;
5356   }
5357   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5358   btreeParseCellPtr(pPage, pCell, &info);
5359   assert( info.nHeader==nHeader );
5360   assert( info.nKey==nKey );
5361   assert( info.nData==(u32)(nData+nZero) );
5362 
5363   /* Fill in the payload */
5364   nPayload = nData + nZero;
5365   if( pPage->intKey ){
5366     pSrc = pData;
5367     nSrc = nData;
5368     nData = 0;
5369   }else{
5370     if( NEVER(nKey>0x7fffffff || pKey==0) ){
5371       return SQLITE_CORRUPT_BKPT;
5372     }
5373     nPayload += (int)nKey;
5374     pSrc = pKey;
5375     nSrc = (int)nKey;
5376   }
5377   *pnSize = info.nSize;
5378   spaceLeft = info.nLocal;
5379   pPayload = &pCell[nHeader];
5380   pPrior = &pCell[info.iOverflow];
5381 
5382   while( nPayload>0 ){
5383     if( spaceLeft==0 ){
5384 #ifndef SQLITE_OMIT_AUTOVACUUM
5385       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5386       if( pBt->autoVacuum ){
5387         do{
5388           pgnoOvfl++;
5389         } while(
5390           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5391         );
5392       }
5393 #endif
5394       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5395 #ifndef SQLITE_OMIT_AUTOVACUUM
5396       /* If the database supports auto-vacuum, and the second or subsequent
5397       ** overflow page is being allocated, add an entry to the pointer-map
5398       ** for that page now.
5399       **
5400       ** If this is the first overflow page, then write a partial entry
5401       ** to the pointer-map. If we write nothing to this pointer-map slot,
5402       ** then the optimistic overflow chain processing in clearCell()
5403       ** may misinterpret the uninitialised values and delete the
5404       ** wrong pages from the database.
5405       */
5406       if( pBt->autoVacuum && rc==SQLITE_OK ){
5407         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5408         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5409         if( rc ){
5410           releasePage(pOvfl);
5411         }
5412       }
5413 #endif
5414       if( rc ){
5415         releasePage(pToRelease);
5416         return rc;
5417       }
5418 
5419       /* If pToRelease is not zero than pPrior points into the data area
5420       ** of pToRelease.  Make sure pToRelease is still writeable. */
5421       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5422 
5423       /* If pPrior is part of the data area of pPage, then make sure pPage
5424       ** is still writeable */
5425       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5426             || sqlite3PagerIswriteable(pPage->pDbPage) );
5427 
5428       put4byte(pPrior, pgnoOvfl);
5429       releasePage(pToRelease);
5430       pToRelease = pOvfl;
5431       pPrior = pOvfl->aData;
5432       put4byte(pPrior, 0);
5433       pPayload = &pOvfl->aData[4];
5434       spaceLeft = pBt->usableSize - 4;
5435     }
5436     n = nPayload;
5437     if( n>spaceLeft ) n = spaceLeft;
5438 
5439     /* If pToRelease is not zero than pPayload points into the data area
5440     ** of pToRelease.  Make sure pToRelease is still writeable. */
5441     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5442 
5443     /* If pPayload is part of the data area of pPage, then make sure pPage
5444     ** is still writeable */
5445     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5446             || sqlite3PagerIswriteable(pPage->pDbPage) );
5447 
5448     if( nSrc>0 ){
5449       if( n>nSrc ) n = nSrc;
5450       assert( pSrc );
5451       memcpy(pPayload, pSrc, n);
5452     }else{
5453       memset(pPayload, 0, n);
5454     }
5455     nPayload -= n;
5456     pPayload += n;
5457     pSrc += n;
5458     nSrc -= n;
5459     spaceLeft -= n;
5460     if( nSrc==0 ){
5461       nSrc = nData;
5462       pSrc = pData;
5463     }
5464   }
5465   releasePage(pToRelease);
5466   return SQLITE_OK;
5467 }
5468 
5469 /*
5470 ** Remove the i-th cell from pPage.  This routine effects pPage only.
5471 ** The cell content is not freed or deallocated.  It is assumed that
5472 ** the cell content has been copied someplace else.  This routine just
5473 ** removes the reference to the cell from pPage.
5474 **
5475 ** "sz" must be the number of bytes in the cell.
5476 */
5477 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5478   u32 pc;         /* Offset to cell content of cell being deleted */
5479   u8 *data;       /* pPage->aData */
5480   u8 *ptr;        /* Used to move bytes around within data[] */
5481   u8 *endPtr;     /* End of loop */
5482   int rc;         /* The return code */
5483   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
5484 
5485   if( *pRC ) return;
5486 
5487   assert( idx>=0 && idx<pPage->nCell );
5488   assert( sz==cellSize(pPage, idx) );
5489   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5490   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5491   data = pPage->aData;
5492   ptr = &pPage->aCellIdx[2*idx];
5493   pc = get2byte(ptr);
5494   hdr = pPage->hdrOffset;
5495   testcase( pc==get2byte(&data[hdr+5]) );
5496   testcase( pc+sz==pPage->pBt->usableSize );
5497   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5498     *pRC = SQLITE_CORRUPT_BKPT;
5499     return;
5500   }
5501   rc = freeSpace(pPage, pc, sz);
5502   if( rc ){
5503     *pRC = rc;
5504     return;
5505   }
5506   endPtr = &pPage->aCellIdx[2*pPage->nCell - 2];
5507   assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 );  /* ptr is always 2-byte aligned */
5508   while( ptr<endPtr ){
5509     *(u16*)ptr = *(u16*)&ptr[2];
5510     ptr += 2;
5511   }
5512   pPage->nCell--;
5513   put2byte(&data[hdr+3], pPage->nCell);
5514   pPage->nFree += 2;
5515 }
5516 
5517 /*
5518 ** Insert a new cell on pPage at cell index "i".  pCell points to the
5519 ** content of the cell.
5520 **
5521 ** If the cell content will fit on the page, then put it there.  If it
5522 ** will not fit, then make a copy of the cell content into pTemp if
5523 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
5524 ** in pPage->apOvfl[] and make it point to the cell content (either
5525 ** in pTemp or the original pCell) and also record its index.
5526 ** Allocating a new entry in pPage->aCell[] implies that
5527 ** pPage->nOverflow is incremented.
5528 **
5529 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5530 ** cell. The caller will overwrite them after this function returns. If
5531 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5532 ** (but pCell+nSkip is always valid).
5533 */
5534 static void insertCell(
5535   MemPage *pPage,   /* Page into which we are copying */
5536   int i,            /* New cell becomes the i-th cell of the page */
5537   u8 *pCell,        /* Content of the new cell */
5538   int sz,           /* Bytes of content in pCell */
5539   u8 *pTemp,        /* Temp storage space for pCell, if needed */
5540   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
5541   int *pRC          /* Read and write return code from here */
5542 ){
5543   int idx = 0;      /* Where to write new cell content in data[] */
5544   int j;            /* Loop counter */
5545   int end;          /* First byte past the last cell pointer in data[] */
5546   int ins;          /* Index in data[] where new cell pointer is inserted */
5547   int cellOffset;   /* Address of first cell pointer in data[] */
5548   u8 *data;         /* The content of the whole page */
5549   u8 *ptr;          /* Used for moving information around in data[] */
5550   u8 *endPtr;       /* End of the loop */
5551 
5552   int nSkip = (iChild ? 4 : 0);
5553 
5554   if( *pRC ) return;
5555 
5556   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5557   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5558   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
5559   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
5560   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5561   /* The cell should normally be sized correctly.  However, when moving a
5562   ** malformed cell from a leaf page to an interior page, if the cell size
5563   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5564   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
5565   ** the term after the || in the following assert(). */
5566   assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5567   if( pPage->nOverflow || sz+2>pPage->nFree ){
5568     if( pTemp ){
5569       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5570       pCell = pTemp;
5571     }
5572     if( iChild ){
5573       put4byte(pCell, iChild);
5574     }
5575     j = pPage->nOverflow++;
5576     assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
5577     pPage->apOvfl[j] = pCell;
5578     pPage->aiOvfl[j] = (u16)i;
5579   }else{
5580     int rc = sqlite3PagerWrite(pPage->pDbPage);
5581     if( rc!=SQLITE_OK ){
5582       *pRC = rc;
5583       return;
5584     }
5585     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5586     data = pPage->aData;
5587     cellOffset = pPage->cellOffset;
5588     end = cellOffset + 2*pPage->nCell;
5589     ins = cellOffset + 2*i;
5590     rc = allocateSpace(pPage, sz, &idx);
5591     if( rc ){ *pRC = rc; return; }
5592     /* The allocateSpace() routine guarantees the following two properties
5593     ** if it returns success */
5594     assert( idx >= end+2 );
5595     assert( idx+sz <= (int)pPage->pBt->usableSize );
5596     pPage->nCell++;
5597     pPage->nFree -= (u16)(2 + sz);
5598     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5599     if( iChild ){
5600       put4byte(&data[idx], iChild);
5601     }
5602     ptr = &data[end];
5603     endPtr = &data[ins];
5604     assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 );  /* ptr is always 2-byte aligned */
5605     while( ptr>endPtr ){
5606       *(u16*)ptr = *(u16*)&ptr[-2];
5607       ptr -= 2;
5608     }
5609     put2byte(&data[ins], idx);
5610     put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5611 #ifndef SQLITE_OMIT_AUTOVACUUM
5612     if( pPage->pBt->autoVacuum ){
5613       /* The cell may contain a pointer to an overflow page. If so, write
5614       ** the entry for the overflow page into the pointer map.
5615       */
5616       ptrmapPutOvflPtr(pPage, pCell, pRC);
5617     }
5618 #endif
5619   }
5620 }
5621 
5622 /*
5623 ** Add a list of cells to a page.  The page should be initially empty.
5624 ** The cells are guaranteed to fit on the page.
5625 */
5626 static void assemblePage(
5627   MemPage *pPage,   /* The page to be assemblied */
5628   int nCell,        /* The number of cells to add to this page */
5629   u8 **apCell,      /* Pointers to cell bodies */
5630   u16 *aSize        /* Sizes of the cells */
5631 ){
5632   int i;            /* Loop counter */
5633   u8 *pCellptr;     /* Address of next cell pointer */
5634   int cellbody;     /* Address of next cell body */
5635   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5636   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5637   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5638 
5639   assert( pPage->nOverflow==0 );
5640   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5641   assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
5642             && (int)MX_CELL(pPage->pBt)<=10921);
5643   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5644 
5645   /* Check that the page has just been zeroed by zeroPage() */
5646   assert( pPage->nCell==0 );
5647   assert( get2byteNotZero(&data[hdr+5])==nUsable );
5648 
5649   pCellptr = &pPage->aCellIdx[nCell*2];
5650   cellbody = nUsable;
5651   for(i=nCell-1; i>=0; i--){
5652     u16 sz = aSize[i];
5653     pCellptr -= 2;
5654     cellbody -= sz;
5655     put2byte(pCellptr, cellbody);
5656     memcpy(&data[cellbody], apCell[i], sz);
5657   }
5658   put2byte(&data[hdr+3], nCell);
5659   put2byte(&data[hdr+5], cellbody);
5660   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5661   pPage->nCell = (u16)nCell;
5662 }
5663 
/*
** Tuning parameters for the balancing operation.  NN is the number of
** sibling pages on each side of the target page that take part in a
** balance.  NB is the resulting total number of participating pages:
** the target page itself plus NN neighbors on either side.
**
** NN must be at least 1.  Raising it to 2 or 3 gives a modest improvement
** in SELECT and DELETE performance in exchange for a larger degradation
** in INSERT and UPDATE performance.  The value of NN chosen below appears
** to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (2*NN + 1)    /* Total pages involved in the balance */
5678 
5679 
5680 #ifndef SQLITE_OMIT_QUICKBALANCE
5681 /*
5682 ** This version of balance() handles the common special case where
5683 ** a new entry is being inserted on the extreme right-end of the
5684 ** tree, in other words, when the new entry will become the largest
5685 ** entry in the tree.
5686 **
5687 ** Instead of trying to balance the 3 right-most leaf pages, just add
5688 ** a new page to the right-hand side and put the one new entry in
5689 ** that page.  This leaves the right side of the tree somewhat
5690 ** unbalanced.  But odds are that we will be inserting new entries
5691 ** at the end soon afterwards so the nearly empty page will quickly
5692 ** fill up.  On average.
5693 **
5694 ** pPage is the leaf page which is the right-most page in the tree.
5695 ** pParent is its parent.  pPage must have a single overflow entry
5696 ** which is also the right-most entry on the page.
5697 **
5698 ** The pSpace buffer is used to store a temporary copy of the divider
5699 ** cell that will be inserted into pParent. Such a cell consists of a 4
5700 ** byte page number followed by a variable length integer. In other
5701 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5702 ** least 13 bytes in size.
5703 */
5704 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5705   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
5706   MemPage *pNew;                       /* Newly allocated page */
5707   int rc;                              /* Return Code */
5708   Pgno pgnoNew;                        /* Page number of pNew */
5709 
5710   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5711   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5712   assert( pPage->nOverflow==1 );
5713 
5714   /* This error condition is now caught prior to reaching this function */
5715   if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5716 
5717   /* Allocate a new page. This page will become the right-sibling of
5718   ** pPage. Make the parent page writable, so that the new divider cell
5719   ** may be inserted. If both these operations are successful, proceed.
5720   */
5721   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5722 
5723   if( rc==SQLITE_OK ){
5724 
5725     u8 *pOut = &pSpace[4];
5726     u8 *pCell = pPage->apOvfl[0];
5727     u16 szCell = cellSizePtr(pPage, pCell);
5728     u8 *pStop;
5729 
5730     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5731     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5732     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5733     assemblePage(pNew, 1, &pCell, &szCell);
5734 
5735     /* If this is an auto-vacuum database, update the pointer map
5736     ** with entries for the new page, and any pointer from the
5737     ** cell on the page to an overflow page. If either of these
5738     ** operations fails, the return code is set, but the contents
5739     ** of the parent page are still manipulated by thh code below.
5740     ** That is Ok, at this point the parent page is guaranteed to
5741     ** be marked as dirty. Returning an error code will cause a
5742     ** rollback, undoing any changes made to the parent page.
5743     */
5744     if( ISAUTOVACUUM ){
5745       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5746       if( szCell>pNew->minLocal ){
5747         ptrmapPutOvflPtr(pNew, pCell, &rc);
5748       }
5749     }
5750 
5751     /* Create a divider cell to insert into pParent. The divider cell
5752     ** consists of a 4-byte page number (the page number of pPage) and
5753     ** a variable length key value (which must be the same value as the
5754     ** largest key on pPage).
5755     **
5756     ** To find the largest key value on pPage, first find the right-most
5757     ** cell on pPage. The first two fields of this cell are the
5758     ** record-length (a variable length integer at most 32-bits in size)
5759     ** and the key value (a variable length integer, may have any value).
5760     ** The first of the while(...) loops below skips over the record-length
5761     ** field. The second while(...) loop copies the key value from the
5762     ** cell on pPage into the pSpace buffer.
5763     */
5764     pCell = findCell(pPage, pPage->nCell-1);
5765     pStop = &pCell[9];
5766     while( (*(pCell++)&0x80) && pCell<pStop );
5767     pStop = &pCell[9];
5768     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5769 
5770     /* Insert the new divider cell into pParent. */
5771     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5772                0, pPage->pgno, &rc);
5773 
5774     /* Set the right-child pointer of pParent to point to the new page. */
5775     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5776 
5777     /* Release the reference to the new page. */
5778     releasePage(pNew);
5779   }
5780 
5781   return rc;
5782 }
5783 #endif /* SQLITE_OMIT_QUICKBALANCE */
5784 
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every overflow page referenced from a cell, and of every child
** page, names that page as its parent with the expected entry type.
** Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPg;
  for(iPg=0; iPg<nPage; iPg++){
    MemPage *pPage = apPage[iPg];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;          /* Parent page recorded in the pointer map */
    u8 eType;                 /* Entry type recorded in the pointer map */
    int iCell;
    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *z = findCell(pPage, iCell);

      btreeParseCellPtr(pPage, z, &info);
      if( info.iOverflow ){
        /* First page of the cell's overflow chain */
        Pgno ovfl = get4byte(&z[info.iOverflow]);
        ptrmapGet(pBt, ovfl, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* Left child of the cell */
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }
    if( !pPage->leaf ){
      /* Right-most child, stored in the page header */
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5826 
5827 /*
5828 ** This function is used to copy the contents of the b-tree node stored
5829 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5830 ** the pointer-map entries for each child page are updated so that the
5831 ** parent page stored in the pointer map is page pTo. If pFrom contained
5832 ** any cells with overflow page pointers, then the corresponding pointer
5833 ** map entries are also updated so that the parent page is page pTo.
5834 **
5835 ** If pFrom is currently carrying any overflow cells (entries in the
5836 ** MemPage.apOvfl[] array), they are not copied to pTo.
5837 **
5838 ** Before returning, page pTo is reinitialized using btreeInitPage().
5839 **
5840 ** The performance of this function is not critical. It is only used by
5841 ** the balance_shallower() and balance_deeper() procedures, neither of
5842 ** which are called often under normal circumstances.
5843 */
5844 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5845   if( (*pRC)==SQLITE_OK ){
5846     BtShared * const pBt = pFrom->pBt;
5847     u8 * const aFrom = pFrom->aData;
5848     u8 * const aTo = pTo->aData;
5849     int const iFromHdr = pFrom->hdrOffset;
5850     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5851     int rc;
5852     int iData;
5853 
5854 
5855     assert( pFrom->isInit );
5856     assert( pFrom->nFree>=iToHdr );
5857     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
5858 
5859     /* Copy the b-tree node content from page pFrom to page pTo. */
5860     iData = get2byte(&aFrom[iFromHdr+5]);
5861     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5862     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5863 
5864     /* Reinitialize page pTo so that the contents of the MemPage structure
5865     ** match the new data. The initialization of pTo can actually fail under
5866     ** fairly obscure circumstances, even though it is a copy of initialized
5867     ** page pFrom.
5868     */
5869     pTo->isInit = 0;
5870     rc = btreeInitPage(pTo);
5871     if( rc!=SQLITE_OK ){
5872       *pRC = rc;
5873       return;
5874     }
5875 
5876     /* If this is an auto-vacuum database, update the pointer-map entries
5877     ** for any b-tree or overflow pages that pTo now contains the pointers to.
5878     */
5879     if( ISAUTOVACUUM ){
5880       *pRC = setChildPtrmaps(pTo);
5881     }
5882   }
5883 }
5884 
5885 /*
5886 ** This routine redistributes cells on the iParentIdx'th child of pParent
5887 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
5888 ** same amount of free space. Usually a single sibling on either side of the
5889 ** page are used in the balancing, though both siblings might come from one
5890 ** side if the page is the first or last child of its parent. If the page
5891 ** has fewer than 2 siblings (something which can only happen if the page
5892 ** is a root page or a child of a root page) then all available siblings
5893 ** participate in the balancing.
5894 **
5895 ** The number of siblings of the page might be increased or decreased by
5896 ** one or two in an effort to keep pages nearly full but not over full.
5897 **
5898 ** Note that when this routine is called, some of the cells on the page
5899 ** might not actually be stored in MemPage.aData[]. This can happen
5900 ** if the page is overfull. This routine ensures that all cells allocated
5901 ** to the page and its siblings fit into MemPage.aData[] before returning.
5902 **
5903 ** In the course of balancing the page and its siblings, cells may be
5904 ** inserted into or removed from the parent page (pParent). Doing so
5905 ** may cause the parent page to become overfull or underfull. If this
5906 ** happens, it is the responsibility of the caller to invoke the correct
5907 ** balancing routine to fix this problem (see the balance() routine).
5908 **
5909 ** If this routine fails for any reason, it might leave the database
5910 ** in a corrupted state. So if this routine fails, the database should
5911 ** be rolled back.
5912 **
5913 ** The third argument to this function, aOvflSpace, is a pointer to a
5914 ** buffer big enough to hold one page. If while inserting cells into the parent
5915 ** page (pParent) the parent page becomes overfull, this buffer is
5916 ** used to store the parent's overflow cells. Because this function inserts
5917 ** a maximum of four divider cells into the parent page, and the maximum
5918 ** size of a cell stored within an internal node is always less than 1/4
5919 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5920 ** enough for all overflow cells.
5921 **
5922 ** If aOvflSpace is set to a null pointer, this function returns
5923 ** SQLITE_NOMEM.
5924 */
5925 static int balance_nonroot(
5926   MemPage *pParent,               /* Parent page of siblings being balanced */
5927   int iParentIdx,                 /* Index of "the page" in pParent */
5928   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
5929   int isRoot                      /* True if pParent is a root-page */
5930 ){
5931   BtShared *pBt;               /* The whole database */
5932   int nCell = 0;               /* Number of cells in apCell[] */
5933   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
5934   int nNew = 0;                /* Number of pages in apNew[] */
5935   int nOld;                    /* Number of pages in apOld[] */
5936   int i, j, k;                 /* Loop counters */
5937   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
5938   int rc = SQLITE_OK;          /* The return code */
5939   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
5940   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
5941   int usableSpace;             /* Bytes in pPage beyond the header */
5942   int pageFlags;               /* Value of pPage->aData[0] */
5943   int subtotal;                /* Subtotal of bytes in cells on one page */
5944   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
5945   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
5946   int szScratch;               /* Size of scratch memory requested */
5947   MemPage *apOld[NB];          /* pPage and up to two siblings */
5948   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
5949   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
5950   u8 *pRight;                  /* Location in parent of right-sibling pointer */
5951   u8 *apDiv[NB-1];             /* Divider cells in pParent */
5952   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
5953   int szNew[NB+2];             /* Combined size of cells place on i-th page */
5954   u8 **apCell = 0;             /* All cells begin balanced */
5955   u16 *szCell;                 /* Local size of all cells in apCell[] */
5956   u8 *aSpace1;                 /* Space for copies of dividers cells */
5957   Pgno pgno;                   /* Temp var to store a page number in */
5958 
5959   pBt = pParent->pBt;
5960   assert( sqlite3_mutex_held(pBt->mutex) );
5961   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5962 
5963 #if 0
5964   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5965 #endif
5966 
5967   /* At this point pParent may have at most one overflow cell. And if
5968   ** this overflow cell is present, it must be the cell with
5969   ** index iParentIdx. This scenario comes about when this function
5970   ** is called (indirectly) from sqlite3BtreeDelete().
5971   */
5972   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5973   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
5974 
5975   if( !aOvflSpace ){
5976     return SQLITE_NOMEM;
5977   }
5978 
5979   /* Find the sibling pages to balance. Also locate the cells in pParent
5980   ** that divide the siblings. An attempt is made to find NN siblings on
5981   ** either side of pPage. More siblings are taken from one side, however,
5982   ** if there are fewer than NN siblings on the other side. If pParent
5983   ** has NB or fewer children then all children of pParent are taken.
5984   **
5985   ** This loop also drops the divider cells from the parent page. This
5986   ** way, the remainder of the function does not have to deal with any
5987   ** overflow cells in the parent page, since if any existed they will
5988   ** have already been removed.
5989   */
5990   i = pParent->nOverflow + pParent->nCell;
5991   if( i<2 ){
5992     nxDiv = 0;
5993     nOld = i+1;
5994   }else{
5995     nOld = 3;
5996     if( iParentIdx==0 ){
5997       nxDiv = 0;
5998     }else if( iParentIdx==i ){
5999       nxDiv = i-2;
6000     }else{
6001       nxDiv = iParentIdx-1;
6002     }
6003     i = 2;
6004   }
6005   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
6006     pRight = &pParent->aData[pParent->hdrOffset+8];
6007   }else{
6008     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
6009   }
6010   pgno = get4byte(pRight);
6011   while( 1 ){
6012     rc = getAndInitPage(pBt, pgno, &apOld[i]);
6013     if( rc ){
6014       memset(apOld, 0, (i+1)*sizeof(MemPage*));
6015       goto balance_cleanup;
6016     }
6017     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
6018     if( (i--)==0 ) break;
6019 
6020     if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){
6021       apDiv[i] = pParent->apOvfl[0];
6022       pgno = get4byte(apDiv[i]);
6023       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6024       pParent->nOverflow = 0;
6025     }else{
6026       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
6027       pgno = get4byte(apDiv[i]);
6028       szNew[i] = cellSizePtr(pParent, apDiv[i]);
6029 
6030       /* Drop the cell from the parent page. apDiv[i] still points to
6031       ** the cell within the parent, even though it has been dropped.
6032       ** This is safe because dropping a cell only overwrites the first
6033       ** four bytes of it, and this function does not need the first
6034       ** four bytes of the divider cell. So the pointer is safe to use
6035       ** later on.
6036       **
6037       ** But not if we are in secure-delete mode. In secure-delete mode,
6038       ** the dropCell() routine will overwrite the entire cell with zeroes.
6039       ** In this case, temporarily copy the cell into the aOvflSpace[]
6040       ** buffer. It will be copied out again as soon as the aSpace[] buffer
6041       ** is allocated.  */
6042       if( pBt->btsFlags & BTS_SECURE_DELETE ){
6043         int iOff;
6044 
6045         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
6046         if( (iOff+szNew[i])>(int)pBt->usableSize ){
6047           rc = SQLITE_CORRUPT_BKPT;
6048           memset(apOld, 0, (i+1)*sizeof(MemPage*));
6049           goto balance_cleanup;
6050         }else{
6051           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
6052           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
6053         }
6054       }
6055       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
6056     }
6057   }
6058 
6059   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
6060   ** alignment */
6061   nMaxCells = (nMaxCells + 3)&~3;
6062 
6063   /*
6064   ** Allocate space for memory structures
6065   */
6066   k = pBt->pageSize + ROUND8(sizeof(MemPage));
6067   szScratch =
6068        nMaxCells*sizeof(u8*)                       /* apCell */
6069      + nMaxCells*sizeof(u16)                       /* szCell */
6070      + pBt->pageSize                               /* aSpace1 */
6071      + k*nOld;                                     /* Page copies (apCopy) */
6072   apCell = sqlite3ScratchMalloc( szScratch );
6073   if( apCell==0 ){
6074     rc = SQLITE_NOMEM;
6075     goto balance_cleanup;
6076   }
6077   szCell = (u16*)&apCell[nMaxCells];
6078   aSpace1 = (u8*)&szCell[nMaxCells];
6079   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
6080 
6081   /*
6082   ** Load pointers to all cells on sibling pages and the divider cells
6083   ** into the local apCell[] array.  Make copies of the divider cells
6084   ** into space obtained from aSpace1[] and remove the the divider Cells
6085   ** from pParent.
6086   **
6087   ** If the siblings are on leaf pages, then the child pointers of the
6088   ** divider cells are stripped from the cells before they are copied
6089   ** into aSpace1[].  In this way, all cells in apCell[] are without
6090   ** child pointers.  If siblings are not leaves, then all cell in
6091   ** apCell[] include child pointers.  Either way, all cells in apCell[]
6092   ** are alike.
6093   **
6094   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
6095   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
6096   */
6097   leafCorrection = apOld[0]->leaf*4;
6098   leafData = apOld[0]->hasData;
6099   for(i=0; i<nOld; i++){
6100     int limit;
6101 
6102     /* Before doing anything else, take a copy of the i'th original sibling
6103     ** The rest of this function will use data from the copies rather
6104     ** that the original pages since the original pages will be in the
6105     ** process of being overwritten.  */
6106     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6107     memcpy(pOld, apOld[i], sizeof(MemPage));
6108     pOld->aData = (void*)&pOld[1];
6109     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6110 
6111     limit = pOld->nCell+pOld->nOverflow;
6112     if( pOld->nOverflow>0 ){
6113       for(j=0; j<limit; j++){
6114         assert( nCell<nMaxCells );
6115         apCell[nCell] = findOverflowCell(pOld, j);
6116         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6117         nCell++;
6118       }
6119     }else{
6120       u8 *aData = pOld->aData;
6121       u16 maskPage = pOld->maskPage;
6122       u16 cellOffset = pOld->cellOffset;
6123       for(j=0; j<limit; j++){
6124         assert( nCell<nMaxCells );
6125         apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
6126         szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6127         nCell++;
6128       }
6129     }
6130     if( i<nOld-1 && !leafData){
6131       u16 sz = (u16)szNew[i];
6132       u8 *pTemp;
6133       assert( nCell<nMaxCells );
6134       szCell[nCell] = sz;
6135       pTemp = &aSpace1[iSpace1];
6136       iSpace1 += sz;
6137       assert( sz<=pBt->maxLocal+23 );
6138       assert( iSpace1 <= (int)pBt->pageSize );
6139       memcpy(pTemp, apDiv[i], sz);
6140       apCell[nCell] = pTemp+leafCorrection;
6141       assert( leafCorrection==0 || leafCorrection==4 );
6142       szCell[nCell] = szCell[nCell] - leafCorrection;
6143       if( !pOld->leaf ){
6144         assert( leafCorrection==0 );
6145         assert( pOld->hdrOffset==0 );
6146         /* The right pointer of the child page pOld becomes the left
6147         ** pointer of the divider cell */
6148         memcpy(apCell[nCell], &pOld->aData[8], 4);
6149       }else{
6150         assert( leafCorrection==4 );
6151         if( szCell[nCell]<4 ){
6152           /* Do not allow any cells smaller than 4 bytes. */
6153           szCell[nCell] = 4;
6154         }
6155       }
6156       nCell++;
6157     }
6158   }
6159 
6160   /*
6161   ** Figure out the number of pages needed to hold all nCell cells.
6162   ** Store this number in "k".  Also compute szNew[] which is the total
6163   ** size of all cells on the i-th page and cntNew[] which is the index
6164   ** in apCell[] of the cell that divides page i from page i+1.
6165   ** cntNew[k] should equal nCell.
6166   **
6167   ** Values computed by this block:
6168   **
6169   **           k: The total number of sibling pages
6170   **    szNew[i]: Spaced used on the i-th sibling page.
6171   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6172   **              the right of the i-th sibling page.
6173   ** usableSpace: Number of bytes of space available on each sibling.
6174   **
6175   */
6176   usableSpace = pBt->usableSize - 12 + leafCorrection;
6177   for(subtotal=k=i=0; i<nCell; i++){
6178     assert( i<nMaxCells );
6179     subtotal += szCell[i] + 2;
6180     if( subtotal > usableSpace ){
6181       szNew[k] = subtotal - szCell[i];
6182       cntNew[k] = i;
6183       if( leafData ){ i--; }
6184       subtotal = 0;
6185       k++;
6186       if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6187     }
6188   }
6189   szNew[k] = subtotal;
6190   cntNew[k] = nCell;
6191   k++;
6192 
6193   /*
6194   ** The packing computed by the previous block is biased toward the siblings
6195   ** on the left side.  The left siblings are always nearly full, while the
6196   ** right-most sibling might be nearly empty.  This block of code attempts
6197   ** to adjust the packing of siblings to get a better balance.
6198   **
6199   ** This adjustment is more than an optimization.  The packing above might
6200   ** be so out of balance as to be illegal.  For example, the right-most
6201   ** sibling might be completely empty.  This adjustment is not optional.
6202   */
6203   for(i=k-1; i>0; i--){
6204     int szRight = szNew[i];  /* Size of sibling on the right */
6205     int szLeft = szNew[i-1]; /* Size of sibling on the left */
6206     int r;              /* Index of right-most cell in left sibling */
6207     int d;              /* Index of first cell to the left of right sibling */
6208 
6209     r = cntNew[i-1] - 1;
6210     d = r + 1 - leafData;
6211     assert( d<nMaxCells );
6212     assert( r<nMaxCells );
6213     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
6214       szRight += szCell[d] + 2;
6215       szLeft -= szCell[r] + 2;
6216       cntNew[i-1]--;
6217       r = cntNew[i-1] - 1;
6218       d = r + 1 - leafData;
6219     }
6220     szNew[i] = szRight;
6221     szNew[i-1] = szLeft;
6222   }
6223 
6224   /* Either we found one or more cells (cntnew[0])>0) or pPage is
6225   ** a virtual root page.  A virtual root page is when the real root
6226   ** page is page 1 and we are the only child of that page.
6227   **
6228   ** UPDATE:  The assert() below is not necessarily true if the database
6229   ** file is corrupt.  The corruption will be detected and reported later
6230   ** in this procedure so there is no need to act upon it now.
6231   */
6232 #if 0
6233   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6234 #endif
6235 
6236   TRACE(("BALANCE: old: %d %d %d  ",
6237     apOld[0]->pgno,
6238     nOld>=2 ? apOld[1]->pgno : 0,
6239     nOld>=3 ? apOld[2]->pgno : 0
6240   ));
6241 
6242   /*
6243   ** Allocate k new pages.  Reuse old pages where possible.
6244   */
6245   if( apOld[0]->pgno<=1 ){
6246     rc = SQLITE_CORRUPT_BKPT;
6247     goto balance_cleanup;
6248   }
6249   pageFlags = apOld[0]->aData[0];
6250   for(i=0; i<k; i++){
6251     MemPage *pNew;
6252     if( i<nOld ){
6253       pNew = apNew[i] = apOld[i];
6254       apOld[i] = 0;
6255       rc = sqlite3PagerWrite(pNew->pDbPage);
6256       nNew++;
6257       if( rc ) goto balance_cleanup;
6258     }else{
6259       assert( i>0 );
6260       rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
6261       if( rc ) goto balance_cleanup;
6262       apNew[i] = pNew;
6263       nNew++;
6264 
6265       /* Set the pointer-map entry for the new sibling page. */
6266       if( ISAUTOVACUUM ){
6267         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6268         if( rc!=SQLITE_OK ){
6269           goto balance_cleanup;
6270         }
6271       }
6272     }
6273   }
6274 
6275   /* Free any old pages that were not reused as new pages.
6276   */
6277   while( i<nOld ){
6278     freePage(apOld[i], &rc);
6279     if( rc ) goto balance_cleanup;
6280     releasePage(apOld[i]);
6281     apOld[i] = 0;
6282     i++;
6283   }
6284 
6285   /*
6286   ** Put the new pages in accending order.  This helps to
6287   ** keep entries in the disk file in order so that a scan
6288   ** of the table is a linear scan through the file.  That
6289   ** in turn helps the operating system to deliver pages
6290   ** from the disk more rapidly.
6291   **
6292   ** An O(n^2) insertion sort algorithm is used, but since
6293   ** n is never more than NB (a small constant), that should
6294   ** not be a problem.
6295   **
6296   ** When NB==3, this one optimization makes the database
6297   ** about 25% faster for large insertions and deletions.
6298   */
6299   for(i=0; i<k-1; i++){
6300     int minV = apNew[i]->pgno;
6301     int minI = i;
6302     for(j=i+1; j<k; j++){
6303       if( apNew[j]->pgno<(unsigned)minV ){
6304         minI = j;
6305         minV = apNew[j]->pgno;
6306       }
6307     }
6308     if( minI>i ){
6309       MemPage *pT;
6310       pT = apNew[i];
6311       apNew[i] = apNew[minI];
6312       apNew[minI] = pT;
6313     }
6314   }
6315   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6316     apNew[0]->pgno, szNew[0],
6317     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6318     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6319     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6320     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6321 
6322   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6323   put4byte(pRight, apNew[nNew-1]->pgno);
6324 
6325   /*
6326   ** Evenly distribute the data in apCell[] across the new pages.
6327   ** Insert divider cells into pParent as necessary.
6328   */
6329   j = 0;
6330   for(i=0; i<nNew; i++){
6331     /* Assemble the new sibling page. */
6332     MemPage *pNew = apNew[i];
6333     assert( j<nMaxCells );
6334     zeroPage(pNew, pageFlags);
6335     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6336     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6337     assert( pNew->nOverflow==0 );
6338 
6339     j = cntNew[i];
6340 
6341     /* If the sibling page assembled above was not the right-most sibling,
6342     ** insert a divider cell into the parent page.
6343     */
6344     assert( i<nNew-1 || j==nCell );
6345     if( j<nCell ){
6346       u8 *pCell;
6347       u8 *pTemp;
6348       int sz;
6349 
6350       assert( j<nMaxCells );
6351       pCell = apCell[j];
6352       sz = szCell[j] + leafCorrection;
6353       pTemp = &aOvflSpace[iOvflSpace];
6354       if( !pNew->leaf ){
6355         memcpy(&pNew->aData[8], pCell, 4);
6356       }else if( leafData ){
6357         /* If the tree is a leaf-data tree, and the siblings are leaves,
6358         ** then there is no divider cell in apCell[]. Instead, the divider
6359         ** cell consists of the integer key for the right-most cell of
6360         ** the sibling-page assembled above only.
6361         */
6362         CellInfo info;
6363         j--;
6364         btreeParseCellPtr(pNew, apCell[j], &info);
6365         pCell = pTemp;
6366         sz = 4 + putVarint(&pCell[4], info.nKey);
6367         pTemp = 0;
6368       }else{
6369         pCell -= 4;
6370         /* Obscure case for non-leaf-data trees: If the cell at pCell was
6371         ** previously stored on a leaf node, and its reported size was 4
6372         ** bytes, then it may actually be smaller than this
6373         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6374         ** any cell). But it is important to pass the correct size to
6375         ** insertCell(), so reparse the cell now.
6376         **
6377         ** Note that this can never happen in an SQLite data file, as all
6378         ** cells are at least 4 bytes. It only happens in b-trees used
6379         ** to evaluate "IN (SELECT ...)" and similar clauses.
6380         */
6381         if( szCell[j]==4 ){
6382           assert(leafCorrection==4);
6383           sz = cellSizePtr(pParent, pCell);
6384         }
6385       }
6386       iOvflSpace += sz;
6387       assert( sz<=pBt->maxLocal+23 );
6388       assert( iOvflSpace <= (int)pBt->pageSize );
6389       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6390       if( rc!=SQLITE_OK ) goto balance_cleanup;
6391       assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6392 
6393       j++;
6394       nxDiv++;
6395     }
6396   }
6397   assert( j==nCell );
6398   assert( nOld>0 );
6399   assert( nNew>0 );
6400   if( (pageFlags & PTF_LEAF)==0 ){
6401     u8 *zChild = &apCopy[nOld-1]->aData[8];
6402     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6403   }
6404 
6405   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6406     /* The root page of the b-tree now contains no cells. The only sibling
6407     ** page is the right-child of the parent. Copy the contents of the
6408     ** child page into the parent, decreasing the overall height of the
6409     ** b-tree structure by one. This is described as the "balance-shallower"
6410     ** sub-algorithm in some documentation.
6411     **
6412     ** If this is an auto-vacuum database, the call to copyNodeContent()
6413     ** sets all pointer-map entries corresponding to database image pages
6414     ** for which the pointer is stored within the content being copied.
6415     **
6416     ** The second assert below verifies that the child page is defragmented
6417     ** (it must be, as it was just reconstructed using assemblePage()). This
6418     ** is important if the parent page happens to be page 1 of the database
6419     ** image.  */
6420     assert( nNew==1 );
6421     assert( apNew[0]->nFree ==
6422         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6423     );
6424     copyNodeContent(apNew[0], pParent, &rc);
6425     freePage(apNew[0], &rc);
6426   }else if( ISAUTOVACUUM ){
6427     /* Fix the pointer-map entries for all the cells that were shifted around.
6428     ** There are several different types of pointer-map entries that need to
6429     ** be dealt with by this routine. Some of these have been set already, but
6430     ** many have not. The following is a summary:
6431     **
6432     **   1) The entries associated with new sibling pages that were not
6433     **      siblings when this function was called. These have already
6434     **      been set. We don't need to worry about old siblings that were
6435     **      moved to the free-list - the freePage() code has taken care
6436     **      of those.
6437     **
6438     **   2) The pointer-map entries associated with the first overflow
6439     **      page in any overflow chains used by new divider cells. These
6440     **      have also already been taken care of by the insertCell() code.
6441     **
6442     **   3) If the sibling pages are not leaves, then the child pages of
6443     **      cells stored on the sibling pages may need to be updated.
6444     **
6445     **   4) If the sibling pages are not internal intkey nodes, then any
6446     **      overflow pages used by these cells may need to be updated
6447     **      (internal intkey nodes never contain pointers to overflow pages).
6448     **
6449     **   5) If the sibling pages are not leaves, then the pointer-map
6450     **      entries for the right-child pages of each sibling may need
6451     **      to be updated.
6452     **
6453     ** Cases 1 and 2 are dealt with above by other code. The next
6454     ** block deals with cases 3 and 4 and the one after that, case 5. Since
6455     ** setting a pointer map entry is a relatively expensive operation, this
6456     ** code only sets pointer map entries for child or overflow pages that have
6457     ** actually moved between pages.  */
6458     MemPage *pNew = apNew[0];
6459     MemPage *pOld = apCopy[0];
6460     int nOverflow = pOld->nOverflow;
6461     int iNextOld = pOld->nCell + nOverflow;
6462     int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
6463     j = 0;                             /* Current 'old' sibling page */
6464     k = 0;                             /* Current 'new' sibling page */
6465     for(i=0; i<nCell; i++){
6466       int isDivider = 0;
6467       while( i==iNextOld ){
6468         /* Cell i is the cell immediately following the last cell on old
6469         ** sibling page j. If the siblings are not leaf pages of an
6470         ** intkey b-tree, then cell i was a divider cell. */
6471         assert( j+1 < ArraySize(apCopy) );
6472         pOld = apCopy[++j];
6473         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6474         if( pOld->nOverflow ){
6475           nOverflow = pOld->nOverflow;
6476           iOverflow = i + !leafData + pOld->aiOvfl[0];
6477         }
6478         isDivider = !leafData;
6479       }
6480 
6481       assert(nOverflow>0 || iOverflow<i );
6482       assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
6483       assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
6484       if( i==iOverflow ){
6485         isDivider = 1;
6486         if( (--nOverflow)>0 ){
6487           iOverflow++;
6488         }
6489       }
6490 
6491       if( i==cntNew[k] ){
6492         /* Cell i is the cell immediately following the last cell on new
6493         ** sibling page k. If the siblings are not leaf pages of an
6494         ** intkey b-tree, then cell i is a divider cell.  */
6495         pNew = apNew[++k];
6496         if( !leafData ) continue;
6497       }
6498       assert( j<nOld );
6499       assert( k<nNew );
6500 
6501       /* If the cell was originally divider cell (and is not now) or
6502       ** an overflow cell, or if the cell was located on a different sibling
6503       ** page before the balancing, then the pointer map entries associated
6504       ** with any child or overflow pages need to be updated.  */
6505       if( isDivider || pOld->pgno!=pNew->pgno ){
6506         if( !leafCorrection ){
6507           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6508         }
6509         if( szCell[i]>pNew->minLocal ){
6510           ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6511         }
6512       }
6513     }
6514 
6515     if( !leafCorrection ){
6516       for(i=0; i<nNew; i++){
6517         u32 key = get4byte(&apNew[i]->aData[8]);
6518         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6519       }
6520     }
6521 
6522 #if 0
6523     /* The ptrmapCheckPages() contains assert() statements that verify that
6524     ** all pointer map pages are set correctly. This is helpful while
6525     ** debugging. This is usually disabled because a corrupt database may
6526     ** cause an assert() statement to fail.  */
6527     ptrmapCheckPages(apNew, nNew);
6528     ptrmapCheckPages(&pParent, 1);
6529 #endif
6530   }
6531 
6532   assert( pParent->isInit );
6533   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6534           nOld, nNew, nCell));
6535 
6536   /*
6537   ** Cleanup before returning.
6538   */
6539 balance_cleanup:
6540   sqlite3ScratchFree(apCell);
6541   for(i=0; i<nOld; i++){
6542     releasePage(apOld[i]);
6543   }
6544   for(i=0; i<nNew; i++){
6545     releasePage(apNew[i]);
6546   }
6547 
6548   return rc;
6549 }
6550 
6551 
6552 /*
6553 ** This function is called when the root page of a b-tree structure is
** overfull (has one or more overflow cells).
6555 **
6556 ** A new child page is allocated and the contents of the current root
6557 ** page, including overflow cells, are copied into the child. The root
6558 ** page is then overwritten to make it an empty page with the right-child
6559 ** pointer pointing to the new page.
6560 **
6561 ** Before returning, all pointer-map entries corresponding to pages
6562 ** that the new child-page now contains pointers to are updated. The
6563 ** entry corresponding to the new right-child pointer of the root
6564 ** page is also updated.
6565 **
6566 ** If successful, *ppChild is set to contain a reference to the child
6567 ** page and SQLITE_OK is returned. In this case the caller is required
6568 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6569 ** an error code is returned and *ppChild is set to 0.
6570 */
6571 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6572   int rc;                        /* Return value from subprocedures */
6573   MemPage *pChild = 0;           /* Pointer to a new child page */
6574   Pgno pgnoChild = 0;            /* Page number of the new child page */
6575   BtShared *pBt = pRoot->pBt;    /* The BTree */
6576 
6577   assert( pRoot->nOverflow>0 );
6578   assert( sqlite3_mutex_held(pBt->mutex) );
6579 
6580   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6581   ** page that will become the new right-child of pPage. Copy the contents
6582   ** of the node stored on pRoot into the new child page.
6583   */
6584   rc = sqlite3PagerWrite(pRoot->pDbPage);
6585   if( rc==SQLITE_OK ){
6586     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6587     copyNodeContent(pRoot, pChild, &rc);
6588     if( ISAUTOVACUUM ){
6589       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6590     }
6591   }
6592   if( rc ){
6593     *ppChild = 0;
6594     releasePage(pChild);
6595     return rc;
6596   }
6597   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6598   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6599   assert( pChild->nCell==pRoot->nCell );
6600 
6601   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6602 
6603   /* Copy the overflow cells from pRoot to pChild */
6604   memcpy(pChild->aiOvfl, pRoot->aiOvfl,
6605          pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
6606   memcpy(pChild->apOvfl, pRoot->apOvfl,
6607          pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
6608   pChild->nOverflow = pRoot->nOverflow;
6609 
6610   /* Zero the contents of pRoot. Then install pChild as the right-child. */
6611   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6612   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6613 
6614   *ppChild = pChild;
6615   return SQLITE_OK;
6616 }
6617 
6618 /*
6619 ** The page that pCur currently points to has just been modified in
6620 ** some way. This function figures out if this modification means the
6621 ** tree needs to be balanced, and if so calls the appropriate balancing
6622 ** routine. Balancing routines are:
6623 **
6624 **   balance_quick()
6625 **   balance_deeper()
6626 **   balance_nonroot()
6627 */
6628 static int balance(BtCursor *pCur){
6629   int rc = SQLITE_OK;
6630   const int nMin = pCur->pBt->usableSize * 2 / 3;
6631   u8 aBalanceQuickSpace[13];
6632   u8 *pFree = 0;
6633 
6634   TESTONLY( int balance_quick_called = 0 );
6635   TESTONLY( int balance_deeper_called = 0 );
6636 
6637   do {
6638     int iPage = pCur->iPage;
6639     MemPage *pPage = pCur->apPage[iPage];
6640 
6641     if( iPage==0 ){
6642       if( pPage->nOverflow ){
6643         /* The root page of the b-tree is overfull. In this case call the
6644         ** balance_deeper() function to create a new child for the root-page
6645         ** and copy the current contents of the root-page to it. The
6646         ** next iteration of the do-loop will balance the child page.
6647         */
6648         assert( (balance_deeper_called++)==0 );
6649         rc = balance_deeper(pPage, &pCur->apPage[1]);
6650         if( rc==SQLITE_OK ){
6651           pCur->iPage = 1;
6652           pCur->aiIdx[0] = 0;
6653           pCur->aiIdx[1] = 0;
6654           assert( pCur->apPage[1]->nOverflow );
6655         }
6656       }else{
6657         break;
6658       }
6659     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6660       break;
6661     }else{
6662       MemPage * const pParent = pCur->apPage[iPage-1];
6663       int const iIdx = pCur->aiIdx[iPage-1];
6664 
6665       rc = sqlite3PagerWrite(pParent->pDbPage);
6666       if( rc==SQLITE_OK ){
6667 #ifndef SQLITE_OMIT_QUICKBALANCE
6668         if( pPage->hasData
6669          && pPage->nOverflow==1
6670          && pPage->aiOvfl[0]==pPage->nCell
6671          && pParent->pgno!=1
6672          && pParent->nCell==iIdx
6673         ){
6674           /* Call balance_quick() to create a new sibling of pPage on which
6675           ** to store the overflow cell. balance_quick() inserts a new cell
6676           ** into pParent, which may cause pParent overflow. If this
6677           ** happens, the next interation of the do-loop will balance pParent
6678           ** use either balance_nonroot() or balance_deeper(). Until this
6679           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6680           ** buffer.
6681           **
6682           ** The purpose of the following assert() is to check that only a
6683           ** single call to balance_quick() is made for each call to this
6684           ** function. If this were not verified, a subtle bug involving reuse
6685           ** of the aBalanceQuickSpace[] might sneak in.
6686           */
6687           assert( (balance_quick_called++)==0 );
6688           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6689         }else
6690 #endif
6691         {
6692           /* In this case, call balance_nonroot() to redistribute cells
6693           ** between pPage and up to 2 of its sibling pages. This involves
6694           ** modifying the contents of pParent, which may cause pParent to
6695           ** become overfull or underfull. The next iteration of the do-loop
6696           ** will balance the parent page to correct this.
6697           **
6698           ** If the parent page becomes overfull, the overflow cell or cells
6699           ** are stored in the pSpace buffer allocated immediately below.
6700           ** A subsequent iteration of the do-loop will deal with this by
6701           ** calling balance_nonroot() (balance_deeper() may be called first,
6702           ** but it doesn't deal with overflow cells - just moves them to a
6703           ** different page). Once this subsequent call to balance_nonroot()
6704           ** has completed, it is safe to release the pSpace buffer used by
6705           ** the previous call, as the overflow cell data will have been
6706           ** copied either into the body of a database page or into the new
6707           ** pSpace buffer passed to the latter call to balance_nonroot().
6708           */
6709           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6710           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
6711           if( pFree ){
6712             /* If pFree is not NULL, it points to the pSpace buffer used
6713             ** by a previous call to balance_nonroot(). Its contents are
6714             ** now stored either on real database pages or within the
6715             ** new pSpace buffer, so it may be safely freed here. */
6716             sqlite3PageFree(pFree);
6717           }
6718 
6719           /* The pSpace buffer will be freed after the next call to
6720           ** balance_nonroot(), or just before this function returns, whichever
6721           ** comes first. */
6722           pFree = pSpace;
6723         }
6724       }
6725 
6726       pPage->nOverflow = 0;
6727 
6728       /* The next iteration of the do-loop balances the parent page. */
6729       releasePage(pPage);
6730       pCur->iPage--;
6731     }
6732   }while( rc==SQLITE_OK );
6733 
6734   if( pFree ){
6735     sqlite3PageFree(pFree);
6736   }
6737   return rc;
6738 }
6739 
6740 
6741 /*
6742 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6743 ** and the data is given by (pData,nData).  The cursor is used only to
6744 ** define what table the record should be inserted into.  The cursor
6745 ** is left pointing at a random location.
6746 **
6747 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6748 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6749 **
6750 ** If the seekResult parameter is non-zero, then a successful call to
6751 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6752 ** been performed. seekResult is the search result returned (a negative
6753 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6754 ** a positive value if pCur points at an etry that is larger than
6755 ** (pKey, nKey)).
6756 **
6757 ** If the seekResult parameter is non-zero, then the caller guarantees that
6758 ** cursor pCur is pointing at the existing copy of a row that is to be
6759 ** overwritten.  If the seekResult parameter is 0, then cursor pCur may
6760 ** point to any entry or to no entry at all and so this function has to seek
6761 ** the cursor before the new key can be inserted.
6762 */
6763 int sqlite3BtreeInsert(
6764   BtCursor *pCur,                /* Insert data into the table of this cursor */
6765   const void *pKey, i64 nKey,    /* The key of the new record */
6766   const void *pData, int nData,  /* The data of the new record */
6767   int nZero,                     /* Number of extra 0 bytes to append to data */
6768   int appendBias,                /* True if this is likely an append */
6769   int seekResult                 /* Result of prior MovetoUnpacked() call */
6770 ){
6771   int rc;
6772   int loc = seekResult;          /* -1: before desired location  +1: after */
6773   int szNew = 0;
6774   int idx;
6775   MemPage *pPage;
6776   Btree *p = pCur->pBtree;
6777   BtShared *pBt = p->pBt;
6778   unsigned char *oldCell;
6779   unsigned char *newCell = 0;
6780 
6781   if( pCur->eState==CURSOR_FAULT ){
6782     assert( pCur->skipNext!=SQLITE_OK );
6783     return pCur->skipNext;
6784   }
6785 
6786   assert( cursorHoldsMutex(pCur) );
6787   assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE
6788               && (pBt->btsFlags & BTS_READ_ONLY)==0 );
6789   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6790 
6791   /* Assert that the caller has been consistent. If this cursor was opened
6792   ** expecting an index b-tree, then the caller should be inserting blob
6793   ** keys with no associated data. If the cursor was opened expecting an
6794   ** intkey table, the caller should be inserting integer keys with a
6795   ** blob of associated data.  */
6796   assert( (pKey==0)==(pCur->pKeyInfo==0) );
6797 
6798   /* Save the positions of any other cursors open on this table.
6799   **
6800   ** In some cases, the call to btreeMoveto() below is a no-op. For
6801   ** example, when inserting data into a table with auto-generated integer
6802   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6803   ** integer key to use. It then calls this function to actually insert the
6804   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6805   ** that the cursor is already where it needs to be and returns without
6806   ** doing any work. To avoid thwarting these optimizations, it is important
6807   ** not to clear the cursor here.
6808   */
6809   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6810   if( rc ) return rc;
6811 
6812   /* If this is an insert into a table b-tree, invalidate any incrblob
6813   ** cursors open on the row being replaced (assuming this is a replace
6814   ** operation - if it is not, the following is a no-op).  */
6815   if( pCur->pKeyInfo==0 ){
6816     invalidateIncrblobCursors(p, nKey, 0);
6817   }
6818 
6819   if( !loc ){
6820     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6821     if( rc ) return rc;
6822   }
6823   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
6824 
6825   pPage = pCur->apPage[pCur->iPage];
6826   assert( pPage->intKey || nKey>=0 );
6827   assert( pPage->leaf || !pPage->intKey );
6828 
6829   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6830           pCur->pgnoRoot, nKey, nData, pPage->pgno,
6831           loc==0 ? "overwrite" : "new entry"));
6832   assert( pPage->isInit );
6833   allocateTempSpace(pBt);
6834   newCell = pBt->pTmpSpace;
6835   if( newCell==0 ) return SQLITE_NOMEM;
6836   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6837   if( rc ) goto end_insert;
6838   assert( szNew==cellSizePtr(pPage, newCell) );
6839   assert( szNew <= MX_CELL_SIZE(pBt) );
6840   idx = pCur->aiIdx[pCur->iPage];
6841   if( loc==0 ){
6842     u16 szOld;
6843     assert( idx<pPage->nCell );
6844     rc = sqlite3PagerWrite(pPage->pDbPage);
6845     if( rc ){
6846       goto end_insert;
6847     }
6848     oldCell = findCell(pPage, idx);
6849     if( !pPage->leaf ){
6850       memcpy(newCell, oldCell, 4);
6851     }
6852     szOld = cellSizePtr(pPage, oldCell);
6853     rc = clearCell(pPage, oldCell);
6854     dropCell(pPage, idx, szOld, &rc);
6855     if( rc ) goto end_insert;
6856   }else if( loc<0 && pPage->nCell>0 ){
6857     assert( pPage->leaf );
6858     idx = ++pCur->aiIdx[pCur->iPage];
6859   }else{
6860     assert( pPage->leaf );
6861   }
6862   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6863   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
6864 
6865   /* If no error has occured and pPage has an overflow cell, call balance()
6866   ** to redistribute the cells within the tree. Since balance() may move
6867   ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6868   ** variables.
6869   **
6870   ** Previous versions of SQLite called moveToRoot() to move the cursor
6871   ** back to the root page as balance() used to invalidate the contents
6872   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6873   ** set the cursor state to "invalid". This makes common insert operations
6874   ** slightly faster.
6875   **
6876   ** There is a subtle but important optimization here too. When inserting
6877   ** multiple records into an intkey b-tree using a single cursor (as can
6878   ** happen while processing an "INSERT INTO ... SELECT" statement), it
6879   ** is advantageous to leave the cursor pointing to the last entry in
6880   ** the b-tree if possible. If the cursor is left pointing to the last
6881   ** entry in the table, and the next row inserted has an integer key
6882   ** larger than the largest existing key, it is possible to insert the
6883   ** row without seeking the cursor. This can be a big performance boost.
6884   */
6885   pCur->info.nSize = 0;
6886   pCur->validNKey = 0;
6887   if( rc==SQLITE_OK && pPage->nOverflow ){
6888     rc = balance(pCur);
6889 
6890     /* Must make sure nOverflow is reset to zero even if the balance()
6891     ** fails. Internal data structure corruption will result otherwise.
6892     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6893     ** from trying to save the current position of the cursor.  */
6894     pCur->apPage[pCur->iPage]->nOverflow = 0;
6895     pCur->eState = CURSOR_INVALID;
6896   }
6897   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
6898 
6899 end_insert:
6900   return rc;
6901 }
6902 
6903 /*
6904 ** Delete the entry that the cursor is pointing to.  The cursor
6905 ** is left pointing at a arbitrary location.
6906 */
6907 int sqlite3BtreeDelete(BtCursor *pCur){
6908   Btree *p = pCur->pBtree;
6909   BtShared *pBt = p->pBt;
6910   int rc;                              /* Return code */
6911   MemPage *pPage;                      /* Page to delete cell from */
6912   unsigned char *pCell;                /* Pointer to cell to delete */
6913   int iCellIdx;                        /* Index of cell to delete */
6914   int iCellDepth;                      /* Depth of node containing pCell */
6915 
6916   assert( cursorHoldsMutex(pCur) );
6917   assert( pBt->inTransaction==TRANS_WRITE );
6918   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
6919   assert( pCur->wrFlag );
6920   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6921   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6922 
6923   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6924    || NEVER(pCur->eState!=CURSOR_VALID)
6925   ){
6926     return SQLITE_ERROR;  /* Something has gone awry. */
6927   }
6928 
6929   iCellDepth = pCur->iPage;
6930   iCellIdx = pCur->aiIdx[iCellDepth];
6931   pPage = pCur->apPage[iCellDepth];
6932   pCell = findCell(pPage, iCellIdx);
6933 
6934   /* If the page containing the entry to delete is not a leaf page, move
6935   ** the cursor to the largest entry in the tree that is smaller than
6936   ** the entry being deleted. This cell will replace the cell being deleted
6937   ** from the internal node. The 'previous' entry is used for this instead
6938   ** of the 'next' entry, as the previous entry is always a part of the
6939   ** sub-tree headed by the child page of the cell being deleted. This makes
6940   ** balancing the tree following the delete operation easier.  */
6941   if( !pPage->leaf ){
6942     int notUsed;
6943     rc = sqlite3BtreePrevious(pCur, &notUsed);
6944     if( rc ) return rc;
6945   }
6946 
6947   /* Save the positions of any other cursors open on this table before
6948   ** making any modifications. Make the page containing the entry to be
6949   ** deleted writable. Then free any overflow pages associated with the
6950   ** entry and finally remove the cell itself from within the page.
6951   */
6952   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6953   if( rc ) return rc;
6954 
6955   /* If this is a delete operation to remove a row from a table b-tree,
6956   ** invalidate any incrblob cursors open on the row being deleted.  */
6957   if( pCur->pKeyInfo==0 ){
6958     invalidateIncrblobCursors(p, pCur->info.nKey, 0);
6959   }
6960 
6961   rc = sqlite3PagerWrite(pPage->pDbPage);
6962   if( rc ) return rc;
6963   rc = clearCell(pPage, pCell);
6964   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
6965   if( rc ) return rc;
6966 
6967   /* If the cell deleted was not located on a leaf page, then the cursor
6968   ** is currently pointing to the largest entry in the sub-tree headed
6969   ** by the child-page of the cell that was just deleted from an internal
6970   ** node. The cell from the leaf node needs to be moved to the internal
6971   ** node to replace the deleted cell.  */
6972   if( !pPage->leaf ){
6973     MemPage *pLeaf = pCur->apPage[pCur->iPage];
6974     int nCell;
6975     Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6976     unsigned char *pTmp;
6977 
6978     pCell = findCell(pLeaf, pLeaf->nCell-1);
6979     nCell = cellSizePtr(pLeaf, pCell);
6980     assert( MX_CELL_SIZE(pBt) >= nCell );
6981 
6982     allocateTempSpace(pBt);
6983     pTmp = pBt->pTmpSpace;
6984 
6985     rc = sqlite3PagerWrite(pLeaf->pDbPage);
6986     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6987     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
6988     if( rc ) return rc;
6989   }
6990 
6991   /* Balance the tree. If the entry deleted was located on a leaf page,
6992   ** then the cursor still points to that page. In this case the first
6993   ** call to balance() repairs the tree, and the if(...) condition is
6994   ** never true.
6995   **
6996   ** Otherwise, if the entry deleted was on an internal node page, then
6997   ** pCur is pointing to the leaf page from which a cell was removed to
6998   ** replace the cell deleted from the internal node. This is slightly
6999   ** tricky as the leaf node may be underfull, and the internal node may
7000   ** be either under or overfull. In this case run the balancing algorithm
7001   ** on the leaf node first. If the balance proceeds far enough up the
7002   ** tree that we can be sure that any problem in the internal node has
7003   ** been corrected, so be it. Otherwise, after balancing the leaf node,
7004   ** walk the cursor up the tree to the internal node and balance it as
7005   ** well.  */
7006   rc = balance(pCur);
7007   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
7008     while( pCur->iPage>iCellDepth ){
7009       releasePage(pCur->apPage[pCur->iPage--]);
7010     }
7011     rc = balance(pCur);
7012   }
7013 
7014   if( rc==SQLITE_OK ){
7015     moveToRoot(pCur);
7016   }
7017   return rc;
7018 }
7019 
7020 /*
7021 ** Create a new BTree table.  Write into *piTable the page
7022 ** number for the root page of the new table.
7023 **
7024 ** The type of type is determined by the flags parameter.  Only the
7025 ** following values of flags are currently in use.  Other values for
7026 ** flags might not work:
7027 **
7028 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
7029 **     BTREE_ZERODATA                  Used for SQL indices
7030 */
7031 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
7032   BtShared *pBt = p->pBt;
7033   MemPage *pRoot;
7034   Pgno pgnoRoot;
7035   int rc;
7036   int ptfFlags;          /* Page-type flage for the root page of new table */
7037 
7038   assert( sqlite3BtreeHoldsMutex(p) );
7039   assert( pBt->inTransaction==TRANS_WRITE );
7040   assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
7041 
7042 #ifdef SQLITE_OMIT_AUTOVACUUM
7043   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7044   if( rc ){
7045     return rc;
7046   }
7047 #else
7048   if( pBt->autoVacuum ){
7049     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
7050     MemPage *pPageMove; /* The page to move to. */
7051 
7052     /* Creating a new table may probably require moving an existing database
7053     ** to make room for the new tables root page. In case this page turns
7054     ** out to be an overflow page, delete all overflow page-map caches
7055     ** held by open cursors.
7056     */
7057     invalidateAllOverflowCache(pBt);
7058 
7059     /* Read the value of meta[3] from the database to determine where the
7060     ** root page of the new table should go. meta[3] is the largest root-page
7061     ** created so far, so the new root-page is (meta[3]+1).
7062     */
7063     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
7064     pgnoRoot++;
7065 
7066     /* The new root-page may not be allocated on a pointer-map page, or the
7067     ** PENDING_BYTE page.
7068     */
7069     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
7070         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
7071       pgnoRoot++;
7072     }
7073     assert( pgnoRoot>=3 );
7074 
7075     /* Allocate a page. The page that currently resides at pgnoRoot will
7076     ** be moved to the allocated page (unless the allocated page happens
7077     ** to reside at pgnoRoot).
7078     */
7079     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
7080     if( rc!=SQLITE_OK ){
7081       return rc;
7082     }
7083 
7084     if( pgnoMove!=pgnoRoot ){
7085       /* pgnoRoot is the page that will be used for the root-page of
7086       ** the new table (assuming an error did not occur). But we were
7087       ** allocated pgnoMove. If required (i.e. if it was not allocated
7088       ** by extending the file), the current page at position pgnoMove
7089       ** is already journaled.
7090       */
7091       u8 eType = 0;
7092       Pgno iPtrPage = 0;
7093 
7094       releasePage(pPageMove);
7095 
7096       /* Move the page currently at pgnoRoot to pgnoMove. */
7097       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7098       if( rc!=SQLITE_OK ){
7099         return rc;
7100       }
7101       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
7102       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
7103         rc = SQLITE_CORRUPT_BKPT;
7104       }
7105       if( rc!=SQLITE_OK ){
7106         releasePage(pRoot);
7107         return rc;
7108       }
7109       assert( eType!=PTRMAP_ROOTPAGE );
7110       assert( eType!=PTRMAP_FREEPAGE );
7111       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
7112       releasePage(pRoot);
7113 
7114       /* Obtain the page at pgnoRoot */
7115       if( rc!=SQLITE_OK ){
7116         return rc;
7117       }
7118       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
7119       if( rc!=SQLITE_OK ){
7120         return rc;
7121       }
7122       rc = sqlite3PagerWrite(pRoot->pDbPage);
7123       if( rc!=SQLITE_OK ){
7124         releasePage(pRoot);
7125         return rc;
7126       }
7127     }else{
7128       pRoot = pPageMove;
7129     }
7130 
7131     /* Update the pointer-map and meta-data with the new root-page number. */
7132     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7133     if( rc ){
7134       releasePage(pRoot);
7135       return rc;
7136     }
7137 
7138     /* When the new root page was allocated, page 1 was made writable in
7139     ** order either to increase the database filesize, or to decrement the
7140     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7141     */
7142     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7143     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7144     if( NEVER(rc) ){
7145       releasePage(pRoot);
7146       return rc;
7147     }
7148 
7149   }else{
7150     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7151     if( rc ) return rc;
7152   }
7153 #endif
7154   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7155   if( createTabFlags & BTREE_INTKEY ){
7156     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7157   }else{
7158     ptfFlags = PTF_ZERODATA | PTF_LEAF;
7159   }
7160   zeroPage(pRoot, ptfFlags);
7161   sqlite3PagerUnref(pRoot->pDbPage);
7162   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7163   *piTable = (int)pgnoRoot;
7164   return SQLITE_OK;
7165 }
7166 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
7167   int rc;
7168   sqlite3BtreeEnter(p);
7169   rc = btreeCreateTable(p, piTable, flags);
7170   sqlite3BtreeLeave(p);
7171   return rc;
7172 }
7173 
7174 /*
7175 ** Erase the given database page and all its children.  Return
7176 ** the page to the freelist.
7177 */
7178 static int clearDatabasePage(
7179   BtShared *pBt,           /* The BTree that contains the table */
7180   Pgno pgno,               /* Page number to clear */
7181   int freePageFlag,        /* Deallocate page if true */
7182   int *pnChange            /* Add number of Cells freed to this counter */
7183 ){
7184   MemPage *pPage;
7185   int rc;
7186   unsigned char *pCell;
7187   int i;
7188 
7189   assert( sqlite3_mutex_held(pBt->mutex) );
7190   if( pgno>btreePagecount(pBt) ){
7191     return SQLITE_CORRUPT_BKPT;
7192   }
7193 
7194   rc = getAndInitPage(pBt, pgno, &pPage);
7195   if( rc ) return rc;
7196   for(i=0; i<pPage->nCell; i++){
7197     pCell = findCell(pPage, i);
7198     if( !pPage->leaf ){
7199       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7200       if( rc ) goto cleardatabasepage_out;
7201     }
7202     rc = clearCell(pPage, pCell);
7203     if( rc ) goto cleardatabasepage_out;
7204   }
7205   if( !pPage->leaf ){
7206     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
7207     if( rc ) goto cleardatabasepage_out;
7208   }else if( pnChange ){
7209     assert( pPage->intKey );
7210     *pnChange += pPage->nCell;
7211   }
7212   if( freePageFlag ){
7213     freePage(pPage, &rc);
7214   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7215     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
7216   }
7217 
7218 cleardatabasepage_out:
7219   releasePage(pPage);
7220   return rc;
7221 }
7222 
7223 /*
7224 ** Delete all information from a single table in the database.  iTable is
7225 ** the page number of the root of the table.  After this routine returns,
7226 ** the root page is empty, but still exists.
7227 **
7228 ** This routine will fail with SQLITE_LOCKED if there are any open
7229 ** read cursors on the table.  Open write cursors are moved to the
7230 ** root of the table.
7231 **
7232 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7233 ** integer value pointed to by pnChange is incremented by the number of
7234 ** entries in the table.
7235 */
7236 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7237   int rc;
7238   BtShared *pBt = p->pBt;
7239   sqlite3BtreeEnter(p);
7240   assert( p->inTrans==TRANS_WRITE );
7241 
7242   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7243 
7244   if( SQLITE_OK==rc ){
7245     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7246     ** is the root of a table b-tree - if it is not, the following call is
7247     ** a no-op).  */
7248     invalidateIncrblobCursors(p, 0, 1);
7249     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7250   }
7251   sqlite3BtreeLeave(p);
7252   return rc;
7253 }
7254 
7255 /*
7256 ** Erase all information in a table and add the root of the table to
7257 ** the freelist.  Except, the root of the principle table (the one on
7258 ** page 1) is never added to the freelist.
7259 **
7260 ** This routine will fail with SQLITE_LOCKED if there are any open
7261 ** cursors on the table.
7262 **
7263 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7264 ** root page in the database file, then the last root page
7265 ** in the database file is moved into the slot formerly occupied by
7266 ** iTable and that last slot formerly occupied by the last root page
7267 ** is added to the freelist instead of iTable.  In this say, all
7268 ** root pages are kept at the beginning of the database file, which
7269 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
7270 ** page number that used to be the last root page in the file before
7271 ** the move.  If no page gets moved, *piMoved is set to 0.
7272 ** The last root page is recorded in meta[3] and the value of
7273 ** meta[3] is updated by this procedure.
7274 */
7275 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
7276   int rc;
7277   MemPage *pPage = 0;
7278   BtShared *pBt = p->pBt;
7279 
7280   assert( sqlite3BtreeHoldsMutex(p) );
7281   assert( p->inTrans==TRANS_WRITE );
7282 
7283   /* It is illegal to drop a table if any cursors are open on the
7284   ** database. This is because in auto-vacuum mode the backend may
7285   ** need to move another root-page to fill a gap left by the deleted
7286   ** root page. If an open cursor was using this page a problem would
7287   ** occur.
7288   **
7289   ** This error is caught long before control reaches this point.
7290   */
7291   if( NEVER(pBt->pCursor) ){
7292     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
7293     return SQLITE_LOCKED_SHAREDCACHE;
7294   }
7295 
7296   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
7297   if( rc ) return rc;
7298   rc = sqlite3BtreeClearTable(p, iTable, 0);
7299   if( rc ){
7300     releasePage(pPage);
7301     return rc;
7302   }
7303 
7304   *piMoved = 0;
7305 
7306   if( iTable>1 ){
7307 #ifdef SQLITE_OMIT_AUTOVACUUM
7308     freePage(pPage, &rc);
7309     releasePage(pPage);
7310 #else
7311     if( pBt->autoVacuum ){
7312       Pgno maxRootPgno;
7313       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
7314 
7315       if( iTable==maxRootPgno ){
7316         /* If the table being dropped is the table with the largest root-page
7317         ** number in the database, put the root page on the free list.
7318         */
7319         freePage(pPage, &rc);
7320         releasePage(pPage);
7321         if( rc!=SQLITE_OK ){
7322           return rc;
7323         }
7324       }else{
7325         /* The table being dropped does not have the largest root-page
7326         ** number in the database. So move the page that does into the
7327         ** gap left by the deleted root-page.
7328         */
7329         MemPage *pMove;
7330         releasePage(pPage);
7331         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7332         if( rc!=SQLITE_OK ){
7333           return rc;
7334         }
7335         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
7336         releasePage(pMove);
7337         if( rc!=SQLITE_OK ){
7338           return rc;
7339         }
7340         pMove = 0;
7341         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7342         freePage(pMove, &rc);
7343         releasePage(pMove);
7344         if( rc!=SQLITE_OK ){
7345           return rc;
7346         }
7347         *piMoved = maxRootPgno;
7348       }
7349 
7350       /* Set the new 'max-root-page' value in the database header. This
7351       ** is the old value less one, less one more if that happens to
7352       ** be a root-page number, less one again if that is the
7353       ** PENDING_BYTE_PAGE.
7354       */
7355       maxRootPgno--;
7356       while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7357              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7358         maxRootPgno--;
7359       }
7360       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7361 
7362       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7363     }else{
7364       freePage(pPage, &rc);
7365       releasePage(pPage);
7366     }
7367 #endif
7368   }else{
7369     /* If sqlite3BtreeDropTable was called on page 1.
7370     ** This really never should happen except in a corrupt
7371     ** database.
7372     */
7373     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7374     releasePage(pPage);
7375   }
7376   return rc;
7377 }
7378 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7379   int rc;
7380   sqlite3BtreeEnter(p);
7381   rc = btreeDropTable(p, iTable, piMoved);
7382   sqlite3BtreeLeave(p);
7383   return rc;
7384 }
7385 
7386 
7387 /*
7388 ** This function may only be called if the b-tree connection already
7389 ** has a read or write transaction open on the database.
7390 **
7391 ** Read the meta-information out of a database file.  Meta[0]
7392 ** is the number of free pages currently in the database.  Meta[1]
7393 ** through meta[15] are available for use by higher layers.  Meta[0]
7394 ** is read-only, the others are read/write.
7395 **
7396 ** The schema layer numbers meta values differently.  At the schema
7397 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7398 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
7399 */
7400 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7401   BtShared *pBt = p->pBt;
7402 
7403   sqlite3BtreeEnter(p);
7404   assert( p->inTrans>TRANS_NONE );
7405   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7406   assert( pBt->pPage1 );
7407   assert( idx>=0 && idx<=15 );
7408 
7409   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7410 
7411   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7412   ** database, mark the database as read-only.  */
7413 #ifdef SQLITE_OMIT_AUTOVACUUM
7414   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
7415     pBt->btsFlags |= BTS_READ_ONLY;
7416   }
7417 #endif
7418 
7419   sqlite3BtreeLeave(p);
7420 }
7421 
7422 /*
7423 ** Write meta-information back into the database.  Meta[0] is
7424 ** read-only and may not be written.
7425 */
7426 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7427   BtShared *pBt = p->pBt;
7428   unsigned char *pP1;
7429   int rc;
7430   assert( idx>=1 && idx<=15 );
7431   sqlite3BtreeEnter(p);
7432   assert( p->inTrans==TRANS_WRITE );
7433   assert( pBt->pPage1!=0 );
7434   pP1 = pBt->pPage1->aData;
7435   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7436   if( rc==SQLITE_OK ){
7437     put4byte(&pP1[36 + idx*4], iMeta);
7438 #ifndef SQLITE_OMIT_AUTOVACUUM
7439     if( idx==BTREE_INCR_VACUUM ){
7440       assert( pBt->autoVacuum || iMeta==0 );
7441       assert( iMeta==0 || iMeta==1 );
7442       pBt->incrVacuum = (u8)iMeta;
7443     }
7444 #endif
7445   }
7446   sqlite3BtreeLeave(p);
7447   return rc;
7448 }
7449 
7450 #ifndef SQLITE_OMIT_BTREECOUNT
7451 /*
7452 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7453 ** number of entries in the b-tree and write the result to *pnEntry.
7454 **
7455 ** SQLITE_OK is returned if the operation is successfully executed.
7456 ** Otherwise, if an error is encountered (i.e. an IO error or database
7457 ** corruption) an SQLite error code is returned.
7458 */
7459 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7460   i64 nEntry = 0;                      /* Value to return in *pnEntry */
7461   int rc;                              /* Return code */
7462 
7463   if( pCur->pgnoRoot==0 ){
7464     *pnEntry = 0;
7465     return SQLITE_OK;
7466   }
7467   rc = moveToRoot(pCur);
7468 
7469   /* Unless an error occurs, the following loop runs one iteration for each
7470   ** page in the B-Tree structure (not including overflow pages).
7471   */
7472   while( rc==SQLITE_OK ){
7473     int iIdx;                          /* Index of child node in parent */
7474     MemPage *pPage;                    /* Current page of the b-tree */
7475 
7476     /* If this is a leaf page or the tree is not an int-key tree, then
7477     ** this page contains countable entries. Increment the entry counter
7478     ** accordingly.
7479     */
7480     pPage = pCur->apPage[pCur->iPage];
7481     if( pPage->leaf || !pPage->intKey ){
7482       nEntry += pPage->nCell;
7483     }
7484 
7485     /* pPage is a leaf node. This loop navigates the cursor so that it
7486     ** points to the first interior cell that it points to the parent of
7487     ** the next page in the tree that has not yet been visited. The
7488     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7489     ** of the page, or to the number of cells in the page if the next page
7490     ** to visit is the right-child of its parent.
7491     **
7492     ** If all pages in the tree have been visited, return SQLITE_OK to the
7493     ** caller.
7494     */
7495     if( pPage->leaf ){
7496       do {
7497         if( pCur->iPage==0 ){
7498           /* All pages of the b-tree have been visited. Return successfully. */
7499           *pnEntry = nEntry;
7500           return SQLITE_OK;
7501         }
7502         moveToParent(pCur);
7503       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7504 
7505       pCur->aiIdx[pCur->iPage]++;
7506       pPage = pCur->apPage[pCur->iPage];
7507     }
7508 
7509     /* Descend to the child node of the cell that the cursor currently
7510     ** points at. This is the right-child if (iIdx==pPage->nCell).
7511     */
7512     iIdx = pCur->aiIdx[pCur->iPage];
7513     if( iIdx==pPage->nCell ){
7514       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7515     }else{
7516       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7517     }
7518   }
7519 
7520   /* An error has occurred. Return an error code. */
7521   return rc;
7522 }
7523 #endif
7524 
7525 /*
7526 ** Return the pager associated with a BTree.  This routine is used for
7527 ** testing and debugging only.
7528 */
7529 Pager *sqlite3BtreePager(Btree *p){
7530   return p->pBt->pPager;
7531 }
7532 
7533 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7534 /*
7535 ** Append a message to the error message string.
7536 */
7537 static void checkAppendMsg(
7538   IntegrityCk *pCheck,
7539   char *zMsg1,
7540   const char *zFormat,
7541   ...
7542 ){
7543   va_list ap;
7544   if( !pCheck->mxErr ) return;
7545   pCheck->mxErr--;
7546   pCheck->nErr++;
7547   va_start(ap, zFormat);
7548   if( pCheck->errMsg.nChar ){
7549     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7550   }
7551   if( zMsg1 ){
7552     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7553   }
7554   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7555   va_end(ap);
7556   if( pCheck->errMsg.mallocFailed ){
7557     pCheck->mallocFailed = 1;
7558   }
7559 }
7560 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7561 
7562 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7563 
7564 /*
7565 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
7566 ** corresponds to page iPg is already set.
7567 */
7568 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7569   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7570   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
7571 }
7572 
7573 /*
7574 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
7575 */
7576 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
7577   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
7578   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
7579 }
7580 
7581 
7582 /*
7583 ** Add 1 to the reference count for page iPage.  If this is the second
7584 ** reference to the page, add an error message to pCheck->zErrMsg.
7585 ** Return 1 if there are 2 ore more references to the page and 0 if
7586 ** if this is the first reference to the page.
7587 **
7588 ** Also check that the page number is in bounds.
7589 */
7590 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7591   if( iPage==0 ) return 1;
7592   if( iPage>pCheck->nPage ){
7593     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7594     return 1;
7595   }
7596   if( getPageReferenced(pCheck, iPage) ){
7597     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7598     return 1;
7599   }
7600   setPageReferenced(pCheck, iPage);
7601   return 0;
7602 }
7603 
7604 #ifndef SQLITE_OMIT_AUTOVACUUM
7605 /*
7606 ** Check that the entry in the pointer-map for page iChild maps to
7607 ** page iParent, pointer type ptrType. If not, append an error message
7608 ** to pCheck.
7609 */
7610 static void checkPtrmap(
7611   IntegrityCk *pCheck,   /* Integrity check context */
7612   Pgno iChild,           /* Child page number */
7613   u8 eType,              /* Expected pointer map type */
7614   Pgno iParent,          /* Expected pointer map parent page number */
7615   char *zContext         /* Context description (used for error msg) */
7616 ){
7617   int rc;
7618   u8 ePtrmapType;
7619   Pgno iPtrmapParent;
7620 
7621   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7622   if( rc!=SQLITE_OK ){
7623     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7624     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7625     return;
7626   }
7627 
7628   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7629     checkAppendMsg(pCheck, zContext,
7630       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7631       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7632   }
7633 }
7634 #endif
7635 
7636 /*
7637 ** Check the integrity of the freelist or of an overflow page list.
7638 ** Verify that the number of pages on the list is N.
7639 */
7640 static void checkList(
7641   IntegrityCk *pCheck,  /* Integrity checking context */
7642   int isFreeList,       /* True for a freelist.  False for overflow page list */
7643   int iPage,            /* Page number for first page in the list */
7644   int N,                /* Expected number of pages in the list */
7645   char *zContext        /* Context for error messages */
7646 ){
7647   int i;
7648   int expected = N;
7649   int iFirst = iPage;
7650   while( N-- > 0 && pCheck->mxErr ){
7651     DbPage *pOvflPage;
7652     unsigned char *pOvflData;
7653     if( iPage<1 ){
7654       checkAppendMsg(pCheck, zContext,
7655          "%d of %d pages missing from overflow list starting at %d",
7656           N+1, expected, iFirst);
7657       break;
7658     }
7659     if( checkRef(pCheck, iPage, zContext) ) break;
7660     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7661       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7662       break;
7663     }
7664     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7665     if( isFreeList ){
7666       int n = get4byte(&pOvflData[4]);
7667 #ifndef SQLITE_OMIT_AUTOVACUUM
7668       if( pCheck->pBt->autoVacuum ){
7669         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7670       }
7671 #endif
7672       if( n>(int)pCheck->pBt->usableSize/4-2 ){
7673         checkAppendMsg(pCheck, zContext,
7674            "freelist leaf count too big on page %d", iPage);
7675         N--;
7676       }else{
7677         for(i=0; i<n; i++){
7678           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7679 #ifndef SQLITE_OMIT_AUTOVACUUM
7680           if( pCheck->pBt->autoVacuum ){
7681             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7682           }
7683 #endif
7684           checkRef(pCheck, iFreePage, zContext);
7685         }
7686         N -= n;
7687       }
7688     }
7689 #ifndef SQLITE_OMIT_AUTOVACUUM
7690     else{
7691       /* If this database supports auto-vacuum and iPage is not the last
7692       ** page in this overflow list, check that the pointer-map entry for
7693       ** the following page matches iPage.
7694       */
7695       if( pCheck->pBt->autoVacuum && N>0 ){
7696         i = get4byte(pOvflData);
7697         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7698       }
7699     }
7700 #endif
7701     iPage = get4byte(pOvflData);
7702     sqlite3PagerUnref(pOvflPage);
7703   }
7704 }
7705 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7706 
7707 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7708 /*
7709 ** Do various sanity checks on a single page of a tree.  Return
7710 ** the tree depth.  Root pages return 0.  Parents of root pages
7711 ** return 1, and so forth.
7712 **
7713 ** These checks are done:
7714 **
7715 **      1.  Make sure that cells and freeblocks do not overlap
7716 **          but combine to completely cover the page.
7717 **  NO  2.  Make sure cell keys are in order.
7718 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
7719 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
7720 **      5.  Check the integrity of overflow pages.
7721 **      6.  Recursively call checkTreePage on all children.
7722 **      7.  Verify that the depth of all children is the same.
7723 **      8.  Make sure this page is at least 33% full or else it is
7724 **          the root of the tree.
7725 */
7726 static int checkTreePage(
7727   IntegrityCk *pCheck,  /* Context for the sanity check */
7728   int iPage,            /* Page number of the page to check */
7729   char *zParentContext, /* Parent context */
7730   i64 *pnParentMinKey,
7731   i64 *pnParentMaxKey
7732 ){
7733   MemPage *pPage;
7734   int i, rc, depth, d2, pgno, cnt;
7735   int hdr, cellStart;
7736   int nCell;
7737   u8 *data;
7738   BtShared *pBt;
7739   int usableSize;
7740   char zContext[100];
7741   char *hit = 0;
7742   i64 nMinKey = 0;
7743   i64 nMaxKey = 0;
7744 
7745   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7746 
7747   /* Check that the page exists
7748   */
7749   pBt = pCheck->pBt;
7750   usableSize = pBt->usableSize;
7751   if( iPage==0 ) return 0;
7752   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7753   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7754     checkAppendMsg(pCheck, zContext,
7755        "unable to get the page. error code=%d", rc);
7756     return 0;
7757   }
7758 
7759   /* Clear MemPage.isInit to make sure the corruption detection code in
7760   ** btreeInitPage() is executed.  */
7761   pPage->isInit = 0;
7762   if( (rc = btreeInitPage(pPage))!=0 ){
7763     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
7764     checkAppendMsg(pCheck, zContext,
7765                    "btreeInitPage() returns error code %d", rc);
7766     releasePage(pPage);
7767     return 0;
7768   }
7769 
7770   /* Check out all the cells.
7771   */
7772   depth = 0;
7773   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7774     u8 *pCell;
7775     u32 sz;
7776     CellInfo info;
7777 
7778     /* Check payload overflow pages
7779     */
7780     sqlite3_snprintf(sizeof(zContext), zContext,
7781              "On tree page %d cell %d: ", iPage, i);
7782     pCell = findCell(pPage,i);
7783     btreeParseCellPtr(pPage, pCell, &info);
7784     sz = info.nData;
7785     if( !pPage->intKey ) sz += (int)info.nKey;
7786     /* For intKey pages, check that the keys are in order.
7787     */
7788     else if( i==0 ) nMinKey = nMaxKey = info.nKey;
7789     else{
7790       if( info.nKey <= nMaxKey ){
7791         checkAppendMsg(pCheck, zContext,
7792             "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
7793       }
7794       nMaxKey = info.nKey;
7795     }
7796     assert( sz==info.nPayload );
7797     if( (sz>info.nLocal)
7798      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7799     ){
7800       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
7801       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7802 #ifndef SQLITE_OMIT_AUTOVACUUM
7803       if( pBt->autoVacuum ){
7804         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
7805       }
7806 #endif
7807       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
7808     }
7809 
7810     /* Check sanity of left child page.
7811     */
7812     if( !pPage->leaf ){
7813       pgno = get4byte(pCell);
7814 #ifndef SQLITE_OMIT_AUTOVACUUM
7815       if( pBt->autoVacuum ){
7816         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7817       }
7818 #endif
7819       d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
7820       if( i>0 && d2!=depth ){
7821         checkAppendMsg(pCheck, zContext, "Child page depth differs");
7822       }
7823       depth = d2;
7824     }
7825   }
7826 
7827   if( !pPage->leaf ){
7828     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7829     sqlite3_snprintf(sizeof(zContext), zContext,
7830                      "On page %d at right child: ", iPage);
7831 #ifndef SQLITE_OMIT_AUTOVACUUM
7832     if( pBt->autoVacuum ){
7833       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7834     }
7835 #endif
7836     checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
7837   }
7838 
7839   /* For intKey leaf pages, check that the min/max keys are in order
7840   ** with any left/parent/right pages.
7841   */
7842   if( pPage->leaf && pPage->intKey ){
7843     /* if we are a left child page */
7844     if( pnParentMinKey ){
7845       /* if we are the left most child page */
7846       if( !pnParentMaxKey ){
7847         if( nMaxKey > *pnParentMinKey ){
7848           checkAppendMsg(pCheck, zContext,
7849               "Rowid %lld out of order (max larger than parent min of %lld)",
7850               nMaxKey, *pnParentMinKey);
7851         }
7852       }else{
7853         if( nMinKey <= *pnParentMinKey ){
7854           checkAppendMsg(pCheck, zContext,
7855               "Rowid %lld out of order (min less than parent min of %lld)",
7856               nMinKey, *pnParentMinKey);
7857         }
7858         if( nMaxKey > *pnParentMaxKey ){
7859           checkAppendMsg(pCheck, zContext,
7860               "Rowid %lld out of order (max larger than parent max of %lld)",
7861               nMaxKey, *pnParentMaxKey);
7862         }
7863         *pnParentMinKey = nMaxKey;
7864       }
7865     /* else if we're a right child page */
7866     } else if( pnParentMaxKey ){
7867       if( nMinKey <= *pnParentMaxKey ){
7868         checkAppendMsg(pCheck, zContext,
7869             "Rowid %lld out of order (min less than parent max of %lld)",
7870             nMinKey, *pnParentMaxKey);
7871       }
7872     }
7873   }
7874 
7875   /* Check for complete coverage of the page
7876   */
7877   data = pPage->aData;
7878   hdr = pPage->hdrOffset;
7879   hit = sqlite3PageMalloc( pBt->pageSize );
7880   if( hit==0 ){
7881     pCheck->mallocFailed = 1;
7882   }else{
7883     int contentOffset = get2byteNotZero(&data[hdr+5]);
7884     assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
7885     memset(hit+contentOffset, 0, usableSize-contentOffset);
7886     memset(hit, 1, contentOffset);
7887     nCell = get2byte(&data[hdr+3]);
7888     cellStart = hdr + 12 - 4*pPage->leaf;
7889     for(i=0; i<nCell; i++){
7890       int pc = get2byte(&data[cellStart+i*2]);
7891       u32 size = 65536;
7892       int j;
7893       if( pc<=usableSize-4 ){
7894         size = cellSizePtr(pPage, &data[pc]);
7895       }
7896       if( (int)(pc+size-1)>=usableSize ){
7897         checkAppendMsg(pCheck, 0,
7898             "Corruption detected in cell %d on page %d",i,iPage);
7899       }else{
7900         for(j=pc+size-1; j>=pc; j--) hit[j]++;
7901       }
7902     }
7903     i = get2byte(&data[hdr+1]);
7904     while( i>0 ){
7905       int size, j;
7906       assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
7907       size = get2byte(&data[i+2]);
7908       assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
7909       for(j=i+size-1; j>=i; j--) hit[j]++;
7910       j = get2byte(&data[i]);
7911       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
7912       assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
7913       i = j;
7914     }
7915     for(i=cnt=0; i<usableSize; i++){
7916       if( hit[i]==0 ){
7917         cnt++;
7918       }else if( hit[i]>1 ){
7919         checkAppendMsg(pCheck, 0,
7920           "Multiple uses for byte %d of page %d", i, iPage);
7921         break;
7922       }
7923     }
7924     if( cnt!=data[hdr+7] ){
7925       checkAppendMsg(pCheck, 0,
7926           "Fragmentation of %d bytes reported as %d on page %d",
7927           cnt, data[hdr+7], iPage);
7928     }
7929   }
7930   sqlite3PageFree(hit);
7931   releasePage(pPage);
7932   return depth+1;
7933 }
7934 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7935 
7936 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7937 /*
7938 ** This routine does a complete check of the given BTree file.  aRoot[] is
7939 ** an array of pages numbers were each page number is the root page of
7940 ** a table.  nRoot is the number of entries in aRoot.
7941 **
7942 ** A read-only or read-write transaction must be opened before calling
7943 ** this function.
7944 **
7945 ** Write the number of error seen in *pnErr.  Except for some memory
7946 ** allocation errors,  an error message held in memory obtained from
7947 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
7948 ** returned.  If a memory allocation error occurs, NULL is returned.
7949 */
7950 char *sqlite3BtreeIntegrityCheck(
7951   Btree *p,     /* The btree to be checked */
7952   int *aRoot,   /* An array of root pages numbers for individual trees */
7953   int nRoot,    /* Number of entries in aRoot[] */
7954   int mxErr,    /* Stop reporting errors after this many */
7955   int *pnErr    /* Write number of errors seen to this variable */
7956 ){
7957   Pgno i;
7958   int nRef;
7959   IntegrityCk sCheck;
7960   BtShared *pBt = p->pBt;
7961   char zErr[100];
7962 
7963   sqlite3BtreeEnter(p);
7964   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7965   nRef = sqlite3PagerRefcount(pBt->pPager);
7966   sCheck.pBt = pBt;
7967   sCheck.pPager = pBt->pPager;
7968   sCheck.nPage = btreePagecount(sCheck.pBt);
7969   sCheck.mxErr = mxErr;
7970   sCheck.nErr = 0;
7971   sCheck.mallocFailed = 0;
7972   *pnErr = 0;
7973   if( sCheck.nPage==0 ){
7974     sqlite3BtreeLeave(p);
7975     return 0;
7976   }
7977 
7978   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
7979   if( !sCheck.aPgRef ){
7980     *pnErr = 1;
7981     sqlite3BtreeLeave(p);
7982     return 0;
7983   }
7984   i = PENDING_BYTE_PAGE(pBt);
7985   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
7986   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7987   sCheck.errMsg.useMalloc = 2;
7988 
7989   /* Check the integrity of the freelist
7990   */
7991   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7992             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7993 
7994   /* Check all the tables.
7995   */
7996   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7997     if( aRoot[i]==0 ) continue;
7998 #ifndef SQLITE_OMIT_AUTOVACUUM
7999     if( pBt->autoVacuum && aRoot[i]>1 ){
8000       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
8001     }
8002 #endif
8003     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
8004   }
8005 
8006   /* Make sure every page in the file is referenced
8007   */
8008   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
8009 #ifdef SQLITE_OMIT_AUTOVACUUM
8010     if( getPageReferenced(&sCheck, i)==0 ){
8011       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8012     }
8013 #else
8014     /* If the database supports auto-vacuum, make sure no tables contain
8015     ** references to pointer-map pages.
8016     */
8017     if( getPageReferenced(&sCheck, i)==0 &&
8018        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
8019       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
8020     }
8021     if( getPageReferenced(&sCheck, i)!=0 &&
8022        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
8023       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
8024     }
8025 #endif
8026   }
8027 
8028   /* Make sure this analysis did not leave any unref() pages.
8029   ** This is an internal consistency check; an integrity check
8030   ** of the integrity check.
8031   */
8032   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
8033     checkAppendMsg(&sCheck, 0,
8034       "Outstanding page count goes from %d to %d during this analysis",
8035       nRef, sqlite3PagerRefcount(pBt->pPager)
8036     );
8037   }
8038 
8039   /* Clean  up and report errors.
8040   */
8041   sqlite3BtreeLeave(p);
8042   sqlite3_free(sCheck.aPgRef);
8043   if( sCheck.mallocFailed ){
8044     sqlite3StrAccumReset(&sCheck.errMsg);
8045     *pnErr = sCheck.nErr+1;
8046     return 0;
8047   }
8048   *pnErr = sCheck.nErr;
8049   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
8050   return sqlite3StrAccumFinish(&sCheck.errMsg);
8051 }
8052 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
8053 
8054 /*
8055 ** Return the full pathname of the underlying database file.  Return
8056 ** an empty string if the database is in-memory or a TEMP database.
8057 **
8058 ** The pager filename is invariant as long as the pager is
8059 ** open so it is safe to access without the BtShared mutex.
8060 */
8061 const char *sqlite3BtreeGetFilename(Btree *p){
8062   assert( p->pBt->pPager!=0 );
8063   return sqlite3PagerFilename(p->pBt->pPager, 1);
8064 }
8065 
8066 /*
8067 ** Return the pathname of the journal file for this database. The return
8068 ** value of this routine is the same regardless of whether the journal file
8069 ** has been created or not.
8070 **
8071 ** The pager journal filename is invariant as long as the pager is
8072 ** open so it is safe to access without the BtShared mutex.
8073 */
8074 const char *sqlite3BtreeGetJournalname(Btree *p){
8075   assert( p->pBt->pPager!=0 );
8076   return sqlite3PagerJournalname(p->pBt->pPager);
8077 }
8078 
8079 /*
8080 ** Return non-zero if a transaction is active.
8081 */
8082 int sqlite3BtreeIsInTrans(Btree *p){
8083   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
8084   return (p && (p->inTrans==TRANS_WRITE));
8085 }
8086 
8087 #ifndef SQLITE_OMIT_WAL
8088 /*
8089 ** Run a checkpoint on the Btree passed as the first argument.
8090 **
8091 ** Return SQLITE_LOCKED if this or any other connection has an open
8092 ** transaction on the shared-cache the argument Btree is connected to.
8093 **
8094 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
8095 */
8096 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
8097   int rc = SQLITE_OK;
8098   if( p ){
8099     BtShared *pBt = p->pBt;
8100     sqlite3BtreeEnter(p);
8101     if( pBt->inTransaction!=TRANS_NONE ){
8102       rc = SQLITE_LOCKED;
8103     }else{
8104       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
8105     }
8106     sqlite3BtreeLeave(p);
8107   }
8108   return rc;
8109 }
8110 #endif
8111 
8112 /*
8113 ** Return non-zero if a read (or write) transaction is active.
8114 */
8115 int sqlite3BtreeIsInReadTrans(Btree *p){
8116   assert( p );
8117   assert( sqlite3_mutex_held(p->db->mutex) );
8118   return p->inTrans!=TRANS_NONE;
8119 }
8120 
8121 int sqlite3BtreeIsInBackup(Btree *p){
8122   assert( p );
8123   assert( sqlite3_mutex_held(p->db->mutex) );
8124   return p->nBackup!=0;
8125 }
8126 
8127 /*
8128 ** This function returns a pointer to a blob of memory associated with
8129 ** a single shared-btree. The memory is used by client code for its own
8130 ** purposes (for example, to store a high-level schema associated with
8131 ** the shared-btree). The btree layer manages reference counting issues.
8132 **
8133 ** The first time this is called on a shared-btree, nBytes bytes of memory
8134 ** are allocated, zeroed, and returned to the caller. For each subsequent
8135 ** call the nBytes parameter is ignored and a pointer to the same blob
8136 ** of memory returned.
8137 **
8138 ** If the nBytes parameter is 0 and the blob of memory has not yet been
8139 ** allocated, a null pointer is returned. If the blob has already been
8140 ** allocated, it is returned as normal.
8141 **
8142 ** Just before the shared-btree is closed, the function passed as the
8143 ** xFree argument when the memory allocation was made is invoked on the
8144 ** blob of allocated memory. The xFree function should not call sqlite3_free()
8145 ** on the memory, the btree layer does that.
8146 */
8147 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
8148   BtShared *pBt = p->pBt;
8149   sqlite3BtreeEnter(p);
8150   if( !pBt->pSchema && nBytes ){
8151     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
8152     pBt->xFreeSchema = xFree;
8153   }
8154   sqlite3BtreeLeave(p);
8155   return pBt->pSchema;
8156 }
8157 
8158 /*
8159 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8160 ** btree as the argument handle holds an exclusive lock on the
8161 ** sqlite_master table. Otherwise SQLITE_OK.
8162 */
8163 int sqlite3BtreeSchemaLocked(Btree *p){
8164   int rc;
8165   assert( sqlite3_mutex_held(p->db->mutex) );
8166   sqlite3BtreeEnter(p);
8167   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8168   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8169   sqlite3BtreeLeave(p);
8170   return rc;
8171 }
8172 
8173 
8174 #ifndef SQLITE_OMIT_SHARED_CACHE
8175 /*
8176 ** Obtain a lock on the table whose root page is iTab.  The
8177 ** lock is a write lock if isWritelock is true or a read lock
8178 ** if it is false.
8179 */
8180 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8181   int rc = SQLITE_OK;
8182   assert( p->inTrans!=TRANS_NONE );
8183   if( p->sharable ){
8184     u8 lockType = READ_LOCK + isWriteLock;
8185     assert( READ_LOCK+1==WRITE_LOCK );
8186     assert( isWriteLock==0 || isWriteLock==1 );
8187 
8188     sqlite3BtreeEnter(p);
8189     rc = querySharedCacheTableLock(p, iTab, lockType);
8190     if( rc==SQLITE_OK ){
8191       rc = setSharedCacheTableLock(p, iTab, lockType);
8192     }
8193     sqlite3BtreeLeave(p);
8194   }
8195   return rc;
8196 }
8197 #endif
8198 
8199 #ifndef SQLITE_OMIT_INCRBLOB
8200 /*
8201 ** Argument pCsr must be a cursor opened for writing on an
8202 ** INTKEY table currently pointing at a valid table entry.
8203 ** This function modifies the data stored as part of that entry.
8204 **
8205 ** Only the data content may only be modified, it is not possible to
8206 ** change the length of the data stored. If this function is called with
8207 ** parameters that attempt to write past the end of the existing data,
8208 ** no modifications are made and SQLITE_CORRUPT is returned.
8209 */
8210 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8211   int rc;
8212   assert( cursorHoldsMutex(pCsr) );
8213   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8214   assert( pCsr->isIncrblobHandle );
8215 
8216   rc = restoreCursorPosition(pCsr);
8217   if( rc!=SQLITE_OK ){
8218     return rc;
8219   }
8220   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8221   if( pCsr->eState!=CURSOR_VALID ){
8222     return SQLITE_ABORT;
8223   }
8224 
8225   /* Check some assumptions:
8226   **   (a) the cursor is open for writing,
8227   **   (b) there is a read/write transaction open,
8228   **   (c) the connection holds a write-lock on the table (if required),
8229   **   (d) there are no conflicting read-locks, and
8230   **   (e) the cursor points at a valid row of an intKey table.
8231   */
8232   if( !pCsr->wrFlag ){
8233     return SQLITE_READONLY;
8234   }
8235   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
8236               && pCsr->pBt->inTransaction==TRANS_WRITE );
8237   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8238   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8239   assert( pCsr->apPage[pCsr->iPage]->intKey );
8240 
8241   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8242 }
8243 
8244 /*
8245 ** Set a flag on this cursor to cache the locations of pages from the
8246 ** overflow list for the current row. This is used by cursors opened
8247 ** for incremental blob IO only.
8248 **
8249 ** This function sets a flag only. The actual page location cache
8250 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
8251 ** accessPayload() (the worker function for sqlite3BtreeData() and
8252 ** sqlite3BtreePutData()).
8253 */
8254 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
8255   assert( cursorHoldsMutex(pCur) );
8256   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
8257   invalidateOverflowCache(pCur);
8258   pCur->isIncrblobHandle = 1;
8259 }
8260 #endif
8261 
8262 /*
8263 ** Set both the "read version" (single byte at byte offset 18) and
8264 ** "write version" (single byte at byte offset 19) fields in the database
8265 ** header to iVersion.
8266 */
8267 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8268   BtShared *pBt = pBtree->pBt;
8269   int rc;                         /* Return code */
8270 
8271   assert( iVersion==1 || iVersion==2 );
8272 
8273   /* If setting the version fields to 1, do not automatically open the
8274   ** WAL connection, even if the version fields are currently set to 2.
8275   */
8276   pBt->btsFlags &= ~BTS_NO_WAL;
8277   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
8278 
8279   rc = sqlite3BtreeBeginTrans(pBtree, 0);
8280   if( rc==SQLITE_OK ){
8281     u8 *aData = pBt->pPage1->aData;
8282     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8283       rc = sqlite3BtreeBeginTrans(pBtree, 2);
8284       if( rc==SQLITE_OK ){
8285         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8286         if( rc==SQLITE_OK ){
8287           aData[18] = (u8)iVersion;
8288           aData[19] = (u8)iVersion;
8289         }
8290       }
8291     }
8292   }
8293 
8294   pBt->btsFlags &= ~BTS_NO_WAL;
8295   return rc;
8296 }
8297