xref: /sqlite-3.40.0/src/btree.c (revision f2fcd075)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of the file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself is supplied by the
** SQLITE_FILE_HEADER macro; the array is sized by that initializer.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
/*
** Change the "#if 0" below to "#if 1" to compile in tracing through the
** TRACE macro.  When compiled in, trace output is produced while the
** global variable sqlite3BtreeTrace is non-zero.
**
** TRACE takes one argument: a complete, parenthesized printf() argument
** list.  For example:
**
**    TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
**
** The enabled form is wrapped in do{...}while(0) so that the macro always
** behaves as a single statement and can never capture an "else" that
** follows it.  When tracing is compiled out the macro expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){printf X;fflush(stdout);} }while(0)
#else
# define TRACE(X)
#endif
34 
/*
** Read a 2-byte big-endian integer out of an array of unsigned bytes,
** mapping a stored value of zero to 65536.
**
** The "offset to cell content area" field of a btree page header is read
** with this macro.  On an empty page of a 65536-byte database the true
** offset is 65536, which does not fit in two bytes and is stored as zero.
** Subtracting one, masking to 16 bits and adding one back turns that zero
** into 65536 and leaves every other value unchanged.
*/
#define get2byteNotZero(X) \
  (1 + ((((int)get2byte(X)) - 1) & 0xffff))
45 
46 #ifndef SQLITE_OMIT_SHARED_CACHE
47 /*
48 ** A list of BtShared objects that are eligible for participation
49 ** in shared cache.  This variable has file scope during normal builds,
50 ** but the test harness needs to access it so we make it global for
51 ** test builds.
52 **
53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
54 */
55 #ifdef SQLITE_TEST
56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
57 #else
58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
59 #endif
60 #endif /* SQLITE_OMIT_SHARED_CACHE */
61 
#ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features.
**
** The value of "enable" is recorded in
** sqlite3GlobalConfig.sharedCacheEnabled.  SQLITE_OK is always returned.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
#endif
75 
76 
77 
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary.
  ** So define the lock related functions as no-ops:
  **
  **   *  the query and set routines always evaluate to SQLITE_OK,
  **   *  the clear and downgrade routines expand to nothing,
  **   *  hasSharedCacheTableLock() is always true (1) and
  **      hasReadConflicts() is always false (0).
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
95 
96 #ifndef SQLITE_OMIT_SHARED_CACHE
97 
98 #ifdef SQLITE_DEBUG
99 /*
100 **** This function is only used as part of an assert() statement. ***
101 **
102 ** Check to see if pBtree holds the required locks to read or write to the
103 ** table with root page iRoot.   Return 1 if it does and 0 if not.
104 **
105 ** For example, when writing to a table with root-page iRoot via
106 ** Btree connection pBtree:
107 **
108 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
109 **
110 ** When writing to an index that resides in a sharable database, the
111 ** caller should have first obtained a lock specifying the root page of
112 ** the corresponding table. This makes things a bit more complicated,
113 ** as this module treats each table as a separate structure. To determine
114 ** the table corresponding to the index being written, this
115 ** function has to search through the database schema.
116 **
117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
118 ** hold a write-lock on the schema table (root page 1). This is also
119 ** acceptable.
120 */
121 static int hasSharedCacheTableLock(
122   Btree *pBtree,         /* Handle that must hold lock */
123   Pgno iRoot,            /* Root page of b-tree */
124   int isIndex,           /* True if iRoot is the root of an index b-tree */
125   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
126 ){
127   Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
128   Pgno iTab = 0;
129   BtLock *pLock;
130 
131   /* If this database is not shareable, or if the client is reading
132   ** and has the read-uncommitted flag set, then no lock is required.
133   ** Return true immediately.
134   */
135   if( (pBtree->sharable==0)
136    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
137   ){
138     return 1;
139   }
140 
141   /* If the client is reading  or writing an index and the schema is
142   ** not loaded, then it is too difficult to actually check to see if
143   ** the correct locks are held.  So do not bother - just return true.
144   ** This case does not come up very often anyhow.
145   */
146   if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
147     return 1;
148   }
149 
150   /* Figure out the root-page that the lock should be held on. For table
151   ** b-trees, this is just the root page of the b-tree being read or
152   ** written. For index b-trees, it is the root page of the associated
153   ** table.  */
154   if( isIndex ){
155     HashElem *p;
156     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
157       Index *pIdx = (Index *)sqliteHashData(p);
158       if( pIdx->tnum==(int)iRoot ){
159         iTab = pIdx->pTable->tnum;
160       }
161     }
162   }else{
163     iTab = iRoot;
164   }
165 
166   /* Search for the required lock. Either a write-lock on root-page iTab, a
167   ** write-lock on the schema table, or (if the client is reading) a
168   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
169   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
170     if( pLock->pBtree==pBtree
171      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
172      && pLock->eLock>=eLockType
173     ){
174       return 1;
175     }
176   }
177 
178   /* Failed to find the required lock. */
179   return 0;
180 }
181 #endif /* SQLITE_DEBUG */
182 
183 #ifdef SQLITE_DEBUG
184 /*
185 **** This function may be used as part of assert() statements only. ****
186 **
187 ** Return true if it would be illegal for pBtree to write into the
188 ** table or index rooted at iRoot because other shared connections are
189 ** simultaneously reading that same table or index.
190 **
191 ** It is illegal for pBtree to write if some other Btree object that
192 ** shares the same BtShared object is currently reading or writing
193 ** the iRoot table.  Except, if the other Btree object has the
194 ** read-uncommitted flag set, then it is OK for the other object to
195 ** have a read cursor.
196 **
197 ** For example, before writing to any part of the table or index
198 ** rooted at page iRoot, one should call:
199 **
200 **    assert( !hasReadConflicts(pBtree, iRoot) );
201 */
202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
203   BtCursor *p;
204   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
205     if( p->pgnoRoot==iRoot
206      && p->pBtree!=pBtree
207      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
208     ){
209       return 1;
210     }
211   }
212   return 0;
213 }
214 #endif    /* #ifdef SQLITE_DEBUG */
215 
216 /*
217 ** Query to see if Btree handle p may obtain a lock of type eLock
218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
219 ** SQLITE_OK if the lock may be obtained (by calling
220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
221 */
222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
223   BtShared *pBt = p->pBt;
224   BtLock *pIter;
225 
226   assert( sqlite3BtreeHoldsMutex(p) );
227   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
228   assert( p->db!=0 );
229   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
230 
231   /* If requesting a write-lock, then the Btree must have an open write
232   ** transaction on this file. And, obviously, for this to be so there
233   ** must be an open write transaction on the file itself.
234   */
235   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
236   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
237 
238   /* This routine is a no-op if the shared-cache is not enabled */
239   if( !p->sharable ){
240     return SQLITE_OK;
241   }
242 
243   /* If some other connection is holding an exclusive lock, the
244   ** requested lock may not be obtained.
245   */
246   if( pBt->pWriter!=p && pBt->isExclusive ){
247     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
248     return SQLITE_LOCKED_SHAREDCACHE;
249   }
250 
251   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
252     /* The condition (pIter->eLock!=eLock) in the following if(...)
253     ** statement is a simplification of:
254     **
255     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
256     **
257     ** since we know that if eLock==WRITE_LOCK, then no other connection
258     ** may hold a WRITE_LOCK on any table in this file (since there can
259     ** only be a single writer).
260     */
261     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
262     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
263     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
264       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
265       if( eLock==WRITE_LOCK ){
266         assert( p==pBt->pWriter );
267         pBt->isPending = 1;
268       }
269       return SQLITE_LOCKED_SHAREDCACHE;
270     }
271   }
272   return SQLITE_OK;
273 }
274 #endif /* !SQLITE_OMIT_SHARED_CACHE */
275 
276 #ifndef SQLITE_OMIT_SHARED_CACHE
277 /*
278 ** Add a lock on the table with root-page iTable to the shared-btree used
279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
280 ** WRITE_LOCK.
281 **
282 ** This function assumes the following:
283 **
284 **   (a) The specified Btree object p is connected to a sharable
285 **       database (one with the BtShared.sharable flag set), and
286 **
287 **   (b) No other Btree objects hold a lock that conflicts
288 **       with the requested lock (i.e. querySharedCacheTableLock() has
289 **       already been called and returned SQLITE_OK).
290 **
291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
292 ** is returned if a malloc attempt fails.
293 */
294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
295   BtShared *pBt = p->pBt;
296   BtLock *pLock = 0;
297   BtLock *pIter;
298 
299   assert( sqlite3BtreeHoldsMutex(p) );
300   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
301   assert( p->db!=0 );
302 
303   /* A connection with the read-uncommitted flag set will never try to
304   ** obtain a read-lock using this function. The only read-lock obtained
305   ** by a connection in read-uncommitted mode is on the sqlite_master
306   ** table, and that lock is obtained in BtreeBeginTrans().  */
307   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
308 
309   /* This function should only be called on a sharable b-tree after it
310   ** has been determined that no other b-tree holds a conflicting lock.  */
311   assert( p->sharable );
312   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
313 
314   /* First search the list for an existing lock on this table. */
315   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
316     if( pIter->iTable==iTable && pIter->pBtree==p ){
317       pLock = pIter;
318       break;
319     }
320   }
321 
322   /* If the above search did not find a BtLock struct associating Btree p
323   ** with table iTable, allocate one and link it into the list.
324   */
325   if( !pLock ){
326     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
327     if( !pLock ){
328       return SQLITE_NOMEM;
329     }
330     pLock->iTable = iTable;
331     pLock->pBtree = p;
332     pLock->pNext = pBt->pLock;
333     pBt->pLock = pLock;
334   }
335 
336   /* Set the BtLock.eLock variable to the maximum of the current lock
337   ** and the requested lock. This means if a write-lock was already held
338   ** and a read-lock requested, we don't incorrectly downgrade the lock.
339   */
340   assert( WRITE_LOCK>READ_LOCK );
341   if( eLock>pLock->eLock ){
342     pLock->eLock = eLock;
343   }
344 
345   return SQLITE_OK;
346 }
347 #endif /* !SQLITE_OMIT_SHARED_CACHE */
348 
349 #ifndef SQLITE_OMIT_SHARED_CACHE
350 /*
351 ** Release all the table locks (locks obtained via calls to
352 ** the setSharedCacheTableLock() procedure) held by Btree object p.
353 **
354 ** This function assumes that Btree p has an open read or write
355 ** transaction. If it does not, then the BtShared.isPending variable
356 ** may be incorrectly cleared.
357 */
358 static void clearAllSharedCacheTableLocks(Btree *p){
359   BtShared *pBt = p->pBt;
360   BtLock **ppIter = &pBt->pLock;
361 
362   assert( sqlite3BtreeHoldsMutex(p) );
363   assert( p->sharable || 0==*ppIter );
364   assert( p->inTrans>0 );
365 
366   while( *ppIter ){
367     BtLock *pLock = *ppIter;
368     assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
369     assert( pLock->pBtree->inTrans>=pLock->eLock );
370     if( pLock->pBtree==p ){
371       *ppIter = pLock->pNext;
372       assert( pLock->iTable!=1 || pLock==&p->lock );
373       if( pLock->iTable!=1 ){
374         sqlite3_free(pLock);
375       }
376     }else{
377       ppIter = &pLock->pNext;
378     }
379   }
380 
381   assert( pBt->isPending==0 || pBt->pWriter );
382   if( pBt->pWriter==p ){
383     pBt->pWriter = 0;
384     pBt->isExclusive = 0;
385     pBt->isPending = 0;
386   }else if( pBt->nTransaction==2 ){
387     /* This function is called when Btree p is concluding its
388     ** transaction. If there currently exists a writer, and p is not
389     ** that writer, then the number of locks held by connections other
390     ** than the writer must be about to drop to zero. In this case
391     ** set the isPending flag to 0.
392     **
393     ** If there is not currently a writer, then BtShared.isPending must
394     ** be zero already. So this next line is harmless in that case.
395     */
396     pBt->isPending = 0;
397   }
398 }
399 
400 /*
401 ** This function changes all write-locks held by Btree p into read-locks.
402 */
403 static void downgradeAllSharedCacheTableLocks(Btree *p){
404   BtShared *pBt = p->pBt;
405   if( pBt->pWriter==p ){
406     BtLock *pLock;
407     pBt->pWriter = 0;
408     pBt->isExclusive = 0;
409     pBt->isPending = 0;
410     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
411       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
412       pLock->eLock = READ_LOCK;
413     }
414   }
415 }
416 
417 #endif /* SQLITE_OMIT_SHARED_CACHE */
418 
/* releasePage() is called by saveCursorPosition() below, ahead of the
** point where the routine itself is defined, so declare it here. */
static void releasePage(MemPage *pPage);  /* Forward reference */
420 
/*
***** This routine is used inside of assert() only ****
**
** Return the result of sqlite3_mutex_held() for the mutex of the
** BtShared object that cursor p is attached to.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  BtShared *pBt = p->pBt;
  return sqlite3_mutex_held(pBt->mutex);
}
#endif
431 
432 
433 #ifndef SQLITE_OMIT_INCRBLOB
434 /*
435 ** Invalidate the overflow page-list cache for cursor pCur, if any.
436 */
437 static void invalidateOverflowCache(BtCursor *pCur){
438   assert( cursorHoldsMutex(pCur) );
439   sqlite3_free(pCur->aOverflow);
440   pCur->aOverflow = 0;
441 }
442 
443 /*
444 ** Invalidate the overflow page-list cache for all cursors opened
445 ** on the shared btree structure pBt.
446 */
447 static void invalidateAllOverflowCache(BtShared *pBt){
448   BtCursor *p;
449   assert( sqlite3_mutex_held(pBt->mutex) );
450   for(p=pBt->pCursor; p; p=p->pNext){
451     invalidateOverflowCache(p);
452   }
453 }
454 
455 /*
456 ** This function is called before modifying the contents of a table
457 ** to invalidate any incrblob cursors that are open on the
458 ** row or one of the rows being modified.
459 **
460 ** If argument isClearTable is true, then the entire contents of the
461 ** table is about to be deleted. In this case invalidate all incrblob
462 ** cursors open on any row within the table with root-page pgnoRoot.
463 **
464 ** Otherwise, if argument isClearTable is false, then the row with
465 ** rowid iRow is being replaced or deleted. In this case invalidate
466 ** only those incrblob cursors open on that specific row.
467 */
468 static void invalidateIncrblobCursors(
469   Btree *pBtree,          /* The database file to check */
470   i64 iRow,               /* The rowid that might be changing */
471   int isClearTable        /* True if all rows are being deleted */
472 ){
473   BtCursor *p;
474   BtShared *pBt = pBtree->pBt;
475   assert( sqlite3BtreeHoldsMutex(pBtree) );
476   for(p=pBt->pCursor; p; p=p->pNext){
477     if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
478       p->eState = CURSOR_INVALID;
479     }
480   }
481 }
482 
483 #else
  /* Stub functions when INCRBLOB is omitted.  Each macro expands to
  ** nothing, so calls to these routines compile away entirely. */
  #define invalidateOverflowCache(x)
  #define invalidateAllOverflowCache(x)
  #define invalidateIncrblobCursors(x,y,z)
488 #endif /* SQLITE_OMIT_INCRBLOB */
489 
490 /*
491 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
492 ** when a page that previously contained data becomes a free-list leaf
493 ** page.
494 **
495 ** The BtShared.pHasContent bitvec exists to work around an obscure
496 ** bug caused by the interaction of two useful IO optimizations surrounding
497 ** free-list leaf pages:
498 **
499 **   1) When all data is deleted from a page and the page becomes
500 **      a free-list leaf page, the page is not written to the database
501 **      (as free-list leaf pages contain no meaningful data). Sometimes
502 **      such a page is not even journalled (as it will not be modified,
503 **      why bother journalling it?).
504 **
505 **   2) When a free-list leaf page is reused, its content is not read
506 **      from the database or written to the journal file (why should it
507 **      be, if it is not at all meaningful?).
508 **
509 ** By themselves, these optimizations work fine and provide a handy
510 ** performance boost to bulk delete or insert operations. However, if
511 ** a page is moved to the free-list and then reused within the same
512 ** transaction, a problem comes up. If the page is not journalled when
513 ** it is moved to the free-list and it is also not journalled when it
514 ** is extracted from the free-list and reused, then the original data
515 ** may be lost. In the event of a rollback, it may not be possible
516 ** to restore the database to its original configuration.
517 **
518 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
519 ** moved to become a free-list leaf page, the corresponding bit is
520 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
521 ** optimization 2 above is omitted if the corresponding bit is already
522 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
523 ** at the end of every transaction.
524 */
525 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
526   int rc = SQLITE_OK;
527   if( !pBt->pHasContent ){
528     assert( pgno<=pBt->nPage );
529     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
530     if( !pBt->pHasContent ){
531       rc = SQLITE_NOMEM;
532     }
533   }
534   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
535     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
536   }
537   return rc;
538 }
539 
540 /*
541 ** Query the BtShared.pHasContent vector.
542 **
543 ** This function is called when a free-list leaf page is removed from the
544 ** free-list for reuse. It returns false if it is safe to retrieve the
545 ** page from the pager layer with the 'no-content' flag set. True otherwise.
546 */
547 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
548   Bitvec *p = pBt->pHasContent;
549   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
550 }
551 
552 /*
553 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
554 ** invoked at the conclusion of each write-transaction.
555 */
556 static void btreeClearHasContent(BtShared *pBt){
557   sqlite3BitvecDestroy(pBt->pHasContent);
558   pBt->pHasContent = 0;
559 }
560 
561 /*
562 ** Save the current cursor position in the variables BtCursor.nKey
563 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
564 **
565 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
566 ** prior to calling this routine.
567 */
568 static int saveCursorPosition(BtCursor *pCur){
569   int rc;
570 
571   assert( CURSOR_VALID==pCur->eState );
572   assert( 0==pCur->pKey );
573   assert( cursorHoldsMutex(pCur) );
574 
575   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
576   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
577 
578   /* If this is an intKey table, then the above call to BtreeKeySize()
579   ** stores the integer key in pCur->nKey. In this case this value is
580   ** all that is required. Otherwise, if pCur is not open on an intKey
581   ** table, then malloc space for and store the pCur->nKey bytes of key
582   ** data.
583   */
584   if( 0==pCur->apPage[0]->intKey ){
585     void *pKey = sqlite3Malloc( (int)pCur->nKey );
586     if( pKey ){
587       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
588       if( rc==SQLITE_OK ){
589         pCur->pKey = pKey;
590       }else{
591         sqlite3_free(pKey);
592       }
593     }else{
594       rc = SQLITE_NOMEM;
595     }
596   }
597   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
598 
599   if( rc==SQLITE_OK ){
600     int i;
601     for(i=0; i<=pCur->iPage; i++){
602       releasePage(pCur->apPage[i]);
603       pCur->apPage[i] = 0;
604     }
605     pCur->iPage = -1;
606     pCur->eState = CURSOR_REQUIRESEEK;
607   }
608 
609   invalidateOverflowCache(pCur);
610   return rc;
611 }
612 
613 /*
614 ** Save the positions of all cursors (except pExcept) that are open on
615 ** the table  with root-page iRoot. Usually, this is called just before cursor
616 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
617 */
618 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
619   BtCursor *p;
620   assert( sqlite3_mutex_held(pBt->mutex) );
621   assert( pExcept==0 || pExcept->pBt==pBt );
622   for(p=pBt->pCursor; p; p=p->pNext){
623     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
624         p->eState==CURSOR_VALID ){
625       int rc = saveCursorPosition(p);
626       if( SQLITE_OK!=rc ){
627         return rc;
628       }
629     }
630   }
631   return SQLITE_OK;
632 }
633 
634 /*
635 ** Clear the current cursor position.
636 */
637 void sqlite3BtreeClearCursor(BtCursor *pCur){
638   assert( cursorHoldsMutex(pCur) );
639   sqlite3_free(pCur->pKey);
640   pCur->pKey = 0;
641   pCur->eState = CURSOR_INVALID;
642 }
643 
644 /*
645 ** In this version of BtreeMoveto, pKey is a packed index record
646 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
647 ** record and then call BtreeMovetoUnpacked() to do the work.
648 */
649 static int btreeMoveto(
650   BtCursor *pCur,     /* Cursor open on the btree to be searched */
651   const void *pKey,   /* Packed key if the btree is an index */
652   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
653   int bias,           /* Bias search to the high end */
654   int *pRes           /* Write search results here */
655 ){
656   int rc;                    /* Status code */
657   UnpackedRecord *pIdxKey;   /* Unpacked index key */
658   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */
659 
660   if( pKey ){
661     assert( nKey==(i64)(int)nKey );
662     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
663                                       aSpace, sizeof(aSpace));
664     if( pIdxKey==0 ) return SQLITE_NOMEM;
665   }else{
666     pIdxKey = 0;
667   }
668   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
669   if( pKey ){
670     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
671   }
672   return rc;
673 }
674 
675 /*
676 ** Restore the cursor to the position it was in (or as close to as possible)
677 ** when saveCursorPosition() was called. Note that this call deletes the
678 ** saved position info stored by saveCursorPosition(), so there can be
679 ** at most one effective restoreCursorPosition() call after each
680 ** saveCursorPosition().
681 */
682 static int btreeRestoreCursorPosition(BtCursor *pCur){
683   int rc;
684   assert( cursorHoldsMutex(pCur) );
685   assert( pCur->eState>=CURSOR_REQUIRESEEK );
686   if( pCur->eState==CURSOR_FAULT ){
687     return pCur->skipNext;
688   }
689   pCur->eState = CURSOR_INVALID;
690   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
691   if( rc==SQLITE_OK ){
692     sqlite3_free(pCur->pKey);
693     pCur->pKey = 0;
694     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
695   }
696   return rc;
697 }
698 
/*
** Restore the position of cursor p if, and only if, it needs it.
**
** A cursor whose eState is CURSOR_REQUIRESEEK or greater is handed to
** btreeRestoreCursorPosition() and the result of that call is the value
** of the expression.  For any other cursor the expression is SQLITE_OK
** and nothing is done.
**
** The argument is parenthesized in the expansion so that any pointer
** expression (for example "&cur" or "aCur+1") may be passed.  Note that
** the argument is evaluated twice when a restore is required.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
703 
704 /*
705 ** Determine whether or not a cursor has moved from the position it
706 ** was last placed at.  Cursors can move when the row they are pointing
707 ** at is deleted out from under them.
708 **
709 ** This routine returns an error code if something goes wrong.  The
710 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
711 */
712 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
713   int rc;
714 
715   rc = restoreCursorPosition(pCur);
716   if( rc ){
717     *pHasMoved = 1;
718     return rc;
719   }
720   if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
721     *pHasMoved = 1;
722   }else{
723     *pHasMoved = 0;
724   }
725   return SQLITE_OK;
726 }
727 
728 #ifndef SQLITE_OMIT_AUTOVACUUM
729 /*
730 ** Given a page number of a regular database page, return the page
731 ** number for the pointer-map page that contains the entry for the
732 ** input page number.
733 **
734 ** Return 0 (not a valid page) for pgno==1 since there is
735 ** no pointer map associated with page 1.  The integrity_check logic
736 ** requires that ptrmapPageno(*,1)!=1.
737 */
738 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
739   int nPagesPerMapPage;
740   Pgno iPtrMap, ret;
741   assert( sqlite3_mutex_held(pBt->mutex) );
742   if( pgno<2 ) return 0;
743   nPagesPerMapPage = (pBt->usableSize/5)+1;
744   iPtrMap = (pgno-2)/nPagesPerMapPage;
745   ret = (iPtrMap*nPagesPerMapPage) + 2;
746   if( ret==PENDING_BYTE_PAGE(pBt) ){
747     ret++;
748   }
749   return ret;
750 }
751 
752 /*
753 ** Write an entry into the pointer map.
754 **
755 ** This routine updates the pointer map entry for page number 'key'
756 ** so that it maps to type 'eType' and parent page number 'pgno'.
757 **
758 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
759 ** a no-op.  If an error occurs, the appropriate error code is written
760 ** into *pRC.
761 */
762 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
763   DbPage *pDbPage;  /* The pointer map page */
764   u8 *pPtrmap;      /* The pointer map data */
765   Pgno iPtrmap;     /* The pointer map page number */
766   int offset;       /* Offset in pointer map page */
767   int rc;           /* Return code from subfunctions */
768 
769   if( *pRC ) return;
770 
771   assert( sqlite3_mutex_held(pBt->mutex) );
772   /* The master-journal page number must never be used as a pointer map page */
773   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
774 
775   assert( pBt->autoVacuum );
776   if( key==0 ){
777     *pRC = SQLITE_CORRUPT_BKPT;
778     return;
779   }
780   iPtrmap = PTRMAP_PAGENO(pBt, key);
781   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
782   if( rc!=SQLITE_OK ){
783     *pRC = rc;
784     return;
785   }
786   offset = PTRMAP_PTROFFSET(iPtrmap, key);
787   if( offset<0 ){
788     *pRC = SQLITE_CORRUPT_BKPT;
789     goto ptrmap_exit;
790   }
791   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
792 
793   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
794     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
795     *pRC= rc = sqlite3PagerWrite(pDbPage);
796     if( rc==SQLITE_OK ){
797       pPtrmap[offset] = eType;
798       put4byte(&pPtrmap[offset+1], parent);
799     }
800   }
801 
802 ptrmap_exit:
803   sqlite3PagerUnref(pDbPage);
804 }
805 
806 /*
807 ** Read an entry from the pointer map.
808 **
809 ** This routine retrieves the pointer map entry for page 'key', writing
810 ** the type and parent page number to *pEType and *pPgno respectively.
811 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
812 */
813 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
814   DbPage *pDbPage;   /* The pointer map page */
815   int iPtrmap;       /* Pointer map page index */
816   u8 *pPtrmap;       /* Pointer map page data */
817   int offset;        /* Offset of entry in pointer map */
818   int rc;
819 
820   assert( sqlite3_mutex_held(pBt->mutex) );
821 
822   iPtrmap = PTRMAP_PAGENO(pBt, key);
823   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
824   if( rc!=0 ){
825     return rc;
826   }
827   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
828 
829   offset = PTRMAP_PTROFFSET(iPtrmap, key);
830   assert( pEType!=0 );
831   *pEType = pPtrmap[offset];
832   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
833 
834   sqlite3PagerUnref(pDbPage);
835   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
836   return SQLITE_OK;
837 }
838 
839 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Stubs used when autovacuum is omitted: the "put" macros expand to
  ** nothing and ptrmapGet() always evaluates to SQLITE_OK.
  ** NOTE(review): the real ptrmapPutOvflPtr() is not defined in this
  ** part of the file; presumably it appears further down. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
843 #endif
844 
/*
** Given a btree page P and a cell index I (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** The 2-byte entry for cell I is read from the cell pointer array that
** starts at P->cellOffset, masked with P->maskPage, and used as an
** offset into P->aData.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  (&(P)->aData[(P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])])
854 
855 /*
856 ** This a more complex version of findCell() that works for
857 ** pages that do contain overflow cells.
858 */
859 static u8 *findOverflowCell(MemPage *pPage, int iCell){
860   int i;
861   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
862   for(i=pPage->nOverflow-1; i>=0; i--){
863     int k;
864     struct _OvflCell *pOvfl;
865     pOvfl = &pPage->aOvfl[i];
866     k = pOvfl->idx;
867     if( k<=iCell ){
868       if( k==iCell ){
869         return pOvfl->pCell;
870       }
871       iCell--;
872     }
873   }
874   return findCell(pPage, iCell);
875 }
876 
877 /*
878 ** Parse a cell content block and fill in the CellInfo structure.  There
879 ** are two versions of this function.  btreeParseCell() takes a
880 ** cell index as the second argument and btreeParseCellPtr()
881 ** takes a pointer to the body of the cell as its second argument.
882 **
883 ** Within this file, the parseCell() macro can be called instead of
884 ** btreeParseCellPtr(). Using some compilers, this will be faster.
885 */
886 static void btreeParseCellPtr(
887   MemPage *pPage,         /* Page containing the cell */
888   u8 *pCell,              /* Pointer to the cell text. */
889   CellInfo *pInfo         /* Fill in this structure */
890 ){
891   u16 n;                  /* Number bytes in cell content header */
892   u32 nPayload;           /* Number of bytes of cell payload */
893 
894   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
895 
896   pInfo->pCell = pCell;
897   assert( pPage->leaf==0 || pPage->leaf==1 );
898   n = pPage->childPtrSize;
899   assert( n==4-4*pPage->leaf );
900   if( pPage->intKey ){
901     if( pPage->hasData ){
902       n += getVarint32(&pCell[n], nPayload);
903     }else{
904       nPayload = 0;
905     }
906     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
907     pInfo->nData = nPayload;
908   }else{
909     pInfo->nData = 0;
910     n += getVarint32(&pCell[n], nPayload);
911     pInfo->nKey = nPayload;
912   }
913   pInfo->nPayload = nPayload;
914   pInfo->nHeader = n;
915   testcase( nPayload==pPage->maxLocal );
916   testcase( nPayload==pPage->maxLocal+1 );
917   if( likely(nPayload<=pPage->maxLocal) ){
918     /* This is the (easy) common case where the entire payload fits
919     ** on the local page.  No overflow is required.
920     */
921     int nSize;          /* Total size of cell content in bytes */
922     nSize = nPayload + n;
923     pInfo->nLocal = (u16)nPayload;
924     pInfo->iOverflow = 0;
925     if( (nSize & ~3)==0 ){
926       nSize = 4;        /* Minimum cell size is 4 */
927     }
928     pInfo->nSize = (u16)nSize;
929   }else{
930     /* If the payload will not fit completely on the local page, we have
931     ** to decide how much to store locally and how much to spill onto
932     ** overflow pages.  The strategy is to minimize the amount of unused
933     ** space on overflow pages while keeping the amount of local storage
934     ** in between minLocal and maxLocal.
935     **
936     ** Warning:  changing the way overflow payload is distributed in any
937     ** way will result in an incompatible file format.
938     */
939     int minLocal;  /* Minimum amount of payload held locally */
940     int maxLocal;  /* Maximum amount of payload held locally */
941     int surplus;   /* Overflow payload available for local storage */
942 
943     minLocal = pPage->minLocal;
944     maxLocal = pPage->maxLocal;
945     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
946     testcase( surplus==maxLocal );
947     testcase( surplus==maxLocal+1 );
948     if( surplus <= maxLocal ){
949       pInfo->nLocal = (u16)surplus;
950     }else{
951       pInfo->nLocal = (u16)minLocal;
952     }
953     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
954     pInfo->nSize = pInfo->iOverflow + 4;
955   }
956 }
957 #define parseCell(pPage, iCell, pInfo) \
958   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
959 static void btreeParseCell(
960   MemPage *pPage,         /* Page containing the cell */
961   int iCell,              /* The cell index.  First cell is 0 */
962   CellInfo *pInfo         /* Fill in this structure */
963 ){
964   parseCell(pPage, iCell, pInfo);
965 }
966 
967 /*
968 ** Compute the total number of bytes that a Cell needs in the cell
969 ** data area of the btree-page.  The return number includes the cell
970 ** data header and the local payload, but not any overflow page or
971 ** the space used by the cell pointer.
972 */
973 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
974   u8 *pIter = &pCell[pPage->childPtrSize];
975   u32 nSize;
976 
977 #ifdef SQLITE_DEBUG
978   /* The value returned by this function should always be the same as
979   ** the (CellInfo.nSize) value found by doing a full parse of the
980   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
981   ** this function verifies that this invariant is not violated. */
982   CellInfo debuginfo;
983   btreeParseCellPtr(pPage, pCell, &debuginfo);
984 #endif
985 
986   if( pPage->intKey ){
987     u8 *pEnd;
988     if( pPage->hasData ){
989       pIter += getVarint32(pIter, nSize);
990     }else{
991       nSize = 0;
992     }
993 
994     /* pIter now points at the 64-bit integer key value, a variable length
995     ** integer. The following block moves pIter to point at the first byte
996     ** past the end of the key value. */
997     pEnd = &pIter[9];
998     while( (*pIter++)&0x80 && pIter<pEnd );
999   }else{
1000     pIter += getVarint32(pIter, nSize);
1001   }
1002 
1003   testcase( nSize==pPage->maxLocal );
1004   testcase( nSize==pPage->maxLocal+1 );
1005   if( nSize>pPage->maxLocal ){
1006     int minLocal = pPage->minLocal;
1007     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1008     testcase( nSize==pPage->maxLocal );
1009     testcase( nSize==pPage->maxLocal+1 );
1010     if( nSize>pPage->maxLocal ){
1011       nSize = minLocal;
1012     }
1013     nSize += 4;
1014   }
1015   nSize += (u32)(pIter - pCell);
1016 
1017   /* The minimum size of any cell is 4 bytes. */
1018   if( nSize<4 ){
1019     nSize = 4;
1020   }
1021 
1022   assert( nSize==debuginfo.nSize );
1023   return (u16)nSize;
1024 }
1025 
#ifdef SQLITE_DEBUG
/* Index-based wrapper around cellSizePtr().  Compiled only in debug
** builds, where it is intended for use inside assert() statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return cellSizePtr(pPage, pCell);
}
#endif
1033 
1034 #ifndef SQLITE_OMIT_AUTOVACUUM
1035 /*
1036 ** If the cell pCell, part of page pPage contains a pointer
1037 ** to an overflow page, insert an entry into the pointer-map
1038 ** for the overflow page.
1039 */
1040 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1041   CellInfo info;
1042   if( *pRC ) return;
1043   assert( pCell!=0 );
1044   btreeParseCellPtr(pPage, pCell, &info);
1045   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1046   if( info.iOverflow ){
1047     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1048     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1049   }
1050 }
1051 #endif
1052 
1053 
/*
** Defragment the page given.  All Cells are moved to the
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
**
** The existing cell content is first copied into the pager's temporary
** space; cells are then copied back one at a time, packed downward
** from the end of the usable area, and each cell pointer is rewritten.
**
** Returns SQLITE_OK on success.  SQLITE_CORRUPT_BKPT is returned if a
** cell pointer or cell lies outside the allowed range, if the packed
** cells would run into the cell pointer array, or if the amount of
** free space found afterwards differs from pPage->nFree.
*/
static int defragmentPage(MemPage *pPage){
  int i;                     /* Loop counter */
  int pc;                    /* Address of a i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;
  /* Snapshot the current cell content area into temp[] at the same
  ** offsets, so cells can be read from temp[] while data[] is rewritten. */
  cbrk = get2byte(&data[hdr+5]);
  memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
  /* From here on cbrk is the low-water mark of the repacked cells. */
  cbrk = usableSize;
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    /* These conditions have already been verified in btreeInitPage()
    ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( pc>=iCellFirst && pc<=iCellLast );
    /* Measure the cell using the snapshot, then reserve room for it
    ** immediately below the previously placed cell. */
    size = cellSizePtr(pPage, &temp[pc]);
    cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    if( cbrk<iCellFirst ){
      return SQLITE_CORRUPT_BKPT;
    }
#else
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    memcpy(&data[cbrk], &temp[pc], size);
    put2byte(pAddr, cbrk);
  }
  assert( cbrk>=iCellFirst );
  /* Update the header: new start of cell content, empty freeblock list
  ** (bytes hdr+1 and hdr+2) and zero fragmented bytes (byte hdr+7). */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  data[hdr+7] = 0;
  /* Zero the gap between the cell pointer array and the cell content. */
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  /* After defragmentation all free space is in the gap, so its size
  ** must agree with the free-byte count cached in the MemPage. */
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}
1136 
1137 /*
1138 ** Allocate nByte bytes of space from within the B-Tree page passed
1139 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1140 ** of the first byte of allocated space. Return either SQLITE_OK or
1141 ** an error code (usually SQLITE_CORRUPT).
1142 **
1143 ** The caller guarantees that there is sufficient space to make the
1144 ** allocation.  This routine might need to defragment in order to bring
1145 ** all the space together, however.  This routine will avoid using
1146 ** the first two bytes past the cell pointer area since presumably this
1147 ** allocation is being made in order to insert a new cell, so we will
1148 ** also end up needing a new cell pointer.
1149 */
1150 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1151   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1152   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1153   int nFrag;                           /* Number of fragmented bytes on pPage */
1154   int top;                             /* First byte of cell content area */
1155   int gap;        /* First byte of gap between cell pointers and cell content */
1156   int rc;         /* Integer return code */
1157   int usableSize; /* Usable size of the page */
1158 
1159   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1160   assert( pPage->pBt );
1161   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1162   assert( nByte>=0 );  /* Minimum cell size is 4 */
1163   assert( pPage->nFree>=nByte );
1164   assert( pPage->nOverflow==0 );
1165   usableSize = pPage->pBt->usableSize;
1166   assert( nByte < usableSize-8 );
1167 
1168   nFrag = data[hdr+7];
1169   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1170   gap = pPage->cellOffset + 2*pPage->nCell;
1171   top = get2byteNotZero(&data[hdr+5]);
1172   if( gap>top ) return SQLITE_CORRUPT_BKPT;
1173   testcase( gap+2==top );
1174   testcase( gap+1==top );
1175   testcase( gap==top );
1176 
1177   if( nFrag>=60 ){
1178     /* Always defragment highly fragmented pages */
1179     rc = defragmentPage(pPage);
1180     if( rc ) return rc;
1181     top = get2byteNotZero(&data[hdr+5]);
1182   }else if( gap+2<=top ){
1183     /* Search the freelist looking for a free slot big enough to satisfy
1184     ** the request. The allocation is made from the first free slot in
1185     ** the list that is large enough to accomadate it.
1186     */
1187     int pc, addr;
1188     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1189       int size;            /* Size of the free slot */
1190       if( pc>usableSize-4 || pc<addr+4 ){
1191         return SQLITE_CORRUPT_BKPT;
1192       }
1193       size = get2byte(&data[pc+2]);
1194       if( size>=nByte ){
1195         int x = size - nByte;
1196         testcase( x==4 );
1197         testcase( x==3 );
1198         if( x<4 ){
1199           /* Remove the slot from the free-list. Update the number of
1200           ** fragmented bytes within the page. */
1201           memcpy(&data[addr], &data[pc], 2);
1202           data[hdr+7] = (u8)(nFrag + x);
1203         }else if( size+pc > usableSize ){
1204           return SQLITE_CORRUPT_BKPT;
1205         }else{
1206           /* The slot remains on the free-list. Reduce its size to account
1207           ** for the portion used by the new allocation. */
1208           put2byte(&data[pc+2], x);
1209         }
1210         *pIdx = pc + x;
1211         return SQLITE_OK;
1212       }
1213     }
1214   }
1215 
1216   /* Check to make sure there is enough space in the gap to satisfy
1217   ** the allocation.  If not, defragment.
1218   */
1219   testcase( gap+2+nByte==top );
1220   if( gap+2+nByte>top ){
1221     rc = defragmentPage(pPage);
1222     if( rc ) return rc;
1223     top = get2byteNotZero(&data[hdr+5]);
1224     assert( gap+nByte<=top );
1225   }
1226 
1227 
1228   /* Allocate memory from the gap in between the cell pointer array
1229   ** and the cell content area.  The btreeInitPage() call has already
1230   ** validated the freelist.  Given that the freelist is valid, there
1231   ** is no way that the allocation can extend off the end of the page.
1232   ** The assert() below verifies the previous sentence.
1233   */
1234   top -= nByte;
1235   put2byte(&data[hdr+5], top);
1236   assert( top+nByte <= pPage->pBt->usableSize );
1237   *pIdx = top;
1238   return SQLITE_OK;
1239 }
1240 
/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[start]
** and the size of the block is "size" bytes.
**
** Most of the effort here is involved in coalescing adjacent
** free blocks into a single big free block.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT_BKPT if the existing
** freeblock list is found to be malformed while it is being scanned.
** pPage->nFree is increased by "size" once the block has been linked
** into the list.
*/
static int freeSpace(MemPage *pPage, int start, int size){
  int addr, pbegin, hdr;
  int iLast;                        /* Largest possible freeblock offset */
  unsigned char *data = pPage->aData;

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( (start + size)<=pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( size>=0 );   /* A negative size is never valid */

  if( pPage->pBt->secureDelete ){
    /* Overwrite deleted information with zeros when the secure_delete
    ** option is enabled */
    memset(&data[start], 0, size);
  }

  /* Add the space back into the linked list of freeblocks.  Note that
  ** even though the freeblock list was checked by btreeInitPage(),
  ** btreeInitPage() did not detect overlapping cells or
  ** freeblocks that overlapped cells.   Nor does it detect when the
  ** cell content area exceeds the value in the page header.  If these
  ** situations arise, then subsequent insert operations might corrupt
  ** the freelist.  So we do need to check for corruption while scanning
  ** the freelist.
  */
  hdr = pPage->hdrOffset;
  addr = hdr + 1;
  iLast = pPage->pBt->usableSize - 4;
  assert( start<=iLast );
  /* Walk the list until reaching the first freeblock at or beyond
  ** "start" (or the end of the list).  addr is left pointing at the
  ** 2-byte link that must be redirected to the new block. */
  while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
    if( pbegin<addr+4 ){
      return SQLITE_CORRUPT_BKPT;
    }
    addr = pbegin;
  }
  if( pbegin>iLast ){
    return SQLITE_CORRUPT_BKPT;
  }
  assert( pbegin>addr || pbegin==0 );
  /* Link the new block in: previous link -> start -> pbegin, and store
  ** the block size in bytes 2..3 of the block. */
  put2byte(&data[addr], start);
  put2byte(&data[start], pbegin);
  put2byte(&data[start+2], size);
  pPage->nFree = pPage->nFree + (u16)size;

  /* Coalesce adjacent free blocks */
  addr = hdr + 1;
  while( (pbegin = get2byte(&data[addr]))>0 ){
    int pnext, psize, x;
    assert( pbegin>addr );
    assert( pbegin<=pPage->pBt->usableSize-4 );
    pnext = get2byte(&data[pbegin]);
    psize = get2byte(&data[pbegin+2]);
    if( pbegin + psize + 3 >= pnext && pnext>0 ){
      /* The next block begins within 3 bytes of the end of this one.
      ** Any bytes in between are fragment bytes that get absorbed into
      ** the merged block, so they are deducted from the fragment count
      ** held in header byte hdr+7. */
      int frag = pnext - (pbegin+psize);
      if( (frag<0) || (frag>(int)data[hdr+7]) ){
        return SQLITE_CORRUPT_BKPT;
      }
      data[hdr+7] -= (u8)frag;
      x = get2byte(&data[pnext]);
      put2byte(&data[pbegin], x);
      x = pnext + get2byte(&data[pnext+2]) - pbegin;
      put2byte(&data[pbegin+2], x);
      /* addr is not advanced, so the merged block is examined again
      ** against its new successor. */
    }else{
      addr = pbegin;
    }
  }

  /* If the cell content area begins with a freeblock, remove it. */
  if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
    int top;
    pbegin = get2byte(&data[hdr+1]);
    memcpy(&data[hdr+1], &data[pbegin], 2);
    top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
    put2byte(&data[hdr+5], top);
  }
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
1329 
1330 /*
1331 ** Decode the flags byte (the first byte of the header) for a page
1332 ** and initialize fields of the MemPage structure accordingly.
1333 **
1334 ** Only the following combinations are supported.  Anything different
1335 ** indicates a corrupt database files:
1336 **
1337 **         PTF_ZERODATA
1338 **         PTF_ZERODATA | PTF_LEAF
1339 **         PTF_LEAFDATA | PTF_INTKEY
1340 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1341 */
1342 static int decodeFlags(MemPage *pPage, int flagByte){
1343   BtShared *pBt;     /* A copy of pPage->pBt */
1344 
1345   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1346   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1347   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1348   flagByte &= ~PTF_LEAF;
1349   pPage->childPtrSize = 4-4*pPage->leaf;
1350   pBt = pPage->pBt;
1351   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1352     pPage->intKey = 1;
1353     pPage->hasData = pPage->leaf;
1354     pPage->maxLocal = pBt->maxLeaf;
1355     pPage->minLocal = pBt->minLeaf;
1356   }else if( flagByte==PTF_ZERODATA ){
1357     pPage->intKey = 0;
1358     pPage->hasData = 0;
1359     pPage->maxLocal = pBt->maxLocal;
1360     pPage->minLocal = pBt->minLocal;
1361   }else{
1362     return SQLITE_CORRUPT_BKPT;
1363   }
1364   return SQLITE_OK;
1365 }
1366 
1367 /*
1368 ** Initialize the auxiliary information for a disk block.
1369 **
1370 ** Return SQLITE_OK on success.  If we see that the page does
1371 ** not contain a well-formed database page, then return
1372 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1373 ** guarantee that the page is well-formed.  It only shows that
1374 ** we failed to detect any corruption.
1375 */
1376 static int btreeInitPage(MemPage *pPage){
1377 
1378   assert( pPage->pBt!=0 );
1379   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1380   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1381   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1382   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1383 
1384   if( !pPage->isInit ){
1385     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1386     u8 hdr;            /* Offset to beginning of page header */
1387     u8 *data;          /* Equal to pPage->aData */
1388     BtShared *pBt;        /* The main btree structure */
1389     int usableSize;    /* Amount of usable space on each page */
1390     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1391     int nFree;         /* Number of unused bytes on the page */
1392     int top;           /* First byte of the cell content area */
1393     int iCellFirst;    /* First allowable cell or freeblock offset */
1394     int iCellLast;     /* Last possible cell or freeblock offset */
1395 
1396     pBt = pPage->pBt;
1397 
1398     hdr = pPage->hdrOffset;
1399     data = pPage->aData;
1400     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1401     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1402     pPage->maskPage = (u16)(pBt->pageSize - 1);
1403     pPage->nOverflow = 0;
1404     usableSize = pBt->usableSize;
1405     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1406     top = get2byteNotZero(&data[hdr+5]);
1407     pPage->nCell = get2byte(&data[hdr+3]);
1408     if( pPage->nCell>MX_CELL(pBt) ){
1409       /* To many cells for a single page.  The page must be corrupt */
1410       return SQLITE_CORRUPT_BKPT;
1411     }
1412     testcase( pPage->nCell==MX_CELL(pBt) );
1413 
1414     /* A malformed database page might cause us to read past the end
1415     ** of page when parsing a cell.
1416     **
1417     ** The following block of code checks early to see if a cell extends
1418     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1419     ** returned if it does.
1420     */
1421     iCellFirst = cellOffset + 2*pPage->nCell;
1422     iCellLast = usableSize - 4;
1423 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1424     {
1425       int i;            /* Index into the cell pointer array */
1426       int sz;           /* Size of a cell */
1427 
1428       if( !pPage->leaf ) iCellLast--;
1429       for(i=0; i<pPage->nCell; i++){
1430         pc = get2byte(&data[cellOffset+i*2]);
1431         testcase( pc==iCellFirst );
1432         testcase( pc==iCellLast );
1433         if( pc<iCellFirst || pc>iCellLast ){
1434           return SQLITE_CORRUPT_BKPT;
1435         }
1436         sz = cellSizePtr(pPage, &data[pc]);
1437         testcase( pc+sz==usableSize );
1438         if( pc+sz>usableSize ){
1439           return SQLITE_CORRUPT_BKPT;
1440         }
1441       }
1442       if( !pPage->leaf ) iCellLast++;
1443     }
1444 #endif
1445 
1446     /* Compute the total free space on the page */
1447     pc = get2byte(&data[hdr+1]);
1448     nFree = data[hdr+7] + top;
1449     while( pc>0 ){
1450       u16 next, size;
1451       if( pc<iCellFirst || pc>iCellLast ){
1452         /* Start of free block is off the page */
1453         return SQLITE_CORRUPT_BKPT;
1454       }
1455       next = get2byte(&data[pc]);
1456       size = get2byte(&data[pc+2]);
1457       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1458         /* Free blocks must be in ascending order. And the last byte of
1459 	** the free-block must lie on the database page.  */
1460         return SQLITE_CORRUPT_BKPT;
1461       }
1462       nFree = nFree + size;
1463       pc = next;
1464     }
1465 
1466     /* At this point, nFree contains the sum of the offset to the start
1467     ** of the cell-content area plus the number of free bytes within
1468     ** the cell-content area. If this is greater than the usable-size
1469     ** of the page, then the page must be corrupted. This check also
1470     ** serves to verify that the offset to the start of the cell-content
1471     ** area, according to the page header, lies within the page.
1472     */
1473     if( nFree>usableSize ){
1474       return SQLITE_CORRUPT_BKPT;
1475     }
1476     pPage->nFree = (u16)(nFree - iCellFirst);
1477     pPage->isInit = 1;
1478   }
1479   return SQLITE_OK;
1480 }
1481 
1482 /*
1483 ** Set up a raw page so that it looks like a database page holding
1484 ** no entries.
1485 */
1486 static void zeroPage(MemPage *pPage, int flags){
1487   unsigned char *data = pPage->aData;
1488   BtShared *pBt = pPage->pBt;
1489   u8 hdr = pPage->hdrOffset;
1490   u16 first;
1491 
1492   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1493   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1494   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1495   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1496   assert( sqlite3_mutex_held(pBt->mutex) );
1497   if( pBt->secureDelete ){
1498     memset(&data[hdr], 0, pBt->usableSize - hdr);
1499   }
1500   data[hdr] = (char)flags;
1501   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1502   memset(&data[hdr+1], 0, 4);
1503   data[hdr+7] = 0;
1504   put2byte(&data[hdr+5], pBt->usableSize);
1505   pPage->nFree = (u16)(pBt->usableSize - first);
1506   decodeFlags(pPage, flags);
1507   pPage->hdrOffset = hdr;
1508   pPage->cellOffset = first;
1509   pPage->nOverflow = 0;
1510   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1511   pPage->maskPage = (u16)(pBt->pageSize - 1);
1512   pPage->nCell = 0;
1513   pPage->isInit = 1;
1514 }
1515 
1516 
1517 /*
1518 ** Convert a DbPage obtained from the pager into a MemPage used by
1519 ** the btree layer.
1520 */
1521 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1522   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1523   pPage->aData = sqlite3PagerGetData(pDbPage);
1524   pPage->pDbPage = pDbPage;
1525   pPage->pBt = pBt;
1526   pPage->pgno = pgno;
1527   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1528   return pPage;
1529 }
1530 
1531 /*
1532 ** Get a page from the pager.  Initialize the MemPage.pBt and
1533 ** MemPage.aData elements if needed.
1534 **
1535 ** If the noContent flag is set, it means that we do not care about
1536 ** the content of the page at this time.  So do not go to the disk
1537 ** to fetch the content.  Just fill in the content with zeros for now.
1538 ** If in the future we call sqlite3PagerWrite() on this page, that
1539 ** means we have started to be concerned about content and the disk
1540 ** read should occur at that point.
1541 */
1542 static int btreeGetPage(
1543   BtShared *pBt,       /* The btree */
1544   Pgno pgno,           /* Number of the page to fetch */
1545   MemPage **ppPage,    /* Return the page in this parameter */
1546   int noContent        /* Do not load page content if true */
1547 ){
1548   int rc;
1549   DbPage *pDbPage;
1550 
1551   assert( sqlite3_mutex_held(pBt->mutex) );
1552   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1553   if( rc ) return rc;
1554   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1555   return SQLITE_OK;
1556 }
1557 
1558 /*
1559 ** Retrieve a page from the pager cache. If the requested page is not
1560 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1561 ** MemPage.aData elements if needed.
1562 */
1563 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1564   DbPage *pDbPage;
1565   assert( sqlite3_mutex_held(pBt->mutex) );
1566   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1567   if( pDbPage ){
1568     return btreePageFromDbPage(pDbPage, pgno, pBt);
1569   }
1570   return 0;
1571 }
1572 
/*
** Return the size of the database file in pages, as recorded in
** pBt->nPage.  This routine only reads the cached value, so it
** cannot fail.
*/
static Pgno btreePagecount(BtShared *pBt){
  return pBt->nPage;
}
1580 u32 sqlite3BtreeLastPage(Btree *p){
1581   assert( sqlite3BtreeHoldsMutex(p) );
1582   assert( ((p->pBt->nPage)&0x8000000)==0 );
1583   return (int)btreePagecount(p->pBt);
1584 }
1585 
1586 /*
1587 ** Get a page from the pager and initialize it.  This routine is just a
1588 ** convenience wrapper around separate calls to btreeGetPage() and
1589 ** btreeInitPage().
1590 **
1591 ** If an error occurs, then the value *ppPage is set to is undefined. It
1592 ** may remain unchanged, or it may be set to an invalid value.
1593 */
1594 static int getAndInitPage(
1595   BtShared *pBt,          /* The database file */
1596   Pgno pgno,           /* Number of the page to get */
1597   MemPage **ppPage     /* Write the page pointer here */
1598 ){
1599   int rc;
1600   assert( sqlite3_mutex_held(pBt->mutex) );
1601 
1602   if( pgno>btreePagecount(pBt) ){
1603     rc = SQLITE_CORRUPT_BKPT;
1604   }else{
1605     rc = btreeGetPage(pBt, pgno, ppPage, 0);
1606     if( rc==SQLITE_OK ){
1607       rc = btreeInitPage(*ppPage);
1608       if( rc!=SQLITE_OK ){
1609         releasePage(*ppPage);
1610       }
1611     }
1612   }
1613 
1614   testcase( pgno==0 );
1615   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1616   return rc;
1617 }
1618 
1619 /*
1620 ** Release a MemPage.  This should be called once for each prior
1621 ** call to btreeGetPage.
1622 */
1623 static void releasePage(MemPage *pPage){
1624   if( pPage ){
1625     assert( pPage->aData );
1626     assert( pPage->pBt );
1627     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1628     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1629     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1630     sqlite3PagerUnref(pPage->pDbPage);
1631   }
1632 }
1633 
1634 /*
1635 ** During a rollback, when the pager reloads information into the cache
1636 ** so that the cache is restored to its original state at the start of
1637 ** the transaction, for each page restored this routine is called.
1638 **
1639 ** This routine needs to reset the extra data section at the end of the
1640 ** page to agree with the restored data.
1641 */
1642 static void pageReinit(DbPage *pData){
1643   MemPage *pPage;
1644   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1645   assert( sqlite3PagerPageRefcount(pData)>0 );
1646   if( pPage->isInit ){
1647     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1648     pPage->isInit = 0;
1649     if( sqlite3PagerPageRefcount(pData)>1 ){
1650       /* pPage might not be a btree page;  it might be an overflow page
1651       ** or ptrmap page or a free page.  In those cases, the following
1652       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1653       ** But no harm is done by this.  And it is very important that
1654       ** btreeInitPage() be called on every btree page so we make
1655       ** the call for every page that comes in for re-initing. */
1656       btreeInitPage(pPage);
1657     }
1658   }
1659 }
1660 
1661 /*
1662 ** Invoke the busy handler for a btree.
1663 */
1664 static int btreeInvokeBusyHandler(void *pArg){
1665   BtShared *pBt = (BtShared*)pArg;
1666   assert( pBt->db );
1667   assert( sqlite3_mutex_held(pBt->db->mutex) );
1668   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1669 }
1670 
1671 /*
1672 ** Open a database file.
1673 **
1674 ** zFilename is the name of the database file.  If zFilename is NULL
1675 ** then an ephemeral database is created.  The ephemeral database might
1676 ** be exclusively in memory, or it might use a disk-based memory cache.
1677 ** Either way, the ephemeral database will be automatically deleted
1678 ** when sqlite3BtreeClose() is called.
1679 **
1680 ** If zFilename is ":memory:" then an in-memory database is created
1681 ** that is automatically destroyed when it is closed.
1682 **
1683 ** The "flags" parameter is a bitmask that might contain bits
1684 ** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK.  The BTREE_NO_READLOCK
1685 ** bit is also set if the SQLITE_NoReadlock flags is set in db->flags.
1686 ** These flags are passed through into sqlite3PagerOpen() and must
1687 ** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK.
1688 **
1689 ** If the database is already opened in the same database connection
1690 ** and we are in shared cache mode, then the open will fail with an
1691 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1692 ** objects in the same database connection since doing so will lead
1693 ** to problems with locking.
1694 */
1695 int sqlite3BtreeOpen(
1696   const char *zFilename,  /* Name of the file containing the BTree database */
1697   sqlite3 *db,            /* Associated database handle */
1698   Btree **ppBtree,        /* Pointer to new Btree object written here */
1699   int flags,              /* Options */
1700   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1701 ){
1702   sqlite3_vfs *pVfs;             /* The VFS to use for this btree */
1703   BtShared *pBt = 0;             /* Shared part of btree structure */
1704   Btree *p;                      /* Handle to return */
1705   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1706   int rc = SQLITE_OK;            /* Result code from this function */
1707   u8 nReserve;                   /* Byte of unused space on each page */
1708   unsigned char zDbHeader[100];  /* Database header content */
1709 
1710   /* True if opening an ephemeral, temporary database */
1711   const int isTempDb = zFilename==0 || zFilename[0]==0;
1712 
1713   /* Set the variable isMemdb to true for an in-memory database, or
1714   ** false for a file-based database. This symbol is only required if
1715   ** either of the shared-data or autovacuum features are compiled
1716   ** into the library.
1717   */
1718 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1719   #ifdef SQLITE_OMIT_MEMORYDB
1720     const int isMemdb = 0;
1721   #else
1722     const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1723                          || (isTempDb && sqlite3TempInMemory(db));
1724   #endif
1725 #endif
1726 
1727   assert( db!=0 );
1728   assert( sqlite3_mutex_held(db->mutex) );
1729   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
1730 
1731   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1732   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1733 
1734   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1735   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1736 
1737   if( db->flags & SQLITE_NoReadlock ){
1738     flags |= BTREE_NO_READLOCK;
1739   }
1740   if( isMemdb ){
1741     flags |= BTREE_MEMORY;
1742   }
1743   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1744     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1745   }
1746   pVfs = db->pVfs;
1747   p = sqlite3MallocZero(sizeof(Btree));
1748   if( !p ){
1749     return SQLITE_NOMEM;
1750   }
1751   p->inTrans = TRANS_NONE;
1752   p->db = db;
1753 #ifndef SQLITE_OMIT_SHARED_CACHE
1754   p->lock.pBtree = p;
1755   p->lock.iTable = 1;
1756 #endif
1757 
1758 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1759   /*
1760   ** If this Btree is a candidate for shared cache, try to find an
1761   ** existing BtShared object that we can share with
1762   */
1763   if( isMemdb==0 && isTempDb==0 ){
1764     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1765       int nFullPathname = pVfs->mxPathname+1;
1766       char *zFullPathname = sqlite3Malloc(nFullPathname);
1767       sqlite3_mutex *mutexShared;
1768       p->sharable = 1;
1769       if( !zFullPathname ){
1770         sqlite3_free(p);
1771         return SQLITE_NOMEM;
1772       }
1773       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1774       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1775       sqlite3_mutex_enter(mutexOpen);
1776       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1777       sqlite3_mutex_enter(mutexShared);
1778       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1779         assert( pBt->nRef>0 );
1780         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1781                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1782           int iDb;
1783           for(iDb=db->nDb-1; iDb>=0; iDb--){
1784             Btree *pExisting = db->aDb[iDb].pBt;
1785             if( pExisting && pExisting->pBt==pBt ){
1786               sqlite3_mutex_leave(mutexShared);
1787               sqlite3_mutex_leave(mutexOpen);
1788               sqlite3_free(zFullPathname);
1789               sqlite3_free(p);
1790               return SQLITE_CONSTRAINT;
1791             }
1792           }
1793           p->pBt = pBt;
1794           pBt->nRef++;
1795           break;
1796         }
1797       }
1798       sqlite3_mutex_leave(mutexShared);
1799       sqlite3_free(zFullPathname);
1800     }
1801 #ifdef SQLITE_DEBUG
1802     else{
1803       /* In debug mode, we mark all persistent databases as sharable
1804       ** even when they are not.  This exercises the locking code and
1805       ** gives more opportunity for asserts(sqlite3_mutex_held())
1806       ** statements to find locking problems.
1807       */
1808       p->sharable = 1;
1809     }
1810 #endif
1811   }
1812 #endif
1813   if( pBt==0 ){
1814     /*
1815     ** The following asserts make sure that structures used by the btree are
1816     ** the right size.  This is to guard against size changes that result
1817     ** when compiling on a different architecture.
1818     */
1819     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1820     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1821     assert( sizeof(u32)==4 );
1822     assert( sizeof(u16)==2 );
1823     assert( sizeof(Pgno)==4 );
1824 
1825     pBt = sqlite3MallocZero( sizeof(*pBt) );
1826     if( pBt==0 ){
1827       rc = SQLITE_NOMEM;
1828       goto btree_open_out;
1829     }
1830     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1831                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
1832     if( rc==SQLITE_OK ){
1833       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1834     }
1835     if( rc!=SQLITE_OK ){
1836       goto btree_open_out;
1837     }
1838     pBt->openFlags = (u8)flags;
1839     pBt->db = db;
1840     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1841     p->pBt = pBt;
1842 
1843     pBt->pCursor = 0;
1844     pBt->pPage1 = 0;
1845     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1846 #ifdef SQLITE_SECURE_DELETE
1847     pBt->secureDelete = 1;
1848 #endif
1849     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1850     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1851          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1852       pBt->pageSize = 0;
1853 #ifndef SQLITE_OMIT_AUTOVACUUM
1854       /* If the magic name ":memory:" will create an in-memory database, then
1855       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1856       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1857       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1858       ** regular file-name. In this case the auto-vacuum applies as per normal.
1859       */
1860       if( zFilename && !isMemdb ){
1861         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1862         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1863       }
1864 #endif
1865       nReserve = 0;
1866     }else{
1867       nReserve = zDbHeader[20];
1868       pBt->pageSizeFixed = 1;
1869 #ifndef SQLITE_OMIT_AUTOVACUUM
1870       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1871       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1872 #endif
1873     }
1874     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1875     if( rc ) goto btree_open_out;
1876     pBt->usableSize = pBt->pageSize - nReserve;
1877     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1878 
1879 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1880     /* Add the new BtShared object to the linked list sharable BtShareds.
1881     */
1882     if( p->sharable ){
1883       sqlite3_mutex *mutexShared;
1884       pBt->nRef = 1;
1885       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1886       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1887         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1888         if( pBt->mutex==0 ){
1889           rc = SQLITE_NOMEM;
1890           db->mallocFailed = 0;
1891           goto btree_open_out;
1892         }
1893       }
1894       sqlite3_mutex_enter(mutexShared);
1895       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1896       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1897       sqlite3_mutex_leave(mutexShared);
1898     }
1899 #endif
1900   }
1901 
1902 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1903   /* If the new Btree uses a sharable pBtShared, then link the new
1904   ** Btree into the list of all sharable Btrees for the same connection.
1905   ** The list is kept in ascending order by pBt address.
1906   */
1907   if( p->sharable ){
1908     int i;
1909     Btree *pSib;
1910     for(i=0; i<db->nDb; i++){
1911       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1912         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1913         if( p->pBt<pSib->pBt ){
1914           p->pNext = pSib;
1915           p->pPrev = 0;
1916           pSib->pPrev = p;
1917         }else{
1918           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1919             pSib = pSib->pNext;
1920           }
1921           p->pNext = pSib->pNext;
1922           p->pPrev = pSib;
1923           if( p->pNext ){
1924             p->pNext->pPrev = p;
1925           }
1926           pSib->pNext = p;
1927         }
1928         break;
1929       }
1930     }
1931   }
1932 #endif
1933   *ppBtree = p;
1934 
1935 btree_open_out:
1936   if( rc!=SQLITE_OK ){
1937     if( pBt && pBt->pPager ){
1938       sqlite3PagerClose(pBt->pPager);
1939     }
1940     sqlite3_free(pBt);
1941     sqlite3_free(p);
1942     *ppBtree = 0;
1943   }else{
1944     /* If the B-Tree was successfully opened, set the pager-cache size to the
1945     ** default value. Except, when opening on an existing shared pager-cache,
1946     ** do not change the pager-cache size.
1947     */
1948     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
1949       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
1950     }
1951   }
1952   if( mutexOpen ){
1953     assert( sqlite3_mutex_held(mutexOpen) );
1954     sqlite3_mutex_leave(mutexOpen);
1955   }
1956   return rc;
1957 }
1958 
1959 /*
1960 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1961 ** remove the BtShared structure from the sharing list.  Return
1962 ** true if the BtShared.nRef counter reaches zero and return
1963 ** false if it is still positive.
1964 */
1965 static int removeFromSharingList(BtShared *pBt){
1966 #ifndef SQLITE_OMIT_SHARED_CACHE
1967   sqlite3_mutex *pMaster;
1968   BtShared *pList;
1969   int removed = 0;
1970 
1971   assert( sqlite3_mutex_notheld(pBt->mutex) );
1972   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1973   sqlite3_mutex_enter(pMaster);
1974   pBt->nRef--;
1975   if( pBt->nRef<=0 ){
1976     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1977       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1978     }else{
1979       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1980       while( ALWAYS(pList) && pList->pNext!=pBt ){
1981         pList=pList->pNext;
1982       }
1983       if( ALWAYS(pList) ){
1984         pList->pNext = pBt->pNext;
1985       }
1986     }
1987     if( SQLITE_THREADSAFE ){
1988       sqlite3_mutex_free(pBt->mutex);
1989     }
1990     removed = 1;
1991   }
1992   sqlite3_mutex_leave(pMaster);
1993   return removed;
1994 #else
1995   return 1;
1996 #endif
1997 }
1998 
1999 /*
2000 ** Make sure pBt->pTmpSpace points to an allocation of
2001 ** MX_CELL_SIZE(pBt) bytes.
2002 */
2003 static void allocateTempSpace(BtShared *pBt){
2004   if( !pBt->pTmpSpace ){
2005     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2006   }
2007 }
2008 
2009 /*
2010 ** Free the pBt->pTmpSpace allocation
2011 */
2012 static void freeTempSpace(BtShared *pBt){
2013   sqlite3PageFree( pBt->pTmpSpace);
2014   pBt->pTmpSpace = 0;
2015 }
2016 
2017 /*
2018 ** Close an open database and invalidate all cursors.
2019 */
2020 int sqlite3BtreeClose(Btree *p){
2021   BtShared *pBt = p->pBt;
2022   BtCursor *pCur;
2023 
2024   /* Close all cursors opened via this handle.  */
2025   assert( sqlite3_mutex_held(p->db->mutex) );
2026   sqlite3BtreeEnter(p);
2027   pCur = pBt->pCursor;
2028   while( pCur ){
2029     BtCursor *pTmp = pCur;
2030     pCur = pCur->pNext;
2031     if( pTmp->pBtree==p ){
2032       sqlite3BtreeCloseCursor(pTmp);
2033     }
2034   }
2035 
2036   /* Rollback any active transaction and free the handle structure.
2037   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2038   ** this handle.
2039   */
2040   sqlite3BtreeRollback(p);
2041   sqlite3BtreeLeave(p);
2042 
2043   /* If there are still other outstanding references to the shared-btree
2044   ** structure, return now. The remainder of this procedure cleans
2045   ** up the shared-btree.
2046   */
2047   assert( p->wantToLock==0 && p->locked==0 );
2048   if( !p->sharable || removeFromSharingList(pBt) ){
2049     /* The pBt is no longer on the sharing list, so we can access
2050     ** it without having to hold the mutex.
2051     **
2052     ** Clean out and delete the BtShared object.
2053     */
2054     assert( !pBt->pCursor );
2055     sqlite3PagerClose(pBt->pPager);
2056     if( pBt->xFreeSchema && pBt->pSchema ){
2057       pBt->xFreeSchema(pBt->pSchema);
2058     }
2059     sqlite3DbFree(0, pBt->pSchema);
2060     freeTempSpace(pBt);
2061     sqlite3_free(pBt);
2062   }
2063 
2064 #ifndef SQLITE_OMIT_SHARED_CACHE
2065   assert( p->wantToLock==0 );
2066   assert( p->locked==0 );
2067   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2068   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2069 #endif
2070 
2071   sqlite3_free(p);
2072   return SQLITE_OK;
2073 }
2074 
2075 /*
2076 ** Change the limit on the number of pages allowed in the cache.
2077 **
2078 ** The maximum number of cache pages is set to the absolute
2079 ** value of mxPage.  If mxPage is negative, the pager will
2080 ** operate asynchronously - it will not stop to do fsync()s
2081 ** to insure data is written to the disk surface before
2082 ** continuing.  Transactions still work if synchronous is off,
2083 ** and the database cannot be corrupted if this program
2084 ** crashes.  But if the operating system crashes or there is
2085 ** an abrupt power failure when synchronous is off, the database
2086 ** could be left in an inconsistent and unrecoverable state.
2087 ** Synchronous is on by default so database corruption is not
2088 ** normally a worry.
2089 */
2090 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2091   BtShared *pBt = p->pBt;
2092   assert( sqlite3_mutex_held(p->db->mutex) );
2093   sqlite3BtreeEnter(p);
2094   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2095   sqlite3BtreeLeave(p);
2096   return SQLITE_OK;
2097 }
2098 
2099 /*
2100 ** Change the way data is synced to disk in order to increase or decrease
2101 ** how well the database resists damage due to OS crashes and power
2102 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2103 ** there is a high probability of damage)  Level 2 is the default.  There
2104 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2105 ** probability of damage to near zero but with a write performance reduction.
2106 */
2107 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2108 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
2109   BtShared *pBt = p->pBt;
2110   assert( sqlite3_mutex_held(p->db->mutex) );
2111   sqlite3BtreeEnter(p);
2112   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
2113   sqlite3BtreeLeave(p);
2114   return SQLITE_OK;
2115 }
2116 #endif
2117 
2118 /*
2119 ** Return TRUE if the given btree is set to safety level 1.  In other
2120 ** words, return TRUE if no sync() occurs on the disk files.
2121 */
2122 int sqlite3BtreeSyncDisabled(Btree *p){
2123   BtShared *pBt = p->pBt;
2124   int rc;
2125   assert( sqlite3_mutex_held(p->db->mutex) );
2126   sqlite3BtreeEnter(p);
2127   assert( pBt && pBt->pPager );
2128   rc = sqlite3PagerNosync(pBt->pPager);
2129   sqlite3BtreeLeave(p);
2130   return rc;
2131 }
2132 
2133 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2134 /*
2135 ** Change the default pages size and the number of reserved bytes per page.
2136 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2137 ** without changing anything.
2138 **
2139 ** The page size must be a power of 2 between 512 and 65536.  If the page
2140 ** size supplied does not meet this constraint then the page size is not
2141 ** changed.
2142 **
2143 ** Page sizes are constrained to be a power of two so that the region
2144 ** of the database file used for locking (beginning at PENDING_BYTE,
2145 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2146 ** at the beginning of a page.
2147 **
2148 ** If parameter nReserve is less than zero, then the number of reserved
2149 ** bytes per page is left unchanged.
2150 **
2151 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
2152 ** and autovacuum mode can no longer be changed.
2153 */
2154 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2155   int rc = SQLITE_OK;
2156   BtShared *pBt = p->pBt;
2157   assert( nReserve>=-1 && nReserve<=255 );
2158   sqlite3BtreeEnter(p);
2159   if( pBt->pageSizeFixed ){
2160     sqlite3BtreeLeave(p);
2161     return SQLITE_READONLY;
2162   }
2163   if( nReserve<0 ){
2164     nReserve = pBt->pageSize - pBt->usableSize;
2165   }
2166   assert( nReserve>=0 && nReserve<=255 );
2167   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2168         ((pageSize-1)&pageSize)==0 ){
2169     assert( (pageSize & 7)==0 );
2170     assert( !pBt->pPage1 && !pBt->pCursor );
2171     pBt->pageSize = (u32)pageSize;
2172     freeTempSpace(pBt);
2173   }
2174   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2175   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2176   if( iFix ) pBt->pageSizeFixed = 1;
2177   sqlite3BtreeLeave(p);
2178   return rc;
2179 }
2180 
2181 /*
2182 ** Return the currently defined page size
2183 */
2184 int sqlite3BtreeGetPageSize(Btree *p){
2185   return p->pBt->pageSize;
2186 }
2187 
2188 /*
2189 ** Return the number of bytes of space at the end of every page that
2190 ** are intentually left unused.  This is the "reserved" space that is
2191 ** sometimes used by extensions.
2192 */
2193 int sqlite3BtreeGetReserve(Btree *p){
2194   int n;
2195   sqlite3BtreeEnter(p);
2196   n = p->pBt->pageSize - p->pBt->usableSize;
2197   sqlite3BtreeLeave(p);
2198   return n;
2199 }
2200 
2201 /*
2202 ** Set the maximum page count for a database if mxPage is positive.
2203 ** No changes are made if mxPage is 0 or negative.
2204 ** Regardless of the value of mxPage, return the maximum page count.
2205 */
2206 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2207   int n;
2208   sqlite3BtreeEnter(p);
2209   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2210   sqlite3BtreeLeave(p);
2211   return n;
2212 }
2213 
2214 /*
2215 ** Set the secureDelete flag if newFlag is 0 or 1.  If newFlag is -1,
2216 ** then make no changes.  Always return the value of the secureDelete
2217 ** setting after the change.
2218 */
2219 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2220   int b;
2221   if( p==0 ) return 0;
2222   sqlite3BtreeEnter(p);
2223   if( newFlag>=0 ){
2224     p->pBt->secureDelete = (newFlag!=0) ? 1 : 0;
2225   }
2226   b = p->pBt->secureDelete;
2227   sqlite3BtreeLeave(p);
2228   return b;
2229 }
2230 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2231 
2232 /*
2233 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2234 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2235 ** is disabled. The default value for the auto-vacuum property is
2236 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2237 */
2238 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2239 #ifdef SQLITE_OMIT_AUTOVACUUM
2240   return SQLITE_READONLY;
2241 #else
2242   BtShared *pBt = p->pBt;
2243   int rc = SQLITE_OK;
2244   u8 av = (u8)autoVacuum;
2245 
2246   sqlite3BtreeEnter(p);
2247   if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
2248     rc = SQLITE_READONLY;
2249   }else{
2250     pBt->autoVacuum = av ?1:0;
2251     pBt->incrVacuum = av==2 ?1:0;
2252   }
2253   sqlite3BtreeLeave(p);
2254   return rc;
2255 #endif
2256 }
2257 
2258 /*
2259 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2260 ** enabled 1 is returned. Otherwise 0.
2261 */
2262 int sqlite3BtreeGetAutoVacuum(Btree *p){
2263 #ifdef SQLITE_OMIT_AUTOVACUUM
2264   return BTREE_AUTOVACUUM_NONE;
2265 #else
2266   int rc;
2267   sqlite3BtreeEnter(p);
2268   rc = (
2269     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2270     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2271     BTREE_AUTOVACUUM_INCR
2272   );
2273   sqlite3BtreeLeave(p);
2274   return rc;
2275 #endif
2276 }
2277 
2278 
2279 /*
2280 ** Get a reference to pPage1 of the database file.  This will
2281 ** also acquire a readlock on that file.
2282 **
2283 ** SQLITE_OK is returned on success.  If the file is not a
2284 ** well-formed database file, then SQLITE_CORRUPT is returned.
2285 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2286 ** is returned if we run out of memory.
2287 */
2288 static int lockBtree(BtShared *pBt){
2289   int rc;              /* Result code from subfunctions */
2290   MemPage *pPage1;     /* Page 1 of the database file */
2291   int nPage;           /* Number of pages in the database */
2292   int nPageFile = 0;   /* Number of pages in the database file */
2293   int nPageHeader;     /* Number of pages in the database according to hdr */
2294 
2295   assert( sqlite3_mutex_held(pBt->mutex) );
2296   assert( pBt->pPage1==0 );
2297   rc = sqlite3PagerSharedLock(pBt->pPager);
2298   if( rc!=SQLITE_OK ) return rc;
2299   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2300   if( rc!=SQLITE_OK ) return rc;
2301 
2302   /* Do some checking to help insure the file we opened really is
2303   ** a valid database file.
2304   */
2305   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2306   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2307   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2308     nPage = nPageFile;
2309   }
2310   if( nPage>0 ){
2311     u32 pageSize;
2312     u32 usableSize;
2313     u8 *page1 = pPage1->aData;
2314     rc = SQLITE_NOTADB;
2315     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2316       goto page1_init_failed;
2317     }
2318 
2319 #ifdef SQLITE_OMIT_WAL
2320     if( page1[18]>1 ){
2321       pBt->readOnly = 1;
2322     }
2323     if( page1[19]>1 ){
2324       goto page1_init_failed;
2325     }
2326 #else
2327     if( page1[18]>2 ){
2328       pBt->readOnly = 1;
2329     }
2330     if( page1[19]>2 ){
2331       goto page1_init_failed;
2332     }
2333 
2334     /* If the write version is set to 2, this database should be accessed
2335     ** in WAL mode. If the log is not already open, open it now. Then
2336     ** return SQLITE_OK and return without populating BtShared.pPage1.
2337     ** The caller detects this and calls this function again. This is
2338     ** required as the version of page 1 currently in the page1 buffer
2339     ** may not be the latest version - there may be a newer one in the log
2340     ** file.
2341     */
2342     if( page1[19]==2 && pBt->doNotUseWAL==0 ){
2343       int isOpen = 0;
2344       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2345       if( rc!=SQLITE_OK ){
2346         goto page1_init_failed;
2347       }else if( isOpen==0 ){
2348         releasePage(pPage1);
2349         return SQLITE_OK;
2350       }
2351       rc = SQLITE_NOTADB;
2352     }
2353 #endif
2354 
2355     /* The maximum embedded fraction must be exactly 25%.  And the minimum
2356     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2357     ** The original design allowed these amounts to vary, but as of
2358     ** version 3.6.0, we require them to be fixed.
2359     */
2360     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2361       goto page1_init_failed;
2362     }
2363     pageSize = (page1[16]<<8) | (page1[17]<<16);
2364     if( ((pageSize-1)&pageSize)!=0
2365      || pageSize>SQLITE_MAX_PAGE_SIZE
2366      || pageSize<=256
2367     ){
2368       goto page1_init_failed;
2369     }
2370     assert( (pageSize & 7)==0 );
2371     usableSize = pageSize - page1[20];
2372     if( (u32)pageSize!=pBt->pageSize ){
2373       /* After reading the first page of the database assuming a page size
2374       ** of BtShared.pageSize, we have discovered that the page-size is
2375       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2376       ** zero and return SQLITE_OK. The caller will call this function
2377       ** again with the correct page-size.
2378       */
2379       releasePage(pPage1);
2380       pBt->usableSize = usableSize;
2381       pBt->pageSize = pageSize;
2382       freeTempSpace(pBt);
2383       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2384                                    pageSize-usableSize);
2385       return rc;
2386     }
2387     if( nPageHeader>nPageFile ){
2388       rc = SQLITE_CORRUPT_BKPT;
2389       goto page1_init_failed;
2390     }
2391     if( usableSize<480 ){
2392       goto page1_init_failed;
2393     }
2394     pBt->pageSize = pageSize;
2395     pBt->usableSize = usableSize;
2396 #ifndef SQLITE_OMIT_AUTOVACUUM
2397     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2398     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2399 #endif
2400   }
2401 
2402   /* maxLocal is the maximum amount of payload to store locally for
2403   ** a cell.  Make sure it is small enough so that at least minFanout
2404   ** cells can will fit on one page.  We assume a 10-byte page header.
2405   ** Besides the payload, the cell must store:
2406   **     2-byte pointer to the cell
2407   **     4-byte child pointer
2408   **     9-byte nKey value
2409   **     4-byte nData value
2410   **     4-byte overflow page pointer
2411   ** So a cell consists of a 2-byte pointer, a header which is as much as
2412   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2413   ** page pointer.
2414   */
2415   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2416   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2417   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2418   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2419   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2420   pBt->pPage1 = pPage1;
2421   pBt->nPage = nPage;
2422   return SQLITE_OK;
2423 
2424 page1_init_failed:
2425   releasePage(pPage1);
2426   pBt->pPage1 = 0;
2427   return rc;
2428 }
2429 
2430 /*
2431 ** If there are no outstanding cursors and we are not in the middle
2432 ** of a transaction but there is a read lock on the database, then
2433 ** this routine unrefs the first page of the database file which
2434 ** has the effect of releasing the read lock.
2435 **
2436 ** If there is a transaction in progress, this routine is a no-op.
2437 */
2438 static void unlockBtreeIfUnused(BtShared *pBt){
2439   assert( sqlite3_mutex_held(pBt->mutex) );
2440   assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2441   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2442     assert( pBt->pPage1->aData );
2443     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2444     assert( pBt->pPage1->aData );
2445     releasePage(pBt->pPage1);
2446     pBt->pPage1 = 0;
2447   }
2448 }
2449 
2450 /*
2451 ** If pBt points to an empty file then convert that empty file
2452 ** into a new empty database by initializing the first page of
2453 ** the database.
2454 */
2455 static int newDatabase(BtShared *pBt){
2456   MemPage *pP1;
2457   unsigned char *data;
2458   int rc;
2459 
2460   assert( sqlite3_mutex_held(pBt->mutex) );
2461   if( pBt->nPage>0 ){
2462     return SQLITE_OK;
2463   }
2464   pP1 = pBt->pPage1;
2465   assert( pP1!=0 );
2466   data = pP1->aData;
2467   rc = sqlite3PagerWrite(pP1->pDbPage);
2468   if( rc ) return rc;
2469   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2470   assert( sizeof(zMagicHeader)==16 );
2471   data[16] = (u8)((pBt->pageSize>>8)&0xff);
2472   data[17] = (u8)((pBt->pageSize>>16)&0xff);
2473   data[18] = 1;
2474   data[19] = 1;
2475   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2476   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2477   data[21] = 64;
2478   data[22] = 32;
2479   data[23] = 32;
2480   memset(&data[24], 0, 100-24);
2481   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2482   pBt->pageSizeFixed = 1;
2483 #ifndef SQLITE_OMIT_AUTOVACUUM
2484   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2485   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2486   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2487   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2488 #endif
2489   pBt->nPage = 1;
2490   data[31] = 1;
2491   return SQLITE_OK;
2492 }
2493 
2494 /*
2495 ** Attempt to start a new transaction. A write-transaction
2496 ** is started if the second argument is nonzero, otherwise a read-
2497 ** transaction.  If the second argument is 2 or more and exclusive
2498 ** transaction is started, meaning that no other process is allowed
2499 ** to access the database.  A preexisting transaction may not be
2500 ** upgraded to exclusive by calling this routine a second time - the
2501 ** exclusivity flag only works for a new transaction.
2502 **
2503 ** A write-transaction must be started before attempting any
2504 ** changes to the database.  None of the following routines
2505 ** will work unless a transaction is started first:
2506 **
2507 **      sqlite3BtreeCreateTable()
2508 **      sqlite3BtreeCreateIndex()
2509 **      sqlite3BtreeClearTable()
2510 **      sqlite3BtreeDropTable()
2511 **      sqlite3BtreeInsert()
2512 **      sqlite3BtreeDelete()
2513 **      sqlite3BtreeUpdateMeta()
2514 **
2515 ** If an initial attempt to acquire the lock fails because of lock contention
2516 ** and the database was previously unlocked, then invoke the busy handler
2517 ** if there is one.  But if there was previously a read-lock, do not
2518 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
2519 ** returned when there is already a read-lock in order to avoid a deadlock.
2520 **
2521 ** Suppose there are two processes A and B.  A has a read lock and B has
2522 ** a reserved lock.  B tries to promote to exclusive but is blocked because
2523 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
2524 ** One or the other of the two processes must give way or there can be
2525 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
2526 ** when A already has a read lock, we encourage A to give up and let B
2527 ** proceed.
2528 */
int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  sqlite3 *pBlock = 0;      /* Connection that blocks this request, if any */
  BtShared *pBt = p->pBt;   /* Shared b-tree state behind handle p */
  int rc = SQLITE_OK;       /* Return code */

  /* wrflag==0 requests a read transaction and any non-zero value a write
  ** transaction.  A value greater than 1 is forwarded to the pager as its
  ** second argument and recorded in pBt->isExclusive below.
  */
  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }

  /* Write transactions are not possible on a read-only database */
  if( pBt->readOnly && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  /* If another database handle has already opened a write transaction
  ** on this shared-btree structure and a second write transaction is
  ** requested, or if pBt->isPending is set, the current writer blocks
  ** this request.  Otherwise, when wrflag>1, any lock held through a
  ** different Btree handle blocks it.  In either case the blocking
  ** connection is reported via sqlite3ConnectionBlocked() and
  ** SQLITE_LOCKED_SHAREDCACHE is returned.
  */
  if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
    pBlock = pBt->pWriter->db;
  }else if( wrflag>1 ){
    BtLock *pIter;
    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
      if( pIter->pBtree!=p ){
        pBlock = pIter->pBtree->db;
        break;
      }
    }
  }
  if( pBlock ){
    sqlite3ConnectionBlocked(p->db, pBlock);
    rc = SQLITE_LOCKED_SHAREDCACHE;
    goto trans_begun;
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Remember whether the database had no pages when the transaction
  ** started; sqlite3BtreeSavepoint() consults this flag when the whole
  ** transaction is rolled back via savepoint -1. */
  pBt->initiallyEmpty = (u8)(pBt->nPage==0);
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* readOnly is tested again here, after lockBtree() has run */
      if( pBt->readOnly ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }
      }
    }

    if( rc!=SQLITE_OK ){
      unlockBtreeIfUnused(pBt);
    }
  /* Retry only while the (masked) result is SQLITE_BUSY, no transaction is
  ** open on the shared b-tree, and btreeInvokeBusyHandler() returns
  ** non-zero. */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* First transaction opened through this handle: count it and, for
      ** a sharable handle, link its page-1 read-lock into pBt->pLock. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
	assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared state is only ever raised here, never lowered */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->isExclusive = (u8)(wrflag>1);
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }


trans_begun:
  if( rc==SQLITE_OK && wrflag ){
    /* This call makes sure that the pager has the correct number of
    ** open savepoints. If the second parameter is greater than 0 and
    ** the sub-journal is not already open, then it will be opened here.
    */
    rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
2661 
2662 #ifndef SQLITE_OMIT_AUTOVACUUM
2663 
2664 /*
2665 ** Set the pointer-map entries for all children of page pPage. Also, if
2666 ** pPage contains cells that point to overflow pages, set the pointer
2667 ** map entries for the overflow pages as well.
2668 */
2669 static int setChildPtrmaps(MemPage *pPage){
2670   int i;                             /* Counter variable */
2671   int nCell;                         /* Number of cells in page pPage */
2672   int rc;                            /* Return code */
2673   BtShared *pBt = pPage->pBt;
2674   u8 isInitOrig = pPage->isInit;
2675   Pgno pgno = pPage->pgno;
2676 
2677   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2678   rc = btreeInitPage(pPage);
2679   if( rc!=SQLITE_OK ){
2680     goto set_child_ptrmaps_out;
2681   }
2682   nCell = pPage->nCell;
2683 
2684   for(i=0; i<nCell; i++){
2685     u8 *pCell = findCell(pPage, i);
2686 
2687     ptrmapPutOvflPtr(pPage, pCell, &rc);
2688 
2689     if( !pPage->leaf ){
2690       Pgno childPgno = get4byte(pCell);
2691       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2692     }
2693   }
2694 
2695   if( !pPage->leaf ){
2696     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2697     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2698   }
2699 
2700 set_child_ptrmaps_out:
2701   pPage->isInit = isInitOrig;
2702   return rc;
2703 }
2704 
2705 /*
2706 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
2707 ** that it points to iTo. Parameter eType describes the type of pointer to
2708 ** be modified, as  follows:
2709 **
2710 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2711 **                   page of pPage.
2712 **
2713 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2714 **                   page pointed to by one of the cells on pPage.
2715 **
2716 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2717 **                   overflow page in the list.
2718 */
2719 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2720   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2721   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2722   if( eType==PTRMAP_OVERFLOW2 ){
2723     /* The pointer is always the first 4 bytes of the page in this case.  */
2724     if( get4byte(pPage->aData)!=iFrom ){
2725       return SQLITE_CORRUPT_BKPT;
2726     }
2727     put4byte(pPage->aData, iTo);
2728   }else{
2729     u8 isInitOrig = pPage->isInit;
2730     int i;
2731     int nCell;
2732 
2733     btreeInitPage(pPage);
2734     nCell = pPage->nCell;
2735 
2736     for(i=0; i<nCell; i++){
2737       u8 *pCell = findCell(pPage, i);
2738       if( eType==PTRMAP_OVERFLOW1 ){
2739         CellInfo info;
2740         btreeParseCellPtr(pPage, pCell, &info);
2741         if( info.iOverflow ){
2742           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2743             put4byte(&pCell[info.iOverflow], iTo);
2744             break;
2745           }
2746         }
2747       }else{
2748         if( get4byte(pCell)==iFrom ){
2749           put4byte(pCell, iTo);
2750           break;
2751         }
2752       }
2753     }
2754 
2755     if( i==nCell ){
2756       if( eType!=PTRMAP_BTREE ||
2757           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2758         return SQLITE_CORRUPT_BKPT;
2759       }
2760       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2761     }
2762 
2763     pPage->isInit = isInitOrig;
2764   }
2765   return SQLITE_OK;
2766 }
2767 
2768 
2769 /*
2770 ** Move the open database page pDbPage to location iFreePage in the
2771 ** database. The pDbPage reference remains valid.
2772 **
2773 ** The isCommit flag indicates that there is no need to remember that
2774 ** the journal needs to be sync()ed before database page pDbPage->pgno
2775 ** can be written to. The caller has already promised not to write to that
2776 ** page.
2777 */
2778 static int relocatePage(
2779   BtShared *pBt,           /* Btree */
2780   MemPage *pDbPage,        /* Open page to move */
2781   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2782   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2783   Pgno iFreePage,          /* The location to move pDbPage to */
2784   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
2785 ){
2786   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2787   Pgno iDbPage = pDbPage->pgno;
2788   Pager *pPager = pBt->pPager;
2789   int rc;
2790 
2791   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2792       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2793   assert( sqlite3_mutex_held(pBt->mutex) );
2794   assert( pDbPage->pBt==pBt );
2795 
2796   /* Move page iDbPage from its current location to page number iFreePage */
2797   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2798       iDbPage, iFreePage, iPtrPage, eType));
2799   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2800   if( rc!=SQLITE_OK ){
2801     return rc;
2802   }
2803   pDbPage->pgno = iFreePage;
2804 
2805   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2806   ** that point to overflow pages. The pointer map entries for all these
2807   ** pages need to be changed.
2808   **
2809   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2810   ** pointer to a subsequent overflow page. If this is the case, then
2811   ** the pointer map needs to be updated for the subsequent overflow page.
2812   */
2813   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2814     rc = setChildPtrmaps(pDbPage);
2815     if( rc!=SQLITE_OK ){
2816       return rc;
2817     }
2818   }else{
2819     Pgno nextOvfl = get4byte(pDbPage->aData);
2820     if( nextOvfl!=0 ){
2821       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2822       if( rc!=SQLITE_OK ){
2823         return rc;
2824       }
2825     }
2826   }
2827 
2828   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2829   ** that it points at iFreePage. Also fix the pointer map entry for
2830   ** iPtrPage.
2831   */
2832   if( eType!=PTRMAP_ROOTPAGE ){
2833     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2834     if( rc!=SQLITE_OK ){
2835       return rc;
2836     }
2837     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2838     if( rc!=SQLITE_OK ){
2839       releasePage(pPtrPage);
2840       return rc;
2841     }
2842     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2843     releasePage(pPtrPage);
2844     if( rc==SQLITE_OK ){
2845       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2846     }
2847   }
2848   return rc;
2849 }
2850 
2851 /* Forward declaration required by incrVacuumStep(). */
2852 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2853 
/*
** Perform a single step of an incremental-vacuum. If successful,
** return SQLITE_OK. If there is no work to do (and therefore no
** point in calling this function again), return SQLITE_DONE.
**
** More specifically, this function attempts to re-organize the
** database so that the last page of the file currently in use
** is no longer in use.
**
** If the nFin parameter is non-zero, this function assumes
** that the caller will keep calling incrVacuumStep() until
** it returns SQLITE_DONE or an error, and that nFin is the
** number of pages the database file will contain after this
** process is complete.  If nFin is zero, it is assumed that
** incrVacuumStep() will be called a finite number of times
** which may or may not empty the freelist.  A full autovacuum
** has nFin>0.  A "PRAGMA incremental_vacuum" has nFin==0.
**
** Parameter iLastPg is the page number to be vacated by this step; it
** must be greater than nFin.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  /* Pointer-map pages and the pending-byte page are never moved; for
  ** those only the truncation logic at the bottom applies. */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;               /* Pointer-map type of page iLastPg */
    Pgno iPtrPage;          /* Pointer-map parent page of page iLastPg */

    /* The free-list page count is read from offset 36 of page 1.  With an
    ** empty free-list there is nowhere to move a page to. */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    /* A root page in this position is reported as corruption */
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( nFin==0 ){
        /* Remove the page from the files free-list. This is not required
        ** if nFin is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If nFin is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if nFin is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        releasePage(pFreePg);
      }while( nFin!=0 && iFreePg>nFin );
      assert( iFreePg<iLastPg );

      /* Make the page writable, then move it into the free slot.  The
      ** final argument (nFin!=0) is relocatePage()'s isCommit flag. */
      rc = sqlite3PagerWrite(pLastPg->pDbPage);
      if( rc==SQLITE_OK ){
        rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
      }
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  /* For an incremental step (nFin==0) shrink the database image now.
  ** Step back past the vacated page and then past any pending-byte or
  ** pointer-map pages that would otherwise become the last page.  Each
  ** pointer-map page skipped is passed to sqlite3PagerWrite() first
  ** (presumably so it is journaled before being truncated away -
  ** TODO confirm against the pager).
  */
  if( nFin==0 ){
    iLastPg--;
    while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
      if( PTRMAP_ISPAGE(pBt, iLastPg) ){
        MemPage *pPg;
        rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        rc = sqlite3PagerWrite(pPg->pDbPage);
        releasePage(pPg);
        if( rc!=SQLITE_OK ){
          return rc;
        }
      }
      iLastPg--;
    }
    sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
2972 
2973 /*
2974 ** A write-transaction must be opened before calling this function.
2975 ** It performs a single unit of work towards an incremental vacuum.
2976 **
2977 ** If the incremental vacuum is finished after this function has run,
2978 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
2979 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2980 */
2981 int sqlite3BtreeIncrVacuum(Btree *p){
2982   int rc;
2983   BtShared *pBt = p->pBt;
2984 
2985   sqlite3BtreeEnter(p);
2986   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2987   if( !pBt->autoVacuum ){
2988     rc = SQLITE_DONE;
2989   }else{
2990     invalidateAllOverflowCache(pBt);
2991     rc = incrVacuumStep(pBt, 0, btreePagecount(pBt));
2992     if( rc==SQLITE_OK ){
2993       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2994       put4byte(&pBt->pPage1->aData[28], pBt->nPage);
2995     }
2996   }
2997   sqlite3BtreeLeave(p);
2998   return rc;
2999 }
3000 
3001 /*
3002 ** This routine is called prior to sqlite3PagerCommit when a transaction
3003 ** is commited for an auto-vacuum database.
3004 **
3005 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3006 ** the database file should be truncated to during the commit process.
3007 ** i.e. the database has been reorganized so that only the first *pnTrunc
3008 ** pages are in use.
3009 */
3010 static int autoVacuumCommit(BtShared *pBt){
3011   int rc = SQLITE_OK;
3012   Pager *pPager = pBt->pPager;
3013   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3014 
3015   assert( sqlite3_mutex_held(pBt->mutex) );
3016   invalidateAllOverflowCache(pBt);
3017   assert(pBt->autoVacuum);
3018   if( !pBt->incrVacuum ){
3019     Pgno nFin;         /* Number of pages in database after autovacuuming */
3020     Pgno nFree;        /* Number of pages on the freelist initially */
3021     Pgno nPtrmap;      /* Number of PtrMap pages to be freed */
3022     Pgno iFree;        /* The next page to be freed */
3023     int nEntry;        /* Number of entries on one ptrmap page */
3024     Pgno nOrig;        /* Database size before freeing */
3025 
3026     nOrig = btreePagecount(pBt);
3027     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3028       /* It is not possible to create a database for which the final page
3029       ** is either a pointer-map page or the pending-byte page. If one
3030       ** is encountered, this indicates corruption.
3031       */
3032       return SQLITE_CORRUPT_BKPT;
3033     }
3034 
3035     nFree = get4byte(&pBt->pPage1->aData[36]);
3036     nEntry = pBt->usableSize/5;
3037     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3038     nFin = nOrig - nFree - nPtrmap;
3039     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3040       nFin--;
3041     }
3042     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3043       nFin--;
3044     }
3045     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3046 
3047     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3048       rc = incrVacuumStep(pBt, nFin, iFree);
3049     }
3050     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3051       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3052       put4byte(&pBt->pPage1->aData[32], 0);
3053       put4byte(&pBt->pPage1->aData[36], 0);
3054       put4byte(&pBt->pPage1->aData[28], nFin);
3055       sqlite3PagerTruncateImage(pBt->pPager, nFin);
3056       pBt->nPage = nFin;
3057     }
3058     if( rc!=SQLITE_OK ){
3059       sqlite3PagerRollback(pPager);
3060     }
3061   }
3062 
3063   assert( nRef==sqlite3PagerRefcount(pPager) );
3064   return rc;
3065 }
3066 
3067 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3068 # define setChildPtrmaps(x) SQLITE_OK
3069 #endif
3070 
3071 /*
3072 ** This routine does the first phase of a two-phase commit.  This routine
3073 ** causes a rollback journal to be created (if it does not already exist)
3074 ** and populated with enough information so that if a power loss occurs
3075 ** the database can be restored to its original state by playing back
3076 ** the journal.  Then the contents of the journal are flushed out to
3077 ** the disk.  After the journal is safely on oxide, the changes to the
3078 ** database are written into the database file and flushed to oxide.
3079 ** At the end of this call, the rollback journal still exists on the
3080 ** disk and we are still holding all locks, so the transaction has not
3081 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3082 ** commit process.
3083 **
3084 ** This call is a no-op if no write-transaction is currently active on pBt.
3085 **
3086 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3087 ** the name of a master journal file that should be written into the
3088 ** individual journal file, or is NULL, indicating no master journal file
3089 ** (single database transaction).
3090 **
3091 ** When this is called, the master journal should already have been
3092 ** created, populated with this journal pointer and synced to disk.
3093 **
3094 ** Once this is routine has returned, the only thing required to commit
3095 ** the write-transaction for this database file is to delete the journal.
3096 */
3097 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3098   int rc = SQLITE_OK;
3099   if( p->inTrans==TRANS_WRITE ){
3100     BtShared *pBt = p->pBt;
3101     sqlite3BtreeEnter(p);
3102 #ifndef SQLITE_OMIT_AUTOVACUUM
3103     if( pBt->autoVacuum ){
3104       rc = autoVacuumCommit(pBt);
3105       if( rc!=SQLITE_OK ){
3106         sqlite3BtreeLeave(p);
3107         return rc;
3108       }
3109     }
3110 #endif
3111     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3112     sqlite3BtreeLeave(p);
3113   }
3114   return rc;
3115 }
3116 
3117 /*
3118 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3119 ** at the conclusion of a transaction.
3120 */
3121 static void btreeEndTransaction(Btree *p){
3122   BtShared *pBt = p->pBt;
3123   assert( sqlite3BtreeHoldsMutex(p) );
3124 
3125   btreeClearHasContent(pBt);
3126   if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
3127     /* If there are other active statements that belong to this database
3128     ** handle, downgrade to a read-only transaction. The other statements
3129     ** may still be reading from the database.  */
3130     downgradeAllSharedCacheTableLocks(p);
3131     p->inTrans = TRANS_READ;
3132   }else{
3133     /* If the handle had any kind of transaction open, decrement the
3134     ** transaction count of the shared btree. If the transaction count
3135     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3136     ** call below will unlock the pager.  */
3137     if( p->inTrans!=TRANS_NONE ){
3138       clearAllSharedCacheTableLocks(p);
3139       pBt->nTransaction--;
3140       if( 0==pBt->nTransaction ){
3141         pBt->inTransaction = TRANS_NONE;
3142       }
3143     }
3144 
3145     /* Set the current transaction state to TRANS_NONE and unlock the
3146     ** pager if this call closed the only read or write transaction.  */
3147     p->inTrans = TRANS_NONE;
3148     unlockBtreeIfUnused(pBt);
3149   }
3150 
3151   btreeIntegrity(p);
3152 }
3153 
3154 /*
3155 ** Commit the transaction currently in progress.
3156 **
3157 ** This routine implements the second phase of a 2-phase commit.  The
3158 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3159 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3160 ** routine did all the work of writing information out to disk and flushing the
3161 ** contents so that they are written onto the disk platter.  All this
3162 ** routine has to do is delete or truncate or zero the header in the
3163 ** the rollback journal (which causes the transaction to commit) and
3164 ** drop locks.
3165 **
3166 ** This will release the write lock on the database file.  If there
3167 ** are no active cursors, it also releases the read lock.
3168 */
3169 int sqlite3BtreeCommitPhaseTwo(Btree *p){
3170   BtShared *pBt = p->pBt;
3171 
3172   sqlite3BtreeEnter(p);
3173   btreeIntegrity(p);
3174 
3175   /* If the handle has a write-transaction open, commit the shared-btrees
3176   ** transaction and set the shared state to TRANS_READ.
3177   */
3178   if( p->inTrans==TRANS_WRITE ){
3179     int rc;
3180     assert( pBt->inTransaction==TRANS_WRITE );
3181     assert( pBt->nTransaction>0 );
3182     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3183     if( rc!=SQLITE_OK ){
3184       sqlite3BtreeLeave(p);
3185       return rc;
3186     }
3187     pBt->inTransaction = TRANS_READ;
3188   }
3189 
3190   btreeEndTransaction(p);
3191   sqlite3BtreeLeave(p);
3192   return SQLITE_OK;
3193 }
3194 
3195 /*
3196 ** Do both phases of a commit.
3197 */
3198 int sqlite3BtreeCommit(Btree *p){
3199   int rc;
3200   sqlite3BtreeEnter(p);
3201   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3202   if( rc==SQLITE_OK ){
3203     rc = sqlite3BtreeCommitPhaseTwo(p);
3204   }
3205   sqlite3BtreeLeave(p);
3206   return rc;
3207 }
3208 
3209 #ifndef NDEBUG
3210 /*
3211 ** Return the number of write-cursors open on this handle. This is for use
3212 ** in assert() expressions, so it is only compiled if NDEBUG is not
3213 ** defined.
3214 **
3215 ** For the purposes of this routine, a write-cursor is any cursor that
3216 ** is capable of writing to the databse.  That means the cursor was
3217 ** originally opened for writing and the cursor has not be disabled
3218 ** by having its state changed to CURSOR_FAULT.
3219 */
3220 static int countWriteCursors(BtShared *pBt){
3221   BtCursor *pCur;
3222   int r = 0;
3223   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3224     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3225   }
3226   return r;
3227 }
3228 #endif
3229 
3230 /*
3231 ** This routine sets the state to CURSOR_FAULT and the error
3232 ** code to errCode for every cursor on BtShared that pBtree
3233 ** references.
3234 **
3235 ** Every cursor is tripped, including cursors that belong
3236 ** to other database connections that happen to be sharing
3237 ** the cache with pBtree.
3238 **
3239 ** This routine gets called when a rollback occurs.
3240 ** All cursors using the same cache must be tripped
3241 ** to prevent them from trying to use the btree after
3242 ** the rollback.  The rollback may have deleted tables
3243 ** or moved root pages, so it is not sufficient to
3244 ** save the state of the cursor.  The cursor must be
3245 ** invalidated.
3246 */
3247 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3248   BtCursor *p;
3249   sqlite3BtreeEnter(pBtree);
3250   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3251     int i;
3252     sqlite3BtreeClearCursor(p);
3253     p->eState = CURSOR_FAULT;
3254     p->skipNext = errCode;
3255     for(i=0; i<=p->iPage; i++){
3256       releasePage(p->apPage[i]);
3257       p->apPage[i] = 0;
3258     }
3259   }
3260   sqlite3BtreeLeave(pBtree);
3261 }
3262 
3263 /*
3264 ** Rollback the transaction in progress.  All cursors will be
3265 ** invalided by this operation.  Any attempt to use a cursor
3266 ** that was open at the beginning of this operation will result
3267 ** in an error.
3268 **
3269 ** This will release the write lock on the database file.  If there
3270 ** are no active cursors, it also releases the read lock.
3271 */
3272 int sqlite3BtreeRollback(Btree *p){
3273   int rc;
3274   BtShared *pBt = p->pBt;
3275   MemPage *pPage1;
3276 
3277   sqlite3BtreeEnter(p);
3278   rc = saveAllCursors(pBt, 0, 0);
3279 #ifndef SQLITE_OMIT_SHARED_CACHE
3280   if( rc!=SQLITE_OK ){
3281     /* This is a horrible situation. An IO or malloc() error occurred whilst
3282     ** trying to save cursor positions. If this is an automatic rollback (as
3283     ** the result of a constraint, malloc() failure or IO error) then
3284     ** the cache may be internally inconsistent (not contain valid trees) so
3285     ** we cannot simply return the error to the caller. Instead, abort
3286     ** all queries that may be using any of the cursors that failed to save.
3287     */
3288     sqlite3BtreeTripAllCursors(p, rc);
3289   }
3290 #endif
3291   btreeIntegrity(p);
3292 
3293   if( p->inTrans==TRANS_WRITE ){
3294     int rc2;
3295 
3296     assert( TRANS_WRITE==pBt->inTransaction );
3297     rc2 = sqlite3PagerRollback(pBt->pPager);
3298     if( rc2!=SQLITE_OK ){
3299       rc = rc2;
3300     }
3301 
3302     /* The rollback may have destroyed the pPage1->aData value.  So
3303     ** call btreeGetPage() on page 1 again to make
3304     ** sure pPage1->aData is set correctly. */
3305     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3306       int nPage = get4byte(28+(u8*)pPage1->aData);
3307       testcase( nPage==0 );
3308       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3309       testcase( pBt->nPage!=nPage );
3310       pBt->nPage = nPage;
3311       releasePage(pPage1);
3312     }
3313     assert( countWriteCursors(pBt)==0 );
3314     pBt->inTransaction = TRANS_READ;
3315   }
3316 
3317   btreeEndTransaction(p);
3318   sqlite3BtreeLeave(p);
3319   return rc;
3320 }
3321 
3322 /*
3323 ** Start a statement subtransaction. The subtransaction can can be rolled
3324 ** back independently of the main transaction. You must start a transaction
3325 ** before starting a subtransaction. The subtransaction is ended automatically
3326 ** if the main transaction commits or rolls back.
3327 **
3328 ** Statement subtransactions are used around individual SQL statements
3329 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3330 ** error occurs within the statement, the effect of that one statement
3331 ** can be rolled back without having to rollback the entire transaction.
3332 **
3333 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3334 ** value passed as the second parameter is the total number of savepoints,
3335 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3336 ** are no active savepoints and no other statement-transactions open,
3337 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3338 ** using the sqlite3BtreeSavepoint() function.
3339 */
3340 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3341   int rc;
3342   BtShared *pBt = p->pBt;
3343   sqlite3BtreeEnter(p);
3344   assert( p->inTrans==TRANS_WRITE );
3345   assert( pBt->readOnly==0 );
3346   assert( iStatement>0 );
3347   assert( iStatement>p->db->nSavepoint );
3348   assert( pBt->inTransaction==TRANS_WRITE );
3349   /* At the pager level, a statement transaction is a savepoint with
3350   ** an index greater than all savepoints created explicitly using
3351   ** SQL statements. It is illegal to open, release or rollback any
3352   ** such savepoints while the statement transaction savepoint is active.
3353   */
3354   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3355   sqlite3BtreeLeave(p);
3356   return rc;
3357 }
3358 
3359 /*
3360 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3361 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3362 ** savepoint identified by parameter iSavepoint, depending on the value
3363 ** of op.
3364 **
3365 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3366 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3367 ** contents of the entire transaction are rolled back. This is different
3368 ** from a normal transaction rollback, as no locks are released and the
3369 ** transaction remains open.
3370 */
3371 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3372   int rc = SQLITE_OK;
3373   if( p && p->inTrans==TRANS_WRITE ){
3374     BtShared *pBt = p->pBt;
3375     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3376     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3377     sqlite3BtreeEnter(p);
3378     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3379     if( rc==SQLITE_OK ){
3380       if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0;
3381       rc = newDatabase(pBt);
3382       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3383 
3384       /* The database size was written into the offset 28 of the header
3385       ** when the transaction started, so we know that the value at offset
3386       ** 28 is nonzero. */
3387       assert( pBt->nPage>0 );
3388     }
3389     sqlite3BtreeLeave(p);
3390   }
3391   return rc;
3392 }
3393 
3394 /*
3395 ** Create a new cursor for the BTree whose root is on the page
3396 ** iTable. If a read-only cursor is requested, it is assumed that
3397 ** the caller already has at least a read-only transaction open
3398 ** on the database already. If a write-cursor is requested, then
3399 ** the caller is assumed to have an open write transaction.
3400 **
3401 ** If wrFlag==0, then the cursor can only be used for reading.
3402 ** If wrFlag==1, then the cursor can be used for reading or for
3403 ** writing if other conditions for writing are also met.  These
3404 ** are the conditions that must be met in order for writing to
3405 ** be allowed:
3406 **
3407 ** 1:  The cursor must have been opened with wrFlag==1
3408 **
3409 ** 2:  Other database connections that share the same pager cache
3410 **     but which are not in the READ_UNCOMMITTED state may not have
3411 **     cursors open with wrFlag==0 on the same table.  Otherwise
3412 **     the changes made by this write cursor would be visible to
3413 **     the read cursors in the other database connection.
3414 **
3415 ** 3:  The database must be writable (not on read-only media)
3416 **
3417 ** 4:  There must be an active transaction.
3418 **
3419 ** No checking is done to make sure that page iTable really is the
3420 ** root page of a b-tree.  If it is not, then the cursor acquired
3421 ** will not work correctly.
3422 **
3423 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3424 ** on pCur to initialize the memory space prior to invoking this routine.
3425 */
3426 static int btreeCursor(
3427   Btree *p,                              /* The btree */
3428   int iTable,                            /* Root page of table to open */
3429   int wrFlag,                            /* 1 to write. 0 read-only */
3430   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3431   BtCursor *pCur                         /* Space for new cursor */
3432 ){
3433   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
3434 
3435   assert( sqlite3BtreeHoldsMutex(p) );
3436   assert( wrFlag==0 || wrFlag==1 );
3437 
3438   /* The following assert statements verify that if this is a sharable
3439   ** b-tree database, the connection is holding the required table locks,
3440   ** and that no other connection has any open cursor that conflicts with
3441   ** this lock.  */
3442   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3443   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3444 
3445   /* Assert that the caller has opened the required transaction. */
3446   assert( p->inTrans>TRANS_NONE );
3447   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3448   assert( pBt->pPage1 && pBt->pPage1->aData );
3449 
3450   if( NEVER(wrFlag && pBt->readOnly) ){
3451     return SQLITE_READONLY;
3452   }
3453   if( iTable==1 && btreePagecount(pBt)==0 ){
3454     return SQLITE_EMPTY;
3455   }
3456 
3457   /* Now that no other errors can occur, finish filling in the BtCursor
3458   ** variables and link the cursor into the BtShared list.  */
3459   pCur->pgnoRoot = (Pgno)iTable;
3460   pCur->iPage = -1;
3461   pCur->pKeyInfo = pKeyInfo;
3462   pCur->pBtree = p;
3463   pCur->pBt = pBt;
3464   pCur->wrFlag = (u8)wrFlag;
3465   pCur->pNext = pBt->pCursor;
3466   if( pCur->pNext ){
3467     pCur->pNext->pPrev = pCur;
3468   }
3469   pBt->pCursor = pCur;
3470   pCur->eState = CURSOR_INVALID;
3471   pCur->cachedRowid = 0;
3472   return SQLITE_OK;
3473 }
3474 int sqlite3BtreeCursor(
3475   Btree *p,                                   /* The btree */
3476   int iTable,                                 /* Root page of table to open */
3477   int wrFlag,                                 /* 1 to write. 0 read-only */
3478   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
3479   BtCursor *pCur                              /* Write new cursor here */
3480 ){
3481   int rc;
3482   sqlite3BtreeEnter(p);
3483   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3484   sqlite3BtreeLeave(p);
3485   return rc;
3486 }
3487 
3488 /*
3489 ** Return the size of a BtCursor object in bytes.
3490 **
3491 ** This interfaces is needed so that users of cursors can preallocate
3492 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3493 ** to users so they cannot do the sizeof() themselves - they must call
3494 ** this routine.
3495 */
3496 int sqlite3BtreeCursorSize(void){
3497   return ROUND8(sizeof(BtCursor));
3498 }
3499 
3500 /*
3501 ** Initialize memory that will be converted into a BtCursor object.
3502 **
3503 ** The simple approach here would be to memset() the entire object
3504 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
3505 ** do not need to be zeroed and they are large, so we can save a lot
3506 ** of run-time by skipping the initialization of those elements.
3507 */
3508 void sqlite3BtreeCursorZero(BtCursor *p){
3509   memset(p, 0, offsetof(BtCursor, iPage));
3510 }
3511 
3512 /*
3513 ** Set the cached rowid value of every cursor in the same database file
3514 ** as pCur and having the same root page number as pCur.  The value is
3515 ** set to iRowid.
3516 **
3517 ** Only positive rowid values are considered valid for this cache.
3518 ** The cache is initialized to zero, indicating an invalid cache.
3519 ** A btree will work fine with zero or negative rowids.  We just cannot
3520 ** cache zero or negative rowids, which means tables that use zero or
3521 ** negative rowids might run a little slower.  But in practice, zero
3522 ** or negative rowids are very uncommon so this should not be a problem.
3523 */
3524 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3525   BtCursor *p;
3526   for(p=pCur->pBt->pCursor; p; p=p->pNext){
3527     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3528   }
3529   assert( pCur->cachedRowid==iRowid );
3530 }
3531 
3532 /*
3533 ** Return the cached rowid for the given cursor.  A negative or zero
3534 ** return value indicates that the rowid cache is invalid and should be
3535 ** ignored.  If the rowid cache has never before been set, then a
3536 ** zero is returned.
3537 */
3538 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3539   return pCur->cachedRowid;
3540 }
3541 
3542 /*
3543 ** Close a cursor.  The read lock on the database file is released
3544 ** when the last cursor is closed.
3545 */
3546 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3547   Btree *pBtree = pCur->pBtree;
3548   if( pBtree ){
3549     int i;
3550     BtShared *pBt = pCur->pBt;
3551     sqlite3BtreeEnter(pBtree);
3552     sqlite3BtreeClearCursor(pCur);
3553     if( pCur->pPrev ){
3554       pCur->pPrev->pNext = pCur->pNext;
3555     }else{
3556       pBt->pCursor = pCur->pNext;
3557     }
3558     if( pCur->pNext ){
3559       pCur->pNext->pPrev = pCur->pPrev;
3560     }
3561     for(i=0; i<=pCur->iPage; i++){
3562       releasePage(pCur->apPage[i]);
3563     }
3564     unlockBtreeIfUnused(pBt);
3565     invalidateOverflowCache(pCur);
3566     /* sqlite3_free(pCur); */
3567     sqlite3BtreeLeave(pBtree);
3568   }
3569   return SQLITE_OK;
3570 }
3571 
/*
** Make sure the BtCursor* given in the argument has a valid
** BtCursor.info structure.  If it is not already valid, call
** btreeParseCell() to fill it in.
**
** BtCursor.info is a cache of the information in the current cell.
** Using this cache reduces the number of calls to btreeParseCell().
**
** 2007-06-25:  There is a bug in some versions of MSVC that causes the
** compiler to crash when getCellInfo() is implemented as a macro.
** But there is a measurable speed advantage to using the macro on gcc
** (when fewer compiler optimizations like -Os or -O0 are used and the
** compiler is not doing aggressive inlining.)  So we use a real function
** for MSVC and a macro for everything else.  Ticket #2457.
*/
3587 #ifndef NDEBUG
3588   static void assertCellInfo(BtCursor *pCur){
3589     CellInfo info;
3590     int iPage = pCur->iPage;
3591     memset(&info, 0, sizeof(info));
3592     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3593     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3594   }
3595 #else
3596   #define assertCellInfo(x)
3597 #endif
#ifdef _MSC_VER
  /* Use a real function in MSVC to work around bugs in that compiler.
  **
  ** If the cached cell information is empty (info.nSize==0), parse the
  ** cell the cursor points at into pCur->info and set pCur->validNKey.
  ** Otherwise, in builds with assertions enabled, verify that the cache
  ** still matches the cell. */
  static void getCellInfo(BtCursor *pCur){
    if( pCur->info.nSize==0 ){
      int iPage = pCur->iPage;
      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
      pCur->validNKey = 1;
    }else{
      assertCellInfo(pCur);
    }
  }
#else /* if not _MSC_VER */
  /* Use a macro in all other compilers so that the code is expanded
  ** inline at each call site.  It behaves exactly like the function
  ** above and, like it, must be used as a statement followed by a
  ** semicolon.
  **
  ** The body is wrapped in do{...}while(0) so that the expansion is a
  ** single statement that is safe inside an unbraced if/else.  The
  ** argument is parenthesized at every use, and no local variable is
  ** declared, so the expansion cannot capture an identifier that
  ** appears in the argument expression.  The argument is evaluated more
  ** than once and therefore must not have side effects. */
#define getCellInfo(pCur)                                                      \
  do{                                                                          \
    if( (pCur)->info.nSize==0 ){                                               \
      btreeParseCell((pCur)->apPage[(pCur)->iPage],                            \
                     (pCur)->aiIdx[(pCur)->iPage],                             \
                     &(pCur)->info);                                           \
      (pCur)->validNKey = 1;                                                   \
    }else{                                                                     \
      assertCellInfo(pCur);                                                    \
    }                                                                          \
  }while(0)
#endif /* _MSC_VER */
3620 
3621 #ifndef NDEBUG  /* The next routine used only within assert() statements */
3622 /*
3623 ** Return true if the given BtCursor is valid.  A valid cursor is one
3624 ** that is currently pointing to a row in a (non-empty) table.
3625 ** This is a verification routine is used only within assert() statements.
3626 */
3627 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3628   return pCur && pCur->eState==CURSOR_VALID;
3629 }
3630 #endif /* NDEBUG */
3631 
3632 /*
3633 ** Set *pSize to the size of the buffer needed to hold the value of
3634 ** the key for the current entry.  If the cursor is not pointing
3635 ** to a valid entry, *pSize is set to 0.
3636 **
3637 ** For a table with the INTKEY flag set, this routine returns the key
3638 ** itself, not the number of bytes in the key.
3639 **
3640 ** The caller must position the cursor prior to invoking this routine.
3641 **
3642 ** This routine cannot fail.  It always returns SQLITE_OK.
3643 */
3644 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3645   assert( cursorHoldsMutex(pCur) );
3646   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3647   if( pCur->eState!=CURSOR_VALID ){
3648     *pSize = 0;
3649   }else{
3650     getCellInfo(pCur);
3651     *pSize = pCur->info.nKey;
3652   }
3653   return SQLITE_OK;
3654 }
3655 
3656 /*
3657 ** Set *pSize to the number of bytes of data in the entry the
3658 ** cursor currently points to.
3659 **
3660 ** The caller must guarantee that the cursor is pointing to a non-NULL
3661 ** valid entry.  In other words, the calling procedure must guarantee
3662 ** that the cursor has Cursor.eState==CURSOR_VALID.
3663 **
3664 ** Failure is not possible.  This function always returns SQLITE_OK.
3665 ** It might just as well be a procedure (returning void) but we continue
3666 ** to return an integer result code for historical reasons.
3667 */
3668 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3669   assert( cursorHoldsMutex(pCur) );
3670   assert( pCur->eState==CURSOR_VALID );
3671   getCellInfo(pCur);
3672   *pSize = pCur->info.nData;
3673   return SQLITE_OK;
3674 }
3675 
3676 /*
3677 ** Given the page number of an overflow page in the database (parameter
3678 ** ovfl), this function finds the page number of the next page in the
3679 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3680 ** pointer-map data instead of reading the content of page ovfl to do so.
3681 **
3682 ** If an error occurs an SQLite error code is returned. Otherwise:
3683 **
3684 ** The page number of the next overflow page in the linked list is
3685 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3686 ** list, *pPgnoNext is set to zero.
3687 **
3688 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3689 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3690 ** reference. It is the responsibility of the caller to call releasePage()
3691 ** on *ppPage to free the reference. In no reference was obtained (because
3692 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3693 ** *ppPage is set to zero.
3694 */
3695 static int getOverflowPage(
3696   BtShared *pBt,               /* The database file */
3697   Pgno ovfl,                   /* Current overflow page number */
3698   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3699   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3700 ){
3701   Pgno next = 0;
3702   MemPage *pPage = 0;
3703   int rc = SQLITE_OK;
3704 
3705   assert( sqlite3_mutex_held(pBt->mutex) );
3706   assert(pPgnoNext);
3707 
3708 #ifndef SQLITE_OMIT_AUTOVACUUM
3709   /* Try to find the next page in the overflow list using the
3710   ** autovacuum pointer-map pages. Guess that the next page in
3711   ** the overflow list is page number (ovfl+1). If that guess turns
3712   ** out to be wrong, fall back to loading the data of page
3713   ** number ovfl to determine the next page number.
3714   */
3715   if( pBt->autoVacuum ){
3716     Pgno pgno;
3717     Pgno iGuess = ovfl+1;
3718     u8 eType;
3719 
3720     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3721       iGuess++;
3722     }
3723 
3724     if( iGuess<=btreePagecount(pBt) ){
3725       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3726       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3727         next = iGuess;
3728         rc = SQLITE_DONE;
3729       }
3730     }
3731   }
3732 #endif
3733 
3734   assert( next==0 || rc==SQLITE_DONE );
3735   if( rc==SQLITE_OK ){
3736     rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3737     assert( rc==SQLITE_OK || pPage==0 );
3738     if( rc==SQLITE_OK ){
3739       next = get4byte(pPage->aData);
3740     }
3741   }
3742 
3743   *pPgnoNext = next;
3744   if( ppPage ){
3745     *ppPage = pPage;
3746   }else{
3747     releasePage(pPage);
3748   }
3749   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3750 }
3751 
3752 /*
3753 ** Copy data from a buffer to a page, or from a page to a buffer.
3754 **
3755 ** pPayload is a pointer to data stored on database page pDbPage.
3756 ** If argument eOp is false, then nByte bytes of data are copied
3757 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3758 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3759 ** of data are copied from the buffer pBuf to pPayload.
3760 **
3761 ** SQLITE_OK is returned on success, otherwise an error code.
3762 */
3763 static int copyPayload(
3764   void *pPayload,           /* Pointer to page data */
3765   void *pBuf,               /* Pointer to buffer */
3766   int nByte,                /* Number of bytes to copy */
3767   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3768   DbPage *pDbPage           /* Page containing pPayload */
3769 ){
3770   if( eOp ){
3771     /* Copy data from buffer to page (a write operation) */
3772     int rc = sqlite3PagerWrite(pDbPage);
3773     if( rc!=SQLITE_OK ){
3774       return rc;
3775     }
3776     memcpy(pPayload, pBuf, nByte);
3777   }else{
3778     /* Copy data from page to buffer (a read operation) */
3779     memcpy(pBuf, pPayload, nByte);
3780   }
3781   return SQLITE_OK;
3782 }
3783 
3784 /*
3785 ** This function is used to read or overwrite payload information
3786 ** for the entry that the pCur cursor is pointing to. If the eOp
3787 ** parameter is 0, this is a read operation (data copied into
3788 ** buffer pBuf). If it is non-zero, a write (data copied from
3789 ** buffer pBuf).
3790 **
3791 ** A total of "amt" bytes are read or written beginning at "offset".
3792 ** Data is read to or from the buffer pBuf.
3793 **
3794 ** The content being read or written might appear on the main page
3795 ** or be scattered out on multiple overflow pages.
3796 **
3797 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3798 ** cursor entry uses one or more overflow pages, this function
3799 ** allocates space for and lazily popluates the overflow page-list
3800 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3801 ** cache to make seeking to the supplied offset more efficient.
3802 **
3803 ** Once an overflow page-list cache has been allocated, it may be
3804 ** invalidated if some other cursor writes to the same table, or if
3805 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3806 ** mode, the following events may invalidate an overflow page-list cache.
3807 **
3808 **   * An incremental vacuum,
3809 **   * A commit in auto_vacuum="full" mode,
3810 **   * Creating a table (may require moving an overflow page).
3811 */
3812 static int accessPayload(
3813   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3814   u32 offset,          /* Begin reading this far into payload */
3815   u32 amt,             /* Read this many bytes */
3816   unsigned char *pBuf, /* Write the bytes into this buffer */
3817   int eOp              /* zero to read. non-zero to write. */
3818 ){
3819   unsigned char *aPayload;
3820   int rc = SQLITE_OK;
3821   u32 nKey;
3822   int iIdx = 0;
3823   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3824   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3825 
3826   assert( pPage );
3827   assert( pCur->eState==CURSOR_VALID );
3828   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3829   assert( cursorHoldsMutex(pCur) );
3830 
3831   getCellInfo(pCur);
3832   aPayload = pCur->info.pCell + pCur->info.nHeader;
3833   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3834 
3835   if( NEVER(offset+amt > nKey+pCur->info.nData)
3836    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3837   ){
3838     /* Trying to read or write past the end of the data is an error */
3839     return SQLITE_CORRUPT_BKPT;
3840   }
3841 
3842   /* Check if data must be read/written to/from the btree page itself. */
3843   if( offset<pCur->info.nLocal ){
3844     int a = amt;
3845     if( a+offset>pCur->info.nLocal ){
3846       a = pCur->info.nLocal - offset;
3847     }
3848     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3849     offset = 0;
3850     pBuf += a;
3851     amt -= a;
3852   }else{
3853     offset -= pCur->info.nLocal;
3854   }
3855 
3856   if( rc==SQLITE_OK && amt>0 ){
3857     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3858     Pgno nextPage;
3859 
3860     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3861 
3862 #ifndef SQLITE_OMIT_INCRBLOB
3863     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3864     ** has not been allocated, allocate it now. The array is sized at
3865     ** one entry for each overflow page in the overflow chain. The
3866     ** page number of the first overflow page is stored in aOverflow[0],
3867     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3868     ** (the cache is lazily populated).
3869     */
3870     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3871       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3872       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3873       /* nOvfl is always positive.  If it were zero, fetchPayload would have
3874       ** been used instead of this routine. */
3875       if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3876         rc = SQLITE_NOMEM;
3877       }
3878     }
3879 
3880     /* If the overflow page-list cache has been allocated and the
3881     ** entry for the first required overflow page is valid, skip
3882     ** directly to it.
3883     */
3884     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3885       iIdx = (offset/ovflSize);
3886       nextPage = pCur->aOverflow[iIdx];
3887       offset = (offset%ovflSize);
3888     }
3889 #endif
3890 
3891     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3892 
3893 #ifndef SQLITE_OMIT_INCRBLOB
3894       /* If required, populate the overflow page-list cache. */
3895       if( pCur->aOverflow ){
3896         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3897         pCur->aOverflow[iIdx] = nextPage;
3898       }
3899 #endif
3900 
3901       if( offset>=ovflSize ){
3902         /* The only reason to read this page is to obtain the page
3903         ** number for the next page in the overflow chain. The page
3904         ** data is not required. So first try to lookup the overflow
3905         ** page-list cache, if any, then fall back to the getOverflowPage()
3906         ** function.
3907         */
3908 #ifndef SQLITE_OMIT_INCRBLOB
3909         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3910           nextPage = pCur->aOverflow[iIdx+1];
3911         } else
3912 #endif
3913           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3914         offset -= ovflSize;
3915       }else{
3916         /* Need to read this page properly. It contains some of the
3917         ** range of data that is being read (eOp==0) or written (eOp!=0).
3918         */
3919         DbPage *pDbPage;
3920         int a = amt;
3921         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3922         if( rc==SQLITE_OK ){
3923           aPayload = sqlite3PagerGetData(pDbPage);
3924           nextPage = get4byte(aPayload);
3925           if( a + offset > ovflSize ){
3926             a = ovflSize - offset;
3927           }
3928           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3929           sqlite3PagerUnref(pDbPage);
3930           offset = 0;
3931           amt -= a;
3932           pBuf += a;
3933         }
3934       }
3935     }
3936   }
3937 
3938   if( rc==SQLITE_OK && amt>0 ){
3939     return SQLITE_CORRUPT_BKPT;
3940   }
3941   return rc;
3942 }
3943 
3944 /*
3945 ** Read part of the key associated with cursor pCur.  Exactly
3946 ** "amt" bytes will be transfered into pBuf[].  The transfer
3947 ** begins at "offset".
3948 **
3949 ** The caller must ensure that pCur is pointing to a valid row
3950 ** in the table.
3951 **
3952 ** Return SQLITE_OK on success or an error code if anything goes
3953 ** wrong.  An error is returned if "offset+amt" is larger than
3954 ** the available payload.
3955 */
3956 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3957   assert( cursorHoldsMutex(pCur) );
3958   assert( pCur->eState==CURSOR_VALID );
3959   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3960   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3961   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
3962 }
3963 
3964 /*
3965 ** Read part of the data associated with cursor pCur.  Exactly
3966 ** "amt" bytes will be transfered into pBuf[].  The transfer
3967 ** begins at "offset".
3968 **
3969 ** Return SQLITE_OK on success or an error code if anything goes
3970 ** wrong.  An error is returned if "offset+amt" is larger than
3971 ** the available payload.
3972 */
3973 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3974   int rc;
3975 
3976 #ifndef SQLITE_OMIT_INCRBLOB
3977   if ( pCur->eState==CURSOR_INVALID ){
3978     return SQLITE_ABORT;
3979   }
3980 #endif
3981 
3982   assert( cursorHoldsMutex(pCur) );
3983   rc = restoreCursorPosition(pCur);
3984   if( rc==SQLITE_OK ){
3985     assert( pCur->eState==CURSOR_VALID );
3986     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3987     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3988     rc = accessPayload(pCur, offset, amt, pBuf, 0);
3989   }
3990   return rc;
3991 }
3992 
3993 /*
3994 ** Return a pointer to payload information from the entry that the
3995 ** pCur cursor is pointing to.  The pointer is to the beginning of
3996 ** the key if skipKey==0 and it points to the beginning of data if
3997 ** skipKey==1.  The number of bytes of available key/data is written
3998 ** into *pAmt.  If *pAmt==0, then the value returned will not be
3999 ** a valid pointer.
4000 **
4001 ** This routine is an optimization.  It is common for the entire key
4002 ** and data to fit on the local page and for there to be no overflow
4003 ** pages.  When that is so, this routine can be used to access the
4004 ** key and data without making a copy.  If the key and/or data spills
4005 ** onto overflow pages, then accessPayload() must be used to reassemble
4006 ** the key/data and copy it into a preallocated buffer.
4007 **
4008 ** The pointer returned by this routine looks directly into the cached
4009 ** page of the database.  The data might change or move the next time
4010 ** any btree routine is called.
4011 */
4012 static const unsigned char *fetchPayload(
4013   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4014   int *pAmt,           /* Write the number of available bytes here */
4015   int skipKey          /* read beginning at data if this is true */
4016 ){
4017   unsigned char *aPayload;
4018   MemPage *pPage;
4019   u32 nKey;
4020   u32 nLocal;
4021 
4022   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4023   assert( pCur->eState==CURSOR_VALID );
4024   assert( cursorHoldsMutex(pCur) );
4025   pPage = pCur->apPage[pCur->iPage];
4026   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4027   if( NEVER(pCur->info.nSize==0) ){
4028     btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
4029                    &pCur->info);
4030   }
4031   aPayload = pCur->info.pCell;
4032   aPayload += pCur->info.nHeader;
4033   if( pPage->intKey ){
4034     nKey = 0;
4035   }else{
4036     nKey = (int)pCur->info.nKey;
4037   }
4038   if( skipKey ){
4039     aPayload += nKey;
4040     nLocal = pCur->info.nLocal - nKey;
4041   }else{
4042     nLocal = pCur->info.nLocal;
4043     assert( nLocal<=nKey );
4044   }
4045   *pAmt = nLocal;
4046   return aPayload;
4047 }
4048 
4049 
4050 /*
4051 ** For the entry that cursor pCur is point to, return as
4052 ** many bytes of the key or data as are available on the local
4053 ** b-tree page.  Write the number of available bytes into *pAmt.
4054 **
4055 ** The pointer returned is ephemeral.  The key/data may move
4056 ** or be destroyed on the next call to any Btree routine,
4057 ** including calls from other threads against the same cache.
4058 ** Hence, a mutex on the BtShared should be held prior to calling
4059 ** this routine.
4060 **
4061 ** These routines is used to get quick access to key and data
4062 ** in the common case where no overflow pages are used.
4063 */
4064 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
4065   const void *p = 0;
4066   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4067   assert( cursorHoldsMutex(pCur) );
4068   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4069     p = (const void*)fetchPayload(pCur, pAmt, 0);
4070   }
4071   return p;
4072 }
4073 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
4074   const void *p = 0;
4075   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4076   assert( cursorHoldsMutex(pCur) );
4077   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4078     p = (const void*)fetchPayload(pCur, pAmt, 1);
4079   }
4080   return p;
4081 }
4082 
4083 
4084 /*
4085 ** Move the cursor down to a new child page.  The newPgno argument is the
4086 ** page number of the child page to move to.
4087 **
4088 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4089 ** the new child page does not match the flags field of the parent (i.e.
4090 ** if an intkey page appears to be the parent of a non-intkey page, or
4091 ** vice-versa).
4092 */
4093 static int moveToChild(BtCursor *pCur, u32 newPgno){
4094   int rc;
4095   int i = pCur->iPage;
4096   MemPage *pNewPage;
4097   BtShared *pBt = pCur->pBt;
4098 
4099   assert( cursorHoldsMutex(pCur) );
4100   assert( pCur->eState==CURSOR_VALID );
4101   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4102   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4103     return SQLITE_CORRUPT_BKPT;
4104   }
4105   rc = getAndInitPage(pBt, newPgno, &pNewPage);
4106   if( rc ) return rc;
4107   pCur->apPage[i+1] = pNewPage;
4108   pCur->aiIdx[i+1] = 0;
4109   pCur->iPage++;
4110 
4111   pCur->info.nSize = 0;
4112   pCur->validNKey = 0;
4113   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4114     return SQLITE_CORRUPT_BKPT;
4115   }
4116   return SQLITE_OK;
4117 }
4118 
4119 #ifndef NDEBUG
4120 /*
4121 ** Page pParent is an internal (non-leaf) tree page. This function
4122 ** asserts that page number iChild is the left-child if the iIdx'th
4123 ** cell in page pParent. Or, if iIdx is equal to the total number of
4124 ** cells in pParent, that page number iChild is the right-child of
4125 ** the page.
4126 */
4127 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
4128   assert( iIdx<=pParent->nCell );
4129   if( iIdx==pParent->nCell ){
4130     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
4131   }else{
4132     assert( get4byte(findCell(pParent, iIdx))==iChild );
4133   }
4134 }
4135 #else
4136 #  define assertParentIndex(x,y,z)
4137 #endif
4138 
4139 /*
4140 ** Move the cursor up to the parent page.
4141 **
4142 ** pCur->idx is set to the cell index that contains the pointer
4143 ** to the page we are coming from.  If we are coming from the
4144 ** right-most child page then pCur->idx is set to one more than
4145 ** the largest cell index.
4146 */
4147 static void moveToParent(BtCursor *pCur){
4148   assert( cursorHoldsMutex(pCur) );
4149   assert( pCur->eState==CURSOR_VALID );
4150   assert( pCur->iPage>0 );
4151   assert( pCur->apPage[pCur->iPage] );
4152   assertParentIndex(
4153     pCur->apPage[pCur->iPage-1],
4154     pCur->aiIdx[pCur->iPage-1],
4155     pCur->apPage[pCur->iPage]->pgno
4156   );
4157   releasePage(pCur->apPage[pCur->iPage]);
4158   pCur->iPage--;
4159   pCur->info.nSize = 0;
4160   pCur->validNKey = 0;
4161 }
4162 
4163 /*
4164 ** Move the cursor to point to the root page of its b-tree structure.
4165 **
4166 ** If the table has a virtual root page, then the cursor is moved to point
4167 ** to the virtual root page instead of the actual root page. A table has a
4168 ** virtual root page when the actual root page contains no cells and a
4169 ** single child page. This can only happen with the table rooted at page 1.
4170 **
4171 ** If the b-tree structure is empty, the cursor state is set to
4172 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4173 ** cell located on the root (or virtual root) page and the cursor state
4174 ** is set to CURSOR_VALID.
4175 **
4176 ** If this function returns successfully, it may be assumed that the
4177 ** page-header flags indicate that the [virtual] root-page is the expected
4178 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4179 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4180 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4181 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4182 ** b-tree).
4183 */
4184 static int moveToRoot(BtCursor *pCur){
4185   MemPage *pRoot;
4186   int rc = SQLITE_OK;
4187   Btree *p = pCur->pBtree;
4188   BtShared *pBt = p->pBt;
4189 
4190   assert( cursorHoldsMutex(pCur) );
4191   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4192   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4193   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4194   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4195     if( pCur->eState==CURSOR_FAULT ){
4196       assert( pCur->skipNext!=SQLITE_OK );
4197       return pCur->skipNext;
4198     }
4199     sqlite3BtreeClearCursor(pCur);
4200   }
4201 
4202   if( pCur->iPage>=0 ){
4203     int i;
4204     for(i=1; i<=pCur->iPage; i++){
4205       releasePage(pCur->apPage[i]);
4206     }
4207     pCur->iPage = 0;
4208   }else{
4209     rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4210     if( rc!=SQLITE_OK ){
4211       pCur->eState = CURSOR_INVALID;
4212       return rc;
4213     }
4214     pCur->iPage = 0;
4215 
4216     /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4217     ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4218     ** NULL, the caller expects a table b-tree. If this is not the case,
4219     ** return an SQLITE_CORRUPT error.  */
4220     assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4221     if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4222       return SQLITE_CORRUPT_BKPT;
4223     }
4224   }
4225 
4226   /* Assert that the root page is of the correct type. This must be the
4227   ** case as the call to this function that loaded the root-page (either
4228   ** this call or a previous invocation) would have detected corruption
4229   ** if the assumption were not true, and it is not possible for the flags
4230   ** byte to have been modified while this cursor is holding a reference
4231   ** to the page.  */
4232   pRoot = pCur->apPage[0];
4233   assert( pRoot->pgno==pCur->pgnoRoot );
4234   assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4235 
4236   pCur->aiIdx[0] = 0;
4237   pCur->info.nSize = 0;
4238   pCur->atLast = 0;
4239   pCur->validNKey = 0;
4240 
4241   if( pRoot->nCell==0 && !pRoot->leaf ){
4242     Pgno subpage;
4243     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4244     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4245     pCur->eState = CURSOR_VALID;
4246     rc = moveToChild(pCur, subpage);
4247   }else{
4248     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4249   }
4250   return rc;
4251 }
4252 
4253 /*
4254 ** Move the cursor down to the left-most leaf entry beneath the
4255 ** entry to which it is currently pointing.
4256 **
4257 ** The left-most leaf is the one with the smallest key - the first
4258 ** in ascending order.
4259 */
4260 static int moveToLeftmost(BtCursor *pCur){
4261   Pgno pgno;
4262   int rc = SQLITE_OK;
4263   MemPage *pPage;
4264 
4265   assert( cursorHoldsMutex(pCur) );
4266   assert( pCur->eState==CURSOR_VALID );
4267   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4268     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4269     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4270     rc = moveToChild(pCur, pgno);
4271   }
4272   return rc;
4273 }
4274 
4275 /*
4276 ** Move the cursor down to the right-most leaf entry beneath the
4277 ** page to which it is currently pointing.  Notice the difference
4278 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4279 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4280 ** finds the right-most entry beneath the *page*.
4281 **
4282 ** The right-most entry is the one with the largest key - the last
4283 ** key in ascending order.
4284 */
4285 static int moveToRightmost(BtCursor *pCur){
4286   Pgno pgno;
4287   int rc = SQLITE_OK;
4288   MemPage *pPage = 0;
4289 
4290   assert( cursorHoldsMutex(pCur) );
4291   assert( pCur->eState==CURSOR_VALID );
4292   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4293     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4294     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4295     rc = moveToChild(pCur, pgno);
4296   }
4297   if( rc==SQLITE_OK ){
4298     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4299     pCur->info.nSize = 0;
4300     pCur->validNKey = 0;
4301   }
4302   return rc;
4303 }
4304 
4305 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4306 ** on success.  Set *pRes to 0 if the cursor actually points to something
4307 ** or set *pRes to 1 if the table is empty.
4308 */
4309 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4310   int rc;
4311 
4312   assert( cursorHoldsMutex(pCur) );
4313   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4314   rc = moveToRoot(pCur);
4315   if( rc==SQLITE_OK ){
4316     if( pCur->eState==CURSOR_INVALID ){
4317       assert( pCur->apPage[pCur->iPage]->nCell==0 );
4318       *pRes = 1;
4319     }else{
4320       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4321       *pRes = 0;
4322       rc = moveToLeftmost(pCur);
4323     }
4324   }
4325   return rc;
4326 }
4327 
4328 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4329 ** on success.  Set *pRes to 0 if the cursor actually points to something
4330 ** or set *pRes to 1 if the table is empty.
4331 */
4332 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4333   int rc;
4334 
4335   assert( cursorHoldsMutex(pCur) );
4336   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4337 
4338   /* If the cursor already points to the last entry, this is a no-op. */
4339   if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4340 #ifdef SQLITE_DEBUG
4341     /* This block serves to assert() that the cursor really does point
4342     ** to the last entry in the b-tree. */
4343     int ii;
4344     for(ii=0; ii<pCur->iPage; ii++){
4345       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4346     }
4347     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4348     assert( pCur->apPage[pCur->iPage]->leaf );
4349 #endif
4350     return SQLITE_OK;
4351   }
4352 
4353   rc = moveToRoot(pCur);
4354   if( rc==SQLITE_OK ){
4355     if( CURSOR_INVALID==pCur->eState ){
4356       assert( pCur->apPage[pCur->iPage]->nCell==0 );
4357       *pRes = 1;
4358     }else{
4359       assert( pCur->eState==CURSOR_VALID );
4360       *pRes = 0;
4361       rc = moveToRightmost(pCur);
4362       pCur->atLast = rc==SQLITE_OK ?1:0;
4363     }
4364   }
4365   return rc;
4366 }
4367 
4368 /* Move the cursor so that it points to an entry near the key
4369 ** specified by pIdxKey or intKey.   Return a success code.
4370 **
4371 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4372 ** must be NULL.  For index tables, pIdxKey is used and intKey
4373 ** is ignored.
4374 **
4375 ** If an exact match is not found, then the cursor is always
4376 ** left pointing at a leaf page which would hold the entry if it
4377 ** were present.  The cursor might point to an entry that comes
4378 ** before or after the key.
4379 **
4380 ** An integer is written into *pRes which is the result of
4381 ** comparing the key with the entry to which the cursor is
4382 ** pointing.  The meaning of the integer written into
4383 ** *pRes is as follows:
4384 **
4385 **     *pRes<0      The cursor is left pointing at an entry that
4386 **                  is smaller than intKey/pIdxKey or if the table is empty
4387 **                  and the cursor is therefore left point to nothing.
4388 **
4389 **     *pRes==0     The cursor is left pointing at an entry that
4390 **                  exactly matches intKey/pIdxKey.
4391 **
4392 **     *pRes>0      The cursor is left pointing at an entry that
4393 **                  is larger than intKey/pIdxKey.
4394 **
4395 */
4396 int sqlite3BtreeMovetoUnpacked(
4397   BtCursor *pCur,          /* The cursor to be moved */
4398   UnpackedRecord *pIdxKey, /* Unpacked index key */
4399   i64 intKey,              /* The table key */
4400   int biasRight,           /* If true, bias the search to the high end */
4401   int *pRes                /* Write search results here */
4402 ){
4403   int rc;
4404 
4405   assert( cursorHoldsMutex(pCur) );
4406   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4407   assert( pRes );
4408   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4409 
4410   /* If the cursor is already positioned at the point we are trying
4411   ** to move to, then just return without doing any work */
4412   if( pCur->eState==CURSOR_VALID && pCur->validNKey
4413    && pCur->apPage[0]->intKey
4414   ){
4415     if( pCur->info.nKey==intKey ){
4416       *pRes = 0;
4417       return SQLITE_OK;
4418     }
4419     if( pCur->atLast && pCur->info.nKey<intKey ){
4420       *pRes = -1;
4421       return SQLITE_OK;
4422     }
4423   }
4424 
4425   rc = moveToRoot(pCur);
4426   if( rc ){
4427     return rc;
4428   }
4429   assert( pCur->apPage[pCur->iPage] );
4430   assert( pCur->apPage[pCur->iPage]->isInit );
4431   assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID );
4432   if( pCur->eState==CURSOR_INVALID ){
4433     *pRes = -1;
4434     assert( pCur->apPage[pCur->iPage]->nCell==0 );
4435     return SQLITE_OK;
4436   }
4437   assert( pCur->apPage[0]->intKey || pIdxKey );
4438   for(;;){
4439     int lwr, upr;
4440     Pgno chldPg;
4441     MemPage *pPage = pCur->apPage[pCur->iPage];
4442     int c;
4443 
4444     /* pPage->nCell must be greater than zero. If this is the root-page
4445     ** the cursor would have been INVALID above and this for(;;) loop
4446     ** not run. If this is not the root-page, then the moveToChild() routine
4447     ** would have already detected db corruption. Similarly, pPage must
4448     ** be the right kind (index or table) of b-tree page. Otherwise
4449     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
4450     assert( pPage->nCell>0 );
4451     assert( pPage->intKey==(pIdxKey==0) );
4452     lwr = 0;
4453     upr = pPage->nCell-1;
4454     if( biasRight ){
4455       pCur->aiIdx[pCur->iPage] = (u16)upr;
4456     }else{
4457       pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
4458     }
4459     for(;;){
4460       int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */
4461       u8 *pCell;                          /* Pointer to current cell in pPage */
4462 
4463       pCur->info.nSize = 0;
4464       pCell = findCell(pPage, idx) + pPage->childPtrSize;
4465       if( pPage->intKey ){
4466         i64 nCellKey;
4467         if( pPage->hasData ){
4468           u32 dummy;
4469           pCell += getVarint32(pCell, dummy);
4470         }
4471         getVarint(pCell, (u64*)&nCellKey);
4472         if( nCellKey==intKey ){
4473           c = 0;
4474         }else if( nCellKey<intKey ){
4475           c = -1;
4476         }else{
4477           assert( nCellKey>intKey );
4478           c = +1;
4479         }
4480         pCur->validNKey = 1;
4481         pCur->info.nKey = nCellKey;
4482       }else{
4483         /* The maximum supported page-size is 65536 bytes. This means that
4484         ** the maximum number of record bytes stored on an index B-Tree
4485         ** page is less than 16384 bytes and may be stored as a 2-byte
4486         ** varint. This information is used to attempt to avoid parsing
4487         ** the entire cell by checking for the cases where the record is
4488         ** stored entirely within the b-tree page by inspecting the first
4489         ** 2 bytes of the cell.
4490         */
4491         int nCell = pCell[0];
4492         if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){
4493           /* This branch runs if the record-size field of the cell is a
4494           ** single byte varint and the record fits entirely on the main
4495           ** b-tree page.  */
4496           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4497         }else if( !(pCell[1] & 0x80)
4498           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4499         ){
4500           /* The record-size field is a 2 byte varint and the record
4501           ** fits entirely on the main b-tree page.  */
4502           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4503         }else{
4504           /* The record flows over onto one or more overflow pages. In
4505           ** this case the whole cell needs to be parsed, a buffer allocated
4506           ** and accessPayload() used to retrieve the record into the
4507           ** buffer before VdbeRecordCompare() can be called. */
4508           void *pCellKey;
4509           u8 * const pCellBody = pCell - pPage->childPtrSize;
4510           btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4511           nCell = (int)pCur->info.nKey;
4512           pCellKey = sqlite3Malloc( nCell );
4513           if( pCellKey==0 ){
4514             rc = SQLITE_NOMEM;
4515             goto moveto_finish;
4516           }
4517           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4518           if( rc ){
4519             sqlite3_free(pCellKey);
4520             goto moveto_finish;
4521           }
4522           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4523           sqlite3_free(pCellKey);
4524         }
4525       }
4526       if( c==0 ){
4527         if( pPage->intKey && !pPage->leaf ){
4528           lwr = idx;
4529           upr = lwr - 1;
4530           break;
4531         }else{
4532           *pRes = 0;
4533           rc = SQLITE_OK;
4534           goto moveto_finish;
4535         }
4536       }
4537       if( c<0 ){
4538         lwr = idx+1;
4539       }else{
4540         upr = idx-1;
4541       }
4542       if( lwr>upr ){
4543         break;
4544       }
4545       pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
4546     }
4547     assert( lwr==upr+1 );
4548     assert( pPage->isInit );
4549     if( pPage->leaf ){
4550       chldPg = 0;
4551     }else if( lwr>=pPage->nCell ){
4552       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4553     }else{
4554       chldPg = get4byte(findCell(pPage, lwr));
4555     }
4556     if( chldPg==0 ){
4557       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4558       *pRes = c;
4559       rc = SQLITE_OK;
4560       goto moveto_finish;
4561     }
4562     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4563     pCur->info.nSize = 0;
4564     pCur->validNKey = 0;
4565     rc = moveToChild(pCur, chldPg);
4566     if( rc ) goto moveto_finish;
4567   }
4568 moveto_finish:
4569   return rc;
4570 }
4571 
4572 
4573 /*
4574 ** Return TRUE if the cursor is not pointing at an entry of the table.
4575 **
4576 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4577 ** past the last entry in the table or sqlite3BtreePrev() moves past
4578 ** the first entry.  TRUE is also returned if the table is empty.
4579 */
4580 int sqlite3BtreeEof(BtCursor *pCur){
4581   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4582   ** have been deleted? This API will need to change to return an error code
4583   ** as well as the boolean result value.
4584   */
4585   return (CURSOR_VALID!=pCur->eState);
4586 }
4587 
4588 /*
4589 ** Advance the cursor to the next entry in the database.  If
4590 ** successful then set *pRes=0.  If the cursor
4591 ** was already pointing to the last entry in the database before
4592 ** this routine was called, then set *pRes=1.
4593 */
4594 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4595   int rc;
4596   int idx;
4597   MemPage *pPage;
4598 
4599   assert( cursorHoldsMutex(pCur) );
4600   rc = restoreCursorPosition(pCur);
4601   if( rc!=SQLITE_OK ){
4602     return rc;
4603   }
4604   assert( pRes!=0 );
4605   if( CURSOR_INVALID==pCur->eState ){
4606     *pRes = 1;
4607     return SQLITE_OK;
4608   }
4609   if( pCur->skipNext>0 ){
4610     pCur->skipNext = 0;
4611     *pRes = 0;
4612     return SQLITE_OK;
4613   }
4614   pCur->skipNext = 0;
4615 
4616   pPage = pCur->apPage[pCur->iPage];
4617   idx = ++pCur->aiIdx[pCur->iPage];
4618   assert( pPage->isInit );
4619   assert( idx<=pPage->nCell );
4620 
4621   pCur->info.nSize = 0;
4622   pCur->validNKey = 0;
4623   if( idx>=pPage->nCell ){
4624     if( !pPage->leaf ){
4625       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4626       if( rc ) return rc;
4627       rc = moveToLeftmost(pCur);
4628       *pRes = 0;
4629       return rc;
4630     }
4631     do{
4632       if( pCur->iPage==0 ){
4633         *pRes = 1;
4634         pCur->eState = CURSOR_INVALID;
4635         return SQLITE_OK;
4636       }
4637       moveToParent(pCur);
4638       pPage = pCur->apPage[pCur->iPage];
4639     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4640     *pRes = 0;
4641     if( pPage->intKey ){
4642       rc = sqlite3BtreeNext(pCur, pRes);
4643     }else{
4644       rc = SQLITE_OK;
4645     }
4646     return rc;
4647   }
4648   *pRes = 0;
4649   if( pPage->leaf ){
4650     return SQLITE_OK;
4651   }
4652   rc = moveToLeftmost(pCur);
4653   return rc;
4654 }
4655 
4656 
4657 /*
4658 ** Step the cursor to the back to the previous entry in the database.  If
4659 ** successful then set *pRes=0.  If the cursor
4660 ** was already pointing to the first entry in the database before
4661 ** this routine was called, then set *pRes=1.
4662 */
4663 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4664   int rc;
4665   MemPage *pPage;
4666 
4667   assert( cursorHoldsMutex(pCur) );
4668   rc = restoreCursorPosition(pCur);
4669   if( rc!=SQLITE_OK ){
4670     return rc;
4671   }
4672   pCur->atLast = 0;
4673   if( CURSOR_INVALID==pCur->eState ){
4674     *pRes = 1;
4675     return SQLITE_OK;
4676   }
4677   if( pCur->skipNext<0 ){
4678     pCur->skipNext = 0;
4679     *pRes = 0;
4680     return SQLITE_OK;
4681   }
4682   pCur->skipNext = 0;
4683 
4684   pPage = pCur->apPage[pCur->iPage];
4685   assert( pPage->isInit );
4686   if( !pPage->leaf ){
4687     int idx = pCur->aiIdx[pCur->iPage];
4688     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4689     if( rc ){
4690       return rc;
4691     }
4692     rc = moveToRightmost(pCur);
4693   }else{
4694     while( pCur->aiIdx[pCur->iPage]==0 ){
4695       if( pCur->iPage==0 ){
4696         pCur->eState = CURSOR_INVALID;
4697         *pRes = 1;
4698         return SQLITE_OK;
4699       }
4700       moveToParent(pCur);
4701     }
4702     pCur->info.nSize = 0;
4703     pCur->validNKey = 0;
4704 
4705     pCur->aiIdx[pCur->iPage]--;
4706     pPage = pCur->apPage[pCur->iPage];
4707     if( pPage->intKey && !pPage->leaf ){
4708       rc = sqlite3BtreePrevious(pCur, pRes);
4709     }else{
4710       rc = SQLITE_OK;
4711     }
4712   }
4713   *pRes = 0;
4714   return rc;
4715 }
4716 
4717 /*
4718 ** Allocate a new page from the database file.
4719 **
4720 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4721 ** has already been called on the new page.)  The new page has also
4722 ** been referenced and the calling routine is responsible for calling
4723 ** sqlite3PagerUnref() on the new page when it is done.
4724 **
4725 ** SQLITE_OK is returned on success.  Any other return value indicates
4726 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4727 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4728 **
4729 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4730 ** locate a page close to the page number "nearby".  This can be used in an
4731 ** attempt to keep related pages close to each other in the database file,
4732 ** which in turn can make database access faster.
4733 **
4734 ** If the "exact" parameter is not 0, and the page-number nearby exists
4735 ** anywhere on the free-list, then it is guarenteed to be returned. This
4736 ** is only used by auto-vacuum databases when allocating a new table.
4737 */
4738 static int allocateBtreePage(
4739   BtShared *pBt,
4740   MemPage **ppPage,
4741   Pgno *pPgno,
4742   Pgno nearby,
4743   u8 exact
4744 ){
4745   MemPage *pPage1;
4746   int rc;
4747   u32 n;     /* Number of pages on the freelist */
4748   u32 k;     /* Number of leaves on the trunk of the freelist */
4749   MemPage *pTrunk = 0;
4750   MemPage *pPrevTrunk = 0;
4751   Pgno mxPage;     /* Total size of the database file */
4752 
4753   assert( sqlite3_mutex_held(pBt->mutex) );
4754   pPage1 = pBt->pPage1;
4755   mxPage = btreePagecount(pBt);
4756   n = get4byte(&pPage1->aData[36]);
4757   testcase( n==mxPage-1 );
4758   if( n>=mxPage ){
4759     return SQLITE_CORRUPT_BKPT;
4760   }
4761   if( n>0 ){
4762     /* There are pages on the freelist.  Reuse one of those pages. */
4763     Pgno iTrunk;
4764     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4765 
4766     /* If the 'exact' parameter was true and a query of the pointer-map
4767     ** shows that the page 'nearby' is somewhere on the free-list, then
4768     ** the entire-list will be searched for that page.
4769     */
4770 #ifndef SQLITE_OMIT_AUTOVACUUM
4771     if( exact && nearby<=mxPage ){
4772       u8 eType;
4773       assert( nearby>0 );
4774       assert( pBt->autoVacuum );
4775       rc = ptrmapGet(pBt, nearby, &eType, 0);
4776       if( rc ) return rc;
4777       if( eType==PTRMAP_FREEPAGE ){
4778         searchList = 1;
4779       }
4780       *pPgno = nearby;
4781     }
4782 #endif
4783 
4784     /* Decrement the free-list count by 1. Set iTrunk to the index of the
4785     ** first free-list trunk page. iPrevTrunk is initially 1.
4786     */
4787     rc = sqlite3PagerWrite(pPage1->pDbPage);
4788     if( rc ) return rc;
4789     put4byte(&pPage1->aData[36], n-1);
4790 
4791     /* The code within this loop is run only once if the 'searchList' variable
4792     ** is not true. Otherwise, it runs once for each trunk-page on the
4793     ** free-list until the page 'nearby' is located.
4794     */
4795     do {
4796       pPrevTrunk = pTrunk;
4797       if( pPrevTrunk ){
4798         iTrunk = get4byte(&pPrevTrunk->aData[0]);
4799       }else{
4800         iTrunk = get4byte(&pPage1->aData[32]);
4801       }
4802       testcase( iTrunk==mxPage );
4803       if( iTrunk>mxPage ){
4804         rc = SQLITE_CORRUPT_BKPT;
4805       }else{
4806         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4807       }
4808       if( rc ){
4809         pTrunk = 0;
4810         goto end_allocate_page;
4811       }
4812 
4813       k = get4byte(&pTrunk->aData[4]);
4814       if( k==0 && !searchList ){
4815         /* The trunk has no leaves and the list is not being searched.
4816         ** So extract the trunk page itself and use it as the newly
4817         ** allocated page */
4818         assert( pPrevTrunk==0 );
4819         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4820         if( rc ){
4821           goto end_allocate_page;
4822         }
4823         *pPgno = iTrunk;
4824         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4825         *ppPage = pTrunk;
4826         pTrunk = 0;
4827         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4828       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4829         /* Value of k is out of range.  Database corruption */
4830         rc = SQLITE_CORRUPT_BKPT;
4831         goto end_allocate_page;
4832 #ifndef SQLITE_OMIT_AUTOVACUUM
4833       }else if( searchList && nearby==iTrunk ){
4834         /* The list is being searched and this trunk page is the page
4835         ** to allocate, regardless of whether it has leaves.
4836         */
4837         assert( *pPgno==iTrunk );
4838         *ppPage = pTrunk;
4839         searchList = 0;
4840         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4841         if( rc ){
4842           goto end_allocate_page;
4843         }
4844         if( k==0 ){
4845           if( !pPrevTrunk ){
4846             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4847           }else{
4848             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4849             if( rc!=SQLITE_OK ){
4850               goto end_allocate_page;
4851             }
4852             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4853           }
4854         }else{
4855           /* The trunk page is required by the caller but it contains
4856           ** pointers to free-list leaves. The first leaf becomes a trunk
4857           ** page in this case.
4858           */
4859           MemPage *pNewTrunk;
4860           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4861           if( iNewTrunk>mxPage ){
4862             rc = SQLITE_CORRUPT_BKPT;
4863             goto end_allocate_page;
4864           }
4865           testcase( iNewTrunk==mxPage );
4866           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4867           if( rc!=SQLITE_OK ){
4868             goto end_allocate_page;
4869           }
4870           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4871           if( rc!=SQLITE_OK ){
4872             releasePage(pNewTrunk);
4873             goto end_allocate_page;
4874           }
4875           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4876           put4byte(&pNewTrunk->aData[4], k-1);
4877           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4878           releasePage(pNewTrunk);
4879           if( !pPrevTrunk ){
4880             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4881             put4byte(&pPage1->aData[32], iNewTrunk);
4882           }else{
4883             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4884             if( rc ){
4885               goto end_allocate_page;
4886             }
4887             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4888           }
4889         }
4890         pTrunk = 0;
4891         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4892 #endif
4893       }else if( k>0 ){
4894         /* Extract a leaf from the trunk */
4895         u32 closest;
4896         Pgno iPage;
4897         unsigned char *aData = pTrunk->aData;
4898         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4899         if( rc ){
4900           goto end_allocate_page;
4901         }
4902         if( nearby>0 ){
4903           u32 i;
4904           int dist;
4905           closest = 0;
4906           dist = get4byte(&aData[8]) - nearby;
4907           if( dist<0 ) dist = -dist;
4908           for(i=1; i<k; i++){
4909             int d2 = get4byte(&aData[8+i*4]) - nearby;
4910             if( d2<0 ) d2 = -d2;
4911             if( d2<dist ){
4912               closest = i;
4913               dist = d2;
4914             }
4915           }
4916         }else{
4917           closest = 0;
4918         }
4919 
4920         iPage = get4byte(&aData[8+closest*4]);
4921         testcase( iPage==mxPage );
4922         if( iPage>mxPage ){
4923           rc = SQLITE_CORRUPT_BKPT;
4924           goto end_allocate_page;
4925         }
4926         testcase( iPage==mxPage );
4927         if( !searchList || iPage==nearby ){
4928           int noContent;
4929           *pPgno = iPage;
4930           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4931                  ": %d more free pages\n",
4932                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
4933           if( closest<k-1 ){
4934             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4935           }
4936           put4byte(&aData[4], k-1);
4937           assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
4938           noContent = !btreeGetHasContent(pBt, *pPgno);
4939           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
4940           if( rc==SQLITE_OK ){
4941             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4942             if( rc!=SQLITE_OK ){
4943               releasePage(*ppPage);
4944             }
4945           }
4946           searchList = 0;
4947         }
4948       }
4949       releasePage(pPrevTrunk);
4950       pPrevTrunk = 0;
4951     }while( searchList );
4952   }else{
4953     /* There are no pages on the freelist, so create a new page at the
4954     ** end of the file */
4955     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
4956     if( rc ) return rc;
4957     pBt->nPage++;
4958     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
4959 
4960 #ifndef SQLITE_OMIT_AUTOVACUUM
4961     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
4962       /* If *pPgno refers to a pointer-map page, allocate two new pages
4963       ** at the end of the file instead of one. The first allocated page
4964       ** becomes a new pointer-map page, the second is used by the caller.
4965       */
4966       MemPage *pPg = 0;
4967       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
4968       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
4969       rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1);
4970       if( rc==SQLITE_OK ){
4971         rc = sqlite3PagerWrite(pPg->pDbPage);
4972         releasePage(pPg);
4973       }
4974       if( rc ) return rc;
4975       pBt->nPage++;
4976       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
4977     }
4978 #endif
4979     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
4980     *pPgno = pBt->nPage;
4981 
4982     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4983     rc = btreeGetPage(pBt, *pPgno, ppPage, 1);
4984     if( rc ) return rc;
4985     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4986     if( rc!=SQLITE_OK ){
4987       releasePage(*ppPage);
4988     }
4989     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4990   }
4991 
4992   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4993 
4994 end_allocate_page:
4995   releasePage(pTrunk);
4996   releasePage(pPrevTrunk);
4997   if( rc==SQLITE_OK ){
4998     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4999       releasePage(*ppPage);
5000       return SQLITE_CORRUPT_BKPT;
5001     }
5002     (*ppPage)->isInit = 0;
5003   }else{
5004     *ppPage = 0;
5005   }
5006   return rc;
5007 }
5008 
5009 /*
5010 ** This function is used to add page iPage to the database file free-list.
5011 ** It is assumed that the page is not already a part of the free-list.
5012 **
5013 ** The value passed as the second argument to this function is optional.
5014 ** If the caller happens to have a pointer to the MemPage object
5015 ** corresponding to page iPage handy, it may pass it as the second value.
5016 ** Otherwise, it may pass NULL.
5017 **
5018 ** If a pointer to a MemPage object is passed as the second argument,
5019 ** its reference count is not altered by this function.
5020 */
5021 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5022   MemPage *pTrunk = 0;                /* Free-list trunk page */
5023   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5024   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5025   MemPage *pPage;                     /* Page being freed. May be NULL. */
5026   int rc;                             /* Return Code */
5027   int nFree;                          /* Initial number of pages on free-list */
5028 
5029   assert( sqlite3_mutex_held(pBt->mutex) );
5030   assert( iPage>1 );
5031   assert( !pMemPage || pMemPage->pgno==iPage );
5032 
5033   if( pMemPage ){
5034     pPage = pMemPage;
5035     sqlite3PagerRef(pPage->pDbPage);
5036   }else{
5037     pPage = btreePageLookup(pBt, iPage);
5038   }
5039 
5040   /* Increment the free page count on pPage1 */
5041   rc = sqlite3PagerWrite(pPage1->pDbPage);
5042   if( rc ) goto freepage_out;
5043   nFree = get4byte(&pPage1->aData[36]);
5044   put4byte(&pPage1->aData[36], nFree+1);
5045 
5046   if( pBt->secureDelete ){
5047     /* If the secure_delete option is enabled, then
5048     ** always fully overwrite deleted information with zeros.
5049     */
5050     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5051      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5052     ){
5053       goto freepage_out;
5054     }
5055     memset(pPage->aData, 0, pPage->pBt->pageSize);
5056   }
5057 
5058   /* If the database supports auto-vacuum, write an entry in the pointer-map
5059   ** to indicate that the page is free.
5060   */
5061   if( ISAUTOVACUUM ){
5062     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5063     if( rc ) goto freepage_out;
5064   }
5065 
5066   /* Now manipulate the actual database free-list structure. There are two
5067   ** possibilities. If the free-list is currently empty, or if the first
5068   ** trunk page in the free-list is full, then this page will become a
5069   ** new free-list trunk page. Otherwise, it will become a leaf of the
5070   ** first trunk page in the current free-list. This block tests if it
5071   ** is possible to add the page as a new free-list leaf.
5072   */
5073   if( nFree!=0 ){
5074     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5075 
5076     iTrunk = get4byte(&pPage1->aData[32]);
5077     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5078     if( rc!=SQLITE_OK ){
5079       goto freepage_out;
5080     }
5081 
5082     nLeaf = get4byte(&pTrunk->aData[4]);
5083     assert( pBt->usableSize>32 );
5084     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5085       rc = SQLITE_CORRUPT_BKPT;
5086       goto freepage_out;
5087     }
5088     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5089       /* In this case there is room on the trunk page to insert the page
5090       ** being freed as a new leaf.
5091       **
5092       ** Note that the trunk page is not really full until it contains
5093       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5094       ** coded.  But due to a coding error in versions of SQLite prior to
5095       ** 3.6.0, databases with freelist trunk pages holding more than
5096       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5097       ** to maintain backwards compatibility with older versions of SQLite,
5098       ** we will continue to restrict the number of entries to usableSize/4 - 8
5099       ** for now.  At some point in the future (once everyone has upgraded
5100       ** to 3.6.0 or later) we should consider fixing the conditional above
5101       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5102       */
5103       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5104       if( rc==SQLITE_OK ){
5105         put4byte(&pTrunk->aData[4], nLeaf+1);
5106         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5107         if( pPage && !pBt->secureDelete ){
5108           sqlite3PagerDontWrite(pPage->pDbPage);
5109         }
5110         rc = btreeSetHasContent(pBt, iPage);
5111       }
5112       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5113       goto freepage_out;
5114     }
5115   }
5116 
5117   /* If control flows to this point, then it was not possible to add the
5118   ** the page being freed as a leaf page of the first trunk in the free-list.
5119   ** Possibly because the free-list is empty, or possibly because the
5120   ** first trunk in the free-list is full. Either way, the page being freed
5121   ** will become the new first trunk page in the free-list.
5122   */
5123   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5124     goto freepage_out;
5125   }
5126   rc = sqlite3PagerWrite(pPage->pDbPage);
5127   if( rc!=SQLITE_OK ){
5128     goto freepage_out;
5129   }
5130   put4byte(pPage->aData, iTrunk);
5131   put4byte(&pPage->aData[4], 0);
5132   put4byte(&pPage1->aData[32], iPage);
5133   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5134 
5135 freepage_out:
5136   if( pPage ){
5137     pPage->isInit = 0;
5138   }
5139   releasePage(pPage);
5140   releasePage(pTrunk);
5141   return rc;
5142 }
5143 static void freePage(MemPage *pPage, int *pRC){
5144   if( (*pRC)==SQLITE_OK ){
5145     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
5146   }
5147 }
5148 
5149 /*
5150 ** Free any overflow pages associated with the given Cell.
5151 */
5152 static int clearCell(MemPage *pPage, unsigned char *pCell){
5153   BtShared *pBt = pPage->pBt;
5154   CellInfo info;
5155   Pgno ovflPgno;
5156   int rc;
5157   int nOvfl;
5158   u32 ovflPageSize;
5159 
5160   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5161   btreeParseCellPtr(pPage, pCell, &info);
5162   if( info.iOverflow==0 ){
5163     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5164   }
5165   ovflPgno = get4byte(&pCell[info.iOverflow]);
5166   assert( pBt->usableSize > 4 );
5167   ovflPageSize = pBt->usableSize - 4;
5168   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5169   assert( ovflPgno==0 || nOvfl>0 );
5170   while( nOvfl-- ){
5171     Pgno iNext = 0;
5172     MemPage *pOvfl = 0;
5173     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5174       /* 0 is not a legal page number and page 1 cannot be an
5175       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5176       ** file the database must be corrupt. */
5177       return SQLITE_CORRUPT_BKPT;
5178     }
5179     if( nOvfl ){
5180       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5181       if( rc ) return rc;
5182     }
5183 
5184     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5185      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5186     ){
5187       /* There is no reason any cursor should have an outstanding reference
5188       ** to an overflow page belonging to a cell that is being deleted/updated.
5189       ** So if there exists more than one reference to this page, then it
5190       ** must not really be an overflow page and the database must be corrupt.
5191       ** It is helpful to detect this before calling freePage2(), as
5192       ** freePage2() may zero the page contents if secure-delete mode is
5193       ** enabled. If this 'overflow' page happens to be a page that the
5194       ** caller is iterating through or using in some other way, this
5195       ** can be problematic.
5196       */
5197       rc = SQLITE_CORRUPT_BKPT;
5198     }else{
5199       rc = freePage2(pBt, pOvfl, ovflPgno);
5200     }
5201 
5202     if( pOvfl ){
5203       sqlite3PagerUnref(pOvfl->pDbPage);
5204     }
5205     if( rc ) return rc;
5206     ovflPgno = iNext;
5207   }
5208   return SQLITE_OK;
5209 }
5210 
5211 /*
5212 ** Create the byte sequence used to represent a cell on page pPage
5213 ** and write that byte sequence into pCell[].  Overflow pages are
5214 ** allocated and filled in as necessary.  The calling procedure
5215 ** is responsible for making sure sufficient space has been allocated
5216 ** for pCell[].
5217 **
5218 ** Note that pCell does not necessary need to point to the pPage->aData
5219 ** area.  pCell might point to some temporary storage.  The cell will
5220 ** be constructed in this temporary area then copied into pPage->aData
5221 ** later.
5222 */
5223 static int fillInCell(
5224   MemPage *pPage,                /* The page that contains the cell */
5225   unsigned char *pCell,          /* Complete text of the cell */
5226   const void *pKey, i64 nKey,    /* The key */
5227   const void *pData,int nData,   /* The data */
5228   int nZero,                     /* Extra zero bytes to append to pData */
5229   int *pnSize                    /* Write cell size here */
5230 ){
5231   int nPayload;
5232   const u8 *pSrc;
5233   int nSrc, n, rc;
5234   int spaceLeft;
5235   MemPage *pOvfl = 0;
5236   MemPage *pToRelease = 0;
5237   unsigned char *pPrior;
5238   unsigned char *pPayload;
5239   BtShared *pBt = pPage->pBt;
5240   Pgno pgnoOvfl = 0;
5241   int nHeader;
5242   CellInfo info;
5243 
5244   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5245 
5246   /* pPage is not necessarily writeable since pCell might be auxiliary
5247   ** buffer space that is separate from the pPage buffer area */
5248   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5249             || sqlite3PagerIswriteable(pPage->pDbPage) );
5250 
5251   /* Fill in the header. */
5252   nHeader = 0;
5253   if( !pPage->leaf ){
5254     nHeader += 4;
5255   }
5256   if( pPage->hasData ){
5257     nHeader += putVarint(&pCell[nHeader], nData+nZero);
5258   }else{
5259     nData = nZero = 0;
5260   }
5261   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5262   btreeParseCellPtr(pPage, pCell, &info);
5263   assert( info.nHeader==nHeader );
5264   assert( info.nKey==nKey );
5265   assert( info.nData==(u32)(nData+nZero) );
5266 
5267   /* Fill in the payload */
5268   nPayload = nData + nZero;
5269   if( pPage->intKey ){
5270     pSrc = pData;
5271     nSrc = nData;
5272     nData = 0;
5273   }else{
5274     if( NEVER(nKey>0x7fffffff || pKey==0) ){
5275       return SQLITE_CORRUPT_BKPT;
5276     }
5277     nPayload += (int)nKey;
5278     pSrc = pKey;
5279     nSrc = (int)nKey;
5280   }
5281   *pnSize = info.nSize;
5282   spaceLeft = info.nLocal;
5283   pPayload = &pCell[nHeader];
5284   pPrior = &pCell[info.iOverflow];
5285 
5286   while( nPayload>0 ){
5287     if( spaceLeft==0 ){
5288 #ifndef SQLITE_OMIT_AUTOVACUUM
5289       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5290       if( pBt->autoVacuum ){
5291         do{
5292           pgnoOvfl++;
5293         } while(
5294           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5295         );
5296       }
5297 #endif
5298       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5299 #ifndef SQLITE_OMIT_AUTOVACUUM
5300       /* If the database supports auto-vacuum, and the second or subsequent
5301       ** overflow page is being allocated, add an entry to the pointer-map
5302       ** for that page now.
5303       **
5304       ** If this is the first overflow page, then write a partial entry
5305       ** to the pointer-map. If we write nothing to this pointer-map slot,
5306       ** then the optimistic overflow chain processing in clearCell()
5307       ** may misinterpret the uninitialised values and delete the
5308       ** wrong pages from the database.
5309       */
5310       if( pBt->autoVacuum && rc==SQLITE_OK ){
5311         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5312         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5313         if( rc ){
5314           releasePage(pOvfl);
5315         }
5316       }
5317 #endif
5318       if( rc ){
5319         releasePage(pToRelease);
5320         return rc;
5321       }
5322 
5323       /* If pToRelease is not zero than pPrior points into the data area
5324       ** of pToRelease.  Make sure pToRelease is still writeable. */
5325       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5326 
5327       /* If pPrior is part of the data area of pPage, then make sure pPage
5328       ** is still writeable */
5329       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5330             || sqlite3PagerIswriteable(pPage->pDbPage) );
5331 
5332       put4byte(pPrior, pgnoOvfl);
5333       releasePage(pToRelease);
5334       pToRelease = pOvfl;
5335       pPrior = pOvfl->aData;
5336       put4byte(pPrior, 0);
5337       pPayload = &pOvfl->aData[4];
5338       spaceLeft = pBt->usableSize - 4;
5339     }
5340     n = nPayload;
5341     if( n>spaceLeft ) n = spaceLeft;
5342 
5343     /* If pToRelease is not zero than pPayload points into the data area
5344     ** of pToRelease.  Make sure pToRelease is still writeable. */
5345     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5346 
5347     /* If pPayload is part of the data area of pPage, then make sure pPage
5348     ** is still writeable */
5349     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5350             || sqlite3PagerIswriteable(pPage->pDbPage) );
5351 
5352     if( nSrc>0 ){
5353       if( n>nSrc ) n = nSrc;
5354       assert( pSrc );
5355       memcpy(pPayload, pSrc, n);
5356     }else{
5357       memset(pPayload, 0, n);
5358     }
5359     nPayload -= n;
5360     pPayload += n;
5361     pSrc += n;
5362     nSrc -= n;
5363     spaceLeft -= n;
5364     if( nSrc==0 ){
5365       nSrc = nData;
5366       pSrc = pData;
5367     }
5368   }
5369   releasePage(pToRelease);
5370   return SQLITE_OK;
5371 }
5372 
5373 /*
5374 ** Remove the i-th cell from pPage.  This routine effects pPage only.
5375 ** The cell content is not freed or deallocated.  It is assumed that
5376 ** the cell content has been copied someplace else.  This routine just
5377 ** removes the reference to the cell from pPage.
5378 **
5379 ** "sz" must be the number of bytes in the cell.
5380 */
5381 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5382   int i;          /* Loop counter */
5383   u32 pc;         /* Offset to cell content of cell being deleted */
5384   u8 *data;       /* pPage->aData */
5385   u8 *ptr;        /* Used to move bytes around within data[] */
5386   int rc;         /* The return code */
5387   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
5388 
5389   if( *pRC ) return;
5390 
5391   assert( idx>=0 && idx<pPage->nCell );
5392   assert( sz==cellSize(pPage, idx) );
5393   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5394   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5395   data = pPage->aData;
5396   ptr = &data[pPage->cellOffset + 2*idx];
5397   pc = get2byte(ptr);
5398   hdr = pPage->hdrOffset;
5399   testcase( pc==get2byte(&data[hdr+5]) );
5400   testcase( pc+sz==pPage->pBt->usableSize );
5401   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5402     *pRC = SQLITE_CORRUPT_BKPT;
5403     return;
5404   }
5405   rc = freeSpace(pPage, pc, sz);
5406   if( rc ){
5407     *pRC = rc;
5408     return;
5409   }
5410   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
5411     ptr[0] = ptr[2];
5412     ptr[1] = ptr[3];
5413   }
5414   pPage->nCell--;
5415   put2byte(&data[hdr+3], pPage->nCell);
5416   pPage->nFree += 2;
5417 }
5418 
5419 /*
5420 ** Insert a new cell on pPage at cell index "i".  pCell points to the
5421 ** content of the cell.
5422 **
5423 ** If the cell content will fit on the page, then put it there.  If it
5424 ** will not fit, then make a copy of the cell content into pTemp if
5425 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
5426 ** in pPage->aOvfl[] and make it point to the cell content (either
5427 ** in pTemp or the original pCell) and also record its index.
5428 ** Allocating a new entry in pPage->aCell[] implies that
5429 ** pPage->nOverflow is incremented.
5430 **
5431 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5432 ** cell. The caller will overwrite them after this function returns. If
5433 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5434 ** (but pCell+nSkip is always valid).
5435 */
5436 static void insertCell(
5437   MemPage *pPage,   /* Page into which we are copying */
5438   int i,            /* New cell becomes the i-th cell of the page */
5439   u8 *pCell,        /* Content of the new cell */
5440   int sz,           /* Bytes of content in pCell */
5441   u8 *pTemp,        /* Temp storage space for pCell, if needed */
5442   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
5443   int *pRC          /* Read and write return code from here */
5444 ){
5445   int idx = 0;      /* Where to write new cell content in data[] */
5446   int j;            /* Loop counter */
5447   int end;          /* First byte past the last cell pointer in data[] */
5448   int ins;          /* Index in data[] where new cell pointer is inserted */
5449   int cellOffset;   /* Address of first cell pointer in data[] */
5450   u8 *data;         /* The content of the whole page */
5451   u8 *ptr;          /* Used for moving information around in data[] */
5452 
5453   int nSkip = (iChild ? 4 : 0);
5454 
5455   if( *pRC ) return;
5456 
5457   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5458   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5459   assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
5460   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5461   /* The cell should normally be sized correctly.  However, when moving a
5462   ** malformed cell from a leaf page to an interior page, if the cell size
5463   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5464   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
5465   ** the term after the || in the following assert(). */
5466   assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5467   if( pPage->nOverflow || sz+2>pPage->nFree ){
5468     if( pTemp ){
5469       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5470       pCell = pTemp;
5471     }
5472     if( iChild ){
5473       put4byte(pCell, iChild);
5474     }
5475     j = pPage->nOverflow++;
5476     assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
5477     pPage->aOvfl[j].pCell = pCell;
5478     pPage->aOvfl[j].idx = (u16)i;
5479   }else{
5480     int rc = sqlite3PagerWrite(pPage->pDbPage);
5481     if( rc!=SQLITE_OK ){
5482       *pRC = rc;
5483       return;
5484     }
5485     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5486     data = pPage->aData;
5487     cellOffset = pPage->cellOffset;
5488     end = cellOffset + 2*pPage->nCell;
5489     ins = cellOffset + 2*i;
5490     rc = allocateSpace(pPage, sz, &idx);
5491     if( rc ){ *pRC = rc; return; }
5492     /* The allocateSpace() routine guarantees the following two properties
5493     ** if it returns success */
5494     assert( idx >= end+2 );
5495     assert( idx+sz <= pPage->pBt->usableSize );
5496     pPage->nCell++;
5497     pPage->nFree -= (u16)(2 + sz);
5498     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5499     if( iChild ){
5500       put4byte(&data[idx], iChild);
5501     }
5502     for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
5503       ptr[0] = ptr[-2];
5504       ptr[1] = ptr[-1];
5505     }
5506     put2byte(&data[ins], idx);
5507     put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5508 #ifndef SQLITE_OMIT_AUTOVACUUM
5509     if( pPage->pBt->autoVacuum ){
5510       /* The cell may contain a pointer to an overflow page. If so, write
5511       ** the entry for the overflow page into the pointer map.
5512       */
5513       ptrmapPutOvflPtr(pPage, pCell, pRC);
5514     }
5515 #endif
5516   }
5517 }
5518 
5519 /*
5520 ** Add a list of cells to a page.  The page should be initially empty.
5521 ** The cells are guaranteed to fit on the page.
5522 */
5523 static void assemblePage(
5524   MemPage *pPage,   /* The page to be assemblied */
5525   int nCell,        /* The number of cells to add to this page */
5526   u8 **apCell,      /* Pointers to cell bodies */
5527   u16 *aSize        /* Sizes of the cells */
5528 ){
5529   int i;            /* Loop counter */
5530   u8 *pCellptr;     /* Address of next cell pointer */
5531   int cellbody;     /* Address of next cell body */
5532   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5533   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5534   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5535 
5536   assert( pPage->nOverflow==0 );
5537   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5538   assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921);
5539   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5540 
5541   /* Check that the page has just been zeroed by zeroPage() */
5542   assert( pPage->nCell==0 );
5543   assert( get2byteNotZero(&data[hdr+5])==nUsable );
5544 
5545   pCellptr = &data[pPage->cellOffset + nCell*2];
5546   cellbody = nUsable;
5547   for(i=nCell-1; i>=0; i--){
5548     pCellptr -= 2;
5549     cellbody -= aSize[i];
5550     put2byte(pCellptr, cellbody);
5551     memcpy(&data[cellbody], apCell[i], aSize[i]);
5552   }
5553   put2byte(&data[hdr+3], nCell);
5554   put2byte(&data[hdr+5], cellbody);
5555   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5556   pPage->nCell = (u16)nCell;
5557 }
5558 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */
5573 
5574 
5575 #ifndef SQLITE_OMIT_QUICKBALANCE
5576 /*
5577 ** This version of balance() handles the common special case where
5578 ** a new entry is being inserted on the extreme right-end of the
5579 ** tree, in other words, when the new entry will become the largest
5580 ** entry in the tree.
5581 **
5582 ** Instead of trying to balance the 3 right-most leaf pages, just add
5583 ** a new page to the right-hand side and put the one new entry in
5584 ** that page.  This leaves the right side of the tree somewhat
5585 ** unbalanced.  But odds are that we will be inserting new entries
5586 ** at the end soon afterwards so the nearly empty page will quickly
5587 ** fill up.  On average.
5588 **
5589 ** pPage is the leaf page which is the right-most page in the tree.
5590 ** pParent is its parent.  pPage must have a single overflow entry
5591 ** which is also the right-most entry on the page.
5592 **
5593 ** The pSpace buffer is used to store a temporary copy of the divider
5594 ** cell that will be inserted into pParent. Such a cell consists of a 4
5595 ** byte page number followed by a variable length integer. In other
5596 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5597 ** least 13 bytes in size.
5598 */
5599 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5600   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
5601   MemPage *pNew;                       /* Newly allocated page */
5602   int rc;                              /* Return Code */
5603   Pgno pgnoNew;                        /* Page number of pNew */
5604 
5605   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5606   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5607   assert( pPage->nOverflow==1 );
5608 
5609   /* This error condition is now caught prior to reaching this function */
5610   if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5611 
5612   /* Allocate a new page. This page will become the right-sibling of
5613   ** pPage. Make the parent page writable, so that the new divider cell
5614   ** may be inserted. If both these operations are successful, proceed.
5615   */
5616   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5617 
5618   if( rc==SQLITE_OK ){
5619 
5620     u8 *pOut = &pSpace[4];
5621     u8 *pCell = pPage->aOvfl[0].pCell;
5622     u16 szCell = cellSizePtr(pPage, pCell);
5623     u8 *pStop;
5624 
5625     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5626     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5627     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5628     assemblePage(pNew, 1, &pCell, &szCell);
5629 
5630     /* If this is an auto-vacuum database, update the pointer map
5631     ** with entries for the new page, and any pointer from the
5632     ** cell on the page to an overflow page. If either of these
5633     ** operations fails, the return code is set, but the contents
5634     ** of the parent page are still manipulated by thh code below.
5635     ** That is Ok, at this point the parent page is guaranteed to
5636     ** be marked as dirty. Returning an error code will cause a
5637     ** rollback, undoing any changes made to the parent page.
5638     */
5639     if( ISAUTOVACUUM ){
5640       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5641       if( szCell>pNew->minLocal ){
5642         ptrmapPutOvflPtr(pNew, pCell, &rc);
5643       }
5644     }
5645 
5646     /* Create a divider cell to insert into pParent. The divider cell
5647     ** consists of a 4-byte page number (the page number of pPage) and
5648     ** a variable length key value (which must be the same value as the
5649     ** largest key on pPage).
5650     **
5651     ** To find the largest key value on pPage, first find the right-most
5652     ** cell on pPage. The first two fields of this cell are the
5653     ** record-length (a variable length integer at most 32-bits in size)
5654     ** and the key value (a variable length integer, may have any value).
5655     ** The first of the while(...) loops below skips over the record-length
5656     ** field. The second while(...) loop copies the key value from the
5657     ** cell on pPage into the pSpace buffer.
5658     */
5659     pCell = findCell(pPage, pPage->nCell-1);
5660     pStop = &pCell[9];
5661     while( (*(pCell++)&0x80) && pCell<pStop );
5662     pStop = &pCell[9];
5663     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5664 
5665     /* Insert the new divider cell into pParent. */
5666     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5667                0, pPage->pgno, &rc);
5668 
5669     /* Set the right-child pointer of pParent to point to the new page. */
5670     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5671 
5672     /* Release the reference to the new page. */
5673     releasePage(pNew);
5674   }
5675 
5676   return rc;
5677 }
5678 #endif /* SQLITE_OMIT_QUICKBALANCE */
5679 
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every overflow page and child page referenced from that page
** names the page as its parent with the expected entry type.  Always
** returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPg;                       /* Index into apPage[] */
  for(iPg=0; iPg<nPage; iPg++){
    MemPage *pPage = apPage[iPg];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;             /* Parent recorded in the pointer map */
    u8 eType;                    /* Entry type recorded in the pointer map */
    int iCell;                   /* Index of the cell being checked */

    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *z = findCell(pPage, iCell);

      btreeParseCellPtr(pPage, z, &info);
      if( info.iOverflow ){
        Pgno ovfl = get4byte(&z[info.iOverflow]);
        ptrmapGet(pBt, ovfl, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }

    /* The right-child pointer in the header of an interior page. */
    if( !pPage->leaf ){
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5721 
5722 /*
5723 ** This function is used to copy the contents of the b-tree node stored
5724 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5725 ** the pointer-map entries for each child page are updated so that the
5726 ** parent page stored in the pointer map is page pTo. If pFrom contained
5727 ** any cells with overflow page pointers, then the corresponding pointer
5728 ** map entries are also updated so that the parent page is page pTo.
5729 **
5730 ** If pFrom is currently carrying any overflow cells (entries in the
5731 ** MemPage.aOvfl[] array), they are not copied to pTo.
5732 **
5733 ** Before returning, page pTo is reinitialized using btreeInitPage().
5734 **
5735 ** The performance of this function is not critical. It is only used by
5736 ** the balance_shallower() and balance_deeper() procedures, neither of
5737 ** which are called often under normal circumstances.
5738 */
5739 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5740   if( (*pRC)==SQLITE_OK ){
5741     BtShared * const pBt = pFrom->pBt;
5742     u8 * const aFrom = pFrom->aData;
5743     u8 * const aTo = pTo->aData;
5744     int const iFromHdr = pFrom->hdrOffset;
5745     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5746     int rc;
5747     int iData;
5748 
5749 
5750     assert( pFrom->isInit );
5751     assert( pFrom->nFree>=iToHdr );
5752     assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize );
5753 
5754     /* Copy the b-tree node content from page pFrom to page pTo. */
5755     iData = get2byte(&aFrom[iFromHdr+5]);
5756     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5757     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5758 
5759     /* Reinitialize page pTo so that the contents of the MemPage structure
5760     ** match the new data. The initialization of pTo can actually fail under
5761     ** fairly obscure circumstances, even though it is a copy of initialized
5762     ** page pFrom.
5763     */
5764     pTo->isInit = 0;
5765     rc = btreeInitPage(pTo);
5766     if( rc!=SQLITE_OK ){
5767       *pRC = rc;
5768       return;
5769     }
5770 
5771     /* If this is an auto-vacuum database, update the pointer-map entries
5772     ** for any b-tree or overflow pages that pTo now contains the pointers to.
5773     */
5774     if( ISAUTOVACUUM ){
5775       *pRC = setChildPtrmaps(pTo);
5776     }
5777   }
5778 }
5779 
5780 /*
5781 ** This routine redistributes cells on the iParentIdx'th child of pParent
5782 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
5785 ** side if the page is the first or last child of its parent. If the page
5786 ** has fewer than 2 siblings (something which can only happen if the page
5787 ** is a root page or a child of a root page) then all available siblings
5788 ** participate in the balancing.
5789 **
5790 ** The number of siblings of the page might be increased or decreased by
5791 ** one or two in an effort to keep pages nearly full but not over full.
5792 **
5793 ** Note that when this routine is called, some of the cells on the page
5794 ** might not actually be stored in MemPage.aData[]. This can happen
5795 ** if the page is overfull. This routine ensures that all cells allocated
5796 ** to the page and its siblings fit into MemPage.aData[] before returning.
5797 **
5798 ** In the course of balancing the page and its siblings, cells may be
5799 ** inserted into or removed from the parent page (pParent). Doing so
5800 ** may cause the parent page to become overfull or underfull. If this
5801 ** happens, it is the responsibility of the caller to invoke the correct
5802 ** balancing routine to fix this problem (see the balance() routine).
5803 **
5804 ** If this routine fails for any reason, it might leave the database
5805 ** in a corrupted state. So if this routine fails, the database should
5806 ** be rolled back.
5807 **
5808 ** The third argument to this function, aOvflSpace, is a pointer to a
5809 ** buffer big enough to hold one page. If while inserting cells into the parent
5810 ** page (pParent) the parent page becomes overfull, this buffer is
5811 ** used to store the parent's overflow cells. Because this function inserts
5812 ** a maximum of four divider cells into the parent page, and the maximum
5813 ** size of a cell stored within an internal node is always less than 1/4
5814 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5815 ** enough for all overflow cells.
5816 **
5817 ** If aOvflSpace is set to a null pointer, this function returns
5818 ** SQLITE_NOMEM.
5819 */
5820 static int balance_nonroot(
5821   MemPage *pParent,               /* Parent page of siblings being balanced */
5822   int iParentIdx,                 /* Index of "the page" in pParent */
5823   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
5824   int isRoot                      /* True if pParent is a root-page */
5825 ){
5826   BtShared *pBt;               /* The whole database */
5827   int nCell = 0;               /* Number of cells in apCell[] */
5828   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
5829   int nNew = 0;                /* Number of pages in apNew[] */
5830   int nOld;                    /* Number of pages in apOld[] */
5831   int i, j, k;                 /* Loop counters */
5832   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
5833   int rc = SQLITE_OK;          /* The return code */
5834   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
5835   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
5836   int usableSpace;             /* Bytes in pPage beyond the header */
5837   int pageFlags;               /* Value of pPage->aData[0] */
5838   int subtotal;                /* Subtotal of bytes in cells on one page */
5839   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
5840   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
5841   int szScratch;               /* Size of scratch memory requested */
5842   MemPage *apOld[NB];          /* pPage and up to two siblings */
5843   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
5844   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
5845   u8 *pRight;                  /* Location in parent of right-sibling pointer */
5846   u8 *apDiv[NB-1];             /* Divider cells in pParent */
5847   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
5848   int szNew[NB+2];             /* Combined size of cells place on i-th page */
5849   u8 **apCell = 0;             /* All cells begin balanced */
5850   u16 *szCell;                 /* Local size of all cells in apCell[] */
5851   u8 *aSpace1;                 /* Space for copies of dividers cells */
5852   Pgno pgno;                   /* Temp var to store a page number in */
5853 
5854   pBt = pParent->pBt;
5855   assert( sqlite3_mutex_held(pBt->mutex) );
5856   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5857 
5858 #if 0
5859   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5860 #endif
5861 
5862   /* At this point pParent may have at most one overflow cell. And if
5863   ** this overflow cell is present, it must be the cell with
5864   ** index iParentIdx. This scenario comes about when this function
5865   ** is called (indirectly) from sqlite3BtreeDelete().
5866   */
5867   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5868   assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx );
5869 
5870   if( !aOvflSpace ){
5871     return SQLITE_NOMEM;
5872   }
5873 
5874   /* Find the sibling pages to balance. Also locate the cells in pParent
5875   ** that divide the siblings. An attempt is made to find NN siblings on
5876   ** either side of pPage. More siblings are taken from one side, however,
5877   ** if there are fewer than NN siblings on the other side. If pParent
5878   ** has NB or fewer children then all children of pParent are taken.
5879   **
5880   ** This loop also drops the divider cells from the parent page. This
5881   ** way, the remainder of the function does not have to deal with any
5882   ** overflow cells in the parent page, since if any existed they will
5883   ** have already been removed.
5884   */
5885   i = pParent->nOverflow + pParent->nCell;
5886   if( i<2 ){
5887     nxDiv = 0;
5888     nOld = i+1;
5889   }else{
5890     nOld = 3;
5891     if( iParentIdx==0 ){
5892       nxDiv = 0;
5893     }else if( iParentIdx==i ){
5894       nxDiv = i-2;
5895     }else{
5896       nxDiv = iParentIdx-1;
5897     }
5898     i = 2;
5899   }
5900   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
5901     pRight = &pParent->aData[pParent->hdrOffset+8];
5902   }else{
5903     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
5904   }
5905   pgno = get4byte(pRight);
5906   while( 1 ){
5907     rc = getAndInitPage(pBt, pgno, &apOld[i]);
5908     if( rc ){
5909       memset(apOld, 0, (i+1)*sizeof(MemPage*));
5910       goto balance_cleanup;
5911     }
5912     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5913     if( (i--)==0 ) break;
5914 
5915     if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){
5916       apDiv[i] = pParent->aOvfl[0].pCell;
5917       pgno = get4byte(apDiv[i]);
5918       szNew[i] = cellSizePtr(pParent, apDiv[i]);
5919       pParent->nOverflow = 0;
5920     }else{
5921       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
5922       pgno = get4byte(apDiv[i]);
5923       szNew[i] = cellSizePtr(pParent, apDiv[i]);
5924 
5925       /* Drop the cell from the parent page. apDiv[i] still points to
5926       ** the cell within the parent, even though it has been dropped.
5927       ** This is safe because dropping a cell only overwrites the first
5928       ** four bytes of it, and this function does not need the first
5929       ** four bytes of the divider cell. So the pointer is safe to use
5930       ** later on.
5931       **
5932       ** Unless SQLite is compiled in secure-delete mode. In this case,
5933       ** the dropCell() routine will overwrite the entire cell with zeroes.
5934       ** In this case, temporarily copy the cell into the aOvflSpace[]
5935       ** buffer. It will be copied out again as soon as the aSpace[] buffer
5936       ** is allocated.  */
5937       if( pBt->secureDelete ){
5938         int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
5939         if( (iOff+szNew[i])>(int)pBt->usableSize ){
5940           rc = SQLITE_CORRUPT_BKPT;
5941           memset(apOld, 0, (i+1)*sizeof(MemPage*));
5942           goto balance_cleanup;
5943         }else{
5944           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
5945           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
5946         }
5947       }
5948       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
5949     }
5950   }
5951 
5952   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5953   ** alignment */
5954   nMaxCells = (nMaxCells + 3)&~3;
5955 
5956   /*
5957   ** Allocate space for memory structures
5958   */
5959   k = pBt->pageSize + ROUND8(sizeof(MemPage));
5960   szScratch =
5961        nMaxCells*sizeof(u8*)                       /* apCell */
5962      + nMaxCells*sizeof(u16)                       /* szCell */
5963      + pBt->pageSize                               /* aSpace1 */
5964      + k*nOld;                                     /* Page copies (apCopy) */
5965   apCell = sqlite3ScratchMalloc( szScratch );
5966   if( apCell==0 ){
5967     rc = SQLITE_NOMEM;
5968     goto balance_cleanup;
5969   }
5970   szCell = (u16*)&apCell[nMaxCells];
5971   aSpace1 = (u8*)&szCell[nMaxCells];
5972   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
5973 
5974   /*
5975   ** Load pointers to all cells on sibling pages and the divider cells
5976   ** into the local apCell[] array.  Make copies of the divider cells
5977   ** into space obtained from aSpace1[] and remove the the divider Cells
5978   ** from pParent.
5979   **
5980   ** If the siblings are on leaf pages, then the child pointers of the
5981   ** divider cells are stripped from the cells before they are copied
5982   ** into aSpace1[].  In this way, all cells in apCell[] are without
5983   ** child pointers.  If siblings are not leaves, then all cell in
5984   ** apCell[] include child pointers.  Either way, all cells in apCell[]
5985   ** are alike.
5986   **
5987   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
5988   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
5989   */
5990   leafCorrection = apOld[0]->leaf*4;
5991   leafData = apOld[0]->hasData;
5992   for(i=0; i<nOld; i++){
5993     int limit;
5994 
5995     /* Before doing anything else, take a copy of the i'th original sibling
5996     ** The rest of this function will use data from the copies rather
5997     ** that the original pages since the original pages will be in the
5998     ** process of being overwritten.  */
5999     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6000     memcpy(pOld, apOld[i], sizeof(MemPage));
6001     pOld->aData = (void*)&pOld[1];
6002     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6003 
6004     limit = pOld->nCell+pOld->nOverflow;
6005     for(j=0; j<limit; j++){
6006       assert( nCell<nMaxCells );
6007       apCell[nCell] = findOverflowCell(pOld, j);
6008       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6009       nCell++;
6010     }
6011     if( i<nOld-1 && !leafData){
6012       u16 sz = (u16)szNew[i];
6013       u8 *pTemp;
6014       assert( nCell<nMaxCells );
6015       szCell[nCell] = sz;
6016       pTemp = &aSpace1[iSpace1];
6017       iSpace1 += sz;
6018       assert( sz<=pBt->maxLocal+23 );
6019       assert( iSpace1<=pBt->pageSize );
6020       memcpy(pTemp, apDiv[i], sz);
6021       apCell[nCell] = pTemp+leafCorrection;
6022       assert( leafCorrection==0 || leafCorrection==4 );
6023       szCell[nCell] = szCell[nCell] - leafCorrection;
6024       if( !pOld->leaf ){
6025         assert( leafCorrection==0 );
6026         assert( pOld->hdrOffset==0 );
6027         /* The right pointer of the child page pOld becomes the left
6028         ** pointer of the divider cell */
6029         memcpy(apCell[nCell], &pOld->aData[8], 4);
6030       }else{
6031         assert( leafCorrection==4 );
6032         if( szCell[nCell]<4 ){
6033           /* Do not allow any cells smaller than 4 bytes. */
6034           szCell[nCell] = 4;
6035         }
6036       }
6037       nCell++;
6038     }
6039   }
6040 
6041   /*
6042   ** Figure out the number of pages needed to hold all nCell cells.
6043   ** Store this number in "k".  Also compute szNew[] which is the total
6044   ** size of all cells on the i-th page and cntNew[] which is the index
6045   ** in apCell[] of the cell that divides page i from page i+1.
6046   ** cntNew[k] should equal nCell.
6047   **
6048   ** Values computed by this block:
6049   **
6050   **           k: The total number of sibling pages
6051   **    szNew[i]: Spaced used on the i-th sibling page.
6052   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6053   **              the right of the i-th sibling page.
6054   ** usableSpace: Number of bytes of space available on each sibling.
6055   **
6056   */
6057   usableSpace = pBt->usableSize - 12 + leafCorrection;
6058   for(subtotal=k=i=0; i<nCell; i++){
6059     assert( i<nMaxCells );
6060     subtotal += szCell[i] + 2;
6061     if( subtotal > usableSpace ){
6062       szNew[k] = subtotal - szCell[i];
6063       cntNew[k] = i;
6064       if( leafData ){ i--; }
6065       subtotal = 0;
6066       k++;
6067       if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6068     }
6069   }
6070   szNew[k] = subtotal;
6071   cntNew[k] = nCell;
6072   k++;
6073 
6074   /*
6075   ** The packing computed by the previous block is biased toward the siblings
6076   ** on the left side.  The left siblings are always nearly full, while the
6077   ** right-most sibling might be nearly empty.  This block of code attempts
6078   ** to adjust the packing of siblings to get a better balance.
6079   **
6080   ** This adjustment is more than an optimization.  The packing above might
6081   ** be so out of balance as to be illegal.  For example, the right-most
6082   ** sibling might be completely empty.  This adjustment is not optional.
6083   */
6084   for(i=k-1; i>0; i--){
6085     int szRight = szNew[i];  /* Size of sibling on the right */
6086     int szLeft = szNew[i-1]; /* Size of sibling on the left */
6087     int r;              /* Index of right-most cell in left sibling */
6088     int d;              /* Index of first cell to the left of right sibling */
6089 
6090     r = cntNew[i-1] - 1;
6091     d = r + 1 - leafData;
6092     assert( d<nMaxCells );
6093     assert( r<nMaxCells );
6094     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
6095       szRight += szCell[d] + 2;
6096       szLeft -= szCell[r] + 2;
6097       cntNew[i-1]--;
6098       r = cntNew[i-1] - 1;
6099       d = r + 1 - leafData;
6100     }
6101     szNew[i] = szRight;
6102     szNew[i-1] = szLeft;
6103   }
6104 
6105   /* Either we found one or more cells (cntnew[0])>0) or pPage is
6106   ** a virtual root page.  A virtual root page is when the real root
6107   ** page is page 1 and we are the only child of that page.
6108   */
6109   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6110 
6111   TRACE(("BALANCE: old: %d %d %d  ",
6112     apOld[0]->pgno,
6113     nOld>=2 ? apOld[1]->pgno : 0,
6114     nOld>=3 ? apOld[2]->pgno : 0
6115   ));
6116 
6117   /*
6118   ** Allocate k new pages.  Reuse old pages where possible.
6119   */
6120   if( apOld[0]->pgno<=1 ){
6121     rc = SQLITE_CORRUPT_BKPT;
6122     goto balance_cleanup;
6123   }
6124   pageFlags = apOld[0]->aData[0];
6125   for(i=0; i<k; i++){
6126     MemPage *pNew;
6127     if( i<nOld ){
6128       pNew = apNew[i] = apOld[i];
6129       apOld[i] = 0;
6130       rc = sqlite3PagerWrite(pNew->pDbPage);
6131       nNew++;
6132       if( rc ) goto balance_cleanup;
6133     }else{
6134       assert( i>0 );
6135       rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
6136       if( rc ) goto balance_cleanup;
6137       apNew[i] = pNew;
6138       nNew++;
6139 
6140       /* Set the pointer-map entry for the new sibling page. */
6141       if( ISAUTOVACUUM ){
6142         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6143         if( rc!=SQLITE_OK ){
6144           goto balance_cleanup;
6145         }
6146       }
6147     }
6148   }
6149 
6150   /* Free any old pages that were not reused as new pages.
6151   */
6152   while( i<nOld ){
6153     freePage(apOld[i], &rc);
6154     if( rc ) goto balance_cleanup;
6155     releasePage(apOld[i]);
6156     apOld[i] = 0;
6157     i++;
6158   }
6159 
6160   /*
6161   ** Put the new pages in accending order.  This helps to
6162   ** keep entries in the disk file in order so that a scan
6163   ** of the table is a linear scan through the file.  That
6164   ** in turn helps the operating system to deliver pages
6165   ** from the disk more rapidly.
6166   **
6167   ** An O(n^2) insertion sort algorithm is used, but since
6168   ** n is never more than NB (a small constant), that should
6169   ** not be a problem.
6170   **
6171   ** When NB==3, this one optimization makes the database
6172   ** about 25% faster for large insertions and deletions.
6173   */
6174   for(i=0; i<k-1; i++){
6175     int minV = apNew[i]->pgno;
6176     int minI = i;
6177     for(j=i+1; j<k; j++){
6178       if( apNew[j]->pgno<(unsigned)minV ){
6179         minI = j;
6180         minV = apNew[j]->pgno;
6181       }
6182     }
6183     if( minI>i ){
6184       int t;
6185       MemPage *pT;
6186       t = apNew[i]->pgno;
6187       pT = apNew[i];
6188       apNew[i] = apNew[minI];
6189       apNew[minI] = pT;
6190     }
6191   }
6192   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6193     apNew[0]->pgno, szNew[0],
6194     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6195     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6196     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6197     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6198 
6199   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6200   put4byte(pRight, apNew[nNew-1]->pgno);
6201 
6202   /*
6203   ** Evenly distribute the data in apCell[] across the new pages.
6204   ** Insert divider cells into pParent as necessary.
6205   */
6206   j = 0;
6207   for(i=0; i<nNew; i++){
6208     /* Assemble the new sibling page. */
6209     MemPage *pNew = apNew[i];
6210     assert( j<nMaxCells );
6211     zeroPage(pNew, pageFlags);
6212     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6213     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6214     assert( pNew->nOverflow==0 );
6215 
6216     j = cntNew[i];
6217 
6218     /* If the sibling page assembled above was not the right-most sibling,
6219     ** insert a divider cell into the parent page.
6220     */
6221     assert( i<nNew-1 || j==nCell );
6222     if( j<nCell ){
6223       u8 *pCell;
6224       u8 *pTemp;
6225       int sz;
6226 
6227       assert( j<nMaxCells );
6228       pCell = apCell[j];
6229       sz = szCell[j] + leafCorrection;
6230       pTemp = &aOvflSpace[iOvflSpace];
6231       if( !pNew->leaf ){
6232         memcpy(&pNew->aData[8], pCell, 4);
6233       }else if( leafData ){
6234         /* If the tree is a leaf-data tree, and the siblings are leaves,
6235         ** then there is no divider cell in apCell[]. Instead, the divider
6236         ** cell consists of the integer key for the right-most cell of
6237         ** the sibling-page assembled above only.
6238         */
6239         CellInfo info;
6240         j--;
6241         btreeParseCellPtr(pNew, apCell[j], &info);
6242         pCell = pTemp;
6243         sz = 4 + putVarint(&pCell[4], info.nKey);
6244         pTemp = 0;
6245       }else{
6246         pCell -= 4;
6247         /* Obscure case for non-leaf-data trees: If the cell at pCell was
6248         ** previously stored on a leaf node, and its reported size was 4
6249         ** bytes, then it may actually be smaller than this
6250         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6251         ** any cell). But it is important to pass the correct size to
6252         ** insertCell(), so reparse the cell now.
6253         **
6254         ** Note that this can never happen in an SQLite data file, as all
6255         ** cells are at least 4 bytes. It only happens in b-trees used
6256         ** to evaluate "IN (SELECT ...)" and similar clauses.
6257         */
6258         if( szCell[j]==4 ){
6259           assert(leafCorrection==4);
6260           sz = cellSizePtr(pParent, pCell);
6261         }
6262       }
6263       iOvflSpace += sz;
6264       assert( sz<=pBt->maxLocal+23 );
6265       assert( iOvflSpace<=pBt->pageSize );
6266       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6267       if( rc!=SQLITE_OK ) goto balance_cleanup;
6268       assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6269 
6270       j++;
6271       nxDiv++;
6272     }
6273   }
6274   assert( j==nCell );
6275   assert( nOld>0 );
6276   assert( nNew>0 );
6277   if( (pageFlags & PTF_LEAF)==0 ){
6278     u8 *zChild = &apCopy[nOld-1]->aData[8];
6279     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6280   }
6281 
6282   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6283     /* The root page of the b-tree now contains no cells. The only sibling
6284     ** page is the right-child of the parent. Copy the contents of the
6285     ** child page into the parent, decreasing the overall height of the
6286     ** b-tree structure by one. This is described as the "balance-shallower"
6287     ** sub-algorithm in some documentation.
6288     **
6289     ** If this is an auto-vacuum database, the call to copyNodeContent()
6290     ** sets all pointer-map entries corresponding to database image pages
6291     ** for which the pointer is stored within the content being copied.
6292     **
6293     ** The second assert below verifies that the child page is defragmented
6294     ** (it must be, as it was just reconstructed using assemblePage()). This
6295     ** is important if the parent page happens to be page 1 of the database
6296     ** image.  */
6297     assert( nNew==1 );
6298     assert( apNew[0]->nFree ==
6299         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6300     );
6301     copyNodeContent(apNew[0], pParent, &rc);
6302     freePage(apNew[0], &rc);
6303   }else if( ISAUTOVACUUM ){
6304     /* Fix the pointer-map entries for all the cells that were shifted around.
6305     ** There are several different types of pointer-map entries that need to
6306     ** be dealt with by this routine. Some of these have been set already, but
6307     ** many have not. The following is a summary:
6308     **
6309     **   1) The entries associated with new sibling pages that were not
6310     **      siblings when this function was called. These have already
6311     **      been set. We don't need to worry about old siblings that were
6312     **      moved to the free-list - the freePage() code has taken care
6313     **      of those.
6314     **
6315     **   2) The pointer-map entries associated with the first overflow
6316     **      page in any overflow chains used by new divider cells. These
6317     **      have also already been taken care of by the insertCell() code.
6318     **
6319     **   3) If the sibling pages are not leaves, then the child pages of
6320     **      cells stored on the sibling pages may need to be updated.
6321     **
6322     **   4) If the sibling pages are not internal intkey nodes, then any
6323     **      overflow pages used by these cells may need to be updated
6324     **      (internal intkey nodes never contain pointers to overflow pages).
6325     **
6326     **   5) If the sibling pages are not leaves, then the pointer-map
6327     **      entries for the right-child pages of each sibling may need
6328     **      to be updated.
6329     **
6330     ** Cases 1 and 2 are dealt with above by other code. The next
6331     ** block deals with cases 3 and 4 and the one after that, case 5. Since
6332     ** setting a pointer map entry is a relatively expensive operation, this
6333     ** code only sets pointer map entries for child or overflow pages that have
6334     ** actually moved between pages.  */
6335     MemPage *pNew = apNew[0];
6336     MemPage *pOld = apCopy[0];
6337     int nOverflow = pOld->nOverflow;
6338     int iNextOld = pOld->nCell + nOverflow;
6339     int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1);
6340     j = 0;                             /* Current 'old' sibling page */
6341     k = 0;                             /* Current 'new' sibling page */
6342     for(i=0; i<nCell; i++){
6343       int isDivider = 0;
6344       while( i==iNextOld ){
6345         /* Cell i is the cell immediately following the last cell on old
6346         ** sibling page j. If the siblings are not leaf pages of an
6347         ** intkey b-tree, then cell i was a divider cell. */
6348         pOld = apCopy[++j];
6349         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6350         if( pOld->nOverflow ){
6351           nOverflow = pOld->nOverflow;
6352           iOverflow = i + !leafData + pOld->aOvfl[0].idx;
6353         }
6354         isDivider = !leafData;
6355       }
6356 
6357       assert(nOverflow>0 || iOverflow<i );
6358       assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1);
6359       assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1);
6360       if( i==iOverflow ){
6361         isDivider = 1;
6362         if( (--nOverflow)>0 ){
6363           iOverflow++;
6364         }
6365       }
6366 
6367       if( i==cntNew[k] ){
6368         /* Cell i is the cell immediately following the last cell on new
6369         ** sibling page k. If the siblings are not leaf pages of an
6370         ** intkey b-tree, then cell i is a divider cell.  */
6371         pNew = apNew[++k];
6372         if( !leafData ) continue;
6373       }
6374       assert( j<nOld );
6375       assert( k<nNew );
6376 
6377       /* If the cell was originally divider cell (and is not now) or
6378       ** an overflow cell, or if the cell was located on a different sibling
6379       ** page before the balancing, then the pointer map entries associated
6380       ** with any child or overflow pages need to be updated.  */
6381       if( isDivider || pOld->pgno!=pNew->pgno ){
6382         if( !leafCorrection ){
6383           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6384         }
6385         if( szCell[i]>pNew->minLocal ){
6386           ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6387         }
6388       }
6389     }
6390 
6391     if( !leafCorrection ){
6392       for(i=0; i<nNew; i++){
6393         u32 key = get4byte(&apNew[i]->aData[8]);
6394         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6395       }
6396     }
6397 
6398 #if 0
6399     /* The ptrmapCheckPages() contains assert() statements that verify that
6400     ** all pointer map pages are set correctly. This is helpful while
6401     ** debugging. This is usually disabled because a corrupt database may
6402     ** cause an assert() statement to fail.  */
6403     ptrmapCheckPages(apNew, nNew);
6404     ptrmapCheckPages(&pParent, 1);
6405 #endif
6406   }
6407 
6408   assert( pParent->isInit );
6409   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6410           nOld, nNew, nCell));
6411 
6412   /*
6413   ** Cleanup before returning.
6414   */
6415 balance_cleanup:
6416   sqlite3ScratchFree(apCell);
6417   for(i=0; i<nOld; i++){
6418     releasePage(apOld[i]);
6419   }
6420   for(i=0; i<nNew; i++){
6421     releasePage(apNew[i]);
6422   }
6423 
6424   return rc;
6425 }
6426 
6427 
6428 /*
6429 ** This function is called when the root page of a b-tree structure is
6430 ** overfull (has one or more overflow pages).
6431 **
6432 ** A new child page is allocated and the contents of the current root
6433 ** page, including overflow cells, are copied into the child. The root
6434 ** page is then overwritten to make it an empty page with the right-child
6435 ** pointer pointing to the new page.
6436 **
6437 ** Before returning, all pointer-map entries corresponding to pages
6438 ** that the new child-page now contains pointers to are updated. The
6439 ** entry corresponding to the new right-child pointer of the root
6440 ** page is also updated.
6441 **
6442 ** If successful, *ppChild is set to contain a reference to the child
6443 ** page and SQLITE_OK is returned. In this case the caller is required
6444 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6445 ** an error code is returned and *ppChild is set to 0.
6446 */
6447 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6448   int rc;                        /* Return value from subprocedures */
6449   MemPage *pChild = 0;           /* Pointer to a new child page */
6450   Pgno pgnoChild = 0;            /* Page number of the new child page */
6451   BtShared *pBt = pRoot->pBt;    /* The BTree */
6452 
6453   assert( pRoot->nOverflow>0 );
6454   assert( sqlite3_mutex_held(pBt->mutex) );
6455 
6456   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6457   ** page that will become the new right-child of pPage. Copy the contents
6458   ** of the node stored on pRoot into the new child page.
6459   */
6460   rc = sqlite3PagerWrite(pRoot->pDbPage);
6461   if( rc==SQLITE_OK ){
6462     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6463     copyNodeContent(pRoot, pChild, &rc);
6464     if( ISAUTOVACUUM ){
6465       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6466     }
6467   }
6468   if( rc ){
6469     *ppChild = 0;
6470     releasePage(pChild);
6471     return rc;
6472   }
6473   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6474   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6475   assert( pChild->nCell==pRoot->nCell );
6476 
6477   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6478 
6479   /* Copy the overflow cells from pRoot to pChild */
6480   memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
6481   pChild->nOverflow = pRoot->nOverflow;
6482 
6483   /* Zero the contents of pRoot. Then install pChild as the right-child. */
6484   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6485   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6486 
6487   *ppChild = pChild;
6488   return SQLITE_OK;
6489 }
6490 
6491 /*
6492 ** The page that pCur currently points to has just been modified in
6493 ** some way. This function figures out if this modification means the
6494 ** tree needs to be balanced, and if so calls the appropriate balancing
6495 ** routine. Balancing routines are:
6496 **
6497 **   balance_quick()
6498 **   balance_deeper()
6499 **   balance_nonroot()
6500 */
6501 static int balance(BtCursor *pCur){
6502   int rc = SQLITE_OK;
6503   const int nMin = pCur->pBt->usableSize * 2 / 3;
6504   u8 aBalanceQuickSpace[13];
6505   u8 *pFree = 0;
6506 
6507   TESTONLY( int balance_quick_called = 0 );
6508   TESTONLY( int balance_deeper_called = 0 );
6509 
6510   do {
6511     int iPage = pCur->iPage;
6512     MemPage *pPage = pCur->apPage[iPage];
6513 
6514     if( iPage==0 ){
6515       if( pPage->nOverflow ){
6516         /* The root page of the b-tree is overfull. In this case call the
6517         ** balance_deeper() function to create a new child for the root-page
6518         ** and copy the current contents of the root-page to it. The
6519         ** next iteration of the do-loop will balance the child page.
6520         */
6521         assert( (balance_deeper_called++)==0 );
6522         rc = balance_deeper(pPage, &pCur->apPage[1]);
6523         if( rc==SQLITE_OK ){
6524           pCur->iPage = 1;
6525           pCur->aiIdx[0] = 0;
6526           pCur->aiIdx[1] = 0;
6527           assert( pCur->apPage[1]->nOverflow );
6528         }
6529       }else{
6530         break;
6531       }
6532     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6533       break;
6534     }else{
6535       MemPage * const pParent = pCur->apPage[iPage-1];
6536       int const iIdx = pCur->aiIdx[iPage-1];
6537 
6538       rc = sqlite3PagerWrite(pParent->pDbPage);
6539       if( rc==SQLITE_OK ){
6540 #ifndef SQLITE_OMIT_QUICKBALANCE
6541         if( pPage->hasData
6542          && pPage->nOverflow==1
6543          && pPage->aOvfl[0].idx==pPage->nCell
6544          && pParent->pgno!=1
6545          && pParent->nCell==iIdx
6546         ){
6547           /* Call balance_quick() to create a new sibling of pPage on which
6548           ** to store the overflow cell. balance_quick() inserts a new cell
6549           ** into pParent, which may cause pParent overflow. If this
6550           ** happens, the next interation of the do-loop will balance pParent
6551           ** use either balance_nonroot() or balance_deeper(). Until this
6552           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6553           ** buffer.
6554           **
6555           ** The purpose of the following assert() is to check that only a
6556           ** single call to balance_quick() is made for each call to this
6557           ** function. If this were not verified, a subtle bug involving reuse
6558           ** of the aBalanceQuickSpace[] might sneak in.
6559           */
6560           assert( (balance_quick_called++)==0 );
6561           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6562         }else
6563 #endif
6564         {
6565           /* In this case, call balance_nonroot() to redistribute cells
6566           ** between pPage and up to 2 of its sibling pages. This involves
6567           ** modifying the contents of pParent, which may cause pParent to
6568           ** become overfull or underfull. The next iteration of the do-loop
6569           ** will balance the parent page to correct this.
6570           **
6571           ** If the parent page becomes overfull, the overflow cell or cells
6572           ** are stored in the pSpace buffer allocated immediately below.
6573           ** A subsequent iteration of the do-loop will deal with this by
6574           ** calling balance_nonroot() (balance_deeper() may be called first,
6575           ** but it doesn't deal with overflow cells - just moves them to a
6576           ** different page). Once this subsequent call to balance_nonroot()
6577           ** has completed, it is safe to release the pSpace buffer used by
6578           ** the previous call, as the overflow cell data will have been
6579           ** copied either into the body of a database page or into the new
6580           ** pSpace buffer passed to the latter call to balance_nonroot().
6581           */
6582           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6583           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
6584           if( pFree ){
6585             /* If pFree is not NULL, it points to the pSpace buffer used
6586             ** by a previous call to balance_nonroot(). Its contents are
6587             ** now stored either on real database pages or within the
6588             ** new pSpace buffer, so it may be safely freed here. */
6589             sqlite3PageFree(pFree);
6590           }
6591 
6592           /* The pSpace buffer will be freed after the next call to
6593           ** balance_nonroot(), or just before this function returns, whichever
6594           ** comes first. */
6595           pFree = pSpace;
6596         }
6597       }
6598 
6599       pPage->nOverflow = 0;
6600 
6601       /* The next iteration of the do-loop balances the parent page. */
6602       releasePage(pPage);
6603       pCur->iPage--;
6604     }
6605   }while( rc==SQLITE_OK );
6606 
6607   if( pFree ){
6608     sqlite3PageFree(pFree);
6609   }
6610   return rc;
6611 }
6612 
6613 
6614 /*
6615 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6616 ** and the data is given by (pData,nData).  The cursor is used only to
6617 ** define what table the record should be inserted into.  The cursor
6618 ** is left pointing at a random location.
6619 **
6620 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6621 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6622 **
6623 ** If the seekResult parameter is non-zero, then a successful call to
6624 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6625 ** been performed. seekResult is the search result returned (a negative
6626 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6627 ** a positive value if pCur points at an etry that is larger than
6628 ** (pKey, nKey)).
6629 **
6630 ** If the seekResult parameter is non-zero, then the caller guarantees that
6631 ** cursor pCur is pointing at the existing copy of a row that is to be
6632 ** overwritten.  If the seekResult parameter is 0, then cursor pCur may
6633 ** point to any entry or to no entry at all and so this function has to seek
6634 ** the cursor before the new key can be inserted.
6635 */
6636 int sqlite3BtreeInsert(
6637   BtCursor *pCur,                /* Insert data into the table of this cursor */
6638   const void *pKey, i64 nKey,    /* The key of the new record */
6639   const void *pData, int nData,  /* The data of the new record */
6640   int nZero,                     /* Number of extra 0 bytes to append to data */
6641   int appendBias,                /* True if this is likely an append */
6642   int seekResult                 /* Result of prior MovetoUnpacked() call */
6643 ){
6644   int rc;
6645   int loc = seekResult;          /* -1: before desired location  +1: after */
6646   int szNew = 0;
6647   int idx;
6648   MemPage *pPage;
6649   Btree *p = pCur->pBtree;
6650   BtShared *pBt = p->pBt;
6651   unsigned char *oldCell;
6652   unsigned char *newCell = 0;
6653 
6654   if( pCur->eState==CURSOR_FAULT ){
6655     assert( pCur->skipNext!=SQLITE_OK );
6656     return pCur->skipNext;
6657   }
6658 
6659   assert( cursorHoldsMutex(pCur) );
6660   assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly );
6661   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6662 
6663   /* Assert that the caller has been consistent. If this cursor was opened
6664   ** expecting an index b-tree, then the caller should be inserting blob
6665   ** keys with no associated data. If the cursor was opened expecting an
6666   ** intkey table, the caller should be inserting integer keys with a
6667   ** blob of associated data.  */
6668   assert( (pKey==0)==(pCur->pKeyInfo==0) );
6669 
6670   /* If this is an insert into a table b-tree, invalidate any incrblob
6671   ** cursors open on the row being replaced (assuming this is a replace
6672   ** operation - if it is not, the following is a no-op).  */
6673   if( pCur->pKeyInfo==0 ){
6674     invalidateIncrblobCursors(p, nKey, 0);
6675   }
6676 
6677   /* Save the positions of any other cursors open on this table.
6678   **
6679   ** In some cases, the call to btreeMoveto() below is a no-op. For
6680   ** example, when inserting data into a table with auto-generated integer
6681   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6682   ** integer key to use. It then calls this function to actually insert the
6683   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6684   ** that the cursor is already where it needs to be and returns without
6685   ** doing any work. To avoid thwarting these optimizations, it is important
6686   ** not to clear the cursor here.
6687   */
6688   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6689   if( rc ) return rc;
6690   if( !loc ){
6691     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6692     if( rc ) return rc;
6693   }
6694   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
6695 
6696   pPage = pCur->apPage[pCur->iPage];
6697   assert( pPage->intKey || nKey>=0 );
6698   assert( pPage->leaf || !pPage->intKey );
6699 
6700   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6701           pCur->pgnoRoot, nKey, nData, pPage->pgno,
6702           loc==0 ? "overwrite" : "new entry"));
6703   assert( pPage->isInit );
6704   allocateTempSpace(pBt);
6705   newCell = pBt->pTmpSpace;
6706   if( newCell==0 ) return SQLITE_NOMEM;
6707   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6708   if( rc ) goto end_insert;
6709   assert( szNew==cellSizePtr(pPage, newCell) );
6710   assert( szNew<=MX_CELL_SIZE(pBt) );
6711   idx = pCur->aiIdx[pCur->iPage];
6712   if( loc==0 ){
6713     u16 szOld;
6714     assert( idx<pPage->nCell );
6715     rc = sqlite3PagerWrite(pPage->pDbPage);
6716     if( rc ){
6717       goto end_insert;
6718     }
6719     oldCell = findCell(pPage, idx);
6720     if( !pPage->leaf ){
6721       memcpy(newCell, oldCell, 4);
6722     }
6723     szOld = cellSizePtr(pPage, oldCell);
6724     rc = clearCell(pPage, oldCell);
6725     dropCell(pPage, idx, szOld, &rc);
6726     if( rc ) goto end_insert;
6727   }else if( loc<0 && pPage->nCell>0 ){
6728     assert( pPage->leaf );
6729     idx = ++pCur->aiIdx[pCur->iPage];
6730   }else{
6731     assert( pPage->leaf );
6732   }
6733   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6734   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
6735 
6736   /* If no error has occured and pPage has an overflow cell, call balance()
6737   ** to redistribute the cells within the tree. Since balance() may move
6738   ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6739   ** variables.
6740   **
6741   ** Previous versions of SQLite called moveToRoot() to move the cursor
6742   ** back to the root page as balance() used to invalidate the contents
6743   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6744   ** set the cursor state to "invalid". This makes common insert operations
6745   ** slightly faster.
6746   **
6747   ** There is a subtle but important optimization here too. When inserting
6748   ** multiple records into an intkey b-tree using a single cursor (as can
6749   ** happen while processing an "INSERT INTO ... SELECT" statement), it
6750   ** is advantageous to leave the cursor pointing to the last entry in
6751   ** the b-tree if possible. If the cursor is left pointing to the last
6752   ** entry in the table, and the next row inserted has an integer key
6753   ** larger than the largest existing key, it is possible to insert the
6754   ** row without seeking the cursor. This can be a big performance boost.
6755   */
6756   pCur->info.nSize = 0;
6757   pCur->validNKey = 0;
6758   if( rc==SQLITE_OK && pPage->nOverflow ){
6759     rc = balance(pCur);
6760 
6761     /* Must make sure nOverflow is reset to zero even if the balance()
6762     ** fails. Internal data structure corruption will result otherwise.
6763     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6764     ** from trying to save the current position of the cursor.  */
6765     pCur->apPage[pCur->iPage]->nOverflow = 0;
6766     pCur->eState = CURSOR_INVALID;
6767   }
6768   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
6769 
6770 end_insert:
6771   return rc;
6772 }
6773 
6774 /*
6775 ** Delete the entry that the cursor is pointing to.  The cursor
6776 ** is left pointing at a arbitrary location.
6777 */
6778 int sqlite3BtreeDelete(BtCursor *pCur){
6779   Btree *p = pCur->pBtree;
6780   BtShared *pBt = p->pBt;
6781   int rc;                              /* Return code */
6782   MemPage *pPage;                      /* Page to delete cell from */
6783   unsigned char *pCell;                /* Pointer to cell to delete */
6784   int iCellIdx;                        /* Index of cell to delete */
6785   int iCellDepth;                      /* Depth of node containing pCell */
6786 
6787   assert( cursorHoldsMutex(pCur) );
6788   assert( pBt->inTransaction==TRANS_WRITE );
6789   assert( !pBt->readOnly );
6790   assert( pCur->wrFlag );
6791   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6792   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6793 
6794   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6795    || NEVER(pCur->eState!=CURSOR_VALID)
6796   ){
6797     return SQLITE_ERROR;  /* Something has gone awry. */
6798   }
6799 
6800   /* If this is a delete operation to remove a row from a table b-tree,
6801   ** invalidate any incrblob cursors open on the row being deleted.  */
6802   if( pCur->pKeyInfo==0 ){
6803     invalidateIncrblobCursors(p, pCur->info.nKey, 0);
6804   }
6805 
6806   iCellDepth = pCur->iPage;
6807   iCellIdx = pCur->aiIdx[iCellDepth];
6808   pPage = pCur->apPage[iCellDepth];
6809   pCell = findCell(pPage, iCellIdx);
6810 
6811   /* If the page containing the entry to delete is not a leaf page, move
6812   ** the cursor to the largest entry in the tree that is smaller than
6813   ** the entry being deleted. This cell will replace the cell being deleted
6814   ** from the internal node. The 'previous' entry is used for this instead
6815   ** of the 'next' entry, as the previous entry is always a part of the
6816   ** sub-tree headed by the child page of the cell being deleted. This makes
6817   ** balancing the tree following the delete operation easier.  */
6818   if( !pPage->leaf ){
6819     int notUsed;
6820     rc = sqlite3BtreePrevious(pCur, &notUsed);
6821     if( rc ) return rc;
6822   }
6823 
6824   /* Save the positions of any other cursors open on this table before
6825   ** making any modifications. Make the page containing the entry to be
6826   ** deleted writable. Then free any overflow pages associated with the
6827   ** entry and finally remove the cell itself from within the page.
6828   */
6829   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6830   if( rc ) return rc;
6831   rc = sqlite3PagerWrite(pPage->pDbPage);
6832   if( rc ) return rc;
6833   rc = clearCell(pPage, pCell);
6834   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
6835   if( rc ) return rc;
6836 
6837   /* If the cell deleted was not located on a leaf page, then the cursor
6838   ** is currently pointing to the largest entry in the sub-tree headed
6839   ** by the child-page of the cell that was just deleted from an internal
6840   ** node. The cell from the leaf node needs to be moved to the internal
6841   ** node to replace the deleted cell.  */
6842   if( !pPage->leaf ){
6843     MemPage *pLeaf = pCur->apPage[pCur->iPage];
6844     int nCell;
6845     Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6846     unsigned char *pTmp;
6847 
6848     pCell = findCell(pLeaf, pLeaf->nCell-1);
6849     nCell = cellSizePtr(pLeaf, pCell);
6850     assert( MX_CELL_SIZE(pBt)>=nCell );
6851 
6852     allocateTempSpace(pBt);
6853     pTmp = pBt->pTmpSpace;
6854 
6855     rc = sqlite3PagerWrite(pLeaf->pDbPage);
6856     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6857     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
6858     if( rc ) return rc;
6859   }
6860 
6861   /* Balance the tree. If the entry deleted was located on a leaf page,
6862   ** then the cursor still points to that page. In this case the first
6863   ** call to balance() repairs the tree, and the if(...) condition is
6864   ** never true.
6865   **
6866   ** Otherwise, if the entry deleted was on an internal node page, then
6867   ** pCur is pointing to the leaf page from which a cell was removed to
6868   ** replace the cell deleted from the internal node. This is slightly
6869   ** tricky as the leaf node may be underfull, and the internal node may
6870   ** be either under or overfull. In this case run the balancing algorithm
6871   ** on the leaf node first. If the balance proceeds far enough up the
6872   ** tree that we can be sure that any problem in the internal node has
6873   ** been corrected, so be it. Otherwise, after balancing the leaf node,
6874   ** walk the cursor up the tree to the internal node and balance it as
6875   ** well.  */
6876   rc = balance(pCur);
6877   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
6878     while( pCur->iPage>iCellDepth ){
6879       releasePage(pCur->apPage[pCur->iPage--]);
6880     }
6881     rc = balance(pCur);
6882   }
6883 
6884   if( rc==SQLITE_OK ){
6885     moveToRoot(pCur);
6886   }
6887   return rc;
6888 }
6889 
6890 /*
6891 ** Create a new BTree table.  Write into *piTable the page
6892 ** number for the root page of the new table.
6893 **
6894 ** The type of type is determined by the flags parameter.  Only the
6895 ** following values of flags are currently in use.  Other values for
6896 ** flags might not work:
6897 **
6898 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
6899 **     BTREE_ZERODATA                  Used for SQL indices
6900 */
6901 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
6902   BtShared *pBt = p->pBt;
6903   MemPage *pRoot;
6904   Pgno pgnoRoot;
6905   int rc;
6906   int ptfFlags;          /* Page-type flage for the root page of new table */
6907 
6908   assert( sqlite3BtreeHoldsMutex(p) );
6909   assert( pBt->inTransaction==TRANS_WRITE );
6910   assert( !pBt->readOnly );
6911 
6912 #ifdef SQLITE_OMIT_AUTOVACUUM
6913   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6914   if( rc ){
6915     return rc;
6916   }
6917 #else
6918   if( pBt->autoVacuum ){
6919     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
6920     MemPage *pPageMove; /* The page to move to. */
6921 
6922     /* Creating a new table may probably require moving an existing database
6923     ** to make room for the new tables root page. In case this page turns
6924     ** out to be an overflow page, delete all overflow page-map caches
6925     ** held by open cursors.
6926     */
6927     invalidateAllOverflowCache(pBt);
6928 
6929     /* Read the value of meta[3] from the database to determine where the
6930     ** root page of the new table should go. meta[3] is the largest root-page
6931     ** created so far, so the new root-page is (meta[3]+1).
6932     */
6933     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
6934     pgnoRoot++;
6935 
6936     /* The new root-page may not be allocated on a pointer-map page, or the
6937     ** PENDING_BYTE page.
6938     */
6939     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6940         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6941       pgnoRoot++;
6942     }
6943     assert( pgnoRoot>=3 );
6944 
6945     /* Allocate a page. The page that currently resides at pgnoRoot will
6946     ** be moved to the allocated page (unless the allocated page happens
6947     ** to reside at pgnoRoot).
6948     */
6949     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6950     if( rc!=SQLITE_OK ){
6951       return rc;
6952     }
6953 
6954     if( pgnoMove!=pgnoRoot ){
6955       /* pgnoRoot is the page that will be used for the root-page of
6956       ** the new table (assuming an error did not occur). But we were
6957       ** allocated pgnoMove. If required (i.e. if it was not allocated
6958       ** by extending the file), the current page at position pgnoMove
6959       ** is already journaled.
6960       */
6961       u8 eType = 0;
6962       Pgno iPtrPage = 0;
6963 
6964       releasePage(pPageMove);
6965 
6966       /* Move the page currently at pgnoRoot to pgnoMove. */
6967       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6968       if( rc!=SQLITE_OK ){
6969         return rc;
6970       }
6971       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6972       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6973         rc = SQLITE_CORRUPT_BKPT;
6974       }
6975       if( rc!=SQLITE_OK ){
6976         releasePage(pRoot);
6977         return rc;
6978       }
6979       assert( eType!=PTRMAP_ROOTPAGE );
6980       assert( eType!=PTRMAP_FREEPAGE );
6981       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6982       releasePage(pRoot);
6983 
6984       /* Obtain the page at pgnoRoot */
6985       if( rc!=SQLITE_OK ){
6986         return rc;
6987       }
6988       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6989       if( rc!=SQLITE_OK ){
6990         return rc;
6991       }
6992       rc = sqlite3PagerWrite(pRoot->pDbPage);
6993       if( rc!=SQLITE_OK ){
6994         releasePage(pRoot);
6995         return rc;
6996       }
6997     }else{
6998       pRoot = pPageMove;
6999     }
7000 
7001     /* Update the pointer-map and meta-data with the new root-page number. */
7002     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7003     if( rc ){
7004       releasePage(pRoot);
7005       return rc;
7006     }
7007 
7008     /* When the new root page was allocated, page 1 was made writable in
7009     ** order either to increase the database filesize, or to decrement the
7010     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7011     */
7012     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7013     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7014     if( NEVER(rc) ){
7015       releasePage(pRoot);
7016       return rc;
7017     }
7018 
7019   }else{
7020     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7021     if( rc ) return rc;
7022   }
7023 #endif
7024   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7025   if( createTabFlags & BTREE_INTKEY ){
7026     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7027   }else{
7028     ptfFlags = PTF_ZERODATA | PTF_LEAF;
7029   }
7030   zeroPage(pRoot, ptfFlags);
7031   sqlite3PagerUnref(pRoot->pDbPage);
7032   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7033   *piTable = (int)pgnoRoot;
7034   return SQLITE_OK;
7035 }
7036 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
7037   int rc;
7038   sqlite3BtreeEnter(p);
7039   rc = btreeCreateTable(p, piTable, flags);
7040   sqlite3BtreeLeave(p);
7041   return rc;
7042 }
7043 
7044 /*
7045 ** Erase the given database page and all its children.  Return
7046 ** the page to the freelist.
7047 */
7048 static int clearDatabasePage(
7049   BtShared *pBt,           /* The BTree that contains the table */
7050   Pgno pgno,               /* Page number to clear */
7051   int freePageFlag,        /* Deallocate page if true */
7052   int *pnChange            /* Add number of Cells freed to this counter */
7053 ){
7054   MemPage *pPage;
7055   int rc;
7056   unsigned char *pCell;
7057   int i;
7058 
7059   assert( sqlite3_mutex_held(pBt->mutex) );
7060   if( pgno>btreePagecount(pBt) ){
7061     return SQLITE_CORRUPT_BKPT;
7062   }
7063 
7064   rc = getAndInitPage(pBt, pgno, &pPage);
7065   if( rc ) return rc;
7066   for(i=0; i<pPage->nCell; i++){
7067     pCell = findCell(pPage, i);
7068     if( !pPage->leaf ){
7069       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7070       if( rc ) goto cleardatabasepage_out;
7071     }
7072     rc = clearCell(pPage, pCell);
7073     if( rc ) goto cleardatabasepage_out;
7074   }
7075   if( !pPage->leaf ){
7076     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
7077     if( rc ) goto cleardatabasepage_out;
7078   }else if( pnChange ){
7079     assert( pPage->intKey );
7080     *pnChange += pPage->nCell;
7081   }
7082   if( freePageFlag ){
7083     freePage(pPage, &rc);
7084   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7085     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
7086   }
7087 
7088 cleardatabasepage_out:
7089   releasePage(pPage);
7090   return rc;
7091 }
7092 
7093 /*
7094 ** Delete all information from a single table in the database.  iTable is
7095 ** the page number of the root of the table.  After this routine returns,
7096 ** the root page is empty, but still exists.
7097 **
7098 ** This routine will fail with SQLITE_LOCKED if there are any open
7099 ** read cursors on the table.  Open write cursors are moved to the
7100 ** root of the table.
7101 **
7102 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7103 ** integer value pointed to by pnChange is incremented by the number of
7104 ** entries in the table.
7105 */
7106 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7107   int rc;
7108   BtShared *pBt = p->pBt;
7109   sqlite3BtreeEnter(p);
7110   assert( p->inTrans==TRANS_WRITE );
7111 
7112   /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7113   ** is the root of a table b-tree - if it is not, the following call is
7114   ** a no-op).  */
7115   invalidateIncrblobCursors(p, 0, 1);
7116 
7117   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7118   if( SQLITE_OK==rc ){
7119     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7120   }
7121   sqlite3BtreeLeave(p);
7122   return rc;
7123 }
7124 
7125 /*
7126 ** Erase all information in a table and add the root of the table to
7127 ** the freelist.  Except, the root of the principle table (the one on
7128 ** page 1) is never added to the freelist.
7129 **
7130 ** This routine will fail with SQLITE_LOCKED if there are any open
7131 ** cursors on the table.
7132 **
7133 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7134 ** root page in the database file, then the last root page
7135 ** in the database file is moved into the slot formerly occupied by
7136 ** iTable and that last slot formerly occupied by the last root page
7137 ** is added to the freelist instead of iTable.  In this say, all
7138 ** root pages are kept at the beginning of the database file, which
7139 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
7140 ** page number that used to be the last root page in the file before
7141 ** the move.  If no page gets moved, *piMoved is set to 0.
7142 ** The last root page is recorded in meta[3] and the value of
7143 ** meta[3] is updated by this procedure.
7144 */
7145 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
7146   int rc;
7147   MemPage *pPage = 0;
7148   BtShared *pBt = p->pBt;
7149 
7150   assert( sqlite3BtreeHoldsMutex(p) );
7151   assert( p->inTrans==TRANS_WRITE );
7152 
7153   /* It is illegal to drop a table if any cursors are open on the
7154   ** database. This is because in auto-vacuum mode the backend may
7155   ** need to move another root-page to fill a gap left by the deleted
7156   ** root page. If an open cursor was using this page a problem would
7157   ** occur.
7158   **
7159   ** This error is caught long before control reaches this point.
7160   */
7161   if( NEVER(pBt->pCursor) ){
7162     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
7163     return SQLITE_LOCKED_SHAREDCACHE;
7164   }
7165 
7166   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
7167   if( rc ) return rc;
7168   rc = sqlite3BtreeClearTable(p, iTable, 0);
7169   if( rc ){
7170     releasePage(pPage);
7171     return rc;
7172   }
7173 
7174   *piMoved = 0;
7175 
7176   if( iTable>1 ){
7177 #ifdef SQLITE_OMIT_AUTOVACUUM
7178     freePage(pPage, &rc);
7179     releasePage(pPage);
7180 #else
7181     if( pBt->autoVacuum ){
7182       Pgno maxRootPgno;
7183       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
7184 
7185       if( iTable==maxRootPgno ){
7186         /* If the table being dropped is the table with the largest root-page
7187         ** number in the database, put the root page on the free list.
7188         */
7189         freePage(pPage, &rc);
7190         releasePage(pPage);
7191         if( rc!=SQLITE_OK ){
7192           return rc;
7193         }
7194       }else{
7195         /* The table being dropped does not have the largest root-page
7196         ** number in the database. So move the page that does into the
7197         ** gap left by the deleted root-page.
7198         */
7199         MemPage *pMove;
7200         releasePage(pPage);
7201         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7202         if( rc!=SQLITE_OK ){
7203           return rc;
7204         }
7205         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
7206         releasePage(pMove);
7207         if( rc!=SQLITE_OK ){
7208           return rc;
7209         }
7210         pMove = 0;
7211         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7212         freePage(pMove, &rc);
7213         releasePage(pMove);
7214         if( rc!=SQLITE_OK ){
7215           return rc;
7216         }
7217         *piMoved = maxRootPgno;
7218       }
7219 
7220       /* Set the new 'max-root-page' value in the database header. This
7221       ** is the old value less one, less one more if that happens to
7222       ** be a root-page number, less one again if that is the
7223       ** PENDING_BYTE_PAGE.
7224       */
7225       maxRootPgno--;
7226       while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7227              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7228         maxRootPgno--;
7229       }
7230       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7231 
7232       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7233     }else{
7234       freePage(pPage, &rc);
7235       releasePage(pPage);
7236     }
7237 #endif
7238   }else{
7239     /* If sqlite3BtreeDropTable was called on page 1.
7240     ** This really never should happen except in a corrupt
7241     ** database.
7242     */
7243     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7244     releasePage(pPage);
7245   }
7246   return rc;
7247 }
7248 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7249   int rc;
7250   sqlite3BtreeEnter(p);
7251   rc = btreeDropTable(p, iTable, piMoved);
7252   sqlite3BtreeLeave(p);
7253   return rc;
7254 }
7255 
7256 
7257 /*
7258 ** This function may only be called if the b-tree connection already
7259 ** has a read or write transaction open on the database.
7260 **
7261 ** Read the meta-information out of a database file.  Meta[0]
7262 ** is the number of free pages currently in the database.  Meta[1]
7263 ** through meta[15] are available for use by higher layers.  Meta[0]
7264 ** is read-only, the others are read/write.
7265 **
7266 ** The schema layer numbers meta values differently.  At the schema
7267 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7268 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
7269 */
7270 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7271   BtShared *pBt = p->pBt;
7272 
7273   sqlite3BtreeEnter(p);
7274   assert( p->inTrans>TRANS_NONE );
7275   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7276   assert( pBt->pPage1 );
7277   assert( idx>=0 && idx<=15 );
7278 
7279   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7280 
7281   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7282   ** database, mark the database as read-only.  */
7283 #ifdef SQLITE_OMIT_AUTOVACUUM
7284   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1;
7285 #endif
7286 
7287   sqlite3BtreeLeave(p);
7288 }
7289 
7290 /*
7291 ** Write meta-information back into the database.  Meta[0] is
7292 ** read-only and may not be written.
7293 */
7294 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7295   BtShared *pBt = p->pBt;
7296   unsigned char *pP1;
7297   int rc;
7298   assert( idx>=1 && idx<=15 );
7299   sqlite3BtreeEnter(p);
7300   assert( p->inTrans==TRANS_WRITE );
7301   assert( pBt->pPage1!=0 );
7302   pP1 = pBt->pPage1->aData;
7303   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7304   if( rc==SQLITE_OK ){
7305     put4byte(&pP1[36 + idx*4], iMeta);
7306 #ifndef SQLITE_OMIT_AUTOVACUUM
7307     if( idx==BTREE_INCR_VACUUM ){
7308       assert( pBt->autoVacuum || iMeta==0 );
7309       assert( iMeta==0 || iMeta==1 );
7310       pBt->incrVacuum = (u8)iMeta;
7311     }
7312 #endif
7313   }
7314   sqlite3BtreeLeave(p);
7315   return rc;
7316 }
7317 
7318 #ifndef SQLITE_OMIT_BTREECOUNT
7319 /*
7320 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7321 ** number of entries in the b-tree and write the result to *pnEntry.
7322 **
7323 ** SQLITE_OK is returned if the operation is successfully executed.
7324 ** Otherwise, if an error is encountered (i.e. an IO error or database
7325 ** corruption) an SQLite error code is returned.
7326 */
7327 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7328   i64 nEntry = 0;                      /* Value to return in *pnEntry */
7329   int rc;                              /* Return code */
7330   rc = moveToRoot(pCur);
7331 
7332   /* Unless an error occurs, the following loop runs one iteration for each
7333   ** page in the B-Tree structure (not including overflow pages).
7334   */
7335   while( rc==SQLITE_OK ){
7336     int iIdx;                          /* Index of child node in parent */
7337     MemPage *pPage;                    /* Current page of the b-tree */
7338 
7339     /* If this is a leaf page or the tree is not an int-key tree, then
7340     ** this page contains countable entries. Increment the entry counter
7341     ** accordingly.
7342     */
7343     pPage = pCur->apPage[pCur->iPage];
7344     if( pPage->leaf || !pPage->intKey ){
7345       nEntry += pPage->nCell;
7346     }
7347 
7348     /* pPage is a leaf node. This loop navigates the cursor so that it
7349     ** points to the first interior cell that it points to the parent of
7350     ** the next page in the tree that has not yet been visited. The
7351     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7352     ** of the page, or to the number of cells in the page if the next page
7353     ** to visit is the right-child of its parent.
7354     **
7355     ** If all pages in the tree have been visited, return SQLITE_OK to the
7356     ** caller.
7357     */
7358     if( pPage->leaf ){
7359       do {
7360         if( pCur->iPage==0 ){
7361           /* All pages of the b-tree have been visited. Return successfully. */
7362           *pnEntry = nEntry;
7363           return SQLITE_OK;
7364         }
7365         moveToParent(pCur);
7366       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7367 
7368       pCur->aiIdx[pCur->iPage]++;
7369       pPage = pCur->apPage[pCur->iPage];
7370     }
7371 
7372     /* Descend to the child node of the cell that the cursor currently
7373     ** points at. This is the right-child if (iIdx==pPage->nCell).
7374     */
7375     iIdx = pCur->aiIdx[pCur->iPage];
7376     if( iIdx==pPage->nCell ){
7377       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7378     }else{
7379       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7380     }
7381   }
7382 
7383   /* An error has occurred. Return an error code. */
7384   return rc;
7385 }
7386 #endif
7387 
7388 /*
7389 ** Return the pager associated with a BTree.  This routine is used for
7390 ** testing and debugging only.
7391 */
7392 Pager *sqlite3BtreePager(Btree *p){
7393   return p->pBt->pPager;
7394 }
7395 
7396 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7397 /*
7398 ** Append a message to the error message string.
7399 */
7400 static void checkAppendMsg(
7401   IntegrityCk *pCheck,
7402   char *zMsg1,
7403   const char *zFormat,
7404   ...
7405 ){
7406   va_list ap;
7407   if( !pCheck->mxErr ) return;
7408   pCheck->mxErr--;
7409   pCheck->nErr++;
7410   va_start(ap, zFormat);
7411   if( pCheck->errMsg.nChar ){
7412     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7413   }
7414   if( zMsg1 ){
7415     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7416   }
7417   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7418   va_end(ap);
7419   if( pCheck->errMsg.mallocFailed ){
7420     pCheck->mallocFailed = 1;
7421   }
7422 }
7423 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7424 
7425 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7426 /*
7427 ** Add 1 to the reference count for page iPage.  If this is the second
7428 ** reference to the page, add an error message to pCheck->zErrMsg.
7429 ** Return 1 if there are 2 ore more references to the page and 0 if
7430 ** if this is the first reference to the page.
7431 **
7432 ** Also check that the page number is in bounds.
7433 */
7434 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7435   if( iPage==0 ) return 1;
7436   if( iPage>pCheck->nPage ){
7437     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7438     return 1;
7439   }
7440   if( pCheck->anRef[iPage]==1 ){
7441     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7442     return 1;
7443   }
7444   return  (pCheck->anRef[iPage]++)>1;
7445 }
7446 
7447 #ifndef SQLITE_OMIT_AUTOVACUUM
7448 /*
7449 ** Check that the entry in the pointer-map for page iChild maps to
7450 ** page iParent, pointer type ptrType. If not, append an error message
7451 ** to pCheck.
7452 */
7453 static void checkPtrmap(
7454   IntegrityCk *pCheck,   /* Integrity check context */
7455   Pgno iChild,           /* Child page number */
7456   u8 eType,              /* Expected pointer map type */
7457   Pgno iParent,          /* Expected pointer map parent page number */
7458   char *zContext         /* Context description (used for error msg) */
7459 ){
7460   int rc;
7461   u8 ePtrmapType;
7462   Pgno iPtrmapParent;
7463 
7464   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7465   if( rc!=SQLITE_OK ){
7466     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7467     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7468     return;
7469   }
7470 
7471   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7472     checkAppendMsg(pCheck, zContext,
7473       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7474       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7475   }
7476 }
7477 #endif
7478 
7479 /*
7480 ** Check the integrity of the freelist or of an overflow page list.
7481 ** Verify that the number of pages on the list is N.
7482 */
7483 static void checkList(
7484   IntegrityCk *pCheck,  /* Integrity checking context */
7485   int isFreeList,       /* True for a freelist.  False for overflow page list */
7486   int iPage,            /* Page number for first page in the list */
7487   int N,                /* Expected number of pages in the list */
7488   char *zContext        /* Context for error messages */
7489 ){
7490   int i;
7491   int expected = N;
7492   int iFirst = iPage;
7493   while( N-- > 0 && pCheck->mxErr ){
7494     DbPage *pOvflPage;
7495     unsigned char *pOvflData;
7496     if( iPage<1 ){
7497       checkAppendMsg(pCheck, zContext,
7498          "%d of %d pages missing from overflow list starting at %d",
7499           N+1, expected, iFirst);
7500       break;
7501     }
7502     if( checkRef(pCheck, iPage, zContext) ) break;
7503     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7504       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7505       break;
7506     }
7507     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7508     if( isFreeList ){
7509       int n = get4byte(&pOvflData[4]);
7510 #ifndef SQLITE_OMIT_AUTOVACUUM
7511       if( pCheck->pBt->autoVacuum ){
7512         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7513       }
7514 #endif
7515       if( n>(int)pCheck->pBt->usableSize/4-2 ){
7516         checkAppendMsg(pCheck, zContext,
7517            "freelist leaf count too big on page %d", iPage);
7518         N--;
7519       }else{
7520         for(i=0; i<n; i++){
7521           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7522 #ifndef SQLITE_OMIT_AUTOVACUUM
7523           if( pCheck->pBt->autoVacuum ){
7524             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7525           }
7526 #endif
7527           checkRef(pCheck, iFreePage, zContext);
7528         }
7529         N -= n;
7530       }
7531     }
7532 #ifndef SQLITE_OMIT_AUTOVACUUM
7533     else{
7534       /* If this database supports auto-vacuum and iPage is not the last
7535       ** page in this overflow list, check that the pointer-map entry for
7536       ** the following page matches iPage.
7537       */
7538       if( pCheck->pBt->autoVacuum && N>0 ){
7539         i = get4byte(pOvflData);
7540         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7541       }
7542     }
7543 #endif
7544     iPage = get4byte(pOvflData);
7545     sqlite3PagerUnref(pOvflPage);
7546   }
7547 }
7548 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7549 
7550 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7551 /*
7552 ** Do various sanity checks on a single page of a tree.  Return
7553 ** the tree depth.  Root pages return 0.  Parents of root pages
7554 ** return 1, and so forth.
7555 **
7556 ** These checks are done:
7557 **
7558 **      1.  Make sure that cells and freeblocks do not overlap
7559 **          but combine to completely cover the page.
7560 **  NO  2.  Make sure cell keys are in order.
7561 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
7562 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
7563 **      5.  Check the integrity of overflow pages.
7564 **      6.  Recursively call checkTreePage on all children.
7565 **      7.  Verify that the depth of all children is the same.
7566 **      8.  Make sure this page is at least 33% full or else it is
7567 **          the root of the tree.
7568 */
7569 static int checkTreePage(
7570   IntegrityCk *pCheck,  /* Context for the sanity check */
7571   int iPage,            /* Page number of the page to check */
7572   char *zParentContext, /* Parent context */
7573   i64 *pnParentMinKey,
7574   i64 *pnParentMaxKey
7575 ){
7576   MemPage *pPage;
7577   int i, rc, depth, d2, pgno, cnt;
7578   int hdr, cellStart;
7579   int nCell;
7580   u8 *data;
7581   BtShared *pBt;
7582   int usableSize;
7583   char zContext[100];
7584   char *hit = 0;
7585   i64 nMinKey = 0;
7586   i64 nMaxKey = 0;
7587 
7588   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7589 
7590   /* Check that the page exists
7591   */
7592   pBt = pCheck->pBt;
7593   usableSize = pBt->usableSize;
7594   if( iPage==0 ) return 0;
7595   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7596   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7597     checkAppendMsg(pCheck, zContext,
7598        "unable to get the page. error code=%d", rc);
7599     return 0;
7600   }
7601 
7602   /* Clear MemPage.isInit to make sure the corruption detection code in
7603   ** btreeInitPage() is executed.  */
7604   pPage->isInit = 0;
7605   if( (rc = btreeInitPage(pPage))!=0 ){
7606     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
7607     checkAppendMsg(pCheck, zContext,
7608                    "btreeInitPage() returns error code %d", rc);
7609     releasePage(pPage);
7610     return 0;
7611   }
7612 
7613   /* Check out all the cells.
7614   */
7615   depth = 0;
7616   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7617     u8 *pCell;
7618     u32 sz;
7619     CellInfo info;
7620 
7621     /* Check payload overflow pages
7622     */
7623     sqlite3_snprintf(sizeof(zContext), zContext,
7624              "On tree page %d cell %d: ", iPage, i);
7625     pCell = findCell(pPage,i);
7626     btreeParseCellPtr(pPage, pCell, &info);
7627     sz = info.nData;
7628     if( !pPage->intKey ) sz += (int)info.nKey;
7629     /* For intKey pages, check that the keys are in order.
7630     */
7631     else if( i==0 ) nMinKey = nMaxKey = info.nKey;
7632     else{
7633       if( info.nKey <= nMaxKey ){
7634         checkAppendMsg(pCheck, zContext,
7635             "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
7636       }
7637       nMaxKey = info.nKey;
7638     }
7639     assert( sz==info.nPayload );
7640     if( (sz>info.nLocal)
7641      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7642     ){
7643       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
7644       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7645 #ifndef SQLITE_OMIT_AUTOVACUUM
7646       if( pBt->autoVacuum ){
7647         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
7648       }
7649 #endif
7650       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
7651     }
7652 
7653     /* Check sanity of left child page.
7654     */
7655     if( !pPage->leaf ){
7656       pgno = get4byte(pCell);
7657 #ifndef SQLITE_OMIT_AUTOVACUUM
7658       if( pBt->autoVacuum ){
7659         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7660       }
7661 #endif
7662       d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
7663       if( i>0 && d2!=depth ){
7664         checkAppendMsg(pCheck, zContext, "Child page depth differs");
7665       }
7666       depth = d2;
7667     }
7668   }
7669 
7670   if( !pPage->leaf ){
7671     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7672     sqlite3_snprintf(sizeof(zContext), zContext,
7673                      "On page %d at right child: ", iPage);
7674 #ifndef SQLITE_OMIT_AUTOVACUUM
7675     if( pBt->autoVacuum ){
7676       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7677     }
7678 #endif
7679     checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
7680   }
7681 
7682   /* For intKey leaf pages, check that the min/max keys are in order
7683   ** with any left/parent/right pages.
7684   */
7685   if( pPage->leaf && pPage->intKey ){
7686     /* if we are a left child page */
7687     if( pnParentMinKey ){
7688       /* if we are the left most child page */
7689       if( !pnParentMaxKey ){
7690         if( nMaxKey > *pnParentMinKey ){
7691           checkAppendMsg(pCheck, zContext,
7692               "Rowid %lld out of order (max larger than parent min of %lld)",
7693               nMaxKey, *pnParentMinKey);
7694         }
7695       }else{
7696         if( nMinKey <= *pnParentMinKey ){
7697           checkAppendMsg(pCheck, zContext,
7698               "Rowid %lld out of order (min less than parent min of %lld)",
7699               nMinKey, *pnParentMinKey);
7700         }
7701         if( nMaxKey > *pnParentMaxKey ){
7702           checkAppendMsg(pCheck, zContext,
7703               "Rowid %lld out of order (max larger than parent max of %lld)",
7704               nMaxKey, *pnParentMaxKey);
7705         }
7706         *pnParentMinKey = nMaxKey;
7707       }
7708     /* else if we're a right child page */
7709     } else if( pnParentMaxKey ){
7710       if( nMinKey <= *pnParentMaxKey ){
7711         checkAppendMsg(pCheck, zContext,
7712             "Rowid %lld out of order (min less than parent max of %lld)",
7713             nMinKey, *pnParentMaxKey);
7714       }
7715     }
7716   }
7717 
7718   /* Check for complete coverage of the page
7719   */
7720   data = pPage->aData;
7721   hdr = pPage->hdrOffset;
7722   hit = sqlite3PageMalloc( pBt->pageSize );
7723   if( hit==0 ){
7724     pCheck->mallocFailed = 1;
7725   }else{
7726     int contentOffset = get2byteNotZero(&data[hdr+5]);
7727     assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
7728     memset(hit+contentOffset, 0, usableSize-contentOffset);
7729     memset(hit, 1, contentOffset);
7730     nCell = get2byte(&data[hdr+3]);
7731     cellStart = hdr + 12 - 4*pPage->leaf;
7732     for(i=0; i<nCell; i++){
7733       int pc = get2byte(&data[cellStart+i*2]);
7734       u32 size = 65536;
7735       int j;
7736       if( pc<=usableSize-4 ){
7737         size = cellSizePtr(pPage, &data[pc]);
7738       }
7739       if( (int)(pc+size-1)>=usableSize ){
7740         checkAppendMsg(pCheck, 0,
7741             "Corruption detected in cell %d on page %d",i,iPage);
7742       }else{
7743         for(j=pc+size-1; j>=pc; j--) hit[j]++;
7744       }
7745     }
7746     i = get2byte(&data[hdr+1]);
7747     while( i>0 ){
7748       int size, j;
7749       assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
7750       size = get2byte(&data[i+2]);
7751       assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
7752       for(j=i+size-1; j>=i; j--) hit[j]++;
7753       j = get2byte(&data[i]);
7754       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
7755       assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
7756       i = j;
7757     }
7758     for(i=cnt=0; i<usableSize; i++){
7759       if( hit[i]==0 ){
7760         cnt++;
7761       }else if( hit[i]>1 ){
7762         checkAppendMsg(pCheck, 0,
7763           "Multiple uses for byte %d of page %d", i, iPage);
7764         break;
7765       }
7766     }
7767     if( cnt!=data[hdr+7] ){
7768       checkAppendMsg(pCheck, 0,
7769           "Fragmentation of %d bytes reported as %d on page %d",
7770           cnt, data[hdr+7], iPage);
7771     }
7772   }
7773   sqlite3PageFree(hit);
7774   releasePage(pPage);
7775   return depth+1;
7776 }
7777 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7778 
7779 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7780 /*
7781 ** This routine does a complete check of the given BTree file.  aRoot[] is
7782 ** an array of pages numbers were each page number is the root page of
7783 ** a table.  nRoot is the number of entries in aRoot.
7784 **
7785 ** A read-only or read-write transaction must be opened before calling
7786 ** this function.
7787 **
7788 ** Write the number of error seen in *pnErr.  Except for some memory
7789 ** allocation errors,  an error message held in memory obtained from
7790 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
7791 ** returned.  If a memory allocation error occurs, NULL is returned.
7792 */
7793 char *sqlite3BtreeIntegrityCheck(
7794   Btree *p,     /* The btree to be checked */
7795   int *aRoot,   /* An array of root pages numbers for individual trees */
7796   int nRoot,    /* Number of entries in aRoot[] */
7797   int mxErr,    /* Stop reporting errors after this many */
7798   int *pnErr    /* Write number of errors seen to this variable */
7799 ){
7800   Pgno i;
7801   int nRef;
7802   IntegrityCk sCheck;
7803   BtShared *pBt = p->pBt;
7804   char zErr[100];
7805 
7806   sqlite3BtreeEnter(p);
7807   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7808   nRef = sqlite3PagerRefcount(pBt->pPager);
7809   sCheck.pBt = pBt;
7810   sCheck.pPager = pBt->pPager;
7811   sCheck.nPage = btreePagecount(sCheck.pBt);
7812   sCheck.mxErr = mxErr;
7813   sCheck.nErr = 0;
7814   sCheck.mallocFailed = 0;
7815   *pnErr = 0;
7816   if( sCheck.nPage==0 ){
7817     sqlite3BtreeLeave(p);
7818     return 0;
7819   }
7820   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
7821   if( !sCheck.anRef ){
7822     *pnErr = 1;
7823     sqlite3BtreeLeave(p);
7824     return 0;
7825   }
7826   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
7827   i = PENDING_BYTE_PAGE(pBt);
7828   if( i<=sCheck.nPage ){
7829     sCheck.anRef[i] = 1;
7830   }
7831   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7832   sCheck.errMsg.useMalloc = 2;
7833 
7834   /* Check the integrity of the freelist
7835   */
7836   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7837             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7838 
7839   /* Check all the tables.
7840   */
7841   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7842     if( aRoot[i]==0 ) continue;
7843 #ifndef SQLITE_OMIT_AUTOVACUUM
7844     if( pBt->autoVacuum && aRoot[i]>1 ){
7845       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7846     }
7847 #endif
7848     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
7849   }
7850 
7851   /* Make sure every page in the file is referenced
7852   */
7853   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
7854 #ifdef SQLITE_OMIT_AUTOVACUUM
7855     if( sCheck.anRef[i]==0 ){
7856       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7857     }
7858 #else
7859     /* If the database supports auto-vacuum, make sure no tables contain
7860     ** references to pointer-map pages.
7861     */
7862     if( sCheck.anRef[i]==0 &&
7863        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
7864       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7865     }
7866     if( sCheck.anRef[i]!=0 &&
7867        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
7868       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7869     }
7870 #endif
7871   }
7872 
7873   /* Make sure this analysis did not leave any unref() pages.
7874   ** This is an internal consistency check; an integrity check
7875   ** of the integrity check.
7876   */
7877   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
7878     checkAppendMsg(&sCheck, 0,
7879       "Outstanding page count goes from %d to %d during this analysis",
7880       nRef, sqlite3PagerRefcount(pBt->pPager)
7881     );
7882   }
7883 
7884   /* Clean  up and report errors.
7885   */
7886   sqlite3BtreeLeave(p);
7887   sqlite3_free(sCheck.anRef);
7888   if( sCheck.mallocFailed ){
7889     sqlite3StrAccumReset(&sCheck.errMsg);
7890     *pnErr = sCheck.nErr+1;
7891     return 0;
7892   }
7893   *pnErr = sCheck.nErr;
7894   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7895   return sqlite3StrAccumFinish(&sCheck.errMsg);
7896 }
7897 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7898 
7899 /*
7900 ** Return the full pathname of the underlying database file.
7901 **
7902 ** The pager filename is invariant as long as the pager is
7903 ** open so it is safe to access without the BtShared mutex.
7904 */
7905 const char *sqlite3BtreeGetFilename(Btree *p){
7906   assert( p->pBt->pPager!=0 );
7907   return sqlite3PagerFilename(p->pBt->pPager);
7908 }
7909 
7910 /*
7911 ** Return the pathname of the journal file for this database. The return
7912 ** value of this routine is the same regardless of whether the journal file
7913 ** has been created or not.
7914 **
7915 ** The pager journal filename is invariant as long as the pager is
7916 ** open so it is safe to access without the BtShared mutex.
7917 */
7918 const char *sqlite3BtreeGetJournalname(Btree *p){
7919   assert( p->pBt->pPager!=0 );
7920   return sqlite3PagerJournalname(p->pBt->pPager);
7921 }
7922 
7923 /*
7924 ** Return non-zero if a transaction is active.
7925 */
7926 int sqlite3BtreeIsInTrans(Btree *p){
7927   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7928   return (p && (p->inTrans==TRANS_WRITE));
7929 }
7930 
7931 #ifndef SQLITE_OMIT_WAL
7932 /*
7933 ** Run a checkpoint on the Btree passed as the first argument.
7934 **
7935 ** Return SQLITE_LOCKED if this or any other connection has an open
7936 ** transaction on the shared-cache the argument Btree is connected to.
7937 */
7938 int sqlite3BtreeCheckpoint(Btree *p){
7939   int rc = SQLITE_OK;
7940   if( p ){
7941     BtShared *pBt = p->pBt;
7942     sqlite3BtreeEnter(p);
7943     if( pBt->inTransaction!=TRANS_NONE ){
7944       rc = SQLITE_LOCKED;
7945     }else{
7946       rc = sqlite3PagerCheckpoint(pBt->pPager);
7947     }
7948     sqlite3BtreeLeave(p);
7949   }
7950   return rc;
7951 }
7952 #endif
7953 
7954 /*
7955 ** Return non-zero if a read (or write) transaction is active.
7956 */
7957 int sqlite3BtreeIsInReadTrans(Btree *p){
7958   assert( p );
7959   assert( sqlite3_mutex_held(p->db->mutex) );
7960   return p->inTrans!=TRANS_NONE;
7961 }
7962 
7963 int sqlite3BtreeIsInBackup(Btree *p){
7964   assert( p );
7965   assert( sqlite3_mutex_held(p->db->mutex) );
7966   return p->nBackup!=0;
7967 }
7968 
7969 /*
7970 ** This function returns a pointer to a blob of memory associated with
7971 ** a single shared-btree. The memory is used by client code for its own
7972 ** purposes (for example, to store a high-level schema associated with
7973 ** the shared-btree). The btree layer manages reference counting issues.
7974 **
7975 ** The first time this is called on a shared-btree, nBytes bytes of memory
7976 ** are allocated, zeroed, and returned to the caller. For each subsequent
7977 ** call the nBytes parameter is ignored and a pointer to the same blob
7978 ** of memory returned.
7979 **
7980 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7981 ** allocated, a null pointer is returned. If the blob has already been
7982 ** allocated, it is returned as normal.
7983 **
7984 ** Just before the shared-btree is closed, the function passed as the
7985 ** xFree argument when the memory allocation was made is invoked on the
7986 ** blob of allocated memory. This function should not call sqlite3_free()
7987 ** on the memory, the btree layer does that.
7988 */
7989 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7990   BtShared *pBt = p->pBt;
7991   sqlite3BtreeEnter(p);
7992   if( !pBt->pSchema && nBytes ){
7993     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
7994     pBt->xFreeSchema = xFree;
7995   }
7996   sqlite3BtreeLeave(p);
7997   return pBt->pSchema;
7998 }
7999 
8000 /*
8001 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8002 ** btree as the argument handle holds an exclusive lock on the
8003 ** sqlite_master table. Otherwise SQLITE_OK.
8004 */
8005 int sqlite3BtreeSchemaLocked(Btree *p){
8006   int rc;
8007   assert( sqlite3_mutex_held(p->db->mutex) );
8008   sqlite3BtreeEnter(p);
8009   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8010   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8011   sqlite3BtreeLeave(p);
8012   return rc;
8013 }
8014 
8015 
8016 #ifndef SQLITE_OMIT_SHARED_CACHE
8017 /*
8018 ** Obtain a lock on the table whose root page is iTab.  The
8019 ** lock is a write lock if isWritelock is true or a read lock
8020 ** if it is false.
8021 */
8022 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8023   int rc = SQLITE_OK;
8024   assert( p->inTrans!=TRANS_NONE );
8025   if( p->sharable ){
8026     u8 lockType = READ_LOCK + isWriteLock;
8027     assert( READ_LOCK+1==WRITE_LOCK );
8028     assert( isWriteLock==0 || isWriteLock==1 );
8029 
8030     sqlite3BtreeEnter(p);
8031     rc = querySharedCacheTableLock(p, iTab, lockType);
8032     if( rc==SQLITE_OK ){
8033       rc = setSharedCacheTableLock(p, iTab, lockType);
8034     }
8035     sqlite3BtreeLeave(p);
8036   }
8037   return rc;
8038 }
8039 #endif
8040 
8041 #ifndef SQLITE_OMIT_INCRBLOB
8042 /*
8043 ** Argument pCsr must be a cursor opened for writing on an
8044 ** INTKEY table currently pointing at a valid table entry.
8045 ** This function modifies the data stored as part of that entry.
8046 **
8047 ** Only the data content may only be modified, it is not possible to
8048 ** change the length of the data stored. If this function is called with
8049 ** parameters that attempt to write past the end of the existing data,
8050 ** no modifications are made and SQLITE_CORRUPT is returned.
8051 */
8052 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8053   int rc;
8054   assert( cursorHoldsMutex(pCsr) );
8055   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8056   assert( pCsr->isIncrblobHandle );
8057 
8058   rc = restoreCursorPosition(pCsr);
8059   if( rc!=SQLITE_OK ){
8060     return rc;
8061   }
8062   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8063   if( pCsr->eState!=CURSOR_VALID ){
8064     return SQLITE_ABORT;
8065   }
8066 
8067   /* Check some assumptions:
8068   **   (a) the cursor is open for writing,
8069   **   (b) there is a read/write transaction open,
8070   **   (c) the connection holds a write-lock on the table (if required),
8071   **   (d) there are no conflicting read-locks, and
8072   **   (e) the cursor points at a valid row of an intKey table.
8073   */
8074   if( !pCsr->wrFlag ){
8075     return SQLITE_READONLY;
8076   }
8077   assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE );
8078   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8079   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8080   assert( pCsr->apPage[pCsr->iPage]->intKey );
8081 
8082   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8083 }
8084 
8085 /*
8086 ** Set a flag on this cursor to cache the locations of pages from the
8087 ** overflow list for the current row. This is used by cursors opened
8088 ** for incremental blob IO only.
8089 **
8090 ** This function sets a flag only. The actual page location cache
8091 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
8092 ** accessPayload() (the worker function for sqlite3BtreeData() and
8093 ** sqlite3BtreePutData()).
8094 */
8095 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
8096   assert( cursorHoldsMutex(pCur) );
8097   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
8098   assert(!pCur->isIncrblobHandle);
8099   assert(!pCur->aOverflow);
8100   pCur->isIncrblobHandle = 1;
8101 }
8102 #endif
8103 
8104 /*
8105 ** Set both the "read version" (single byte at byte offset 18) and
8106 ** "write version" (single byte at byte offset 19) fields in the database
8107 ** header to iVersion.
8108 */
8109 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8110   BtShared *pBt = pBtree->pBt;
8111   int rc;                         /* Return code */
8112 
8113   assert( pBtree->inTrans==TRANS_NONE );
8114   assert( iVersion==1 || iVersion==2 );
8115 
8116   /* If setting the version fields to 1, do not automatically open the
8117   ** WAL connection, even if the version fields are currently set to 2.
8118   */
8119   pBt->doNotUseWAL = (u8)(iVersion==1);
8120 
8121   rc = sqlite3BtreeBeginTrans(pBtree, 0);
8122   if( rc==SQLITE_OK ){
8123     u8 *aData = pBt->pPage1->aData;
8124     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8125       rc = sqlite3BtreeBeginTrans(pBtree, 2);
8126       if( rc==SQLITE_OK ){
8127         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8128         if( rc==SQLITE_OK ){
8129           aData[18] = (u8)iVersion;
8130           aData[19] = (u8)iVersion;
8131         }
8132       }
8133     }
8134   }
8135 
8136   pBt->doNotUseWAL = 0;
8137   return rc;
8138 }
8139